# Description

This notebook reads the final set of samples from the genotype data (the `.fam` file of the genotype subset) and generates random, normally distributed phenotypes for them.

# Modules

In [1]:
import pandas as pd
import numpy as np

import conf

# Settings

In [2]:
# Number of random phenotype columns to generate
N_PHENOTYPES = 1000

# Paths

In [3]:
# Directory with the genotype subset files: the input .fam file is read from
# here and the random phenotypes file is written here
SUBSETS_DIR = conf.A1000G["GENOTYPES_DIR"] / "subsets"

In [4]:
# File name prefix shared by the input (.fam) and the output phenotypes file
SUBSET_FILE_PREFIX = "all_phase3.8"

# Load data

In [5]:
# Sample information file (.fam) of the genotype subset
input_filepath = SUBSETS_DIR / f"{SUBSET_FILE_PREFIX}.fam"
display(input_filepath)

# The file has no header row and is whitespace-delimited. The separator is a
# raw string so the regex "\s+" reaches pandas verbatim; in a regular string
# "\s" is an invalid escape sequence, which Python reports with a warning.
data = pd.read_csv(input_filepath, sep=r"\s+", header=None)

PosixPath('/opt/data/data/1000g/genotypes/subsets/all_phase3.8.fam')

In [6]:
data.shape

(312, 6)

In [7]:
data.head()

Unnamed: 0,0,1,2,3,4,5
0,0,HG00096,0,0,1,-9
1,0,HG00100,0,0,2,-9
2,0,HG00101,0,0,1,-9
3,0,HG00102,0,0,2,-9
4,0,HG00103,0,0,1,-9


In [8]:
# Keep only the first two columns; the remaining ones are not needed here
data = data.iloc[:, :2]

In [9]:
data.head()

Unnamed: 0,0,1
0,0,HG00096
1,0,HG00100
2,0,HG00101
3,0,HG00102
4,0,HG00103


In [10]:
# Give the two positional columns the names written to the output header
data = data.rename({0: "FID", 1: "IID"}, axis="columns")

In [11]:
data.head()

Unnamed: 0,FID,IID
0,0,HG00096
1,0,HG00100
2,0,HG00101
3,0,HG00102
4,0,HG00103


In [12]:
# One row per sample, so the row count is the number of samples
n_samples = len(data)
display(n_samples)

312

# Generate random phenotypes

In [13]:
# Fixed seed so the generated phenotypes are the same on every run
rs = np.random.RandomState(0)

# One vector of normal draws (one value per sample) for each phenotype; all
# vectors are drawn in phenotype order from the same generator
random_phenos = {
    f"pheno{pheno_idx}": rs.normal(size=n_samples)
    for pheno_idx in range(N_PHENOTYPES)
}

In [14]:
# Append all phenotype columns to the sample identifiers in one operation.
# Adding them one by one (as `DataFrame.assign` does for each keyword) leaves
# the frame highly fragmented and makes pandas emit a PerformanceWarning.
# The phenotype frame reuses `data`'s index so rows stay aligned by position.
random_data = pd.concat(
    [data, pd.DataFrame(random_phenos, index=data.index)],
    axis=1,
)

In [15]:
random_data.shape

(312, 1002)

In [16]:
random_data.head()

Unnamed: 0,FID,IID,pheno0,pheno1,pheno2,pheno3,pheno4,pheno5,pheno6,pheno7,...,pheno990,pheno991,pheno992,pheno993,pheno994,pheno995,pheno996,pheno997,pheno998,pheno999
0,0,HG00096,1.764052,-1.126826,-0.375147,-1.006832,-0.291642,1.587762,-0.730874,-1.012457,...,0.885934,0.014284,-0.528878,0.386922,0.073649,0.518042,0.19998,0.469002,2.23829,1.494255
1,0,HG00100,0.400157,-0.730678,-1.226196,1.681577,-1.741023,-0.643258,-1.641703,1.25057,...,-0.604045,-1.16145,0.209438,-0.272401,0.690999,0.967376,0.59001,-0.061214,-0.278893,0.862034
2,0,HG00101,0.978738,-0.38488,0.183339,-0.792287,-0.780304,-1.133593,0.275667,-0.234538,...,1.471394,1.790997,-1.296376,-2.28071,-0.154906,1.523906,1.347885,0.755101,-0.787863,1.166847
3,0,HG00102,2.240893,0.094352,1.670943,-0.531606,0.271113,0.99676,-0.708385,-0.863356,...,-0.303073,0.674192,0.3559,-1.3467,1.484254,-1.44016,-0.522326,1.21058,0.089535,-0.161223
4,0,HG00103,1.867558,-0.042171,-0.056133,0.365849,1.045023,-0.148766,-0.015779,-1.035606,...,-0.8499,0.342795,0.729472,0.475116,-0.521791,-0.022873,-0.150469,0.002442,-0.056189,0.974792


In [17]:
# The output is stored next to the input, under the same file name prefix
output_filename = SUBSETS_DIR / f"{SUBSET_FILE_PREFIX}.random_pheno.txt"
display(output_filename)

# Space-delimited text with a header row, no index column, and floats written
# with five decimal places
random_data.to_csv(
    output_filename,
    sep=" ",
    index=False,
    header=True,
    float_format="%.5f",
)

PosixPath('/opt/data/data/1000g/genotypes/subsets/all_phase3.8.random_pheno.txt')