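# Description

Generates random, normally distributed phenotypes for the samples listed in a 1000 Genomes subset `.fam` file and saves them, together with the sample IDs (`FID`, `IID`), to a space-separated text file.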

# Modules

In [1]:
import pandas as pd
import numpy as np

import conf

# Settings

In [2]:
# Number of random phenotype columns to generate (named pheno0, pheno1, ...).
N_PHENOTYPES = 100

# Paths

In [3]:
# Directory holding the genotype subsets; the input .fam file is read from here
# and the random phenotypes file is written here.
SUBSETS_DIR = conf.A1000G["GENOTYPES_DIR"] / "subsets"

In [4]:
# Filename prefix shared by the input ("<prefix>.fam") and the output
# ("<prefix>.random_pheno.txt") files.
SUBSET_FILE_PREFIX = "all_phase3.8"

# Load data

In [5]:
# Read the whitespace-delimited .fam file of the subset. The file has no header
# row, so the columns get integer labels (0, 1, ...).
input_filepath = SUBSETS_DIR / f"{SUBSET_FILE_PREFIX}.fam"
display(input_filepath)

# Use a raw string for the regex separator: "\s" inside a regular string literal
# is an invalid escape sequence (DeprecationWarning, and a SyntaxWarning since
# Python 3.12).
data = pd.read_csv(input_filepath, sep=r"\s+", header=None)

PosixPath('/opt/data/data/1000g/genotypes/subsets/all_phase3.8.fam')

In [6]:
# (number of samples, number of columns) read from the .fam file.
data.shape

(312, 6)

In [7]:
# Preview the raw table; columns carry integer labels because header=None.
data.head()

Unnamed: 0,0,1,2,3,4,5
0,0,HG00096,0,0,1,-9
1,0,HG00100,0,0,2,-9
2,0,HG00101,0,0,1,-9
3,0,HG00102,0,0,2,-9
4,0,HG00103,0,0,1,-9


In [8]:
# Keep only the first two columns (selected by position); the rest are dropped.
data = data.iloc[:, :2]

In [9]:
# Check that only the first two columns remain.
data.head()

Unnamed: 0,0,1
0,0,HG00096
1,0,HG00100
2,0,HG00101
3,0,HG00102
4,0,HG00103


In [10]:
# Give the two remaining columns their names: 0 -> FID, 1 -> IID.
data = data.rename({0: "FID", 1: "IID"}, axis="columns")

In [11]:
# Check that the columns are now named FID and IID.
data.head()

Unnamed: 0,FID,IID
0,0,HG00096
1,0,HG00100
2,0,HG00101
3,0,HG00102
4,0,HG00103


In [12]:
# One row per sample, so the row count is the length of every phenotype vector.
n_samples = len(data)
display(n_samples)

312

# Generate random phenotypes

In [13]:
# Draw one standard-normal vector of length n_samples per phenotype from a
# fixed-seed generator, so the values are reproducible across runs. Vectors are
# drawn in column order (pheno0, pheno1, ...).
rs = np.random.RandomState(0)
random_phenos = {
    f"pheno{pheno_idx}": rs.normal(size=n_samples)
    for pheno_idx in range(N_PHENOTYPES)
}

In [14]:
# Put the phenotype columns next to the sample IDs. The phenotype frame reuses
# the index of `data`, so rows are matched one-to-one in their original order.
random_data = pd.concat(
    [data, pd.DataFrame(random_phenos, index=data.index)],
    axis=1,
)

In [15]:
# Expect the two ID columns plus one column per generated phenotype.
random_data.shape

(312, 102)

In [16]:
# Preview the sample IDs alongside the generated phenotype values.
random_data.head()

Unnamed: 0,FID,IID,pheno0,pheno1,pheno2,pheno3,pheno4,pheno5,pheno6,pheno7,...,pheno90,pheno91,pheno92,pheno93,pheno94,pheno95,pheno96,pheno97,pheno98,pheno99
0,0,HG00096,1.764052,-1.126826,-0.375147,-1.006832,-0.291642,1.587762,-0.730874,-1.012457,...,-0.242948,0.461256,-1.450561,1.47086,-0.81553,0.991766,-1.552248,1.752155,-0.081803,-0.908184
1,0,HG00100,0.400157,-0.730678,-1.226196,1.681577,-1.741023,-0.643258,-1.641703,1.25057,...,0.010482,-1.15815,0.71036,-1.328706,-1.373909,0.47843,0.863075,0.739282,0.287118,-0.149202
2,0,HG00101,0.978738,-0.38488,0.183339,-0.792287,-0.780304,-1.133593,0.275667,-0.234538,...,0.988784,-0.637922,-1.25598,-1.875025,-2.38481,-0.529093,-0.36185,1.107525,1.850835,-1.710601
3,0,HG00102,2.240893,0.094352,1.670943,-0.531606,0.271113,0.99676,-0.708385,-0.863356,...,-0.523287,-0.523388,0.049111,-0.653856,-1.120972,-1.124456,-0.275997,-0.949599,-0.412174,-0.775918
4,0,HG00103,1.867558,-0.042171,-0.056133,0.365849,1.045023,-0.148766,-0.015779,-1.035606,...,-0.654061,0.245352,2.452137,-0.443658,-0.818241,0.34024,-0.849328,-0.29715,-0.369514,-0.499161


In [17]:
# Save the IDs and phenotypes next to the input subset as a space-separated
# text file with a header row, no index column, and floats written with five
# decimal places.
output_filename = SUBSETS_DIR / (SUBSET_FILE_PREFIX + ".random_pheno.txt")
display(output_filename)

random_data.to_csv(
    output_filename,
    sep=" ",
    index=False,
    header=True,
    float_format="%.5f",
)

PosixPath('/opt/data/data/1000g/genotypes/subsets/all_phase3.8.random_pheno.txt')