## Import Packages and Set Defaults

In [32]:
from pandas_plink import read_plink1_bin, read_plink
import pandas as pd
import numpy as np
from numpy.random import default_rng

In [36]:
# Shared random generator with a fixed seed, so that every random draw made
# through it is reproducible from a fresh kernel.
rng = default_rng(10)

## Import Data

In [3]:
# Read the PLINK fileset whose path prefix is 'sim/CEDAR'.
# `bed` is the genotype array used in the rest of the notebook; `bim` and `fam`
# are presumably the variant and sample tables (PLINK naming) -- confirm
# against the pandas_plink documentation.
bim, fam, bed = read_plink("sim/CEDAR")

Mapping files: 100%|██████████| 3/3 [00:01<00:00,  2.02it/s]


## Data Exploration and Pre-Processing

In [105]:
# Show the summary of the chunked genotype array returned by read_plink.
bed # 730,525 SNPs (rows) x 322 individuals (columns)

Unnamed: 0,Array,Chunk
Bytes,940.92 MB,1.32 MB
Shape,"(730525, 322)","(1024, 322)"
Count,3570 Tasks,714 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 940.92 MB 1.32 MB Shape (730525, 322) (1024, 322) Count 3570 Tasks 714 Chunks Type float32 numpy.ndarray",322  730525,

Unnamed: 0,Array,Chunk
Bytes,940.92 MB,1.32 MB
Shape,"(730525, 322)","(1024, 322)"
Count,3570 Tasks,714 Chunks
Type,float32,numpy.ndarray


In [157]:
# Work on a small slice of the genome: take chunk-blocks 5 and 6 of the
# genotype array, load them into memory, and transpose so that rows are
# individuals and columns are SNPs (num ppl x num SNPs).
genotype_blocks = bed.blocks[5:7]
X = genotype_blocks.compute().T

# Report the dimensions first, then a preview of the matrix itself.
for summary in (X.shape, X):
    print(summary)

(322, 2048)
[[1. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 ...
 [2. 0. 1. ... 2. 1. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 [1. 2. 2. ... 2. 1. 2.]]


##### Check and Remove NaN Values

If a SNP is NaN for any individual, remove that SNP (the entire column) for all individuals in the dataset.

In [158]:
# Boolean mask of missing genotype calls, same shape as X. Computed once and
# reused for both counts below.
nan_mask = np.isnan(X)

# Total number of missing entries across all individuals and SNPs.
num_nan = np.count_nonzero(nan_mask)
print(num_nan, 'NaN values in data')

# A SNP (column) is flagged when at least one individual has a NaN for it.
# The mask is tested directly instead of taking isnan of the column sum: this
# is the same criterion the column-removal step uses, and it cannot report a
# false positive when a column contains both +inf and -inf (whose sum is NaN).
X_snp_nan = nan_mask.any(axis=0)
num_nan_snp = np.count_nonzero(X_snp_nan)
print(num_nan_snp, 'different SNPs that contain a NaN value')

3345 NaN values in data
819 different SNPs that contain a NaN value


In [159]:
# Flag every SNP (column) that has a NaN for at least one individual, then
# keep only the fully observed columns.
snp_has_nan = np.isnan(X).any(axis=0)
X = X[:, np.logical_not(snp_has_nan)]

# Report the reduced dimensions first, then a preview of the matrix itself.
for summary in (X.shape, X):
    print(summary)

(322, 1229)
[[1. 2. 2. ... 2. 0. 2.]
 [2. 2. 2. ... 2. 2. 2.]
 [2. 2. 2. ... 2. 1. 2.]
 ...
 [2. 0. 1. ... 2. 1. 1.]
 [2. 2. 2. ... 2. 2. 2.]
 [1. 2. 2. ... 2. 1. 1.]]


## Generate Synthetic Data

In [174]:
# Dimensions of the cleaned genotype matrix: rows are people, columns are SNPs.
num_ppl, num_snp = X.shape  # 322 ppl x 1229 SNPs

# Number of SNPs that receive a non-zero effect in the simulation.
# NOTE: "casual" presumably means "causal"; the name is kept as-is because the
# sampling cell refers to it.
num_casual_snp = 10

In [173]:
# Sample the causal SNPs: distinct column indices of X, drawn WITHOUT
# replacement so that exactly num_casual_snp SNPs carry an effect.
# (rng.integers draws with replacement, so it can return the same index more
# than once and silently leave fewer causal SNPs than requested.)
# Raises ValueError if num_casual_snp exceeds num_snp.
casual_idx = rng.choice(num_snp, size=num_casual_snp, replace=False)

# Effect sizes (1-D, length num_snp): zero everywhere except at the causal
# SNPs, whose effects are independent N(0, 1) draws. This is the same
# distribution as a multivariate normal with zero mean and identity covariance.
beta = np.zeros(num_snp)
beta[casual_idx] = rng.standard_normal(num_casual_snp)

# Per-person noise (1-D, length num_ppl): independent N(0, 1) draws. Drawing
# them directly avoids factorising a num_ppl x num_ppl identity covariance.
eps = rng.standard_normal(num_ppl)

# Phenotype (1-D, length num_ppl): linear genetic effect plus noise.
y = X @ beta + eps