## Simulation of quantitative phenotype given genotypes


In [102]:
import pandas as pd
import numpy as np
import os

class MMSimulator:
    """Simulate quantitative phenotypes from genotypes stored in an HDF5 file.

    Each key of the HDF5 file is read as one genotype table. The methods below
    treat rows as variants and columns as samples (``get_ld`` correlates rows,
    ``generate_y`` pairs rows with effect sizes) -- TODO confirm against the
    actual data layout.
    """

    def __init__(self, genotype_file):
        """Remember the path of the HDF5 genotype file; nothing is opened yet."""
        self.gfile = genotype_file

    def get_genes(self, limit = 5):
        """Return the table keys found in the genotype file.

        Parameters
        ----------
        limit : int or None
            Maximum number of keys to return (the first ``limit`` keys in the
            order reported by the store). ``None`` returns every key.
        """
        # Open read-only and close the handle as soon as the keys are read; an
        # unclosed store keeps the file locked, and append mode would silently
        # create an empty file for a mistyped path.
        with pd.HDFStore(self.gfile, mode = 'r') as store:
            keys = store.keys()
        return keys if limit is None else keys[:limit]

    def get_X(self, table):
        """Load the genotype table stored under key ``table``."""
        return pd.read_hdf(self.gfile, key = table)

    def get_ld(self, tables, save_to = None):
        """Signed r^2 LD matrices, one per table.

        For every table the pairwise Pearson correlation between rows is
        computed, squared while keeping its sign (``r^2 * sign(r)``), and
        stored as float16 to save space.

        Parameters
        ----------
        tables : iterable of str
            Keys of the genotype tables to process.
        save_to : str or None
            If given, any existing file at this path is removed and each LD
            matrix is written to it (HDF5, zlib level 9) under its table key.

        Returns
        -------
        dict mapping table key -> LD DataFrame.
        """
        ld = {}
        for table in tables:
            corr = self.get_X(table).transpose().corr(method = 'pearson')
            # Downcast immediately so only one float64 matrix is alive at a time.
            ld[table] = (np.power(corr, 2) * np.sign(corr)).astype(np.float16)
        if save_to is not None:
            # Start from a clean file so stale keys from earlier runs do not linger.
            if os.path.isfile(save_to):
                os.remove(save_to)
            for key, value in ld.items():
                value.to_hdf(save_to, key = key, mode = 'a', complevel = 9, complib = 'zlib')
        return ld

    def ld_heatmap(self, corrmat, out):
        """Draw ``corrmat`` as a square heatmap on a [-1, 1] colour scale and save it to ``out``."""
        import seaborn as sns
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots()
        sns.heatmap(corrmat, ax = ax, vmin=-1, vmax=1, square=True, xticklabels = False, yticklabels = False)
        fig.savefig(out, dpi = 500)

    def generate_betamix(self, nbeta, sigmas, pis, pi0 = 0):
        r'''Draw effect sizes from a point-mass / Gaussian mixture.

        beta ~ \pi_0 \delta_0 + (1 - \pi_0) \sum_i \pi_i N(0, \sigma_i)

        Parameters
        ----------
        nbeta : int
            Number of effect sizes to draw.
        sigmas : 1-D sequence of length len(pis), or len(pis) x len(pis) matrix
            A 1-D input is placed on the diagonal of the covariance matrix, so
            its entries act as component *variances*. A 2-D input is used as
            the covariance matrix directly.
        pis : sequence of float
            Mixture weights of the Gaussian components (passed to
            ``np.random.multinomial``, so they should sum to 1).
        pi0 : float
            Probability that an effect is exactly zero.

        Returns
        -------
        1-D ndarray of length ``nbeta``.
        '''
        sigmas = np.asarray(sigmas, dtype = float)
        if sigmas.ndim == 1:
            sigmas = np.diag(sigmas)
        assert (len(pis), len(pis)) == sigmas.shape, \
            "sigmas must have one entry (or one row and column) per mixture component"
        # One-hot row per effect: which mixture component it is drawn from.
        masks = np.random.multinomial(1, pis, size = nbeta)
        # A candidate draw from every component for every effect ...
        mix = np.random.multivariate_normal([0] * len(pis), sigmas, nbeta)
        # ... keep the selected component, then zero out a pi0 fraction (the null point mass).
        return np.sum(mix * masks, axis = 1) * np.random.binomial(1, 1 - pi0, nbeta)

    def generate_y(self, X, beta, sigma):
        """Simulate phenotypes ``y = X^T beta + e`` with ``e ~ N(0, sigma^2)``.

        Parameters
        ----------
        X : array-like, shape (n_variants, n_samples)
        beta : array-like with n_variants entries (flattened to 1-D)
        sigma : float
            Standard deviation of the residual noise.

        Returns
        -------
        1-D ndarray of length n_samples.
        """
        # Flatten so a column vector cannot broadcast against the 1-D noise
        # into an (n_samples, n_samples) matrix.
        beta = np.asarray(beta).reshape(-1)
        assert X.shape[0] == len(beta)
        return np.dot(X.T, beta) + np.random.normal(0, sigma, X.shape[1])

## Load data

In [103]:
# Point the simulator at the toy genotype store and list up to five of its tables.
ms = MMSimulator(genotype_file = "/home/gaow/Documents/ToyExample/TY.genotype.h5")
tables = ms.get_genes(limit = 5)

## Compute and save LD

In [29]:
# One LD matrix per selected table, also written to an HDF5 file on disk.
ld = ms.get_ld(tables = tables, save_to = "/home/gaow/Documents/ToyExample/TY.ld.h5")

## View and select LD structure
Take gene `ENSG00000264247` for example:

In [35]:
# Peek at the first five rows of the LD matrix for the example gene.
ld['/chr18/ENSG00000264247'].head(n = 5)

Unnamed: 0,18:73591857:T:C,18:73591871:C:T,18:73592510:G:A,18:73592634:A:T,18:73592876:C:T,18:73593143:G:A,18:73593306:C:T,18:73593390:C:G,18:73593520:A:G,18:73594308:C:T,...,18:75590669:C:T,18:75590671:C:T,18:75590985:A:T,18:75591080:C:T,18:75591083:G:C,18:75591259:A:C,18:75591309:T:C,18:75591343:G:A,18:75591593:T:C,18:75591671:C:A
18:73591857:T:C,1.0,-0.457275,-0.00745,1.0,-0.011086,-0.005173,-0.45166,-0.45752,-0.148438,0.793945,...,-0.032593,0.000545,-0.016373,-0.01075,-0.003101,-0.010529,0.004395,-0.002008,0.000545,0.000545
18:73591871:C:T,-0.457275,1.0,0.024582,-0.457275,0.021378,0.024582,0.990723,0.99707,-0.175659,-0.554199,...,-0.00247,-0.001062,-0.000659,-0.000128,0.000181,-0.000118,-0.001764,0.000423,-0.001062,-0.001062
18:73592510:G:A,-0.00745,0.024582,1.0,-0.00745,0.000168,-0.000668,0.024689,0.024643,-0.008102,-0.009346,...,-0.000218,-4.8e-05,-0.001693,-0.002493,-0.001722,-0.002516,-0.00055,-7.7e-05,-4.8e-05,-4.8e-05
18:73592634:A:T,1.0,-0.457275,-0.00745,1.0,-0.011086,-0.005173,-0.45166,-0.45752,-0.148438,0.793945,...,-0.032593,0.000545,-0.016373,-0.01075,-0.003101,-0.010529,0.004395,-0.002008,0.000545,0.000545
18:73592876:C:T,-0.011086,0.021378,0.000168,-0.011086,1.0,0.000168,0.021408,0.021469,-0.00285,-0.009285,...,0.000102,-1.7e-05,-0.000359,-0.000344,-0.000275,-0.00036,-0.001019,8e-06,-1.7e-05,-1.7e-05


In [87]:
# Heatmap of the leading 1000 x 1000 corner of the LD matrix, saved as a PNG.
ms.ld_heatmap(
    corrmat = ld['/chr18/ENSG00000264247'].iloc[:1000, :1000],
    out = 'img/ENSG00000264247.ld.png',
)

![](img/ENSG00000264247.ld.png)

## Simulating effect size
Now let's digress to effect size simulation. Effect size refers to $\beta$ in the linear model $Y = X \beta + E$ where for simplicity we assume $E_{ij} \sim N(0,1)$. We sample $\beta$ from a mixture of Gaussian distributions and a point mass at zero.

Here I start with a simple three-component mixture, and a point mass of 95% at the null:

In [93]:
# Fix the RNG state so the simulated effect sizes are identical on every
# Restart & Run All; change the seed to obtain a different replicate.
np.random.seed(20170101)

# One effect size per variant in the example gene's LD matrix.
nbeta = ld['/chr18/ENSG00000264247'].shape[0]
# Weights of the three Gaussian components (they sum to 1).
pis = [0.25, 0.3, 0.45]
# Probability that an effect is exactly zero (the point mass at the null).
pi0 = 0.95
# Per-component spread parameters passed to generate_betamix.
sigmas = [1, 0.4, 3]
beta = ms.generate_betamix(nbeta=nbeta,pi0=pi0,pis=pis,sigmas=sigmas)

## Swap big effect size to most LD-convoluted SNPs

In [107]:
# Genotypes of the example gene, then a phenotype simulated from the effect sizes drawn above.
X = ms.get_X(table = '/chr18/ENSG00000264247')
y = ms.generate_y(X = X, beta = beta, sigma = 1)

array([ 39.65906218,  35.49526935,  11.47496833,  44.63175627,
        16.28701958,  14.86007608,  14.48171811,  55.06798035,
        12.96719083,  24.83519252,  15.78988186,  20.83907165,
        23.7154485 ,  22.32831885,   3.43386601,  13.19188103,
        23.8614422 ,  22.3767278 ,  17.4851875 ,  18.69392984,
        20.64893659,  48.00630834,  16.55713499,  27.67218895,
        48.18375822,  29.0869412 ,  13.24114659,  39.1931791 ,
        25.97746757,  13.68004058,  24.12629591,  23.7553819 ,
        25.27740606,  17.56743665,  36.86839409,  29.59898613,
        30.62425701,  14.7721995 ,  16.46814296,  33.82811882,
        11.42287229,  49.80764191,  -2.29610058,  40.82723556,
         1.19427127,  18.30384807,  24.97877197,   9.08195473,
        18.75567954,  29.37102348,  22.51197147,   4.74334803,
        16.71737032,  23.62048838,  32.64531544,  23.93122014,
         9.3182069 ,  13.75309363,   8.45999621,  40.94804988,
        15.86559905,  36.49882624, -26.12444483,  25.86