## Simulation of quantitative phenotype given genotypes


In [137]:
import json
import os
import random

import numpy as np
import pandas as pd

class PhenotypeSimulator:
    """Simulate quantitative phenotypes from genotype tables stored in an HDF5 file.

    Each table is used as a matrix whose rows are variants and whose columns
    are samples: effect sizes are matched to rows and phenotypes are indexed
    by the column labels.

    Attributes:
        gfile: path of the genotype HDF5 file.
        phenotype: simulated phenotypes, one single-row DataFrame per id.
        beta: effect sizes used for each id, stored as plain lists.
        pid: id under which the next simulation is stored (see ``set_id``).
    """

    def __init__(self, genotype_file):
        self.gfile = genotype_file
        self.phenotype = {}
        self.beta = {}
        self.pid = None

    def get_genes(self, limit = 5):
        """Return the table keys of the genotype file.

        Args:
            limit: maximum number of keys to return; ``None`` returns all.

        Returns:
            list of HDF5 keys, truncated to the first ``limit`` entries.
        """
        # Read-only and inside a context manager: the handle is always closed
        # and a mistyped path raises instead of creating an empty file.
        with pd.HDFStore(self.gfile, mode = 'r') as store:
            keys = store.keys()
        if limit is not None and len(keys) > limit:
            keys = keys[:limit]
        return keys

    def get_X(self, table):
        """Load one genotype table from the HDF5 file as a DataFrame."""
        return pd.read_hdf(self.gfile, key = table)

    def get_ld(self, tables, save_to = None):
        """Signed r^2 LD matrices between the rows of each genotype table.

        Args:
            tables: iterable of HDF5 keys to process.
            save_to: optional output HDF5 path. An existing file at that path
                is removed before the matrices are written, one per key.

        Returns:
            dict mapping each key to a float16 DataFrame holding
            ``sign(r) * r^2`` for the Pearson correlation ``r``.
        """
        ld = {}
        for table in tables:
            corr = self.get_X(table).transpose().corr(method = 'pearson')
            # Down-cast table by table so only one float64 matrix is alive at a time.
            ld[table] = (np.power(corr, 2) * np.sign(corr)).astype(np.float16)
        if save_to is not None:
            if os.path.isfile(save_to):
                os.remove(save_to)
            for key, value in ld.items():
                value.to_hdf(save_to, key = key, mode = 'a', complevel = 9, complib = 'zlib')
        return ld

    def ld_heatmap(self, corrmat, out):
        """Draw ``corrmat`` as a heatmap on a [-1, 1] colour scale and save it to ``out``."""
        import seaborn as sns
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots()
        try:
            sns.heatmap(corrmat, ax = ax, vmin=-1, vmax=1, square=True, xticklabels = False, yticklabels = False)
            fig.savefig(out, dpi = 500)
        finally:
            # Release the figure; repeated calls would otherwise accumulate open figures.
            plt.close(fig)

    def generate_betamix(self, nbeta, sigmas, pis, pi0 = 0):
        r'''Draw effect sizes from a point-normal mixture.

        beta ~ \pi_0 \delta_0 + (1 - \pi_0) \sum_i \pi_i N(0, \sigma_i)

        Args:
            nbeta: number of effects to draw.
            sigmas: one value per mixture component (list, tuple or 1-D array),
                or a len(pis) x len(pis) matrix. It is used as the covariance
                of the joint normal draw, so 1-D entries act as variances.
            pis: mixture weights of the non-null components.
            pi0: probability that an effect is set to exactly zero.

        Returns:
            1-D numpy array of length ``nbeta``.
        '''
        sigmas = np.asarray(sigmas, dtype = float)
        if sigmas.ndim == 1:
            sigmas = np.diag(sigmas)
        assert (len(pis), len(pis)) == sigmas.shape
        # One-hot component membership for every effect.
        masks = np.random.multinomial(1, pis, size = nbeta)
        mix = np.random.multivariate_normal([0] * len(pis), sigmas, nbeta)
        return np.sum(mix * masks, axis = 1) * np.random.binomial(1, 1 - pi0, nbeta)

    def generate_y(self, X, beta, sigma, force = False, name = None):
        """Simulate y = X^T beta + e with e ~ N(0, sigma^2) and store it under the current id.

        Args:
            X: genotype DataFrame with one row per effect in ``beta``.
            beta: effect sizes, one per row of ``X``.
            sigma: standard deviation of the residual noise.
            force: overwrite a phenotype already stored under the id.
            name: optional id; when given it replaces the id set by ``set_id``.

        Returns:
            single-row DataFrame indexed by the id, with one column per column of ``X``.
        """
        if name is not None:
            self.pid = name
        if self.pid in self.phenotype and force is not True:
            print('Name "{}" already exists. Use "force = True" to overwrite it'.format(self.pid))
            return self.phenotype[self.pid]
        # Flatten so a column vector cannot broadcast against the noise vector.
        beta = np.asarray(beta).ravel()
        assert X.shape[0] == len(beta)
        self.beta[self.pid] = beta.tolist()
        y = np.dot(X.T, beta) + np.random.normal(0, sigma, X.shape[1])
        y = pd.DataFrame(data = y, columns = [self.pid], index = X.columns).transpose()
        self.phenotype[self.pid] = y
        return y

    def select_convoluted_snps(self, ld, cutoff1 = 0.9, cutoff2 = 10, cutoff3 = 0.01):
        '''Based on the LD matrix, select SNPs in strong LD with many other SNPs
        yet independent between themselves.

        Args:
            ld: square LD DataFrame with identical row and column labels.
            cutoff1: |LD| above which two SNPs count as being in strong LD.
            cutoff2: a SNP is a candidate when its number of strong-LD entries
                (its own diagonal entry included) exceeds this value.
            cutoff3: candidates whose |LD| with a higher-ranked, retained
                candidate exceeds this value are dropped.

        Returns:
            list of integer positions in ``ld`` of the retained SNPs, ordered
            from most to least strong-LD entries.
        '''
        print('Count strong LD')
        # Count entries rather than summing signed LD values, so negative LD
        # does not cancel positive LD (and float16 sums cannot lose precision).
        strong_ld_count = (np.absolute(ld) > cutoff1).sum(axis = 0)
        strong_ld_count = strong_ld_count.sort_values(ascending = False, kind = 'mergesort')
        strong_ld_count = strong_ld_count[strong_ld_count > cutoff2]
        print('Filter by LD')
        candidates = list(strong_ld_count.index)
        exclude = set()
        for x in candidates:
            if x in exclude:
                continue
            column = ld[x]
            for y in candidates:
                if y in exclude or y == x:
                    continue
                if np.absolute(column[y]) > cutoff3:
                    exclude.add(y)
        print('Done')
        # Report positions in the LD matrix (not in the filtered ranking) so
        # they line up with the rows of the genotype matrix and with beta.
        position = {label: i for i, label in enumerate(ld.columns)}
        return [position[x] for x in candidates if x not in exclude]

    def swap_beta(self, beta, strength_index):
        '''Give the largest-magnitude effects to the positions in strength_index.

        The k = len(strength_index) largest |beta| go to those positions in
        the order given; the remaining effects are shuffled over the other
        positions.

        Args:
            beta: iterable of effect sizes.
            strength_index: distinct positions that receive the top effects.

        Returns:
            numpy array with the same length as ``beta``.

        Raises:
            IndexError: more positions were requested than there are effects.
            ValueError: ``strength_index`` contains duplicates.
        '''
        ranked = sorted(beta, key=abs, reverse=True)
        n_top = len(strength_index)
        if n_top > len(ranked):
            raise IndexError('strength_index has more entries than beta')
        chosen = set(strength_index)
        if len(chosen) != n_top:
            raise ValueError('strength_index contains duplicated positions')
        nb = [0] * len(ranked)
        for item, value in zip(strength_index, ranked[:n_top]):
            nb[item] = value
        rest = ranked[n_top:]
        random.shuffle(rest)
        leftovers = iter(rest)
        for idx in range(len(nb)):
            if idx not in chosen:
                nb[idx] = next(leftovers)
        return np.array(nb)

    def set_id(self, name):
        """Set the id under which the next simulated phenotype and its effects are stored."""
        self.pid = name

## Load data

In [138]:
# Point the simulator at the genotype HDF5 file and list its tables
# (get_genes returns at most 5 keys by default).
ms = PhenotypeSimulator("/home/gaow/Documents/GTEx/ToyExample/TY.genotype.h5")
tables = ms.get_genes()

## Compute and save LD

In [6]:
# Compute the signed r^2 LD matrix of every selected table and write them to TY.ld.h5
# (get_ld removes an existing file at that path before writing).
ld = ms.get_ld(tables, save_to = "/home/gaow/Documents/GTEx/ToyExample/TY.ld.h5")

## Putting all together
If you just want to get simulated data without knowing the details you can run the following code and find the output `/home/gaow/Documents/GTEx/ToyExample/TY.expr_simulated.h5`. Otherwise you should read on for more details.

In [139]:
# Seed both RNGs used by the simulator (numpy for the draws, `random` for the
# shuffle in swap_beta) so the simulated data can be reproduced.
SEED = 1
np.random.seed(SEED)
random.seed(SEED)
# Directory that receives the simulated expression and effect-size files.
DATA_DIR = "/home/gaow/Documents/GTEx/ToyExample"
pis = [0.25, 0.3, 0.45]
pi0 = 0.98
sigmas = [1, 0.4, 3]
for table in tables:
    # Results are stored under the last component of the table key.
    ms.set_id(os.path.basename(table))
    nbeta = ld[table].shape[0]
    beta = ms.generate_betamix(nbeta=nbeta,pi0=pi0,pis=pis,sigmas=sigmas)
    strong_snps_idx = ms.select_convoluted_snps(ld[table])
    beta = ms.swap_beta(beta, strong_snps_idx)
    X = ms.get_X(table=table)
    y = ms.generate_y(beta=beta,sigma=1, X=X)
# One row per simulated phenotype; concat gets a real list rather than a dict view.
pd.concat(list(ms.phenotype.values())).to_hdf(os.path.join(DATA_DIR, 'TY.expr_simulated.h5'), key = '/simulated', mode = 'a', complevel = 9, complib = 'zlib')
with open(os.path.join(DATA_DIR, "TY.beta_simulated.json"), 'w') as fp:
    json.dump(ms.beta, fp)

Count strong LD
Filter by LD
Done
Count strong LD
Filter by LD
Done
Count strong LD
Filter by LD
Done


## View and select LD structure
Take gene `ENSG00000264247` for example:

In [35]:
# Preview the first rows of the LD matrix for this gene.
ld['/chr18/ENSG00000264247'].head()

Unnamed: 0,18:73591857:T:C,18:73591871:C:T,18:73592510:G:A,18:73592634:A:T,18:73592876:C:T,18:73593143:G:A,18:73593306:C:T,18:73593390:C:G,18:73593520:A:G,18:73594308:C:T,...,18:75590669:C:T,18:75590671:C:T,18:75590985:A:T,18:75591080:C:T,18:75591083:G:C,18:75591259:A:C,18:75591309:T:C,18:75591343:G:A,18:75591593:T:C,18:75591671:C:A
18:73591857:T:C,1.0,-0.457275,-0.00745,1.0,-0.011086,-0.005173,-0.45166,-0.45752,-0.148438,0.793945,...,-0.032593,0.000545,-0.016373,-0.01075,-0.003101,-0.010529,0.004395,-0.002008,0.000545,0.000545
18:73591871:C:T,-0.457275,1.0,0.024582,-0.457275,0.021378,0.024582,0.990723,0.99707,-0.175659,-0.554199,...,-0.00247,-0.001062,-0.000659,-0.000128,0.000181,-0.000118,-0.001764,0.000423,-0.001062,-0.001062
18:73592510:G:A,-0.00745,0.024582,1.0,-0.00745,0.000168,-0.000668,0.024689,0.024643,-0.008102,-0.009346,...,-0.000218,-4.8e-05,-0.001693,-0.002493,-0.001722,-0.002516,-0.00055,-7.7e-05,-4.8e-05,-4.8e-05
18:73592634:A:T,1.0,-0.457275,-0.00745,1.0,-0.011086,-0.005173,-0.45166,-0.45752,-0.148438,0.793945,...,-0.032593,0.000545,-0.016373,-0.01075,-0.003101,-0.010529,0.004395,-0.002008,0.000545,0.000545
18:73592876:C:T,-0.011086,0.021378,0.000168,-0.011086,1.0,0.000168,0.021408,0.021469,-0.00285,-0.009285,...,0.000102,-1.7e-05,-0.000359,-0.000344,-0.000275,-0.00036,-0.001019,8e-06,-1.7e-05,-1.7e-05


In [87]:
# Plot the first 1000 x 1000 block of the LD matrix and save it as a PNG.
# NOTE(review): presumably the img/ directory must already exist — confirm before running.
ms.ld_heatmap(ld['/chr18/ENSG00000264247'].iloc[:1000,:1000], 'img/ENSG00000264247.ld.png')

![](img/ENSG00000264247.ld.png)

## Simulating effect size
Now let's digress to effect size simulation. Effect size refers to $\beta$ in the linear model $Y = X \beta + E$ where for simplicity we assume $E_{ij} \sim N(0,1)$. We sample $\beta$ from a mixture of Gaussian distributions and a point mass at zero.

Here I start with a simple 3-component mixture, and a point mass of 98% at the null:

In [105]:
# One effect size per row of the LD matrix.
nbeta = ld['/chr18/ENSG00000264247'].shape[0]
# Mixture weights of the three non-null components.
pis = [0.25, 0.3, 0.45]
# Probability that an effect is set to exactly zero.
pi0 = 0.98
# One value per component; generate_betamix puts a list on the diagonal of the covariance matrix.
sigmas = [1, 0.4, 3]
beta = ms.generate_betamix(nbeta=nbeta,pi0=pi0,pis=pis,sigmas=sigmas)

## Swap big effect size to most LD-convoluted SNPs
To better illustrate the potential of mr-ash, I identify from the genotype matrix the SNPs that are most LD-convoluted and assign them the largest effect sizes. Specifically, I rank SNPs by the number of SNPs with which they are in strong LD (unsigned LD greater than 0.9), keep those for which this number exceeds 10, and then filter the top-ranked SNPs until no LD remains between them (unsigned LD at most 0.01, the default `cutoff3`). Then I swap the $\beta$s so that these SNPs have the largest effect sizes.

In [74]:
# Select SNPs in strong LD with many others but not with each other, using the default cutoffs.
strong_snps_idx = ms.select_convoluted_snps(ld['/chr18/ENSG00000264247'])

Count strong LD
Filter by LD
Done


In [106]:
# Move the largest-magnitude effects onto the selected indices; the remaining effects are shuffled.
beta = ms.swap_beta(beta, strong_snps_idx)

## Simulate phenotypes

In [122]:
X = ms.get_X(table='/chr18/ENSG00000264247')
# The phenotype id is set through set_id before simulating.
ms.set_id('ENSG00000264247')
# If a phenotype is already stored under this id, generate_y prints a notice and
# returns the stored one; pass force=True to simulate again.
y = ms.generate_y(beta=beta,sigma=1, X=X)

In [124]:
# Display the simulated phenotype: a single row with one column per sample.
y

iid,GTEX-1A3MV,GTEX-1A3MW,GTEX-1A3MX,GTEX-1A8FM,GTEX-1A8G6,GTEX-1A8G7,GTEX-1A32A,GTEX-1AMEY,GTEX-1AMFI,GTEX-1AX8Z,...,GTEX-ZYFC,GTEX-ZYFD,GTEX-ZYFG,GTEX-ZYT6,GTEX-ZYVF,GTEX-ZYW4,GTEX-ZYY3,GTEX-ZZ64,GTEX-ZZPT,GTEX-ZZPU
ENSG00000264247,-23.986548,-18.094246,-16.663314,-14.677913,-23.234465,-12.28504,-1.389807,-13.192255,-16.23599,-5.80662,...,4.487896,-15.573231,-12.584244,-12.163406,-1.492891,-33.267576,-4.070685,-26.442516,-5.318166,-5.144975
