# DAP on simulation results

In [1]:
import numpy as np
import pandas as pd
from pprint import pprint
# from pandasql import sqldf
from scipy import stats
# import pickle
import feather
from collections import Counter
from datetime import datetime

#### Data transform: $y \in \{0,1\}$ to $y \in \{0,t\}$

In [2]:
# The cited lifetime prevalence of schizophrenia is 4.0/1000 rather than 1%
# (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1140960/); 0.01 is used here
# as a first setting.
prevalence = 0.01
# Relative risk; typical values lie between 2 and 10.
RR = 2.5

std_normal = stats.norm(0, 1)
# Standard-normal quantile leaving `prevalence` in the upper tail.
t = std_normal.ppf(1 - prevalence)
# Prior effect size of beta: gap between `t` and the quantile obtained when the
# upper-tail mass is RR * prevalence.
shifted_quantile = std_normal.ppf(1 - RR * prevalence)
z_mu = t - shifted_quantile
print(t, z_mu)

2.32634787404 0.366383889501


In [3]:
# Build the gene -> chromosome table used for the prior file.
# NOTE(review): the pathway list is read from "../data/" while refGene is read
# from "data/" -- confirm both relative paths are intended.
cal_pthwy_genes = pd.read_table(
    "../data/calciumgeneset.txt",
    skiprows=2,
    header=None,
    names=["gene_name"],
)
ref_genes = pd.read_table(
    "data/refGene.txt.gz",
    compression="gzip",
    sep="\t",
    header=None,
    usecols=(2, 12),
    names=["chrom", "gene_name"],
)
# Keep a single (first) chromosome entry per gene name.
ref_genes = ref_genes.drop_duplicates(subset=("gene_name"))

calpath_gene_pos = pd.merge(ref_genes, cal_pthwy_genes, how="inner", on=["gene_name"])

# Pathway genes that have no match in refGene.
diff = set(cal_pthwy_genes["gene_name"].tolist()) - set(calpath_gene_pos["gene_name"].tolist())

# Append the two missing genes with hand-assigned chromosomes.
# NOTE(review): this assumes exactly two genes are missing, and which gene gets
# "chr2" vs "chr15" follows the iteration order of a set of strings, which is
# not guaranteed to be stable between interpreter runs -- verify the pairing.
missing_genes = list(diff)
calpath_gene_pos.loc[len(calpath_gene_pos)] = ["chr2", missing_genes[0]]
calpath_gene_pos.loc[len(calpath_gene_pos)] = ["chr15", missing_genes[1]]

In [4]:
# "<chrom>.<gene>" label for every pathway gene, built column-wise instead of
# formatting one row at a time.
calpath_gene_pos["chr.gene"] = (calpath_gene_pos["chrom"].astype(str) + "."
                                + calpath_gene_pos["gene_name"].astype(str))
# One Uniform(0, z_mu) draw per gene. A dedicated, seeded generator makes the
# priors identical on every Restart & Run All and leaves the global NumPy
# random state untouched.
prior_rng = np.random.RandomState(0)
calpath_gene_pos["prior.pr"] = prior_rng.uniform(0, z_mu, size=len(calpath_gene_pos))

In [5]:
# Aliases used by the DAP cells below: `beta` is the effect size written to the
# grid file, `y_tran` the value that replaces phenotype == 1.
beta, y_tran = z_mu, t

In [6]:
def run_dap(df, fileout, pthwy_genes, multiplier = 5, prefix = None, exec_path = None):
    '''Convert a pandas dataframe to DAP input files, run DAP, and save its output.

    Three space-delimited text files are written:
        - ``<prefix>.dat``: one ``pheno`` line built from ``df['phenotype']``,
          then one ``geno`` line per remaining column, labelled ``<chrom>.<gene>``
        - ``<prefix>.prior``: one ``<chrom>.<gene> <prior>`` line per gene
        - ``<prefix>.grid``: grid of effect sizes; omega^2 + phi^2 is what we care
          about. It is set to ``(0, beta)`` for now, as we only have one Y

    The command ``<exec_path> -d ... -g ... -t 8 -it 0.05 -prior ...`` is then run
    through the shell with its standard output redirected to ``fileout``. A
    timestamp is printed before and after.

    Relies on the notebook-level names ``ref_genes`` (columns ``gene_name`` and
    ``chrom``), ``beta`` and ``datetime``.

    Parameters
    ----------
    df : pandas.DataFrame
        A ``phenotype`` column plus one column per gene. Every gene column name
        must appear in ``ref_genes["gene_name"]``.
    fileout : str
        File that receives the executable's standard output.
    pthwy_genes : pandas.DataFrame
        Frame with a ``gene_name`` column listing the pathway genes.
    multiplier : float, default 5
        If greater than 1, each pathway gene gets prior ``multiplier / n_genes``
        and the remaining mass is split evenly over the other genes. Otherwise
        every gene gets the uniform prior ``1 / n_genes``.
    prefix : str, optional
        Path prefix of the three input files; defaults to ``/tmp/F<timestamp>``.
    exec_path : str, optional
        DAP executable; defaults to ``dap/dap``.

    Raises
    ------
    IndexError
        If a gene column of ``df`` has no entry in ``ref_genes`` (same exception
        type as the previous per-gene lookup, but raised before any file is
        written and naming the offending genes).
    '''
    import os
    import time

    print (str(datetime.now()))
    if prefix is None:
        prefix = "/tmp/F" + str(time.time())
    if exec_path is None:
        exec_path = 'dap/dap'

    # Gene columns are selected by name, not by position, so the gene count is
    # right wherever the phenotype column sits in the frame.
    genes = [col for col in df.columns.values if col != "phenotype"]
    n_genes = len(set(genes))

    # One dictionary lookup per gene instead of two boolean-mask scans of
    # ref_genes; keeping the first entry matches the old ``.tolist()[0]``.
    chrom_of = (ref_genes.drop_duplicates(subset="gene_name")
                .set_index("gene_name")["chrom"].to_dict())
    unknown = [gene for gene in genes if gene not in chrom_of]
    if unknown:
        raise IndexError("no chromosome found in ref_genes for: {}".format(
            ", ".join(map(str, unknown))))

    # Loop-invariant prior values, computed once.
    pathway = set(pthwy_genes["gene_name"])
    n_overlap_gene = len(pathway & set(genes))
    prior_causal = prior_noncausal = None
    if n_genes:
        if multiplier <= 1.0:
            prior_causal = prior_noncausal = 1 / n_genes
        else:
            prior_causal = multiplier / n_genes
            n_other = n_genes - n_overlap_gene
            # When every gene is in the pathway there is no non-causal prior to
            # compute (and the old formula divided by zero).
            if n_other > 0:
                prior_noncausal = (1 - prior_causal * n_overlap_gene) / n_other
                if prior_noncausal < 0:
                    print ("WARNING: multiplier {} gives a negative non-pathway prior ({})".format(
                        multiplier, prior_noncausal))

    # First item in "dat" is the phenotype for all samples (rows of the frame).
    dat = [['pheno', 'trait', 'chicago'] + [str(x) for x in df['phenotype']]]
    prior = []
    grid = [(0, beta)]
    for item in genes:
        label = '{}.{}'.format(chrom_of[item], item)
        dat.append(['geno', label, 'chicago'] + [str(x) for x in df[item]])
        prior_pr = prior_causal if item in pathway else prior_noncausal
        # Always stringify: ' '.join() below rejects floats.
        prior.append([label, str(prior_pr)])

    with open(prefix + '.dat', 'w') as f:
        f.write('\n'.join([' '.join(x) for x in dat]))
    with open(prefix + '.prior', 'w') as f:
        f.write('\n'.join([' '.join(x) for x in prior]))
    with open(prefix + '.grid', 'w') as f:
        f.write('\n'.join([' '.join(map(str, x)) for x in grid]))

    status = os.system("{0} -d {1}.dat -g {1}.grid -t 8 -it 0.05 -prior {1}.prior > {2}".format(exec_path, prefix, fileout))
    if status != 0:
        # Surface failures instead of silently leaving an empty/partial output.
        print ("WARNING: '{}' exited with status {}; '{}' may be incomplete".format(
            exec_path, status, fileout))
    print (str(datetime.now()))

In [7]:
filename = "data/calcium_pathway_N4000.data.feather"
fileout = "data/calcium_pathway_N4000.data.dap"

# filename = "data/del_test_data.feather"
# fileout = "data/del_test_data.dap"

input_file = feather.read_dataframe(filename)
# Recode the binary phenotype onto {0, y_tran}: rows with phenotype == 1 become
# y_tran, everything else 0. Done on the single column; a row-wise
# DataFrame.apply materialises every sample across all gene columns just to
# read one value.
input_file["phenotype"] = np.where(input_file["phenotype"] == 1.0, y_tran, 0)

run_dap(input_file, fileout, cal_pthwy_genes)

2017-08-10 17:49:34.493148
2017-08-10 18:19:40.418724


In [None]:
# Sanity test: run DAP on a simulated data set and check whether the genes with
# high posterior resemble the calcium pathway genes. If they do, the simulation
# has a problem.
filename1 = "data/del_sim_shape5_scale1_p0.01_N4000_M200_12.data.feather"
fileout1 = "data/del_sim_shape5_scale1_p0.01_N4000_M200_12.test.dap"
input_file1 = feather.read_dataframe(filename1)
# Recode phenotype == 1 to y_tran and everything else to 0, column-wise rather
# than through a row-wise DataFrame.apply over the whole frame.
input_file1["phenotype"] = np.where(input_file1["phenotype"] == 1.0, y_tran, 0)
run_dap(input_file1, fileout1, cal_pthwy_genes)

2017-08-10 19:22:02.873925


In [9]:
# calpath_gene_pos.to_csv("test.csv", sep="\t")
# Show every gene name in the pathway list.
pathway_gene_names = cal_pthwy_genes["gene_name"].tolist()
print(pathway_gene_names)

['ADCY1', 'ADCY2', 'ADCY3', 'ADCY4', 'ADCY7', 'ADCY8', 'ADCY9', 'ADORA2A', 'ADORA2B', 'ADRA1A', 'ADRA1B', 'ADRA1D', 'ADRB1', 'ADRB2', 'ADRB3', 'AGTR1', 'ATP2A1', 'ATP2A2', 'ATP2A3', 'ATP2B1', 'ATP2B2', 'ATP2B3', 'ATP2B4', 'AVPR1A', 'AVPR1B', 'BDKRB1', 'BDKRB2', 'BST1', 'CACNA1A', 'CACNA1B', 'CACNA1C', 'CACNA1D', 'CACNA1E', 'CACNA1F', 'CACNA1G', 'CACNA1H', 'CACNA1I', 'CACNA1S', 'CALM1', 'CALM2', 'CALM3', 'CALML3', 'CALML5', 'CALML6', 'CAMK2A', 'CAMK2B', 'CAMK2D', 'CAMK2G', 'CAMK4', 'CCKAR', 'CCKBR', 'CD38', 'CHP', 'CHP2', 'CHRM1', 'CHRM2', 'CHRM3', 'CHRM5', 'CHRNA7', 'CYSLTR1', 'CYSLTR2', 'DRD1', 'DRD5', 'EDNRA', 'EDNRB', 'EGFR', 'ERBB2', 'ERBB3', 'ERBB4', 'F2R', 'GNA11', 'GNA14', 'GNA15', 'GNAL', 'GNAQ', 'GNAS', 'GRIN1', 'GRIN2A', 'GRIN2C', 'GRIN2D', 'GRM1', 'GRM5', 'GRPR', 'HRH1', 'HRH2', 'HTR2A', 'HTR2B', 'HTR2C', 'HTR4', 'HTR5A', 'HTR6', 'HTR7', 'ITPKA', 'ITPKB', 'ITPR1', 'ITPR2', 'ITPR3', 'LHCGR', 'LOC729317', 'LTB4R2', 'MYLK', 'MYLK2', 'MYLK3', 'NOS1', 'NOS2', 'NOS3', 'NTSR1', 'OX

In [None]:
# chrom
# chrom = ["chr{}".format(i) for i in list(range(1,23))+["X"]]
# print ((chrom))

In [None]:
# [I 15:34:54.151 NotebookApp] Accepting one-time-token-authenticated connection from ::1
# [I 15:34:54.814 NotebookApp] Kernel started: 60079e22-d81d-4f83-a293-c5750b5456a1
# [I 15:36:54.808 NotebookApp] Saving file at /20170726_Dap_on_simulation.ipynb
# [W 15:42:01.170 NotebookApp] WebSocket ping timeout after 92790 ms.
# [I 15:44:57.820 NotebookApp] KernelRestarter: restarting kernel (1/5)