# DAP on simulation results

In [1]:
import numpy as np
import pandas as pd
from pprint import pprint
# from pandasql import sqldf
from scipy import stats
# import pickle
import feather
from collections import Counter
from datetime import datetime
from utils import run_dap, load_data
import os
# Prefix under which the results for this simulation are written.
fileout = "data/calcium_pathway_N40_p0.01.data"
# Input file with the simulated data blocks (passed to load_data).
filename = 'data/calcium_pathway_N40_p0.01.data.blocks.pkl'
print(fileout)

data/calcium_pathway_N40_p0.01.data


## Data transform: $y \in \{0,1\}$ to $y \in \{0,t\}$

In [2]:
# Load the simulation and read back the prevalence it was generated with.
data = load_data(filename)
prevalence = data['debug']['args']['prevalence']

# The lifetime prevalence of schizophrenia is 4.0/1000, not 1%, but 0.01 is used as a first setting.
# According to https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1140960/
# relative risk is normally between 2 and 10.
RR = 2.5

std_normal = stats.norm(0, 1)
# t: standard-normal quantile that leaves `prevalence` in the upper tail.
t = std_normal.ppf(1 - prevalence)
# z_mu: distance between t and the quantile for RR * prevalence;
# it is the effect size of beta used as the prior.
z_mu = t - std_normal.ppf(1 - RR * prevalence)
print(t, z_mu)

2.32634787404 0.366383889501


In [4]:
# create prior file
# Gene symbols of the calcium pathway gene set (the first two lines of the file are skipped).
cal_pthwy_genes = pd.read_table("../data/calciumgeneset.txt", skiprows=2, header=None, names=["gene_name"])
# Chromosome and gene symbol (columns 2 and 12 of refGene), reduced to one row per gene symbol.
ref_genes = pd.read_table("../data/refGene.txt.gz", compression="gzip", sep="\t", header=None,
                          usecols=(2, 12), names=["chrom", "gene_name"]).drop_duplicates(subset="gene_name")
# Pathway genes that have a refGene entry, with their chromosome.
calpath_gene_pos = pd.merge(ref_genes, cal_pthwy_genes, how="inner", on=["gene_name"])

# Pathway genes with no refGene entry. They are sorted because iteration order of a set of
# strings changes between kernels (hash randomization), which previously made the
# gene -> chromosome assignment below arbitrary.
diff = sorted(set(cal_pthwy_genes["gene_name"]) - set(calpath_gene_pos["gene_name"]))
missing_chroms = ["chr2", "chr15"]
if len(diff) != len(missing_chroms):
    raise ValueError("expected exactly {} pathway genes missing from refGene, found {}: {}"
                     .format(len(missing_chroms), len(diff), diff))
# add the two missing genes and their chromosome in the dataframe
# NOTE(review): chromosomes are paired with the missing genes in alphabetical order of the
# gene symbol; confirm this matches the real locations (ideally replace with an explicit
# {gene_name: chrom} mapping) -- the original pairing depended on set order.
calpath_gene_pos = pd.concat(
    [calpath_gene_pos, pd.DataFrame({"chrom": missing_chroms, "gene_name": diff})],
    ignore_index=True,
)

In [5]:
# "chr.gene" is "<chrom>.<gene_name>", built column-wise instead of with a row-wise apply.
calpath_gene_pos["chr.gene"] = (
    calpath_gene_pos["chrom"].astype(str) + "." + calpath_gene_pos["gene_name"].astype(str)
)

# "prior.pr" is one draw per gene from Uniform(0, z_mu). A dedicated, seeded generator makes
# the column reproducible under Restart & Run All; change PRIOR_SEED for a different draw.
PRIOR_SEED = 0
prior_rng = np.random.RandomState(PRIOR_SEED)
calpath_gene_pos["prior.pr"] = prior_rng.uniform(0, z_mu, size=len(calpath_gene_pos))

In [6]:
# Names used by the DAP run: beta is the effect size z_mu (upper bound of the grid),
# y_tran is the threshold t that replaces phenotype value 1.
beta, y_tran = z_mu, t

## Run DAP

In [7]:
dap_method = 'dap-g'
multiplier = 10
# The result path gets its own name instead of overwriting `fileout`, so re-running this
# cell does not stack another '_multiplier...' suffix onto the path.
dap_fileout = fileout + '_multiplier{}.{}'.format(multiplier, dap_method)

# clean it up, very important!
if os.path.isfile(dap_fileout):
    os.remove(dap_fileout)
# Remove leftover temporary files matching /tmp/F*{dat,prior,grid}. This is done in Python
# because os.system() runs the command through /bin/sh, which is not guaranteed to perform
# brace expansion (dash does not), in which case `rm -f` silently removed nothing.
for tmp_name in os.listdir('/tmp'):
    if tmp_name.startswith('F') and tmp_name.endswith(('dat', 'prior', 'grid')):
        try:
            os.remove(os.path.join('/tmp', tmp_name))
        except OSError:
            # Best effort, like `rm -f`: skip directories and files owned by other users.
            pass

n_jobs = len(data['data'])
for idx, df in enumerate(data['data']):
    # Recode phenotype 1 -> y_tran, everything else -> 0, on a copy. Overwriting the column
    # of the loaded frame would leave no 1.0 values for a second run of this cell, which
    # would then set every phenotype to 0.
    df_dap = df.assign(phenotype=np.where(df["phenotype"] == 1.0, y_tran, 0))
    # NOTE(review): the gene tables are passed here, not calpath_gene_pos with its
    # "prior.pr" column -- confirm that run_dap builds the prior it needs from these.
    times = run_dap(df_dap, dap_fileout, cal_pthwy_genes, ref_genes, grid=[(0, beta)],
                    multiplier=multiplier, ncpu=4, dry_run=False, dap_method=dap_method)
    print('Job {2}/{3}, elapsed {0:.2f}s prepare & {1:.2f}s DAP'
          .format(times[1] - times[0], times[2] - times[1], idx + 1, n_jobs))

Job 1/132, elapsed 0.01s prepare & 0.01s DAP
Job 2/132, elapsed 0.03s prepare & 0.01s DAP
Job 3/132, elapsed 0.04s prepare & 0.00s DAP
Job 4/132, elapsed 0.06s prepare & 0.00s DAP
Job 5/132, elapsed 0.03s prepare & 0.00s DAP
Job 6/132, elapsed 0.02s prepare & 0.00s DAP
Job 7/132, elapsed 0.02s prepare & 0.01s DAP
Job 8/132, elapsed 0.02s prepare & 0.01s DAP
Job 9/132, elapsed 0.01s prepare & 0.00s DAP
Job 10/132, elapsed 0.02s prepare & 0.00s DAP
Job 11/132, elapsed 0.11s prepare & 0.00s DAP
Job 12/132, elapsed 0.01s prepare & 0.00s DAP
Job 13/132, elapsed 0.03s prepare & 0.01s DAP
Job 14/132, elapsed 0.01s prepare & 0.01s DAP
Job 15/132, elapsed 0.01s prepare & 0.01s DAP
Job 16/132, elapsed 0.06s prepare & 0.00s DAP
Job 17/132, elapsed 0.01s prepare & 0.00s DAP
Job 18/132, elapsed 0.01s prepare & 0.00s DAP
Job 19/132, elapsed 0.01s prepare & 0.00s DAP
Job 20/132, elapsed 0.05s prepare & 0.01s DAP
Job 21/132, elapsed 0.01s prepare & 0.01s DAP
Job 22/132, elapsed 0.01s prepare & 0.00s D

In [1]:
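# Reference only (not executed): example of invoking dap directly from the shell.
# Note that it writes to a different output file (calcium_pathway_N4000_shape25_nomask) than the run above.
# dap/dap -d /tmp/F1503014064.6944265.dat -g /tmp/F1503014064.6944265.grid -it 0.05 -prior /tmp/F1503014064.6944265.prior -t 2 > data/calcium_pathway_N4000_shape25_nomask.data.dap 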

In [9]:
# calpath_gene_pos.to_csv("test.csv", sep="\t")
# Print every gene symbol of the pathway gene set as a plain Python list.
pathway_gene_names = cal_pthwy_genes["gene_name"].tolist()
print(pathway_gene_names)

['ADCY1', 'ADCY2', 'ADCY3', 'ADCY4', 'ADCY7', 'ADCY8', 'ADCY9', 'ADORA2A', 'ADORA2B', 'ADRA1A', 'ADRA1B', 'ADRA1D', 'ADRB1', 'ADRB2', 'ADRB3', 'AGTR1', 'ATP2A1', 'ATP2A2', 'ATP2A3', 'ATP2B1', 'ATP2B2', 'ATP2B3', 'ATP2B4', 'AVPR1A', 'AVPR1B', 'BDKRB1', 'BDKRB2', 'BST1', 'CACNA1A', 'CACNA1B', 'CACNA1C', 'CACNA1D', 'CACNA1E', 'CACNA1F', 'CACNA1G', 'CACNA1H', 'CACNA1I', 'CACNA1S', 'CALM1', 'CALM2', 'CALM3', 'CALML3', 'CALML5', 'CALML6', 'CAMK2A', 'CAMK2B', 'CAMK2D', 'CAMK2G', 'CAMK4', 'CCKAR', 'CCKBR', 'CD38', 'CHP', 'CHP2', 'CHRM1', 'CHRM2', 'CHRM3', 'CHRM5', 'CHRNA7', 'CYSLTR1', 'CYSLTR2', 'DRD1', 'DRD5', 'EDNRA', 'EDNRB', 'EGFR', 'ERBB2', 'ERBB3', 'ERBB4', 'F2R', 'GNA11', 'GNA14', 'GNA15', 'GNAL', 'GNAQ', 'GNAS', 'GRIN1', 'GRIN2A', 'GRIN2C', 'GRIN2D', 'GRM1', 'GRM5', 'GRPR', 'HRH1', 'HRH2', 'HTR2A', 'HTR2B', 'HTR2C', 'HTR4', 'HTR5A', 'HTR6', 'HTR7', 'ITPKA', 'ITPKB', 'ITPR1', 'ITPR2', 'ITPR3', 'LHCGR', 'LOC729317', 'LTB4R2', 'MYLK', 'MYLK2', 'MYLK3', 'NOS1', 'NOS2', 'NOS3', 'NTSR1', 'OX

In [None]:
# chrom
# chrom = ["chr{}".format(i) for i in list(range(1,23))+["X"]]
# print ((chrom))

In [None]:
# [I 15:34:54.151 NotebookApp] Accepting one-time-token-authenticated connection from ::1
# [I 15:34:54.814 NotebookApp] Kernel started: 60079e22-d81d-4f83-a293-c5750b5456a1
# [I 15:36:54.808 NotebookApp] Saving file at /20170726_Dap_on_simulation.ipynb
# [W 15:42:01.170 NotebookApp] WebSocket ping timeout after 92790 ms.
# [I 15:44:57.820 NotebookApp] KernelRestarter: restarting kernel (1/5)