# Data transform: $y \in \{0,1\}$ to $y \in \{0,t\}$

In [1]:
import numpy as np
import pandas as pd
from pprint import pprint
from pandasql import sqldf
from scipy import stats
import pickle
import feather
from collections import Counter
from datetime import datetime

In [2]:
# Lifetime prevalence of schizophrenia is reported as 4.0/1000 rather than 1%
# (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1140960/); 0.01 is used as a first setting.
prevalence = 0.01
# Relative risk; normally somewhere between 2 and 10.
RR = 2.5
# Standard-normal quantile at 1 - prevalence; later used as the transformed case value.
t = stats.norm.ppf(1 - prevalence)
# Prior effect size of beta (z_mu): distance between that quantile and the
# quantile at 1 - RR * prevalence.
z_mu = t - stats.norm.ppf(1 - RR * prevalence)
print(t, z_mu)

2.32634787404 0.366383889501


In [3]:
# Build the pathway-gene table that the prior file is derived from.
# NOTE(review): the gene set is read from "../data/" while refGene is read from
# "data/" -- confirm that both relative paths are intended.
cal_pthwy_genes = pd.read_table("../data/calciumgeneset.txt", skiprows = 2, header = None, names = ["gene_name"])
# Only file columns 2 and 12 are kept (named "chrom" and "gene_name"), one row per gene name.
ref_genes = pd.read_table("data/refGene.txt.gz", compression="gzip", sep="\t", header = None,
                          usecols=(2,12), names=["chrom", "gene_name"]).drop_duplicates(subset="gene_name")
calpath_gene_pos = pd.merge(ref_genes, cal_pthwy_genes, how="inner", on=["gene_name"])
# Pathway genes that have no row in refGene.
diff = set(cal_pthwy_genes["gene_name"].tolist()) - set(calpath_gene_pos["gene_name"].tolist())
# A set has no stable iteration order across interpreter sessions, so indexing
# list(diff) could attach the chromosomes to the genes the other way round on a
# re-run. Sorting makes the assignment deterministic.
missing_genes = sorted(diff)
assert len(missing_genes) == 2, (
    "expected exactly two pathway genes missing from refGene, found {}: {}".format(
        len(missing_genes), missing_genes))
# add the two missing genes and their chromosome in the dataframe
# NOTE(review): confirm that the alphabetically first missing gene is the one on
# chr2 and the second the one on chr15.
calpath_gene_pos.loc[calpath_gene_pos.shape[0]] = ["chr2", missing_genes[0]]
calpath_gene_pos.loc[calpath_gene_pos.shape[0]] = ["chr15", missing_genes[1]]

In [4]:
# "<chrom>.<gene_name>" label for every pathway gene (vectorized string concatenation).
calpath_gene_pos["chr.gene"] = (calpath_gene_pos["chrom"].astype(str) + "."
                                + calpath_gene_pos["gene_name"].astype(str))
# One prior value per gene, drawn uniformly between 0 and z_mu. A dedicated,
# seeded generator is used so that the column (and any file written from it)
# is identical on every Restart & Run All; the seed value itself is arbitrary.
prior_rng = np.random.RandomState(0)
calpath_gene_pos["prior.pr"] = prior_rng.uniform(0, z_mu, size=len(calpath_gene_pos))
calpath_gene_pos

Unnamed: 0,chrom,gene_name,chr.gene,prior.pr
0,chr17,CACNA1G,chr17.CACNA1G,0.266285
1,chr2,PLCD4,chr2.PLCD4,0.107040
2,chr20,PLCB4,chr20.PLCB4,0.362211
3,chr4,CAMK2D,chr4.CAMK2D,0.140378
4,chr14,SLC8A3,chr14.SLC8A3,0.169377
5,chr8,PTK2B,chr8.PTK2B,0.210375
6,chr9,GNA14,chr9.GNA14,0.059547
7,chr8,VDAC3,chr8.VDAC3,0.157412
8,chr12,PDE1B,chr12.PDE1B,0.075162
9,chr12,CACNA1C,chr12.CACNA1C,0.256323


In [5]:
# beta: effect size placed in the DAP grid; y_tran: value that replaces phenotype == 1.
beta, y_tran = z_mu, t

In [6]:
# Input feather file and the path that DAP's output is redirected to.
filename = "data/del_sim_scale3_shape1_N100_M100_999.feather"
fileout = "data/del_sim_scale3_shape1_N100_M100_999.dap"

input_file = feather.read_dataframe(filename)
# Recode the phenotype column: 1.0 becomes y_tran, every other value becomes 0.
input_file["phenotype"] = input_file["phenotype"].map(
    lambda status: y_tran if status == 1.0 else 0
)

def run_dap(df, fileout, multiplier = 2, prefix = None, exec_path = None):
    '''Write DAP input files for ``df`` and run the DAP executable on them.

    Three space-separated text files are written:
        - ``<prefix>.dat``: one ``pheno`` line holding the phenotype values,
          then one ``geno`` line per gene column
        - ``<prefix>.prior``: one ``<chrom>.<gene> <prior>`` line per gene column
        - ``<prefix>.grid``: a single ``0 <beta>`` line (effect-size grid; only
          one Y is analysed)

    Pathway genes (those listed in ``cal_pthwy_genes``) receive the prior
    ``multiplier / n_genes``; all remaining genes share what is left,
    ``(1 - prior_causal * n_overlap) / (n_genes - n_overlap)``.

    The function reads the notebook-level names ``ref_genes``,
    ``cal_pthwy_genes`` and ``beta``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "phenotype" column; every other column is treated as a
        gene whose name is looked up in ``ref_genes["gene_name"]``.
    fileout : str
        File that the DAP command's standard output is redirected to.
    multiplier : number, default 2
        Numerator of the prior given to pathway genes.
    prefix : str, optional
        Path prefix of the three input files. Defaults to "/tmp/F" followed by
        the current ``time.time()`` value.
    exec_path : str, optional
        DAP command. Defaults to 'dap/dap'.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If a gene column of ``df`` has no entry in ``ref_genes``.

    Warns
    -----
    RuntimeWarning
        If the DAP command finishes with a non-zero status.
    '''
    import os
    import shlex
    import time
    import warnings

    print (str(datetime.now()))
    if prefix is None:
        prefix = "/tmp/F" + str(time.time())
    if exec_path is None:
        exec_path = 'dap/dap'

    # Gene columns are selected by name, so the result no longer depends on
    # "phenotype" being the first column of the frame.
    genes = [col for col in df.columns.values if col != "phenotype"]
    n_genes = len(set(genes))
    pathway_genes = set(cal_pthwy_genes["gene_name"])
    # gene name -> chromosome, built once; the first refGene row wins for a name.
    chrom_of = (ref_genes.drop_duplicates(subset="gene_name")
                .set_index("gene_name")["chrom"].to_dict())

    # The two prior values are the same for every gene, so compute them once.
    prior_causal = prior_noncausal = None
    if n_genes:
        n_overlap_gene = len(pathway_genes & set(genes))
        prior_causal = multiplier / n_genes
        n_noncausal = n_genes - n_overlap_gene
        # When every gene is a pathway gene the non-causal prior is never used;
        # skipping it avoids a division by zero.
        if n_noncausal:
            prior_noncausal = (1 - prior_causal * n_overlap_gene) / n_noncausal

    # First row of the data file: phenotype of every sample (row of df).
    dat = [['pheno', 'trait', 'chicago'] + [str(x) for x in df['phenotype']]]
    prior = []
    grid = [(0,beta)]
    for gene in genes:
        if gene not in chrom_of:
            raise IndexError("gene {!r} has no chromosome entry in ref_genes".format(gene))
        label = '{}.{}'.format(chrom_of[gene], gene)
        dat.append(['geno', label, 'chicago'] + [str(x) for x in df[gene]])
        prior_pr = prior_causal if gene in pathway_genes else prior_noncausal
        prior.append([label, str(prior_pr)])

    with open(prefix + '.dat', 'w') as f:
        f.write('\n'.join([' '.join(x) for x in dat]))
    with open(prefix + '.prior', 'w') as f:
        f.write('\n'.join([' '.join(x) for x in prior]))
    with open(prefix + '.grid', 'w') as f:
        f.write('\n'.join([' '.join(map(str, x)) for x in grid]))

    # Paths are shell-quoted so that spaces or shell metacharacters in them do
    # not break the command; exec_path is passed through unchanged. "~" in
    # fileout is expanded here because quoting would stop the shell doing it.
    quoted_prefix = shlex.quote(prefix)
    quoted_out = shlex.quote(os.path.expanduser(fileout))
    command = "{0} -d {1}.dat -g {1}.grid -t 8 -it 0.05 -prior {1}.prior > {2}".format(
        exec_path, quoted_prefix, quoted_out)
    status = os.system(command)
    if status != 0:
        # Surface failures instead of silently leaving an empty/partial output file.
        warnings.warn("DAP command finished with status {}: {}".format(status, command),
                      RuntimeWarning)
    print (str(datetime.now()))

In [7]:
# Write the DAP input files for the recoded data and run DAP on them.
# run_dap has no return statement, so testp ends up as None.
testp = run_dap(df=input_file, fileout=fileout)

2017-08-04 20:22:54.007086
2017-08-04 20:23:02.148236


In [18]:
# Save the pathway-gene table (index included) as tab-separated text.
calpath_gene_pos.to_csv(path_or_buf="test.csv", sep="\t")

In [10]:
# Scratch check of the prior computation that run_dap performs.
# `df` exists only as run_dap's parameter, so referring to it at notebook level
# raises NameError on a fresh kernel; the frame that is passed to run_dap
# (`input_file`) is used here instead.
# prior_c = len(set(cal_pthwy_genes["gene_name"]) & set(df.columns.values[1:].tolist())) *2 / len(set(df.columns.values[1:].tolist())) / len(set(cal_pthwy_genes["gene_name"]))
# Every column after the first is treated as a gene (same slice as in run_dap).
gene_columns = set(input_file.columns.values[1:].tolist())
pathway_genes = set(cal_pthwy_genes["gene_name"])
# Prior for a pathway gene, and the share left for each remaining gene.
prior_c = 2/len(gene_columns)
n_overlap_gene = len(pathway_genes & gene_columns)
prior_nc = (1-prior_c*n_overlap_gene)/(len(gene_columns)-n_overlap_gene)
for idx, item in enumerate(input_file.columns.values):
    prior_t = str(prior_c) if item in pathway_genes else str(prior_nc)
#     print (prior_t)

In [11]:
# Chromosome labels: chr1 .. chr22 followed by chrX.
chrom = ["chr%d" % number for number in range(1, 23)] + ["chrX"]
print(chrom)

['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX']
