# Data transform: $y \in \{0,1\}$ to $y \in \{0,t\}$

In [1]:
import numpy as np
import pandas as pd
from pprint import pprint
from pandasql import sqldf
from scipy import stats
import pickle
import feather
from collections import Counter
from datetime import datetime

In [2]:
# Liability-threshold parameters.
# The lifetime prevalence of schizophrenia is reported as 4.0/1000 rather than 1%
# (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1140960/); 0.01 is used as a first setting.
prevalence = 0.01
# Relative risk, normally between 2 and 10.
RR = 2.5
# t: standard-normal quantile that leaves `prevalence` of the mass in the upper tail.
t = stats.norm.ppf(1 - prevalence, loc=0, scale=1)
# z_mu: prior effect size of beta, i.e. how far the quantile moves when the
# upper-tail mass grows from `prevalence` to `RR * prevalence`.
z_mu = t - stats.norm.ppf(1 - RR * prevalence, loc=0, scale=1)
print(t, z_mu)

2.32634787404 0.366383889501


In [3]:
# create prior file
# Gene names of the calcium pathway gene set (the first two lines of the file are skipped).
cal_pthwy_genes = pd.read_table("../data/calciumgeneset.txt", skiprows = 2, header = None, names = ["gene_name"])
# Chromosome / gene-name pairs from refGene (columns 2 and 12), one row per gene name.
# NOTE(review): this file is read from "data/..." while the gene set above is read from
# "../data/..." -- confirm that both relative locations are intended.
ref_genes = pd.read_table("data/refGene.txt.gz", compression="gzip", sep="\t", header = None,
                          usecols=(2,12), names=["chrom", "gene_name"]).drop_duplicates(subset="gene_name")
calpath_gene_pos = pd.merge(ref_genes, cal_pthwy_genes, how="inner", on=["gene_name"])
# Pathway genes that have no match in refGene.
diff = set(cal_pthwy_genes["gene_name"]) - set(calpath_gene_pos["gene_name"])
# add the two missing genes and their chromosome in the dataframe
# A set has no defined iteration order, so indexing list(diff) could swap the two
# chromosomes between kernel sessions. Sorting makes the assignment reproducible.
# NOTE(review): confirm that the alphabetically first missing gene is the chr2 one.
missing_genes = sorted(diff)
if len(missing_genes) != 2:
    raise ValueError("expected exactly 2 pathway genes missing from refGene, found {}: {}".format(
        len(missing_genes), missing_genes))
for missing_chrom, missing_gene in zip(("chr2", "chr15"), missing_genes):
    calpath_gene_pos.loc[calpath_gene_pos.shape[0]] = [missing_chrom, missing_gene]

In [19]:
# Identifier of the form "<chrom>.<gene_name>", built column-wise rather than with a
# row-by-row apply (which is slower and does not return a Series for an empty frame).
calpath_gene_pos["chr.gene"] = (calpath_gene_pos["chrom"].astype(str) + "."
                                + calpath_gene_pos["gene_name"].astype(str))
# One independent Uniform(0, z_mu) draw per gene, generated in a single call.
# NOTE(review): no random seed is set, so these priors differ on every run.
calpath_gene_pos["prior.pr"] = np.random.uniform(0, z_mu, size=len(calpath_gene_pos))
calpath_gene_pos

Unnamed: 0,chrom,gene_name,chr.gene,prior.pr
0,chr17,CACNA1G,chr17.CACNA1G,0.184038
1,chr2,PLCD4,chr2.PLCD4,0.242758
2,chr20,PLCB4,chr20.PLCB4,0.365112
3,chr4,CAMK2D,chr4.CAMK2D,0.192109
4,chr14,SLC8A3,chr14.SLC8A3,0.155206
5,chr8,PTK2B,chr8.PTK2B,0.109224
6,chr9,GNA14,chr9.GNA14,0.111205
7,chr8,VDAC3,chr8.VDAC3,0.064767
8,chr12,PDE1B,chr12.PDE1B,0.031932
9,chr12,CACNA1C,chr12.CACNA1C,0.223430


In [18]:
# Read the first two whitespace-separated columns (SNP id, prior) of an existing DAP
# prior file and show the largest prior value in it.
# The separator is a raw string: "\s" in a normal string literal is an invalid escape.
testprior = pd.read_table("dap/ENSG00000112799.null.prior", sep=r"\s+", header = None,
                          usecols = (0,1), names = ["SNP", "prior"])
testprior["prior"].values.max()

0.023190000000000002

In [6]:
# create/write grid file
# NOTE(review): nothing is written in this cell yet; it only stores the prior effect
# size z_mu under the name `beta`.
beta = z_mu

In [7]:
# create the two values of the transformed phenotype, y in {0, t}:
# `y_tran` is the threshold t and `y_phe0` is 0.
# presumably y_tran replaces phenotype 1 and y_phe0 replaces phenotype 0 -- TODO confirm
y_tran = t
y_phe0 = 0

In [26]:
# Print every column name of the toy data set next to its position.
toy_columns = feather.read_dataframe("data/toy_n2000_p0.9_causal5_const1.0_0.05.feather").columns.values
for col_idx, col_name in enumerate(toy_columns):
    print(col_idx, col_name)

0 phenotype
1 gene1
2 gene2
3 gene3
4 gene4
5 gene5
6 gene6
7 gene7
8 gene8
9 gene9
10 gene10


In [33]:
# Display the simulated data set: the first column is `phenotype`, the remaining
# columns are named after genes.
feather.read_dataframe("data/del_sim_scale3_shape1_N100_M50_999.feather")

Unnamed: 0,phenotype,C6orf48,FIGNL2,C9orf135-AS1,MIR642A,SRC,MYL7,LOC100506498,ACAT2,NOS3,...,APOM,LINC01078,PRDM5,LINC01144,AK9,MIR1295A,C3orf33,NTM,CHST11,C2CD3
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Simulated data set to analyse, and the file that receives the output of the DAP run
# (it is passed to run_dap as `fileout`).
filename = "data/del_sim_scale3_shape1_N100_M100_999.feather"
fileout = "data/del_sim_scale3_shape1_N100_M100_999.dap"

# Load the data set as a data frame and report its (rows, columns) shape.
input_file = feather.read_dataframe(filename)
print (input_file.shape)

def run_dap(df, fileout, prefix = None, exec_path = None):
    '''Convert a pandas data frame to DAP input files and run DAP on them.

    Three space-delimited files are written:
        - ``<prefix>.dat``: one ``pheno`` line built from ``df['phenotype']`` followed
          by one ``geno`` line per remaining column
        - ``<prefix>.prior``: one line per genotype column carrying the uniform prior
          ``1 / (number of genotype columns)``
        - ``<prefix>.grid``: grid of effect sizes (omega^2 + phi^2 is what we care
          about); fixed to (1,1), (2,2), (3,3), (4,4) for now, as we only have one Y

    Every genotype column is identified by its column name in both the ``.dat`` and
    the ``.prior`` file. The earlier "<chrom>.<pos + idx>" identifier combined two
    lists with an integer and raised TypeError on the first genotype column.
    NOTE(review): if DAP needs "chr.position" style identifiers, a gene-to-position
    map has to be supplied -- confirm against the DAP input specification.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``phenotype`` column; every other column is treated as a
        genotype column.
    fileout : str
        File that receives the standard output of the DAP executable.
    prefix : str, optional
        Path prefix of the three generated files; defaults to ``/tmp/F<current time>``.
    exec_path : str, optional
        DAP executable; defaults to ``dap/dap``.

    Returns
    -------
    None
        Start and end timestamps are printed; a message is printed when the command
        exits with a non-zero status.
    '''
    # Local imports: the notebook's import cell does not provide these modules.
    import os
    import time
    print (str(datetime.now()))
    if prefix is None:
        prefix = "/tmp/F" + str(time.time())
    if exec_path is None:
        exec_path = 'dap/dap'
    geno_cols = [col for col in df.columns.values if col != 'phenotype']
    dat = [['pheno', 'trait', 'chicago'] + [str(x) for x in df['phenotype']]]
    prior = []
    grid = [(1,1),(2,2),(3,3),(4,4)]
    # The prior is the same for every genotype column, so compute it once.
    uniform_prior = str(1 / len(geno_cols)) if geno_cols else None
    for col in geno_cols:
        geno_id = str(col)
        dat.append(['geno', geno_id, 'chicago'] + [str(x) for x in df[col]])
        prior.append([geno_id, uniform_prior])
    with open(prefix + '.dat', 'w') as f:
        f.write('\n'.join([' '.join(x) for x in dat]))
    with open(prefix + '.prior', 'w') as f:
        f.write('\n'.join([' '.join(x) for x in prior]))
    with open(prefix + '.grid', 'w') as f:
        f.write('\n'.join([' '.join(map(str, x)) for x in grid]))
    status = os.system("{0} -d {1}.dat -g {1}.grid -t 8 -it 0.05 -prior {1}.prior > {2}".format(exec_path, prefix, fileout))
    if status != 0:
        # Surface a failed run instead of silently leaving an empty/partial output file.
        print ("run_dap: command exited with status {}".format(status))
    print (str(datetime.now()))

In [71]:
# Chromosome labels: the autosomes chr1..chr22 followed by chrX.
chrom = list(map("chr{}".format, list(range(1, 23)) + ["X"]))
print(chrom)

['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX']


In [63]:
# Names of all columns after the first one in the M100 simulated data set.
feather.read_dataframe("data/del_sim_scale3_shape1_N100_M100_999.feather").columns[1:].tolist()

['STXBP4',
 'PKD1P1',
 'FAM120AOS',
 'LRRC3B',
 'KANSL3',
 'LOC641746',
 'LOC101929337',
 'OLFM3',
 'ZNF257',
 'PRAMEF9',
 'HIPK1',
 'NOMO3',
 'OR8B8',
 'C19orf35',
 'MKRN1',
 'LPL',
 'SNORD41',
 'PGAM5',
 'RGP1',
 'DPY19L2P1',
 'MIR185',
 'NOMO2',
 'SRSF6',
 'ZNF132',
 'KDELC2',
 'LRRC63',
 'HDAC9',
 'ZMAT4',
 'LOC340512',
 'CIRBP',
 'SLC25A24',
 'CPED1',
 'FAM179B',
 'MBNL1-AS1',
 'PKN2-AS1',
 'PSMG1',
 'FERMT3',
 'ST20',
 'LOC100130872',
 'MIR1268A',
 'LINC01518',
 'ATP6V0E2',
 'COX11',
 'MIRLET7C',
 'TOP3A',
 'KRBOX1-AS1',
 'EVPLL',
 'LINC00184',
 'LOC100130451',
 'GOLGA8J',
 'NR5A2',
 'PRAMEF6',
 'MAP9',
 'TMEM35B',
 'SH3RF3',
 'OR2AP1',
 'NMU',
 'CSGALNACT1',
 'TMEM109',
 'DLGAP1',
 'UBQLN3',
 'HECTD2',
 'DNAH14',
 'JUNB',
 'MIR6794',
 'RNASEH2A',
 'MIS18BP1',
 'MFF',
 'FAM221B',
 'SERPINB8',
 'GJB7',
 'SLC27A5',
 'GOLGA6D',
 'LOC101929340',
 'C7orf13',
 'GIPC3',
 'MIR1306',
 'UBE3C',
 'PLCB3',
 'OR6X1',
 'MIR6515',
 'IL1RL1',
 'IARS',
 'XKR4',
 'CELF5',
 'TRIM22',
 'ZFR2',
 'LRR

In [None]:
# Write the DAP input files for the loaded data set and run DAP with the default
# prefix and executable; DAP's standard output is redirected to `fileout`.
run_dap(input_file, fileout)