In [28]:
from simulation import *
from scipy import stats

def get_power(stats_table, causal_genes, p = 0.05):
    overlap_causal_genes = []
    overlap_p_value = []
    
    for gene in causal_genes:
        if gene in stats_table["genes"]:
            overlap_causal_genes.append(gene)
            overlap_p_value.append(stats_table["p_value"][stats_table["genes"].index(gene)])
    count = 0
    for x in overlap_p_value:
        if x < p:
            count += 1
    return count/len(overlap_causal_genes)

def get_typeIerror(stats_table, causal_genes, p = 0.05):
    noncausal_overlap_genes = [gene for gene in stats_table["genes"] if gene not in causal_genes]
    noncausal_overlap_genes_pvalue = []
    for gene in noncausal_overlap_genes:
        noncausal_overlap_genes_pvalue.append(stats_table["p_value"][stats_table["genes"].index(gene)])
    count = 0
    for x in noncausal_overlap_genes_pvalue:
        if x < p:
            count += 1
    return count/len(noncausal_overlap_genes_pvalue)

def test_contingency_table(gene_table, method = "Fisher", option = False): 
    if (method == "Fisher"):
        stats_table = {"genes": get_stats(gene_table)["logp_gene"], "p_value": get_stats(gene_table)["p_value"]}
    else:
        table = [( stats.chi2_contingency([[row["n_case_gene"], row["n_ctrl_gene"]], 
                                           [row["n_case_nogene"], row["n_ctrl_nogene"]]], 
                                          correction = option), row["gene_name"] ) 
                 for idx, row in gene_table.iterrows()]
        p_value = [x[0][1] for x in table]
        gene = [x[1] for x in table]
        stats_table = {"p_value": p_value, "genes": gene}
    return stats_table

def get_power_and_typeIerror(input_data, method_option = "chi2", correction_option = False, p_option = 0.05):
    sample_table = load_data(input_data)
    causal_genes = sample_table["debug"]["causal genes"]
    gene_table = get_gene_table(sample_table)
    stats_table = test_contingency_table(gene_table, method = method_option, option = correction_option)
    power = get_power(stats_table, causal_genes, p = p_option)
    typeI_error = get_typeIerror(stats_table, causal_genes, p = p_option)
    return {"power": power, "typeI_error": typeI_error, "debug": sample_table["debug"]["args"]}

def run_power_typeIerror(datasets):
    res = {}
    i = 0
    for data in datasets:
        res_data = get_power_and_typeIerror(data)
        res["dataset_{}".format(i)] = res_data
        i += 1
    return res

num = 18
run_power_typeIerror(["data/del_sim_{}.data.pkl".format(i) for i in range(num)])

{'dataset_0': {'debug': {'avg_cnv_per_individual': 5,
   'block_size': 100000,
   'case_dataset': 'delCases',
   'cnv_file': 'data/ISC-r1.CNV.bed',
   'ctrl_dataset': 'delControls',
   'n_case': 50,
   'n_causal_gene': 50,
   'n_ctrl': 50,
   'odds_ratio_params': {'scale': None, 'shape': None},
   'output': 'data/del_sim',
   'prevalence': 0.005,
   'refgene_file': 'data/refGene.txt.gz'},
  'power': 0.0,
  'typeI_error': 0.0},
 'dataset_1': {'debug': {'avg_cnv_per_individual': 5,
   'block_size': 100000,
   'case_dataset': 'delCases',
   'cnv_file': 'data/ISC-r1.CNV.bed',
   'ctrl_dataset': 'delControls',
   'n_case': 50,
   'n_causal_gene': 50,
   'n_ctrl': 50,
   'odds_ratio_params': {'scale': 1, 'shape': 3},
   'output': 'data/del_sim',
   'prevalence': 0.005,
   'refgene_file': 'data/refGene.txt.gz'},
  'power': 0.0,
  'typeI_error': 0.0005470459518599562},
 'dataset_10': {'debug': {'avg_cnv_per_individual': 5,
   'block_size': 100000,
   'case_dataset': 'delCases',
   'cnv_file': 

In [1]:
from simulation import *
import numpy as np
del_sample = load_data("data/del_sim_scale5_shape1_N1000_M100_999.pkl")
causal_genes = del_sample["debug"]["causal genes"]
print (causal_genes)
del_sample_gene = get_gene_table(del_sample)
# print (del_sample_gene)
from pandasql import sqldf
query = """
select gene_name, n_case_gene,  n_ctrl_gene,  n_case_nogene,  n_ctrl_nogene
from del_sample_gene
where gene_name = "BDNF"
"""
print (sqldf(query))

['LINC00229', 'TPGS2', 'OR11H6', 'LOC106699570', 'KAZN', 'ACP1', 'ZNF645', 'WIZ', 'CSDE1', 'CRTAM', 'SERHL2', 'DPP10-AS1', 'SNORA80B', 'LOC101927143', 'MIR625', 'LOC101929153', 'NDUFAF2', 'FBXO41', 'LOXL3', 'KLHL13', 'VAC14', 'PTMS', 'C2', 'UNC5D', 'CDC37L1', 'MIR4269', 'ELP5', 'NPY', 'CPEB1-AS1', 'POLR2J2', 'VAPB', 'C5orf15', 'KCNH2', 'OR12D3', 'ANKRD40', 'SLIT1', 'SNORD114-11', 'CCR7', 'PORCN', 'PLEK2', 'BHMT2', 'TMEM245', 'PQBP1', 'CD14', 'ZNF276', 'TICRR', 'SNCB', 'MAPK10', 'MIR34AHG', 'NRM', 'C3orf30', 'LOC105372493', 'RABEP1', 'PTGS1', 'MIR199A2', 'MIR3173', 'NEK6', 'PAAF1', 'TUBB4A', 'ALDH1B1', 'HCAR3', 'LOC101929006', 'ECE1', 'OR10C1', 'SRD5A1', 'LINC00970', 'LOC101929163', 'MGST1', 'FBLN1', 'ADAMTS20', 'UST', 'KRTAP6-2', 'MIR124-3', 'LOC102724652', 'LOC643623', 'POM121L12', 'SHC2', 'AURKA', 'PLEKHG5', 'LINC01391', 'MUC7', 'NOD1', 'PHACTR1', 'HCG26', 'ZBTB45', 'LOC100507053', 'CRNKL1', 'TTYH2', 'SLC39A10', 'DDX3X', 'MIR3161', 'PRKG2', 'RAD54L2', 'TMEM5-AS1', 'VOPP1', 'GFOD1', '

In [2]:
from collections import Counter
from pprint import pprint
def get_num_cnv_per_sample(input_data):
    '''input_data is simulated dataset, either "case" or "ctrl"'''
    num_cnv = []
    for i in range(len(input_data)):
        input_data[i] = input_data[i].drop_duplicates(subset=("chrom", "cnv_start", "cnv_terminate"))
    num_cnv = [ len(input_data[i]) for i in range(len(input_data))]
    return num_cnv
num_cnv_case = get_num_cnv_per_sample(del_sample["case"])
num_cnv_ctrl = get_num_cnv_per_sample(del_sample["ctrl"])
pprint (dict(Counter(num_cnv_case)))
pprint (dict(Counter(num_cnv_ctrl)))

{0: 9,
 1: 36,
 2: 74,
 3: 93,
 4: 94,
 5: 81,
 6: 50,
 7: 29,
 8: 18,
 9: 8,
 10: 4,
 11: 1,
 12: 2,
 14: 1}
{0: 16,
 1: 48,
 2: 70,
 3: 99,
 4: 77,
 5: 83,
 6: 54,
 7: 37,
 8: 11,
 9: 3,
 10: 1,
 11: 1}


In [3]:
import plotly
plotly.offline.init_notebook_mode(connected=True)
from plotly.offline import iplot
import plotly.graph_objs as go
%matplotlib inline

trace1 = go.Histogram( x=num_cnv_case, histnorm='count', name='case=500', autobinx=False, 
                      xbins=dict(start=0, end=15, size=1), marker=dict(color='lightred'), opacity=0.75 )

trace2 = go.Histogram( x=num_cnv_ctrl, histnorm='count', name='control=500', autobinx=False, 
                      xbins=dict(start=0, end=15, size=1), marker=dict(color='lightblue'), opacity=0.75 )

data = [trace1, trace2]

layout = go.Layout(title='Number of CNV in cases vs Controls', xaxis=dict(title='Value'), yaxis=dict(title='Count'),
                   bargap=0.2, bargroupgap=0.1)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='styled histogram')

In [4]:
from collections import Counter
def get_num_causalgene_overlap_cnv(input_data):
    '''get the number of causal gene that overlapped with CNV'''
    gene_list = {}
    num_causalgene_overlap_cnv = []
    for i in range(len(input_data)):
        gene_list[i] = [ gene for gene in input_data[i]["gene_name"].tolist() if gene in causal_genes ]
        num_causalgene_overlap_cnv.append(len(gene_list[i]))
    return num_causalgene_overlap_cnv

num_causalgene_overlap_cnv_case = get_num_causalgene_overlap_cnv(del_sample["case"])
num_causalgene_overlap_cnv_ctrl = get_num_causalgene_overlap_cnv(del_sample["ctrl"])
print (dict(Counter(num_causalgene_overlap_cnv_case)))
print (dict(Counter(num_causalgene_overlap_cnv_ctrl)))

{0: 468, 1: 32}
{0: 494, 1: 6}


In [5]:
from pprint import pprint
import pandas as pd
def get_length_cnv(input_data):
    cnv_length = {}
    for i in range(len(del_sample["case"])):
        input_data[i] = input_data[i].drop_duplicates(subset=("chrom", "cnv_start", "cnv_terminate"))
        cnv_length[i] = input_data[i]["cnv_terminate"] - input_data[i]["cnv_start"]
    return dict(pd.concat(cnv_length))

cnv_length_case = list(get_length_cnv(del_sample["case"]).values())
cnv_length_ctrl = list(get_length_cnv(del_sample["ctrl"]).values())
print (np.median(cnv_length_case), np.median(cnv_length_ctrl))

191709.5 181886.0


In [15]:
import plotly
plotly.offline.init_notebook_mode(connected=True)
from plotly.offline import iplot
import plotly.graph_objs as go
%matplotlib inline

trace1 = go.Histogram( x=cnv_length_case, histnorm='density', name='case', autobinx=False, 
                      xbins=dict(start=0, end=1500000, size=40000), marker=dict(color='lightred'), opacity=0.75 )

trace2 = go.Histogram( x=cnv_length_ctrl, histnorm='density', name='control', autobinx=False, 
                      xbins=dict(start=0, end=1500000, size=40000), marker=dict(color='lightblue'), opacity=0.75 )

data = [trace1, trace2]

layout = go.Layout(title='CNV length in Cases versus Controls', xaxis=dict(title='CNV length'), 
                   yaxis=dict(title='density'), bargap=0.2, bargroupgap=0.1)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='styled histogram')

{0: 468, 1: 32}
{0: 494, 1: 6}


In [22]:
del_stats_table = get_stats(del_sample_gene, sort=0)

In [23]:
print (del_stats_table)

{'OR_gene': ['LOC101929006', 'OR10C1', 'OR11A1', 'OR12D2', 'OR12D3', 'OR14J1', 'OR5V1', 'MAS1L', 'OR2H1', 'TNXB', 'AGER', 'AGPAT1', 'ATF6B', 'C6orf10', 'EGFL8', 'FKBPL', 'GABBR1', 'GPSM3', 'HCG4B', 'HLA-F', 'HLA-F-AS1', 'HLA-G', 'HLA-H', 'IFITM4P', 'LINC01015', 'LOC100507547', 'LOC101929163', 'MIR6721', 'MIR6833', 'MOG', 'NOTCH4', 'OR2H2', 'PBX2', 'PPT2', 'PPT2-EGFL8', 'PRRT1', 'RNF5', 'RNF5P1', 'SNORD32B', 'UBD', 'ZFP57', 'IMMP2L', 'C4B_2', 'CYP21A2', 'HCG4', 'HLA-A', 'LOC100129636', 'LOC554223', 'OR2B3', 'OR2J2', 'OR2J3', 'NRXN3', 'C2', 'C4A', 'CFB', 'CYP21A1P', 'DXO', 'GFOD1', 'MIR1236', 'MIR6891', 'NELFE', 'OR2W1', 'SKIV2L', 'STK19', 'TNXA', 'LRRN3', 'MACROD2', 'TMPRSS11E', 'BTNL2', 'C2-AS1', 'C6orf25', 'C6orf48', 'CLIC1', 'DDAH2', 'DHX16', 'EHMT2', 'FLOT1', 'HCG16', 'HCG17', 'HCG18', 'HCG23', 'HCG8', 'HCG9', 'HLA-J', 'HSPA1A', 'HSPA1B', 'HSPA1L', 'IER3', 'LOC100130357', 'LSM2', 'LY6G6C', 'LY6G6D', 'LY6G6E', 'MDC1', 'MDC1-AS1', 'MSH5', 'MSH5-SAPCD1', 'NEU1', 'NRM', 'PHACTR1', 'PPP1

In [None]:
del_pvalue = del_stats_table["p_value"]
print (del_pvalue)

In [None]:
# power for causal genes
overlap_causal_genes = []
overlap_p_value = []
for gene in causal_genes:
    if gene in del_stats_table["logp_gene"]:
        overlap_causal_genes.append(del_stats_table["logp_gene"][del_stats_table["logp_gene"].index(gene)])
        overlap_p_value.append(del_stats_table["p_value"][del_stats_table["logp_gene"].index(gene)])
    else: pass
#print (overlap_p_value)
count = 0
for x in overlap_p_value:
    if x<=0.05:
        count += 1
print (count/73)
# a= [overlap_p_value[overlap_p_value.index[x]] for x in overlap_p_value if x<=0.05]

In [None]:
# type I error for noncausal genes
noncausal_overlap_genes_pvalue = []
noncausal_overlap_genes = set(del_stats_table["logp_gene"]) - set(causal_genes)
print (len(noncausal_overlap_genes))

for gene in noncausal_overlap_genes:
    noncausal_overlap_genes_pvalue.append(del_stats_table["p_value"][del_stats_table["logp_gene"].index(gene)])
#     overlap_noncausal_gene.append(del_stats_table["logp_gene"][del_stats_table["logp_gene"].index(gene)])
# print ((noncausal_overlap_genes_pvalue))
count_p = 0
for x in noncausal_overlap_genes_pvalue:
    if x < 0.05:
        count_p += 1
print (count_p/len(noncausal_overlap_genes))

In [None]:
# type I error function
def get_typeIerror(stats_table, causal_genes):
    noncausal_overlap_genes = [gene for gene in stats_table["genes"] if gene not in causal_genes]
    noncausal_overlap_genes_pvalue = []
    for gene in noncausal_overlap_genes:
        noncausal_overlap_genes_pvalue.append(stats_table["p_value"][stats_table["genes"].index(gene)])
    count = 0
    for x in noncausal_overlap_genes_pvalue:
        if x < 0.05:
            count += 1
    return count/len(noncausal_overlap_genes_pvalue)

get_typeIerror(chi2_table, causal_genes)

In [None]:
from scipy import stats
def test_contingency_table(gene_table, method = "Fisher", option = False): 
    if (method == "Fisher"):
        stats_table = {"genes": get_stats(gene_table)["logp_gene"], "p_value": get_stats(gene_table)["p_value"]}
    else:
#         stats.chi2_contingency(obs, correction=True)
        table = [( stats.chi2_contingency([[row["n_case_gene"], row["n_ctrl_gene"]], 
                                           [row["n_case_nogene"], row["n_ctrl_nogene"]]], correction=option), 
                  row["gene_name"] ) for idx, row in gene_table.iterrows()]
        p_value = [x[0][1] for x in table]
        gene = [x[1] for x in table]
        stats_table = {"p_value": p_value, "genes": gene}
    return stats_table

In [None]:
fisher_table = test_contingency_table(del_sample_gene)
chi2_table = test_contingency_table(del_sample_gene, method = "chi2")

In [None]:
# power function
def get_power(stats_table, causal_genes):
    overlap_causal_genes = []
    overlap_p_value = []
    
    for gene in causal_genes:
        if gene in stats_table["genes"]:
            overlap_causal_genes.append(stats_table["genes"][stats_table["genes"].index(gene)])
            overlap_p_value.append(stats_table["p_value"][stats_table["genes"].index(gene)])
        else: pass
    count = 0
    for x in overlap_p_value:
        if x<=0.05:
            count += 1
    return count/len(overlap_causal_genes)

get_power(chi2_table, causal_genes)

In [None]:
non_causal_genes = [gene for gene in stats_table["genes"] if gene not in causal_genes]

In [None]:
pprint (test_contingency_table(del_sample_gene, method="Fisher"))

In [None]:
p = pvalue(12, 5, 29, 2)
print (p.two_tail)
p2 = stats.chi2_contingency([[12,5],[29,2]])
print (p2[1])

In [None]:
oddsratio, p_1 = stats.fisher_exact([[12, 5], [29, 2]])
print (p_1)

In [12]:
print (["data/del_sim_{}.data.pkl".format(i) for i in range(18)])

['data/del_sim_0.data.pkl', 'data/del_sim_1.data.pkl', 'data/del_sim_2.data.pkl', 'data/del_sim_3.data.pkl', 'data/del_sim_4.data.pkl', 'data/del_sim_5.data.pkl', 'data/del_sim_6.data.pkl', 'data/del_sim_7.data.pkl', 'data/del_sim_8.data.pkl', 'data/del_sim_9.data.pkl', 'data/del_sim_10.data.pkl', 'data/del_sim_11.data.pkl', 'data/del_sim_12.data.pkl', 'data/del_sim_13.data.pkl', 'data/del_sim_14.data.pkl', 'data/del_sim_15.data.pkl', 'data/del_sim_16.data.pkl', 'data/del_sim_17.data.pkl']
