In [1]:
from simulation import *
from scipy import stats
import glob

def get_power(stats_table, causal_genes, p = 0.05):
    '''Estimate power on one simulated dataset.

    Power is the fraction of tested causal genes that are called significant:
    power = #(p_value < p) / #(causal genes found in stats_table).

    Parameters
    ----------
    stats_table : dict
        Holds two parallel lists under the keys "genes" and "p_value".
    causal_genes : iterable
        Names of the causal genes. Names absent from stats_table["genes"] are ignored.
    p : float
        Significance threshold; a gene counts as detected when its p-value is strictly below it.

    Returns
    -------
    float
        Power in [0, 1], or NaN when no causal gene appears in stats_table
        (the ratio is undefined in that case).
    '''
    # One pass builds a gene -> p-value lookup. setdefault keeps the FIRST p-value
    # of a repeated gene, which is the entry list.index() would have selected.
    first_p_value = {}
    for gene, p_value in zip(stats_table["genes"], stats_table["p_value"]):
        first_p_value.setdefault(gene, p_value)

    overlap_p_value = [first_p_value[gene] for gene in causal_genes if gene in first_p_value]
    if not overlap_p_value:
        # No causal gene was tested: report "undefined" instead of dividing by zero.
        return float("nan")
    return sum(1 for x in overlap_p_value if x < p) / len(overlap_p_value)

def get_typeIerror(stats_table, causal_genes, p = 0.05):
    '''Estimate the type I error rate on one simulated dataset.

    Type I error is the fraction of tested non-causal genes that are called significant:
    type I error = #(p_value < p) / #(genes in stats_table that are not causal).

    Parameters
    ----------
    stats_table : dict
        Holds two parallel lists under the keys "genes" and "p_value".
    causal_genes : iterable
        Names of the causal genes; every other gene in stats_table is treated as non-causal.
    p : float
        Significance threshold; a gene counts as a false positive when its p-value
        is strictly below it.

    Returns
    -------
    float
        Type I error in [0, 1], or NaN when stats_table contains no non-causal gene
        (the ratio is undefined in that case).
    '''
    causal = set(causal_genes)
    # setdefault keeps the FIRST p-value of a repeated gene, which is the entry
    # list.index() would have selected.
    first_p_value = {}
    for gene, p_value in zip(stats_table["genes"], stats_table["p_value"]):
        first_p_value.setdefault(gene, p_value)

    noncausal_p_value = [first_p_value[gene] for gene in stats_table["genes"] if gene not in causal]
    if not noncausal_p_value:
        # Nothing to evaluate: report "undefined" instead of dividing by zero.
        return float("nan")
    return sum(1 for x in noncausal_p_value if x < p) / len(noncausal_p_value)

def test_contingency_table(gene_table, method = "Fisher", option = False): 
    if (method == "Fisher"):
        stats_table = {"genes": get_stats(gene_table)["logp_gene"], "p_value": get_stats(gene_table)["p_value"]}
    else:
        table = [( stats.chi2_contingency([[row["n_case_gene"], row["n_ctrl_gene"]], 
                                           [row["n_case_nogene"], row["n_ctrl_nogene"]]], 
                                          correction = option), row["gene_name"] ) 
                 for idx, row in gene_table.iterrows()]
        p_value = [x[0][1] for x in table]
        gene = [x[1] for x in table]
        stats_table = {"p_value": p_value, "genes": gene}
    return stats_table

def get_power_and_typeIerror(input_data, method_option = "chi2", correction_option = False, p_option = 0.05):
    '''Evaluate one simulated dataset: power and type I error of the chosen test.

    The dataset is read with load_data and turned into a per-gene table with
    get_gene_table (both from simulation.py). test_contingency_table then gives one
    p-value per gene, which is scored against the dataset's recorded causal genes.

    input_data        -- passed to load_data (the callers pass a .pkl path)
    method_option     -- forwarded as `method` to test_contingency_table
    correction_option -- forwarded as `option` to test_contingency_table
    p_option          -- significance threshold for get_power / get_typeIerror

    Returns a dict with keys "power", "typeI_error" and "debug" (the simulation
    arguments stored with the dataset).
    '''
    simulated = load_data(input_data)
    debug_info = simulated["debug"]
    truly_causal = debug_info["causal genes"]
    per_gene_counts = get_gene_table(simulated)
    test_results = test_contingency_table(per_gene_counts,
                                          method = method_option,
                                          option = correction_option)
    return {
        "power": get_power(test_results, truly_causal, p = p_option),
        "typeI_error": get_typeIerror(test_results, truly_causal, p = p_option),
        "debug": debug_info["args"],
    }

def run_power_typeIerror(datasets):
    '''Run get_power_and_typeIerror on every dataset and collect the results.

    datasets is any iterable of inputs accepted by get_power_and_typeIerror.
    Returns a dict keyed "dataset_0", "dataset_1", ... in iteration order.
    '''
    return {
        "dataset_{}".format(position): get_power_and_typeIerror(entry)
        for position, entry in enumerate(datasets)
    }

# Single-dataset example:
# run_power_typeIerror( ['data/del_sim_scale5_shape1_N1000_M100_999.pkl'] )
# glob returns paths in arbitrary (filesystem-dependent) order; sort them so that
# each "dataset_<i>" key refers to the same file on every run.
run_power_typeIerror( sorted(glob.glob('data/del_sim_*_999.pkl')) )

{'dataset_0': {'debug': {'avg_cnv_per_individual': 5,
   'block_size': 100000,
   'case_dataset': 'delCases',
   'cnv_file': 'data/ISC-r1.CNV.bed',
   'ctrl_dataset': 'delControls',
   'n_case': 500,
   'n_causal_gene': 100,
   'n_ctrl': 500,
   'odds_ratio_params': {'scale': 1, 'shape': 3},
   'output': 'data/del_sim',
   'prevalence': 0.005,
   'refgene_file': 'data/refGene.txt.gz'},
  'power': 0.0,
  'typeI_error': 0.001708963513628984},
 'dataset_1': {'debug': {'avg_cnv_per_individual': 5,
   'block_size': 100000,
   'case_dataset': 'delCases',
   'cnv_file': 'data/ISC-r1.CNV.bed',
   'ctrl_dataset': 'delControls',
   'n_case': 50,
   'n_causal_gene': 50,
   'n_ctrl': 50,
   'odds_ratio_params': {'scale': None, 'shape': None},
   'output': 'data/del_sim',
   'prevalence': 0.005,
   'refgene_file': 'data/refGene.txt.gz'},
  'power': 0.0,
  'typeI_error': 0.0},
 'dataset_10': {'debug': {'avg_cnv_per_individual': 5,
   'block_size': 100000,
   'case_dataset': 'delCases',
   'cnv_file'

In [2]:
# trial
from simulation import *
import numpy as np
# Load one simulated deletion dataset and keep its causal genes and per-gene table
# in notebook-level names for use in other cells.
del_sample = load_data("data/del_sim_scale5_shape1_N1000_M100_999.pkl")
causal_genes = del_sample["debug"]["causal genes"]
print (causal_genes)
del_sample_gene = get_gene_table(del_sample)
# print (del_sample_gene)
from pandasql import sqldf
# The string literal is single-quoted: SQLite resolves a double-quoted "BDNF" as a
# column name first and only falls back to a string when no such column exists.
query = """
select gene_name, n_case_gene,  n_ctrl_gene,  n_case_nogene,  n_ctrl_nogene
from del_sample_gene
where gene_name = 'BDNF'
"""
print (sqldf(query))

['LINC00229', 'TPGS2', 'OR11H6', 'LOC106699570', 'KAZN', 'ACP1', 'ZNF645', 'WIZ', 'CSDE1', 'CRTAM', 'SERHL2', 'DPP10-AS1', 'SNORA80B', 'LOC101927143', 'MIR625', 'LOC101929153', 'NDUFAF2', 'FBXO41', 'LOXL3', 'KLHL13', 'VAC14', 'PTMS', 'C2', 'UNC5D', 'CDC37L1', 'MIR4269', 'ELP5', 'NPY', 'CPEB1-AS1', 'POLR2J2', 'VAPB', 'C5orf15', 'KCNH2', 'OR12D3', 'ANKRD40', 'SLIT1', 'SNORD114-11', 'CCR7', 'PORCN', 'PLEK2', 'BHMT2', 'TMEM245', 'PQBP1', 'CD14', 'ZNF276', 'TICRR', 'SNCB', 'MAPK10', 'MIR34AHG', 'NRM', 'C3orf30', 'LOC105372493', 'RABEP1', 'PTGS1', 'MIR199A2', 'MIR3173', 'NEK6', 'PAAF1', 'TUBB4A', 'ALDH1B1', 'HCAR3', 'LOC101929006', 'ECE1', 'OR10C1', 'SRD5A1', 'LINC00970', 'LOC101929163', 'MGST1', 'FBLN1', 'ADAMTS20', 'UST', 'KRTAP6-2', 'MIR124-3', 'LOC102724652', 'LOC643623', 'POM121L12', 'SHC2', 'AURKA', 'PLEKHG5', 'LINC01391', 'MUC7', 'NOD1', 'PHACTR1', 'HCG26', 'ZBTB45', 'LOC100507053', 'CRNKL1', 'TTYH2', 'SLC39A10', 'DDX3X', 'MIR3161', 'PRKG2', 'RAD54L2', 'TMEM5-AS1', 'VOPP1', 'GFOD1', '

In [17]:
from simulation import *
from collections import Counter
from pprint import pprint
def get_num_cnv_per_sample(input_data):
    '''Count the distinct CNVs carried by each sample.

    input_data is either "case" or "ctrl" in simulated dataset: a collection indexed
    0..len-1 that holds one DataFrame per sample. Rows sharing the same
    (chrom, cnv_start, cnv_terminate) are counted once.

    Returns a list with one count per sample, in sample order. The per-sample
    tables in input_data are left untouched.
    '''
    cnv_key = ["chrom", "cnv_start", "cnv_terminate"]
    # De-duplicate into a temporary frame instead of writing back into input_data,
    # so the caller's dataset is not silently altered by counting.
    return [len(input_data[i].drop_duplicates(subset=cnv_key)) for i in range(len(input_data))]
# Per-sample CNV counts for the cases and controls of one simulated dataset.
dat = load_data("data/del_sim_scale5_shape1_N1000_M100_999.pkl")
num_cnv_case = get_num_cnv_per_sample(dat["case"])
print (num_cnv_case)
num_cnv_ctrl = get_num_cnv_per_sample(dat["ctrl"])
# Frequency tables: number of CNVs -> number of samples carrying that many.
pprint (dict(Counter(num_cnv_case)))
pprint (dict(Counter(num_cnv_ctrl)))

[4, 4, 5, 2, 9, 6, 2, 4, 5, 5, 5, 7, 3, 4, 5, 6, 6, 1, 5, 6, 1, 4, 3, 7, 3, 3, 8, 3, 6, 7, 10, 1, 4, 4, 8, 5, 2, 2, 3, 5, 4, 3, 6, 8, 3, 4, 8, 5, 2, 1, 5, 6, 4, 4, 7, 4, 4, 2, 9, 1, 4, 2, 9, 3, 3, 4, 2, 2, 2, 1, 3, 4, 2, 5, 3, 5, 0, 8, 3, 4, 5, 3, 1, 2, 1, 4, 5, 3, 3, 5, 5, 3, 5, 4, 1, 4, 3, 1, 6, 2, 2, 5, 7, 2, 3, 3, 5, 3, 2, 3, 4, 2, 3, 2, 3, 3, 12, 2, 6, 4, 2, 3, 7, 2, 8, 4, 2, 5, 4, 3, 3, 4, 1, 4, 2, 10, 4, 5, 3, 4, 4, 2, 9, 6, 4, 2, 3, 5, 7, 1, 7, 6, 2, 6, 3, 3, 2, 1, 3, 9, 4, 4, 10, 3, 1, 2, 3, 1, 3, 6, 5, 5, 6, 5, 3, 1, 4, 8, 3, 5, 6, 6, 2, 7, 5, 9, 7, 5, 6, 2, 3, 4, 6, 4, 5, 7, 6, 11, 6, 6, 4, 12, 6, 4, 4, 5, 4, 6, 5, 5, 6, 4, 5, 8, 10, 4, 4, 5, 1, 3, 1, 2, 4, 4, 3, 7, 2, 5, 4, 5, 4, 4, 2, 3, 6, 3, 2, 1, 3, 2, 3, 5, 1, 5, 3, 3, 2, 2, 7, 8, 4, 4, 3, 2, 3, 4, 4, 1, 2, 2, 5, 2, 6, 2, 6, 5, 2, 7, 1, 2, 4, 3, 6, 5, 3, 6, 4, 5, 2, 3, 4, 5, 5, 1, 2, 2, 7, 2, 1, 8, 5, 3, 2, 2, 5, 0, 4, 5, 5, 4, 4, 6, 2, 7, 3, 5, 7, 2, 3, 7, 9, 5, 3, 6, 1, 8, 2, 4, 3, 1, 4, 5, 0, 5, 1, 5, 3, 2, 3, 4, 4,

In [50]:
import plotly
plotly.offline.init_notebook_mode(connected=True)
from plotly.offline import iplot
import plotly.graph_objs as go
%matplotlib inline

# Overlaid histograms of the per-sample CNV counts (cases vs controls).
# histnorm='' is plotly's raw-count mode ('count' is not an accepted value), and
# 'lightcoral' replaces 'lightred', which is not a CSS colour name.
# NOTE(review): autobinx=True is kept next to explicit xbins; confirm which one is intended.
trace1 = go.Histogram( x=num_cnv_case, histnorm='', name='case=500', autobinx=True, 
                      xbins=dict(start=0, end=15, size=1), marker=dict(color='lightcoral'), opacity=0.75 )

trace2 = go.Histogram( x=num_cnv_ctrl, histnorm='', name='control=500', autobinx=True, 
                      xbins=dict(start=0, end=15, size=1), marker=dict(color='lightblue'), opacity=0.75 )

data = [trace1, trace2]

layout = go.Layout(title='Number of CNV in cases vs Controls', xaxis=dict(title='Value'), yaxis=dict(title='Count'),
                   bargap=0.2, bargroupgap=0.1)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='styled histogram')

In [62]:
from simulation import *
from collections import Counter
def get_num_causalgene_overlap_cnv(input_data):
    '''Count, per sample, the gene_name entries that are causal genes.

    input_data is passed to load_data. For every case sample and every control
    sample, the entries of its "gene_name" column that appear in the dataset's
    causal-gene list are counted.

    Returns a 4-tuple:
        (case frequency table, control frequency table,
         per-case counts, per-control counts)
    Each frequency table maps a count to the number of samples with that count.
    '''
    dat = load_data(input_data)
    case_dat = dat["case"]
    ctrl_dat = dat["ctrl"]
    causal_genes = dat["debug"]["causal genes"]

    def causal_hits_per_sample(samples):
        # one number per sample: how many of its gene_name entries are causal
        return [
            sum(1 for gene in samples[k]["gene_name"].tolist() if gene in causal_genes)
            for k in range(len(samples))
        ]

    hits_in_case = causal_hits_per_sample(case_dat)
    hits_in_ctrl = causal_hits_per_sample(ctrl_dat)
    return (dict(Counter(hits_in_case)), dict(Counter(hits_in_ctrl)),
            hits_in_case, hits_in_ctrl)

# Elements 0 and 1 of the returned tuple are the frequency tables for cases and controls.
num_causalgene_overlap_cnv = get_num_causalgene_overlap_cnv("data/del_sim_scale5_shape1_N1000_M100_999.pkl")
print (num_causalgene_overlap_cnv[0])
print (num_causalgene_overlap_cnv[1])

{0: 372, 1: 99, 2: 11, 3: 10, 4: 6, 5: 1, 6: 1}
{0: 479, 1: 19, 2: 1, 3: 1}


In [33]:
# Distinct per-sample causal-gene counts observed among the cases.
key = list(num_causalgene_overlap_cnv[0].keys())
print (key)

[0, 1, 2, 3, 4, 5, 6]


In [57]:
import plotly
plotly.offline.init_notebook_mode(connected=True)
from plotly.offline import iplot
import plotly.graph_objs as go
%matplotlib inline

# Overlaid histograms of the per-sample causal-gene counts (cases vs controls).
# histnorm='' is plotly's raw-count mode ('count' is not an accepted value), and
# 'lightcoral' replaces 'lightred', which is not a CSS colour name.
# NOTE(review): autobinx=True is kept next to explicit xbins; confirm which one is intended.
trace1 = go.Histogram( x=num_causalgene_overlap_cnv[2], histnorm='', name='case', autobinx=True, 
                      xbins=dict(start=0, end=15, size=1), marker=dict(color='lightcoral'), opacity=0.75 )

trace2 = go.Histogram( x=num_causalgene_overlap_cnv[3], histnorm='', name='control', autobinx=True, 
                      xbins=dict(start=0, end=15, size=1), marker=dict(color='lightblue'), opacity=0.75 )

data = [trace1, trace2]

layout = go.Layout(title='Number of Causal gene overlapped with CNV in cases vs Controls', 
                   xaxis=dict(title='Value'), yaxis=dict(title='Count'), bargap=0.2, bargroupgap=0.1)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='styled histogram')

In [5]:
from pprint import pprint
import pandas as pd
def get_length_cnv(input_data):
    '''Length (cnv_terminate - cnv_start) of every distinct CNV in every sample.

    input_data is a collection indexed 0..len-1 holding one DataFrame per sample.
    Rows sharing the same (chrom, cnv_start, cnv_terminate) are measured once.

    Returns a dict mapping (sample position, row label) -> CNV length, or an empty
    dict when input_data holds no samples. The per-sample tables in input_data are
    left untouched.
    '''
    cnv_key = ["chrom", "cnv_start", "cnv_terminate"]
    cnv_length = {}
    # Iterate over the argument itself, not over a notebook-level dataset, so that
    # groups of a different size (e.g. controls) are covered completely.
    for i in range(len(input_data)):
        unique_cnv = input_data[i].drop_duplicates(subset=cnv_key)
        cnv_length[i] = unique_cnv["cnv_terminate"] - unique_cnv["cnv_start"]
    if not cnv_length:
        # pd.concat refuses an empty collection.
        return {}
    return dict(pd.concat(cnv_length))

# Pool the CNV lengths over all samples of each group and compare the two medians.
cnv_length_case = list(get_length_cnv(del_sample["case"]).values())
cnv_length_ctrl = list(get_length_cnv(del_sample["ctrl"]).values())
print (np.median(cnv_length_case), np.median(cnv_length_ctrl))

191709.5 181886.0


In [53]:
import plotly
plotly.offline.init_notebook_mode(connected=True)
from plotly.offline import iplot
import plotly.graph_objs as go
%matplotlib inline

# Overlaid density histograms of CNV length (cases vs controls), 40 kb bins up to 1.5 Mb.
# 'lightcoral' replaces 'lightred', which is not a CSS colour name.
trace1 = go.Histogram( x=cnv_length_case, histnorm='density', name='case', autobinx=False, 
                      xbins=dict(start=0, end=1500000, size=40000), marker=dict(color='lightcoral'), opacity=0.75 )

trace2 = go.Histogram( x=cnv_length_ctrl, histnorm='density', name='control', autobinx=False, 
                      xbins=dict(start=0, end=1500000, size=40000), marker=dict(color='lightblue'), opacity=0.75 )

data = [trace1, trace2]

layout = go.Layout(title='CNV length in Cases versus Controls', xaxis=dict(title='CNV length'), 
                   yaxis=dict(title='density'), bargap=0.2, bargroupgap=0.1)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='styled histogram')

In [None]:
# Per-gene statistics for the trial dataset; other cells read its "logp_gene" and "p_value" entries.
# NOTE(review): the meaning of sort=0 is defined in simulation.get_stats - confirm there.
del_stats_table = get_stats(del_sample_gene, sort=0)

In [None]:
# Dump the whole stats table for inspection.
print (del_stats_table)

In [None]:
# Pull out just the p-values from the stats table.
del_pvalue = del_stats_table["p_value"]
print (del_pvalue)

In [None]:
# power for causal genes
# Collect the p-value of every causal gene that is present in the stats table.
overlap_causal_genes = []
overlap_p_value = []
for gene in causal_genes:
    if gene in del_stats_table["logp_gene"]:
        gene_idx = del_stats_table["logp_gene"].index(gene)
        overlap_causal_genes.append(del_stats_table["logp_gene"][gene_idx])
        overlap_p_value.append(del_stats_table["p_value"][gene_idx])
#print (overlap_p_value)
count = 0
for x in overlap_p_value:
    if x<=0.05:
        count += 1
# Divide by the number of causal genes actually found rather than a hard-coded 73,
# which only holds for one particular dataset; NaN when none were found.
print (count/len(overlap_causal_genes) if overlap_causal_genes else float("nan"))

In [None]:
# type I error for noncausal genes
# Genes present in the stats table that are not causal.
noncausal_overlap_genes = set(del_stats_table["logp_gene"]) - set(causal_genes)
print (len(noncausal_overlap_genes))

# p-value of each non-causal gene, taken at the gene's first position in the stats table
noncausal_overlap_genes_pvalue = [
    del_stats_table["p_value"][del_stats_table["logp_gene"].index(gene)]
    for gene in noncausal_overlap_genes
]
# share of the non-causal genes with p-value below 0.05
count_p = sum(1 for x in noncausal_overlap_genes_pvalue if x < 0.05)
print (count_p/len(noncausal_overlap_genes))

In [None]:
# type I error function
def get_typeIerror(stats_table, causal_genes, p = 0.05):
    '''Estimate the type I error rate: share of tested non-causal genes with p-value < p.

    stats_table  -- dict with parallel lists under "genes" and "p_value"
    causal_genes -- causal gene names; every other gene in stats_table is non-causal
    p            -- significance threshold (strict <). The keyword is accepted so that
                    callers passing p=... (get_power_and_typeIerror does) keep working
                    after this cell redefines the function.

    Returns a float in [0, 1], or NaN when stats_table contains no non-causal gene.
    '''
    causal = set(causal_genes)
    # setdefault keeps the FIRST p-value of a repeated gene, which is the entry
    # list.index() would have selected.
    first_p_value = {}
    for gene, p_value in zip(stats_table["genes"], stats_table["p_value"]):
        first_p_value.setdefault(gene, p_value)

    noncausal_overlap_genes_pvalue = [first_p_value[gene] for gene in stats_table["genes"]
                                      if gene not in causal]
    if not noncausal_overlap_genes_pvalue:
        # Nothing to evaluate: report "undefined" instead of dividing by zero.
        return float("nan")
    count = sum(1 for x in noncausal_overlap_genes_pvalue if x < p)
    return count/len(noncausal_overlap_genes_pvalue)

# NOTE(review): in the visible notebook chi2_table is only assigned in a later cell,
# so on a fresh kernel this call needs that cell to have been run first.
get_typeIerror(chi2_table, causal_genes)

In [None]:
from scipy import stats
def test_contingency_table(gene_table, method = "Fisher", option = False): 
    if (method == "Fisher"):
        stats_table = {"genes": get_stats(gene_table)["logp_gene"], "p_value": get_stats(gene_table)["p_value"]}
    else:
#         stats.chi2_contingency(obs, correction=True)
        table = [( stats.chi2_contingency([[row["n_case_gene"], row["n_ctrl_gene"]], 
                                           [row["n_case_nogene"], row["n_ctrl_nogene"]]], correction=option), 
                  row["gene_name"] ) for idx, row in gene_table.iterrows()]
        p_value = [x[0][1] for x in table]
        gene = [x[1] for x in table]
        stats_table = {"p_value": p_value, "genes": gene}
    return stats_table

In [None]:
# Per-gene p-values for the trial dataset from both tests (default method is "Fisher").
fisher_table = test_contingency_table(del_sample_gene)
chi2_table = test_contingency_table(del_sample_gene, method = "chi2")

In [None]:
# power function
def get_power(stats_table, causal_genes, p = 0.05):
    '''Estimate power: share of causal genes present in stats_table with p-value <= p.

    stats_table  -- dict with parallel lists under "genes" and "p_value"
    causal_genes -- causal gene names; names absent from stats_table are ignored
    p            -- significance threshold. The keyword is accepted so that callers
                    passing p=... (get_power_and_typeIerror does) keep working after
                    this cell redefines the function.

    Returns a float in [0, 1], or NaN when no causal gene appears in stats_table.

    NOTE(review): this version counts p-values <= p, whereas the get_power defined in
    the first cell uses a strict <; settle on one convention.
    '''
    # setdefault keeps the FIRST p-value of a repeated gene, which is the entry
    # list.index() would have selected.
    first_p_value = {}
    for gene, p_value in zip(stats_table["genes"], stats_table["p_value"]):
        first_p_value.setdefault(gene, p_value)

    overlap_p_value = [first_p_value[gene] for gene in causal_genes if gene in first_p_value]
    if not overlap_p_value:
        # No causal gene was tested: report "undefined" instead of dividing by zero.
        return float("nan")
    count = sum(1 for x in overlap_p_value if x <= p)
    return count/len(overlap_p_value)

# Power of the chi-square results on the trial dataset.
get_power(chi2_table, causal_genes)