In [1]:
import numpy as np
import pandas as pd
import feather
from pandasql import sqldf
from fisher import pvalue
from scipy import stats
from pprint import pprint

In [2]:
# CNV overlap patterns over 4 adjacent genes (one flag per gene, in order gene1..gene4).
# The first gene is the causal one.
# common
s1 = [0,0,0,0]
s2 = [1,0,0,0] # phenotype gets a constant shift added (const2 in sample_4gene)
s5 = [0,0,0,1]
s6 = [1,1,0,0] # phenotype gets a constant shift added (const6 in sample_4gene)
s8 = [0,0,1,1]
s9 = [1,1,1,0] # phenotype gets a constant shift added (const9 in sample_4gene)
s10 = [0,1,1,1]
s11 = [1,1,1,1] # phenotype gets a constant shift added (const11 in sample_4gene)
# less common
s3 = [0,1,0,0]
s4 = [0,0,1,0]
s7 = [0,1,1,0]
# least common: non-contiguous patterns, each of which can be split into two contiguous ones
s12 = [1,0,1,0] # can be divided into [1,0,0,0] and [0,0,1,0]
s13 = [1,0,0,1] # can be divided into [1,0,0,0] and [0,0,0,1]
s14 = [0,1,0,1] # can be divided into [0,1,0,0] and [0,0,0,1]
s15 = [1,1,0,1] # can be divided into [1,1,0,0] and [0,0,0,1]
s16 = [1,0,1,1] # can be divided into [1,0,0,0] and [0,0,1,1]

In [3]:
def sample_4gene(n1, n2, n3, n4, n5, n6, n7, n8, n9, n10, n11, n12, n13, n14, n15, n16, 
                 const2, const6, const9, const11, divide=0, seed=999):
    """Simulate a binary phenotype plus 4 gene-overlap flags and save it as feather.

    Each individual has one of the module-level patterns ``s1`` .. ``s16``. A
    continuous value is drawn from N(0, 1) per individual, a pattern-specific
    constant is added for some patterns, and the value is dichotomised at the
    median of all values (strictly above the median -> 1, otherwise 0).

    Parameters
    ----------
    n1 .. n16 : int
        Number of individuals carrying pattern ``s1`` .. ``s16`` respectively.
    const2, const6, const9, const11 : float
        Constants added to the N(0, 1) draw for patterns ``s2``, ``s6``, ``s9``
        and ``s11``. ``const2`` is also used for ``s12``, ``s13``, ``s15`` and
        ``s16``; ``s14`` and every other pattern get no shift.
    divide : int, default 0
        If non-zero, every individual with a pattern ``s12`` .. ``s16`` is
        written as two rows (its two component patterns), both rows sharing
        the same drawn value. If 0, the pattern is written unchanged as one row.
    seed : int, default 999
        Passed to ``np.random.seed``; note that this reseeds NumPy's global RNG.

    Returns
    -------
    pandas.DataFrame
        Columns ``phenotype``, ``gene1`` .. ``gene4``, all ``uint8``. Row order
        follows the group order s1, s2, s5, s6, s8, s9, s10, s11, s3, s4, s7,
        then s12 .. s16 (or their split components).

    Side effects
    ------------
    Writes the frame to ``data/toy_4genes_n_{N}.feather`` where ``N`` is
    ``n1 + ... + n16`` (the number of individuals, which is smaller than the
    number of rows when ``divide`` is non-zero). The ``data`` directory is
    created if it does not exist.
    """
    import os  # local import: only needed to make sure the output directory exists

    np.random.seed(seed)
    split = divide != 0

    # (pattern, count, shift) for patterns that are always written as they are.
    # The order fixes both the row order and the order in which random numbers are consumed.
    plain_groups = [
        (s1, n1, 0), (s2, n2, const2), (s5, n5, 0), (s6, n6, const6),
        (s8, n8, 0), (s9, n9, const9), (s10, n10, 0), (s11, n11, const11),
        (s3, n3, 0), (s4, n4, 0), (s7, n7, 0),
    ]
    # (pattern, its two component patterns, count, shift) for the splittable patterns.
    # NOTE(review): s15 splits into s6 + s5 but is shifted by const2, not const6 - confirm this is intended.
    splittable_groups = [
        (s12, (s2, s4), n12, const2),
        (s13, (s2, s5), n13, const2),
        (s14, (s3, s5), n14, 0),
        (s15, (s6, s5), n15, const2),
        (s16, (s2, s8), n16, const2),
    ]

    rows = []
    values = []
    for pattern, count, shift in plain_groups:
        values.extend(np.random.normal(0, 1) + shift for _ in range(count))
        rows.extend([pattern] * count)
    for pattern, parts, count, shift in splittable_groups:
        draws = [np.random.normal(0, 1) + shift for _ in range(count)]
        if split:
            # One block of rows per component; both blocks reuse the same draws so the
            # two rows of a split individual end up with the same phenotype.
            for part in parts:
                rows.extend([part] * count)
                values.extend(draws)
        else:
            rows.extend([pattern] * count)
            values.extend(draws)

    n_genes = len(s1)
    values = np.asarray(values, dtype=float)
    # The median is computed once (it used to be recomputed for every element).
    threshold = np.median(values)
    phenotype = (values > threshold).astype(np.uint8).reshape(-1, 1)
    overlap = np.asarray(rows, dtype=np.uint8).reshape(-1, n_genes)
    sample = np.hstack((phenotype, overlap))

    df = pd.DataFrame(sample, columns = ["phenotype"] + ["gene{}".format(i+1) for i in range(n_genes)])
    df = df.astype(np.uint8)

    os.makedirs("data", exist_ok=True)
    feather.write_dataframe(df, "data/toy_4genes_n_{}.feather"
                            .format(n1+n2+n3+n4+n5+n6+n7+n8+n9+n10+n11+n12+n13+n14+n15+n16))
    return df

In [4]:
# Number of individuals per pattern; the digits in each name are the flags for gene1..gene4.
# 640 individuals have no CNV in this region; the five non-contiguous patterns
# (1010, 1001, 0101, 1101, 1011) get 5 individuals each.
# With divide=1 each of those individuals is written as two rows, e.g. 1010 -> 1000 + 0010 and
# 1101 -> 1100 + 0001, so the table gains 15 extra rows each for 1000 and 0001,
# and 5 extra rows each for 0010, 0100, 1100 and 0011.
n0000 = 640
n1000 = 60
n0001 = 60
n1100 = 80
n0011 = 80
n1110 = 100
n0111 = 100
n1111 = 100
n0100 = 10
n0010 = 10
n0110 = 15
n1010 = 5
n1001 = 5
n0101 = 5
n1101 = 5
n1011 = 5
# Constants added to the N(0,1) draw for the patterns named in the variable
# (passed below as const2 / const6 / const9 / const11).
# The trailing numbers are presumably alternative settings that were tried - TODO confirm.
diff1000 = 1.0 # 0.66
diff1100 = 0.75 # 0.44
diff1110 = 0.50 # 0.22
diff1111 = 0.25
# Simulate the data set; this also writes a feather file under data/ (see sample_4gene).
sample = sample_4gene(n1=n0000, n2=n1000, n5=n0001, n6=n1100, n8=n0011, n9=n1110, n10=n0111, n11=n1111, n3=n0100, 
                      n4=n0010, n7=n0110, n12 = n1010, n13 = n1001, n14 = n0101, n15 = n1101, n16 = n1011, 
                      const2=diff1000, const6=diff1100, const9=diff1110, const11=diff1111, divide=1)

In [5]:
def get_OR(gene_df, fisher=0):
    """Per-gene association between a 0/1 gene flag and the 0/1 phenotype.

    Parameters
    ----------
    gene_df : pandas.DataFrame
        Must contain a ``phenotype`` column (1 = case, 0 = control). Every
        column after the first one is treated as a gene flag, so ``phenotype``
        is expected to be the first column.
    fisher : int, default 0
        If 0, report the odds ratio returned by ``scipy.stats.fisher_exact``
        for the table ``[[gene & case, gene & ctrl], [no gene & case, no gene & ctrl]]``.
        Otherwise report ``-log10`` of the two-tailed p-value from the
        ``fisher`` package's ``pvalue`` for the same four counts.

    Returns
    -------
    dict
        Maps each gene column name to the requested statistic.
    """
    is_case = gene_df["phenotype"] == 1
    is_ctrl = gene_df["phenotype"] == 0
    res_dict = {}
    for gene in list(gene_df)[1:]:
        # Count with boolean masks instead of materialising four filtered frames.
        has_gene = gene_df[gene] == 1
        no_gene = gene_df[gene] == 0
        n_gene_case = int((has_gene & is_case).sum())
        n_nogene_case = int((no_gene & is_case).sum())
        n_gene_ctrl = int((has_gene & is_ctrl).sum())
        n_nogene_ctrl = int((no_gene & is_ctrl).sum())
        if fisher == 0:
            res_dict[gene] = stats.fisher_exact([[n_gene_case, n_gene_ctrl], [n_nogene_case, n_nogene_ctrl]])[0]
        else:
            # Only the p-value is wanted here, so scipy's exact test is not run at all.
            res_dict[gene] = -np.log10(pvalue(n_gene_case, n_gene_ctrl, n_nogene_case, n_nogene_ctrl).two_tail)
    return res_dict

In [6]:
# First the per-gene odds ratios (fisher=0), then the per-gene -log10 two-tailed
# Fisher p-values (fisher=1).
for fisher_flag in (0, 1):
    pprint(get_OR(sample, fisher=fisher_flag))

{'gene1': 2.3031590024993891,
 'gene2': 1.3592490912326189,
 'gene3': 0.87626626606305447,
 'gene4': 0.75974025974025972}
{'gene1': 5.4934888127143431,
 'gene2': 1.9743215672164185,
 'gene3': 0.54557255989613296,
 'gene4': 1.5210780539773865}
