In [1]:
import numpy as np
import pandas as pd
import feather
from pandasql import sqldf
from fisher import pvalue
from scipy import stats
from pprint import pprint

In [2]:
# Indicator patterns over 4 adjacent genes, one 0/1 entry per gene in the
# order [gene1, gene2, gene3, gene4]. By default the first gene is the causal one.

# common patterns
# -- gene1 == 0
s1, s5, s8, s10 = [0, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 1], [0, 1, 1, 1]
# -- gene1 == 1: sample_4gene adds `const` to these groups when gene1 is causal
s2, s6, s9, s11 = [1, 0, 0, 0], [1, 1, 0, 0], [1, 1, 1, 0], [1, 1, 1, 1]

# less common patterns (gene1 == 0, but gene2 and/or gene3 set without gene4)
s3, s4, s7 = [0, 1, 0, 0], [0, 0, 1, 0], [0, 1, 1, 0]

In [3]:
def sample_4gene(n1, n2, n3, n4, n5, n6, n7, n8, n9, n10, n11, const, causal_2nd = 0, seed=999):
    """Simulate a 4-gene toy data set with a binary phenotype and save it as feather.

    Every sample gets one of the module-level gene patterns ``s1`` .. ``s11``
    and a continuous value drawn from N(0, 1). Samples whose pattern has the
    causal gene set to 1 get ``const`` added to that value. The phenotype is 1
    for samples whose value is strictly above the median of all values, else 0.

    Parameters
    ----------
    n1, ..., n11 : int
        Number of samples carrying pattern ``s1`` .. ``s11`` respectively.
    const : float
        Shift added to the drawn value of samples whose causal gene is 1.
    causal_2nd : int, default 0
        0 makes gene1 the causal gene; any other value makes gene2 causal.
        ``causal_2nd + 1`` is also used verbatim in the output file name.
    seed : int, default 999
        Seed passed to ``np.random.seed`` before any value is drawn.

    Returns
    -------
    df : pandas.DataFrame
        One row per sample with uint8 columns ``phenotype``, ``gene1`` ..
        ``gene4``. Rows are ordered by pattern group:
        s1, s2, s5, s6, s8, s9, s10, s11, s3, s4, s7.
    counts : pandas.Series
        Number of samples per (gene1, gene2, gene3, gene4, phenotype) combination.

    Raises
    ------
    ValueError
        If all group sizes are zero, so there is nothing to simulate.

    Side effects
    ------------
    Reseeds NumPy's global random state and writes ``df`` to
    ``data/toy_4genes_n<total>_causal<causal_2nd+1>.feather`` (relative to the
    working directory), creating the ``data`` folder if it does not exist.
    """
    import os  # local import: only needed to make sure the output folder exists

    # Fixed group order: both the rows and the random draws follow it.
    groups = [(s1, n1), (s2, n2), (s5, n5), (s6, n6), (s8, n8), (s9, n9),
              (s10, n10), (s11, n11), (s3, n3), (s4, n4), (s7, n7)]
    rows = [pattern for pattern, size in groups for _ in range(size)]
    if not rows:
        raise ValueError("sample_4gene needs at least one sample, but every group size is 0")

    def _draw_values(causal_col):
        # One N(0, 1) draw per sample in group order; groups whose pattern has
        # the causal gene set are shifted by `const`.
        values = []
        for pattern, size in groups:
            shifted = pattern[causal_col] == 1
            for _ in range(size):
                noise = np.random.normal(0, 1)
                values.append(noise + const if shifted else noise)
        return values

    np.random.seed(seed)
    config = _draw_values(0)
    if not causal_2nd == 0:
        # The gene1 pass above is deliberately kept even though it is replaced
        # here: it advances the random stream, so seeded results stay the same
        # as the data sets this function has produced before.
        config = _draw_values(1)

    # Compute the median once instead of once per sample.
    threshold = np.median(config)
    phenotype = np.array([[1] if value > threshold else [0] for value in config])
    overlap = np.array(rows)
    sample = np.concatenate((phenotype, overlap), axis=1)

    gene_names = ["gene{}".format(i+1) for i in range(sample.shape[1]-1)]
    df = pd.DataFrame(sample, columns = ["phenotype"] + gene_names)
    # Counts are taken before the dtype cast, as before.
    counts = df.groupby(["gene1","gene2","gene3","gene4","phenotype"]).size()
    df = df.astype(np.uint8)

    os.makedirs("data", exist_ok=True)
    feather.write_dataframe(df, "data/toy_4genes_n{}_causal{}.feather"
                            .format(n1+n2+n3+n4+n5+n6+n7+n8+n9+n10+n11, causal_2nd+1))
    return df, counts

In [4]:
# Group sizes, named after the gene pattern they count (gene1 gene2 gene3 gene4).
# Author's note: 1010 can be divided into 1000 and 0010, 1101 can be divided
# into 1100 and 0001. So we add 25 for 1000 and 0001, 5 for 0010 and 0100,
# 10 for 1100 and 0011.

# common patterns
n0000 = 200
n1000 = 100
n0001 = 100
n1100 = 150
n0011 = 150
n1110 = 200
n0111 = 200
n1111 = 150

# less common patterns
n0100 = 20
n0010 = 20
n0110 = 30

### a special case
# n0000 = 1000; n1000 = 0; n0001 = 0; n1100 = 500; n0011 = 500; n1110 = 0; n0111 = 0; n1111 = 0
# n0100 = 0; n0010 = 0; n0110 = 0

# Constant handed to sample_4gene as `const`.
diff = 1.0

# Same group sizes for both runs; only the causal gene differs.
# causal_2nd=0 -> gene1 is causal
sample, counts = sample_4gene(
    n1=n0000, n2=n1000, n3=n0100, n4=n0010, n5=n0001, n6=n1100,
    n7=n0110, n8=n0011, n9=n1110, n10=n0111, n11=n1111,
    const=diff, causal_2nd=0,
)
# causal_2nd=1 -> gene2 is causal
sample_2, counts_2 = sample_4gene(
    n1=n0000, n2=n1000, n3=n0100, n4=n0010, n5=n0001, n6=n1100,
    n7=n0110, n8=n0011, n9=n1110, n10=n0111, n11=n1111,
    const=diff, causal_2nd=1,
)

In [5]:
def get_OR(gene_df, fisher=0):
    """Compute a per-gene association statistic between gene status and phenotype.

    For every column other than ``phenotype`` a 2x2 table is built from the
    rows where the gene is exactly 1 or exactly 0 and the phenotype is exactly
    1 (cases) or exactly 0 (controls); rows with any other value are ignored.

    Parameters
    ----------
    gene_df : pandas.DataFrame
        Must contain a ``phenotype`` column; every other column is treated as
        a gene. The position of ``phenotype`` among the columns does not matter.
    fisher : int, default 0
        0 returns the odds ratio reported by ``scipy.stats.fisher_exact``.
        Any other value returns ``-log10`` of the two-tailed p-value from
        ``fisher.pvalue`` instead (despite the function's name).

    Returns
    -------
    dict
        Maps each gene column name to its statistic, in column order.
    """
    cases = gene_df[gene_df["phenotype"]==1]
    ctrls = gene_df[gene_df["phenotype"]==0]
    # Select genes by name rather than by position, so a frame whose first
    # column is not "phenotype" neither loses a gene nor tests the phenotype
    # against itself.
    genes = [col for col in gene_df.columns if col != "phenotype"]

    res_dict = {}
    for gene in genes:
        n_gene_case = int((cases[gene]==1).sum())
        n_nogene_case = int((cases[gene]==0).sum())
        n_gene_ctrl = int((ctrls[gene]==1).sum())
        n_nogene_ctrl = int((ctrls[gene]==0).sum())
        # Only compute the statistic that is actually returned.
        if fisher==0:
            table = [[n_gene_case, n_gene_ctrl], [n_nogene_case, n_nogene_ctrl]]
            res_dict[gene] = stats.fisher_exact(table)[0]
        else:
            p_two_tail = pvalue(n_gene_case, n_gene_ctrl, n_nogene_case, n_nogene_ctrl).two_tail
            res_dict[gene] = -np.log10(p_two_tail)
    return res_dict

In [6]:
# Per-gene odds ratios for the data set simulated with causal_2nd=0.
pprint(get_OR(sample, fisher=0))

{'gene1': 4.5364635364635362,
 'gene2': 2.144790718835305,
 'gene3': 1.1176051318602993,
 'gene4': 0.5967530767216549}


In [7]:
# Per-gene odds ratios for the data set simulated with causal_2nd=1.
pprint(get_OR(sample_2, fisher=0))

{'gene1': 2.1814345991561179,
 'gene2': 4.977072310405644,
 'gene3': 2.3473633648058363,
 'gene4': 1.0373482726423904}
