# Toy example of 3-gene configuration overlapped with one CNV

In [1]:
import numpy as np
import pandas as pd
import feather
from pandasql import sqldf
from fisher import pvalue
from scipy import stats
from pprint import pprint

In [2]:
# Each pattern is [gene1, gene2, gene3]; presumably 1 means the CNV overlaps that gene (confirm).
# The first gene is causal; the extreme situations include [1,1,1] and [0,0,0].
s1 = [0,0,0]
s2 = [1,0,0] # gene1 is hit: a constant must be added in config
s3 = [0,1,0]
s4 = [0,0,1]
s5 = [1,1,0] # gene1 is hit: a constant must be added in config
s6 = [0,1,1]
s7 = [1,1,1] # gene1 is hit: a constant must be added in config
# [1,0,1] is also possible, but it contains two CNVs, so it can be separated into [1,0,0] and [0,0,1]

In [3]:
def sample_3gene(n1, n2, n3, n4, n5, n6, n7, const, seed=999):
    """Simulate a 3-gene toy data set and write it to a feather file.

    For every sample a standard-normal value is drawn; samples whose pattern
    has gene1 set (s2, s5, s7) additionally get ``const`` added. The phenotype
    is 1 when that value is strictly above the median over all samples, else 0.

    Parameters
    ----------
    n1, ..., n7 : int
        Number of samples to generate with pattern ``s1`` ... ``s7``
        (module-level lists).
    const : float
        Shift added to the normal draw of the gene1-carrying groups.
    seed : int, optional
        Seed passed to ``np.random.seed`` (this reseeds NumPy's global RNG).

    Returns
    -------
    df : pandas.DataFrame
        Columns ``phenotype, gene1, gene2, gene3`` as ``uint8``; rows are
        ordered s7, s1, s2, s3, s4, s5, s6.
    counts : pandas.Series
        Number of samples per (gene1, gene2, gene3, phenotype) combination.

    Side effects
    ------------
    Writes ``df`` to ``data/toy_3genes_n_<total>.feather``.
    """
    np.random.seed(seed)

    # (pattern, group size, shifted by const?) in output row order. The random
    # draws are made in exactly this order so results stay reproducible.
    groups = [
        (s7, n7, True),
        (s1, n1, False),
        (s2, n2, True),
        (s3, n3, False),
        (s4, n4, False),
        (s5, n5, True),
        (s6, n6, False),
    ]

    rows = []
    values = []
    for pattern, size, shifted in groups:
        rows.extend([pattern] * size)
        for _ in range(size):
            draw = np.random.normal(0, 1)
            values.append(draw + const if shifted else draw)

    # Compute the median once instead of once per sample.
    threshold = np.median(values)
    phenotype = (np.asarray(values) > threshold).astype(int).reshape(-1, 1)

    overlap = np.array(rows)
    sample = np.concatenate((phenotype, overlap), axis=1)
    columns = ["phenotype"] + ["gene{}".format(i + 1) for i in range(sample.shape[1] - 1)]
    df = pd.DataFrame(sample, columns=columns)

    # Count before the uint8 cast so the index of `counts` keeps the default int dtype.
    counts = df.groupby(["gene1", "gene2", "gene3", "phenotype"]).size()
    df = df.astype(np.uint8)

    total = n1 + n2 + n3 + n4 + n5 + n6 + n7
    feather.write_dataframe(df, "data/toy_3genes_n_{}.feather".format(total))
    return df, counts

In [4]:
# Number of samples to simulate per gene-overlap pattern; the three digits in
# each name match the [gene1, gene2, gene3] pattern the count is passed for.
n000 = 600
n100 = 105
n010 = 100
n001 = 105
n110 = 100
n011 = 100
n111 = 100
# Passed as `const` to sample_3gene.
diff = 1.0 # 0.66 (presumably an alternative value tried earlier -- confirm)
# Run the simulation with the default seed; this also writes the feather file.
sample, counts = sample_3gene(n1=n000, n2=n100, n3=n010, n4=n001, n5=n110, n6=n011, n7=n111, const=diff)

In [8]:
# Show how many samples fall into each (gene1, gene2, gene3, phenotype) combination.
print(counts)

gene1  gene2  gene3  phenotype
0      0      0      0            362
                     1            238
              1      0             61
                     1             44
       1      0      0             59
                     1             41
              1      0             62
                     1             38
1      0      0      0             22
                     1             83
       1      0      0             22
                     1             78
              1      0             17
                     1             83
dtype: int64


In [5]:
def get_OR(gene_df, fisher=0):
    """Compute a per-gene association statistic against the phenotype.

    For every column other than ``"phenotype"`` a 2x2 table
    ``[[gene & case, gene & control], [no gene & case, no gene & control]]``
    is built, where case/control means ``phenotype == 1`` / ``== 0`` and
    gene/no gene means the column equals 1 / 0. Rows with any other value are
    not counted.

    Parameters
    ----------
    gene_df : pandas.DataFrame
        Must contain a ``"phenotype"`` column; every other column is treated
        as a gene indicator, regardless of column order.
    fisher : int, optional
        ``0`` (default) returns the odds ratio from ``stats.fisher_exact``;
        any other value returns ``-log10`` of the two-tailed Fisher exact
        p-value from ``fisher.pvalue``.

    Returns
    -------
    dict
        Maps each gene column name to its odds ratio or ``-log10(p)``.
    """
    is_case = gene_df["phenotype"] == 1
    is_ctrl = gene_df["phenotype"] == 0

    res_dict = {}
    for gene in gene_df.columns:
        # Select genes by name, not by position, so column order does not matter.
        if gene == "phenotype":
            continue

        has_gene = gene_df[gene] == 1
        no_gene = gene_df[gene] == 0
        n_gene_case = int((has_gene & is_case).sum())
        n_nogene_case = int((no_gene & is_case).sum())
        n_gene_ctrl = int((has_gene & is_ctrl).sum())
        n_nogene_ctrl = int((no_gene & is_ctrl).sum())

        if fisher != 0:
            # Only the p-value is needed here; skip the odds-ratio computation.
            p_two_tail = pvalue(n_gene_case, n_gene_ctrl, n_nogene_case, n_nogene_ctrl).two_tail
            res_dict[gene] = -np.log10(p_two_tail)
        else:
            table = [[n_gene_case, n_gene_ctrl], [n_nogene_case, n_nogene_ctrl]]
            res_dict[gene] = stats.fisher_exact(table)[0]
    return res_dict

In [6]:
# Per-gene odds ratios for the simulated data (fisher=0 selects odds ratios, not -log10 p-values).
get_OR(sample, fisher=0)

{'gene1': 6.0277008310249309,
 'gene2': 1.8287671232876712,
 'gene3': 1.2455357142857142}