In [1]:
import numpy as np
import pandas as pd
import feather
from pandasql import sqldf
from fisher import pvalue
from scipy import stats
from pprint import pprint

In [2]:
def simu_toy(n_pos, n_max, causal, n=50, const=1, seed=999):
    ptn_ls = []
    if n_pos <= n_max:
        ptn_keys = ["p{}".format(i+1) for i in range(n_pos*2)]
        for i in range(n_pos):
            ptn = [1]*(i+1) + [0]*(n_pos-i-1)
            ptn_ls.append(ptn)
        for i in range(n_pos, n_pos*2):
            ptn = [0]*(i-n_pos+1) + [1]*(2*n_pos-i-1)
            ptn_ls.append(ptn)
    else:
        ptn_keys = ["p{}".format(i+1) for i in range(n_pos+n_max)]
        for i in range(n_max):
            ptn = [1]*(i+1) + [0]*(n_pos-i-1)
            ptn_ls.append(ptn)
        for i in range(n_max, n_pos):
            ptn = [0]*(i-n_max+1) + [1]*n_max + [0]*(n_pos-i-1)
            ptn_ls.append(ptn)
        for i in range(n_pos, n_pos+n_max):
            ptn = [0]*(i-n_max+1) + [1]*(n_pos+n_max-i-1)
            ptn_ls.append(ptn)

    ptn_dict = dict(zip(ptn_keys, ptn_ls))
    n_ls = [n]*len(ptn_keys)
    n_dict = dict(zip(ptn_keys, n_ls))
    sample = sum([[ptn_dict[i]] * n_dict[i] for i in n_dict.keys()], [])
    
    np.random.seed(seed)    
    mat = [[np.random.normal(0,1)] + line for line in sample]
    config = []
    for line in mat:
        if line[causal] == 1:
            line[0] = line[0] + const
            config.append(line[0])
        else:
            config.append(line[0])
        if line[0] > np.median(config):
            line[0] = 1
        else:
            line[0] = 0
    mat = np.matrix(mat)
    df = pd.DataFrame(mat, columns = ["phenotype"] + ["pos{}".format(i+1) for i in range(mat.shape[1]-1)])
    counts = df.groupby(["pos{}".format(i+1) for i in range(mat.shape[1]-1)] + ["phenotype"]).size()
    for col in df.columns:
        df[col] = df[col].astype(np.uint8)
    output = feather.write_dataframe(df, "data/toy_pos{}_causal{}_n{}_const{}.feather"
                                     .format(n_pos, causal, sum(n_ls), const))
    return df, counts

In [3]:
df, counts = simu_toy(n_pos=15, n_max=10, causal=8, const=1, seed=999)
# pprint (counts)

In [4]:
def get_OR(gene_df, fisher=0):
    cases = gene_df[gene_df["phenotype"]==1]
    ctrls = gene_df[gene_df["phenotype"]==0]
    res_dict = {}
    for gene in list(gene_df)[1:]:
        n_gene_case = len(cases[cases[gene]==1])
        n_nogene_case = len(cases[cases[gene]==0])
        n_gene_ctrl = len(ctrls[ctrls[gene]==1])
        n_nogene_ctrl = len(ctrls[ctrls[gene]==0])
#         print (n_gene_case, n_nogene_case, n_gene_ctrl, n_nogene_ctrl)
        odds_ratio = stats.fisher_exact([[n_gene_case, n_gene_ctrl], [n_nogene_case, n_nogene_ctrl]])[0]
        res_dict[gene] = odds_ratio
        if not fisher==0:
            logp = -np.log10(pvalue(n_gene_case, n_gene_ctrl, n_nogene_case, n_nogene_ctrl).two_tail)
            res_dict[gene] = logp
    return res_dict

In [5]:
get_OR(df)

{'pos1': 0.82739171869226136,
 'pos10': 2.7426045562733763,
 'pos11': 2.1569170604529901,
 'pos12': 1.6610425585494837,
 'pos13': 1.234884163455592,
 'pos14': 1.0242954324586977,
 'pos15': 0.74338345447483767,
 'pos2': 1.0380484411010131,
 'pos3': 1.2515271289974848,
 'pos4': 1.6837920144371756,
 'pos5': 1.8783783783783783,
 'pos6': 2.2491644385026737,
 'pos7': 3.5175228462425694,
 'pos8': 4.6365581047114821,
 'pos9': 3.7924167521686409}