# Toy example: multiple genes and multiple CNVs in one region
## One or more causal genes

In [1]:
import numpy as np
import pandas as pd
import feather
from pandasql import sqldf
from fisher import pvalue
from scipy import stats
import sys

In [2]:
def toy_multicnv_multicausal(p, n_cnv, causal, n_geom=10000, n_gene=10, const1=0.5, const2=0.05, seed=12):
    '''Simulate a toy case/control table for one region with several genes and CNVs.

    Every row of the result describes one CNV as a run of consecutive 1s over the
    genes it overlaps (CNVs of different rows may overlap each other). A latent
    score is computed per row as

        N(0, 1) + const2 * (number of genes covered)
                + const1 * (number of causal genes covered)

    and rows whose score is at or above the median score get phenotype 1, the
    others phenotype 0.

    "p" is the probability parameter of the geometric distribution. "n_geom"
    geometric variables are drawn; each minus 1 is a candidate CNV length (number
    of genes covered). Candidates longer than "n_gene" are discarded and "n_cnv"
    lengths are sampled from the rest without replacement.
    "n_cnv" is the number of CNVs (rows) to simulate.
    "causal" is a list of 1-based positions of causal genes (one or more). An
    empty list simulates a region without any causal gene.
    "n_gene" is the number of genes this region harbors, which is also the
    maximum CNV length.
    "const1" is the score increment per covered causal gene, "const2" the score
    increment per covered gene.
    "seed" seeds numpy's global random state.

    Returns (df, counts): df has columns "phenotype", "gene1".."gene<n_gene>"
    stored as uint8; counts is the number of rows per distinct
    (gene pattern, phenotype) combination. df is also written to a feather file
    under "data/".

    On invalid input a message is printed and sys.exit() is called (SystemExit).
    '''
    np.random.seed(seed)
    if n_cnv > n_geom:
        print("The number of CNV is larger than the number of geometric numbers")
        sys.exit()
    geom_minus_1 = np.random.geometric(p, size=n_geom) - 1
    eligible_len = geom_minus_1[geom_minus_1 <= n_gene]
    if n_cnv > len(eligible_len):
        print("Cannot take a larger sample than population when 'replace=False'")
        sys.exit()
    # Causal positions are 1-based gene numbers; check them once, up front,
    # instead of once per simulated row.
    if len(causal) > 0 and (max(causal) > n_gene or min(causal) <= 0):
        if len(causal) == 1:
            print("Single causal CNV index out of range")
        else:
            print("Causal CNV index out of range")
        sys.exit()

    cnv_len = np.random.choice(eligible_len, n_cnv, replace=False)
    # A zero-length CNV covers nothing, so no start position is drawn for it
    # (this keeps the random stream identical for a given seed).
    cnv_start_pos = [np.random.choice(range(n_gene + 1 - i)) if i != 0 else 0 for i in cnv_len]
    patterns = [[0] * start + [1] * length + [0] * (n_gene - length - start)
                for start, length in zip(cnv_start_pos, cnv_len)]

    scores = []
    for ptn in patterns:
        # Gaussian noise plus a small burden effect per covered gene.
        score = np.random.normal(0, 1) + ptn.count(1) * const2
        n_causal_hit = sum(ptn[pos - 1] for pos in causal)
        if n_causal_hit:
            score = score + const1 * n_causal_hit
        scores.append(score)

    # The median is a property of the whole sample: compute it once.
    threshold = np.median(scores)
    rows = [[1 if score >= threshold else 0] + ptn for score, ptn in zip(scores, patterns)]

    gene_cols = ["gene{}".format(i + 1) for i in range(n_gene)]
    df = pd.DataFrame(np.array(rows), columns=["phenotype"] + gene_cols)
    counts = df.groupby(gene_cols + ["phenotype"]).size()

    df = df.astype(np.uint8)
    feather.write_dataframe(df, "data/toy_n{}_p{}_causal{}_const{}_{}.feather"
                                .format(len(cnv_len), p, causal, const1, const2))
    return df, counts

In [9]:
# Simulate 5000 CNVs over a 15-gene region with genes 1 and 15 causal and no
# per-gene burden effect (const2=0), then report how many rows are cases.
df, counts = toy_multicnv_multicausal(
    p=0.5,
    n_cnv=5000,
    causal=[1, 15],
    n_gene=15,
    const1=0.45,
    const2=0,
)
print(int((df["phenotype"] == 1).sum()))

2500


In [10]:
# Pass the simulated data frame to the project helper run_dap_lite.
# NOTE(review): fileout is presumably where the helper writes its result and
# grid its prior grid -- confirm against utils.run_dap_lite.
from utils import run_dap_lite
fileout = "data/toy_multi_causal_10.dap"
run_dap_lite(df, fileout, grid=[(0, -9.2)])

2017-09-05 18:23:23.243223
2017-09-05 18:23:38.362238


In [5]:
def get_OR(gene_df, fisher=1):
    '''Per-gene association statistic between CNV overlap and phenotype.

    "gene_df" must have a "phenotype" column (1 = case, 0 = control) as its
    first column; every remaining column is treated as a 0/1 gene indicator.
    For each gene a 2x2 table is built:

                    case            control
        gene == 1   n_gene_case     n_gene_ctrl
        gene == 0   n_nogene_case   n_nogene_ctrl

    "fisher" selects the statistic. With fisher == 0 the sample odds ratio from
    scipy.stats.fisher_exact is returned. With any other value (the default)
    the result is -log10 of the two-tailed Fisher exact p-value computed by
    fisher.pvalue -- so despite the function name, the default output is NOT an
    odds ratio.

    Returns a dict mapping gene column name to the chosen statistic.
    '''
    is_case = gene_df["phenotype"] == 1
    is_ctrl = gene_df["phenotype"] == 0
    res_dict = {}
    for gene in list(gene_df)[1:]:
        has_gene = gene_df[gene] == 1
        no_gene = gene_df[gene] == 0
        # Plain Python ints, as produced by len() in the table counts before.
        n_gene_case = int((is_case & has_gene).sum())
        n_nogene_case = int((is_case & no_gene).sum())
        n_gene_ctrl = int((is_ctrl & has_gene).sum())
        n_nogene_ctrl = int((is_ctrl & no_gene).sum())
        # Only compute the statistic that is actually returned.
        if fisher == 0:
            res_dict[gene] = stats.fisher_exact([[n_gene_case, n_gene_ctrl],
                                                 [n_nogene_case, n_nogene_ctrl]])[0]
        else:
            res_dict[gene] = -np.log10(
                pvalue(n_gene_case, n_gene_ctrl, n_nogene_case, n_nogene_ctrl).two_tail)
    return res_dict

In [6]:
# With the default fisher=1 this displays -log10 two-tailed Fisher p-values
# per gene, not odds ratios.
get_OR(df)

{'gene1': 2.8540999492546013,
 'gene10': 0.87420960321337415,
 'gene11': 0.50320491970398695,
 'gene12': 2.2349389530648245,
 'gene13': 2.8589496807756709,
 'gene14': 1.8407079742770978,
 'gene15': 3.8622152477490674,
 'gene2': 0.11461436729841877,
 'gene3': 0.57561227797355863,
 'gene4': 0.42357723981291229,
 'gene5': 0.3815067971916592,
 'gene6': 1.559468397448611,
 'gene7': 0.018978894194349533,
 'gene8': 0.94002873923930341,
 'gene9': 0.83693813473959255}

In [17]:
# Print each column of df together with its positional index.
for idx, item in enumerate(df.columns.values):
    print (idx, item)
# Scratch check of the list expression used to build CNV patterns:
# a zero-length prefix contributes nothing, leaving five 1s.
[0]*0+[1]*5

0 phenotype
1 gene1
2 gene2
3 gene3
4 gene4
5 gene5
6 gene6
7 gene7
8 gene8
9 gene9
10 gene10


[1, 1, 1, 1, 1]

In [10]:
# Scratch check that enumerate() yields (index, value) pairs, i.e. that the
# value i at index j is the same object as startp[j].
startp = [0, 6, 8, 9, 1, 4, 3]
for j, i in enumerate(startp):
    print("{} {} {}".format(j, i, startp[j]))

0 0 0
1 6 6
2 8 8
3 9 9
4 1 1
5 4 4
6 3 3


In [7]:
# Dump the simulated phenotype/gene table as CSV (the "data" directory must exist).
df.to_csv("data/df.csv")

In [8]:
# Dump the per-(gene pattern, phenotype) row counts as CSV.
counts.to_csv("data/counts.csv")

In [121]:
# Earlier single-causal-gene variant of "toy_multicnv_multicausal". The
# running-median and start-offset bugs it used to have are fixed here as well.
def toy_multi_gene_cnv(p, n_cnv, n_max=10, causal=5, const1=1, const2=0.1, seed=1):
    '''A region with multiple genes and CNVs. CNVs may overlap or not overlap. Only one causal gene.

    "p" is the probability parameter of the geometric distribution; "n_cnv"
    geometric variables are drawn and each minus 1 is a CNV length (number of
    genes covered). Lengths above "n_max" are dropped, so the result may have
    fewer than "n_cnv" rows.
    "n_max" is the number of genes in the region.
    "causal" is the 1-based position of the single causal gene.
    "const1" is added to the latent score of a CNV covering the causal gene,
    "const2" is added once per covered gene.
    "seed" seeds numpy's global random state.

    Rows whose latent score is at or above the median score of all rows get
    phenotype 1, the others phenotype 0.

    Returns (df, counts): df has columns "phenotype", "gene1".."gene<n_max>"
    stored as uint8; counts is the number of rows per distinct
    (gene pattern, phenotype) combination. df is also written to a feather file
    under "data/".

    Raises IndexError if "causal" is not within 1..n_max.
    '''
    if not 1 <= causal <= n_max:
        raise IndexError("causal gene position {} is outside 1..{}".format(causal, n_max))
    np.random.seed(seed)
    cnv_len = np.random.choice((np.random.geometric(p, size=n_cnv) - 1), n_cnv, replace=False)
    # Keep only CNVs that fit in the region (was hard-coded to 10).
    cnv_len = cnv_len[cnv_len <= n_max].tolist()
    # Start positions are 0-based; a zero-length CNV needs no random draw.
    start_pos = [np.random.choice(range(n_max + 1 - i)) if i != 0 else 0 for i in cnv_len]
    patterns = [[0] * start + [1] * length + [0] * (n_max - length - start)
                for start, length in zip(start_pos, cnv_len)]

    scores = []
    for ptn in patterns:
        score = np.random.normal(0, 1)
        if ptn[causal - 1] == 1:
            score = score + const1
        # the longer the CNV is, the less common it is, and larger OR
        score = score + ptn.count(1) * const2
        scores.append(score)

    # Threshold against the median of ALL final scores, not a running median
    # of the rows seen so far.
    threshold = np.median(scores)
    rows = [[1 if score >= threshold else 0] + ptn for score, ptn in zip(scores, patterns)]

    gene_cols = ["gene{}".format(i + 1) for i in range(n_max)]
    df = pd.DataFrame(np.array(rows), columns=["phenotype"] + gene_cols)
    counts = df.groupby(gene_cols + ["phenotype"]).size()
    df = df.astype(np.uint8)
    feather.write_dataframe(df, "data/toy_n{}_p{}_causal{}_const{}_{}.feather"
                                .format(len(cnv_len), p, causal, const1, const2))
    return df, counts

In [123]:
# Small run of the single-causal-gene simulator (20 geometric draws, default
# causal gene 5); the results are kept under separate names from df/counts.
df1, counts1 = toy_multi_gene_cnv(p=0.6, n_cnv=20, const1=1.0, const2=0.05)
# print (df["phenotype"].tolist())