# Toy example for multi-genes and multi-CNVs in a region
## One or more causal genes

In [1]:
import numpy as np
import pandas as pd
import feather
from pandasql import sqldf
from fisher import pvalue
from scipy import stats

In [7]:
def toy_multi_gene_cnv(p, n_cnv, n_max=10, causal=5, const1=1, const2=0.1, seed=1):
    '''A region with multiple genes and CNVs. CNVs may overlap or not overlap. Only one causal gene.'''
    np.random.seed(seed)
    cnv_len = np.random.choice((np.random.geometric(p, size=n_cnv) - 1), n_cnv, replace=False)
    cnv_len = cnv_len[cnv_len <= 10].tolist()
    start_pos = [np.random.choice(range(n_max+1-i)) if i!=0 else -1 for i in cnv_len]
    ptn_ls = []
    for j,i in enumerate(start_pos):
        if i == -1:
            ptn = [0]*n_max
        elif i == 0:
            ptn = [0]*(i-1) + [1]*cnv_len[j] + [0]*(n_max-cnv_len[j]-i)
        else:
            ptn = [0]*(i-1) + [1]*cnv_len[j] + [0]*(n_max-cnv_len[j]-(i-1))
        ptn_ls.append(ptn)
    mat = [[np.random.normal(0,1)] + line for line in ptn_ls]
    config = []
    for line in mat:
##         config.append(line[0]+const1) if line[causal-1] == 1 else config.append(line[0])
        if line[causal] == 1:
            line[0] = line[0] + const1
        config.append(line[0])
        
#       the longer the CNV is, the less common it is, and larger OR
        line[0] = line[0] + line[1:].count(1)*const2
        
##         line[0] = 1 if line[0] > np.median(config) else 0
        if line[0] >= np.median(config):
            print (np.median(config))
            line[0] = 1
        else:
            line[0] = 0
        
    mat = np.matrix(mat)
    df = pd.DataFrame(mat, columns = ["phenotype"] + ["gene{}".format(i+1) for i in range(mat.shape[1]-1)])
    counts = df.groupby(["gene{}".format(i+1) for i in range(mat.shape[1]-1)] + ["phenotype"]).size()
    for col in df.columns:
        df[col] = df[col].astype(np.uint8)
    output = feather.write_dataframe(df, "data/toy_n{}_p{}_causal{}_const{}_{}.feather"
                                     .format(len(cnv_len), p, causal, const1, const2))
    return df, counts

In [8]:
df, counts = toy_multi_gene_cnv(p=0.6, n_cnv=20, const1=1.0, const2=0.05)
# print (df["phenotype"].tolist())

-1.30486124098
-0.84271813821
-0.743627010285
-0.590374392413
-0.43712177454
-0.431785933517
-0.426450092493
-0.403512563966
-0.403512563966
-0.403512563966
-0.380575035439
-0.380575035439
-0.380575035439
-0.375016259386
-0.369457483333
-0.328278688388


In [58]:
max(df["phenotype"].tolist())

1

In [86]:
causal = [10,12]
line = [1,0,0,0,0,1,1,0,0,7]
if min(causal) > len(line)-1:
    print (0)
elif any(line[x] for x in causal):
    print (1)
else: print (0)
line[-1]=3
line

0


[1, 0, 0, 0, 0, 1, 1, 0, 0, 3]

In [115]:
def toy_multicnv_multicausal(p, n_cnv, causal, n_geom=5000, n_gene=10, const1=0.5, const2=0.05, seed=12):
    '''A region with multiple genes and CNVs. CNVs probably overlap. At least two causal genes.
    "causal" is a list of positions of causal genes.
    "n_gene" is the number of genes this region harbors.
    "n_cnv" is the number of CNV in this region. "n_geom" is the number of geometric variables generated.
    "p" is the probability of geometric dist, then randomly generate a certain number of geometric variables.
    I use these variables minus 1 as the length of CNVs, which means the number of genes that a CNV overlap with.
    The maximum length for CNV in this region is overlapping with 10 genes'''
    np.random.seed(seed)
    geom_minus_1 = np.random.geometric(p, size=n_geom) - 1
    cnv_len = np.random.choice(geom_minus_1[geom_minus_1 <= 10], n_cnv, replace=False)
    cnv_start_pos = [np.random.choice(range(n_gene+1-i)) if i!=0 else -1 for i in cnv_len]
    ptn_ls = []
    for j,i in enumerate(cnv_start_pos):
    ## j is the index of i in cnv start position list, so cnv_len[j] is the length of the corresponding cnv;
    ## i is the corresponding cnv start position
        if i == -1:
            ptn = [0]*n_gene
        elif i == 0:
            ptn = [0]*(i - 1) + [1]*cnv_len[j] + [0]*(n_gene - cnv_len[j] - i)
        else:
            ptn = [0]*(i - 1) + [1]*cnv_len[j] + [0]*(n_gene - cnv_len[j] - (i-1))
        ptn_ls.append(ptn)
    
    mat = [[np.random.normal(0,1)] + line for line in ptn_ls]
    config = []
    for line in mat:
        line[0] = line[0] + line[1:].count(1)*const2
        config.append(line[0])
        if min(causal) > len(line[1:])-1:
            continue
        elif any(line[x] for x in causal):
            line[0] = line[0] + const1*[line[x] for x in causal].count(1)
            config[-1] = line[0]
        else:
            continue
    print (np.median(config))
    for line in mat:
        line[0] = 1 if line[0] >= np.median(config) else 0
    mat = np.matrix(mat)
    df = pd.DataFrame(mat, columns = ["phenotype"] + ["gene{}".format(i+1) for i in range(mat.shape[1]-1)])
    counts = df.groupby(["gene{}".format(i+1) for i in range(mat.shape[1]-1)] + ["phenotype"]).size()
    
    for col in df.columns:
        df[col] = df[col].astype(np.uint8)
    output = feather.write_dataframe(df, "data/toy_n{}_p{}_causal{}_const{}_{}.feather"
                                     .format(len(cnv_len), p, causal, const1, const2))
    return df, counts

In [116]:
df, counts = toy_multicnv_multicausal(p=0.6, n_cnv=5000, causal = [1,6])
print (df["phenotype"].tolist().count(1))
df

0.0835101117052
2500


Unnamed: 0,phenotype,gene1,gene2,gene3,gene4,gene5,gene6,gene7,gene8,gene9,gene10
0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,0
3,0,0,0,1,1,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,0
6,1,0,0,1,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0
8,1,1,1,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,0,0,0


In [48]:
counts

gene1  gene2  gene3  gene4  gene5  gene6  gene7  gene8  gene9  gene10  phenotype
0      0      0      0      0      0      0      0      0      0       0            635
                                                                       1            556
                                                        1      0       0             22
                                                                       1             20
                                                 1      0      0       0             26
                                                                       1             25
                                                        1      0       0              9
                                                                       1             13
                                          1      0      0      0       0             19
                                                                       1             41
                                       

In [5]:
df.to_csv("data/df.csv")

In [6]:
counts.to_csv("data/counts.csv")