In [1]:
import numpy as np
import pandas as pd
import feather
from pandasql import sqldf
from fisher import pvalue
from scipy import stats
from pprint import pprint

In [2]:
def toy_multi_gene_cnv(p, n_cnv, n_max=10, causal=5, const1=1, const2=0.1, seed=1):
    np.random.seed(seed)
    cnv_len = np.random.choice((np.random.geometric(p, size=n_cnv) - 1), n_cnv, replace=False)
    cnv_len = cnv_len[cnv_len <= 10].tolist()
    start_pos = [np.random.choice(range(n_max+1-i)) if i!=0 else -1 for i in cnv_len]
    ptn_ls = []
    for j,i in enumerate(start_pos):
        if i == -1:
            ptn = [0]*n_max
        elif i == 0:
            ptn = [0]*(i-1) + [1]*cnv_len[j] + [0]*(n_max-cnv_len[j]-i)
        else:
            ptn = [0]*(i-1) + [1]*cnv_len[j] + [0]*(n_max-cnv_len[j]-(i-1))
        ptn_ls.append(ptn)
    mat = [[np.random.normal(0,1)] + line for line in ptn_ls]
    config = []
    for line in mat:
##         config.append(line[0]+const1) if line[causal-1] == 1 else config.append(line[0])
        if line[causal] == 1:
            line[0] = line[0] + const1
        config.append(line[0])
        
#       the longer the CNV is, the less common it is, and larger OR
        line[0] = line[0] + line[1:].count(1)*const2
        
##         line[0] = 1 if line[0] > np.median(config) else 0
        if line[0] > np.median(config):
            line[0] = 1
        else:
            line[0] = 0
        
    mat = np.matrix(mat)
    df = pd.DataFrame(mat, columns = ["phenotype"] + ["gene{}".format(i+1) for i in range(mat.shape[1]-1)])
    counts = df.groupby(["gene{}".format(i+1) for i in range(mat.shape[1]-1)] + ["phenotype"]).size()
    for col in df.columns:
        df[col] = df[col].astype(np.uint8)
    output = feather.write_dataframe(df, "data/toy_n{}_p{}_causal{}_const{}_{}.feather"
                                     .format(len(cnv_len), p, causal, const1, const2))
    return df, counts

In [3]:
df, counts = toy_multi_gene_cnv(p=0.5, n_cnv=2000, const1=0.1, const2=0.05)
# print (df["phenotype"].tolist())

In [4]:
print (df)

      phenotype  gene1  gene2  gene3  gene4  gene5  gene6  gene7  gene8  \
0             1      0      1      0      0      0      0      0      0   
1             1      0      0      0      1      1      1      0      0   
2             1      1      1      0      0      0      0      0      0   
3             0      0      0      0      0      1      1      0      0   
4             0      0      0      0      0      1      0      0      0   
5             1      0      0      0      0      1      0      0      0   
6             1      0      0      0      0      0      0      0      0   
7             0      0      0      0      0      0      0      0      0   
8             0      0      0      0      0      0      0      0      0   
9             0      0      0      0      0      0      0      0      0   
10            0      0      0      0      0      1      1      1      1   
11            0      0      0      0      0      0      0      0      0   
12            0      0   

In [9]:
df.to_csv("data/test.csv")