# Toy example for multi-genes and multi-CNVs in a region
## One or more causal genes
## Obtain summary statistics - z_score

In [1]:
import numpy as np
import pandas as pd
import feather
from pandasql import sqldf
from scipy import stats
import sys
import fisher

In [2]:
def toy_multicnv_multicausal(p, n_cnv, causal, n_geom=10000, n_gene=10, const1=0.5, const2=0.05, seed=999):
    '''A region with multiple genes and CNVs. CNVs probably overlap. At least two causal genes.
    "causal" is a list of positions of causal genes.
    "n_gene" is the number of genes this region harbors.
    "n_cnv" is the number of CNV in this region. "n_geom" is the number of geometric variables generated.
    "p" is the probability of geometric dist, then randomly generate a certain number of geometric variables.
    I use these variables minus 1 as the length of CNVs, which means the number of genes that a CNV overlap with.
    The maximum length for CNV in this region is overlapping with 10 genes'''
    np.random.seed(seed)
    if n_cnv > n_geom:
        print ("The number of CNV is larger than the number of geometric numbers")
        sys.exit()
        return None
    geom_minus_1 = np.random.geometric(p, size=n_geom) - 1
    if n_cnv > len(geom_minus_1[geom_minus_1 <= n_gene]):
        print ("Cannot take a larger sample than population when 'replace=False'")
        sys.exit()
        return None
    cnv_len = np.random.choice(geom_minus_1[geom_minus_1 <= n_gene], n_cnv, replace=False)
    cnv_start_pos = [np.random.choice(range(n_gene+1-i)) if i!=0 else -1 for i in cnv_len]
    ptn_ls = []
    for j,i in enumerate(cnv_start_pos):
    ## j is the index of i in cnv start position list, so cnv_len[j] is the length of the corresponding cnv;
    ## i is the corresponding cnv start position
        if i == -1:
            ptn = [0]*n_gene
        else:
            ptn = [0]*i + [1]*cnv_len[j] + [0]*(n_gene - cnv_len[j] - i)
        ptn_ls.append(ptn)
    mat = [[np.random.normal(0,1)] + line for line in ptn_ls]
    config = []
    for line in mat:
        line[0] = line[0] + line[1:].count(1)*const2
        config.append(line[0])
#         only one causal gene, and gene pos is no more than the number of gene covered in this region
        if len(causal) == 1 and (causal[0] > len(line[1:]) or causal[0] <= 0):
            print ("Single causal CNV index out of range")
            sys.exit()
        elif len(causal) == 1 and causal[0] <= len(line[1:]):
            line[0] = line[0] + line[causal[0]]*const1
            config[-1] = line[0]
        elif max(causal) > len(line[1:]) or min(causal) <= 0:
            print ("Causal CNV index out of range")
            sys.exit()
        elif any(line[x] for x in causal):
            line[0] = line[0] + const1*[line[x] for x in causal].count(1)
            config[-1] = line[0]
        else:
            continue
    for line in mat:
        line[0] = 1 if line[0] >= np.median(config) else 0
    mat = np.matrix(mat)
    df = pd.DataFrame(mat, columns = ["phenotype"] + ["gene{}".format(i+1) for i in range(mat.shape[1]-1)])
    counts = df.groupby(["gene{}".format(i+1) for i in range(mat.shape[1]-1)] + ["phenotype"]).size()
    
    for col in df.columns:
        df[col] = df[col].astype(np.uint8)
    output = feather.write_dataframe(df, "data/toy_n{}_p{}_causal{}_const{}_{}.feather"
                                     .format(len(cnv_len), p, causal, const1, const2))
    return df, counts, causal

In [3]:
df, counts, causal = toy_multicnv_multicausal(p=0.5, n_cnv=5000, causal=[1,15], n_gene=15, const1=0.45, const2=0.03)
print (df["phenotype"].tolist().count(1))

2500


In [4]:
from utils import run_dap_lite
fileout = "data/toy_multi_causal_10.dap"
run_dap_lite(df, fileout, grid=[(0, -9.2)])

2017-10-09 14:49:47.261231
2017-10-09 14:50:02.751030


In [45]:
def get_summary_stats(gene_df, causal, subst = -10, chrom = 'chr6', multiplier = 1000, multi_cat = 0):
    '''get summary statistics (e.g. t-statistic, p-value) to be used for TORUS, then transfer p-value to z-score
       Actually do NOT need SNP (gene) map and gene (chromosome) map
    '''
    cases = gene_df[gene_df["phenotype"]==1]
    ctrls = gene_df[gene_df["phenotype"]==0]
    df_bf = pd.DataFrame()
    df_beta = pd.DataFrame()
    df_zscore = pd.DataFrame()
    SNP_anno = pd.DataFrame()
    i = 1
    cat = [0]*3 + [2]*3 + [3]*3 + [4]*2 + [5]*2
    for gene in list(gene_df)[1:]:
        n_gene_case = len(cases[cases[gene]==1])
        n_nogene_case = len(cases[cases[gene]==0])
        n_gene_ctrl = len(ctrls[ctrls[gene]==1])
        n_nogene_ctrl = len(ctrls[ctrls[gene]==0])
        odds_ratio = stats.fisher_exact([[n_gene_case, n_gene_ctrl], [n_nogene_case, n_nogene_ctrl]])[0]
#         print (n_gene_case, n_gene_ctrl, odds_ratio)
        beta1 = np.log(odds_ratio)
        fisher_stats = fisher.pvalue(n_gene_case, n_gene_ctrl, n_nogene_case, n_nogene_ctrl)
        p_value_two = fisher_stats.two_tail
        p_value_left = fisher_stats.left_tail
        p_value_right = fisher_stats.right_tail
        if p_value_right < 0.5 and p_value_two < 1:
            z_score = stats.norm.ppf(1-p_value_right)
        elif p_value_left < 0.5 and p_value_two < 1:
            z_score = -stats.norm.ppf(1-p_value_left)
        else:
            z_score = subst
        df_bf = df_bf.append([("{}.{}".format(chrom, gene), chrom, odds_ratio)])
        df_beta = df_beta.append([("{}.{}".format(chrom, gene), chrom, beta1, z_score, p_value_two/2)])
        df_zscore = df_zscore.append([("{}.{}".format(chrom, gene), chrom, z_score)])
        ### gene annotation needs to be determined
        if multi_cat == 0:
            SNP_anno = SNP_anno.append([("{}.{}".format(chrom, gene), 1 if int(gene[4:]) in causal else 0)])
        else:
            if len(cat) != 0:
                rand_cat = np.random.choice(cat)
            SNP_anno = SNP_anno.append([("{}.{}".format(chrom, gene), 1 if int(gene[4:]) in causal else rand_cat)])
            if not int(gene[4:]) in causal:
                cat.remove(rand_cat)
        i += 1
    df_bf.columns = ["SNP", "gene", "bf"]
    df_beta.columns = ["SNP", "gene", "beta", "t-stat", "p-value"]
    df_zscore.columns = ["SNP", "gene", "z-score"]
    SNP_anno.columns = ["SNP", "binding_d"]
    df_bf.to_csv("data/TORUS/bf", sep = "\t", header = False, index = False)
    df_beta.to_csv("data/TORUS/beta", sep = "\t", header = True, index = False)
    df_zscore.to_csv("data/TORUS/z_score", sep = "\t", header = False, index = False)
    SNP_anno.to_csv("data/TORUS/snp_anno", sep = "\t", header = True, index = False)
    return df_bf, df_beta, df_zscore, SNP_anno

In [46]:
df_bf, df_beta, df_zscore, snp_anno = get_summary_stats(df, causal, multi_cat = 1)

### command

Do NOT need the optional annotation files - snp_map and gene_map, only need snp annotation.

gzip *

torus.sh -d beta.gz -annot snp_anno.gz -est > toy_multi_beta.est

/# torus.sh -d beta.gz -smap snp_map.gz -gmap gene_map.gz -annot snp_anno.gz -est > toy_multi_beta.est

/# torus.sh -d bf.gz --load_bf -smap snp_map.gz -gmap gene_map.gz -annot snp_anno.gz -qtl > toy_multi_bf.est

/# torus.sh -d z_score.gz --load_zval -smap snp_map.gz -gmap gene_map.gz -annot snp_anno.gz -qtl > toy_multi_zscore.est

In [22]:
df_bf, df_beta, df_zscore, snp_anno = get_summary_stats(df, causal)

134 63 2.19081163037
162 134 1.22343372956
195 143 1.39439952672
178 168 1.06408678889
187 175 1.07411524921
205 169 1.23204548091
204 180 1.14518002323
201 174 1.16873903196
183 180 1.01798302403
187 179 1.04830602835
182 170 1.0761305385
180 148 1.2329916123
173 119 1.48751774023
163 108 1.5447788396
132 63 2.15629021879


In [14]:
print (df_bf)
print (df_beta)
print (df_zscore)
print (snp_anno)

           SNP  gene        bf
0   chr6.gene1  chr6  2.190812
0   chr6.gene2  chr6  1.223434
0   chr6.gene3  chr6  1.394400
0   chr6.gene4  chr6  1.064087
0   chr6.gene5  chr6  1.074115
0   chr6.gene6  chr6  1.232045
0   chr6.gene7  chr6  1.145180
0   chr6.gene8  chr6  1.168739
0   chr6.gene9  chr6  1.017983
0  chr6.gene10  chr6  1.048306
0  chr6.gene11  chr6  1.076131
0  chr6.gene12  chr6  1.232992
0  chr6.gene13  chr6  1.487518
0  chr6.gene14  chr6  1.544779
0  chr6.gene15  chr6  2.156290
           SNP  gene      beta    t-stat   p-value
0   chr6.gene1  chr6  0.784272  5.140413  0.000001
0   chr6.gene2  chr6  0.201661  1.618618  0.052765
0   chr6.gene3  chr6  0.332464  2.877339  0.002005
0   chr6.gene4  chr6  0.062117  0.501427  0.308035
0   chr6.gene5  chr6  0.071497  0.600209  0.274183
0   chr6.gene6  chr6  0.208676  1.882458  0.029887
0   chr6.gene7  chr6  0.135562  1.221669  0.110916
0   chr6.gene8  chr6  0.155925  1.396256  0.081319
0   chr6.gene9  chr6  0.017823  0.108983  0.4

In [9]:
for idx, item in enumerate(df.columns.values):
    print (idx, item)

0 phenotype
1 gene1
2 gene2
3 gene3
4 gene4
5 gene5
6 gene6
7 gene7
8 gene8
9 gene9
10 gene10
11 gene11
12 gene12
13 gene13
14 gene14
15 gene15


In [41]:
stats.norm.ppf((1+(1-0.9999999999985895))/2)

1.7677085129515064e-12

In [10]:
startp=[0,6,8,9,1,4,3]
for j, i in enumerate(startp):
    print (j,i, startp[j])

0 0 0
1 6 6
2 8 8
3 9 9
4 1 1
5 4 4
6 3 3


In [21]:
cat = [0]*3 + [2]*3 + [3]*3 + [4]*2 + [5]*2
if (5) in causal:
    print ("balala")

In [121]:
# previous version, bug in it, fixed in function "toy_multicnv_multicausal"
def toy_multi_gene_cnv(p, n_cnv, n_max=10, causal=5, const1=1, const2=0.1, seed=1):
    '''A region with multiple genes and CNVs. CNVs may overlap or not overlap. Only one causal gene.'''
    np.random.seed(seed)
    cnv_len = np.random.choice((np.random.geometric(p, size=n_cnv) - 1), n_cnv, replace=False)
    cnv_len = cnv_len[cnv_len <= 10].tolist()
    start_pos = [np.random.choice(range(n_max+1-i)) if i!=0 else -1 for i in cnv_len]
    ptn_ls = []
    for j,i in enumerate(start_pos):
        if i == -1:
            ptn = [0]*n_max
        elif i == 0:
            ptn = [0]*(i-1) + [1]*cnv_len[j] + [0]*(n_max-cnv_len[j]-i)
        else:
            ptn = [0]*(i-1) + [1]*cnv_len[j] + [0]*(n_max-cnv_len[j]-(i-1))
        ptn_ls.append(ptn)
    mat = [[np.random.normal(0,1)] + line for line in ptn_ls]
    config = []
    for line in mat:
##         config.append(line[0]+const1) if line[causal-1] == 1 else config.append(line[0])
        if line[causal] == 1:
            line[0] = line[0] + const1
        config.append(line[0])
        
#       the longer the CNV is, the less common it is, and larger OR
        line[0] = line[0] + line[1:].count(1)*const2
        
##         line[0] = 1 if line[0] > np.median(config) else 0
        if line[0] >= np.median(config):
            line[0] = 1
        else:
            line[0] = 0
        
    mat = np.matrix(mat)
    df = pd.DataFrame(mat, columns = ["phenotype"] + ["gene{}".format(i+1) for i in range(mat.shape[1]-1)])
    counts = df.groupby(["gene{}".format(i+1) for i in range(mat.shape[1]-1)] + ["phenotype"]).size()
    for col in df.columns:
        df[col] = df[col].astype(np.uint8)
    output = feather.write_dataframe(df, "data/toy_n{}_p{}_causal{}_const{}_{}.feather"
                                     .format(len(cnv_len), p, causal, const1, const2))
    return df, counts