# Implement TORUS by using CNV data of Schizophrenia in Sweden
### Use the Sweden schizophrenia individual-level data to obtain the gene-CNV overlap configuration; columns represent genes, rows represent individuals
### Use the transformed file to run TORUS

#### Obtain file format used in TORUS

Use 

In [1]:
import pandas as pd
from pandasql import sqldf
from utils import load_reference_gene, load_data, save_data, get_analysis_blocks
import re
from collections import Counter
import numpy as np
import fisher
from scipy import stats

In [2]:
# Load the reference gene table, then cut each chromosome label down to the
# characters that follow its three-character prefix.
ref_genes = load_reference_gene("../data/refGene.txt.gz")
# Labels whose first five characters consist only of "c", "h", "r", "," or
# digits keep up to two characters after the prefix; all others keep one.
ref_genes["chrom"] = ref_genes["chrom"].map(
    lambda label: label[3:5] if re.match("^[c,h,r,0-9]*$", label[:5]) else label[3:4]
)
# Both sex chromosomes share the single numeric code 23.
ref_genes["chrom"] = ref_genes["chrom"].map(
    lambda label: 23 if label in ("X", "Y") else label
)

In [3]:
# Keep only the genes whose chromosome code is one of "1" .. "23"
# (compared as text, so the integer 23 also qualifies).
chromosome = [str(number) for number in range(1, 24)]
ref_genes = ref_genes[ref_genes["chrom"].map(lambda label: str(label) in chromosome)]

In [4]:
# Read the Sweden CNV calls and the per-individual summary table; both are
# whitespace-delimited. The separator is a raw string so that "\s" reaches the
# regex engine instead of being parsed as an (invalid) string escape.
sw_cnv = pd.read_table("../data/swcnv/swcnv.qc6.cnv", sep=r"\s+")
sw_indiv = pd.read_table("../data/swcnv/swcnv.qc6.cnv.indiv", sep=r"\s+") # "NSEG": number of CNV in each sample
# Attach the per-individual columns to every CNV call of that individual.
sw_cnv_indiv = pd.merge(sw_cnv, sw_indiv, how="inner", on=["FID", "IID"])
# Recode the phenotype: 2 becomes 1, every other value becomes 0.
sw_cnv_indiv["PHE"] = (sw_cnv_indiv["PHE"] == 2).astype("int64")

In [5]:
# Calcium pathway genes: read the gene set (the first two lines of the file are
# skipped) and pull the matching rows out of the reference gene table.
calpath_genes = pd.read_table(
    "../data/calciumgeneset.txt",
    skiprows=2,
    header=None,
    names=["gene_name"],
)
calpath_genes_list = calpath_genes.loc[:, "gene_name"].tolist()
calpath_genes_pos = ref_genes.merge(calpath_genes, how="inner", on=["gene_name"])

In [6]:
# Collapse the calcium-pathway rows to one row per gene: the smallest tx_start
# and the largest tx_end of a gene become its span.
# NOTE(review): `chrom` is selected without being aggregated or grouped, so the
# value reported for a gene is whichever row the SQL engine picks -- confirm
# that every gene_name sits on a single chromosome.
query = '''
SELECT gene_name, chrom, min(tx_start), max(tx_end)
FROM calpath_genes_pos
GROUP BY gene_name
'''
# sqldf is called without an explicit environment, so the table name in the
# query has to match the dataframe variable name above.
calpath_genes_pos = sqldf(query)
# Give the aggregate columns readable names.
calpath_genes_pos = calpath_genes_pos.rename(columns={"min(tx_start)": "gene_start", "max(tx_end)": "gene_end"})

In [7]:
# Pathway genes that are in the gene-set file but got no position from the
# reference table.
# NOTE(review): `diff` is built from a set difference, so its element order is
# arbitrary and may change between interpreter runs. The two hard-coded
# coordinate rows below are attached to diff[0] and diff[1] purely by that
# order -- confirm which gene belongs to which coordinates and that exactly two
# genes are missing (fewer raises IndexError, extras are silently ignored).
diff = list(set(calpath_genes_list) - set(calpath_genes_pos["gene_name"].tolist()))
# Append hand-entered rows: [gene_name, chrom, gene_start, gene_end].
calpath_genes_pos.loc[calpath_genes_pos.shape[0]] = [diff[0], "2", 65205076, 65206278]
calpath_genes_pos.loc[calpath_genes_pos.shape[0]] = [diff[1], "15", 41231149, 41281887]
# Integer chromosome code used for the join with the CNV table; "X" and "Y"
# both map to 23.
calpath_genes_pos["chr"] = calpath_genes_pos.apply(lambda row: int(row["chrom"]) 
                                                if not row["chrom"] in ("X", "Y") else 23, axis=1)

In [8]:
# Pair every calcium-pathway gene with every CNV call on the same chromosome
# whose interval touches the gene. The four OR-ed conditions are: CNV start
# inside the gene, CNV end inside the gene, CNV spanning the whole gene, and
# CNV lying inside the gene.
# NOTE(review): the JOIN has no ON clause and the WHERE clause tests sw.*
# columns, so genes without any overlapping CNV do not appear in the result
# even though the join is written as LEFT JOIN -- confirm this is intended.
# The GROUP BY lists every selected column except that it groups on all of
# them, i.e. it only removes repeated rows.
query = '''
SELECT sw.FID, sw.PHE, sw.CHR, sw.BP1, sw.BP2, sw.TYPE, cal.gene_name
FROM calpath_genes_pos cal LEFT JOIN sw_cnv_indiv sw
WHERE sw.CHR == cal.chr
AND (
(sw.BP1 >= cal.gene_start AND sw.BP1 <= cal.gene_end)
OR
(sw.BP2 >= cal.gene_start AND sw.BP2 <= cal.gene_end)
OR
(sw.BP1 <= cal.gene_start AND sw.BP2 >= cal.gene_end)
OR
(sw.BP1 >= cal.gene_start AND sw.BP2 <= cal.gene_end)
)
GROUP BY sw.PHE, sw.CHR, sw.FID, sw.BP1, sw.BP2, sw.TYPE, cal.gene_name
'''
# Run the query and drop any remaining repeated (individual, CNV, gene) rows.
cnv_gene_overlap_calpath = sqldf(query).drop_duplicates(subset=("FID", "PHE", "CHR", "BP1", "BP2", "TYPE", "gene_name"))
# Constant placeholder: the query does not return a transcript name, but
# get_block_matrix selects a "tx_name" column from its input.
cnv_gene_overlap_calpath["tx_name"] = "test_ABCDE"
print (cnv_gene_overlap_calpath)

         FID  PHE  CHR        BP1        BP2  TYPE gene_name     tx_name
0    PT-8TWI    0    1    1804302    2047584     3    CALML6  test_ABCDE
1    PT-BQT1    0    1  239678115  239800921     3     CHRM3  test_ABCDE
2    PT-BRNL    0    2  218749319  219480897     3     PLCD4  test_ABCDE
3    PT-L1G8    0    2   40674308   41559590     3    SLC8A1  test_ABCDE
4    PT-L31A    0    2   74969763   75323267     3     TACR1  test_ABCDE
5    PT-ESLA    0    3  123486169  123634548     1      MYLK  test_ABCDE
6    PT-FFX5    0    3    4317445    4747247     3     ITPR1  test_ABCDE
7    PT-8U8J    0    4    9688014    9794193     3      DRD5  test_ABCDE
8    PT-OPJ8    0    4  104179046  104766986     1     TACR3  test_ABCDE
9    PT-27QK    0    6  118701889  118900024     3       PLN  test_ABCDE
10   PT-29AJ    0    6  118853079  119033109     3       PLN  test_ABCDE
11   PT-2M7G    0    6  118842321  119039482     3       PLN  test_ABCDE
12   PT-8W9S    0    6  118835126  119032859     3 

In [10]:
# cnv_gene_overlap_calpath.reset_index(inplace=True)
# cnv_gene_overlap_calpath = cnv_gene_overlap_calpath.groupby(("PHE", "FID", "CHR", "BP1", "BP2", "TYPE"))
# cnv_gene_overlap_calpath = cnv_gene_overlap_calpath["gene_name"].unique()
# cnv_gene_overlap_calpath = cnv_gene_overlap_calpath.reset_index()

In [11]:
# Collapse the reference table to one span per (chrom, gene_name): smallest
# tx_start and largest tx_end.
# NOTE(review): `tx_name` is selected without being aggregated or grouped, so
# the transcript name kept for a gene is whichever row the SQL engine picks.
query = '''
SELECT chrom, gene_name, tx_name, min(tx_start), max(tx_end)
FROM ref_genes
GROUP BY chrom, gene_name
'''
# Keep a single row per gene_name (the first one returned) when a gene name
# occurs on more than one chromosome.
# NOTE(review): this overwrites ref_genes with a table that no longer has
# tx_start/tx_end, so this cell cannot be re-run without reloading ref_genes.
ref_genes = sqldf(query).drop_duplicates(subset = ("gene_name"))
# Give the aggregate columns readable names.
ref_genes = ref_genes.rename(columns={"min(tx_start)": "gene_start", "max(tx_end)": "gene_end"})

#### The table of overlap of CNVs (Sweden data) and genes

In [12]:
# Pair every reference gene with every CNV call on the same chromosome whose
# interval touches the gene. The four OR-ed conditions are: CNV start inside
# the gene, CNV end inside the gene, CNV spanning the whole gene, and CNV lying
# inside the gene.
# NOTE(review): the JOIN has no ON clause and the WHERE clause tests sw.*
# columns, so genes without any overlapping CNV do not appear in the result
# even though the join is written as LEFT JOIN -- confirm this is intended.
# NOTE(review): ref.tx_name is selected but is not part of the GROUP BY.
query = '''
SELECT sw.FID, sw.CHR, sw.BP1, sw.BP2, ref.gene_name, ref.tx_name, sw.PHE, sw.TYPE
FROM ref_genes ref LEFT JOIN sw_cnv_indiv sw
WHERE sw.CHR == ref.chrom
AND (
(sw.BP1 >= ref.gene_start AND sw.BP1 <= ref.gene_end)
OR
(sw.BP2 >= ref.gene_start AND sw.BP2 <= ref.gene_end)
OR
(sw.BP1 <= ref.gene_start AND sw.BP2 >= ref.gene_end)
OR
(sw.BP1 >= ref.gene_start AND sw.BP2 <= ref.gene_end)
)
GROUP BY sw.PHE, sw.CHR, sw.FID, sw.BP1, sw.BP2, sw.TYPE, ref.gene_name
'''
# Run the query and drop any remaining repeated (individual, CNV, gene) rows.
cnv_gene_overlap = sqldf(query).drop_duplicates(subset=("FID", "PHE", "CHR", "BP1", "BP2", "TYPE", "gene_name"))

In [13]:
def get_block_matrix(input_data, ref_genes, deletion = True, make_block = False, dtype = np.uint8):
    '''Build individual-by-gene CNV indicator matrices and store them with save_data.

       input_data is a dataframe from a query of overlapping of CNVs and genes. It must
       provide the columns FID, PHE, CHR, BP1, BP2, TYPE, tx_name and gene_name.
       The input_data may contain deletion and duplication: only rows with TYPE == 1
       are used when deletion is True, otherwise only rows with TYPE == 3.
       ref_genes is a dataframe from a query of reference genes; its "gene_name"
       column defines the candidate gene columns.

       Every individual (FID) becomes one row. The first column, "phenotype", is read
       from that individual's PHE value (expected to be 0 for controls and 1 for cases)
       and rows are ordered controls first, cases after, whatever the row order of
       input_data. Every other column is 1 if the individual has a selected CNV
       overlapping that gene and 0 otherwise; genes hit in no individual are dropped,
       the "phenotype" column is always kept.

       If make_block is False the result is one dataframe cast to dtype.
       If make_block is True, get_analysis_blocks is called on the selected CNV rows
       (cases followed by controls) and is expected to return lists of gene names; the
       result is then a list of dataframes, one per block, each holding "phenotype"
       plus that block's genes.

       The result is passed to save_data with a "del" or "dup" file name and returned
       as {"data": result}. The sizes of the intermediate tables are printed.
    '''
    dat = input_data[input_data["TYPE"] == (1 if deletion else 3)]
    # One row per individual. The stable sort puts controls (PHE 0) ahead of cases
    # (PHE 1) without disturbing the order inside each group, so an input that is
    # already sorted by phenotype yields exactly the same row order as before, and
    # an unsorted input no longer gets its labels attached to the wrong rows.
    individuals = dat.drop_duplicates(subset="FID").sort_values("PHE", kind="mergesort")
    n_case = int((individuals["PHE"] == 1).sum())
    n_ctrl = int((individuals["PHE"] == 0).sum())
    print (n_case + n_ctrl)
    cnv_table = dat[["CHR", "BP1", "BP2", "tx_name", "gene_name"]].rename(
        columns={"CHR": "chrom", "BP1": "cnv_start", "BP2": "cnv_terminate"})
    case = cnv_table[dat["PHE"]==1]
    ctrl = cnv_table[dat["PHE"]==0]
    genes = ref_genes["gene_name"]
    gene_names = genes.tolist()
    print (len(gene_names))
    # Preallocating keeps the matrix two-dimensional even if no individual is selected.
    membership = np.zeros((len(individuals), len(gene_names)), dtype=float)
    for row_idx, fid in enumerate(individuals["FID"]):
        membership[row_idx] = genes.isin(dat.loc[dat["FID"] == fid, "gene_name"]).values
    # Labels come from the data itself rather than from group counts.
    phenotype = individuals["PHE"].values.astype(float).reshape(-1, 1)
    regression_data = np.hstack((phenotype, membership))
    print (np.shape(regression_data))
    df = pd.DataFrame(regression_data, columns = ['phenotype'] + gene_names)
    # Drop all-zero gene columns in one step; column 0 is the phenotype and must
    # survive even when every selected individual is a control.
    keep = regression_data.sum(axis=0) > 0
    keep[0] = True
    newdf = df.loc[:, keep]
    if not make_block:
        res = newdf.astype(dtype, copy = True)
    else:
        blocks = get_analysis_blocks(pd.concat([case, ctrl]))
        res = [newdf[['phenotype'] + item].astype(dtype, copy = True) for item in blocks]
    save_data(res, "data/calcium_pathway_Sweden_CNVs_{}.data.blocks.pkl".format("del" if deletion else "dup"))
    return {"data": res}

In [14]:
# Build the per-block matrices for the calcium-pathway overlaps
# (deletion rows only, which is also the function's default).
res = get_block_matrix(
    input_data=cnv_gene_overlap_calpath,
    ref_genes=ref_genes,
    deletion=True,
    make_block=True,
)

47
26910
(47, 26911)


### Run TORUS

In [15]:
# from get_summary_for_torus import get_sum_stats_cal_simu as get_sum_Sweden_for_torus
# Re-read the calcium-pathway gene set (the first two lines of the file are
# skipped); its members are the genes treated as causal below.
cal_pthwy_genes = pd.read_table(
    "../data/calciumgeneset.txt",
    skiprows=2,
    header=None,
    names=["gene_name"],
)
causal_genes = cal_pthwy_genes.loc[:, "gene_name"].tolist()

In [16]:
# Number of columns of every block, then the indices of the widest block(s).
col = [np.shape(item)[1] for item in res["data"]]
# Compute the maximum once instead of once per element; an empty block list
# simply yields an empty result.
max_width = max(col) if col else None
[i for i, width in enumerate(col) if width == max_width]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]

In [17]:
# For every block, list which of its gene columns (everything after the first
# column) belong to the causal gene set.
for index, item in enumerate(res["data"]):
    gene_columns = item.columns.values[1:].tolist()
    causal_hits = [gene for gene in gene_columns if gene in causal_genes]
    print (index, causal_hits)

0 ['ITPKB']
1 ['RYR2']
2 ['SLC8A1']
3 ['PDE1A']
4 ['MYLK']
5 ['PPP3CA']
6 ['TACR3']
7 ['PDE1C']
8 ['CACNA1B']
9 ['P2RX2']
10 ['ITPR2']
11 ['BDKRB2']
12 ['CHRNA7']
13 ['ATP2A1']
14 ['PHKB']
15 ['PRKACA']
16 ['PLCB1']
17 ['P2RX6']
18 ['PRKX']


In [None]:
# Pick one block and flag each of its gene columns as causal (1) or not (0).
# NOTE(review): the block index 25 is hard-coded -- confirm it is the block
# intended by the name "max_block" for the current data.
max_block = res["data"][25]
block_genes = max_block.columns.values[1:].tolist()
pheno = [1 if gene in causal_genes else 0 for gene in block_genes]

In [None]:
def get_sum_Sweden_for_torus(gene_df, causal, cat, multi_cat = 0, subst = -10, chrom = "chr6", seed = 999):
    '''get summary statistics (e.g. t-statistic, p-value) to be used for TORUS, then transfer p-value to z-score
       Actually do NOT need SNP (gene) map and gene (chromosome) map
       If multi_cat!=0, then cat must be multiple categories
       If multi_cat==0, then cat can be any list (because it won't be used) but must be given

       gene_df: dataframe with a "phenotype" column first (1 = case, 0 = control)
           followed by one 0/1 column per gene.
       causal: collection of gene names annotated with 1.
       cat: pool of categories. When multi_cat != 0 every non-causal gene receives one
           category drawn at random from the pool without replacement. The pool is
           copied, so the caller's list is not modified.
       subst: z-score written when neither one-sided Fisher p-value is below 0.5 or
           the two-sided p-value equals 1.
       chrom: label used as the "gene" column and as prefix of every SNP id.
       seed: passed to np.random.seed before anything is drawn.

       For each gene the 2x2 table (carrier / non-carrier by case / control) gives
       an odds ratio, beta = log(odds ratio) and Fisher exact p-values. An odds
       ratio of 0 is written as beta -10, an infinite one as beta 10. The odds
       ratio is infinite when the numerator is positive and the denominator is 0
       (for instance a gene carried only by cases) and 0 when the numerator is 0.

       Writes four tab-separated files under data/TORUS/calcium/: bf_cal (odds
       ratio), beta_cal (beta, z-score in the "t-stat" column, half of the two-sided
       p-value in the "p-value" column), z_score_cal and snp_anno_cal. Returns None.

       Raises ValueError if multi_cat != 0 and the category pool runs out before
       every non-causal gene has received a category.
    '''
    np.random.seed(seed)
    cases = gene_df[gene_df["phenotype"]==1]
    ctrls = gene_df[gene_df["phenotype"]==0]
    # Private copy: categories are removed as they are handed out.
    remaining_cat = list(cat)
    # Rows are collected in plain lists and turned into dataframes once at the
    # end; DataFrame.append is gone from current pandas and was quadratic.
    bf_rows = []
    beta_rows = []
    zscore_rows = []
    anno_rows = []
    for gene in list(gene_df)[1:]:
        print (gene)
        snp_id = "{}.{}".format(chrom, gene)
        n_gene_case = int((cases[gene] == 1).sum())
        n_nogene_case = int((cases[gene] == 0).sum())
        n_gene_ctrl = int((ctrls[gene] == 1).sum())
        n_nogene_ctrl = int((ctrls[gene] == 0).sum())
        if n_gene_case == 0:
            odds_ratio = 0
        elif n_gene_ctrl == 0 or n_nogene_case == 0:
            # Zero denominator: infinite when the numerator is positive,
            # otherwise the ratio is undefined and 0 is used as placeholder.
            odds_ratio = np.inf if n_nogene_ctrl > 0 else 0
        else:
            odds_ratio = n_gene_case * n_nogene_ctrl / n_gene_ctrl / n_nogene_case
        print (n_gene_case, n_nogene_case, n_gene_ctrl, n_nogene_ctrl, odds_ratio)
        if np.isinf(odds_ratio):
            beta1 = 10
        elif odds_ratio == 0:
            beta1 = -10
        else:
            beta1 = np.log(odds_ratio)
        fisher_stats = fisher.pvalue(n_gene_case, n_gene_ctrl, n_nogene_case, n_nogene_ctrl)
        p_value_two = fisher_stats.two_tail
        p_value_left = fisher_stats.left_tail
        p_value_right = fisher_stats.right_tail
        # The sign of the z-score follows whichever tail is the smaller one.
        if p_value_right < 0.5 and p_value_two < 1:
            z_score = stats.norm.ppf(1-p_value_right)
        elif p_value_left < 0.5 and p_value_two < 1:
            z_score = -stats.norm.ppf(1-p_value_left)
        else:
            z_score = subst
        bf_rows.append((snp_id, chrom, odds_ratio))
        beta_rows.append((snp_id, chrom, beta1, z_score, p_value_two/2))
        zscore_rows.append((snp_id, chrom, z_score))
        ### gene annotation needs to be determined
        if multi_cat == 0:
            annotation = 1 if gene in causal else 0
        else:
            # A draw is made for every gene while the pool is non-empty, causal
            # or not, so the random stream matches one draw per gene.
            rand_cat = np.random.choice(remaining_cat) if remaining_cat else None
            if gene in causal:
                annotation = 1
            else:
                if rand_cat is None:
                    raise ValueError(
                        "cat has fewer categories than there are non-causal genes")
                annotation = rand_cat
                remaining_cat.remove(rand_cat)
        anno_rows.append((snp_id, annotation))
    df_cal_bf = pd.DataFrame(bf_rows, columns=["SNP", "gene", "bf"])
    df_cal_beta = pd.DataFrame(beta_rows, columns=["SNP", "gene", "beta", "t-stat", "p-value"])
    df_cal_zscore = pd.DataFrame(zscore_rows, columns=["SNP", "gene", "z-score"])
    SNP_anno_cal = pd.DataFrame(anno_rows, columns=["SNP", "binding_d"])
    df_cal_bf.to_csv("data/TORUS/calcium/bf_cal", sep = "\t", header = False, index = False)
    df_cal_beta.to_csv("data/TORUS/calcium/beta_cal", sep = "\t", header = True, index = False)
    df_cal_zscore.to_csv("data/TORUS/calcium/z_score_cal", sep = "\t", header = False, index = False)
    SNP_anno_cal.to_csv("data/TORUS/calcium/snp_anno_cal", sep = "\t", header = True, index = False)

In [None]:
# Write the TORUS input tables for the selected block; non-causal genes draw
# their annotation from the `pheno` pool because multi_cat is non-zero.
get_sum_Sweden_for_torus(
    gene_df=max_block,
    causal=causal_genes,
    cat=pheno,
    multi_cat=1,
)

In [None]:
# Remove archives left by an earlier run; -f keeps rm from failing when none
# exist yet (e.g. on the first run).
!rm -f data/TORUS/calcium/*.gz
# Compress the freshly written *_cal tables.
!gzip data/TORUS/calcium/*_cal
# Run torus.sh on the compressed beta and annotation tables and save what it
# prints to cal_block_beta.est.
!torus.sh -d data/TORUS/calcium/beta_cal.gz -annot data/TORUS/calcium/snp_anno_cal.gz -est > data/TORUS/calcium/cal_block_beta.est

In [None]:
# Show the gene columns of the block that was passed to TORUS.
print (block_genes)