# Implement TORUS by using CNV data of Schizophrenia in Sweden
### Use Sweden Schizophrenia individual data to obtain the gene and CNV overlap configuration; columns represent genes, rows represent individuals
### Use the transformed file to perform TORUS

#### Obtain file format used in TORUS

Use 

In [1]:
import pandas as pd
from pandasql import sqldf
from utils import load_reference_gene, load_data, save_data, get_analysis_blocks
import re
from collections import Counter
import numpy as np

In [2]:
# Load the reference genes and normalise their chromosome labels.
ref_genes = load_reference_gene("../data/refGene.txt.gz")
# Keep the label only for names of the exact form "chr<1-2 digits>", "chrX"
# or "chrY". Any other name (e.g. "chr6_apd_hap1", "chr1_gl000191_random",
# "chrUn_...") becomes NaN, so the chromosome filter that follows drops it.
# Slicing the name by position instead would relabel such sequences as "6" or
# "1" and mix their coordinates with those of the primary chromosome.
chrom_label = (
    ref_genes["chrom"]
    .astype(str)
    .str.extract(r"^chr(\d{1,2}|[XY])$", expand=False)
    .astype(object)
)
# X and Y are both recoded to the integer 23; numbered chromosomes stay strings.
# NOTE(review): Y shares code 23 with X here -- confirm that this is intended.
ref_genes["chrom"] = chrom_label.where(~chrom_label.isin(["X", "Y"]), 23)

In [3]:
# Chromosome labels to keep, as strings "1" .. "23".
chromosome = [str(number) for number in range(1, 24)]
# Mark every row whose label is not in that list with 0, then drop marked rows.
ref_genes["chrom"] = [
    label if str(label) in chromosome else 0
    for label in ref_genes["chrom"]
]
ref_genes = ref_genes.loc[ref_genes["chrom"] != 0]

In [4]:
# Load the Sweden CNV calls and the per-individual table, both whitespace-delimited.
# The separator is a raw string: "\s" inside a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
sw_cnv = pd.read_table("../data/swcnv/swcnv.qc6.cnv", sep=r"\s+")
sw_indiv = pd.read_table("../data/swcnv/swcnv.qc6.cnv.indiv", sep=r"\s+")  # "NSEG": number of CNV in each sample
# One row per CNV call, annotated with the columns of its individual (matched on FID and IID).
sw_cnv_indiv = pd.merge(sw_cnv, sw_indiv, how="inner", on=["FID", "IID"])
# Recode the phenotype to a 0/1 indicator: PHE == 2 becomes 1, any other value 0.
sw_cnv_indiv["PHE"] = (sw_cnv_indiv["PHE"] == 2).astype(int)

In [5]:
# Calcium pathway gene set: a single column named gene_name, read after
# skipping the first two lines of the file.
calpath_genes = pd.read_table(
    "../data/calciumgeneset.txt",
    header=None,
    names=["gene_name"],
    skiprows=2,
)
calpath_genes_list = calpath_genes["gene_name"].tolist()
# Reference-gene rows whose gene_name appears in the gene set.
calpath_genes_pos = ref_genes.merge(calpath_genes, how="inner", on="gene_name")

In [6]:
# Collapse the gene-set rows to one row per gene_name, keeping the smallest
# tx_start and the largest tx_end found for that gene.
# NOTE(review): chrom is selected but is not part of the GROUP BY, so its value
# comes from one of the grouped rows -- confirm each gene sits on one chromosome.
query = '''
SELECT gene_name, chrom, min(tx_start), max(tx_end)
FROM calpath_genes_pos
GROUP BY gene_name
'''
# The query refers to the DataFrame calpath_genes_pos by name; the result replaces it.
calpath_genes_pos = sqldf(query)
# Give the aggregate columns readable names.
calpath_genes_pos = calpath_genes_pos.rename(columns={"min(tx_start)": "gene_start", "max(tx_end)": "gene_end"})

In [7]:
# Gene-set members that have no row in calpath_genes_pos get a position entered by hand.
# The difference is sorted so that diff[0] and diff[1] always refer to the same
# genes: the iteration order of a set of strings can change from one
# interpreter session to the next, which could attach the coordinates below to
# the wrong gene.
# NOTE(review): confirm that the alphabetically first missing gene is the one
# located on chromosome 2 and the second the one on chromosome 15.
diff = sorted(set(calpath_genes_list) - set(calpath_genes_pos["gene_name"]))
# Append each row under the next unused integer label (the current row count).
calpath_genes_pos.loc[calpath_genes_pos.shape[0]] = [diff[0], "2", 65205076, 65206278]
calpath_genes_pos.loc[calpath_genes_pos.shape[0]] = [diff[1], "15", 41231149, 41281887]
# Integer chromosome code: "X" and "Y" map to 23, every other label is cast to int.
calpath_genes_pos["chr"] = [
    23 if chrom in ("X", "Y") else int(chrom)
    for chrom in calpath_genes_pos["chrom"]
]

In [8]:
# Pair every calcium-pathway gene with every CNV on the same chromosome whose
# interval [BP1, BP2] touches the gene interval [gene_start, gene_end].
# The join has no ON clause; the pairs are restricted by the WHERE conditions:
#   1. the CNV start lies inside the gene,
#   2. the CNV end lies inside the gene,
#   3. the CNV spans the whole gene,
#   4. the CNV lies inside the gene (already implied by 1 whenever BP1 <= BP2).
query = '''
SELECT sw.FID, sw.PHE, sw.CHR, sw.BP1, sw.BP2, sw.TYPE, cal.gene_name
FROM calpath_genes_pos cal LEFT JOIN sw_cnv_indiv sw
WHERE sw.CHR == cal.chr
AND (
(sw.BP1 >= cal.gene_start AND sw.BP1 <= cal.gene_end)
OR
(sw.BP2 >= cal.gene_start AND sw.BP2 <= cal.gene_end)
OR
(sw.BP1 <= cal.gene_start AND sw.BP2 >= cal.gene_end)
OR
(sw.BP1 >= cal.gene_start AND sw.BP2 <= cal.gene_end)
)
GROUP BY sw.PHE, sw.CHR, sw.FID, sw.BP1, sw.BP2, sw.TYPE, cal.gene_name
'''
# Keep one row per (individual, CNV, gene) combination.
overlap_CNV_gene_calpath = sqldf(query).drop_duplicates(subset=("FID", "PHE", "CHR", "BP1", "BP2", "TYPE", "gene_name"))
# reset_index is called without drop=True, so the previous index is kept as a column.
overlap_CNV_gene_calpath.reset_index(inplace=True)

In [9]:
# Collapse to one row per CNV (PHE, FID, CHR, BP1, BP2, TYPE) holding the array
# of distinct gene names that the CNV overlaps.
# The grouping keys are given as a list: current pandas interprets a tuple
# passed to groupby as one single key, which fails because no column has that
# tuple as its label.
overlap_CNV_gene_calpath = (
    overlap_CNV_gene_calpath
    .groupby(["PHE", "FID", "CHR", "BP1", "BP2", "TYPE"])["gene_name"]
    .unique()
    .reset_index()
)

In [10]:
# Collapse the reference genes to one row per (chrom, gene_name), keeping the
# smallest tx_start and the largest tx_end of the group.
# NOTE(review): tx_name is selected but not grouped, so it comes from one of
# the rows of each group.
query = '''
SELECT chrom, gene_name, tx_name, min(tx_start), max(tx_end)
FROM ref_genes
GROUP BY chrom, gene_name
'''
# A gene name that occurs on more than one chromosome keeps only its first row.
ref_genes = sqldf(query).drop_duplicates(subset = ("gene_name"))
# Give the aggregate columns readable names.
ref_genes = ref_genes.rename(columns={"min(tx_start)": "gene_start", "max(tx_end)": "gene_end"})

#### The table of overlap of CNVs (Sweden data) and genes

In [11]:
# needs to be modified, does not work
# Pair every reference gene with every CNV on the same chromosome whose
# interval [BP1, BP2] touches the gene interval [gene_start, gene_end].
# The join has no ON clause; the pairs are restricted by the WHERE conditions:
#   1. the CNV start lies inside the gene,
#   2. the CNV end lies inside the gene,
#   3. the CNV spans the whole gene,
#   4. the CNV lies inside the gene (already implied by 1 whenever BP1 <= BP2).
query = '''
SELECT sw.FID, sw.CHR, sw.BP1, sw.BP2, ref.gene_name, ref.tx_name, sw.PHE, sw.TYPE
FROM ref_genes ref LEFT JOIN sw_cnv_indiv sw
WHERE sw.CHR == ref.chrom
AND (
(sw.BP1 >= ref.gene_start AND sw.BP1 <= ref.gene_end)
OR
(sw.BP2 >= ref.gene_start AND sw.BP2 <= ref.gene_end)
OR
(sw.BP1 <= ref.gene_start AND sw.BP2 >= ref.gene_end)
OR
(sw.BP1 >= ref.gene_start AND sw.BP2 <= ref.gene_end)
)
GROUP BY sw.PHE, sw.CHR, sw.FID, sw.BP1, sw.BP2, sw.TYPE, ref.gene_name
'''
# Keep one row per (individual, CNV, gene) combination.
cnv_gene_overlap = sqldf(query).drop_duplicates(subset=("FID", "PHE", "CHR", "BP1", "BP2", "TYPE", "gene_name"))

In [19]:
def get_block_matrix(input_data, ref_genes, deletion=True, make_block=False, dtype=np.uint8):
    '''Build, save and return the individual-by-gene overlap matrix for one CNV type.

    input_data is a dataframe from a query of overlapping of CNVs and genes.
       It may contain deletions (TYPE == 1) and duplications (TYPE == 3); one
       call processes only one of the two. Columns read: TYPE, PHE, FID and
       gene_name, plus CHR, BP1, BP2 and tx_name when make_block is True.
    ref_genes is a dataframe from a query of reference genes; its gene_name
       column lists the candidate gene columns of the matrix.
    deletion selects deletions when True (the default), duplications otherwise.
    make_block, when True, splits the matrix into one dataframe per gene block
       returned by get_analysis_blocks; otherwise a single dataframe is built.
    dtype is the dtype the resulting dataframe(s) are cast to.

    Each row is one individual (FID) carrying at least one CNV of the selected
    type, in order of first appearance in input_data. The first column,
    "phenotype", is that individual's PHE value; every other column is a gene
    and holds 1 when one of the individual's CNVs overlaps it, else 0. Gene
    columns that are 0 for every individual are dropped; "phenotype" is
    always kept.

    The result is written with save_data and returned as {"data": result}.
    '''
    cnv_type, type_label = (1, "del") if deletion else (3, "dup")
    # Build the mask from the argument itself, not from a notebook-level global.
    cnv = input_data[input_data["TYPE"] == cnv_type]

    # One row per individual; the matrix rows follow this order.
    individuals = cnv.drop_duplicates(subset="FID")
    fids = individuals["FID"].tolist()
    # Take the phenotype from the same rows, so labels and matrix rows cannot
    # get out of step whatever the ordering of input_data is.
    phenotype = individuals["PHE"].to_numpy(dtype=float)

    genes = ref_genes["gene_name"]
    # Group once instead of re-scanning the whole table for every individual.
    genes_by_fid = {fid: set(names) for fid, names in cnv.groupby("FID")["gene_name"]}
    rows = [genes.isin(genes_by_fid.get(fid, set())).to_numpy(dtype=float) for fid in fids]
    # The reshape keeps a well-formed (0, n_genes) array when nobody carries such a CNV.
    indicator = np.array(rows, dtype=float).reshape(len(fids), len(genes))

    data = np.column_stack((phenotype, indicator))
    # Drop genes no individual overlaps; position 0 is the phenotype column.
    keep = data.sum(axis=0) > 0
    keep[0] = True
    columns = ["phenotype"] + genes.tolist()
    newdf = pd.DataFrame(
        data[:, keep],
        columns=[name for name, kept in zip(columns, keep) if kept],
    )

    if not make_block:
        res = newdf.astype(dtype)
    else:
        cnv_select = cnv[["CHR", "BP1", "BP2", "tx_name", "gene_name"]]
        cnv_select = cnv_select.rename(columns={"CHR": "chrom", "BP1": "cnv_start", "BP2": "cnv_terminate"})
        case = cnv_select[cnv["PHE"] == 1]
        ctrl = cnv_select[cnv["PHE"] == 0]
        # Each block is used as a list of gene-name columns of newdf.
        blocks = get_analysis_blocks(pd.concat([case, ctrl]))
        res = [newdf[["phenotype"] + list(item)].astype(dtype) for item in blocks]
    # NOTE(review): the file name is fixed apart from the del/dup tag, so it
    # says "calcium_pathway" and "blocks" whatever the input or make_block is.
    save_data(res, "data/calcium_pathway_Sweden_CNVs_{}.data.blocks.pkl".format(type_label))
    return {"data": res}

In [20]:
# Build the matrices from the CNV/gene overlap table, one dataframe per gene block.
res = get_block_matrix(cnv_gene_overlap, ref_genes, make_block = True)

1048 1191


In [15]:
# Display the entry at position 66 of the "data" list.
res["data"][66]

Unnamed: 0,phenotype,LOC101927926,LOC101927967,LRRTM4,SNAR-H
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0
5,0,0,0,0,0
6,0,0,0,0,0
7,0,0,0,0,0
8,0,0,0,0,0
9,0,0,0,0,0


In [17]:
# NOTE(review): save_data is called here with a single argument, whereas
# get_block_matrix passes a second argument (a file path) -- confirm that
# utils.save_data accepts this call.
save_data(res)

In [18]:
# NOTE(review): `test` is not assigned in any of the cells shown above --
# presumably left over from an earlier session; confirm before re-running.
test["data"][1]

Unnamed: 0,phenotype,TRIM62
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
5,1,0
6,1,1
7,1,0
8,1,0
9,1,0
