# Implement TORUS by using CNV data of Schizophrenia in Sweden
### Use Sweden Schizphrenia individual data to obtain gene and CNV overlap configuration; columns represent genes, rows represent individuals
### Use the transformed file to perform TOURS

#### Obtain file format used in TORUS

Use 

In [1]:
import pandas as pd
from pandasql import sqldf
from utils import load_reference_gene, load_data, save_data
import re
from collections import Counter

In [2]:
# load reference genes and Sweden CNV data
ref_genes = load_reference_gene("../data/refGene.txt.gz")
ref_genes["chrom"] = ref_genes.apply(lambda row: row["chrom"][3:5] 
                                     if re.match("^[c,h,r,0-9]*$", row["chrom"][:5]) else row["chrom"][3:4], axis=1)
ref_genes["chrom"] = ref_genes.apply(lambda row: row["chrom"] if not row["chrom"] in ("X", "Y") else 23, axis=1)

In [3]:
chromosome = list(str(x) for x in range(1,24))
ref_genes["chrom"] = ref_genes.apply(lambda row: row["chrom"] if str(row["chrom"]) in chromosome else 0, axis=1)
ref_genes = ref_genes[ref_genes.chrom != 0]

In [4]:
sw_cnv = pd.read_table("../data/swcnv/swcnv.qc6.cnv", sep="\s+")
sw_indiv = pd.read_table("../data/swcnv/swcnv.qc6.cnv.indiv", sep="\s+") # "NSEG": number of CNV in each sample
sw_cnv_indiv = pd.merge(sw_cnv, sw_indiv, how="inner", on=["FID", "IID"])
sw_cnv_indiv["PHE"] = sw_cnv_indiv.apply(lambda row: 1 if row["PHE"]==2 else 0, axis=1)

In [5]:
# calcium pathway genes
calpath_genes = pd.read_table("../data/calciumgeneset.txt", skiprows = 2, header = None, names = ["gene_name"])
calpath_genes_list = calpath_genes["gene_name"].tolist()
calpath_genes_pos = pd.merge(ref_genes, calpath_genes, how="inner", on=["gene_name"])

In [6]:
query = '''
SELECT gene_name, chrom, min(tx_start), max(tx_end)
FROM calpath_genes_pos
GROUP BY gene_name
'''
calpath_genes_pos = sqldf(query)
calpath_genes_pos = calpath_genes_pos.rename(columns={"min(tx_start)": "gene_start", "max(tx_end)": "gene_end"})

In [7]:
diff = list(set(calpath_genes_list) - set(calpath_genes_pos["gene_name"].tolist()))
calpath_genes_pos.loc[calpath_genes_pos.shape[0]] = [diff[0], "2", 65205076, 65206278]
calpath_genes_pos.loc[calpath_genes_pos.shape[0]] = [diff[1], "15", 41231149, 41281887]
calpath_genes_pos["chr"] = calpath_genes_pos.apply(lambda row: int(row["chrom"]) 
                                                if not row["chrom"] in ("X", "Y") else 23, axis=1)

In [8]:
query = '''
SELECT sw.FID, sw.PHE, sw.CHR, sw.BP1, sw.BP2, sw.TYPE, cal.gene_name
FROM calpath_genes_pos cal LEFT JOIN sw_cnv_indiv sw
WHERE sw.CHR == cal.chr
AND (
(sw.BP1 >= cal.gene_start AND sw.BP1 <= cal.gene_end)
OR
(sw.BP2 >= cal.gene_start AND sw.BP2 <= cal.gene_end)
OR
(sw.BP1 <= cal.gene_start AND sw.BP2 >= cal.gene_end)
OR
(sw.BP1 >= cal.gene_start AND sw.BP2 <= cal.gene_end)
)
GROUP BY sw.PHE, sw.CHR, sw.FID, sw.BP1, sw.BP2, sw.TYPE, cal.gene_name
'''
overlap_CNV_gene_calpath = sqldf(query).drop_duplicates(subset=("FID", "PHE", "CHR", "BP1", "BP2", "TYPE", "gene_name"))
overlap_CNV_gene_calpath.reset_index(inplace=True)
overlap_CNV_gene_calpath

Unnamed: 0,index,FID,PHE,CHR,BP1,BP2,TYPE,gene_name
0,0,PT-8TWI,0,1,1804302,2047584,3,CALML6
1,1,PT-BQT1,0,1,239678115,239800921,3,CHRM3
2,2,PT-BRNL,0,2,218749319,219480897,3,PLCD4
3,3,PT-L1G8,0,2,40674308,41559590,3,SLC8A1
4,4,PT-L31A,0,2,74969763,75323267,3,TACR1
5,5,PT-ESLA,0,3,123486169,123634548,1,MYLK
6,6,PT-FFX5,0,3,4317445,4747247,3,ITPR1
7,7,PT-8U8J,0,4,9688014,9794193,3,DRD5
8,8,PT-OPJ8,0,4,104179046,104766986,1,TACR3
9,9,PT-27QK,0,6,118701889,118900024,3,PLN


In [9]:
overlap_CNV_gene_calpath = overlap_CNV_gene_calpath.groupby(("PHE", "FID", "CHR", "BP1", "BP2", "TYPE"))
overlap_CNV_gene_calpath = overlap_CNV_gene_calpath["gene_name"].unique()
overlap_CNV_gene_calpath = overlap_CNV_gene_calpath.reset_index()

In [10]:
overlap_CNV_gene_calpath

Unnamed: 0,PHE,FID,CHR,BP1,BP2,TYPE,gene_name
0,0,PT-1RV1,23,2830696,3649902,3,[PRKX]
1,0,PT-1S1Z,23,3231040,3539248,3,[PRKX]
2,0,PT-1S24,23,2850801,3659193,3,[PRKX]
3,0,PT-1S2E,23,2830696,3659193,3,[PRKX]
4,0,PT-1SBE,23,2830696,3626408,3,[PRKX]
5,0,PT-1SBN,23,2741339,3659193,3,[PRKX]
6,0,PT-1SC5,23,2916735,3554481,3,[PRKX]
7,0,PT-1SCT,23,2850801,3659193,3,[PRKX]
8,0,PT-1TAY,23,2979529,3659193,3,[PRKX]
9,0,PT-1TC6,23,2830696,3659193,3,[PRKX]


In [11]:
for index, row in overlap_CNV_gene_calpath.iterrows():
    print (row["PHE"], row["gene_name"])

0 ['PRKX']
0 ['PRKX']
0 ['PRKX']
0 ['PRKX']
0 ['PRKX']
0 ['PRKX']
0 ['PRKX']
0 ['PRKX']
0 ['PRKX']
0 ['PRKX']
0 ['PRKX']
0 ['PLN']
0 ['CACNA1H']
0 ['PLN']
0 ['MYLK3']
0 ['ATP2A3' 'P2RX1']
0 ['ADORA2B']
0 ['PLN']
0 ['PHKB']
0 ['CHRNA7']
0 ['CHRNA7']
0 ['CHRNA7']
0 ['ATP2A1']
0 ['CHRNA7']
0 ['CALML6']
0 ['DRD5']
0 ['P2RX6']
0 ['PRKX']
0 ['CHRNA7']
0 ['PDE1C']
0 ['PDE1C']
0 ['CHRNA7']
0 ['CHRNA7']
0 ['P2RX5']
0 ['PLN']
0 ['PLCB1']
0 ['CHRNA7']
0 ['PLN']
0 ['P2RX6']
0 ['PDE1C']
0 ['PLN']
0 ['CHRNA7']
0 ['PLN']
0 ['ATP2A3' 'P2RX1']
0 ['PDE1C']
0 ['PDE1C']
0 ['CHRM3']
0 ['PRKCA']
0 ['PLCD4']
0 ['ATP2A1']
0 ['P2RX6']
0 ['ATP2A1']
0 ['ADORA2B']
0 ['BDKRB1' 'BDKRB2']
0 ['CHRNA7']
0 ['CHRNA7']
0 ['P2RX6']
0 ['PLN']
0 ['MYLK']
0 ['CHRNA7']
0 ['PLN']
0 ['P2RX6']
0 ['P2RX6']
0 ['GRIN2A']
0 ['PLN']
0 ['PDE1C']
0 ['CHRNA7']
0 ['ITPR1']
0 ['PLN']
0 ['P2RX6']
0 ['PLN']
0 ['PDE1C']
0 ['CHRNA7']
0 ['BDKRB1' 'BDKRB2']
0 ['BDKRB1' 'BDKRB2']
0 ['CHRNA7']
0 ['PPP3R2']
0 ['ATP2A1']
0 ['CHRNA7']
0 ['SLC8A1']
0

In [17]:
test["case"]

[   chrom  cnv_start  cnv_terminate    tx_name gene_name
 0  chr16   12298062       12418629  NM_032167     SNX29
 1   chr2   43953144       44109485  NM_015522  DYNC2LI1
 3   chr2   43953144       44109485  NM_022436     ABCG5
 4   chr2   43953144       44109485  NM_022437     ABCG8
 5   chr2   43953144       44109485  NM_172069   PLEKHH2,
     chrom  cnv_start  cnv_terminate       tx_name     gene_name
 0    chr1  109136721      109276393  NM_001010883       FAM102B
 1    chr1  109136721      109276393  NM_001102592        HENMT1
 2    chr1  109136721      109276393  NM_001144937         FNDC7
 3    chr1  109136721      109276393     NM_018061       PRPF38B
 6   chr10  125701683      126316176  NM_001146340        NKX1-2
 7   chr10  125701683      126316176  NM_001167880          LHPP
 8   chr10  125701683      126316176  NM_001270764        CHST15
 11  chr10  125701683      126316176  NM_001322971           OAT
 12  chr10  125701683      126316176     NM_014661        FAM53B
 15  ch

In [12]:
query = '''
SELECT chrom, gene_name, min(tx_start), max(tx_end)
FROM ref_genes
GROUP BY chrom, gene_name
'''
ref_genes = sqldf(query)
ref_genes = ref_genes.rename(columns={"min(tx_start)": "gene_start", "max(tx_end)": "gene_end"})
ref_genes

Unnamed: 0,chrom,gene_name,gene_start,gene_end
0,1,A3GALT2,33772366,33786699
1,1,AADACL3,12776117,12788726
2,1,AADACL4,12704565,12727097
3,1,ABCA4,94458393,94586705
4,1,ABCB10,229652328,229694442
5,1,ABCD3,94883932,94984219
6,1,ABL2,179068461,179198819
7,1,ACADM,76190031,76229363
8,1,ACAP3,1227763,1243269
9,1,ACBD3,226332379,226374423


#### The table of overlap of CNVs (Sweden data) and genes

In [15]:
# needs to be modified, does not work
query = '''
SELECT sw.FID, sw.CHR, sw.BP1, sw.BP2, ref.gene_name, sw.PHE, sw.TYPE
FROM ref_genes ref LEFT JOIN sw_cnv_indiv sw
WHERE sw.CHR == ref.chrom
AND (
(sw.BP1 >= ref.gene_start AND sw.BP1 <= ref.gene_end)
OR
(sw.BP2 >= ref.gene_start AND sw.BP2 <= ref.gene_end)
OR
(sw.BP1 <= ref.gene_start AND sw.BP2 >= ref.gene_end)
OR
(sw.BP1 >= ref.gene_start AND sw.BP2 <= ref.gene_end)
)
GROUP BY sw.PHE, sw.CHR, sw.FID, sw.BP1, sw.BP2, sw.TYPE, ref.gene_name
'''
cnv_gene_overlap = sqldf(query).drop_duplicates(subset=("FID", "PHE", "CHR", "BP1", "BP2", "TYPE", "gene_name"))

In [20]:
del_cnv = cnv_gene_overlap[cnv_gene_overlap["TYPE"]==1]
dup_cnv = cnv_gene_overlap[cnv_gene_overlap["TYPE"]==3]
del_cnv

Unnamed: 0,FID,CHR,BP1,BP2,gene_name,PHE,TYPE
5,PT-1RUT,1,248683400,248797122,OR2G6,0,1
6,PT-1RUT,1,248683400,248797122,OR2T10,0,1
7,PT-1RUT,1,248683400,248797122,OR2T11,0,1
8,PT-1RUT,1,248683400,248797122,OR2T29,0,1
9,PT-1RUT,1,248683400,248797122,OR2T34,0,1
25,PT-1RW5,1,16970089,17114634,ESPNP,0,1
26,PT-1RW5,1,16970089,17114634,FAM231A,0,1
27,PT-1RW5,1,16970089,17114634,FAM231C,0,1
28,PT-1RW5,1,16970089,17114634,LOC102724562,0,1
29,PT-1RW5,1,16970089,17114634,LOC440570,0,1


In [44]:
del_cnv_case = del_cnv[del_cnv["PHE"]==0]
del_cnv_ctrl = del_cnv[del_cnv["PHE"]==1]
n_del_cnv_case = del_cnv_case.drop_duplicates(subset=("FID")).groupby("PHE").size()
n_del_cnv_ctrl = del_cnv_ctrl.drop_duplicates(subset=("FID")).groupby("PHE").size()
print (n_del_cnv_ctrl)

PHE
1    1142
dtype: int64


In [16]:
from utils import load_data
test = load_data("data/calcium_pathway_20_shape_5.data.pkl")

In [None]:
def pkl_to_matrix(input_data, make_block = False, dtype = np.uint8):
    dat = load_data(input_data)
    ref = load_reference_gene("../data/refGene.txt.gz")
    genes = pd.Series(list(set(ref['gene_name'])))
    regression_data = np.array([np.array(genes.isin(item["gene_name"]), dtype = float) 
                                for item in dat['case'] + dat['ctrl']])
    phenotype = np.matrix([1]*len(dat['case']) + [0]*len(dat['ctrl'])).T
    regression_data = np.hstack((phenotype, regression_data))
    df = pd.DataFrame(regression_data, columns = ['phenotype'] + genes.tolist())
    newdf = pd.DataFrame()
    for col in df:
        if sum(df[col]) > 0:
            newdf[col] = df[col]
        else: continue
    if not make_block:
        res = newdf.astype(dtype, copy = True)
    else:
        blocks = get_analysis_blocks(pd.concat([pd.concat(dat['case']), pd.concat(dat['ctrl'])]))
        res = [newdf[['phenotype'] + item].astype(dtype, copy = True) for item in blocks]
    return {"data": res}