# Running TORUS on CNV data from the Swedish schizophrenia study
### Use the Swedish schizophrenia individual-level data to build the gene–CNV overlap configuration: columns represent genes, rows represent individuals
### Use the transformed file to run TORUS

#### Obtain file format used in TORUS

Use 

In [1]:
import pandas as pd
from pandasql import sqldf
from utils import load_reference_gene

In [2]:
# Load the reference gene annotation and the Sweden CNV calls, then attach the
# per-individual record to every CNV segment.
ref_genes = load_reference_gene("../data/refGene.txt.gz")
# Raw strings keep "\s" as a regex token instead of an invalid string escape.
sw_cnv = pd.read_table("../data/swcnv/swcnv.qc6.cnv", sep=r"\s+")
sw_indiv = pd.read_table("../data/swcnv/swcnv.qc6.cnv.indiv", sep=r"\s+")  # "NSEG": number of CNV in each sample
# Inner join on (FID, IID): only segments with a matching individual row are kept.
sw_cnv_indiv = pd.merge(sw_cnv, sw_indiv, how="inner", on=["FID", "IID"])
# Recode the phenotype to a 0/1 indicator: 2 -> 1, anything else -> 0
# (presumably PLINK coding where 2 = case -- confirm).
# A vectorised comparison gives the same values as a row-wise apply and also
# works when the merged frame is empty.
sw_cnv_indiv["PHE"] = (sw_cnv_indiv["PHE"] == 2).astype(int)

In [3]:
# Calcium pathway gene set: read the gene symbols (the first two lines of the
# file are skipped) and look them up in the reference annotation.
calpath_genes = pd.read_table(
    "../data/calciumgeneset.txt",
    skiprows=2,
    header=None,
    names=["gene_name"],
)
calpath_genes_list = calpath_genes["gene_name"].tolist()
# Inner join: pathway genes that are absent from ref_genes drop out here.
calpath_genes_pos = pd.merge(
    left=ref_genes,
    right=calpath_genes,
    on="gene_name",
    how="inner",
)

In [4]:
# Collapse the annotation rows to one row per gene, spanning from the smallest
# tx_start to the largest tx_end of that gene.
# The output columns are named with AS inside the query: SQLite does not
# guarantee the auto-generated name of an un-aliased expression such as
# "min(tx_start)", so renaming by that name afterwards is fragile.
# NOTE(review): chrom is not part of GROUP BY, so for a gene annotated on more
# than one chromosome/contig the reported chrom comes from one of its rows and
# the span mixes coordinates -- confirm this cannot happen for this gene set.
query = '''
SELECT gene_name,
       chrom,
       min(tx_start) AS gene_start,
       max(tx_end) AS gene_end
FROM calpath_genes_pos
GROUP BY gene_name
'''
# Rebinds calpath_genes_pos from the transcript-level table to the per-gene table.
calpath_genes_pos = sqldf(query)

In [5]:
# Pathway genes that found no match in the reference annotation.  Sorted so the
# list is reproducible: iteration order of a set of strings can change between
# interpreter runs.
diff = sorted(set(calpath_genes_list) - set(calpath_genes_pos["gene_name"]))

# Hand-entered coordinates for the unmatched symbols, keyed by gene name so a
# locus can never be attached to the wrong gene.
# NOTE(review): the previous code assigned these two loci by position in an
# unordered set difference; the symbol -> locus pairing below is inferred
# (LOC729317 on chr2, CHP on chr15) -- confirm against the annotation source.
manual_gene_pos = {
    "LOC729317": ("chr2", 65205076, 65206278),
    "CHP": ("chr15", 41231149, 41281887),
}
# Append one row per unmatched gene that has a manual entry.  Genes in `diff`
# without an entry are left out; inspect `diff` to see them.
for missing_gene in diff:
    if missing_gene in manual_gene_pos:
        calpath_genes_pos.loc[len(calpath_genes_pos)] = (
            [missing_gene] + list(manual_gene_pos[missing_gene])
        )

# Numeric chromosome code used to join against the CNV table: strip the "chr"
# prefix; X and Y are both coded 23 (unchanged from the previous logic).
# NOTE(review): PLINK-style coding usually gives Y its own code -- confirm that
# Y -> 23 is intended.
calpath_genes_pos["chr"] = calpath_genes_pos["chrom"].map(
    lambda chrom: 23 if chrom[3:] in ("X", "Y") else int(chrom[3:])
)

In [6]:
# Show the merged CNV / individual table to sanity-check its columns.
print(sw_cnv_indiv)

               FID  IID  CHR        BP1        BP2  TYPE     SCORE  SITES  \
0          PT-8K8C    1    8   30371917   30849740     3    95.760     97   
1          PT-8UHJ    1    7  143218408  143532998     3    20.780     74   
2          PT-8UHU    1    2  117775371  117925547     3    69.060     73   
3          PT-8UHU    1    2  230714025  230902238     3    53.580     73   
4          PT-8UHU    1   15   24578499   24778891     4    91.330    102   
5          PT-8K6N    1    4    2058475    2316874     3    29.670     51   
6          PT-8K6N    1    7  157925600  158260724     1    52.935     73   
7          PT-8TUA    1    2   83597659   83749480     3    46.110     82   
8          PT-8UCU    1    7   57481104   57636170     3    54.280    117   
9          PT-8UD3    1    7  143444216  143568059     1    13.750     51   
10         PT-8UD3    1   21   10736871   11002646     3    18.440     80   
11         PT-8K79    1   18    1725758    1839387     1   110.920     42   

In [13]:
# Pair every CNV segment with each calcium-pathway gene it overlaps on the
# same chromosome.
#
# * Written as an explicit JOIN ... ON: the earlier LEFT JOIN had no ON clause
#   and was filtered on the right-hand table in WHERE, so it already behaved
#   as an inner join.
# * Two closed intervals overlap exactly when each one starts no later than
#   the other ends; this single test replaces the four case-by-case
#   predicates (assumes BP1 <= BP2 and gene_start <= gene_end).
# * GROUP BY lists every selected column, so duplicate rows are collapsed in
#   SQL and no pandas-side drop_duplicates is needed.
# * ORDER BY makes the row order explicit instead of relying on how the
#   engine happens to implement GROUP BY.
query = '''
SELECT sw.FID, sw.PHE, sw.CHR, sw.BP1, sw.BP2, sw.TYPE, cal.gene_name
FROM calpath_genes_pos cal
JOIN sw_cnv_indiv sw
  ON sw.CHR = cal.chr
 AND sw.BP1 <= cal.gene_end
 AND sw.BP2 >= cal.gene_start
GROUP BY sw.PHE, sw.CHR, sw.FID, sw.BP1, sw.BP2, sw.TYPE, cal.gene_name
ORDER BY sw.PHE, sw.CHR, sw.FID, sw.BP1, sw.BP2, sw.TYPE, cal.gene_name
'''
# reset_index() keeps the extra "index" column the table has always carried.
overlap_CNV_gene = sqldf(query).reset_index()

In [14]:
# Display the CNV / gene overlap table built by the query above.
overlap_CNV_gene

Unnamed: 0,index,FID,PHE,CHR,BP1,BP2,TYPE,gene_name
0,0,PT-8TWI,0,1,1804302,2047584,3,CALML6
1,1,PT-BQT1,0,1,239678115,239800921,3,CHRM3
2,2,PT-BRNL,0,2,218749319,219480897,3,PLCD4
3,3,PT-L1G8,0,2,40674308,41559590,3,SLC8A1
4,4,PT-L31A,0,2,74969763,75323267,3,TACR1
5,5,PT-ESLA,0,3,123486169,123634548,1,MYLK
6,6,PT-FFX5,0,3,4317445,4747247,3,ITPR1
7,7,PT-8U8J,0,4,9688014,9794193,3,DRD5
8,8,PT-OPJ8,0,4,104179046,104766986,1,TACR3
9,9,PT-27QK,0,6,118701889,118900024,3,PLN


In [15]:
# Collapse to one row per CNV segment, gathering the unique names of the genes
# that segment overlaps into a single array-valued "gene_name" cell.
# The grouping keys are passed as a list: current pandas treats a tuple `by`
# as ONE key (a single column label) and raises KeyError for it.
overlap_CNV_gene = (
    overlap_CNV_gene
    .groupby(["PHE", "FID", "CHR", "BP1", "BP2", "TYPE"])["gene_name"]
    .unique()
    .reset_index()
)

In [16]:
# List the TYPE value of every grouped CNV segment
# (presumably PLINK copy-number codes -- confirm).
overlap_CNV_gene.loc[:, "TYPE"].tolist()

[3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 1,
 3,
 1,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 1,
 1,
 3,
 3,
 1,
 1,
 3,
 1,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 1,
 3,
 3,
 1,
 3,
 3,
 3,
 1,
 1,
 3,
 1,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 1,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 1,
 3,
 3,
 3,
 1,
 3,
 3,
 3,
 3,
 1,
 1,
 3,
 1,
 3,
 3,
 1,
 3,
 3,
 3,
 3,
 1,
 1,
 1,
 1,
 3,
 3,
 3,
 3,
 3,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 3,
 3,
 1,
 3,
 3,
 3,
 1,
 1,
 3,
 1,
 1,
 3,
 3,
 3,
 3,
 3,
 3,
 1,
 1,
 1,
 3,
 1,
 3,
 1,
 3,
 1,
 1,
 1,
 3,
 3,
 3,
 3,
 1,
 1,
 3,
 3,
 3,
 3,
 1,
 3,
 3,
 3,
 1,
 1,
 3,
 3]

In [19]:
# Print the pathway gene symbols, one per line, in file order.
for gene in calpath_genes_list:
    print(gene)

ADCY1
ADCY2
ADCY3
ADCY4
ADCY7
ADCY8
ADCY9
ADORA2A
ADORA2B
ADRA1A
ADRA1B
ADRA1D
ADRB1
ADRB2
ADRB3
AGTR1
ATP2A1
ATP2A2
ATP2A3
ATP2B1
ATP2B2
ATP2B3
ATP2B4
AVPR1A
AVPR1B
BDKRB1
BDKRB2
BST1
CACNA1A
CACNA1B
CACNA1C
CACNA1D
CACNA1E
CACNA1F
CACNA1G
CACNA1H
CACNA1I
CACNA1S
CALM1
CALM2
CALM3
CALML3
CALML5
CALML6
CAMK2A
CAMK2B
CAMK2D
CAMK2G
CAMK4
CCKAR
CCKBR
CD38
CHP
CHP2
CHRM1
CHRM2
CHRM3
CHRM5
CHRNA7
CYSLTR1
CYSLTR2
DRD1
DRD5
EDNRA
EDNRB
EGFR
ERBB2
ERBB3
ERBB4
F2R
GNA11
GNA14
GNA15
GNAL
GNAQ
GNAS
GRIN1
GRIN2A
GRIN2C
GRIN2D
GRM1
GRM5
GRPR
HRH1
HRH2
HTR2A
HTR2B
HTR2C
HTR4
HTR5A
HTR6
HTR7
ITPKA
ITPKB
ITPR1
ITPR2
ITPR3
LHCGR
LOC729317
LTB4R2
MYLK
MYLK2
MYLK3
NOS1
NOS2
NOS3
NTSR1
OXTR
P2RX1
P2RX2
P2RX3
P2RX4
P2RX5
P2RX6
P2RX7
PDE1A
PDE1B
PDE1C
PDGFRA
PDGFRB
PHKA1
PHKA2
PHKB
PHKG1
PHKG2
PLCB1
PLCB2
PLCB3
PLCB4
PLCD1
PLCD3
PLCD4
PLCE1
PLCG1
PLCG2
PLCZ1
PLN
PPID
PPP3CA
PPP3CB
PPP3CC
PPP3R1
PPP3R2
PRKACA
PRKACB
PRKACG
PRKCA
PRKCB
PRKCG
PRKX
PTAFR
PTGER1
PTGER3
PTGFR
PTK2B
RYR1
RYR2
RYR3
SLC25A31
SLC25A4