#### Calcium signaling pathway analysis. Gene set: KEGG_CALCIUM_SIGNALING_PATHWAY, downloaded from MSigDB
http://software.broadinstitute.org/gsea/msigdb/cards/KEGG_CALCIUM_SIGNALING_PATHWAY

In [1]:
import pandas as pd
import numpy as np
from pandasql import sqldf
from scipy import stats
from simulation import *

In [2]:
# Load the three inputs used by the rest of the notebook.
# The gene-set file is read as a single "gene_name" column; its first two
# lines are skipped and it has no header row of its own.
calpath_gene_df = pd.read_table(
    "../data/calciumgeneset.txt",
    skiprows=2,
    header=None,
    names=["gene_name"],
)
# Raw string: "\s" is not a valid Python escape sequence, so the non-raw
# literal triggers a DeprecationWarning/SyntaxWarning on modern Python.
sw_cnv_df = pd.read_table("../data/swcnv/swcnv.qc6.cnv", sep=r"\s+")
# NOTE(review): this path has no "../" prefix, unlike the two reads above —
# confirm how load_reference_gene resolves relative paths.
refgene_df = load_reference_gene("data/refGene.txt.gz")

In [3]:
# Plain Python list of the pathway's gene symbols, in file order.
calpath_gene_list = calpath_gene_df.loc[:, "gene_name"].tolist()

In [4]:
# Print the full list of pathway gene symbols for a quick visual check.
print(calpath_gene_list)

['ADCY1', 'ADCY2', 'ADCY3', 'ADCY4', 'ADCY7', 'ADCY8', 'ADCY9', 'ADORA2A', 'ADORA2B', 'ADRA1A', 'ADRA1B', 'ADRA1D', 'ADRB1', 'ADRB2', 'ADRB3', 'AGTR1', 'ATP2A1', 'ATP2A2', 'ATP2A3', 'ATP2B1', 'ATP2B2', 'ATP2B3', 'ATP2B4', 'AVPR1A', 'AVPR1B', 'BDKRB1', 'BDKRB2', 'BST1', 'CACNA1A', 'CACNA1B', 'CACNA1C', 'CACNA1D', 'CACNA1E', 'CACNA1F', 'CACNA1G', 'CACNA1H', 'CACNA1I', 'CACNA1S', 'CALM1', 'CALM2', 'CALM3', 'CALML3', 'CALML5', 'CALML6', 'CAMK2A', 'CAMK2B', 'CAMK2D', 'CAMK2G', 'CAMK4', 'CCKAR', 'CCKBR', 'CD38', 'CHP', 'CHP2', 'CHRM1', 'CHRM2', 'CHRM3', 'CHRM5', 'CHRNA7', 'CYSLTR1', 'CYSLTR2', 'DRD1', 'DRD5', 'EDNRA', 'EDNRB', 'EGFR', 'ERBB2', 'ERBB3', 'ERBB4', 'F2R', 'GNA11', 'GNA14', 'GNA15', 'GNAL', 'GNAQ', 'GNAS', 'GRIN1', 'GRIN2A', 'GRIN2C', 'GRIN2D', 'GRM1', 'GRM5', 'GRPR', 'HRH1', 'HRH2', 'HTR2A', 'HTR2B', 'HTR2C', 'HTR4', 'HTR5A', 'HTR6', 'HTR7', 'ITPKA', 'ITPKB', 'ITPR1', 'ITPR2', 'ITPR3', 'LHCGR', 'LOC729317', 'LTB4R2', 'MYLK', 'MYLK2', 'MYLK3', 'NOS1', 'NOS2', 'NOS3', 'NTSR1', 'OX

In [5]:
# Keep only the reference-gene rows whose gene_name is in the pathway gene set
# (inner join on gene_name).
calpath_gene_pos = refgene_df.merge(
    calpath_gene_df,
    how="inner",
    on="gene_name",
)

In [6]:
# Display the merged frame; a gene can appear on several rows, one per tx_name.
calpath_gene_pos

Unnamed: 0,tx_name,chrom,tx_start,tx_end,gene_name
0,NM_018896,chr17,48638428,48704832,CACNA1G
1,NM_032726,chr2,219472487,219501909,PLCD4
2,NM_182797,chr20,9198036,9461462,PLCB4
3,NM_001172646,chr20,9049700,9461462,PLCB4
4,NM_000933,chr20,9288446,9461462,PLCB4
5,NM_001321589,chr4,114372187,114683083,CAMK2D
6,NM_172115,chr4,114378343,114683083,CAMK2D
7,NM_001321592,chr4,114675630,114683083,CAMK2D
8,NM_182932,chr14,70510933,70655787,SLC8A3
9,NM_001130417,chr14,70510933,70546934,SLC8A3


In [7]:
# Collapse the per-transcript rows into one row per gene, spanning from the
# smallest tx_start to the largest tx_end among that gene's rows.
#
# The output columns are named with AS inside the query: SQLite does not
# guarantee the label of an un-aliased result column such as "min(tx_start)",
# so renaming by that label afterwards could silently become a no-op.
#
# NOTE(review): chrom is neither aggregated nor in GROUP BY, so if one
# gene_name has rows on more than one chrom value the reported chrom and span
# would mix them — confirm that does not occur in refgene_df.
query = '''
SELECT gene_name,
       chrom,
       min(tx_start) AS gene_start,
       max(tx_end) AS gene_end
FROM calpath_gene_pos
GROUP BY gene_name
'''
calpath_gene_pos = sqldf(query)

In [8]:
# Gene symbols from the pathway list that have no row in calpath_gene_pos.
diff = set(calpath_gene_list).difference(calpath_gene_pos["gene_name"])

In [9]:
# https://www.ncbi.nlm.nih.gov/gene/729317
# Manually supplied positions (gene_name, chrom, gene_start, gene_end), taken
# from the NCBI Gene pages linked next to each entry.
# Presumably these are the genes left in `diff` — verify against that set.
manual_gene_positions = [
    # https://www.ncbi.nlm.nih.gov/gene/729317
    ["LOC729317", "chr2", 65205076, 65206278],
    # https://www.ncbi.nlm.nih.gov/gene/?term=CHP
    ["CHP", "chr15", 41231149, 41281887],
]
for manual_row in manual_gene_positions:
    # Skip genes that are already present so that re-running this cell does
    # not append duplicate rows.
    if (calpath_gene_pos["gene_name"] == manual_row[0]).any():
        continue
    # Append at the next positional label, i.e. the current row count.
    calpath_gene_pos.loc[calpath_gene_pos.shape[0]] = manual_row

In [10]:
# Display the per-gene frame (gene_name, chrom, gene_start, gene_end),
# including the manually appended rows.
calpath_gene_pos

Unnamed: 0,gene_name,chrom,gene_start,gene_end
0,ADCY1,chr7,45613738,45762714
1,ADCY2,chr5,7396342,7830194
2,ADCY3,chr2,25042038,25142886
3,ADCY4,chr14,24787554,24804277
4,ADCY7,chr16,50300450,50352045
5,ADCY8,chr8,131792546,132052835
6,ADCY9,chr16,4012649,4166186
7,ADORA2A,chr22,24819564,24838328
8,ADORA2B,chr17,15848230,15879210
9,ADRA1A,chr8,26605666,26724760
