In [1]:
from pathlib import Path

import pandas as pd

import mpmp.config as cfg

%load_ext autoreload
%autoreload 2

In [2]:
# Load the COSMIC gene table (tab-separated); the first column becomes the index.
cosmic_df = pd.read_csv(cfg.cosmic_genes_file, sep='\t', index_col=0)

# A gene is retained only if it satisfies all three criteria below.
# 1) tier 1 genes only
is_tier_1 = cosmic_df['Tier'].eq(1)
# 2) the gene must have a catalogued somatic mutation
has_somatic_mutation = cosmic_df['Somatic'].eq('yes')
# 3) the gene must not be observed in cancer exclusively as a fusion
#    (we're not calling fusion genes in our mutation data)
not_fusion_only = cosmic_df['Role in Cancer'].ne('fusion')

# .copy() so that later column assignments operate on an independent frame
cosmic_df = cosmic_df.loc[
    is_tier_1 & has_somatic_mutation & not_fusion_only
].copy()

print(cosmic_df.shape)
cosmic_df.head()

(445, 19)


Unnamed: 0_level_0,Name,Entrez GeneId,Genome Location,Tier,Hallmark,Chr Band,Somatic,Germline,Tumour Types(Somatic),Tumour Types(Germline),Cancer Syndrome,Tissue Type,Molecular Genetics,Role in Cancer,Mutation Types,Translocation Partner,Other Germline Mut,Other Syndrome,Synonyms
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
ABI1,abl-interactor 1,10006,10:26746593-26860935,1,Yes,12.1,yes,,AML,,,L,Dom,"TSG, fusion",T,KMT2A,,,"10006,ABI-1,ABI1,E3B1,ENSG00000136754.17,Q8IZP..."
ABL1,v-abl Abelson murine leukemia viral oncogene h...,25,9:130713946-130885683,1,Yes,34.12,yes,,"CML, ALL, T-ALL",,,L,Dom,"oncogene, fusion","T, Mis","BCR, ETV6, NUP214",,,"25,ABL,ABL1,ENSG00000097007.17,JTK7,P00519,c-A..."
ABL2,"c-abl oncogene 2, non-receptor tyrosine kinase",27,1:179099327-179229601,1,,25.2,yes,,AML,,,L,Dom,"oncogene, fusion",T,ETV6,,,"27,ABL2,ABLL,ARG,ENSG00000143322.19,P42684"
ACKR3,atypical chemokine receptor 3,57007,2:236569641-236582358,1,Yes,37.3,yes,,lipoma,,,M,Dom,"oncogene, fusion",T,HMGA2,,,"57007,ACKR3,CMKOR1,CXCR7,ENSG00000144476.5,GPR..."
ACVR1,"activin A receptor, type I",90,2:157736444-157875111,1,Yes,24.1,yes,,DIPG,,,O,Dom,oncogene,Mis,,yes,Fibrodysplasia ossificans progressiva,"90,ACVR1,ACVR1A,ACVRLK2,ALK2,ENSG00000115170.1..."


### Clean up the oncogene/TSG annotations

We need each gene to be annotated as _either_ an oncogene or TSG, so we know whether to use copy gain or copy loss data to define relevant CNV info. 

So, here, we will:

1) drop genes that are annotated only as fusion genes (since we're not calling fusions at this time)  
2) try to resolve genes that are annotated as both oncogene/TSG (usually context/cancer type specific) into their most likely pan-cancer category  
3) for genes that can't be resolved confidently, we'll keep them as "oncogene, TSG" and run our scripts for both conditions downstream.

In [3]:
# show the role annotations before cleanup
print(cosmic_df['Role in Cancer'].unique())

# if a gene is annotated as an oncogene/TSG and a fusion gene, just
# get rid of the fusion component.
# regex=False forces a literal substring replacement: the default value of
# `regex` in Series.str.replace differs between pandas 1.x and 2.x, so we
# pin it explicitly rather than rely on the installed version.
cosmic_df['Role in Cancer'] = (
    cosmic_df['Role in Cancer'].str.replace(', fusion', '', regex=False)
)

# show the role annotations after cleanup
print(cosmic_df['Role in Cancer'].unique())

['TSG, fusion' 'oncogene, fusion' 'oncogene' 'TSG' 'oncogene, TSG, fusion'
 'oncogene, TSG']
['TSG' 'oncogene' 'oncogene, TSG']


In [6]:
# Pull out the genes still annotated as both oncogene and TSG, so we can
# look at how they might be resolved into a single category.
is_dual_role = cosmic_df['Role in Cancer'].eq('oncogene, TSG')
cosmic_dual_df = cosmic_df.loc[is_dual_role]

print(cosmic_dual_df.shape)
print(cosmic_dual_df.index)
cosmic_dual_df.head()

(65, 19)
Index(['ARNT', 'ATP1A1', 'BCL11B', 'BCL9L', 'BCORL1', 'BIRC3', 'BTK', 'CBL',
       'CBLC', 'CIC', 'CREBBP', 'CUX1', 'DAXX', 'ELF4', 'EPAS1', 'ERBB4',
       'ESR1', 'EZH2', 'FES', 'FOXL2', 'FOXO1', 'FOXO3', 'FOXO4', 'GATA1',
       'GATA3', 'HOXA11', 'HOXA9', 'IKZF3', 'IRF4', 'IRS4', 'JAK1', 'KDM6A',
       'KLF4', 'KMT2D', 'LEF1', 'MAP2K4', 'MAP3K1', 'MAP3K13', 'MRTFA',
       'NFE2L2', 'NFKB2', 'NKX2-1', 'NOTCH1', 'NOTCH2', 'NTRK1', 'PAX5',
       'POLQ', 'PRKAR1A', 'PTK6', 'QKI', 'RAD21', 'RHOA', 'RUNX1', 'RUNX1T1',
       'STAT5B', 'SUZ12', 'TBL1XR1', 'TBX3', 'TCF3', 'TERT', 'TET1', 'TP53',
       'TP63', 'TRIM24', 'WT1'],
      dtype='object', name='Gene Symbol')


Unnamed: 0_level_0,Name,Entrez GeneId,Genome Location,Tier,Hallmark,Chr Band,Somatic,Germline,Tumour Types(Somatic),Tumour Types(Germline),Cancer Syndrome,Tissue Type,Molecular Genetics,Role in Cancer,Mutation Types,Translocation Partner,Other Germline Mut,Other Syndrome,Synonyms
Gene Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
ARNT,aryl hydrocarbon receptor nuclear translocator,405,1:150809705-150876768,1,Yes,21.3,yes,,AML,,,L,Dom,"oncogene, TSG",T,ETV6,,,"405,ARNT,ENSG00000143437.20,HIF-1beta,P27540,b..."
ATP1A1,"ATPase, Na+/K+ transporting, alpha 1 polypeptide",476,1:116373867-116404772,1,Yes,13.1,yes,,adrenal aldosterone producing adenoma,,,E,Dom,"oncogene, TSG","Mis, O",,,,"476,ATP1A1,ENSG00000163399.15,P05023"
BCL11B,B-cell CLL/lymphoma 11B (CTIP2),64919,14:99169287-99271228,1,Yes,32.2,yes,,T-ALL,,,L,Dom,"oncogene, TSG",T,TLX3,,,"64919,BCL11B,CTIP-2,CTIP2,ENSG00000127152.17,Q..."
BCL9L,B-cell CLL/lymphoma 9-like,283149,11:118893875-118910904,1,Yes,23.3,yes,,"colorectal cancer, endometrial carcinoma, gast...",,,E,,"oncogene, TSG","Mis, F",,,,"283149,BCL9L,DLNB11,ENSG00000186174.12,Q86UU0"
BCORL1,BCL6 corepressor-like 1,63035,X:130005188-130058083,1,Yes,26.1,yes,,"AML, HNSCC",,,"E, L",,"oncogene, TSG","Mis, N, F",,,,"63035,BCORL1,CXorf10,ENSG00000085185.15,FLJ113..."


In [10]:
# Bailey et al. supplementary table, from
# https://www.sciencedirect.com/science/article/pii/S009286741830237X
# It holds oncogene/TSG predictions per gene/cancer type from the 20/20+ classifier.
bailey_table_path = cfg.data_dir / '1-s2.0-S009286741830237X-mmc1.xlsx'

class_df = (
    pd.read_excel(
        bailey_table_path,
        sheet_name='Table S1',
        # header=3 is zero-based: column names are read from the sheet's 4th row
        header=3,
        index_col='KEY',
        engine='openpyxl',
    )
    # give the prediction column a short name that is easier to reference
    .rename(columns={
        'Tumor suppressor or oncogene prediction (by 20/20+)': 'classification',
    })
)

print(class_df.shape)
class_df.head()

(782, 25)


  warn(msg)


Unnamed: 0_level_0,Gene,Cancer,classification,Decision,Tissue Frequency,Pancan Frequency,Consensus Score,Correlation adusted score,Novel,Rescue Notes,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
KEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ABL1_PANCAN,ABL1,PANCAN,,rescued,,0.011675,0.0,,0.0,Evidence from OncoImpact/DriverNET overlap (SN...,...,,,,,,,,,,
ACVR1_UCEC,ACVR1,UCEC,oncogene,official,0.05303,0.00749,1.5,1.5,0.0,,...,,,,,,,,,,
ACVR1B_PANCAN,ACVR1B,PANCAN,possible tsg,official,,0.010904,1.0,0.0,0.0,,...,,,,,,,,,,
ACVR2A_COADREAD,ACVR2A,COADREAD,tsg,official,0.028481,0.013988,1.5,1.5,0.0,,...,,,,,,,,,,
ACVR2A_LIHC,ACVR2A,LIHC,possible tsg,official,0.031073,0.013988,1.5,1.5,0.0,,...,,,,,,,,,,


In [20]:
# Keep only the Bailey et al. rows that are:
#   - pan-cancer entries,
#   - for genes COSMIC annotates as both oncogene and TSG,
#   - with a non-missing oncogene/TSG classification.
is_pancan = class_df['Cancer'] == 'PANCAN'
is_dual_gene = class_df['Gene'].isin(cosmic_dual_df.index)
has_prediction = class_df['classification'].notna()

bailey_predicted_df = class_df.loc[is_pancan & is_dual_gene & has_prediction]

print(bailey_predicted_df.shape)
bailey_predicted_df.head(20)

(16, 25)


Unnamed: 0_level_0,Gene,Cancer,classification,Decision,Tissue Frequency,Pancan Frequency,Consensus Score,Correlation adusted score,Novel,Rescue Notes,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
KEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CIC_PANCAN,CIC,PANCAN,possible tsg,official,,0.027646,4.5,3.984499,0.0,,...,,,,,,,,,,
CREBBP_PANCAN,CREBBP,PANCAN,tsg,official,,0.036568,3.0,2.484499,0.0,,...,,,,,,,,,,
GATA3_PANCAN,GATA3,PANCAN,possible tsg,official,,0.024011,1.5,1.5,0.0,,...,,,,,,,,,,
JAK1_PANCAN,JAK1,PANCAN,tsg,official,,0.013658,1.5,1.5,0.0,,...,,,,,,,,,,
KDM6A_PANCAN,KDM6A,PANCAN,tsg,official,,0.029409,3.5,2.984499,0.0,,...,,,,,,,,,,
KMT2D_PANCAN,KMT2D,PANCAN,tsg,official,,0.083269,3.0,2.484499,0.0,,...,,,,,,,,,,
MAP2K4_PANCAN,MAP2K4,PANCAN,tsg,official,,0.011896,2.5,1.984499,0.0,,...,,,,,,,,,,
MAP3K1_PANCAN,MAP3K1,PANCAN,tsg,official,,0.020707,2.5,1.984499,0.0,,...,,,,,,,,,,
NFE2L2_PANCAN,NFE2L2,PANCAN,oncogene,official,,0.027756,3.0,2.484499,0.0,,...,,,,,,,,,,
NOTCH1_PANCAN,NOTCH1,PANCAN,tsg,official,,0.039432,3.0,2.484499,0.0,,...,,,,,,,,,,
