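## Label Park et al. driver genes by "two-hit" class

Load the Park et al. 2021 CNA loss/gain gene sets together with pan-cancer mutation and copy number data, then label each gene with its Park et al. driver "class".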

In [1]:
from pathlib import Path
import pickle as pkl

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import sys; sys.path.append('..')
import config as cfg

%load_ext autoreload
%autoreload 2

In [2]:
# Park et al. gene set tables (one for CNA loss, one for CNA gain),
# both located under cfg.data_dir
park_loss_data = cfg.data_dir / 'park_loss_df.tsv'
park_gain_data = cfg.data_dir / 'park_gain_df.tsv'

# directory of Park gene/cancer type predictions
park_preds_dir = cfg.data_dir / 'park_genes_preds'

# pickled mutation and copy number data
# NOTE(review): this is an absolute path on a single machine, so the notebook
# will not run elsewhere as-is; consider deriving it from cfg like the paths
# above
pancancer_pickle = Path('/home/jake/research/mpmp/data/pancancer_data.pkl')

### Load mutation/copy number info

For now, just use binary mutation status from the pancancer repo. In the future we could pull more granular info from MC3, but it would take some engineering of `1_get_mutation_counts` to do this for lots of genes.

In [3]:
# Park et al. CNA-loss table: tab-separated, first column used as the row index
park_loss_df = pd.read_csv(
    park_loss_data,
    sep='\t',
    index_col=0,
)
park_loss_df.head()

Unnamed: 0,Gene,Tissue,Effect,Pval,FDR,LogFDR,Target,classification
AJUBA_HNSC,AJUBA,HNSC,-0.93605,0.008732955,0.045816,1.338886,A_Hit,TSG
ARID1A_LGG,ARID1A,LGG,-0.904202,0.01956617,0.076291,1.117471,A_Hit,TSG
ARID1A_STAD,ARID1A,STAD,1.133548,0.0003979932,0.000294,3.517309,B_Target,TSG
ARID1A_UCEC,ARID1A,UCEC,1.597876,3.451937e-07,0.0,5.0,B_Target,TSG
ARID2_LIHC,ARID2,LIHC,-0.978419,0.01019284,0.050276,1.298552,A_Hit,TSG


In [4]:
# Park et al. CNA-gain table: tab-separated, first column used as the row index
park_gain_df = pd.read_csv(
    park_gain_data,
    sep='\t',
    index_col=0,
)
park_gain_df.head()

Unnamed: 0,Gene,Tissue,Effect,Pval,FDR,LogFDR,Target,classification
ARID1A_UCEC,ARID1A,UCEC,-2.351526,0.001154365,-0.000447,3.340299,B_Target,TSG
ATRX_LGG,ATRX,LGG,1.440987,4.025141e-08,0.0,5.0,A_Hit,TSG
BRAF_SKCM,BRAF,SKCM,1.239939,2.042839e-11,0.0,5.0,A_Hit,Oncogene
BRAF_THCA,BRAF,THCA,-2.449684,0.0007509064,-0.000209,3.659041,B_Target,Oncogene
CTNNB1_UCEC,CTNNB1,UCEC,-1.257861,0.01597968,0.096292,1.016366,B_Target,Oncogene


In [5]:
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source
with pancancer_pickle.open('rb') as pickle_file:
    pancancer_data = pkl.load(pickle_file)

In [6]:
# element 1 of the unpickled data is the samples x genes mutation table
mutation_df = pancancer_data[1]
print(mutation_df.shape)
# preview only the top-left 5 x 5 corner; the full table is very wide
mutation_df.head(5).iloc[:, :5]

(9074, 20938)


Unnamed: 0_level_0,5S_rRNA,A1BG,A1CF,A2M,A2ML1
SAMPLE_BARCODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TCGA-02-0047-01,0,0,0,0,0
TCGA-02-0055-01,0,0,0,0,0
TCGA-02-2483-01,0,0,0,0,0
TCGA-02-2485-01,0,0,0,0,0
TCGA-02-2486-01,0,0,0,0,0


In [7]:
# element 2 of the unpickled data is the samples x genes copy loss table
copy_loss_df = pancancer_data[2]
print(copy_loss_df.shape)
# preview only the top-left 5 x 5 corner; the full table is very wide
copy_loss_df.head(5).iloc[:, :5]

(9074, 25128)


Unnamed: 0,ACAP3,ACTRT2,AGRN,ANKRD65,ATAD3A
TCGA-02-0047-01,0,0,0,0,0
TCGA-02-0055-01,0,0,0,0,0
TCGA-02-2483-01,0,0,0,0,0
TCGA-02-2485-01,0,0,0,0,0
TCGA-02-2486-01,0,0,0,0,0


In [8]:
# element 3 of the unpickled data is the samples x genes copy gain table
copy_gain_df = pancancer_data[3]
print(copy_gain_df.shape)
# preview only the top-left 5 x 5 corner; the full table is very wide
copy_gain_df.head(5).iloc[:, :5]

(9074, 25128)


Unnamed: 0,ACAP3,ACTRT2,AGRN,ANKRD65,ATAD3A
TCGA-02-0047-01,0,0,0,0,0
TCGA-02-0055-01,0,0,0,0,0
TCGA-02-2483-01,0,0,0,0,0
TCGA-02-2485-01,0,0,0,0,0
TCGA-02-2486-01,0,0,0,0,0


### Classify genes/cancer types into "classes"

[Park et al. 2021](https://www.nature.com/articles/s41467-021-27242-3) describe 4 "classes" of driver genes:

1. Genes that function exclusively as one-hit drivers, no significant co-occurrence with CNAs (we aren't concerned with those here)
2. Genes that interact with CNA loss in at least one cancer type - "two-hit loss" drivers (i.e. classical tumor suppressors)
3. Genes that interact with CNA gain in at least one cancer type - "two-hit gain" drivers (for some examples/explanation of "two-hit" oncogenes, see [this paper](https://www.nature.com/articles/s41586-020-2175-2))
4. Genes that interact with both CNA loss and CNA gain across multiple cancer types - "two-hit loss and gain" drivers

Here, we label each gene from the Park et al. data with its "class", since we want to segment our analyses in the same way.

In [9]:
# Both tables are already filtered for significance, so a gene that shows up
# in the loss table AND in the gain table is class 4; the remaining genes are
# class 2 (loss table) or class 3 (gain table).
class_4_genes = set(park_loss_df.Gene.unique()) & set(park_gain_df.Gene.unique())
print(class_4_genes)

{'TP53', 'EPAS1', 'CTNNB1', 'CUL3', 'ARID1A', 'PPP2R1A', 'NRAS'}


In [10]:
def gene_to_class(g):
    """Return the class label for a gene from the CNA-loss table.

    Genes found in the module-level ``class_4_genes`` set are labeled
    'class 4'; every other gene is labeled 'class 2'.

    NOTE: a later cell redefines ``gene_to_class`` with 'class 3' as the
    fallback label, so re-run this cell before relabeling the loss table.
    """
    if g in class_4_genes:
        return 'class 4'
    return 'class 2'

# label every distinct gene in the loss table, then attach the label per row
loss_class = {
    gene: gene_to_class(gene)
    for gene in park_loss_df['Gene'].unique()
}
park_loss_df['class'] = park_loss_df['Gene'].map(loss_class)
park_loss_df.head()

Unnamed: 0,Gene,Tissue,Effect,Pval,FDR,LogFDR,Target,classification,class
AJUBA_HNSC,AJUBA,HNSC,-0.93605,0.008732955,0.045816,1.338886,A_Hit,TSG,class 2
ARID1A_LGG,ARID1A,LGG,-0.904202,0.01956617,0.076291,1.117471,A_Hit,TSG,class 4
ARID1A_STAD,ARID1A,STAD,1.133548,0.0003979932,0.000294,3.517309,B_Target,TSG,class 4
ARID1A_UCEC,ARID1A,UCEC,1.597876,3.451937e-07,0.0,5.0,B_Target,TSG,class 4
ARID2_LIHC,ARID2,LIHC,-0.978419,0.01019284,0.050276,1.298552,A_Hit,TSG,class 2


In [11]:
def gene_to_class(g):
    """Return the class label for a gene from the CNA-gain table.

    Genes found in the module-level ``class_4_genes`` set are labeled
    'class 4'; every other gene is labeled 'class 3'.

    NOTE: this shadows the earlier ``gene_to_class`` definition, which uses
    'class 2' as the fallback label for the loss table.
    """
    if g in class_4_genes:
        return 'class 4'
    return 'class 3'

# label every distinct gene in the gain table, then attach the label per row
gain_class = {
    gene: gene_to_class(gene)
    for gene in park_gain_df['Gene'].unique()
}
park_gain_df['class'] = park_gain_df['Gene'].map(gain_class)
park_gain_df.head()

Unnamed: 0,Gene,Tissue,Effect,Pval,FDR,LogFDR,Target,classification,class
ARID1A_UCEC,ARID1A,UCEC,-2.351526,0.001154365,-0.000447,3.340299,B_Target,TSG,class 4
ATRX_LGG,ATRX,LGG,1.440987,4.025141e-08,0.0,5.0,A_Hit,TSG,class 3
BRAF_SKCM,BRAF,SKCM,1.239939,2.042839e-11,0.0,5.0,A_Hit,Oncogene,class 3
BRAF_THCA,BRAF,THCA,-2.449684,0.0007509064,-0.000209,3.659041,B_Target,Oncogene,class 3
CTNNB1_UCEC,CTNNB1,UCEC,-1.257861,0.01597968,0.096292,1.016366,B_Target,Oncogene,class 4
