In [1]:
import os

import numpy as np
import pandas as pd
import shap
import xgboost as xgb
from sklearn.preprocessing import StandardScaler

import mgitools.os_helpers as os_helpers

In [2]:
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

In [3]:
# !pip install git+https://github.com/estorrs/mgitools

#### read in data

In [4]:
d = pd.read_csv('../data/199_driver_genes.txt', sep='\t')
d

Unnamed: 0,Gene,Tumor suppressor or oncogene prediction (by 20/20+)
0,PHF6,possible tsg
1,ABL1,
2,ALK,
3,AR,
4,ARAF,
...,...,...
183,KMT2A,tsg
184,KMT2B,tsg
185,MAX,oncogene
186,MED12,oncogene


In [5]:
# target_genes = ['PIK3CA', 'TP53', 'KRAS']
target_genes = sorted(set(d['Gene']))

###### protein pairs

In [6]:
# get pik3ca related genes
pathways = pd.read_csv('../data/protein_pair_table_v2.txt', sep='\t')
pathways

Unnamed: 0,GENE,SUB_GENE,pair_pro,SUB_GENE.is_TF_downstream,SUB_GENE.is_TF_upstream,SUB_GENE.is_kinase_substrate,SUB_GENE.is_phosphatase_substrate,SUB_GENE.is_upstream_kinase,SUB_GENE.is_upstream_phosphatase,SUB_GENE.is_complex_partner
0,TP53,CDKN1A,TP53:CDKN1A,True,False,False,False,False,False,False
1,TP53,SIAH1,TP53:SIAH1,True,False,False,False,False,False,False
2,TP53,SFN,TP53:SFN,True,False,False,False,False,False,False
3,TP53,RPRM,TP53:RPRM,True,False,False,False,False,False,False
4,TP53,GADD45A,TP53:GADD45A,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
831929,SETD2,SETD2,SETD2:SETD2,False,False,False,False,False,False,False
831930,PUMA,PUMA,PUMA:PUMA,False,False,False,False,False,False,False
831931,NOXA,NOXA,NOXA:NOXA,False,False,False,False,False,False,False
831932,FOXR2,FOXR2,FOXR2:FOXR2,False,False,False,False,False,False,False


In [7]:
pathways[[True if g in target_genes else False
         for g in pathways['GENE']]]

Unnamed: 0,GENE,SUB_GENE,pair_pro,SUB_GENE.is_TF_downstream,SUB_GENE.is_TF_upstream,SUB_GENE.is_kinase_substrate,SUB_GENE.is_phosphatase_substrate,SUB_GENE.is_upstream_kinase,SUB_GENE.is_upstream_phosphatase,SUB_GENE.is_complex_partner
0,TP53,CDKN1A,TP53:CDKN1A,True,False,False,False,False,False,False
1,TP53,SIAH1,TP53:SIAH1,True,False,False,False,False,False,False
2,TP53,SFN,TP53:SFN,True,False,False,False,False,False,False
3,TP53,RPRM,TP53:RPRM,True,False,False,False,False,False,False
4,TP53,GADD45A,TP53:GADD45A,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
830817,IDH1,IDH1,IDH1:IDH1,False,False,False,False,False,False,False
831488,BCOR,BCOR,BCOR:BCOR,False,False,False,False,False,False,False
831552,ATRX,ATRX,ATRX:ATRX,False,False,False,False,False,False,False
831798,RQCD1,RQCD1,RQCD1:RQCD1,False,False,False,False,False,False,False


In [8]:
gene_to_subgenes = {t:[g for g in sorted(set(pathways[pathways['GENE']==t]['SUB_GENE'])) if g != t]
                    for t in target_genes}
target_genes[0], len(gene_to_subgenes[target_genes[0]])

('ABL1', 126)

###### proteome

In [9]:
proteome = pd.read_csv('../data/Combine_PanCan_Proteome-gene_UMich_GENCODE34_Sinai_imputed_Apr2021.tsv',
                      sep='\t')
genes = proteome['GENECODE34_Symbol'].to_list()
proteome = proteome.iloc[:, 2:].transpose()
proteome.columns = genes
proteome['sample_id'] = proteome.index.to_list()
# proteome['sample_id'] = [x[1:] if 'CO' in x else x for x in proteome.index]
proteome = proteome.set_index('sample_id')
# scale
# proteome = pd.DataFrame(data=StandardScaler().fit_transform(proteome.values),
#                  columns=proteome.columns, index=proteome.index)
proteome

Unnamed: 0_level_0,TSPAN6,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,NIPAL3,...,EXOC3L2,PRR33,SCO2,AC073111.4,EEF1AKMT4,CCDC39,AL022312.1,AL034430.2,ASDURF,DERPC
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01BR001-T,21.421546,25.896035,24.095342,,20.374201,29.558080,24.189202,26.081010,23.426992,21.618669,...,21.324716,,24.842217,,19.880139,,22.625876,22.513445,22.429392,24.813732
01BR008-T,22.334792,26.046587,23.921620,,20.979104,28.528723,24.044621,25.587332,23.569577,21.696055,...,21.088402,,24.676147,,19.573110,,22.026176,22.115323,22.451728,24.206633
01BR009-T,22.744050,25.816690,23.955057,,20.532495,29.716976,24.095790,25.449831,23.190233,22.091323,...,21.125166,,25.664726,,19.980938,,21.849140,22.642879,22.171134,23.905527
01BR010-T,25.040285,26.641993,24.509677,,19.948534,29.157457,24.202489,25.862581,23.053010,21.838931,...,21.463939,,24.755041,,20.019758,,22.313802,22.922275,22.334238,23.527194
01BR015-T,21.159464,26.076153,23.829427,,19.626148,29.658244,24.290514,25.589347,23.559853,20.858738,...,20.826410,,24.584503,,19.598232,,22.645138,22.657004,22.661953,24.564202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
QC4-Q.1,,,,,,,,,,,...,,,,,,,,,,
QC5-Q.1,,,,,,,,,,,...,,,,,,,,,,
QC6-Q.1,,,,,,,,,,,...,,,,,,,,,,
WU-PDA1-Q,22.781236,25.917683,23.087405,,21.947461,30.901251,24.571460,25.575413,20.591792,,...,21.765363,,24.111474,,,,,22.412717,,22.787378


In [10]:
[x for x in proteome.index if 'CO' in x][:2]

['01CO005-T', '01CO006-T']

In [11]:
genes = []
for target_gene, subgenes in gene_to_subgenes.items():
    genes += [target_gene]
    genes += subgenes
genes = sorted(set(genes))
proteome = proteome[[g for g in genes if g in proteome.columns]]
proteome.columns = [f'{c}_proteome' for c in proteome.columns]
proteome

Unnamed: 0_level_0,AAAS_proteome,AAK1_proteome,AATF_proteome,ABCA1_proteome,ABCA2_proteome,ABCB1_proteome,ABCB11_proteome,ABCC2_proteome,ABCC3_proteome,ABCE1_proteome,...,ZNRF2_proteome,ZRSR2_proteome,ZSCAN10_proteome,ZSCAN25_proteome,ZSCAN32_proteome,ZW10_proteome,ZWILCH_proteome,ZWINT_proteome,ZXDC_proteome,ZYX_proteome
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01BR001-T,24.366478,24.512335,25.009167,22.761127,18.747711,,,,19.982433,26.985937,...,23.783368,20.954146,,,,25.160244,22.453169,21.101127,21.230329,28.397691
01BR008-T,24.394392,24.336012,25.198883,22.507570,19.817547,,,,19.608661,26.724760,...,23.756042,21.549888,,,,25.341950,22.635411,21.522846,20.922012,27.935289
01BR009-T,24.126891,24.312230,24.822895,23.841045,20.841777,,,,19.515286,26.398394,...,24.313472,21.568350,,,,25.264645,22.073689,20.996493,20.274014,28.101657
01BR010-T,24.328566,24.399937,24.455721,23.044677,20.980193,,,,20.429545,26.360272,...,23.565525,21.385798,,,,25.825940,21.441472,20.719015,19.688179,28.634480
01BR015-T,24.606059,24.330728,24.259055,22.052379,19.149462,,,,18.981616,26.545160,...,23.291041,21.142830,,,,25.124026,21.907613,21.629653,18.811849,28.760563
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
QC4-Q.1,,,,,,,,,,,...,,,,,,,,,,
QC5-Q.1,,,,,,,,,,,...,,,,,,,,,,
QC6-Q.1,,,,,,,,,,,...,,,,,,,,,,
WU-PDA1-Q,22.927226,24.655459,25.348246,24.669570,,21.895529,,,24.184738,27.099644,...,23.285801,,,,,24.504263,21.916507,,,28.697983


###### purity

In [12]:
purity = pd.read_csv('../data/CPTAC_pancan_RNA_tumor_purity_ESTIMATE.tsv.gz',
                    sep='\t')
purity['sample_id'] = purity['Sample_ID'].to_list()
purity = purity.set_index('sample_id')
purity = purity[['TumorPurity']]
purity

Unnamed: 0_level_0,TumorPurity
sample_id,Unnamed: 1_level_1
01BR001-T,0.816624
01BR008-T,0.510466
01BR009-T,0.556239
01BR010-T,0.747700
01BR015-T,0.649161
...,...
C3N-01520-T,0.869669
C3N-01521-T,0.855558
C3N-01537-T,0.647919
C3N-01802-T,0.659522


###### somatic mutation

In [13]:
fps = sorted(os_helpers.listfiles('../data/Somatic_mutation_wxs/', regex=r'exonic.*.maf.gz$'))
fps = [fp for fp in fps if 'Archived' not in fp]
fps

['../data/Somatic_mutation_wxs/BR/WashU_pipeline/BR_prospective.dnp.annotated.exonic.addrecovercases.maf.gz',
 '../data/Somatic_mutation_wxs/CO/WashU_pipeline/CO_prospective.dnp.annotated.exonic.addrecovercases.maf.gz',
 '../data/Somatic_mutation_wxs/EC/WashU_pipeline/EC_discovery.dnp.annotated.exonic.maf.gz',
 '../data/Somatic_mutation_wxs/GBM/WashU_pipeline/GBM_discovery.dnp.annotated.exonic.maf.gz',
 '../data/Somatic_mutation_wxs/HNSCC/WashU_pipeline/HNSCC_discovery.dnp.annotated.exonic.maf.gz',
 '../data/Somatic_mutation_wxs/LSCC/WashU_pipeline/LSCC_discovery.dnp.annotated.exonic.maf.gz',
 '../data/Somatic_mutation_wxs/LUAD/WashU_pipeline/LUAD_discovery.dnp.annotated.exonic.maf.gz',
 '../data/Somatic_mutation_wxs/OV/WashU_pipeline/OV_prospective.dnp.annotated.exonic.addrecovercases.maf.gz',
 '../data/Somatic_mutation_wxs/PDA/WashU_pipeline/PDA_discovery.dnp.annotated.exonic.maf.gz',
 '../data/Somatic_mutation_wxs/ccRCC/WashU_pipeline/ccRCC_discovery.dnp.annotated.exonic.maf.gz']

In [14]:
mutations = None
genes = list(target_genes)
for gs in gene_to_subgenes.values(): genes += gs
genes = sorted(set(genes))
keep = ['gene', 'sample_id', 'Chromosome', 'Start_Position', 'End_Position', 'HGVSp_Short', 'Variant_Classification']
for fp in fps:
    cancer_type = fp.split('/')[-1].split('_')[0]
    df = pd.read_csv(fp, sep='\t')
    df = df.loc[[True if g in genes else False for g in df['Hugo_Symbol']]]
    df['sample_id'] = [x.replace('_', '-') for x in df['Tumor_Sample_Barcode']]
    df['gene'] = df['Hugo_Symbol'].to_list()
    df = df[keep]
    df['disease'] = cancer_type
    
    
    if mutations is None:
        mutations = df
    else:
        mutations = pd.concat((mutations, df), axis=0)

Columns (88) have mixed types.Specify dtype option on import or set low_memory=False.


In [15]:
mutations

Unnamed: 0,gene,sample_id,Chromosome,Start_Position,End_Position,HGVSp_Short,Variant_Classification,disease
0,PIK3CD,01BR001-T,chr1,9722533,9722533,p.R785W,Missense_Mutation,BR
5,HECW2,01BR001-T,chr2,196319047,196319047,p.E615K,Missense_Mutation,BR
10,AREG,01BR001-T,chr4,74445388,74445388,p.L15F,Missense_Mutation,BR
11,SCLT1,01BR001-T,chr4,129044011,129044011,p.N48S,Missense_Mutation,BR
12,FBXW7,01BR001-T,chr4,152329731,152329731,p.R393*,Nonsense_Mutation,BR
...,...,...,...,...,...,...,...,...
7693,LRP5,C3N-01808-T,chr11,68348050,68348050,p.V99L,Missense_Mutation,ccRCC
7697,UBC,C3N-01808-T,chr12,124913626,124913626,p.Q49Rfs*33,Frame_Shift_Del,ccRCC
7699,FERMT2,C3N-01808-T,chr14,52881102,52881102,p.D263D,Silent,ccRCC
7701,YY1,C3N-01808-T,chr14,100239466,100239466,p.H74H,Silent,ccRCC


In [16]:
from collections import Counter
c = Counter(mutations['Variant_Classification'])
c

Counter({'Missense_Mutation': 45577,
         'Nonsense_Mutation': 4341,
         'Frame_Shift_Del': 5236,
         'Splice_Site': 1658,
         'Silent': 16239,
         'In_Frame_Del': 495,
         'In_Frame_Ins': 83,
         'Nonstop_Mutation': 82,
         'Frame_Shift_Ins': 1291})

In [17]:
def is_truncating(v):
    if 'Shift' in v: return True
    if 'Nonsense_Mutation'==v: return True
    return False

In [18]:
# do mutation type columns
# d = {x:i for i, x in sorted(set(mutations['Variant_Classification']))}
types = sorted(set(mutations['Variant_Classification']))
for gene in target_genes:
    print(gene)
    for m in types:
        mutations[f'{gene}_mutation_is_{m}'] = [1 if vc==m and g==gene else 0
                        for vc, g in zip(mutations['Variant_Classification'], mutations['gene'])]
    mutations[f'{gene}_has_truncating_mutation'] = [1 if is_truncating(vc) and g==gene else 0
                    for vc, g in zip(mutations['Variant_Classification'], mutations['gene'])]
    mutations[f'{gene}_has_nonsilent_mutation'] = [1 if vc!='Silent' and g==gene else 0
                    for vc, g in zip(mutations['Variant_Classification'], mutations['gene'])]
mutations

ABL1
AJUBA
AKT1
ALB
ALK
AMER1
APC
AR
ARAF
ARHGAP35
ARID1A
ARID2
ASXL1
ASXL2
ATM
ATRX
AXIN1
AXIN2
B2M
BAP1
BCL2
BCOR
BRAF
BRCA1
BRD7
CASP8
CCND1
CD79B
CDH1
CDK12
CDK4
CDKN1A
CDKN1B
CDKN2A
CDKN2C
CEBPA
CHD4
CIC
CNBD1
CREBBP
CSDE1
CTCF
CTNNB1
CTNND1
CUL1
CUL3
CYSLTR2
DNMT3A
EEF1A1
EGFR
EGR3
EIF1AX
EP300
EPAS1
EPHA2
EPHA3
ERBB2
ERCC2
ESR1
EZH2
FAT1
FBXW7
FGFR1
FGFR2
FGFR3
FOXA1
FOXA2
FUBP1
GATA3
GNA11
GNAQ
GNAS
GPS2
GTF2I
HGF
HIST1H1C
HLA-A
HLA-B
HRAS
HUWE1
IDH1
IDH2
IL7R
IRF6
JAK2
JAK3
KANSL1
KDM5C
KDM6A
KEAP1
KIF1A
KIT
KLF5
KMT2A
KMT2B
KMT2C
KMT2D
KRAS
LATS1
LATS2
MACF1
MAP2K1
MAP2K4
MAP3K1
MAPK1
MAX
MED12
MEN1
MET
MGA
MGMT
MLH1
MSH2
MSH3
MTOR
MYD88
MYH9
NCOR1
NF1
NF2
NFE2L2
NIPBL
NOTCH1
NOTCH2
NPM1
NRAS
NSD1
PAX5
PBRM1
PCBP1
PHF6
PIK3CA
PIK3CB
PIK3CG
PIK3R1
PIK3R2
PIM1
PMS1
PMS2
POLE
PPP2R1A
PTEN
PTPRC
RAC1
RAD21
RAF1
RARA
RASA1
RB1
RBM10
RET
RHEB
RHOA
RNF43
RPS6KA3
RQCD1
RUNX1
RXRA
SCAF4
SETBP1
SETD2
SF3B1
SMAD4
SMARCA4
SMC1A
SMC3
SOS1
SOX9
SPOP
STAG2
STK11
TAF1
TBL1XR1
TBX3
TCF12
TCF7

Unnamed: 0,gene,sample_id,Chromosome,Start_Position,End_Position,HGVSp_Short,Variant_Classification,disease,ABL1_mutation_is_Frame_Shift_Del,ABL1_mutation_is_Frame_Shift_Ins,...,ZNF750_mutation_is_Frame_Shift_Ins,ZNF750_mutation_is_In_Frame_Del,ZNF750_mutation_is_In_Frame_Ins,ZNF750_mutation_is_Missense_Mutation,ZNF750_mutation_is_Nonsense_Mutation,ZNF750_mutation_is_Nonstop_Mutation,ZNF750_mutation_is_Silent,ZNF750_mutation_is_Splice_Site,ZNF750_has_truncating_mutation,ZNF750_has_nonsilent_mutation
0,PIK3CD,01BR001-T,chr1,9722533,9722533,p.R785W,Missense_Mutation,BR,0,0,...,0,0,0,0,0,0,0,0,0,0
5,HECW2,01BR001-T,chr2,196319047,196319047,p.E615K,Missense_Mutation,BR,0,0,...,0,0,0,0,0,0,0,0,0,0
10,AREG,01BR001-T,chr4,74445388,74445388,p.L15F,Missense_Mutation,BR,0,0,...,0,0,0,0,0,0,0,0,0,0
11,SCLT1,01BR001-T,chr4,129044011,129044011,p.N48S,Missense_Mutation,BR,0,0,...,0,0,0,0,0,0,0,0,0,0
12,FBXW7,01BR001-T,chr4,152329731,152329731,p.R393*,Nonsense_Mutation,BR,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7693,LRP5,C3N-01808-T,chr11,68348050,68348050,p.V99L,Missense_Mutation,ccRCC,0,0,...,0,0,0,0,0,0,0,0,0,0
7697,UBC,C3N-01808-T,chr12,124913626,124913626,p.Q49Rfs*33,Frame_Shift_Del,ccRCC,0,0,...,0,0,0,0,0,0,0,0,0,0
7699,FERMT2,C3N-01808-T,chr14,52881102,52881102,p.D263D,Silent,ccRCC,0,0,...,0,0,0,0,0,0,0,0,0,0
7701,YY1,C3N-01808-T,chr14,100239466,100239466,p.H74H,Silent,ccRCC,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
sample_ids = sorted(set(mutations['sample_id']))
new = []
for s in sample_ids:
    filtered = mutations[mutations['sample_id']==s]
    base = filtered.iloc[0, :7].to_list()
#     base = filtered[['sample_id', 'disease']].iloc[0]
    if filtered.shape[0]<=1:
        tail = filtered.iloc[0, 7:].to_list()
    else:
        tail = list(np.max(filtered.iloc[:, 7:].values, axis=0).flatten())
    ls = base + tail
    new.append(ls)
consolidated = pd.DataFrame(data=new, index=sample_ids, columns=mutations.columns)
consolidated

Unnamed: 0,gene,sample_id,Chromosome,Start_Position,End_Position,HGVSp_Short,Variant_Classification,disease,ABL1_mutation_is_Frame_Shift_Del,ABL1_mutation_is_Frame_Shift_Ins,...,ZNF750_mutation_is_Frame_Shift_Ins,ZNF750_mutation_is_In_Frame_Del,ZNF750_mutation_is_In_Frame_Ins,ZNF750_mutation_is_Missense_Mutation,ZNF750_mutation_is_Nonsense_Mutation,ZNF750_mutation_is_Nonstop_Mutation,ZNF750_mutation_is_Silent,ZNF750_mutation_is_Splice_Site,ZNF750_has_truncating_mutation,ZNF750_has_nonsilent_mutation
01BR001-T,PIK3CD,01BR001-T,chr1,9722533,9722533,p.R785W,Missense_Mutation,BR,0,0,...,0,0,0,0,0,0,0,0,0,0
01BR008-T,RUNX3,01BR008-T,chr1,24927692,24927692,p.V121V,Silent,BR,0,0,...,0,0,0,0,0,0,0,0,0,0
01BR009-T,MACF1,01BR009-T,chr1,39310835,39310835,p.D1040E,Missense_Mutation,BR,0,0,...,0,0,0,0,0,0,0,0,0,0
01BR010-T,FBXO2,01BR010-T,chr1,11650734,11650734,p.A41A,Silent,BR,0,0,...,0,0,0,0,0,0,0,0,0,0
01BR015-T,TOP2B,01BR015-T,chr3,25612588,25612588,p.Y1233C,Missense_Mutation,BR,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-04283-T,BCL2L11,C3N-04283-T,chr2,111153803,111153803,p.V140L,Missense_Mutation,PDA,0,0,...,0,0,0,0,0,0,0,0,0,0
C3N-04284-T,CENPF,C3N-04284-T,chr1,214646920,214646920,p.L2450L,Silent,PDA,0,0,...,0,0,0,0,0,0,0,0,0,0
C3N-04611-T,COPA,C3N-04611-T,chr1,160339938,160339938,p.V67F,Missense_Mutation,HNSCC,0,0,...,0,0,0,0,0,0,0,0,0,0
CPT000814-T,SPTA1,CPT000814-T,chr1,158619290,158619290,p.K2154N,Missense_Mutation,BR,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
consolidated = consolidated.set_index('sample_id')

In [21]:
consolidated_filtered = consolidated.iloc[:, 7:]
# consolidated_filtered = consolidated
consolidated_filtered

Unnamed: 0_level_0,ABL1_mutation_is_Frame_Shift_Del,ABL1_mutation_is_Frame_Shift_Ins,ABL1_mutation_is_In_Frame_Del,ABL1_mutation_is_In_Frame_Ins,ABL1_mutation_is_Missense_Mutation,ABL1_mutation_is_Nonsense_Mutation,ABL1_mutation_is_Nonstop_Mutation,ABL1_mutation_is_Silent,ABL1_mutation_is_Splice_Site,ABL1_has_truncating_mutation,...,ZNF750_mutation_is_Frame_Shift_Ins,ZNF750_mutation_is_In_Frame_Del,ZNF750_mutation_is_In_Frame_Ins,ZNF750_mutation_is_Missense_Mutation,ZNF750_mutation_is_Nonsense_Mutation,ZNF750_mutation_is_Nonstop_Mutation,ZNF750_mutation_is_Silent,ZNF750_mutation_is_Splice_Site,ZNF750_has_truncating_mutation,ZNF750_has_nonsilent_mutation
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01BR001-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01BR008-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01BR009-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01BR010-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01BR015-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-04283-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3N-04284-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3N-04611-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CPT000814-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


###### somatic cnv

In [22]:
fps = sorted(os_helpers.listfiles('../data/Somatic_cnv/', regex=r'WashU_pipeline_wxs/'))
fps = [fp for fp in fps if 'gene_level' in fp]
fps

['../data/Somatic_cnv/BR/WashU_pipeline_wxs/BR.gene_level.from_seg.filtered.tsv',
 '../data/Somatic_cnv/CO/WashU_pipeline_wxs/CO.gene_level.from_seg.filtered.tsv',
 '../data/Somatic_cnv/EC/WashU_pipeline_wxs/UCEC.gene_level.from_seg.filtered.tsv',
 '../data/Somatic_cnv/GBM/WashU_pipeline_wxs/GBM.gene_level.from_seg.filtered.tsv',
 '../data/Somatic_cnv/HNSCC/WashU_pipeline_wxs/HNSCC.gene_level.from_seg.filtered.tsv',
 '../data/Somatic_cnv/LSCC/WashU_pipeline_wxs/LSCC.gene_level.from_seg.filtered.tsv',
 '../data/Somatic_cnv/LUAD/WashU_pipeline_wxs/LUAD.gene_level.from_seg.filtered.tsv',
 '../data/Somatic_cnv/OV/WashU_pipeline_wxs/OV.gene_level.from_seg.filtered.tsv',
 '../data/Somatic_cnv/PDA/WashU_pipeline_wxs/PDA.gene_level.from_seg.filtered.tsv',
 '../data/Somatic_cnv/ccRCC/WashU_pipeline_wxs/ccRCC.gene_level.from_seg.filtered.tsv']

In [23]:
cnv = None
gene_order = list(set(pd.read_csv(fps[0], sep='\t')['Gene']).intersection(*[set(pd.read_csv(fp, sep='\t')['Gene'])
                                                                           for fp in fps[1:]]))
for fp in fps:
    df = pd.read_csv(fp,
                sep='\t')
    df = df.set_index('Gene')
    df = df.transpose()[gene_order]
    df['disease'] = fp.split('/')[3]
    
    if cnv is None:
        cnv = df
    else:
        cnv = pd.concat((cnv, df), axis=0)
cnv

Gene,OR2V1,FXR1,FRG2B,HS3ST4,PLA2G2F,TRPM8,KEL,ADCY7,TMED6,SDCCAG8,...,FBXW5,SUMO2,SPP2,TMSB10,MIS18A,OTUD7A,HCAR1,LPO,SCAF11,disease
01BR001,-0.27270,-0.06602,-0.00966,-0.40657,-0.12887,0.09183,0.05187,0.13984,0.13984,0.46218,...,0.01670,0.16581,0.09183,0.11436,0.52258,-0.24938,-0.16064,-0.06617,-0.00563,BR
01BR008,-0.07550,-0.09075,0.11550,-0.32181,0.08417,-0.05192,-0.00576,-0.01953,-0.01953,0.32427,...,0.01058,0.06834,-0.05192,0.03183,-0.00672,0.00988,-0.02567,-0.10262,-0.02567,BR
01BR009,-0.17713,0.06816,0.28902,-0.38928,-0.12834,0.05363,0.06942,0.26919,0.26919,0.33435,...,-0.19639,0.24314,0.05363,0.13819,0.01965,0.02461,-0.17576,-0.29783,0.20852,BR
01BR010,-0.01151,-0.00744,0.00058,-0.14335,0.03008,0.13935,-0.03744,0.01711,0.01711,0.16701,...,0.08945,-0.01515,0.13935,0.14108,0.12614,-0.18501,-0.00332,-0.01515,-0.00332,BR
01BR015,0.07021,0.09962,0.13746,0.26465,-0.17893,-0.23689,-0.27216,0.13552,0.13552,0.24552,...,0.09958,-0.26588,-0.23689,-0.23580,0.27022,-0.23970,-0.21528,-0.26588,0.12874,BR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01646,0.04318,0.30051,0.04368,0.04099,0.03641,0.05653,0.03709,0.04099,0.04099,0.04553,...,-0.32422,0.03650,0.05653,0.05431,0.03746,0.04633,0.07525,0.03650,0.07525,ccRCC
C3N-01648,-0.02056,-0.01847,-0.02243,-0.02305,-0.02997,-0.00540,-0.03307,-0.02305,-0.02305,-0.00849,...,-0.02193,-0.02973,-0.00540,-0.00540,-0.01504,-0.02031,-0.01736,-0.02973,-0.01736,ccRCC
C3N-01649,0.02307,0.01720,0.01736,0.01164,0.01854,0.02319,0.32882,0.01164,0.01164,0.01901,...,0.01562,0.01504,0.02319,0.02319,0.01470,0.02188,0.03014,0.01504,0.03014,ccRCC
C3N-01651,0.14240,0.14771,-0.31422,0.14783,-0.06828,0.51034,0.14662,-0.29981,-0.29981,-0.06828,...,-0.07086,-0.05907,0.51034,0.16029,-0.06616,-0.32835,0.14936,-0.05907,0.14936,ccRCC


In [24]:
include = [g for g in target_genes]
for gs in gene_to_subgenes.values(): include += gs
include = sorted(set(include))
include.append('disease')

cnv = cnv[[g for g in include if g in cnv.columns]]
cnv.columns = [f'{c}_cnv' if c!='disease' else c for c in cnv.columns]
cnv.index = [x + '-T' for x in cnv.index]
cnv

Unnamed: 0,AAAS_cnv,AAK1_cnv,AATF_cnv,ABCA1_cnv,ABCA2_cnv,ABCB1_cnv,ABCB11_cnv,ABCC2_cnv,ABCC3_cnv,ABCE1_cnv,...,ZRANB1_cnv,ZSCAN10_cnv,ZSCAN25_cnv,ZSCAN32_cnv,ZW10_cnv,ZWILCH_cnv,ZWINT_cnv,ZXDC_cnv,ZYX_cnv,disease
01BR001-T,-0.00563,0.11436,-0.10931,-0.05580,0.01670,0.04711,0.09183,-0.00966,-0.06617,0.40944,...,-0.00966,-0.19088,0.04711,-0.19088,0.01914,-0.26696,-0.00966,-0.06602,0.05187,BR
01BR008-T,-0.02567,0.03183,-0.10262,-0.06820,0.01058,-0.00576,-0.05192,0.11550,-0.10262,-0.06661,...,0.11550,-0.16619,-0.00576,-0.16619,-0.04312,0.00988,0.11550,-0.09075,-0.00576,BR
01BR009-T,-0.22827,0.13819,-0.29783,-0.19639,-0.19639,0.16028,0.21653,-0.14743,-0.29783,-0.18501,...,0.32349,-0.38928,0.67516,-0.38928,-0.15614,-0.15576,-0.14743,0.26289,0.06942,BR
01BR010-T,-0.00332,0.14108,-0.20200,0.08945,0.08945,-0.03744,0.14108,0.01446,-0.01515,0.09144,...,0.00058,-0.04794,-0.03744,-0.04794,0.03900,-0.18501,-0.14666,-0.00744,-0.03744,BR
01BR015-T,0.12874,-0.23580,-0.26588,-0.11742,0.09958,-0.28490,-0.23689,0.13746,-0.26588,0.09826,...,0.13746,0.26465,-0.28490,0.26465,-0.20074,-0.23970,0.13746,0.09962,-0.27216,BR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01646-T,0.07525,0.05431,0.03650,-0.32422,-0.32422,0.03709,0.05653,0.04368,0.03650,0.02862,...,0.04368,0.04099,0.03709,0.04099,0.04100,0.04141,0.04368,0.30051,0.03709,ccRCC
C3N-01648-T,-0.01736,-0.00540,-0.02973,-0.02193,-0.02193,-0.03307,-0.00540,-0.02243,-0.02973,-0.03739,...,-0.02243,-0.02305,-0.03307,-0.02305,-0.02360,-0.02031,-0.02243,-0.01847,-0.03307,ccRCC
C3N-01649-T,0.03014,0.02319,0.01504,0.01562,0.01562,0.32882,0.02319,0.01736,0.01504,0.01663,...,0.01736,0.01164,0.32882,0.01164,0.01643,0.02188,0.01736,0.01720,0.32882,ccRCC
C3N-01651-T,0.14936,0.16029,-0.05907,-0.07086,-0.07086,0.14662,0.51034,-0.31422,-0.05907,-0.08853,...,-0.31422,0.14783,0.14662,0.14783,-0.06331,-0.32835,-0.31422,0.14771,0.14662,ccRCC


###### eQTL

In [23]:
fps = sorted(os_helpers.listfiles('../data/Somatic_mutation_wxs/', regex=r'exonic.*.maf.gz$'))
fps = [fp for fp in fps if 'Archived' not in fp]
mutations = None
genes = list(target_genes)
for gs in gene_to_subgenes.values(): genes += gs
genes = sorted(set(genes))
keep = ['gene', 'sample_id', 'Chromosome', 'Start_Position', 'End_Position', 'HGVSp_Short', 'Variant_Classification',
       'Reference_Allele', 'Tumor_Seq_Allele2']
for fp in fps:
    cancer_type = fp.split('/')[-1].split('_')[0]
    df = pd.read_csv(fp, sep='\t')
    df = df.loc[[True if g in genes else False for g in df['Hugo_Symbol']]]
#     df = df[df['Variant_Classification']!='Silent']
    df['sample_id'] = [x.replace('_', '-') for x in df['Tumor_Sample_Barcode']]
    df['gene'] = df['Hugo_Symbol'].to_list()
    df = df[keep]
    df['disease'] = cancer_type
    
    
    if mutations is None:
        mutations = df
    else:
        mutations = pd.concat((mutations, df), axis=0)

Columns (88) have mixed types.Specify dtype option on import or set low_memory=False.


In [63]:
mutations

Unnamed: 0,gene,sample_id,Chromosome,Start_Position,End_Position,HGVSp_Short,Variant_Classification,Reference_Allele,Tumor_Seq_Allele2,disease
0,PIK3CD,01BR001-T,chr1,9722533,9722533,p.R785W,Missense_Mutation,C,T,BR
10,AREG,01BR001-T,chr4,74445388,74445388,p.L15F,Missense_Mutation,C,T,BR
12,FBXW7,01BR001-T,chr4,152329731,152329731,p.R393*,Nonsense_Mutation,G,A,BR
34,FLT1,01BR001-T,chr13,28438252,28438252,p.T161M,Missense_Mutation,G,A,BR
43,TP53,01BR001-T,chr17,7674216,7674216,p.R249S,Missense_Mutation,C,A,BR
...,...,...,...,...,...,...,...,...,...,...
7641,PTEN,C3N-01651-T,chr10,87933033,87933033,p.D92Y,Missense_Mutation,G,T,ccRCC
7655,IGHG2,C3N-01651-T,chr14,105644096,105644096,p.K101Nfs*31,Frame_Shift_Del,T,-,ccRCC
7673,TP73,C3N-01808-T,chr1,3731054,3731054,p.P491P,Silent,C,T,ccRCC
7683,FGB,C3N-01808-T,chr4,154563025,154563025,p.R3R,Silent,A,C,ccRCC


In [64]:
fps = sorted(os_helpers.listfiles('/diskmnt/Projects/Users/estorrs/cptac_driver/data/eQTL_filtered',
                                 ))
fps

['/diskmnt/Projects/Users/estorrs/cptac_driver/data/eQTL_filtered/BR_eQTLs_filtered.2021017.tsv',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/eQTL_filtered/CO_eQTLs_filtered.2021017.tsv',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/eQTL_filtered/EC_eQTLs_filtered.2021017.tsv',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/eQTL_filtered/GBM_eQTLs_filtered.2021017.tsv',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/eQTL_filtered/HNSCC_eQTLs_filtered.2021017.tsv',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/eQTL_filtered/LSCC_eQTLs_filtered.2021017.tsv',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/eQTL_filtered/LUAD_eQTLs_filtered.2021017.tsv',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/eQTL_filtered/OV_eQTLs_filtered.2021017.tsv',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/eQTL_filtered/PDA_eQTLs_filtered.2021017.tsv',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/eQTL_filtered/ccRCC_eQTLs_filtered.2021017.tsv']

In [65]:
eqtl = None

for fp in fps:
    df = pd.read_csv(fp,
                sep='\t')
    
    if eqtl is None:
        eqtl = df
    else:
        eqtl = pd.concat((eqtl, df), axis=0)
eqtl

Unnamed: 0,SNP,gene,beta,t.stat,P,FDR,Gene_SNP,Disease
0,chr1_107056636_A_T,PRMT6,-0.606306,-5.163265,1.373427e-06,2.883696e-02,PRMT6,BR
1,chr1_109656105_C_G,GSTM4,-0.574039,-5.391549,5.270820e-07,1.706793e-02,GSTM4,BR
2,chr1_109688145_G_A,GSTM1,0.676938,8.537473,2.526182e-13,1.709479e-06,GSTM1,BR
3,chr1_109690179_A_G,GSTM1,0.707850,9.033014,2.272933e-14,1.644163e-07,GSTM1,BR
4,chr1_109690625_T_C,GSTM1,0.611524,7.463626,4.401857e-11,4.542680e-05,GSTM1,BR
...,...,...,...,...,...,...,...,...
4914,chr9_92784607_G_A,ASPN,3.278481,5.040561,2.460206e-06,4.897364e-02,ASPN,ccRCC
4915,chr9_93697213_C_A,FGD3,1.352675,5.120912,1.774278e-06,4.078420e-02,FGD3,ccRCC
4916,chr9_93849832_C_G,FGD3,2.705351,5.120912,1.774278e-06,4.078420e-02,FGD3,ccRCC
4917,chr9_97896656_A_G,TMOD1,-1.901571,-4.773050,7.169601e-06,8.226812e-02,TMOD1,ccRCC


In [66]:
for disease in set(eqtl['Disease']):
    sites = set(eqtl[eqtl['Disease']==disease]['SNP'])
    f = mutations[mutations['disease']==disease]
    mask = [True if f'{c}_{s}_{ref}_{alt}' in sites else False
           for c, s, ref, alt in zip(f['Chromosome'], f['Start_Position'],
                                    f['Reference_Allele'], f['Tumor_Seq_Allele2'])]
    print(f[mask].shape)

(1, 10)
(0, 10)
(0, 10)
(0, 10)
(0, 10)
(0, 10)
(0, 10)
(0, 10)
(0, 10)
(0, 10)


In [None]:
d = pd.read_csv('../data/199_driver_genes.txt', sep='\t')
d

In [51]:
set(d['Gene']).intersection(mutations[mask]['gene'])

{'ERBB2'}

###### pQTL

In [52]:
pqtl = pd.read_csv('../data/filtered_pQTL.tsv', sep='\t')
pqtl

Unnamed: 0,SNP,gene,beta,t.stat,P,FDR,Gene_SNP,Disease
0,chr1_109688145_G_A,GSTM1,0.547301,5.611051,2.296727e-07,0.022047,GSTM1,ccRCC
1,chr1_109690179_A_G,GSTM1,0.547301,5.611051,2.296727e-07,0.022047,GSTM1,ccRCC
2,chr1_109690625_T_C,GSTM1,0.547301,5.611051,2.296727e-07,0.022047,GSTM1,ccRCC
3,chr1_109737199_C_A,GSTM3,0.641189,5.217990,1.191517e-06,0.053193,GSTM3,ccRCC
4,chr1_109739319_G_A,GSTM3,0.713888,5.903624,6.529679e-08,0.009406,GSTM3,ccRCC
...,...,...,...,...,...,...,...,...
8195,chr9_137103590_C_T,UAP1L1,-0.585547,-4.871726,6.120846e-06,0.093298,UAP1L1,EC
8196,chr9_21816759_G_A,MTAP,-0.569130,-5.338531,9.881701e-07,0.041168,MTAP,EC
8197,chr9_34318291_G_T,NUDT2,0.948056,6.126573,3.972724e-08,0.008367,NUDT2,EC
8198,chr9_34371790_A_T,NUDT2,0.952703,6.324868,1.732983e-08,0.004470,NUDT2,EC


In [53]:
sites = set(pqtl['SNP'])
mask = [True if f'{c}_{s}_{ref}_{alt}' in sites else False
       for c, s, ref, alt in zip(mutations['Chromosome'], mutations['Start_Position'],
                                mutations['Reference_Allele'], mutations['Tumor_Seq_Allele2'])]
mutations[mask]

Unnamed: 0,gene,sample_id,Chromosome,Start_Position,End_Position,HGVSp_Short,Variant_Classification,Reference_Allele,Tumor_Seq_Allele2,disease


###### germline_variants

In [25]:
fps = sorted(os_helpers.listfiles('/diskmnt/Projects/Users/estorrs/cptac_driver/data/Germline_pathogenic_variants_reviewed.1.0/',
                                 ))
fps

['/diskmnt/Projects/Users/estorrs/cptac_driver/data/Germline_pathogenic_variants_reviewed.1.0/BR_germline_pathogenic.v.1.0.txt',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/Germline_pathogenic_variants_reviewed.1.0/CO_germline_pathogenic.v.1.0.txt',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/Germline_pathogenic_variants_reviewed.1.0/EC_germline_pathogenic.v.1.0.txt',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/Germline_pathogenic_variants_reviewed.1.0/GBM_germline_pathogenic.v.1.0.txt',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/Germline_pathogenic_variants_reviewed.1.0/HNSCC_germline_pathogenic.v.1.0.txt',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/Germline_pathogenic_variants_reviewed.1.0/LSCC_germline_pathogenic.v.1.0.txt',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/Germline_pathogenic_variants_reviewed.1.0/LUAD_germline_pathogenic.v.1.0.txt',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/Germline_pathogenic_variants_reviewed

In [26]:
germline = None

for fp in fps:
    df = pd.read_csv(fp,
                sep='\t')
    
    if germline is None:
        germline = df
    else:
        germline = pd.concat((germline, df), axis=0)
germline

Unnamed: 0,Disease,Overall_Classification,GeneClass,Sample,HUGO_Symbol,Chromosome,Start,Stop,Reference,Alternate,...,VCF_Details,N_REF,N_ALT,N_VAF,T_REF,T_ALT,T_VAF,Genotype,Cohort_AC,Manual_review
0,BRCA,Pathogenic,TSG,11BR020,ATM,11,108325416,108325416,C,T,...,"chr11::108325416::None::C::[""T""]::{""CSQ"":[""T|m...",137,136,0.498168,127,130,0.505837,0/1,2,YES
1,BRCA,Pathogenic,TSG,11BR006,BLM,15,90749586,90749587,-,T,...,"chr15::90749586::None::A::[""AT""]::{""CSQ"":[""T|f...",614,281,0.313966,503,257,0.338158,0/1,1,YES
2,BRCA,Pathogenic,TSG,11BR016,BRCA1,17,43057062,43057063,-,G,...,"chr17::43057062::None::T::[""TG""]::{""CSQ"":[""G|f...",397,193,0.327119,274,176,0.391111,0/1,1,YES
3,BRCA,Pathogenic,TSG,18BR006,BRCA2,13,32329468,32329469,TG,-,...,"chr13::32329467::None::CTG::[""C""]::{""CSQ"":[""-|...",48,61,0.559633,30,84,0.736842,0/1,1,YES
4,BRCA,Pathogenic,TSG,11BR006,BRCA2,13,32340704,32340705,TG,-,...,"chr13::32340703::None::CTG::[""C""]::{""CSQ"":[""-|...",135,106,0.439834,76,119,0.610256,0/1,1,YES
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6,ccRCC,Pathogenic,TSG,C3N-00168.N,SDHA,5,218356,218356,A,G,...,"chr5::218356::None::A::[""G""]::{""CSQ"":[""G|start...",19,21,0.525000,35,41,0.539474,0/1,1,YES
7,ccRCC,Pathogenic,,C3N-00831.N,SERPINA1,14,94380949,94380949,T,A,...,"chr14::94380949::None::T::[""A""]::{""CSQ"":[""A|mi...",186,178,0.489011,164,185,0.530086,0/1,1,YES
8,ccRCC,Pathogenic,,C3L-00416.N,TYR,11,89227904,89227904,C,A,...,"chr11::89227904::None::C::[""A""]::{""CSQ"":[""A|mi...",160,96,0.375000,149,125,0.456204,0/1,1,YES
9,ccRCC,Pathogenic,,C3L-01283.N,TYR,11,89295242,89295243,-,T,...,"chr11::89295242::None::C::[""CT""]::{""CSQ"":[""T|f...",273,253,0.480989,235,190,0.447059,0/1,1,YES


In [27]:
gs = list(target_genes)
for ls in gene_to_subgenes.values(): gs += ls
gs = sorted(set(gs))
mask = [True if g in gs else False
       for g in germline['HUGO_Symbol']]
germline = germline[mask]

In [28]:
# do mutation type columns
# d = {x:i for i, x in sorted(set(mutations['Variant_Classification']))}
for gene in gs:
    germline[f'{gene}_is_pathogenic_germline'] = [1 if g==gene else 0
                    for g in germline['HUGO_Symbol']]
germline


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Disease,Overall_Classification,GeneClass,Sample,HUGO_Symbol,Chromosome,Start,Stop,Reference,Alternate,...,ZRANB1_is_pathogenic_germline,ZRSR2_is_pathogenic_germline,ZSCAN10_is_pathogenic_germline,ZSCAN25_is_pathogenic_germline,ZSCAN32_is_pathogenic_germline,ZW10_is_pathogenic_germline,ZWILCH_is_pathogenic_germline,ZWINT_is_pathogenic_germline,ZXDC_is_pathogenic_germline,ZYX_is_pathogenic_germline
0,BRCA,Pathogenic,TSG,11BR020,ATM,11,108325416,108325416,C,T,...,0,0,0,0,0,0,0,0,0,0
1,BRCA,Pathogenic,TSG,11BR006,BLM,15,90749586,90749587,-,T,...,0,0,0,0,0,0,0,0,0,0
2,BRCA,Pathogenic,TSG,11BR016,BRCA1,17,43057062,43057063,-,G,...,0,0,0,0,0,0,0,0,0,0
3,BRCA,Pathogenic,TSG,18BR006,BRCA2,13,32329468,32329469,TG,-,...,0,0,0,0,0,0,0,0,0,0
4,BRCA,Pathogenic,TSG,11BR006,BRCA2,13,32340704,32340705,TG,-,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,ccRCC,Pathogenic,TSG,C3N-01200.N,ERCC2,19,45352511,45352511,C,T,...,0,0,0,0,0,0,0,0,0,0
4,ccRCC,Likely_Pathogenic,Oncogene,C3N-01175.N,MITF,3,69866348,69866348,T,G,...,0,0,0,0,0,0,0,0,0,0
5,ccRCC,Pathogenic,,C3L-01553.N,MTHFR,1,11791216,11791216,C,T,...,0,0,0,0,0,0,0,0,0,0
7,ccRCC,Pathogenic,,C3N-00831.N,SERPINA1,14,94380949,94380949,T,A,...,0,0,0,0,0,0,0,0,0,0


In [29]:
germline.index = [x.replace('.N', '') + '-T' for x in germline['Sample']]
germline.index.name = 'sample_id'
germline


Unnamed: 0_level_0,Disease,Overall_Classification,GeneClass,Sample,HUGO_Symbol,Chromosome,Start,Stop,Reference,Alternate,...,ZRANB1_is_pathogenic_germline,ZRSR2_is_pathogenic_germline,ZSCAN10_is_pathogenic_germline,ZSCAN25_is_pathogenic_germline,ZSCAN32_is_pathogenic_germline,ZW10_is_pathogenic_germline,ZWILCH_is_pathogenic_germline,ZWINT_is_pathogenic_germline,ZXDC_is_pathogenic_germline,ZYX_is_pathogenic_germline
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11BR020-T,BRCA,Pathogenic,TSG,11BR020,ATM,11,108325416,108325416,C,T,...,0,0,0,0,0,0,0,0,0,0
11BR006-T,BRCA,Pathogenic,TSG,11BR006,BLM,15,90749586,90749587,-,T,...,0,0,0,0,0,0,0,0,0,0
11BR016-T,BRCA,Pathogenic,TSG,11BR016,BRCA1,17,43057062,43057063,-,G,...,0,0,0,0,0,0,0,0,0,0
18BR006-T,BRCA,Pathogenic,TSG,18BR006,BRCA2,13,32329468,32329469,TG,-,...,0,0,0,0,0,0,0,0,0,0
11BR006-T,BRCA,Pathogenic,TSG,11BR006,BRCA2,13,32340704,32340705,TG,-,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01200-T,ccRCC,Pathogenic,TSG,C3N-01200.N,ERCC2,19,45352511,45352511,C,T,...,0,0,0,0,0,0,0,0,0,0
C3N-01175-T,ccRCC,Likely_Pathogenic,Oncogene,C3N-01175.N,MITF,3,69866348,69866348,T,G,...,0,0,0,0,0,0,0,0,0,0
C3L-01553-T,ccRCC,Pathogenic,,C3L-01553.N,MTHFR,1,11791216,11791216,C,T,...,0,0,0,0,0,0,0,0,0,0
C3N-00831-T,ccRCC,Pathogenic,,C3N-00831.N,SERPINA1,14,94380949,94380949,T,A,...,0,0,0,0,0,0,0,0,0,0


In [30]:
germline = germline[[c for c in germline.columns if 'is_pathogenic_germline' in c]]
germline

Unnamed: 0_level_0,AAAS_is_pathogenic_germline,AAK1_is_pathogenic_germline,AATF_is_pathogenic_germline,ABCA1_is_pathogenic_germline,ABCA2_is_pathogenic_germline,ABCB1_is_pathogenic_germline,ABCB11_is_pathogenic_germline,ABCC2_is_pathogenic_germline,ABCC3_is_pathogenic_germline,ABCE1_is_pathogenic_germline,...,ZRANB1_is_pathogenic_germline,ZRSR2_is_pathogenic_germline,ZSCAN10_is_pathogenic_germline,ZSCAN25_is_pathogenic_germline,ZSCAN32_is_pathogenic_germline,ZW10_is_pathogenic_germline,ZWILCH_is_pathogenic_germline,ZWINT_is_pathogenic_germline,ZXDC_is_pathogenic_germline,ZYX_is_pathogenic_germline
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11BR020-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11BR006-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11BR016-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18BR006-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11BR006-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01200-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3N-01175-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3L-01553-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3N-00831-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
sample_ids = sorted(set(germline.index))
new = []
for s in sample_ids:
    filtered = germline.loc[[s], :]
#     print(filtered.shape)
    if filtered.shape[0]<=1:
        tail = filtered.iloc[0, :].to_list()
    else:
        tail = list(np.max(filtered.values, axis=0).flatten())
#     print(tail)
    new.append(tail)
consolidated_germline = pd.DataFrame(data=new, index=sample_ids, columns=germline.columns)
consolidated_germline

Unnamed: 0,AAAS_is_pathogenic_germline,AAK1_is_pathogenic_germline,AATF_is_pathogenic_germline,ABCA1_is_pathogenic_germline,ABCA2_is_pathogenic_germline,ABCB1_is_pathogenic_germline,ABCB11_is_pathogenic_germline,ABCC2_is_pathogenic_germline,ABCC3_is_pathogenic_germline,ABCE1_is_pathogenic_germline,...,ZRANB1_is_pathogenic_germline,ZRSR2_is_pathogenic_germline,ZSCAN10_is_pathogenic_germline,ZSCAN25_is_pathogenic_germline,ZSCAN32_is_pathogenic_germline,ZW10_is_pathogenic_germline,ZWILCH_is_pathogenic_germline,ZWINT_is_pathogenic_germline,ZXDC_is_pathogenic_germline,ZYX_is_pathogenic_germline
01BR017-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01BR033-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01BR042-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01OV029-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
02OV008-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-03439-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3N-03782-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3N-03841-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3N-04279-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


###### rna expression

In [64]:
expression = pd.read_csv('../data/ALL_RNA-Seq_Expr_WashU_FPKM_UQ_annotation.tsv.gz', sep='\t')
expression

Unnamed: 0,gene_id,gene_name,seqname,start,end,strand,gene_type,gene_status,havana_gene,full_length,...,C3L-01282-A,C3L-01304-A,C3L-01307-A,C3L-01311-A,C3N-00333-A,C3N-00383-A,C3N-00858-A,C3N-00866-A,C3N-01003-A,C3N-01346-A
0,ENSG00000223972.5,DDX11L1,chr1,11869,14409,+,transcribed_unprocessed_pseudogene,KNOWN,OTTHUMG00000000961.2,2541,...,0.000000e+00,3.316754e+02,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1,ENSG00000227232.5,WASH7P,chr1,14404,29570,-,unprocessed_pseudogene,KNOWN,OTTHUMG00000000958.1,15167,...,3.294426e+04,3.322400e+04,2.772942e+04,4.175670e+04,2.970832e+04,3.837182e+04,3.049993e+04,2.128872e+04,3.700962e+04,2.161777e+04
2,ENSG00000278267.1,MIR6859-3,chr1,17369,17436,-,miRNA,KNOWN,,68,...,1.446044e+05,1.777146e+05,1.351309e+05,2.008523e+05,1.713584e+05,1.626363e+05,2.080168e+05,1.518306e+05,1.547988e+05,1.288483e+05
3,ENSG00000243485.3,RP11-34P13.3,chr1,29554,31109,+,lincRNA,NOVEL,OTTHUMG00000000959.2,1556,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
4,ENSG00000274890.1,MIR1302-9,chr1,30366,30503,+,miRNA,KNOWN,,138,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60478,ENSG00000198695.2,MT-ND6,chrM,14149,14673,-,protein_coding,KNOWN,,525,...,3.391063e+05,4.505004e+05,3.325508e+05,3.167063e+05,2.540094e+05,4.976672e+05,3.994611e+05,2.275601e+05,3.295739e+05,1.813529e+05
60479,ENSG00000210194.1,MT-TE,chrM,14674,14742,-,Mt_tRNA,KNOWN,,69,...,7.500455e+03,1.167593e+05,0.000000e+00,8.606148e+03,0.000000e+00,2.003491e+04,3.565253e+04,1.068787e+04,9.534706e+03,3.386157e+04
60480,ENSG00000198727.2,MT-CYB,chrM,14747,15887,+,protein_coding,KNOWN,,1141,...,3.787822e+06,5.057060e+06,4.413250e+06,3.395884e+06,5.556691e+06,1.084422e+07,4.775596e+06,4.455156e+06,3.985424e+06,4.091857e+06
60481,ENSG00000210195.2,MT-TT,chrM,15888,15953,+,Mt_tRNA,KNOWN,,66,...,7.057246e+04,1.656618e+05,5.354838e+04,4.498668e+04,3.138686e+05,3.037110e+05,1.770472e+05,4.469473e+04,1.395534e+05,1.150524e+05


In [65]:
expression = expression[expression['gene_type']=='protein_coding']
expression

Unnamed: 0,gene_id,gene_name,seqname,start,end,strand,gene_type,gene_status,havana_gene,full_length,...,C3L-01282-A,C3L-01304-A,C3L-01307-A,C3L-01311-A,C3N-00333-A,C3N-00383-A,C3N-00858-A,C3N-00866-A,C3N-01003-A,C3N-01346-A
8,ENSG00000186092.4,OR4F5,chr1,69091,70008,+,protein_coding,KNOWN,OTTHUMG00000001094.1,918,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
18,ENSG00000279928.1,FO538757.3,chr1,182393,184158,+,protein_coding,KNOWN,,1766,...,7.207958e+02,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,9.626803e+02,3.426218e+03,0.000000e+00,1.832576e+03,6.508213e+03
19,ENSG00000279457.2,FO538757.2,chr1,184923,200322,-,protein_coding,KNOWN,,15400,...,4.491191e+04,8.710244e+04,8.273793e+04,6.591389e+04,6.271037e+04,7.079440e+04,6.578270e+04,7.218358e+04,6.904243e+04,7.662420e+04
29,ENSG00000278566.1,OR4F29,chr1,450740,451678,-,protein_coding,KNOWN,OTTHUMG00000002860.1,939,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
49,ENSG00000273547.1,OR4F16,chr1,685716,686654,-,protein_coding,KNOWN,OTTHUMG00000002581.1,939,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60472,ENSG00000212907.2,MT-ND4L,chrM,10470,10766,+,protein_coding,KNOWN,,297,...,4.574141e+06,5.785569e+06,4.864573e+06,3.558947e+06,5.483982e+06,1.110582e+07,5.232729e+06,5.393165e+06,4.558745e+06,3.561707e+06
60473,ENSG00000198886.2,MT-ND4,chrM,10760,12137,+,protein_coding,KNOWN,,1378,...,3.574647e+06,4.320937e+06,3.335171e+06,2.743744e+06,4.135455e+06,8.500611e+06,3.932826e+06,3.735481e+06,3.340558e+06,3.692454e+06
60477,ENSG00000198786.2,MT-ND5,chrM,12337,14148,+,protein_coding,KNOWN,,1812,...,1.154449e+06,1.414823e+06,1.179625e+06,1.083107e+06,1.476910e+06,4.349781e+06,1.377655e+06,1.366667e+06,1.203962e+06,1.353902e+06
60478,ENSG00000198695.2,MT-ND6,chrM,14149,14673,-,protein_coding,KNOWN,,525,...,3.391063e+05,4.505004e+05,3.325508e+05,3.167063e+05,2.540094e+05,4.976672e+05,3.994611e+05,2.275601e+05,3.295739e+05,1.813529e+05


In [66]:
genes = expression['gene_name'].to_list()
expression = expression.iloc[:, 12:]
expression['gene'] = [g.split('.')[0] for g in genes]
expression = expression.groupby('gene').mean()
expression = pd.DataFrame(data=np.log1p(expression.values), index=expression.index, columns=expression.columns)

expression = expression.transpose()

In [67]:
expression.columns = [f'{c}_expression' for c in expression.columns]
expression

Unnamed: 0,1-Dec_expression,1-Mar_expression,1-Sep_expression,10-Mar_expression,10-Sep_expression,11-Mar_expression,11-Sep_expression,12-Sep_expression,14-Sep_expression,15-Sep_expression,...,ZWINT_expression,ZXDA_expression,ZXDB_expression,ZXDC_expression,ZYG11A_expression,ZYG11B_expression,ZYX_expression,ZZEF1_expression,ZZZ3_expression,pk_expression
01BR001-T,5.548933,9.469038,9.526235,7.293216,13.060178,0.000000,13.287526,7.046166,0.000000,14.215909,...,11.757262,10.134319,11.362794,11.461471,10.430631,11.700726,13.320117,11.281892,11.415130,11.518754
01BR008-T,5.357447,10.496447,12.433383,0.000000,10.894918,0.000000,11.880545,0.000000,0.000000,14.216420,...,14.030484,9.176603,10.521493,11.695101,11.566098,10.635637,13.760682,11.407163,11.716684,10.209459
01BR009-T,6.061211,10.371105,10.798686,5.326345,12.614280,0.000000,12.756396,7.272150,0.000000,14.226891,...,12.447119,9.611568,10.434179,12.030646,7.766814,11.253443,13.427215,11.866759,11.416178,10.698735
01BR010-T,6.003868,11.087496,9.837070,9.001784,12.608134,9.973902,12.406277,9.206505,0.000000,14.197264,...,12.496828,9.760693,10.977785,11.414167,11.251199,11.315866,14.469199,11.537793,11.505077,10.341984
01BR015-T,7.273084,10.303115,10.240463,5.287290,12.648443,0.000000,12.149339,0.000000,0.000000,14.579366,...,13.132139,10.336607,11.524852,11.805481,9.179496,11.984678,13.306251,11.617746,12.390759,11.342688
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-00383-A,6.936267,9.680521,9.530201,8.608871,13.318557,0.000000,12.981265,6.491163,0.000000,13.811720,...,10.648933,11.231319,11.428921,11.966358,5.688687,12.921488,13.195903,12.107978,12.973878,13.511957
C3N-00858-A,7.581142,9.941336,9.597697,8.678642,13.160485,0.000000,13.028646,0.000000,5.065704,14.143747,...,9.666310,11.394370,11.345977,12.077106,7.178695,12.992283,12.854665,12.103689,12.825424,13.623158
C3N-00866-A,7.357388,9.306353,9.035412,6.620679,13.408593,0.000000,13.182946,0.000000,0.000000,13.808237,...,9.292396,11.167963,11.623877,12.122369,0.000000,12.458606,13.615069,12.305028,12.782117,13.449940
C3N-01003-A,8.159156,9.437909,9.039024,8.702565,13.305092,0.000000,12.837443,5.750295,0.000000,14.078470,...,10.100163,11.475160,11.414621,12.156336,6.330836,12.940224,12.959681,11.914472,13.181620,13.767287


In [8]:
# ## check for IDs
# diseases = sorted(set(cnv['disease']))
# for d in diseases:
#     idxs = cnv[cnv['disease']==d].index.to_list()
#     print(d, len(set(idxs).intersection(set(expression.index))))

###### phospho

In [6]:
phospho = pd.read_csv('../data/Combine_PanCan_Phospho-multi-site_UMich_GENCODE34_Sinai_imputed_Apr2021.tsv',
                     sep='\t')
phospho

Unnamed: 0,GENECODE34_Symbol,HGNC_Approved_Symbol,01BR001-T,01BR008-T,01BR009-T,01BR010-T,01BR015-T,01BR017-T,01BR018-T,01BR020-T,...,KoreanReference3-R,Pool-24-2-R,QC1-Q.1,QC2-Q.1,QC3-Q.1,QC4-Q.1,QC5-Q.1,QC6-Q.1,WU-PDA1-Q,WU-Pool-25-R
ENSP00000000412.3|ENST00000000412.8|ENSG00000003056.8|OTTHUMG00000168276.4|OTTHUMT00000399130.2|M6PR-201|M6PR|277_267_267_1_1_S267,M6PR,M6PR,21.512453,22.139312,21.701069,22.389933,21.246821,21.683051,21.861632,20.888826,...,21.931296,21.692985,,,,,,,21.630111,21.485603
ENSP00000000442.6|ENST00000000442.11|ENSG00000173153.16|OTTHUMG00000150641.9|OTTHUMT00000319303.3|ESRRA-201|ESRRA|423_19_22_1_0,ESRRA,ESRRA,,,,,,,,,...,,,,,,,,,,
ENSP00000000442.6|ENST00000000442.11|ENSG00000173153.16|OTTHUMG00000150641.9|OTTHUMT00000319303.3|ESRRA-201|ESRRA|423_19_22_2_2_S19S22,ESRRA,ESRRA,,,,,,,,,...,18.673098,19.799056,,,,,,,19.266891,19.303384
ENSP00000000442.6|ENST00000000442.11|ENSG00000173153.16|OTTHUMG00000150641.9|OTTHUMT00000319303.3|ESRRA-201|ESRRA|423_19_44_2_0,ESRRA,ESRRA,,,,,,,,,...,,,,,,,,,,
ENSP00000000442.6|ENST00000000442.11|ENSG00000173153.16|OTTHUMG00000150641.9|OTTHUMT00000319303.3|ESRRA-201|ESRRA|423_19_44_2_2_S19S22,ESRRA,ESRRA,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSP00000501300.1|ENST00000674001.1|ENSG00000011451.21|OTTHUMG00000182448.13|OTTHUMT00000530371.1|WIZ-211|WIZ|1066_895_895_1_1_S895,WIZ,WIZ,,,,,,,,,...,,,,,,,,,,
ENSP00000501300.1|ENST00000674001.1|ENSG00000011451.21|OTTHUMG00000182448.13|OTTHUMT00000530371.1|WIZ-211|WIZ|1066_932_936_1_1_S932,WIZ,WIZ,,,,,,,,,...,,,,,,,,,,
ENSP00000501312.1|ENST00000674128.1|ENSG00000135951.16|OTTHUMG00000130637.5|OTTHUMT00000530304.1|TSGA10-219|TSGA10|789_173_179_1_1_S173,TSGA10,TSGA10,,,,,,,,,...,,,,,,,,,,
ENSP00000501312.1|ENST00000674128.1|ENSG00000135951.16|OTTHUMG00000130637.5|OTTHUMT00000530304.1|TSGA10-219|TSGA10|789_779_786_1_1_S779,TSGA10,TSGA10,,,,,,,,,...,,,,,,,,,,


In [7]:
phospho.shape, len({'|'.join(x.split('|')[-3:]) for x in phospho.index})

((95021, 2098), 95021)

In [8]:
phospho.index = ['|'.join(x.split('|')[-3:]) for x in phospho.index]

In [9]:
phospho = phospho.iloc[:, 2:]
phospho.index = [f'{x}_phospho' for x in phospho.index]
phospho = phospho.transpose()
phospho

Unnamed: 0,M6PR-201|M6PR|277_267_267_1_1_S267_phospho,ESRRA-201|ESRRA|423_19_22_1_0_phospho,ESRRA-201|ESRRA|423_19_22_2_2_S19S22_phospho,ESRRA-201|ESRRA|423_19_44_2_0_phospho,ESRRA-201|ESRRA|423_19_44_2_2_S19S22_phospho,ESRRA-201|ESRRA|423_19_44_3_0_phospho,ESRRA-201|ESRRA|423_19_44_3_3_S22S26S27_phospho,ESRRA-201|ESRRA|423_26_44_1_1_S27_phospho,FKBP4-201|FKBP4|459_258_263_1_0_phospho,FKBP4-201|FKBP4|459_258_263_1_1_S258_phospho,...,WIZ-211|WIZ|1066_288_299_2_2_S294S299_phospho,WIZ-211|WIZ|1066_507_521_1_1_S521_phospho,WIZ-211|WIZ|1066_542_549_1_1_S549_phospho,WIZ-211|WIZ|1066_561_574_1_1_S561_phospho,WIZ-211|WIZ|1066_750_755_1_1_T752_phospho,WIZ-211|WIZ|1066_895_895_1_1_S895_phospho,WIZ-211|WIZ|1066_932_936_1_1_S932_phospho,TSGA10-219|TSGA10|789_173_179_1_1_S173_phospho,TSGA10-219|TSGA10|789_779_786_1_1_S779_phospho,SVIL-215|SVIL|1904_459_461_1_1_S459_phospho
01BR001-T,21.512453,,,,,20.271345,,,21.400875,,...,,,,,,,,,,
01BR008-T,22.139312,,,,,20.632442,,,20.006183,,...,,,,,,,,,,
01BR009-T,21.701069,,,,,20.947102,,,20.177087,,...,,,,,,,,,,
01BR010-T,22.389933,,,,,20.539099,,,20.252901,,...,,,,,,,,,,
01BR015-T,21.246821,,,,,20.171325,,,20.706533,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
QC4-Q.1,,,,,,,,,,,...,,,,,,,,,,
QC5-Q.1,,,,,,,,,,,...,,,,,,,,,,
QC6-Q.1,,,,,,,,,,,...,,,,,,,,,,
WU-PDA1-Q,21.630111,,19.266891,,,,,,,19.005680,...,,,,,,,,,,


###### methylation subtypes

In [7]:
methylation_st = pd.read_csv('../data/methylation_subtype.csv', sep=',')
methylation_st

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Sample_ID,Subject_ID,Cancer_type,Class,Pan3_k5
0,0,0,CPT0008520010,C3N-00295,HNSCC,1,3
1,1,1,CPT0008630006,C3N-00299,HNSCC,1,2
2,2,2,CPT0008680006,C3N-00300,HNSCC,1,2
3,3,3,CPT0008930006,C3N-00307,HNSCC,1,2
4,4,4,CPT0011290008,C3N-00306,HNSCC,1,2
...,...,...,...,...,...,...,...
698,766,836,CPT0128770006,C3N-02253,UCEC,3,
699,767,837,CPT0128960006,C3N-02244,UCEC,2,
700,768,838,CPT0129080006,C3N-02249,UCEC,1,
701,769,839,CPT0129520007,C3N-00755,UCEC,1,


In [8]:
len(set(methylation_st['Sample_ID']))

703

In [9]:
methylation_st.index = [x + '-T' for x in methylation_st['Subject_ID']]
methylation_st = methylation_st[['Class', 'Cancer_type']]
methylation_st.columns = ['methylation_subtype', 'disease']
methylation_st

Unnamed: 0,methylation_subtype,disease
C3N-00295-T,1,HNSCC
C3N-00299-T,1,HNSCC
C3N-00300-T,1,HNSCC
C3N-00307-T,1,HNSCC
C3N-00306-T,1,HNSCC
...,...,...
C3N-02253-T,3,UCEC
C3N-02244-T,2,UCEC
C3N-02249-T,1,UCEC
C3N-00755-T,1,UCEC


In [14]:
## check for IDs
diseases = sorted(set(methylation_st['disease']))
for d in diseases:
    idxs = methylation_st[methylation_st['disease']==d].index.to_list()
    print(d, len(set(idxs).intersection(set(expression.index))))

CCRCC 104
GBM 69
HNSCC 81
LSCC 82
LUAD 65
UCEC 79


In [15]:
# one hot encode
for t in set(methylation_st['methylation_subtype']):
    methylation_st[f'methylation_subtype_{t}'] = [1 if x==t else 0
                                                 for x in methylation_st['methylation_subtype']]
methylation_st = methylation_st[[c for c in methylation_st.columns if 'methylation_subtype_' in c]]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [16]:
methylation_st

Unnamed: 0,methylation_subtype_1,methylation_subtype_2,methylation_subtype_3,methylation_subtype_4,methylation_subtype_5,methylation_subtype_6
C3N-00295-T,1,0,0,0,0,0
C3N-00299-T,1,0,0,0,0,0
C3N-00300-T,1,0,0,0,0,0
C3N-00307-T,1,0,0,0,0,0
C3N-00306-T,1,0,0,0,0,0
...,...,...,...,...,...,...
C3N-02253-T,0,0,1,0,0,0
C3N-02244-T,0,1,0,0,0,0
C3N-02249-T,1,0,0,0,0,0
C3N-00755-T,1,0,0,0,0,0


###### immune subtypes

In [17]:
immune_st = pd.read_csv('../data/immune_subtype.txt', sep='\t')
immune_st

Unnamed: 0,case,Immune_subtype,disease
0,01BR001-T,1,BR
1,01BR008-T,2,BR
2,01BR009-T,3,BR
3,01BR010-T,3,BR
4,01BR015-T,3,BR
...,...,...,...
1083,C3N-01520-T,3,UCEC
1084,C3N-01521-T,3,UCEC
1085,C3N-01537-T,4,UCEC
1086,C3N-01802-T,4,UCEC


In [18]:
immune_st = immune_st.set_index('case')
immune_st = immune_st[['Immune_subtype']]
immune_st.columns = ['immune_subtype']
immune_st

Unnamed: 0_level_0,immune_subtype
case,Unnamed: 1_level_1
01BR001-T,1
01BR008-T,2
01BR009-T,3
01BR010-T,3
01BR015-T,3
...,...
C3N-01520-T,3
C3N-01521-T,3
C3N-01537-T,4
C3N-01802-T,4


In [19]:
## check for IDs

len(set(immune_st.index).intersection(set(expression.index)))

1083

In [20]:
# one hot encode
for t in set(immune_st['immune_subtype']):
    immune_st[f'immune_subtype_{t}'] = [1 if x==t else 0
                                                 for x in immune_st['immune_subtype']]
immune_st = immune_st[[c for c in immune_st.columns if 'immune_subtype_' in c]]
immune_st

Unnamed: 0_level_0,immune_subtype_1,immune_subtype_2,immune_subtype_3,immune_subtype_4,immune_subtype_5
case,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
01BR001-T,1,0,0,0,0
01BR008-T,0,1,0,0,0
01BR009-T,0,0,1,0,0
01BR010-T,0,0,1,0,0
01BR015-T,0,0,1,0,0
...,...,...,...,...,...
C3N-01520-T,0,0,1,0,0
C3N-01521-T,0,0,1,0,0
C3N-01537-T,0,0,0,1,0
C3N-01802-T,0,0,0,1,0


###### clinical

In [6]:
from mgitools.os_helpers import listfiles
fps = sorted(listfiles('/diskmnt/Projects/Users/estorrs/stemness_analysis/data/TargetRNAProtCNV_top50_miRNA/', 'mRNAsi.xlsx'))
fps = [fp for fp in fps if '~$data' not in fp]
len(fps), fps[:5]

(11,
 ['/diskmnt/Projects/Users/estorrs/stemness_analysis/data/TargetRNAProtCNV_top50_miRNA/tables/dataBRCA_PROTsi_mRNAsi.xlsx',
  '/diskmnt/Projects/Users/estorrs/stemness_analysis/data/TargetRNAProtCNV_top50_miRNA/tables/dataCRC_PROTsi_mRNAsi.xlsx',
  '/diskmnt/Projects/Users/estorrs/stemness_analysis/data/TargetRNAProtCNV_top50_miRNA/tables/dataEC_PROTsi_mRNAsi.xlsx',
  '/diskmnt/Projects/Users/estorrs/stemness_analysis/data/TargetRNAProtCNV_top50_miRNA/tables/dataGBM_PROTsi_mRNAsi.xlsx',
  '/diskmnt/Projects/Users/estorrs/stemness_analysis/data/TargetRNAProtCNV_top50_miRNA/tables/dataHNSCC_PROTsi_mRNAsi.xlsx'])

In [31]:
import re
disease_to_table = {}
for fp in fps:
    disease = re.sub(r'^.*/data(.+)_PROT.*$', r'\1', fp)
    disease_to_table[disease] = pd.read_excel(fp).set_index('case_id')

In [32]:
disease_to_table.keys()

dict_keys(['BRCA', 'CRC', 'EC', 'GBM', 'HNSCC', 'LSCC', 'LUAD', 'PBT', 'PDA', 'UCEC', 'ccRCC'])

In [33]:
# something got messed up during table creation
disease_to_table.pop('UCEC')

disease_to_table['BR'] = disease_to_table['BRCA']
disease_to_table.pop('BRCA')

disease_to_table['CO'] = disease_to_table['CRC']
disease_to_table.pop('CRC')

Unnamed: 0_level_0,Aliquot.ID,tumor/normal,TumorType,PROTsi,subtype,mRNAsi
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
01CO005-T,C01CO005,Tumor,COAD,0.643793,CMS2,0.645431
01CO006-T,C01CO006,Tumor,COAD,0.466982,CMS3,0.412707
01CO008-T,C01CO008,Tumor,COAD,0.459737,CMS3,0.221003
01CO013-T,C01CO013,Tumor,COAD,0.692245,CMS3,0.733719
01CO014-T,C01CO014,Tumor,COAD,0.708171,CMS3,0.375508
...,...,...,...,...,...,...
21CO006-T,C21CO006,Tumor,COAD,0.501541,CMS4,0.524546
21CO007-T,C21CO007,Tumor,COAD,0.669347,CMS2,0.512884
22CO004-T,C22CO004,Tumor,COAD,0.638785,,0.390766
22CO006-T,C22CO006,Tumor,COAD,0.579294,CMS4,0.301375


In [34]:
# generate clinical dataframe
clinical = None
for disease, df in disease_to_table.items():
    subtypes = sorted({s for s in df['subtype'] if not pd.isnull(s)})
    for s in subtypes:
        df[f'is_subtype_{disease}_{s}'] = [1 if x==s else 0
                                                for x in df['subtype']]
    
    f = df[[c for c in df.columns if 'is_subtype_' in c]]
    if clinical is None:
        clinical = f
    else:
        clinical = pd.merge(clinical, f, left_index=True, right_index=True, how='outer')
clinical = clinical.replace(np.nan, 0)
clinical
    

Unnamed: 0_level_0,is_subtype_EC_Endometrioid carcinoma,is_subtype_EC_Serous carcinoma,is_subtype_GBM_Classical,is_subtype_GBM_IDH mutant,is_subtype_GBM_Mesenchymal,is_subtype_GBM_Proneural,is_subtype_HNSCC_Atypical,is_subtype_HNSCC_Basal,is_subtype_HNSCC_Classical,is_subtype_HNSCC_Mesenchymal,...,is_subtype_ccRCC_VEGF immune-desert,is_subtype_BR_Basal,is_subtype_BR_Her2,is_subtype_BR_LumA,is_subtype_BR_LumB,is_subtype_BR_Normal-like,is_subtype_CO_CMS1,is_subtype_CO_CMS2,is_subtype_CO_CMS3,is_subtype_CO_CMS4
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01BR001-T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01BR008-T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01BR009-T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01BR010-T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01BR015-T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-04280-T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C3N-04282-T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C3N-04283-T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C3N-04284-T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
set(combined['disease'])

{'BR',
 'CO',
 'EC',
 'GBM',
 'HNSCC',
 'LSCC',
 'LUAD',
 'OV',
 'PDA',
 'ccRCC',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan}

###### yize clinical

In [8]:
yc = pd.read_csv('../data/clinical_Pan-cancer.Apr2021.tsv', sep='\t', index_col='case_id')
yc

Unnamed: 0_level_0,tumor_code,discovery_study,discovery_study/type_of_analyzed_samples,confirmatory_study,confirmatory_study/type_of_analyzed_samples,consent/age,consent/sex,consent/race,consent/ethnicity,consent/ethnicity_race_ancestry_identified,...,follow-up/residual_tumor_after_surgery_for_new_tumor,follow-up/additional_treatment_radiation_therapy_for_new_tumor,follow-up/additional_treatment_pharmaceutical_therapy_for_new_tumor,follow-up/additional_treatment_immuno_for_new_tumor,follow-up/number_of_days_from_date_of_initial_pathologic_diagnosis_to_date_of_additional_surgery_for_new_tumor_event_loco-regional,follow-up/number_of_days_from_date_of_initial_pathologic_diagnosis_to_date_of_additional_surgery_for_new_tumor_event_metastasis,"Recurrence-free survival, days","Overall survival, days","Recurrence status (1, yes; 0, no)","Survival status (1, dead; 0, alive)"
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00908,CCRCC,Yes,Tumor_and_Normal,Yes,Tumor,60,Female,White,Not-Hispanic or Latino,White,...,,,,,,,,1429.0,0,0.0
C3L-00004,CCRCC,Yes,Tumor_and_Normal,No,,72,Male,White,Not-Hispanic or Latino,White,...,,,,,,,,384.0,0,0.0
C3L-00010,CCRCC,Yes,Tumor_and_Normal,No,,30,Male,White,Not-Hispanic or Latino,White,...,,,,,,,,896.0,0,0.0
C3L-00011,CCRCC,Yes,Tumor_and_Normal,No,,63,Female,White,Not-Hispanic or Latino,White,...,,,,,,,,241.0,0,1.0
C3L-00026,CCRCC,Yes,Tumor_and_Normal,No,,65,Female,White,Not-Hispanic or Latino,White,...,,,,,,,,1458.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26OV008,OV,,,,,65,Female,White,Not Evaluated,,...,,,,,,,,400.0,0,1.0
26OV009,OV,,,,,60,Female,White,Unknown,,...,,,,,,,,727.0,0,0.0
26OV010,OV,,,,,57,Female,White,Unknown,,...,,,,,,,,506.0,0,0.0
26OV011,OV,,,,,64,Female,White,Unknown,,...,,,,,,,,177.0,0,0.0


In [9]:
cols = [
    'consent/age',
    'consent/sex', 
]

In [10]:
[c for c in yc.columns if 'sex' in c]

['consent/sex']

In [11]:
yc = yc[cols]
yc.columns = ['clinical_age', 'clinical_sex']
yc

Unnamed: 0_level_0,clinical_age,clinical_sex
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1
C3L-00908,60,Female
C3L-00004,72,Male
C3L-00010,30,Male
C3L-00011,63,Female
C3L-00026,65,Female
...,...,...
26OV008,65,Female
26OV009,60,Female
26OV010,57,Female
26OV011,64,Female


In [12]:
yc['clinical_is_female'] = [0 if x!='Female' else 1 for x in yc['clinical_sex']]
yc = yc[['clinical_age', 'clinical_is_female']]
yc


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,clinical_age,clinical_is_female
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1
C3L-00908,60,1
C3L-00004,72,0
C3L-00010,30,0
C3L-00011,63,1
C3L-00026,65,1
...,...,...
26OV008,65,1
26OV009,60,1
26OV010,57,1
26OV011,64,1


In [13]:
fps = sorted(os_helpers.listfiles('../data/ancestry/', regex=r'.tsv$'))
fps

['../data/ancestry/BRCA.tsv',
 '../data/ancestry/CO.tsv',
 '../data/ancestry/GBM.tsv',
 '../data/ancestry/HNSCC.tsv',
 '../data/ancestry/LSCC.tsv',
 '../data/ancestry/LUAD.tsv',
 '../data/ancestry/OV.tsv',
 '../data/ancestry/PDAC.tsv',
 '../data/ancestry/UCEC.tsv',
 '../data/ancestry/ccRCC.tsv']

In [14]:
df = None
for fp in fps:
    if df is None:
        df = pd.read_csv(fp, sep='\t', index_col='sample_id')
    else:
        df = pd.concat((df, pd.read_csv(fp, sep='\t', index_col='sample_id')))
df

Unnamed: 0_level_0,predicted_ancestry,probability_AFR,probability_AMR,probability_EAS,probability_EUR,probability_SAS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
01BR001,AFR,0.98,0.01,0.00,0.00,0.01
01BR015,EUR,0.00,0.03,0.01,0.96,0.00
01BR017,EUR,0.00,0.01,0.00,0.99,0.00
01BR018,EUR,0.00,0.00,0.00,1.00,0.00
01BR025,AFR,0.94,0.05,0.00,0.00,0.01
...,...,...,...,...,...,...
C3N-01648,EUR,0.00,0.00,0.02,0.98,0.00
C3N-01649,EUR,0.01,0.06,0.00,0.93,0.00
C3N-01651,EUR,0.00,0.00,0.00,1.00,0.00
C3N-01808,EUR,0.00,0.06,0.02,0.92,0.00


In [15]:
d = {s:x for s, x in zip(df.index, df['predicted_ancestry'])}
for a in sorted(set(df['predicted_ancestry'])):
    yc[f'clinical_predicted_ancestry_is_{a}'] = [1 if d.get(x) == a else 0 for x in yc.index]
yc

Unnamed: 0_level_0,clinical_age,clinical_is_female,clinical_predicted_ancestry_is_AFR,clinical_predicted_ancestry_is_AMR,clinical_predicted_ancestry_is_EAS,clinical_predicted_ancestry_is_EUR,clinical_predicted_ancestry_is_SAS
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C3L-00908,60,1,0,0,0,1,0
C3L-00004,72,0,0,0,0,1,0
C3L-00010,30,0,0,1,0,0,0
C3L-00011,63,1,0,0,0,1,0
C3L-00026,65,1,0,0,0,1,0
...,...,...,...,...,...,...,...
26OV008,65,1,0,0,0,1,0
26OV009,60,1,0,0,0,1,0
26OV010,57,1,0,0,0,1,0
26OV011,64,1,0,0,0,1,0


###### merge everything together

In [72]:
# merge proteome and mutations
data, idxs = [], []
for s, row in proteome.iterrows():
    if '-T' in s:
        idxs.append(s)
        ls = []
        if s in consolidated_filtered.index:
            ls += consolidated_filtered.loc[s, :].to_list()
        else:
            ls += [0 for i in range(consolidated_filtered.shape[1])]
        
        if s in consolidated_germline.index:
            ls += consolidated_germline.loc[s, :].to_list()
        else:
            ls += [0 for i in range(consolidated_germline.shape[1])]
            
        data.append(ls)
        
cols = list(consolidated_filtered.columns)
cols += list(consolidated_germline.columns)
df = pd.DataFrame(data=data, index=idxs, columns=cols)
df.index.name = 'sample_id'
df

Unnamed: 0_level_0,ABL1_mutation_is_Frame_Shift_Del,ABL1_mutation_is_Frame_Shift_Ins,ABL1_mutation_is_In_Frame_Del,ABL1_mutation_is_In_Frame_Ins,ABL1_mutation_is_Missense_Mutation,ABL1_mutation_is_Nonsense_Mutation,ABL1_mutation_is_Nonstop_Mutation,ABL1_mutation_is_Silent,ABL1_mutation_is_Splice_Site,ABL1_has_truncating_mutation,...,ZRANB1_is_pathogenic_germline,ZRSR2_is_pathogenic_germline,ZSCAN10_is_pathogenic_germline,ZSCAN25_is_pathogenic_germline,ZSCAN32_is_pathogenic_germline,ZW10_is_pathogenic_germline,ZWILCH_is_pathogenic_germline,ZWINT_is_pathogenic_germline,ZXDC_is_pathogenic_germline,ZYX_is_pathogenic_germline
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01BR001-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01BR008-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01BR009-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01BR010-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01BR015-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-04119-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3N-04126-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3N-04282-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3N-04283-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [73]:
combined = pd.merge(proteome, df, left_index=True, right_index=True, how='right')
combined

Unnamed: 0_level_0,AAAS_proteome,AAK1_proteome,AATF_proteome,ABCA1_proteome,ABCA2_proteome,ABCB1_proteome,ABCB11_proteome,ABCC2_proteome,ABCC3_proteome,ABCE1_proteome,...,ZRANB1_is_pathogenic_germline,ZRSR2_is_pathogenic_germline,ZSCAN10_is_pathogenic_germline,ZSCAN25_is_pathogenic_germline,ZSCAN32_is_pathogenic_germline,ZW10_is_pathogenic_germline,ZWILCH_is_pathogenic_germline,ZWINT_is_pathogenic_germline,ZXDC_is_pathogenic_germline,ZYX_is_pathogenic_germline
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01BR001-T,24.366478,24.512335,25.009167,22.761127,18.747711,,,,19.982433,26.985937,...,0,0,0,0,0,0,0,0,0,0
01BR008-T,24.394392,24.336012,25.198883,22.507570,19.817547,,,,19.608661,26.724760,...,0,0,0,0,0,0,0,0,0,0
01BR009-T,24.126891,24.312230,24.822895,23.841045,20.841777,,,,19.515286,26.398394,...,0,0,0,0,0,0,0,0,0,0
01BR010-T,24.328566,24.399937,24.455721,23.044677,20.980193,,,,20.429545,26.360272,...,0,0,0,0,0,0,0,0,0,0
01BR015-T,24.606059,24.330728,24.259055,22.052379,19.149462,,,,18.981616,26.545160,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-04119-T,22.997202,24.726521,25.304842,23.719660,,22.479719,,,24.267079,26.922160,...,0,0,0,0,0,0,0,0,0,0
C3N-04126-T,23.057493,24.772949,25.715299,24.214258,,22.003800,,,24.271196,27.086099,...,0,0,0,0,0,0,0,0,0,0
C3N-04282-T,22.775772,24.852962,25.724592,23.073272,,21.588377,,,24.725779,26.921562,...,0,0,0,0,0,0,0,0,0,0
C3N-04283-T,23.116111,24.664143,25.571772,23.463646,,21.701678,,,24.395560,26.921337,...,0,0,0,0,0,0,0,0,0,0


In [74]:
# list(cnv.index)

In [75]:
combined = pd.merge(combined, purity, left_index=True, right_index=True, how='left')
combined

Unnamed: 0_level_0,AAAS_proteome,AAK1_proteome,AATF_proteome,ABCA1_proteome,ABCA2_proteome,ABCB1_proteome,ABCB11_proteome,ABCC2_proteome,ABCC3_proteome,ABCE1_proteome,...,ZRSR2_is_pathogenic_germline,ZSCAN10_is_pathogenic_germline,ZSCAN25_is_pathogenic_germline,ZSCAN32_is_pathogenic_germline,ZW10_is_pathogenic_germline,ZWILCH_is_pathogenic_germline,ZWINT_is_pathogenic_germline,ZXDC_is_pathogenic_germline,ZYX_is_pathogenic_germline,TumorPurity
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01BR001-T,24.366478,24.512335,25.009167,22.761127,18.747711,,,,19.982433,26.985937,...,0,0,0,0,0,0,0,0,0,0.816624
01BR008-T,24.394392,24.336012,25.198883,22.507570,19.817547,,,,19.608661,26.724760,...,0,0,0,0,0,0,0,0,0,0.510466
01BR009-T,24.126891,24.312230,24.822895,23.841045,20.841777,,,,19.515286,26.398394,...,0,0,0,0,0,0,0,0,0,0.556239
01BR010-T,24.328566,24.399937,24.455721,23.044677,20.980193,,,,20.429545,26.360272,...,0,0,0,0,0,0,0,0,0,0.747700
01BR015-T,24.606059,24.330728,24.259055,22.052379,19.149462,,,,18.981616,26.545160,...,0,0,0,0,0,0,0,0,0,0.649161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-04119-T,22.997202,24.726521,25.304842,23.719660,,22.479719,,,24.267079,26.922160,...,0,0,0,0,0,0,0,0,0,0.530830
C3N-04126-T,23.057493,24.772949,25.715299,24.214258,,22.003800,,,24.271196,27.086099,...,0,0,0,0,0,0,0,0,0,0.464536
C3N-04282-T,22.775772,24.852962,25.724592,23.073272,,21.588377,,,24.725779,26.921562,...,0,0,0,0,0,0,0,0,0,0.604708
C3N-04283-T,23.116111,24.664143,25.571772,23.463646,,21.701678,,,24.395560,26.921337,...,0,0,0,0,0,0,0,0,0,0.661656


In [76]:
combined = pd.merge(combined, cnv, left_index=True, right_index=True, how='left')
combined

Unnamed: 0_level_0,AAAS_proteome,AAK1_proteome,AATF_proteome,ABCA1_proteome,ABCA2_proteome,ABCB1_proteome,ABCB11_proteome,ABCC2_proteome,ABCC3_proteome,ABCE1_proteome,...,ZRANB1_cnv,ZSCAN10_cnv,ZSCAN25_cnv,ZSCAN32_cnv,ZW10_cnv,ZWILCH_cnv,ZWINT_cnv,ZXDC_cnv,ZYX_cnv,disease
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01BR001-T,24.366478,24.512335,25.009167,22.761127,18.747711,,,,19.982433,26.985937,...,-0.00966,-0.19088,0.04711,-0.19088,0.01914,-0.26696,-0.00966,-0.06602,0.05187,BR
01BR008-T,24.394392,24.336012,25.198883,22.507570,19.817547,,,,19.608661,26.724760,...,0.11550,-0.16619,-0.00576,-0.16619,-0.04312,0.00988,0.11550,-0.09075,-0.00576,BR
01BR009-T,24.126891,24.312230,24.822895,23.841045,20.841777,,,,19.515286,26.398394,...,0.32349,-0.38928,0.67516,-0.38928,-0.15614,-0.15576,-0.14743,0.26289,0.06942,BR
01BR010-T,24.328566,24.399937,24.455721,23.044677,20.980193,,,,20.429545,26.360272,...,0.00058,-0.04794,-0.03744,-0.04794,0.03900,-0.18501,-0.14666,-0.00744,-0.03744,BR
01BR015-T,24.606059,24.330728,24.259055,22.052379,19.149462,,,,18.981616,26.545160,...,0.13746,0.26465,-0.28490,0.26465,-0.20074,-0.23970,0.13746,0.09962,-0.27216,BR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-04119-T,22.997202,24.726521,25.304842,23.719660,,22.479719,,,24.267079,26.922160,...,0.01824,-0.18047,0.01245,-0.18047,0.01144,0.02949,0.01824,0.01070,0.01245,PDA
C3N-04126-T,23.057493,24.772949,25.715299,24.214258,,22.003800,,,24.271196,27.086099,...,0.04492,0.03413,-0.17720,0.03413,0.04178,0.03023,0.04492,0.03950,-0.17720,PDA
C3N-04282-T,22.775772,24.852962,25.724592,23.073272,,21.588377,,,24.725779,26.921562,...,0.45477,0.00417,0.20035,0.00417,0.16786,0.32189,-0.28072,0.18985,0.20035,PDA
C3N-04283-T,23.116111,24.664143,25.571772,23.463646,,21.701678,,,24.395560,26.921337,...,-0.15172,0.02498,0.02378,0.02498,0.02317,-0.11829,-0.15172,0.00268,0.02378,PDA


In [68]:
combined = combined[[c for c in combined.columns if '_expression' not in c]]

In [69]:
combined = pd.merge(combined, expression, left_index=True, right_index=True, how='left')
combined

Unnamed: 0,AAAS_proteome,AAK1_proteome,AATF_proteome,ABCA1_proteome,ABCA2_proteome,ABCB1_proteome,ABCB11_proteome,ABCC2_proteome,ABCC3_proteome,ABCE1_proteome,...,ZWINT_expression,ZXDA_expression,ZXDB_expression,ZXDC_expression,ZYG11A_expression,ZYG11B_expression,ZYX_expression,ZZEF1_expression,ZZZ3_expression,pk_expression
01BR001-T,24.366478,24.512335,25.009167,22.761127,18.747711,,,,19.982433,26.985937,...,11.757262,10.134319,11.362794,11.461471,10.430631,11.700726,13.320117,11.281892,11.415130,11.518754
01BR008-T,24.394392,24.336012,25.198883,22.507570,19.817547,,,,19.608661,26.724760,...,14.030484,9.176603,10.521493,11.695101,11.566098,10.635637,13.760682,11.407163,11.716684,10.209459
01BR009-T,24.126891,24.312230,24.822895,23.841045,20.841777,,,,19.515286,26.398394,...,12.447119,9.611568,10.434179,12.030646,7.766814,11.253443,13.427215,11.866759,11.416178,10.698735
01BR010-T,24.328566,24.399937,24.455721,23.044677,20.980193,,,,20.429545,26.360272,...,12.496828,9.760693,10.977785,11.414167,11.251199,11.315866,14.469199,11.537793,11.505077,10.341984
01BR015-T,24.606059,24.330728,24.259055,22.052379,19.149462,,,,18.981616,26.545160,...,13.132139,10.336607,11.524852,11.805481,9.179496,11.984678,13.306251,11.617746,12.390759,11.342688
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TumorOnlyIR01-T,22.770989,23.859305,21.765639,22.120720,20.364479,19.257281,,,22.949869,25.466323,...,,,,,,,,,,
TumorOnlyIR03-T,22.938051,23.648094,21.663581,22.077940,20.358023,19.586598,,,22.951027,25.483131,...,,,,,,,,,,
TumorOnlyIR14-T,22.828392,23.731050,21.760472,21.959052,20.375218,19.630542,,,23.019681,25.425778,...,,,,,,,,,,
TumorOnlyIR21-T,22.813782,23.775605,21.644840,21.982435,20.291244,19.633217,,,23.008923,25.405379,...,,,,,,,,,,


In [78]:
combined = pd.merge(combined, methylation_st, left_index=True, right_index=True, how='left')
combined

Unnamed: 0,AAAS_proteome,AAK1_proteome,AATF_proteome,ABCA1_proteome,ABCA2_proteome,ABCB1_proteome,ABCB11_proteome,ABCC2_proteome,ABCC3_proteome,ABCE1_proteome,...,snoZ6_expression,snosnR66_expression,uc_338_expression,yR211F11_expression,methylation_subtype_1,methylation_subtype_2,methylation_subtype_3,methylation_subtype_4,methylation_subtype_5,methylation_subtype_6
01BR001-T,24.366478,24.512335,25.009167,22.761127,18.747711,,,,19.982433,26.985937,...,0.000000,0.0,3064.642802,0.000000,,,,,,
01BR008-T,24.394392,24.336012,25.198883,22.507570,19.817547,,,,19.608661,26.724760,...,0.000000,0.0,4058.372741,0.000000,,,,,,
01BR009-T,24.126891,24.312230,24.822895,23.841045,20.841777,,,,19.515286,26.398394,...,0.000000,0.0,2113.866719,3472.583950,,,,,,
01BR010-T,24.328566,24.399937,24.455721,23.044677,20.980193,,,,20.429545,26.360272,...,0.000000,0.0,4642.228707,1639.303952,,,,,,
01BR015-T,24.606059,24.330728,24.259055,22.052379,19.149462,,,,18.981616,26.545160,...,3185.159704,0.0,5415.098490,0.000000,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TumorOnlyIR01-T,22.770989,23.859305,21.765639,22.120720,20.364479,19.257281,,,22.949869,25.466323,...,,,,,,,,,,
TumorOnlyIR03-T,22.938051,23.648094,21.663581,22.077940,20.358023,19.586598,,,22.951027,25.483131,...,,,,,,,,,,
TumorOnlyIR14-T,22.828392,23.731050,21.760472,21.959052,20.375218,19.630542,,,23.019681,25.425778,...,,,,,,,,,,
TumorOnlyIR21-T,22.813782,23.775605,21.644840,21.982435,20.291244,19.633217,,,23.008923,25.405379,...,,,,,,,,,,


In [79]:
combined = pd.merge(combined, immune_st, left_index=True, right_index=True, how='left')
combined

Unnamed: 0,AAAS_proteome,AAK1_proteome,AATF_proteome,ABCA1_proteome,ABCA2_proteome,ABCB1_proteome,ABCB11_proteome,ABCC2_proteome,ABCC3_proteome,ABCE1_proteome,...,methylation_subtype_2,methylation_subtype_3,methylation_subtype_4,methylation_subtype_5,methylation_subtype_6,immune_subtype_1,immune_subtype_2,immune_subtype_3,immune_subtype_4,immune_subtype_5
01BR001-T,24.366478,24.512335,25.009167,22.761127,18.747711,,,,19.982433,26.985937,...,,,,,,1.0,0.0,0.0,0.0,0.0
01BR008-T,24.394392,24.336012,25.198883,22.507570,19.817547,,,,19.608661,26.724760,...,,,,,,0.0,1.0,0.0,0.0,0.0
01BR009-T,24.126891,24.312230,24.822895,23.841045,20.841777,,,,19.515286,26.398394,...,,,,,,0.0,0.0,1.0,0.0,0.0
01BR010-T,24.328566,24.399937,24.455721,23.044677,20.980193,,,,20.429545,26.360272,...,,,,,,0.0,0.0,1.0,0.0,0.0
01BR015-T,24.606059,24.330728,24.259055,22.052379,19.149462,,,,18.981616,26.545160,...,,,,,,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TumorOnlyIR01-T,22.770989,23.859305,21.765639,22.120720,20.364479,19.257281,,,22.949869,25.466323,...,,,,,,,,,,
TumorOnlyIR03-T,22.938051,23.648094,21.663581,22.077940,20.358023,19.586598,,,22.951027,25.483131,...,,,,,,,,,,
TumorOnlyIR14-T,22.828392,23.731050,21.760472,21.959052,20.375218,19.630542,,,23.019681,25.425778,...,,,,,,,,,,
TumorOnlyIR21-T,22.813782,23.775605,21.644840,21.982435,20.291244,19.633217,,,23.008923,25.405379,...,,,,,,,,,,


In [35]:
combined = pd.merge(combined, clinical, left_index=True, right_index=True, how='left')
combined

Unnamed: 0,AAAS_proteome,AAK1_proteome,AATF_proteome,ABCA1_proteome,ABCA2_proteome,ABCB1_proteome,ABCB11_proteome,ABCC2_proteome,ABCC3_proteome,ABCE1_proteome,...,is_subtype_ccRCC_VEGF immune-desert,is_subtype_BR_Basal,is_subtype_BR_Her2,is_subtype_BR_LumA,is_subtype_BR_LumB,is_subtype_BR_Normal-like,is_subtype_CO_CMS1,is_subtype_CO_CMS2,is_subtype_CO_CMS3,is_subtype_CO_CMS4
01BR001-T,24.366478,24.512335,25.009167,22.761127,18.747711,,,,19.982433,26.985937,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01BR008-T,24.394392,24.336012,25.198883,22.507570,19.817547,,,,19.608661,26.724760,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01BR009-T,24.126891,24.312230,24.822895,23.841045,20.841777,,,,19.515286,26.398394,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01BR010-T,24.328566,24.399937,24.455721,23.044677,20.980193,,,,20.429545,26.360272,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01BR015-T,24.606059,24.330728,24.259055,22.052379,19.149462,,,,18.981616,26.545160,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TumorOnlyIR01-T,22.770989,23.859305,21.765639,22.120720,20.364479,19.257281,,,22.949869,25.466323,...,,,,,,,,,,
TumorOnlyIR03-T,22.938051,23.648094,21.663581,22.077940,20.358023,19.586598,,,22.951027,25.483131,...,,,,,,,,,,
TumorOnlyIR14-T,22.828392,23.731050,21.760472,21.959052,20.375218,19.630542,,,23.019681,25.425778,...,,,,,,,,,,
TumorOnlyIR21-T,22.813782,23.775605,21.644840,21.982435,20.291244,19.633217,,,23.008923,25.405379,...,,,,,,,,,,


In [10]:
# combined = combined[[c for c in combined.columns if '_phospho' not in c]]

In [11]:
combined = pd.merge(combined, phospho, left_index=True, right_index=True, how='left')
combined

Unnamed: 0,AAAS_proteome,AAK1_proteome,AATF_proteome,ABCA1_proteome,ABCA2_proteome,ABCB1_proteome,ABCB11_proteome,ABCC2_proteome,ABCC3_proteome,ABCE1_proteome,...,WIZ-211|WIZ|1066_288_299_2_2_S294S299_phospho,WIZ-211|WIZ|1066_507_521_1_1_S521_phospho,WIZ-211|WIZ|1066_542_549_1_1_S549_phospho,WIZ-211|WIZ|1066_561_574_1_1_S561_phospho,WIZ-211|WIZ|1066_750_755_1_1_T752_phospho,WIZ-211|WIZ|1066_895_895_1_1_S895_phospho,WIZ-211|WIZ|1066_932_936_1_1_S932_phospho,TSGA10-219|TSGA10|789_173_179_1_1_S173_phospho,TSGA10-219|TSGA10|789_779_786_1_1_S779_phospho,SVIL-215|SVIL|1904_459_461_1_1_S459_phospho
01BR001-T,24.366478,24.512335,25.009167,22.761127,18.747711,,,,19.982433,26.985937,...,,,,,,,,,,
01BR008-T,24.394392,24.336012,25.198883,22.507570,19.817547,,,,19.608661,26.724760,...,,,,,,,,,,
01BR009-T,24.126891,24.312230,24.822895,23.841045,20.841777,,,,19.515286,26.398394,...,,,,,,,,,,
01BR010-T,24.328566,24.399937,24.455721,23.044677,20.980193,,,,20.429545,26.360272,...,,,,,,,,,,
01BR015-T,24.606059,24.330728,24.259055,22.052379,19.149462,,,,18.981616,26.545160,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TumorOnlyIR01-T,22.770989,23.859305,21.765639,22.120720,20.364479,19.257281,,,22.949869,25.466323,...,,,,,,,,,,
TumorOnlyIR03-T,22.938051,23.648094,21.663581,22.077940,20.358023,19.586598,,,22.951027,25.483131,...,,,,,,,,,,
TumorOnlyIR14-T,22.828392,23.731050,21.760472,21.959052,20.375218,19.630542,,,23.019681,25.425778,...,,,,,,,,,,
TumorOnlyIR21-T,22.813782,23.775605,21.644840,21.982435,20.291244,19.633217,,,23.008923,25.405379,...,,,,,,,,,,


In [6]:
combined['clinical_is_tumor'] = [1 if x[-2:]=='-T' else 0 for x in combined.index]
combined

Unnamed: 0_level_0,AAAS_proteome,AAK1_proteome,AATF_proteome,ABCA1_proteome,ABCA2_proteome,ABCB1_proteome,ABCB11_proteome,ABCC2_proteome,ABCC3_proteome,ABCE1_proteome,...,WIZ_phospho.61,WIZ_phospho.62,WIZ_phospho.63,WIZ_phospho.64,WIZ_phospho.65,WIZ_phospho.66,TSGA10_phospho,TSGA10_phospho.1,SVIL_phospho.126,clinical_is_tumor
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01BR001-T,24.366478,24.512335,25.009167,22.761127,18.747711,,,,19.982433,26.985937,...,,,,,,,,,,1
01BR008-T,24.394392,24.336012,25.198883,22.507570,19.817547,,,,19.608661,26.724760,...,,,,,,,,,,1
01BR009-T,24.126891,24.312230,24.822895,23.841045,20.841777,,,,19.515286,26.398394,...,,,,,,,,,,1
01BR010-T,24.328566,24.399937,24.455721,23.044677,20.980193,,,,20.429545,26.360272,...,,,,,,,,,,1
01BR015-T,24.606059,24.330728,24.259055,22.052379,19.149462,,,,18.981616,26.545160,...,,,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TumorOnlyIR01-T,22.770989,23.859305,21.765639,22.120720,20.364479,19.257281,,,22.949869,25.466323,...,,,,,,,,,,1
TumorOnlyIR03-T,22.938051,23.648094,21.663581,22.077940,20.358023,19.586598,,,22.951027,25.483131,...,,,,,,,,,,1
TumorOnlyIR14-T,22.828392,23.731050,21.760472,21.959052,20.375218,19.630542,,,23.019681,25.425778,...,,,,,,,,,,1
TumorOnlyIR21-T,22.813782,23.775605,21.644840,21.982435,20.291244,19.633217,,,23.008923,25.405379,...,,,,,,,,,,1


In [16]:
combined['case_id'] = [x[:-2] if x[-2]=='-' else x for x in combined.index]
combined = pd.merge(combined, yc, left_on='case_id', right_index=True, how='left')
combined = combined[[c for c in combined.columns if c!='case_id']]
combined

Unnamed: 0_level_0,AAAS_proteome,AAK1_proteome,AATF_proteome,ABCA1_proteome,ABCA2_proteome,ABCB1_proteome,ABCB11_proteome,ABCC2_proteome,ABCC3_proteome,ABCE1_proteome,...,TSGA10_phospho.1,SVIL_phospho.126,clinical_is_tumor,clinical_age,clinical_is_female,clinical_predicted_ancestry_is_AFR,clinical_predicted_ancestry_is_AMR,clinical_predicted_ancestry_is_EAS,clinical_predicted_ancestry_is_EUR,clinical_predicted_ancestry_is_SAS
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01BR001-T,24.366478,24.512335,25.009167,22.761127,18.747711,,,,19.982433,26.985937,...,,,1,55,1.0,1.0,0.0,0.0,0.0,0.0
01BR008-T,24.394392,24.336012,25.198883,22.507570,19.817547,,,,19.608661,26.724760,...,,,1,,,,,,,
01BR009-T,24.126891,24.312230,24.822895,23.841045,20.841777,,,,19.515286,26.398394,...,,,1,,,,,,,
01BR010-T,24.328566,24.399937,24.455721,23.044677,20.980193,,,,20.429545,26.360272,...,,,1,,,,,,,
01BR015-T,24.606059,24.330728,24.259055,22.052379,19.149462,,,,18.981616,26.545160,...,,,1,35,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TumorOnlyIR01-T,22.770989,23.859305,21.765639,22.120720,20.364479,19.257281,,,22.949869,25.466323,...,,,1,,,,,,,
TumorOnlyIR03-T,22.938051,23.648094,21.663581,22.077940,20.358023,19.586598,,,22.951027,25.483131,...,,,1,,,,,,,
TumorOnlyIR14-T,22.828392,23.731050,21.760472,21.959052,20.375218,19.630542,,,23.019681,25.425778,...,,,1,,,,,,,
TumorOnlyIR21-T,22.813782,23.775605,21.644840,21.982435,20.291244,19.633217,,,23.008923,25.405379,...,,,1,,,,,,,


In [19]:
from collections import Counter
Counter(combined['disease'])

Counter({'BR': 120,
         'CO': 96,
         'OV': 82,
         nan: 47,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         'LUAD': 111,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
         nan: 1,
 

In [12]:
combined.to_csv('../data/aggregated_10072021.txt.gz', sep='\t', )

In [1]:
combined

NameError: name 'combined' is not defined

In [4]:
combined = pd.read_csv('../data/aggregated_08012021.txt.gz', sep='\t')

Columns (14819,158933) have mixed types.Specify dtype option on import or set low_memory=False.


In [5]:
combined = combined.set_index('Unnamed: 0')
combined

Unnamed: 0_level_0,AAAS_proteome,AAK1_proteome,AATF_proteome,ABCA1_proteome,ABCA2_proteome,ABCB1_proteome,ABCB11_proteome,ABCC2_proteome,ABCC3_proteome,ABCE1_proteome,...,TSGA10_phospho.1,SVIL_phospho.126,clinical_is_tumor,clinical_age,clinical_is_female,clinical_predicted_ancestry_is_AFR,clinical_predicted_ancestry_is_AMR,clinical_predicted_ancestry_is_EAS,clinical_predicted_ancestry_is_EUR,clinical_predicted_ancestry_is_SAS
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01BR001-T,24.366478,24.512335,25.009167,22.761127,18.747711,,,,19.982433,26.985937,...,,,1,55.0,1.0,1.0,0.0,0.0,0.0,0.0
01BR008-T,24.394392,24.336012,25.198883,22.507570,19.817547,,,,19.608661,26.724760,...,,,1,,,,,,,
01BR009-T,24.126891,24.312230,24.822895,23.841045,20.841777,,,,19.515286,26.398394,...,,,1,,,,,,,
01BR010-T,24.328566,24.399937,24.455721,23.044677,20.980193,,,,20.429545,26.360272,...,,,1,,,,,,,
01BR015-T,24.606059,24.330728,24.259055,22.052379,19.149462,,,,18.981616,26.545160,...,,,1,35.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TumorOnlyIR01-T,22.770989,23.859305,21.765639,22.120720,20.364479,19.257281,,,22.949869,25.466323,...,,,1,,,,,,,
TumorOnlyIR03-T,22.938051,23.648094,21.663581,22.077940,20.358023,19.586598,,,22.951027,25.483131,...,,,1,,,,,,,
TumorOnlyIR14-T,22.828392,23.731050,21.760472,21.959052,20.375218,19.630542,,,23.019681,25.425778,...,,,1,,,,,,,
TumorOnlyIR21-T,22.813782,23.775605,21.644840,21.982435,20.291244,19.633217,,,23.008923,25.405379,...,,,1,,,,,,,


In [25]:
[c for c in combined.columns if 'methylation' in c]

['methylation_subtype_1',
 'methylation_subtype_2',
 'methylation_subtype_3',
 'methylation_subtype_4',
 'methylation_subtype_5',
 'methylation_subtype_6']

In [31]:
f = combined[[c for c in combined.columns if 'methylation' in c]]
f

Unnamed: 0_level_0,methylation_subtype_1,methylation_subtype_2,methylation_subtype_3,methylation_subtype_4,methylation_subtype_5,methylation_subtype_6
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
01BR001-T,,,,,,
01BR008-T,,,,,,
01BR009-T,,,,,,
01BR010-T,,,,,,
01BR015-T,,,,,,
...,...,...,...,...,...,...
TumorOnlyIR01-T,,,,,,
TumorOnlyIR03-T,,,,,,
TumorOnlyIR14-T,,,,,,
TumorOnlyIR21-T,,,,,,


In [33]:
f.loc['C3L-00001-T']

methylation_subtype_1    1.0
methylation_subtype_2    0.0
methylation_subtype_3    0.0
methylation_subtype_4    0.0
methylation_subtype_5    0.0
methylation_subtype_6    0.0
Name: C3L-00001-T, dtype: float64