In [13]:
import os
import re

import numpy as np
import pandas as pd
import shap
import xgboost as xgb
from sklearn.preprocessing import StandardScaler

import mgitools.os_helpers as os_helpers

In [2]:
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

In [3]:
# run this if you havent already
# !pip install git+https://github.com/estorrs/mgitools

In [11]:
def update_sample_id(s_id):
    if re.findall(r'[Cc]3[lLnN]', s_id):
        s_id = s_id.replace('.', '-')
        if s_id[-2] != '-':
            s_id += '-T'
            
    if re.findall(r'[XxCc][0-9][0-9].....', s_id):
        s_id = s_id[1:]
    
    if re.findall(r'[0-9][0-9][A-Za-z]{2}[0-9]{3}', s_id):
        if s_id[-2] != '-':
            s_id += '-T'
    
    return s_id

#### read in data

In [6]:
d = pd.read_csv('../data/new/199_driver_genes.txt', sep='\t')
d

Unnamed: 0,Gene,Tumor suppressor or oncogene prediction (by 20/20+)
0,PHF6,possible tsg
1,ABL1,
2,ALK,
3,AR,
4,ARAF,
...,...,...
183,KMT2A,tsg
184,KMT2B,tsg
185,MAX,oncogene
186,MED12,oncogene


In [7]:
# target_genes = ['PIK3CA', 'TP53', 'KRAS']
target_genes = sorted(set(d['Gene']))

###### protein pairs

In [8]:
pathways = pd.read_csv('../data/new/protein_pair_table_v2.txt', sep='\t')
pathways

Unnamed: 0,GENE,SUB_GENE,pair_pro,SUB_GENE.is_TF_downstream,SUB_GENE.is_TF_upstream,SUB_GENE.is_kinase_substrate,SUB_GENE.is_phosphatase_substrate,SUB_GENE.is_upstream_kinase,SUB_GENE.is_upstream_phosphatase,SUB_GENE.is_complex_partner
0,TP53,CDKN1A,TP53:CDKN1A,True,False,False,False,False,False,False
1,TP53,SIAH1,TP53:SIAH1,True,False,False,False,False,False,False
2,TP53,SFN,TP53:SFN,True,False,False,False,False,False,False
3,TP53,RPRM,TP53:RPRM,True,False,False,False,False,False,False
4,TP53,GADD45A,TP53:GADD45A,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
831929,SETD2,SETD2,SETD2:SETD2,False,False,False,False,False,False,False
831930,PUMA,PUMA,PUMA:PUMA,False,False,False,False,False,False,False
831931,NOXA,NOXA,NOXA:NOXA,False,False,False,False,False,False,False
831932,FOXR2,FOXR2,FOXR2:FOXR2,False,False,False,False,False,False,False


In [9]:
pathways[[True if g in target_genes else False
         for g in pathways['GENE']]]

Unnamed: 0,GENE,SUB_GENE,pair_pro,SUB_GENE.is_TF_downstream,SUB_GENE.is_TF_upstream,SUB_GENE.is_kinase_substrate,SUB_GENE.is_phosphatase_substrate,SUB_GENE.is_upstream_kinase,SUB_GENE.is_upstream_phosphatase,SUB_GENE.is_complex_partner
0,TP53,CDKN1A,TP53:CDKN1A,True,False,False,False,False,False,False
1,TP53,SIAH1,TP53:SIAH1,True,False,False,False,False,False,False
2,TP53,SFN,TP53:SFN,True,False,False,False,False,False,False
3,TP53,RPRM,TP53:RPRM,True,False,False,False,False,False,False
4,TP53,GADD45A,TP53:GADD45A,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
830817,IDH1,IDH1,IDH1:IDH1,False,False,False,False,False,False,False
831488,BCOR,BCOR,BCOR:BCOR,False,False,False,False,False,False,False
831552,ATRX,ATRX,ATRX:ATRX,False,False,False,False,False,False,False
831798,RQCD1,RQCD1,RQCD1:RQCD1,False,False,False,False,False,False,False


In [10]:
gene_to_subgenes = {t:[g for g in sorted(set(pathways[pathways['GENE']==t]['SUB_GENE'])) if g != t]
                    for t in target_genes}
target_genes[0], len(gene_to_subgenes[target_genes[0]])

('ABL1', 126)

###### proteome

In [57]:
proteome = pd.read_csv('../data/new/Proteome_Broad_updated_tumor_NAT_raw_gene_level.tsv',
                      sep='\t', index_col=0)
proteome.columns = [update_sample_id(x) for x in proteome.columns]
proteome = proteome.transpose()
proteome

external_gene_name,A1BG,A1CF,A2M,A2ML1,A4GALT,A4GNT,AAAS,AACS,AADAC,AADAT,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
CPT000814,-0.13177,,,2.17089,,,1.26679,,,,...,,1.23417,,,-1.93998,,0.91585,-0.22309,-0.47488,0.91585
CPT001846,1.35035,,,-3.73837,,,0.85472,,,,...,,-0.08602,,,-1.40906,,0.32086,2.19005,0.03004,0.16931
01BR001-T,1.89560,,,-1.88194,,,0.35140,,,,...,,-0.44120,1.33532,,,,0.63252,0.75160,-1.38217,0.06833
01BR008-T,-0.38333,,,1.39297,,,0.53624,,,,...,,0.77504,,,1.72184,,-0.14453,-0.12778,-0.03561,0.38961
01BR009-T,1.24708,,,2.60577,,,-0.27902,,,,...,,-0.39547,,,0.29115,,0.47797,0.21108,-0.16498,-0.20380
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-00729-N,3.12127,,,,,,-0.04845,,,,...,,0.80837,,,0.21676,,0.80582,-0.66557,0.57121,0.45391
C3N-00858-N,3.09051,,,-1.33380,,,0.02169,,,,...,,,,,-2.26204,,0.36869,0.26025,-0.19085,0.16483
C3N-00866-N,2.59323,,,-1.90617,2.54505,,0.38333,,,,...,,-0.55719,,,1.70298,,0.45036,0.56138,-0.10473,0.84626
C3N-01211-N,3.65774,,,,,,-0.98423,,,,...,,-1.02906,,,-0.73970,,0.64596,2.68370,0.38921,0.01426


In [58]:
# calc overlap
pool = set(proteome.columns)
len(pool.intersection(set(target_genes))), len(target_genes)

(182, 188)

In [59]:
genes = []
for target_gene, subgenes in gene_to_subgenes.items():
    genes += [target_gene]
    genes += subgenes
genes = sorted(set(genes))
proteome = proteome[[g for g in genes if g in proteome.columns]]
proteome.columns = [f'{c}_proteome' for c in proteome.columns]
proteome

Unnamed: 0,AAAS_proteome,AAK1_proteome,AATF_proteome,ABCA1_proteome,ABCA2_proteome,ABCB1_proteome,ABCB11_proteome,ABCC2_proteome,ABCC3_proteome,ABCE1_proteome,...,ZNRF3_proteome,ZRANB1_proteome,ZRSR2_proteome,ZSCAN25_proteome,ZSCAN32_proteome,ZW10_proteome,ZWILCH_proteome,ZWINT_proteome,ZXDC_proteome,ZYX_proteome
CPT000814,1.26679,-0.01044,1.59816,-1.84996,,,,,,0.49184,...,,,-0.76581,,,0.64970,,1.23417,-1.93998,-0.22309
CPT001846,0.85472,0.51201,0.51065,-1.52921,,,,,-1.96476,0.45740,...,,,-0.56526,,,0.72501,,-0.08602,-1.40906,2.19005
01BR001-T,0.35140,-0.00390,1.10105,0.98196,,,,,-3.91029,1.12252,...,,,0.16594,,,-0.19132,,-0.44120,,0.75160
01BR008-T,0.53624,-0.24927,1.54379,0.85463,,,,,-1.42230,0.51320,...,,,0.90072,,,0.19900,,0.77504,1.72184,-0.12778
01BR009-T,-0.27902,-0.33482,0.69390,0.78852,,,,,-1.49698,-0.39790,...,,,0.41731,,,0.17226,,-0.39547,0.29115,0.21108
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-00729-N,-0.04845,-0.12495,-1.04297,,,,,,-0.72167,-0.30091,...,,,1.77739,,,-0.32896,,0.80837,0.21676,-0.66557
C3N-00858-N,0.02169,0.84149,-0.28194,7.88569,,,,,-2.38566,-0.34917,...,,,-0.62461,,,-0.50749,,,-2.26204,0.26025
C3N-00866-N,0.38333,0.09636,-0.31211,,,,,,-0.82112,-0.23879,...,,,0.47968,,,-0.55300,,-0.55719,1.70298,0.56138
C3N-01211-N,-0.98423,0.87419,-0.34030,0.66226,,,,,,0.38309,...,,,0.05706,,,-0.41162,,-1.02906,-0.73970,2.68370


###### purity

In [20]:
purity = pd.read_csv('../data/new/CPTAC_pancan_RNA_tumor_purity_ESTIMATE.tsv.gz',
                    sep='\t')
purity['sample_id'] = purity['Sample_ID'].to_list()
purity = purity.set_index('sample_id')
purity = purity[['TumorPurity']]
purity

Unnamed: 0_level_0,TumorPurity
sample_id,Unnamed: 1_level_1
01BR001-T,0.816624
01BR008-T,0.510466
01BR009-T,0.556239
01BR010-T,0.747700
01BR015-T,0.649161
...,...
C3N-01520-T,0.869669
C3N-01521-T,0.855558
C3N-01537-T,0.647919
C3N-01802-T,0.659522


###### somatic mutation

In [24]:
df = pd.read_csv('../data/new/PanCan_Union_Maf_Broad_WashU_v1.1.maf', sep='\t')
df

Columns (23,61,74,76,80,81,89,119) have mixed types.Specify dtype option on import or set low_memory=False.


Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,Reference_Allele,...,HGNC_UniProt_ID(supplied_by_UniProt),HGNC_Ensembl_ID(supplied_by_Ensembl),HGNC_UCSC_ID(supplied_by_UCSC),Oreganno_Build,Simple_Uniprot_alt_uniprot_accessions,dbSNP_TOPMED,HGNC_Entrez_Gene_ID(supplied_by_NCBI),COHORT,getz,washu
0,AGRN,375790.0,hg38,chr1,1041579,1041579,+,Missense_Mutation,SNP,C,...,O00468,ENSG00000188157,uc001ack.3,hg38,Q5SVA1|Q5SVA2|Q60FE1|Q7KYS8|Q8N4J5|Q96IC1|Q9BTD4,"0.99995221712538226,.,0.00004778287461773",375790.0,COAD,True,
1,CHD5,26038.0,hg38,chr1,6128146,6128146,+,Missense_Mutation,SNP,C,...,Q8TDI0,ENSG00000116254,uc001amb.3,hg38,A8KAP8|A8MQ44|D3DSH9|O60740,,26038.0,COAD,True,True
2,EPHA8,2046.0,hg38,chr1,22576725,22576725,+,Missense_Mutation,SNP,C,...,P29322,ENSG00000070886,uc001bfx.2,,Q6IN80|Q8IUX6|Q9NUA9|Q9P269,"0.99999203618756371,0.00000796381243628",2046.0,COAD,True,
3,ARID1A,8289.0,hg38,chr1,26775121,26775121,+,Frame_Shift_Del,DEL,C,...,O14497,ENSG00000117713,uc001bmv.2,,D3DPL1|Q53FK9|Q5T0W1|Q5T0W2|Q5T0W3|Q8NFD6|Q96T...,,8289.0,COAD,,True
4,CSMD2,114784.0,hg38,chr1,33571534,33571534,+,Missense_Mutation,SNP,C,...,Q7Z408,ENSG00000121904,uc001bxm.2,,B1AM50|E7EUA6|Q53TY4|Q5VT59|Q8N963|Q96Q03|Q9H4...,"0.99997610856269113,0.00002389143730886",114784.0,COAD,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345647,NCOR1,9611.0,hg38,chr17,16061829,16061829,+,Missense_Mutation,SNP,G,...,O75376,ENSG00000141027,uc002gpo.4,,B3DLF8|E9PGV6|Q86YY0|Q9UPV5|Q9UQ18,"0.99992036187563710,0.00007963812436289",9611.0,PDAC,True,True
345648,KATNAL2,83473.0,hg38,chr18,47033638,47033638,+,Intron,SNP,C,...,Q8IYT4,ENSG00000167216,uc060ows.1,,,,83473.0,PDAC,True,True
345649,DNMT1,1786.0,hg38,chr19,10137932,10137932,+,Missense_Mutation,SNP,G,...,P26358,ENSG00000130816,uc002mng.4,,A0AV63|B7ZLW6|Q9UHG5|Q9ULA2|Q9UMZ6,"0.99994425331294597,0.00005574668705402",1786.0,PDAC,True,True
345650,SCAF1,58506.0,hg38,chr19,49651431,49651431,+,Missense_Mutation,SNP,A,...,Q9H7N4,ENSG00000126461,uc002poq.4,hg38,Q7Z5V7|Q8WVA1|Q9NR59,"0.99998407237512742,0.00001592762487257",58506.0,PDAC,True,


In [26]:
df

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,Reference_Allele,...,HGNC_UCSC_ID(supplied_by_UCSC),Oreganno_Build,Simple_Uniprot_alt_uniprot_accessions,dbSNP_TOPMED,HGNC_Entrez_Gene_ID(supplied_by_NCBI),COHORT,getz,washu,sample_id,gene
2,EPHA8,2046.0,hg38,chr1,22576725,22576725,+,Missense_Mutation,SNP,C,...,uc001bfx.2,,Q6IN80|Q8IUX6|Q9NUA9|Q9P269,"0.99999203618756371,0.00000796381243628",2046.0,COAD,True,,09CO013-T,EPHA8
3,ARID1A,8289.0,hg38,chr1,26775121,26775121,+,Frame_Shift_Del,DEL,C,...,uc001bmv.2,,D3DPL1|Q53FK9|Q5T0W1|Q5T0W2|Q5T0W3|Q8NFD6|Q96T...,,8289.0,COAD,,True,09CO013-T,ARID1A
7,MAST2,23139.0,hg38,chr1,46031468,46031468,+,Missense_Mutation,SNP,C,...,uc001cov.3,,,"0.99997610856269113,0.00002389143730886",23139.0,COAD,True,,09CO013-T,MAST2
15,UHMK1,127933.0,hg38,chr1,162498240,162498241,+,Missense_Mutation,DNP,GG,...,uc001gcc.3,hg38,A8K8K4|G3V1M1|Q96C22,,127933.0,COAD,True,True,09CO013-T,UHMK1
16,F5,2153.0,hg38,chr1,169523892,169523892,+,Missense_Mutation,SNP,G,...,uc001ggg.2,,A8K6E8|Q14285|Q2EHR5|Q5R346|Q5R347|Q6UPU6|Q8WWQ6,,2153.0,COAD,True,True,09CO013-T,F5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345644,KLC2,64837.0,hg38,chr11,66264199,66264199,+,Missense_Mutation,SNP,G,...,uc010rov.2,,A8MXL7|B2RDY4|Q9H9C8|Q9HA20,,64837.0,PDAC,True,True,C3N-02295-T,KLC2
345645,KRAS,3845.0,hg38,chr12,25245350,25245350,+,Missense_Mutation,SNP,C,...,uc001rgp.3,,A8K8Z5|B0LPF9|P01118|Q96D10,"0.99998407237512742,.,.,0.00001592762487257",3845.0,PDAC,True,True,C3N-02295-T,KRAS
345646,TP53,7157.0,hg38,chr17,7675088,7675088,+,Missense_Mutation,SNP,C,...,uc060aur.1,,Q15086|Q15087|Q15088|Q16535|Q16807|Q16808|Q168...,"0.99999203618756371,.,0.00000796381243628",7157.0,PDAC,,True,C3N-02295-T,TP53
345647,NCOR1,9611.0,hg38,chr17,16061829,16061829,+,Missense_Mutation,SNP,G,...,uc002gpo.4,,B3DLF8|E9PGV6|Q86YY0|Q9UPV5|Q9UQ18,"0.99992036187563710,0.00007963812436289",9611.0,PDAC,True,True,C3N-02295-T,NCOR1


In [32]:
for c in df.columns:
    print(c, df.loc[2, c])

Hugo_Symbol EPHA8
Entrez_Gene_Id 2046.0
NCBI_Build hg38
Chromosome chr1
Start_Position 22576725
End_Position 22576725
Strand +
Variant_Classification Missense_Mutation
Variant_Type SNP
Reference_Allele C
Tumor_Seq_Allele1 C
Tumor_Seq_Allele2 T
Tumor_Sample_Barcode 09CO013_T
Matched_Norm_Sample_Barcode 09CO013_N
Validation_Method __UNKNOWN__
Score nan
BAM_File nan
Tumor_Sample_UUID 09CO013_T
Matched_Norm_Sample_UUID 09CO013_N
Genome_Change g.chr1:22576725C>T
Annotation_Transcript ENST00000166244.8
Transcript_Strand +
Transcript_Exon 3.0
Transcript_Position 815
cDNA_Change c.668C>T
Codon_Change c.(667-669)tCg>tTg
Protein_Change p.S223L
Other_Transcripts EPHA8_ENST00000374644.8_Missense_Mutation_p.S223L
Refseq_mRNA_Id NM_020526.3
Refseq_prot_Id NP_065387
SwissProt_acc_Id P29322
SwissProt_entry_Id EPHA8_HUMAN
Description EPH receptor A8
GO_Biological_Process axon guidance (GO:0007411)|cell adhesion (GO:0007155)|ephrin receptor signaling pathway (GO:0048013)|neuron projection development (G

In [34]:
genes = list(target_genes)
for gs in gene_to_subgenes.values(): genes += gs
genes = sorted(set(genes))
keep = ['gene', 'sample_id', 'Chromosome', 'Start_Position', 'End_Position', 'Protein_Change', 'Variant_Classification']

df = df.loc[[True if g in genes else False for g in df['Hugo_Symbol']]]
df['sample_id'] = [x.replace('_', '-') for x in df['Tumor_Sample_Barcode']]
df['gene'] = df['Hugo_Symbol'].to_list()
df = df[keep]

mutations = df
mutations

Unnamed: 0,gene,sample_id,Chromosome,Start_Position,End_Position,Protein_Change,Variant_Classification
2,EPHA8,09CO013-T,chr1,22576725,22576725,p.S223L,Missense_Mutation
3,ARID1A,09CO013-T,chr1,26775121,26775121,p.M1417fs,Frame_Shift_Del
7,MAST2,09CO013-T,chr1,46031468,46031468,p.R1024C,Missense_Mutation
15,UHMK1,09CO013-T,chr1,162498240,162498241,p.E81K,Missense_Mutation
16,F5,09CO013-T,chr1,169523892,169523892,p.P1939H,Missense_Mutation
...,...,...,...,...,...,...,...
345644,KLC2,C3N-02295-T,chr11,66264199,66264199,p.A366T,Missense_Mutation
345645,KRAS,C3N-02295-T,chr12,25245350,25245350,p.G12V,Missense_Mutation
345646,TP53,C3N-02295-T,chr17,7675088,7675088,p.R175H,Missense_Mutation
345647,NCOR1,C3N-02295-T,chr17,16061829,16061829,p.A1818V,Missense_Mutation


In [39]:
from collections import Counter
Counter(mutations['Variant_Classification']).most_common()

[('Missense_Mutation', 48324),
 ('Silent', 17894),
 ('Intron', 5448),
 ('Nonsense_Mutation', 4391),
 ('Frame_Shift_Del', 4194),
 ('Splice_Site', 2925),
 ("3'UTR", 1394),
 ('Frame_Shift_Ins', 1219),
 ("5'UTR", 683),
 ('In_Frame_Del', 536),
 ("5'Flank", 366),
 ('RNA', 338),
 ('Nonstop_Mutation', 71),
 ('In_Frame_Ins', 64),
 ('START_CODON_SNP', 61),
 ('DE_NOVO_START_OUT_FRAME', 22),
 ('DE_NOVO_START_IN_FRAME', 9),
 ('Translation_Start_Site', 8),
 ('START_CODON_INS', 4),
 ('COULD_NOT_DETERMINE', 2)]

In [35]:
def is_truncating(v):
    if 'Shift' in v: return True
    if 'Nonsense_Mutation'==v: return True
    return False

In [36]:
# do mutation type columns
types = sorted(set(mutations['Variant_Classification']))
for gene in target_genes:
    print(gene)
    for m in types:
        mutations[f'{gene}_mutation_is_{m}'] = [1 if vc==m and g==gene else 0
                        for vc, g in zip(mutations['Variant_Classification'], mutations['gene'])]
    mutations[f'{gene}_has_truncating_mutation'] = [1 if is_truncating(vc) and g==gene else 0
                    for vc, g in zip(mutations['Variant_Classification'], mutations['gene'])]
    mutations[f'{gene}_has_nonsilent_mutation'] = [1 if vc!='Silent' and g==gene else 0
                    for vc, g in zip(mutations['Variant_Classification'], mutations['gene'])]
mutations

ABL1



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


AJUBA
AKT1
ALB
ALK
AMER1
APC
AR
ARAF
ARHGAP35
ARID1A
ARID2
ASXL1
ASXL2
ATM
ATRX
AXIN1
AXIN2
B2M
BAP1
BCL2
BCOR
BRAF
BRCA1
BRD7
CASP8
CCND1
CD79B
CDH1
CDK12
CDK4
CDKN1A
CDKN1B
CDKN2A
CDKN2C
CEBPA
CHD4
CIC
CNBD1
CREBBP
CSDE1
CTCF
CTNNB1
CTNND1
CUL1
CUL3
CYSLTR2
DNMT3A
EEF1A1
EGFR
EGR3
EIF1AX
EP300
EPAS1
EPHA2
EPHA3
ERBB2
ERCC2
ESR1
EZH2
FAT1
FBXW7
FGFR1
FGFR2
FGFR3
FOXA1
FOXA2
FUBP1
GATA3
GNA11
GNAQ
GNAS
GPS2
GTF2I
HGF
HIST1H1C
HLA-A
HLA-B
HRAS
HUWE1
IDH1
IDH2
IL7R
IRF6
JAK2
JAK3
KANSL1
KDM5C
KDM6A
KEAP1
KIF1A
KIT
KLF5
KMT2A
KMT2B
KMT2C
KMT2D
KRAS
LATS1
LATS2
MACF1
MAP2K1
MAP2K4
MAP3K1
MAPK1
MAX
MED12
MEN1
MET
MGA
MGMT
MLH1
MSH2
MSH3
MTOR
MYD88
MYH9
NCOR1
NF1
NF2
NFE2L2
NIPBL
NOTCH1
NOTCH2
NPM1
NRAS
NSD1
PAX5
PBRM1
PCBP1
PHF6
PIK3CA
PIK3CB
PIK3CG
PIK3R1
PIK3R2
PIM1
PMS1
PMS2
POLE
PPP2R1A
PTEN
PTPRC
RAC1
RAD21
RAF1
RARA
RASA1
RB1
RBM10
RET
RHEB
RHOA
RNF43
RPS6KA3
RQCD1
RUNX1
RXRA
SCAF4
SETBP1
SETD2
SF3B1
SMAD4
SMARCA4
SMC1A
SMC3
SOS1
SOX9
SPOP
STAG2
STK11
TAF1
TBL1XR1
TBX3
TCF12
TCF7L2
TE

Unnamed: 0,gene,sample_id,Chromosome,Start_Position,End_Position,Protein_Change,Variant_Classification,ABL1_mutation_is_3'UTR,ABL1_mutation_is_5'Flank,ABL1_mutation_is_5'UTR,...,ZNF750_mutation_is_Nonsense_Mutation,ZNF750_mutation_is_Nonstop_Mutation,ZNF750_mutation_is_RNA,ZNF750_mutation_is_START_CODON_INS,ZNF750_mutation_is_START_CODON_SNP,ZNF750_mutation_is_Silent,ZNF750_mutation_is_Splice_Site,ZNF750_mutation_is_Translation_Start_Site,ZNF750_has_truncating_mutation,ZNF750_has_nonsilent_mutation
2,EPHA8,09CO013-T,chr1,22576725,22576725,p.S223L,Missense_Mutation,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ARID1A,09CO013-T,chr1,26775121,26775121,p.M1417fs,Frame_Shift_Del,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,MAST2,09CO013-T,chr1,46031468,46031468,p.R1024C,Missense_Mutation,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,UHMK1,09CO013-T,chr1,162498240,162498241,p.E81K,Missense_Mutation,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16,F5,09CO013-T,chr1,169523892,169523892,p.P1939H,Missense_Mutation,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345644,KLC2,C3N-02295-T,chr11,66264199,66264199,p.A366T,Missense_Mutation,0,0,0,...,0,0,0,0,0,0,0,0,0,0
345645,KRAS,C3N-02295-T,chr12,25245350,25245350,p.G12V,Missense_Mutation,0,0,0,...,0,0,0,0,0,0,0,0,0,0
345646,TP53,C3N-02295-T,chr17,7675088,7675088,p.R175H,Missense_Mutation,0,0,0,...,0,0,0,0,0,0,0,0,0,0
345647,NCOR1,C3N-02295-T,chr17,16061829,16061829,p.A1818V,Missense_Mutation,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
sample_ids = sorted(set(mutations['sample_id']))
new = []
for s in sample_ids:
    filtered = mutations[mutations['sample_id']==s]
    base = filtered.iloc[0, :7].to_list()
#     base = filtered[['sample_id', 'disease']].iloc[0]
    if filtered.shape[0]<=1:
        tail = filtered.iloc[0, 7:].to_list()
    else:
        tail = list(np.max(filtered.iloc[:, 7:].values, axis=0).flatten())
    ls = base + tail
    new.append(ls)
consolidated = pd.DataFrame(data=new, index=sample_ids, columns=mutations.columns)
consolidated = consolidated.set_index('sample_id')
consolidated_filtered = consolidated.iloc[:, 7:]
somatic_mutations = consolidated_filtered
somatic_mutations

Unnamed: 0_level_0,ABL1_mutation_is_5'Flank,ABL1_mutation_is_5'UTR,ABL1_mutation_is_COULD_NOT_DETERMINE,ABL1_mutation_is_DE_NOVO_START_IN_FRAME,ABL1_mutation_is_DE_NOVO_START_OUT_FRAME,ABL1_mutation_is_Frame_Shift_Del,ABL1_mutation_is_Frame_Shift_Ins,ABL1_mutation_is_In_Frame_Del,ABL1_mutation_is_In_Frame_Ins,ABL1_mutation_is_Intron,...,ZNF750_mutation_is_Nonsense_Mutation,ZNF750_mutation_is_Nonstop_Mutation,ZNF750_mutation_is_RNA,ZNF750_mutation_is_START_CODON_INS,ZNF750_mutation_is_START_CODON_SNP,ZNF750_mutation_is_Silent,ZNF750_mutation_is_Splice_Site,ZNF750_mutation_is_Translation_Start_Site,ZNF750_has_truncating_mutation,ZNF750_has_nonsilent_mutation
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01BR001-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01BR008-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01BR009-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01BR010-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01BR015-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-04280-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3N-04282-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3N-04283-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3N-04284-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


###### somatic cnv

In [41]:
fps = sorted(os_helpers.listfiles('../data/new/WashU_pipeline/', regex=r'WashU_pipeline_wxs/'))
fps = [fp for fp in fps if 'gene_level' in fp]
fps

['../data/new/WashU_pipeline/BR/WashU_pipeline_wxs/BR.gene_level.from_seg.filtered.tsv',
 '../data/new/WashU_pipeline/CO/WashU_pipeline_wxs/CO.gene_level.from_seg.filtered.tsv',
 '../data/new/WashU_pipeline/GBM/WashU_pipeline_wxs/GBM.gene_level.from_seg.filtered.tsv',
 '../data/new/WashU_pipeline/HNSCC/WashU_pipeline_wxs/HNSCC.gene_level.from_seg.filtered.tsv',
 '../data/new/WashU_pipeline/LSCC/WashU_pipeline_wxs/LSCC.gene_level.from_seg.filtered.tsv',
 '../data/new/WashU_pipeline/LUAD/WashU_pipeline_wxs/LUAD.gene_level.from_seg.filtered.tsv',
 '../data/new/WashU_pipeline/OV/WashU_pipeline_wxs/OV.gene_level.from_seg.filtered.tsv',
 '../data/new/WashU_pipeline/PDA/WashU_pipeline_wxs/PDA.gene_level.from_seg.filtered.tsv',
 '../data/new/WashU_pipeline/UCEC/WashU_pipeline_wxs/UCEC.gene_level.from_seg.filtered.tsv',
 '../data/new/WashU_pipeline/ccRCC/WashU_pipeline_wxs/ccRCC.gene_level.from_seg.filtered.tsv']

In [43]:
cnv = None
gene_order = list(set(pd.read_csv(fps[0], sep='\t')['Gene']).intersection(*[set(pd.read_csv(fp, sep='\t')['Gene'])
                                                                           for fp in fps[1:]]))
for fp in fps:
    df = pd.read_csv(fp,
                sep='\t')
    df = df.set_index('Gene')
    df = df.transpose()[gene_order]
    df['disease'] = fp.split('/')[-3]
    
    if cnv is None:
        cnv = df
    else:
        cnv = pd.concat((cnv, df), axis=0)
cnv

Gene,GABPA,USP31,WFIKKN1,OR5M10,ZNF551,ADAMTS3,AHCYL1,PHPT1,HOMER3,KCNJ4,...,PDCD4,C19orf43,FAM118A,CDC37L1,MRPS31,PDK4,ZNF837,SUGP2,ZNF205,disease
01BR001,0.64656,-0.40657,-0.19088,0.01914,-0.07265,0.40944,-0.14749,0.01670,-0.04602,-0.02041,...,-0.00966,-0.04602,-0.01887,-0.05369,0.01497,0.04711,-0.07265,-0.04602,-0.19088,BR
01BR008,-0.00672,-0.32181,-0.16619,-0.04312,-0.06644,-0.06661,0.08417,0.01058,-0.06230,-0.07571,...,0.11550,-0.06230,-0.07571,0.15075,-0.06048,-0.00576,-0.06644,-0.06230,-0.16619,BR
01BR009,0.01965,-0.38928,-0.35388,-0.15924,0.28468,0.31188,-0.15498,-0.19639,0.27581,0.04398,...,-0.14743,0.26883,0.04398,-0.09215,0.02417,0.67516,0.28468,0.27581,-0.38928,BR
01BR010,0.12614,-0.14335,-0.04794,0.03900,0.00370,0.09144,0.03008,0.08945,0.00370,-0.00940,...,0.01446,0.00370,-0.00940,-0.19650,0.17361,-0.03744,0.00370,0.00370,-0.04794,BR
01BR015,-0.12758,0.26465,0.26465,0.12214,0.09610,0.09826,0.39166,0.09958,-0.25756,-0.24742,...,0.13746,-0.21085,-0.24742,-0.11742,0.36613,-0.28490,0.09610,-0.25756,0.26465,BR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01646,0.03746,0.04099,0.04099,0.04100,0.02313,0.01830,0.00646,-0.32422,0.02994,0.04504,...,0.04368,0.02994,0.04504,-0.32422,0.03630,0.03709,0.02313,0.02994,0.04099,ccRCC
C3N-01648,-0.01504,-0.02305,-0.02305,-0.02360,-0.04038,-0.03739,-0.02997,-0.02193,-0.04038,-0.01679,...,-0.02243,-0.04038,-0.01679,-0.02193,-0.02469,-0.03307,-0.04038,-0.04038,-0.02305,ccRCC
C3N-01649,0.01470,0.01164,0.01164,0.01643,0.02214,0.01663,0.01854,0.01562,0.00896,0.00170,...,0.01736,0.00896,0.00170,0.01562,0.02005,0.32882,0.02214,0.00896,0.01164,ccRCC
C3N-01651,-0.06616,0.14783,0.14783,-0.06331,0.11945,-0.08853,-0.06828,-0.07086,0.13713,-0.05893,...,-0.31422,0.13713,-0.05893,-0.07086,-0.32576,0.14662,0.11945,0.13713,0.14783,ccRCC


In [44]:
cnv.index = [update_sample_id(x) for x in cnv.index]
cnv

Gene,GABPA,USP31,WFIKKN1,OR5M10,ZNF551,ADAMTS3,AHCYL1,PHPT1,HOMER3,KCNJ4,...,PDCD4,C19orf43,FAM118A,CDC37L1,MRPS31,PDK4,ZNF837,SUGP2,ZNF205,disease
01BR001-T,0.64656,-0.40657,-0.19088,0.01914,-0.07265,0.40944,-0.14749,0.01670,-0.04602,-0.02041,...,-0.00966,-0.04602,-0.01887,-0.05369,0.01497,0.04711,-0.07265,-0.04602,-0.19088,BR
01BR008-T,-0.00672,-0.32181,-0.16619,-0.04312,-0.06644,-0.06661,0.08417,0.01058,-0.06230,-0.07571,...,0.11550,-0.06230,-0.07571,0.15075,-0.06048,-0.00576,-0.06644,-0.06230,-0.16619,BR
01BR009-T,0.01965,-0.38928,-0.35388,-0.15924,0.28468,0.31188,-0.15498,-0.19639,0.27581,0.04398,...,-0.14743,0.26883,0.04398,-0.09215,0.02417,0.67516,0.28468,0.27581,-0.38928,BR
01BR010-T,0.12614,-0.14335,-0.04794,0.03900,0.00370,0.09144,0.03008,0.08945,0.00370,-0.00940,...,0.01446,0.00370,-0.00940,-0.19650,0.17361,-0.03744,0.00370,0.00370,-0.04794,BR
01BR015-T,-0.12758,0.26465,0.26465,0.12214,0.09610,0.09826,0.39166,0.09958,-0.25756,-0.24742,...,0.13746,-0.21085,-0.24742,-0.11742,0.36613,-0.28490,0.09610,-0.25756,0.26465,BR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01646-T,0.03746,0.04099,0.04099,0.04100,0.02313,0.01830,0.00646,-0.32422,0.02994,0.04504,...,0.04368,0.02994,0.04504,-0.32422,0.03630,0.03709,0.02313,0.02994,0.04099,ccRCC
C3N-01648-T,-0.01504,-0.02305,-0.02305,-0.02360,-0.04038,-0.03739,-0.02997,-0.02193,-0.04038,-0.01679,...,-0.02243,-0.04038,-0.01679,-0.02193,-0.02469,-0.03307,-0.04038,-0.04038,-0.02305,ccRCC
C3N-01649-T,0.01470,0.01164,0.01164,0.01643,0.02214,0.01663,0.01854,0.01562,0.00896,0.00170,...,0.01736,0.00896,0.00170,0.01562,0.02005,0.32882,0.02214,0.00896,0.01164,ccRCC
C3N-01651-T,-0.06616,0.14783,0.14783,-0.06331,0.11945,-0.08853,-0.06828,-0.07086,0.13713,-0.05893,...,-0.31422,0.13713,-0.05893,-0.07086,-0.32576,0.14662,0.11945,0.13713,0.14783,ccRCC


In [45]:
include = [g for g in target_genes]
for gs in gene_to_subgenes.values(): include += gs
include = sorted(set(include))
include.append('disease')

cnv = cnv[[g for g in include if g in cnv.columns]]
cnv.columns = [f'{c}_cnv' if c!='disease' else c for c in cnv.columns]
cnv

Unnamed: 0,AAAS_cnv,AAK1_cnv,AATF_cnv,ABCA1_cnv,ABCA2_cnv,ABCB1_cnv,ABCB11_cnv,ABCC2_cnv,ABCC3_cnv,ABCE1_cnv,...,ZRANB1_cnv,ZSCAN10_cnv,ZSCAN25_cnv,ZSCAN32_cnv,ZW10_cnv,ZWILCH_cnv,ZWINT_cnv,ZXDC_cnv,ZYX_cnv,disease
01BR001-T,-0.00563,0.11436,-0.10931,-0.05580,0.01670,0.04711,0.09183,-0.00966,-0.06617,0.40944,...,-0.00966,-0.19088,0.04711,-0.19088,0.01914,-0.26696,-0.00966,-0.06602,0.05187,BR
01BR008-T,-0.02567,0.03183,-0.10262,-0.06820,0.01058,-0.00576,-0.05192,0.11550,-0.10262,-0.06661,...,0.11550,-0.16619,-0.00576,-0.16619,-0.04312,0.00988,0.11550,-0.09075,-0.00576,BR
01BR009-T,-0.22827,0.13819,-0.29783,-0.19639,-0.19639,0.16028,0.21653,-0.14743,-0.29783,-0.18501,...,0.32349,-0.38928,0.67516,-0.38928,-0.15614,-0.15576,-0.14743,0.26289,0.06942,BR
01BR010-T,-0.00332,0.14108,-0.20200,0.08945,0.08945,-0.03744,0.14108,0.01446,-0.01515,0.09144,...,0.00058,-0.04794,-0.03744,-0.04794,0.03900,-0.18501,-0.14666,-0.00744,-0.03744,BR
01BR015-T,0.12874,-0.23580,-0.26588,-0.11742,0.09958,-0.28490,-0.23689,0.13746,-0.26588,0.09826,...,0.13746,0.26465,-0.28490,0.26465,-0.20074,-0.23970,0.13746,0.09962,-0.27216,BR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01646-T,0.07525,0.05431,0.03650,-0.32422,-0.32422,0.03709,0.05653,0.04368,0.03650,0.02862,...,0.04368,0.04099,0.03709,0.04099,0.04100,0.04141,0.04368,0.30051,0.03709,ccRCC
C3N-01648-T,-0.01736,-0.00540,-0.02973,-0.02193,-0.02193,-0.03307,-0.00540,-0.02243,-0.02973,-0.03739,...,-0.02243,-0.02305,-0.03307,-0.02305,-0.02360,-0.02031,-0.02243,-0.01847,-0.03307,ccRCC
C3N-01649-T,0.03014,0.02319,0.01504,0.01562,0.01562,0.32882,0.02319,0.01736,0.01504,0.01663,...,0.01736,0.01164,0.32882,0.01164,0.01643,0.02188,0.01736,0.01720,0.32882,ccRCC
C3N-01651-T,0.14936,0.16029,-0.05907,-0.07086,-0.07086,0.14662,0.51034,-0.31422,-0.05907,-0.08853,...,-0.31422,0.14783,0.14662,0.14783,-0.06331,-0.32835,-0.31422,0.14771,0.14662,ccRCC


###### eQTL

dont use

In [23]:
fps = sorted(os_helpers.listfiles('../data/Somatic_mutation_wxs/', regex=r'exonic.*.maf.gz$'))
fps = [fp for fp in fps if 'Archived' not in fp]
mutations = None
genes = list(target_genes)
for gs in gene_to_subgenes.values(): genes += gs
genes = sorted(set(genes))
keep = ['gene', 'sample_id', 'Chromosome', 'Start_Position', 'End_Position', 'HGVSp_Short', 'Variant_Classification',
       'Reference_Allele', 'Tumor_Seq_Allele2']
for fp in fps:
    cancer_type = fp.split('/')[-1].split('_')[0]
    df = pd.read_csv(fp, sep='\t')
    df = df.loc[[True if g in genes else False for g in df['Hugo_Symbol']]]
#     df = df[df['Variant_Classification']!='Silent']
    df['sample_id'] = [x.replace('_', '-') for x in df['Tumor_Sample_Barcode']]
    df['gene'] = df['Hugo_Symbol'].to_list()
    df = df[keep]
    df['disease'] = cancer_type
    
    
    if mutations is None:
        mutations = df
    else:
        mutations = pd.concat((mutations, df), axis=0)

Columns (88) have mixed types.Specify dtype option on import or set low_memory=False.


In [63]:
mutations

Unnamed: 0,gene,sample_id,Chromosome,Start_Position,End_Position,HGVSp_Short,Variant_Classification,Reference_Allele,Tumor_Seq_Allele2,disease
0,PIK3CD,01BR001-T,chr1,9722533,9722533,p.R785W,Missense_Mutation,C,T,BR
10,AREG,01BR001-T,chr4,74445388,74445388,p.L15F,Missense_Mutation,C,T,BR
12,FBXW7,01BR001-T,chr4,152329731,152329731,p.R393*,Nonsense_Mutation,G,A,BR
34,FLT1,01BR001-T,chr13,28438252,28438252,p.T161M,Missense_Mutation,G,A,BR
43,TP53,01BR001-T,chr17,7674216,7674216,p.R249S,Missense_Mutation,C,A,BR
...,...,...,...,...,...,...,...,...,...,...
7641,PTEN,C3N-01651-T,chr10,87933033,87933033,p.D92Y,Missense_Mutation,G,T,ccRCC
7655,IGHG2,C3N-01651-T,chr14,105644096,105644096,p.K101Nfs*31,Frame_Shift_Del,T,-,ccRCC
7673,TP73,C3N-01808-T,chr1,3731054,3731054,p.P491P,Silent,C,T,ccRCC
7683,FGB,C3N-01808-T,chr4,154563025,154563025,p.R3R,Silent,A,C,ccRCC


In [64]:
fps = sorted(os_helpers.listfiles('/diskmnt/Projects/Users/estorrs/cptac_driver/data/eQTL_filtered',
                                 ))
fps

['/diskmnt/Projects/Users/estorrs/cptac_driver/data/eQTL_filtered/BR_eQTLs_filtered.2021017.tsv',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/eQTL_filtered/CO_eQTLs_filtered.2021017.tsv',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/eQTL_filtered/EC_eQTLs_filtered.2021017.tsv',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/eQTL_filtered/GBM_eQTLs_filtered.2021017.tsv',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/eQTL_filtered/HNSCC_eQTLs_filtered.2021017.tsv',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/eQTL_filtered/LSCC_eQTLs_filtered.2021017.tsv',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/eQTL_filtered/LUAD_eQTLs_filtered.2021017.tsv',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/eQTL_filtered/OV_eQTLs_filtered.2021017.tsv',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/eQTL_filtered/PDA_eQTLs_filtered.2021017.tsv',
 '/diskmnt/Projects/Users/estorrs/cptac_driver/data/eQTL_filtered/ccRCC_eQTLs_filtered.2021017.tsv']

In [65]:
eqtl = None

for fp in fps:
    df = pd.read_csv(fp,
                sep='\t')
    
    if eqtl is None:
        eqtl = df
    else:
        eqtl = pd.concat((eqtl, df), axis=0)
eqtl

Unnamed: 0,SNP,gene,beta,t.stat,P,FDR,Gene_SNP,Disease
0,chr1_107056636_A_T,PRMT6,-0.606306,-5.163265,1.373427e-06,2.883696e-02,PRMT6,BR
1,chr1_109656105_C_G,GSTM4,-0.574039,-5.391549,5.270820e-07,1.706793e-02,GSTM4,BR
2,chr1_109688145_G_A,GSTM1,0.676938,8.537473,2.526182e-13,1.709479e-06,GSTM1,BR
3,chr1_109690179_A_G,GSTM1,0.707850,9.033014,2.272933e-14,1.644163e-07,GSTM1,BR
4,chr1_109690625_T_C,GSTM1,0.611524,7.463626,4.401857e-11,4.542680e-05,GSTM1,BR
...,...,...,...,...,...,...,...,...
4914,chr9_92784607_G_A,ASPN,3.278481,5.040561,2.460206e-06,4.897364e-02,ASPN,ccRCC
4915,chr9_93697213_C_A,FGD3,1.352675,5.120912,1.774278e-06,4.078420e-02,FGD3,ccRCC
4916,chr9_93849832_C_G,FGD3,2.705351,5.120912,1.774278e-06,4.078420e-02,FGD3,ccRCC
4917,chr9_97896656_A_G,TMOD1,-1.901571,-4.773050,7.169601e-06,8.226812e-02,TMOD1,ccRCC


In [66]:
for disease in set(eqtl['Disease']):
    sites = set(eqtl[eqtl['Disease']==disease]['SNP'])
    f = mutations[mutations['disease']==disease]
    mask = [True if f'{c}_{s}_{ref}_{alt}' in sites else False
           for c, s, ref, alt in zip(f['Chromosome'], f['Start_Position'],
                                    f['Reference_Allele'], f['Tumor_Seq_Allele2'])]
    print(f[mask].shape)

(1, 10)
(0, 10)
(0, 10)
(0, 10)
(0, 10)
(0, 10)
(0, 10)
(0, 10)
(0, 10)
(0, 10)


In [None]:
d = pd.read_csv('../data/199_driver_genes.txt', sep='\t')
d

In [51]:
set(d['Gene']).intersection(mutations[mask]['gene'])

{'ERBB2'}

###### pQTL

dont use

In [52]:
pqtl = pd.read_csv('../data/filtered_pQTL.tsv', sep='\t')
pqtl

Unnamed: 0,SNP,gene,beta,t.stat,P,FDR,Gene_SNP,Disease
0,chr1_109688145_G_A,GSTM1,0.547301,5.611051,2.296727e-07,0.022047,GSTM1,ccRCC
1,chr1_109690179_A_G,GSTM1,0.547301,5.611051,2.296727e-07,0.022047,GSTM1,ccRCC
2,chr1_109690625_T_C,GSTM1,0.547301,5.611051,2.296727e-07,0.022047,GSTM1,ccRCC
3,chr1_109737199_C_A,GSTM3,0.641189,5.217990,1.191517e-06,0.053193,GSTM3,ccRCC
4,chr1_109739319_G_A,GSTM3,0.713888,5.903624,6.529679e-08,0.009406,GSTM3,ccRCC
...,...,...,...,...,...,...,...,...
8195,chr9_137103590_C_T,UAP1L1,-0.585547,-4.871726,6.120846e-06,0.093298,UAP1L1,EC
8196,chr9_21816759_G_A,MTAP,-0.569130,-5.338531,9.881701e-07,0.041168,MTAP,EC
8197,chr9_34318291_G_T,NUDT2,0.948056,6.126573,3.972724e-08,0.008367,NUDT2,EC
8198,chr9_34371790_A_T,NUDT2,0.952703,6.324868,1.732983e-08,0.004470,NUDT2,EC


In [53]:
sites = set(pqtl['SNP'])
mask = [True if f'{c}_{s}_{ref}_{alt}' in sites else False
       for c, s, ref, alt in zip(mutations['Chromosome'], mutations['Start_Position'],
                                mutations['Reference_Allele'], mutations['Tumor_Seq_Allele2'])]
mutations[mask]

Unnamed: 0,gene,sample_id,Chromosome,Start_Position,End_Position,HGVSp_Short,Variant_Classification,Reference_Allele,Tumor_Seq_Allele2,disease


###### germline_variants

In [46]:
fps = sorted(os_helpers.listfiles('../data/new/Germline_pathogenic_variants_reviewed.1.0/',
                                 ))
fps

['../data/new/Germline_pathogenic_variants_reviewed.1.0/BR_germline_pathogenic.v.1.0.txt',
 '../data/new/Germline_pathogenic_variants_reviewed.1.0/CO_germline_pathogenic.v.1.0.txt',
 '../data/new/Germline_pathogenic_variants_reviewed.1.0/EC_germline_pathogenic.v.1.0.txt',
 '../data/new/Germline_pathogenic_variants_reviewed.1.0/GBM_germline_pathogenic.v.1.0.txt',
 '../data/new/Germline_pathogenic_variants_reviewed.1.0/HNSCC_germline_pathogenic.v.1.0.txt',
 '../data/new/Germline_pathogenic_variants_reviewed.1.0/LSCC_germline_pathogenic.v.1.0.txt',
 '../data/new/Germline_pathogenic_variants_reviewed.1.0/LUAD_germline_pathogenic.v.1.0.txt',
 '../data/new/Germline_pathogenic_variants_reviewed.1.0/OV_germline_pathogenic.v.1.0.txt',
 '../data/new/Germline_pathogenic_variants_reviewed.1.0/PDA_germline_pathogenic.v.1.0.txt',
 '../data/new/Germline_pathogenic_variants_reviewed.1.0/ccRCC_germline_pathogenic.v.1.0.txt']

In [47]:
germline = None

for fp in fps:
    df = pd.read_csv(fp,
                sep='\t')
    
    if germline is None:
        germline = df
    else:
        germline = pd.concat((germline, df), axis=0)
germline

Unnamed: 0,Disease,Overall_Classification,GeneClass,Sample,HUGO_Symbol,Chromosome,Start,Stop,Reference,Alternate,...,VCF_Details,N_REF,N_ALT,N_VAF,T_REF,T_ALT,T_VAF,Genotype,Cohort_AC,Manual_review
0,BRCA,Pathogenic,TSG,11BR020,ATM,11,108325416,108325416,C,T,...,"chr11::108325416::None::C::[""T""]::{""CSQ"":[""T|m...",137,136,0.498168,127,130,0.505837,0/1,2,YES
1,BRCA,Pathogenic,TSG,11BR006,BLM,15,90749586,90749587,-,T,...,"chr15::90749586::None::A::[""AT""]::{""CSQ"":[""T|f...",614,281,0.313966,503,257,0.338158,0/1,1,YES
2,BRCA,Pathogenic,TSG,11BR016,BRCA1,17,43057062,43057063,-,G,...,"chr17::43057062::None::T::[""TG""]::{""CSQ"":[""G|f...",397,193,0.327119,274,176,0.391111,0/1,1,YES
3,BRCA,Pathogenic,TSG,18BR006,BRCA2,13,32329468,32329469,TG,-,...,"chr13::32329467::None::CTG::[""C""]::{""CSQ"":[""-|...",48,61,0.559633,30,84,0.736842,0/1,1,YES
4,BRCA,Pathogenic,TSG,11BR006,BRCA2,13,32340704,32340705,TG,-,...,"chr13::32340703::None::CTG::[""C""]::{""CSQ"":[""-|...",135,106,0.439834,76,119,0.610256,0/1,1,YES
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6,ccRCC,Pathogenic,TSG,C3N-00168.N,SDHA,5,218356,218356,A,G,...,"chr5::218356::None::A::[""G""]::{""CSQ"":[""G|start...",19,21,0.525000,35,41,0.539474,0/1,1,YES
7,ccRCC,Pathogenic,,C3N-00831.N,SERPINA1,14,94380949,94380949,T,A,...,"chr14::94380949::None::T::[""A""]::{""CSQ"":[""A|mi...",186,178,0.489011,164,185,0.530086,0/1,1,YES
8,ccRCC,Pathogenic,,C3L-00416.N,TYR,11,89227904,89227904,C,A,...,"chr11::89227904::None::C::[""A""]::{""CSQ"":[""A|mi...",160,96,0.375000,149,125,0.456204,0/1,1,YES
9,ccRCC,Pathogenic,,C3L-01283.N,TYR,11,89295242,89295243,-,T,...,"chr11::89295242::None::C::[""CT""]::{""CSQ"":[""T|f...",273,253,0.480989,235,190,0.447059,0/1,1,YES


In [48]:
gs = list(target_genes)
for ls in gene_to_subgenes.values(): gs += ls
gs = sorted(set(gs))
mask = [True if g in gs else False
       for g in germline['HUGO_Symbol']]
germline = germline[mask]

In [49]:
# do mutation type columns
for gene in gs:
    germline[f'{gene}_is_pathogenic_germline'] = [1 if g==gene else 0
                    for g in germline['HUGO_Symbol']]
germline


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Disease,Overall_Classification,GeneClass,Sample,HUGO_Symbol,Chromosome,Start,Stop,Reference,Alternate,...,ZRANB1_is_pathogenic_germline,ZRSR2_is_pathogenic_germline,ZSCAN10_is_pathogenic_germline,ZSCAN25_is_pathogenic_germline,ZSCAN32_is_pathogenic_germline,ZW10_is_pathogenic_germline,ZWILCH_is_pathogenic_germline,ZWINT_is_pathogenic_germline,ZXDC_is_pathogenic_germline,ZYX_is_pathogenic_germline
0,BRCA,Pathogenic,TSG,11BR020,ATM,11,108325416,108325416,C,T,...,0,0,0,0,0,0,0,0,0,0
1,BRCA,Pathogenic,TSG,11BR006,BLM,15,90749586,90749587,-,T,...,0,0,0,0,0,0,0,0,0,0
2,BRCA,Pathogenic,TSG,11BR016,BRCA1,17,43057062,43057063,-,G,...,0,0,0,0,0,0,0,0,0,0
3,BRCA,Pathogenic,TSG,18BR006,BRCA2,13,32329468,32329469,TG,-,...,0,0,0,0,0,0,0,0,0,0
4,BRCA,Pathogenic,TSG,11BR006,BRCA2,13,32340704,32340705,TG,-,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,ccRCC,Pathogenic,TSG,C3N-01200.N,ERCC2,19,45352511,45352511,C,T,...,0,0,0,0,0,0,0,0,0,0
4,ccRCC,Likely_Pathogenic,Oncogene,C3N-01175.N,MITF,3,69866348,69866348,T,G,...,0,0,0,0,0,0,0,0,0,0
5,ccRCC,Pathogenic,,C3L-01553.N,MTHFR,1,11791216,11791216,C,T,...,0,0,0,0,0,0,0,0,0,0
7,ccRCC,Pathogenic,,C3N-00831.N,SERPINA1,14,94380949,94380949,T,A,...,0,0,0,0,0,0,0,0,0,0


In [50]:
germline.index = [x.replace('.N', '') + '-T' for x in germline['Sample']]
germline.index.name = 'sample_id'
germline


Unnamed: 0_level_0,Disease,Overall_Classification,GeneClass,Sample,HUGO_Symbol,Chromosome,Start,Stop,Reference,Alternate,...,ZRANB1_is_pathogenic_germline,ZRSR2_is_pathogenic_germline,ZSCAN10_is_pathogenic_germline,ZSCAN25_is_pathogenic_germline,ZSCAN32_is_pathogenic_germline,ZW10_is_pathogenic_germline,ZWILCH_is_pathogenic_germline,ZWINT_is_pathogenic_germline,ZXDC_is_pathogenic_germline,ZYX_is_pathogenic_germline
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11BR020-T,BRCA,Pathogenic,TSG,11BR020,ATM,11,108325416,108325416,C,T,...,0,0,0,0,0,0,0,0,0,0
11BR006-T,BRCA,Pathogenic,TSG,11BR006,BLM,15,90749586,90749587,-,T,...,0,0,0,0,0,0,0,0,0,0
11BR016-T,BRCA,Pathogenic,TSG,11BR016,BRCA1,17,43057062,43057063,-,G,...,0,0,0,0,0,0,0,0,0,0
18BR006-T,BRCA,Pathogenic,TSG,18BR006,BRCA2,13,32329468,32329469,TG,-,...,0,0,0,0,0,0,0,0,0,0
11BR006-T,BRCA,Pathogenic,TSG,11BR006,BRCA2,13,32340704,32340705,TG,-,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01200-T,ccRCC,Pathogenic,TSG,C3N-01200.N,ERCC2,19,45352511,45352511,C,T,...,0,0,0,0,0,0,0,0,0,0
C3N-01175-T,ccRCC,Likely_Pathogenic,Oncogene,C3N-01175.N,MITF,3,69866348,69866348,T,G,...,0,0,0,0,0,0,0,0,0,0
C3L-01553-T,ccRCC,Pathogenic,,C3L-01553.N,MTHFR,1,11791216,11791216,C,T,...,0,0,0,0,0,0,0,0,0,0
C3N-00831-T,ccRCC,Pathogenic,,C3N-00831.N,SERPINA1,14,94380949,94380949,T,A,...,0,0,0,0,0,0,0,0,0,0


In [51]:
germline = germline[[c for c in germline.columns if 'is_pathogenic_germline' in c]]
germline

Unnamed: 0_level_0,AAAS_is_pathogenic_germline,AAK1_is_pathogenic_germline,AATF_is_pathogenic_germline,ABCA1_is_pathogenic_germline,ABCA2_is_pathogenic_germline,ABCB1_is_pathogenic_germline,ABCB11_is_pathogenic_germline,ABCC2_is_pathogenic_germline,ABCC3_is_pathogenic_germline,ABCE1_is_pathogenic_germline,...,ZRANB1_is_pathogenic_germline,ZRSR2_is_pathogenic_germline,ZSCAN10_is_pathogenic_germline,ZSCAN25_is_pathogenic_germline,ZSCAN32_is_pathogenic_germline,ZW10_is_pathogenic_germline,ZWILCH_is_pathogenic_germline,ZWINT_is_pathogenic_germline,ZXDC_is_pathogenic_germline,ZYX_is_pathogenic_germline
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11BR020-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11BR006-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11BR016-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18BR006-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11BR006-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01200-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3N-01175-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3L-01553-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3N-00831-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
sample_ids = sorted(set(germline.index))
new = []
for s in sample_ids:
    filtered = germline.loc[[s], :]
    if filtered.shape[0]<=1:
        tail = filtered.iloc[0, :].to_list()
    else:
        tail = list(np.max(filtered.values, axis=0).flatten())
    new.append(tail)
consolidated_germline = pd.DataFrame(data=new, index=sample_ids, columns=germline.columns)
consolidated_germline

Unnamed: 0,AAAS_is_pathogenic_germline,AAK1_is_pathogenic_germline,AATF_is_pathogenic_germline,ABCA1_is_pathogenic_germline,ABCA2_is_pathogenic_germline,ABCB1_is_pathogenic_germline,ABCB11_is_pathogenic_germline,ABCC2_is_pathogenic_germline,ABCC3_is_pathogenic_germline,ABCE1_is_pathogenic_germline,...,ZRANB1_is_pathogenic_germline,ZRSR2_is_pathogenic_germline,ZSCAN10_is_pathogenic_germline,ZSCAN25_is_pathogenic_germline,ZSCAN32_is_pathogenic_germline,ZW10_is_pathogenic_germline,ZWILCH_is_pathogenic_germline,ZWINT_is_pathogenic_germline,ZXDC_is_pathogenic_germline,ZYX_is_pathogenic_germline
01BR017-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01BR033-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01BR042-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01OV029-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
02OV008-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-03439-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3N-03782-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3N-03841-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C3N-04279-T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


###### rna expression

In [53]:
expression = pd.read_csv('../data/new/ALL_RNA-Seq_Expr_WashU_FPKM_UQ_annotation.tsv.gz', sep='\t')
expression

Unnamed: 0,gene_id,gene_name,seqname,start,end,strand,gene_type,gene_status,havana_gene,full_length,...,C3L-01282-A,C3L-01304-A,C3L-01307-A,C3L-01311-A,C3N-00333-A,C3N-00383-A,C3N-00858-A,C3N-00866-A,C3N-01003-A,C3N-01346-A
0,ENSG00000223972.5,DDX11L1,chr1,11869,14409,+,transcribed_unprocessed_pseudogene,KNOWN,OTTHUMG00000000961.2,2541,...,0.000000e+00,3.316754e+02,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1,ENSG00000227232.5,WASH7P,chr1,14404,29570,-,unprocessed_pseudogene,KNOWN,OTTHUMG00000000958.1,15167,...,3.294426e+04,3.322400e+04,2.772942e+04,4.175670e+04,2.970832e+04,3.837182e+04,3.049993e+04,2.128872e+04,3.700962e+04,2.161777e+04
2,ENSG00000278267.1,MIR6859-3,chr1,17369,17436,-,miRNA,KNOWN,,68,...,1.446044e+05,1.777146e+05,1.351309e+05,2.008523e+05,1.713584e+05,1.626363e+05,2.080168e+05,1.518306e+05,1.547988e+05,1.288483e+05
3,ENSG00000243485.3,RP11-34P13.3,chr1,29554,31109,+,lincRNA,NOVEL,OTTHUMG00000000959.2,1556,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
4,ENSG00000274890.1,MIR1302-9,chr1,30366,30503,+,miRNA,KNOWN,,138,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60478,ENSG00000198695.2,MT-ND6,chrM,14149,14673,-,protein_coding,KNOWN,,525,...,3.391063e+05,4.505004e+05,3.325508e+05,3.167063e+05,2.540094e+05,4.976672e+05,3.994611e+05,2.275601e+05,3.295739e+05,1.813529e+05
60479,ENSG00000210194.1,MT-TE,chrM,14674,14742,-,Mt_tRNA,KNOWN,,69,...,7.500455e+03,1.167593e+05,0.000000e+00,8.606148e+03,0.000000e+00,2.003491e+04,3.565253e+04,1.068787e+04,9.534706e+03,3.386157e+04
60480,ENSG00000198727.2,MT-CYB,chrM,14747,15887,+,protein_coding,KNOWN,,1141,...,3.787822e+06,5.057060e+06,4.413250e+06,3.395884e+06,5.556691e+06,1.084422e+07,4.775596e+06,4.455156e+06,3.985424e+06,4.091857e+06
60481,ENSG00000210195.2,MT-TT,chrM,15888,15953,+,Mt_tRNA,KNOWN,,66,...,7.057246e+04,1.656618e+05,5.354838e+04,4.498668e+04,3.138686e+05,3.037110e+05,1.770472e+05,4.469473e+04,1.395534e+05,1.150524e+05


In [54]:
expression = expression[expression['gene_type']=='protein_coding']
expression

Unnamed: 0,gene_id,gene_name,seqname,start,end,strand,gene_type,gene_status,havana_gene,full_length,...,C3L-01282-A,C3L-01304-A,C3L-01307-A,C3L-01311-A,C3N-00333-A,C3N-00383-A,C3N-00858-A,C3N-00866-A,C3N-01003-A,C3N-01346-A
8,ENSG00000186092.4,OR4F5,chr1,69091,70008,+,protein_coding,KNOWN,OTTHUMG00000001094.1,918,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
18,ENSG00000279928.1,FO538757.3,chr1,182393,184158,+,protein_coding,KNOWN,,1766,...,7.207958e+02,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,9.626803e+02,3.426218e+03,0.000000e+00,1.832576e+03,6.508213e+03
19,ENSG00000279457.2,FO538757.2,chr1,184923,200322,-,protein_coding,KNOWN,,15400,...,4.491191e+04,8.710244e+04,8.273793e+04,6.591389e+04,6.271037e+04,7.079440e+04,6.578270e+04,7.218358e+04,6.904243e+04,7.662420e+04
29,ENSG00000278566.1,OR4F29,chr1,450740,451678,-,protein_coding,KNOWN,OTTHUMG00000002860.1,939,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
49,ENSG00000273547.1,OR4F16,chr1,685716,686654,-,protein_coding,KNOWN,OTTHUMG00000002581.1,939,...,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60472,ENSG00000212907.2,MT-ND4L,chrM,10470,10766,+,protein_coding,KNOWN,,297,...,4.574141e+06,5.785569e+06,4.864573e+06,3.558947e+06,5.483982e+06,1.110582e+07,5.232729e+06,5.393165e+06,4.558745e+06,3.561707e+06
60473,ENSG00000198886.2,MT-ND4,chrM,10760,12137,+,protein_coding,KNOWN,,1378,...,3.574647e+06,4.320937e+06,3.335171e+06,2.743744e+06,4.135455e+06,8.500611e+06,3.932826e+06,3.735481e+06,3.340558e+06,3.692454e+06
60477,ENSG00000198786.2,MT-ND5,chrM,12337,14148,+,protein_coding,KNOWN,,1812,...,1.154449e+06,1.414823e+06,1.179625e+06,1.083107e+06,1.476910e+06,4.349781e+06,1.377655e+06,1.366667e+06,1.203962e+06,1.353902e+06
60478,ENSG00000198695.2,MT-ND6,chrM,14149,14673,-,protein_coding,KNOWN,,525,...,3.391063e+05,4.505004e+05,3.325508e+05,3.167063e+05,2.540094e+05,4.976672e+05,3.994611e+05,2.275601e+05,3.295739e+05,1.813529e+05


In [55]:
genes = expression['gene_name'].to_list()
expression = expression.iloc[:, 12:]
expression['gene'] = [g.split('.')[0] for g in genes]
expression = expression.groupby('gene').mean()
expression = pd.DataFrame(data=np.log1p(expression.values), index=expression.index, columns=expression.columns)

expression = expression.transpose()

In [56]:
expression.columns = [f'{c}_expression' for c in expression.columns]
expression

Unnamed: 0,1-Dec_expression,1-Mar_expression,1-Sep_expression,10-Mar_expression,10-Sep_expression,11-Mar_expression,11-Sep_expression,12-Sep_expression,14-Sep_expression,15-Sep_expression,...,ZWINT_expression,ZXDA_expression,ZXDB_expression,ZXDC_expression,ZYG11A_expression,ZYG11B_expression,ZYX_expression,ZZEF1_expression,ZZZ3_expression,pk_expression
01BR001-T,5.548933,9.469038,9.526235,7.293216,13.060178,0.000000,13.287526,7.046166,0.000000,14.215909,...,11.757262,10.134319,11.362794,11.461471,10.430631,11.700726,13.320117,11.281892,11.415130,11.518754
01BR008-T,5.357447,10.496447,12.433383,0.000000,10.894918,0.000000,11.880545,0.000000,0.000000,14.216420,...,14.030484,9.176603,10.521493,11.695101,11.566098,10.635637,13.760682,11.407163,11.716684,10.209459
01BR009-T,6.061211,10.371105,10.798686,5.326345,12.614280,0.000000,12.756396,7.272150,0.000000,14.226891,...,12.447119,9.611568,10.434179,12.030646,7.766814,11.253443,13.427215,11.866759,11.416178,10.698735
01BR010-T,6.003868,11.087496,9.837070,9.001784,12.608134,9.973902,12.406277,9.206505,0.000000,14.197264,...,12.496828,9.760693,10.977785,11.414167,11.251199,11.315866,14.469199,11.537793,11.505077,10.341984
01BR015-T,7.273084,10.303115,10.240463,5.287290,12.648443,0.000000,12.149339,0.000000,0.000000,14.579366,...,13.132139,10.336607,11.524852,11.805481,9.179496,11.984678,13.306251,11.617746,12.390759,11.342688
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-00383-A,6.936267,9.680521,9.530201,8.608871,13.318557,0.000000,12.981265,6.491163,0.000000,13.811720,...,10.648933,11.231319,11.428921,11.966358,5.688687,12.921488,13.195903,12.107978,12.973878,13.511957
C3N-00858-A,7.581142,9.941336,9.597697,8.678642,13.160485,0.000000,13.028646,0.000000,5.065704,14.143747,...,9.666310,11.394370,11.345977,12.077106,7.178695,12.992283,12.854665,12.103689,12.825424,13.623158
C3N-00866-A,7.357388,9.306353,9.035412,6.620679,13.408593,0.000000,13.182946,0.000000,0.000000,13.808237,...,9.292396,11.167963,11.623877,12.122369,0.000000,12.458606,13.615069,12.305028,12.782117,13.449940
C3N-01003-A,8.159156,9.437909,9.039024,8.702565,13.305092,0.000000,12.837443,5.750295,0.000000,14.078470,...,10.100163,11.475160,11.414621,12.156336,6.330836,12.940224,12.959681,11.914472,13.181620,13.767287


###### phospho

In [60]:
phospho = pd.read_csv('../data/new/phosphoproteomee_broad_v1_tumor_normal_imputed.tsv',
                     sep='\t')
phospho

Unnamed: 0,py_sites,id,geneSymbol,ptm_sites,peptide_start,peptide_end,CPT000814,CPT001846,X01BR001,X01BR008,...,C3L.01311.N,C3L.01744.N,C3N.00200.N,C3N.00333.N,C3N.00383.N,C3N.00729.N,C3N.00858.N,C3N.00866.N,C3N.01211.N,C3N.01346.N
0,AHNAK_S93,NP_001333374.1_S93s_1_1_93_93,AHNAK,S93,89,102,-1.22118,-0.114830,-0.23140,-0.90426,...,-0.149040,1.193790,1.436150,2.117770,1.259550,2.220770,1.000380,1.847670,1.127020,1.049380
1,AHNAK_S135,NP_001333374.1_S135s_1_1_135_135,AHNAK,S135,133,149,-0.07201,0.493080,1.50544,-0.89833,...,0.754740,0.687620,1.757680,2.176910,0.392570,2.006650,1.680220,1.239560,1.722740,0.177250
2,AHNAK_S216,NP_001333374.1_S216s_1_1_216_216,AHNAK,S216,208,225,-0.13388,1.453190,0.38261,-1.36677,...,0.351580,0.384400,0.371540,0.923830,1.541480,0.894730,1.080520,0.744450,1.904610,0.276070
3,AHNAK_S379,NP_001333374.1_S379s_1_1_379_379,AHNAK,S379,372,387,-0.30732,2.939200,2.08658,-0.52477,...,1.512054,2.279690,0.445850,1.502122,1.635450,1.310484,2.396110,1.427632,2.444490,2.338760
4,AHNAK_T490,NP_001333374.1_T490t_1_1_490_490,AHNAK,T490,488,497,0.04361,0.606950,0.41679,-0.77233,...,-0.196810,0.747310,1.659080,1.408120,1.323210,1.055330,-0.109520,1.018910,1.780030,0.757630
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4987,SNRPD3_S2,NP_001265585.1_S2s_1_1_2_2,SNRPD3,S2,2,8,0.93414,0.019300,0.87565,-0.25053,...,-0.592330,-0.859344,-1.902010,-1.283450,0.321330,-0.425190,0.239080,-1.124750,0.070170,-0.039210
4988,RAB28_S207,NP_001017979.1_S207s_1_1_207_207,RAB28,S207,196,208,-0.85706,2.667090,3.10949,0.40914,...,0.168150,0.977710,0.417270,1.796510,0.281116,-0.152950,1.573370,-0.093280,2.587700,3.019520
4989,FBXL7_S105,NP_036436.1_S105s_1_1_105_105,FBXL7,S105,103,109,-1.97479,-1.888650,-3.34221,-1.38308,...,1.448340,1.234380,1.944880,1.729992,1.875440,1.515244,0.244420,2.039620,2.079320,0.763900
4990,MAP3K14_S410,NP_003945.2_S410s_1_1_410_410,MAP3K14,S410,409,416,-1.91799,-1.830038,-2.56122,-0.39728,...,-1.067802,-1.067802,-0.976486,-0.821566,-1.023100,1.096620,-0.979136,-0.821566,-0.976486,-0.821566


In [62]:
len(set(phospho['py_sites']))

4986

In [63]:
phospho.index = [f'{x}|{z}|{y}' for x, y, z in zip(phospho['py_sites'], phospho['id'], phospho['geneSymbol'])]
phospho

Unnamed: 0,py_sites,id,geneSymbol,ptm_sites,peptide_start,peptide_end,CPT000814,CPT001846,X01BR001,X01BR008,...,C3L.01311.N,C3L.01744.N,C3N.00200.N,C3N.00333.N,C3N.00383.N,C3N.00729.N,C3N.00858.N,C3N.00866.N,C3N.01211.N,C3N.01346.N
AHNAK_S93|AHNAK|NP_001333374.1_S93s_1_1_93_93,AHNAK_S93,NP_001333374.1_S93s_1_1_93_93,AHNAK,S93,89,102,-1.22118,-0.114830,-0.23140,-0.90426,...,-0.149040,1.193790,1.436150,2.117770,1.259550,2.220770,1.000380,1.847670,1.127020,1.049380
AHNAK_S135|AHNAK|NP_001333374.1_S135s_1_1_135_135,AHNAK_S135,NP_001333374.1_S135s_1_1_135_135,AHNAK,S135,133,149,-0.07201,0.493080,1.50544,-0.89833,...,0.754740,0.687620,1.757680,2.176910,0.392570,2.006650,1.680220,1.239560,1.722740,0.177250
AHNAK_S216|AHNAK|NP_001333374.1_S216s_1_1_216_216,AHNAK_S216,NP_001333374.1_S216s_1_1_216_216,AHNAK,S216,208,225,-0.13388,1.453190,0.38261,-1.36677,...,0.351580,0.384400,0.371540,0.923830,1.541480,0.894730,1.080520,0.744450,1.904610,0.276070
AHNAK_S379|AHNAK|NP_001333374.1_S379s_1_1_379_379,AHNAK_S379,NP_001333374.1_S379s_1_1_379_379,AHNAK,S379,372,387,-0.30732,2.939200,2.08658,-0.52477,...,1.512054,2.279690,0.445850,1.502122,1.635450,1.310484,2.396110,1.427632,2.444490,2.338760
AHNAK_T490|AHNAK|NP_001333374.1_T490t_1_1_490_490,AHNAK_T490,NP_001333374.1_T490t_1_1_490_490,AHNAK,T490,488,497,0.04361,0.606950,0.41679,-0.77233,...,-0.196810,0.747310,1.659080,1.408120,1.323210,1.055330,-0.109520,1.018910,1.780030,0.757630
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SNRPD3_S2|SNRPD3|NP_001265585.1_S2s_1_1_2_2,SNRPD3_S2,NP_001265585.1_S2s_1_1_2_2,SNRPD3,S2,2,8,0.93414,0.019300,0.87565,-0.25053,...,-0.592330,-0.859344,-1.902010,-1.283450,0.321330,-0.425190,0.239080,-1.124750,0.070170,-0.039210
RAB28_S207|RAB28|NP_001017979.1_S207s_1_1_207_207,RAB28_S207,NP_001017979.1_S207s_1_1_207_207,RAB28,S207,196,208,-0.85706,2.667090,3.10949,0.40914,...,0.168150,0.977710,0.417270,1.796510,0.281116,-0.152950,1.573370,-0.093280,2.587700,3.019520
FBXL7_S105|FBXL7|NP_036436.1_S105s_1_1_105_105,FBXL7_S105,NP_036436.1_S105s_1_1_105_105,FBXL7,S105,103,109,-1.97479,-1.888650,-3.34221,-1.38308,...,1.448340,1.234380,1.944880,1.729992,1.875440,1.515244,0.244420,2.039620,2.079320,0.763900
MAP3K14_S410|MAP3K14|NP_003945.2_S410s_1_1_410_410,MAP3K14_S410,NP_003945.2_S410s_1_1_410_410,MAP3K14,S410,409,416,-1.91799,-1.830038,-2.56122,-0.39728,...,-1.067802,-1.067802,-0.976486,-0.821566,-1.023100,1.096620,-0.979136,-0.821566,-0.976486,-0.821566


In [64]:
len(set(phospho.index))

4992

In [67]:
phospho = phospho.iloc[:, 6:]
phospho.index = [f'{x}_phospho' for x in phospho.index]
phospho = phospho.transpose()
phospho

Unnamed: 0,AHNAK_S93|AHNAK|NP_001333374.1_S93s_1_1_93_93_phospho,AHNAK_S135|AHNAK|NP_001333374.1_S135s_1_1_135_135_phospho,AHNAK_S216|AHNAK|NP_001333374.1_S216s_1_1_216_216_phospho,AHNAK_S379|AHNAK|NP_001333374.1_S379s_1_1_379_379_phospho,AHNAK_T490|AHNAK|NP_001333374.1_T490t_1_1_490_490_phospho,AHNAK_S511|AHNAK|NP_001333374.1_S511s_1_1_511_511_phospho,AHNAK_S819|AHNAK|NP_001333374.1_S819s_1_1_819_819_phospho,AHNAK_S886|AHNAK|NP_001333374.1_S886s_1_1_886_886_phospho,AHNAK_S1010|AHNAK|NP_001333374.1_S1010s_1_1_1010_1010_phospho,AHNAK_S1042|AHNAK|NP_001333374.1_S1042s_1_1_1042_1042_phospho,...,TOMM34_S186|TOMM34|NP_006800.2_S186s_1_1_186_186_phospho,XYLB_S354|XYLB|NP_001336107.1_S354s_1_1_354_354_phospho,NARS1_S88|NARS1|NP_004530.1_S88s_1_1_88_88_phospho,GPR65_S324|GPR65|NP_003599.2_S324s_1_1_324_324_phospho,DBI_S2|DBI|NP_001073331.1_S2s_1_1_2_2_phospho,SNRPD3_S2|SNRPD3|NP_001265585.1_S2s_1_1_2_2_phospho,RAB28_S207|RAB28|NP_001017979.1_S207s_1_1_207_207_phospho,FBXL7_S105|FBXL7|NP_036436.1_S105s_1_1_105_105_phospho,MAP3K14_S410|MAP3K14|NP_003945.2_S410s_1_1_410_410_phospho,UGGT2_S952|UGGT2|NP_064506.3_S952s_1_1_952_952_phospho
CPT000814,-1.22118,-0.07201,-0.13388,-0.307320,0.04361,0.45642,-0.03449,0.592330,0.425990,0.18155,...,0.379340,0.018260,-3.130040,-0.655220,-0.420920,0.93414,-0.85706,-1.974790,-1.917990,-2.19894
CPT001846,-0.11483,0.49308,1.45319,2.939200,0.60695,1.29495,0.85107,0.596330,1.610480,0.31457,...,0.525890,-0.159210,-2.531118,-0.510450,-1.393370,0.01930,2.66709,-1.888650,-1.830038,-1.36346
X01BR001,-0.23140,1.50544,0.38261,2.086580,0.41679,1.90777,-0.67186,-1.748680,1.260890,0.39181,...,-0.219570,-2.064230,-1.222760,-0.404420,0.545640,0.87565,3.10949,-3.342210,-2.561220,-0.14331
X01BR008,-0.90426,-0.89833,-1.36677,-0.524770,-0.77233,-0.57665,-1.21408,-3.462880,-2.216180,-1.88264,...,0.421000,-1.774430,-1.452750,2.809140,-2.573440,-0.25053,0.40914,-1.383080,-0.397280,1.70179
X01BR009,0.50317,0.04167,0.78562,0.739320,0.95540,0.37815,-1.37059,-1.896910,-1.759540,-1.92469,...,-0.412100,-1.341260,-2.096010,-0.537120,1.802760,0.47693,1.35052,-1.683910,-1.756450,2.01112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N.00729.N,2.22077,2.00665,0.89473,1.310484,1.05533,1.58146,0.74638,2.257480,1.440692,-0.78920,...,-0.503190,0.591900,-3.471870,0.070288,-0.395776,-0.42519,-0.15295,1.515244,1.096620,-0.68520
C3N.00858.N,1.00038,1.68022,1.08052,2.396110,-0.10952,1.21408,2.07155,3.810540,1.406852,-0.34459,...,-2.643200,-0.368630,-2.118300,0.114860,-1.064490,0.23908,1.57337,0.244420,-0.979136,-1.07785
C3N.00866.N,1.84767,1.23956,0.74445,1.427632,1.01891,0.41079,0.79289,2.834300,1.146280,1.65214,...,-0.073550,0.674490,-1.680850,0.070288,0.127360,-1.12475,-0.09328,2.039620,-0.821566,-1.22700
C3N.01211.N,1.12702,1.72274,1.90461,2.444490,1.78003,0.13604,0.47544,0.747582,0.746090,1.94471,...,-0.293570,-0.267790,-1.754250,0.324926,-0.905550,0.07017,2.58770,2.079320,-0.976486,-2.74952


In [68]:
phospho.index = [x.replace('.', '-') for x in phospho.index]
phospho.index = [update_sample_id(x) for x in phospho.index]
phospho

Unnamed: 0,AHNAK_S93|AHNAK|NP_001333374.1_S93s_1_1_93_93_phospho,AHNAK_S135|AHNAK|NP_001333374.1_S135s_1_1_135_135_phospho,AHNAK_S216|AHNAK|NP_001333374.1_S216s_1_1_216_216_phospho,AHNAK_S379|AHNAK|NP_001333374.1_S379s_1_1_379_379_phospho,AHNAK_T490|AHNAK|NP_001333374.1_T490t_1_1_490_490_phospho,AHNAK_S511|AHNAK|NP_001333374.1_S511s_1_1_511_511_phospho,AHNAK_S819|AHNAK|NP_001333374.1_S819s_1_1_819_819_phospho,AHNAK_S886|AHNAK|NP_001333374.1_S886s_1_1_886_886_phospho,AHNAK_S1010|AHNAK|NP_001333374.1_S1010s_1_1_1010_1010_phospho,AHNAK_S1042|AHNAK|NP_001333374.1_S1042s_1_1_1042_1042_phospho,...,TOMM34_S186|TOMM34|NP_006800.2_S186s_1_1_186_186_phospho,XYLB_S354|XYLB|NP_001336107.1_S354s_1_1_354_354_phospho,NARS1_S88|NARS1|NP_004530.1_S88s_1_1_88_88_phospho,GPR65_S324|GPR65|NP_003599.2_S324s_1_1_324_324_phospho,DBI_S2|DBI|NP_001073331.1_S2s_1_1_2_2_phospho,SNRPD3_S2|SNRPD3|NP_001265585.1_S2s_1_1_2_2_phospho,RAB28_S207|RAB28|NP_001017979.1_S207s_1_1_207_207_phospho,FBXL7_S105|FBXL7|NP_036436.1_S105s_1_1_105_105_phospho,MAP3K14_S410|MAP3K14|NP_003945.2_S410s_1_1_410_410_phospho,UGGT2_S952|UGGT2|NP_064506.3_S952s_1_1_952_952_phospho
CPT000814,-1.22118,-0.07201,-0.13388,-0.307320,0.04361,0.45642,-0.03449,0.592330,0.425990,0.18155,...,0.379340,0.018260,-3.130040,-0.655220,-0.420920,0.93414,-0.85706,-1.974790,-1.917990,-2.19894
CPT001846,-0.11483,0.49308,1.45319,2.939200,0.60695,1.29495,0.85107,0.596330,1.610480,0.31457,...,0.525890,-0.159210,-2.531118,-0.510450,-1.393370,0.01930,2.66709,-1.888650,-1.830038,-1.36346
01BR001-T,-0.23140,1.50544,0.38261,2.086580,0.41679,1.90777,-0.67186,-1.748680,1.260890,0.39181,...,-0.219570,-2.064230,-1.222760,-0.404420,0.545640,0.87565,3.10949,-3.342210,-2.561220,-0.14331
01BR008-T,-0.90426,-0.89833,-1.36677,-0.524770,-0.77233,-0.57665,-1.21408,-3.462880,-2.216180,-1.88264,...,0.421000,-1.774430,-1.452750,2.809140,-2.573440,-0.25053,0.40914,-1.383080,-0.397280,1.70179
01BR009-T,0.50317,0.04167,0.78562,0.739320,0.95540,0.37815,-1.37059,-1.896910,-1.759540,-1.92469,...,-0.412100,-1.341260,-2.096010,-0.537120,1.802760,0.47693,1.35052,-1.683910,-1.756450,2.01112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-00729-N,2.22077,2.00665,0.89473,1.310484,1.05533,1.58146,0.74638,2.257480,1.440692,-0.78920,...,-0.503190,0.591900,-3.471870,0.070288,-0.395776,-0.42519,-0.15295,1.515244,1.096620,-0.68520
C3N-00858-N,1.00038,1.68022,1.08052,2.396110,-0.10952,1.21408,2.07155,3.810540,1.406852,-0.34459,...,-2.643200,-0.368630,-2.118300,0.114860,-1.064490,0.23908,1.57337,0.244420,-0.979136,-1.07785
C3N-00866-N,1.84767,1.23956,0.74445,1.427632,1.01891,0.41079,0.79289,2.834300,1.146280,1.65214,...,-0.073550,0.674490,-1.680850,0.070288,0.127360,-1.12475,-0.09328,2.039620,-0.821566,-1.22700
C3N-01211-N,1.12702,1.72274,1.90461,2.444490,1.78003,0.13604,0.47544,0.747582,0.746090,1.94471,...,-0.293570,-0.267790,-1.754250,0.324926,-0.905550,0.07017,2.58770,2.079320,-0.976486,-2.74952


###### methylation subtypes

In [69]:
methylation_st = pd.read_csv('../data/new/methylation_subtype.csv', sep=',')
methylation_st

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Sample_ID,Subject_ID,Cancer_type,Class,Pan3_k5
0,0,0,CPT0008520010,C3N-00295,HNSCC,1,3
1,1,1,CPT0008630006,C3N-00299,HNSCC,1,2
2,2,2,CPT0008680006,C3N-00300,HNSCC,1,2
3,3,3,CPT0008930006,C3N-00307,HNSCC,1,2
4,4,4,CPT0011290008,C3N-00306,HNSCC,1,2
...,...,...,...,...,...,...,...
698,766,836,CPT0128770006,C3N-02253,UCEC,3,
699,767,837,CPT0128960006,C3N-02244,UCEC,2,
700,768,838,CPT0129080006,C3N-02249,UCEC,1,
701,769,839,CPT0129520007,C3N-00755,UCEC,1,


In [70]:
set(methylation_st['Cancer_type'])

{'CCRCC', 'GBM', 'HNSCC', 'LSCC', 'LUAD', 'UCEC'}

In [71]:
len(set(methylation_st['Sample_ID']))

703

In [72]:
methylation_st.index = [x + '-T' for x in methylation_st['Subject_ID']]
methylation_st = methylation_st[['Class', 'Cancer_type']]
methylation_st.columns = ['methylation_subtype', 'disease']
methylation_st

Unnamed: 0,methylation_subtype,disease
C3N-00295-T,1,HNSCC
C3N-00299-T,1,HNSCC
C3N-00300-T,1,HNSCC
C3N-00307-T,1,HNSCC
C3N-00306-T,1,HNSCC
...,...,...
C3N-02253-T,3,UCEC
C3N-02244-T,2,UCEC
C3N-02249-T,1,UCEC
C3N-00755-T,1,UCEC


In [73]:
# one hot encode
for t in set(methylation_st['methylation_subtype']):
    methylation_st[f'methylation_subtype_{t}'] = [1 if x==t else 0
                                                 for x in methylation_st['methylation_subtype']]
methylation_st = methylation_st[[c for c in methylation_st.columns if 'methylation_subtype_' in c]]



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [74]:
methylation_st

Unnamed: 0,methylation_subtype_1,methylation_subtype_2,methylation_subtype_3,methylation_subtype_4,methylation_subtype_5,methylation_subtype_6
C3N-00295-T,1,0,0,0,0,0
C3N-00299-T,1,0,0,0,0,0
C3N-00300-T,1,0,0,0,0,0
C3N-00307-T,1,0,0,0,0,0
C3N-00306-T,1,0,0,0,0,0
...,...,...,...,...,...,...
C3N-02253-T,0,0,1,0,0,0
C3N-02244-T,0,1,0,0,0,0
C3N-02249-T,1,0,0,0,0,0
C3N-00755-T,1,0,0,0,0,0


###### immune subtypes

In [75]:
immune_st = pd.read_csv('../data/new/immune_subtype.txt', sep='\t')
immune_st

Unnamed: 0,case,Immune_subtype,disease
0,01BR001-T,1,BR
1,01BR008-T,2,BR
2,01BR009-T,3,BR
3,01BR010-T,3,BR
4,01BR015-T,3,BR
...,...,...,...
1083,C3N-01520-T,3,UCEC
1084,C3N-01521-T,3,UCEC
1085,C3N-01537-T,4,UCEC
1086,C3N-01802-T,4,UCEC


In [76]:
immune_st = immune_st.set_index('case')
immune_st = immune_st[['Immune_subtype']]
immune_st.columns = ['immune_subtype']
immune_st

Unnamed: 0_level_0,immune_subtype
case,Unnamed: 1_level_1
01BR001-T,1
01BR008-T,2
01BR009-T,3
01BR010-T,3
01BR015-T,3
...,...
C3N-01520-T,3
C3N-01521-T,3
C3N-01537-T,4
C3N-01802-T,4


In [77]:
# one hot encode
for t in set(immune_st['immune_subtype']):
    immune_st[f'immune_subtype_{t}'] = [1 if x==t else 0
                                                 for x in immune_st['immune_subtype']]
immune_st = immune_st[[c for c in immune_st.columns if 'immune_subtype_' in c]]
immune_st

Unnamed: 0_level_0,immune_subtype_1,immune_subtype_2,immune_subtype_3,immune_subtype_4,immune_subtype_5
case,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
01BR001-T,1,0,0,0,0
01BR008-T,0,1,0,0,0
01BR009-T,0,0,1,0,0
01BR010-T,0,0,1,0,0
01BR015-T,0,0,1,0,0
...,...,...,...,...,...
C3N-01520-T,0,0,1,0,0
C3N-01521-T,0,0,1,0,0
C3N-01537-T,0,0,0,1,0
C3N-01802-T,0,0,0,1,0


###### clinical

In [94]:
df = pd.read_csv('../data/new/clinical_Pan-cancer.Jan2022.tsv', sep='\t')
df = df.set_index('case_id')
df

Unnamed: 0_level_0,tumor_code,discovery_study,discovery_study/type_of_analyzed_samples,confirmatory_study,confirmatory_study/type_of_analyzed_samples,consent/age,consent/sex,consent/race,consent/ethnicity,consent/ethnicity_race_ancestry_identified,...,follow-up/residual_tumor_after_surgery_for_new_tumor,follow-up/additional_treatment_radiation_therapy_for_new_tumor,follow-up/additional_treatment_pharmaceutical_therapy_for_new_tumor,follow-up/additional_treatment_immuno_for_new_tumor,follow-up/number_of_days_from_date_of_initial_pathologic_diagnosis_to_date_of_additional_surgery_for_new_tumor_event_loco-regional,follow-up/number_of_days_from_date_of_initial_pathologic_diagnosis_to_date_of_additional_surgery_for_new_tumor_event_metastasis,"Recurrence-free survival, days","Overall survival, days","Recurrence status (1, yes; 0, no)","Survival status (1, dead; 0, alive)"
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00908,CCRCC,Yes,Tumor_and_Normal,Yes,Tumor,60,Female,White,Not-Hispanic or Latino,White,...,RX: Presence of Residual Tumor cannot be assessed,Yes,Yes,No,,1517.0,1115.0,1795.0,1,0.0
C3L-00004,CCRCC,Yes,Tumor_and_Normal,No,,72,Male,White,Not-Hispanic or Latino,White,...,,,,,,,,384.0,0,0.0
C3L-00010,CCRCC,Yes,Tumor_and_Normal,No,,30,Male,White,Not-Hispanic or Latino,White,...,,,,,,,,896.0,0,0.0
C3L-00011,CCRCC,Yes,Tumor_and_Normal,No,,63,Female,White,Not-Hispanic or Latino,White,...,,,,,,,,241.0,0,1.0
C3L-00026,CCRCC,Yes,Tumor_and_Normal,No,,65,Female,White,Not-Hispanic or Latino,White,...,,,,,,,,1769.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26OV008,OV,,,,,65,Female,White,Not Evaluated,,...,,,,,,,,400.0,0,1.0
26OV009,OV,,,,,60,Female,White,Unknown,,...,,,,,,,,727.0,0,0.0
26OV010,OV,,,,,57,Female,White,Unknown,,...,,,,,,,,506.0,0,0.0
26OV011,OV,,,,,64,Female,White,Unknown,,...,,,,,,,,177.0,0,0.0


In [95]:
[c for c in df.columns if 'subtype' in df.columns]

[]

In [96]:
cols = [
    'consent/age',
    'consent/sex', 
]

In [97]:
clinical = df[cols]
clinical.columns = ['clinical_age', 'clinical_sex']
clinical

Unnamed: 0_level_0,clinical_age,clinical_sex
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1
C3L-00908,60,Female
C3L-00004,72,Male
C3L-00010,30,Male
C3L-00011,63,Female
C3L-00026,65,Female
...,...,...
26OV008,65,Female
26OV009,60,Female
26OV010,57,Female
26OV011,64,Female


In [98]:
clinical['clinical_is_female'] = [0 if x!='Female' else 1 for x in clinical['clinical_sex']]
clinical = clinical[['clinical_age', 'clinical_is_female']]
clinical


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,clinical_age,clinical_is_female
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1
C3L-00908,60,1
C3L-00004,72,0
C3L-00010,30,0
C3L-00011,63,1
C3L-00026,65,1
...,...,...
26OV008,65,1
26OV009,60,1
26OV010,57,1
26OV011,64,1


In [99]:
fps = sorted(os_helpers.listfiles('../data/new/ancestry/', regex=r'.tsv$'))
fps

['../data/new/ancestry/BRCA.tsv',
 '../data/new/ancestry/CO.tsv',
 '../data/new/ancestry/GBM.tsv',
 '../data/new/ancestry/HNSCC.tsv',
 '../data/new/ancestry/LSCC.tsv',
 '../data/new/ancestry/LUAD.tsv',
 '../data/new/ancestry/OV.tsv',
 '../data/new/ancestry/PDAC.tsv',
 '../data/new/ancestry/UCEC.tsv',
 '../data/new/ancestry/ccRCC.tsv']

In [100]:
df = None
for fp in fps:
    if df is None:
        df = pd.read_csv(fp, sep='\t', index_col='sample_id')
    else:
        df = pd.concat((df, pd.read_csv(fp, sep='\t', index_col='sample_id')))
df

Unnamed: 0_level_0,predicted_ancestry,probability_AFR,probability_AMR,probability_EAS,probability_EUR,probability_SAS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
01BR001,AFR,0.98,0.01,0.00,0.00,0.01
01BR015,EUR,0.00,0.03,0.01,0.96,0.00
01BR017,EUR,0.00,0.01,0.00,0.99,0.00
01BR018,EUR,0.00,0.00,0.00,1.00,0.00
01BR025,AFR,0.94,0.05,0.00,0.00,0.01
...,...,...,...,...,...,...
C3N-01648,EUR,0.00,0.00,0.02,0.98,0.00
C3N-01649,EUR,0.01,0.06,0.00,0.93,0.00
C3N-01651,EUR,0.00,0.00,0.00,1.00,0.00
C3N-01808,EUR,0.00,0.06,0.02,0.92,0.00


In [101]:
d = {s:x for s, x in zip(df.index, df['predicted_ancestry'])}
for a in sorted(set(df['predicted_ancestry'])):
    clinical[f'clinical_predicted_ancestry_is_{a}'] = [1 if d.get(x) == a else 0 for x in clinical.index]
clinical

Unnamed: 0_level_0,clinical_age,clinical_is_female,clinical_predicted_ancestry_is_AFR,clinical_predicted_ancestry_is_AMR,clinical_predicted_ancestry_is_EAS,clinical_predicted_ancestry_is_EUR,clinical_predicted_ancestry_is_SAS
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C3L-00908,60,1,0,0,0,1,0
C3L-00004,72,0,0,0,0,1,0
C3L-00010,30,0,0,1,0,0,0
C3L-00011,63,1,0,0,0,1,0
C3L-00026,65,1,0,0,0,1,0
...,...,...,...,...,...,...,...
26OV008,65,1,0,0,0,1,0
26OV009,60,1,0,0,0,1,0
26OV010,57,1,0,0,0,1,0
26OV011,64,1,0,0,0,1,0


In [102]:
clinical.index = [update_sample_id(x) for x in clinical.index]
clinical

Unnamed: 0,clinical_age,clinical_is_female,clinical_predicted_ancestry_is_AFR,clinical_predicted_ancestry_is_AMR,clinical_predicted_ancestry_is_EAS,clinical_predicted_ancestry_is_EUR,clinical_predicted_ancestry_is_SAS
C3L-00908-T,60,1,0,0,0,1,0
C3L-00004-T,72,0,0,0,0,1,0
C3L-00010-T,30,0,0,1,0,0,0
C3L-00011-T,63,1,0,0,0,1,0
C3L-00026-T,65,1,0,0,0,1,0
...,...,...,...,...,...,...,...
26OV008-T,65,1,0,0,0,1,0
26OV009-T,60,1,0,0,0,1,0
26OV010-T,57,1,0,0,0,1,0
26OV011-T,64,1,0,0,0,1,0


###### check ids

In [103]:
sample_to_disease = {s:d for s, d in zip(cnv.index, cnv['disease'])}
disease_to_sample = {d:[] for d in set(cnv['disease'])}
for s, d in sample_to_disease.items():
    disease_to_sample[d].append(s)
    
for d, s_ids in disease_to_sample.items():
    print(d, len(s_ids), s_ids[:5])

GBM 99 ['C3L-00104-T', 'C3L-00365-T', 'C3L-00674-T', 'C3L-00677-T', 'C3L-01040-T']
ccRCC 110 ['C3L-00004-T', 'C3L-00010-T', 'C3L-00011-T', 'C3L-00026-T', 'C3L-00079-T']
BR 122 ['01BR001-T', '01BR008-T', '01BR009-T', '01BR010-T', '01BR015-T']
OV 82 ['01OV007-T', '01OV017-T', '01OV018-T', '01OV023-T', '01OV026-T']
CO 106 ['01CO001-T', '01CO005-T', '01CO006-T', '01CO008-T', '01CO013-T']
PDA 140 ['C3L-00017-T', 'C3L-00102-T', 'C3L-00189-T', 'C3L-00277-T', 'C3L-00401-T']
HNSCC 110 ['C3L-00977-T', 'C3L-00987-T', 'C3L-00994-T', 'C3L-00995-T', 'C3L-00997-T']
UCEC 95 ['C3L-00006-T', 'C3L-00008-T', 'C3L-00032-T', 'C3L-00090-T', 'C3L-00098-T']
LSCC 110 ['C3L-00081-T', 'C3L-00415-T', 'C3L-00445-T', 'C3L-00568-T', 'C3L-00603-T']
LUAD 110 ['11LU013-T', '11LU016-T', '11LU022-T', '11LU035-T', 'C3L-00001-T']


In [104]:
for d, s_ids in disease_to_sample.items():
    print(d, len(set(s_ids).intersection(set(proteome.index))), len(s_ids), proteome.shape[0])

GBM 99 99 1696
ccRCC 110 110 1696
BR 122 122 1696
OV 82 82 1696
CO 96 106 1696
PDA 140 140 1696
HNSCC 110 110 1696
UCEC 95 95 1696
LSCC 108 110 1696
LUAD 109 110 1696


In [105]:
for d, s_ids in disease_to_sample.items():
    print(d, len(set(s_ids).intersection(set(purity.index))), len(s_ids), purity.shape[0])

GBM 99 99 1083
ccRCC 110 110 1083
BR 119 122 1083
OV 82 82 1083
CO 106 106 1083
PDA 140 140 1083
HNSCC 110 110 1083
UCEC 95 95 1083
LSCC 110 110 1083
LUAD 110 110 1083


In [106]:
for d, s_ids in disease_to_sample.items():
    print(d, len(set(s_ids).intersection(set(cnv.index))), len(s_ids), cnv.shape[0])

GBM 99 99 1084
ccRCC 110 110 1084
BR 122 122 1084
OV 82 82 1084
CO 106 106 1084
PDA 140 140 1084
HNSCC 110 110 1084
UCEC 95 95 1084
LSCC 110 110 1084
LUAD 110 110 1084


In [107]:
# somatic mutations
for d, s_ids in disease_to_sample.items():
    print(d, len(set(s_ids).intersection(set(somatic_mutations.index))), len(s_ids), somatic_mutations.shape[0])

GBM 99 99 1066
ccRCC 110 110 1066
BR 119 122 1066
OV 82 82 1066
CO 96 106 1066
PDA 140 140 1066
HNSCC 108 110 1066
UCEC 95 95 1066
LSCC 107 110 1066
LUAD 108 110 1066


In [108]:
for d, s_ids in disease_to_sample.items():
    print(d, len(set(s_ids).intersection(set(germline.index))), len(s_ids), germline.shape[0])

GBM 10 99 106
ccRCC 8 110 106
BR 10 122 106
OV 11 82 106
CO 8 106 106
PDA 16 140 106
HNSCC 12 110 106
UCEC 12 95 106
LSCC 5 110 106
LUAD 6 110 106


In [109]:
for d, s_ids in disease_to_sample.items():
    print(d, len(set(s_ids).intersection(set(expression.index))), len(s_ids), expression.shape[0])

GBM 99 99 1470
ccRCC 110 110 1470
BR 119 122 1470
OV 82 82 1470
CO 106 106 1470
PDA 140 140 1470
HNSCC 110 110 1470
UCEC 95 95 1470
LSCC 110 110 1470
LUAD 110 110 1470


In [110]:
for d, s_ids in disease_to_sample.items():
    print(d, len(set(s_ids).intersection(set(phospho.index))), len(s_ids), phospho.shape[0])

GBM 99 99 1696
ccRCC 110 110 1696
BR 122 122 1696
OV 82 82 1696
CO 96 106 1696
PDA 140 140 1696
HNSCC 110 110 1696
UCEC 95 95 1696
LSCC 108 110 1696
LUAD 109 110 1696


In [111]:
for d, s_ids in disease_to_sample.items():
    print(d, len(set(s_ids).intersection(set(methylation_st.index))), len(s_ids), methylation_st.shape[0])

GBM 69 99 703
ccRCC 104 110 703
BR 0 122 703
OV 0 82 703
CO 0 106 703
PDA 0 140 703
HNSCC 81 110 703
UCEC 79 95 703
LSCC 82 110 703
LUAD 65 110 703


In [112]:
for d, s_ids in disease_to_sample.items():
    print(d, len(set(s_ids).intersection(set(immune_st.index))), len(s_ids), immune_st.shape[0])

GBM 99 99 1088
ccRCC 110 110 1088
BR 119 122 1088
OV 82 82 1088
CO 106 106 1088
PDA 140 140 1088
HNSCC 110 110 1088
UCEC 95 95 1088
LSCC 110 110 1088
LUAD 110 110 1088


In [114]:
for d, s_ids in disease_to_sample.items():
    print(d, len(set(s_ids).intersection(set(clinical.index))), len(s_ids), clinical.shape[0])

GBM 99 99 1922
ccRCC 110 110 1922
BR 114 122 1922
OV 82 82 1922
CO 105 106 1922
PDA 140 140 1922
HNSCC 110 110 1922
UCEC 95 95 1922
LSCC 110 110 1922
LUAD 110 110 1922


###### merge everything together

In [115]:
combined = pd.merge(proteome, somatic_mutations, left_index=True, right_index=True)
combined

Unnamed: 0,AAAS_proteome,AAK1_proteome,AATF_proteome,ABCA1_proteome,ABCA2_proteome,ABCB1_proteome,ABCB11_proteome,ABCC2_proteome,ABCC3_proteome,ABCE1_proteome,...,ZNF750_mutation_is_Nonsense_Mutation,ZNF750_mutation_is_Nonstop_Mutation,ZNF750_mutation_is_RNA,ZNF750_mutation_is_START_CODON_INS,ZNF750_mutation_is_START_CODON_SNP,ZNF750_mutation_is_Silent,ZNF750_mutation_is_Splice_Site,ZNF750_mutation_is_Translation_Start_Site,ZNF750_has_truncating_mutation,ZNF750_has_nonsilent_mutation
01BR001-T,0.35140,-0.00390,1.10105,0.98196,,,,,-3.91029,1.12252,...,0,0,0,0,0,0,0,0,0,0
01BR008-T,0.53624,-0.24927,1.54379,0.85463,,,,,-1.42230,0.51320,...,0,0,0,0,0,0,0,0,0,0
01BR009-T,-0.27902,-0.33482,0.69390,0.78852,,,,,-1.49698,-0.39790,...,0,0,0,0,0,0,0,0,0,0
01BR010-T,0.35560,-0.05965,-0.06194,-2.39742,,,,4.85220,1.09662,-0.41295,...,0,0,0,0,0,0,0,0,0,0
01BR015-T,0.95351,-0.37789,-0.68548,-3.96565,,,,,-5.95178,0.12743,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-01520-T,0.63342,-0.36535,-0.39778,0.15781,,,,-0.21834,-0.35454,0.76313,...,0,0,0,0,0,0,0,0,0,0
C3N-01521-T,0.52322,-0.91119,0.81686,-1.23152,,,,,-1.06780,0.52678,...,0,0,0,0,0,0,0,0,0,0
C3N-01537-T,0.25028,0.07848,1.33838,0.73600,,,,0.74236,-1.43594,0.51753,...,0,0,0,0,0,0,0,0,0,0
C3N-01802-T,0.14344,0.72401,0.64546,1.99615,,,,,-0.33810,0.03415,...,0,0,0,0,0,0,0,0,0,0


In [116]:
combined = pd.merge(combined, germline, left_index=True, right_index=True, how='left')
combined

Unnamed: 0,AAAS_proteome,AAK1_proteome,AATF_proteome,ABCA1_proteome,ABCA2_proteome,ABCB1_proteome,ABCB11_proteome,ABCC2_proteome,ABCC3_proteome,ABCE1_proteome,...,ZRANB1_is_pathogenic_germline,ZRSR2_is_pathogenic_germline,ZSCAN10_is_pathogenic_germline,ZSCAN25_is_pathogenic_germline,ZSCAN32_is_pathogenic_germline,ZW10_is_pathogenic_germline,ZWILCH_is_pathogenic_germline,ZWINT_is_pathogenic_germline,ZXDC_is_pathogenic_germline,ZYX_is_pathogenic_germline
01BR001-T,0.35140,-0.00390,1.10105,0.98196,,,,,-3.91029,1.12252,...,,,,,,,,,,
01BR008-T,0.53624,-0.24927,1.54379,0.85463,,,,,-1.42230,0.51320,...,,,,,,,,,,
01BR009-T,-0.27902,-0.33482,0.69390,0.78852,,,,,-1.49698,-0.39790,...,,,,,,,,,,
01BR010-T,0.35560,-0.05965,-0.06194,-2.39742,,,,4.85220,1.09662,-0.41295,...,,,,,,,,,,
01BR015-T,0.95351,-0.37789,-0.68548,-3.96565,,,,,-5.95178,0.12743,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-04280-T,-0.54060,0.28620,0.56570,-0.29791,,,,,,0.45859,...,,,,,,,,,,
C3N-04282-T,-0.38949,0.11637,0.56524,-1.92610,,,,-1.70998,1.13998,-0.12112,...,,,,,,,,,,
C3N-04283-T,0.38921,-0.15259,-0.18576,1.35119,,,0.25874,0.90227,0.31181,-0.05971,...,,,,,,,,,,
C3N-04284-T,0.76318,0.31274,0.45277,-1.21128,,,,,0.54146,1.09926,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [117]:
combined = pd.merge(combined, purity, left_index=True, right_index=True, how='left')
combined

Unnamed: 0,AAAS_proteome,AAK1_proteome,AATF_proteome,ABCA1_proteome,ABCA2_proteome,ABCB1_proteome,ABCB11_proteome,ABCC2_proteome,ABCC3_proteome,ABCE1_proteome,...,ZRSR2_is_pathogenic_germline,ZSCAN10_is_pathogenic_germline,ZSCAN25_is_pathogenic_germline,ZSCAN32_is_pathogenic_germline,ZW10_is_pathogenic_germline,ZWILCH_is_pathogenic_germline,ZWINT_is_pathogenic_germline,ZXDC_is_pathogenic_germline,ZYX_is_pathogenic_germline,TumorPurity
01BR001-T,0.35140,-0.00390,1.10105,0.98196,,,,,-3.91029,1.12252,...,,,,,,,,,,0.816624
01BR008-T,0.53624,-0.24927,1.54379,0.85463,,,,,-1.42230,0.51320,...,,,,,,,,,,0.510466
01BR009-T,-0.27902,-0.33482,0.69390,0.78852,,,,,-1.49698,-0.39790,...,,,,,,,,,,0.556239
01BR010-T,0.35560,-0.05965,-0.06194,-2.39742,,,,4.85220,1.09662,-0.41295,...,,,,,,,,,,0.747700
01BR015-T,0.95351,-0.37789,-0.68548,-3.96565,,,,,-5.95178,0.12743,...,,,,,,,,,,0.649161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-04280-T,-0.54060,0.28620,0.56570,-0.29791,,,,,,0.45859,...,,,,,,,,,,0.797724
C3N-04282-T,-0.38949,0.11637,0.56524,-1.92610,,,,-1.70998,1.13998,-0.12112,...,,,,,,,,,,0.604708
C3N-04283-T,0.38921,-0.15259,-0.18576,1.35119,,,0.25874,0.90227,0.31181,-0.05971,...,,,,,,,,,,0.661656
C3N-04284-T,0.76318,0.31274,0.45277,-1.21128,,,,,0.54146,1.09926,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.723917


In [118]:
combined = pd.merge(combined, cnv, left_index=True, right_index=True, how='left')
combined

Unnamed: 0,AAAS_proteome,AAK1_proteome,AATF_proteome,ABCA1_proteome,ABCA2_proteome,ABCB1_proteome,ABCB11_proteome,ABCC2_proteome,ABCC3_proteome,ABCE1_proteome,...,ZRANB1_cnv,ZSCAN10_cnv,ZSCAN25_cnv,ZSCAN32_cnv,ZW10_cnv,ZWILCH_cnv,ZWINT_cnv,ZXDC_cnv,ZYX_cnv,disease
01BR001-T,0.35140,-0.00390,1.10105,0.98196,,,,,-3.91029,1.12252,...,-0.00966,-0.19088,0.04711,-0.19088,0.01914,-0.26696,-0.00966,-0.06602,0.05187,BR
01BR008-T,0.53624,-0.24927,1.54379,0.85463,,,,,-1.42230,0.51320,...,0.11550,-0.16619,-0.00576,-0.16619,-0.04312,0.00988,0.11550,-0.09075,-0.00576,BR
01BR009-T,-0.27902,-0.33482,0.69390,0.78852,,,,,-1.49698,-0.39790,...,0.32349,-0.38928,0.67516,-0.38928,-0.15614,-0.15576,-0.14743,0.26289,0.06942,BR
01BR010-T,0.35560,-0.05965,-0.06194,-2.39742,,,,4.85220,1.09662,-0.41295,...,0.00058,-0.04794,-0.03744,-0.04794,0.03900,-0.18501,-0.14666,-0.00744,-0.03744,BR
01BR015-T,0.95351,-0.37789,-0.68548,-3.96565,,,,,-5.95178,0.12743,...,0.13746,0.26465,-0.28490,0.26465,-0.20074,-0.23970,0.13746,0.09962,-0.27216,BR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-04280-T,-0.54060,0.28620,0.56570,-0.29791,,,,,,0.45859,...,-0.02797,-0.02114,-0.03366,-0.02114,-0.02474,-0.03298,-0.02797,-0.03297,-0.03366,HNSCC
C3N-04282-T,-0.38949,0.11637,0.56524,-1.92610,,,,-1.70998,1.13998,-0.12112,...,0.45477,0.00417,0.20035,0.00417,0.16786,0.32189,-0.28072,0.18985,0.20035,PDA
C3N-04283-T,0.38921,-0.15259,-0.18576,1.35119,,,0.25874,0.90227,0.31181,-0.05971,...,-0.15172,0.02498,0.02378,0.02498,0.02317,-0.11829,-0.15172,0.00268,0.02378,PDA
C3N-04284-T,0.76318,0.31274,0.45277,-1.21128,,,,,0.54146,1.09926,...,-0.06958,-0.04134,0.05587,-0.04134,-0.08305,0.00816,-0.06958,-0.00409,0.05587,PDA


In [131]:
# only need to run if adding directly into previously saved dataframe
# combined = combined[[c for c in combined.columns if '_expression' not in c]]

In [119]:
combined = pd.merge(combined, expression, left_index=True, right_index=True, how='left')
combined

Unnamed: 0,AAAS_proteome,AAK1_proteome,AATF_proteome,ABCA1_proteome,ABCA2_proteome,ABCB1_proteome,ABCB11_proteome,ABCC2_proteome,ABCC3_proteome,ABCE1_proteome,...,ZWINT_expression,ZXDA_expression,ZXDB_expression,ZXDC_expression,ZYG11A_expression,ZYG11B_expression,ZYX_expression,ZZEF1_expression,ZZZ3_expression,pk_expression
01BR001-T,0.35140,-0.00390,1.10105,0.98196,,,,,-3.91029,1.12252,...,11.757262,10.134319,11.362794,11.461471,10.430631,11.700726,13.320117,11.281892,11.415130,11.518754
01BR008-T,0.53624,-0.24927,1.54379,0.85463,,,,,-1.42230,0.51320,...,14.030484,9.176603,10.521493,11.695101,11.566098,10.635637,13.760682,11.407163,11.716684,10.209459
01BR009-T,-0.27902,-0.33482,0.69390,0.78852,,,,,-1.49698,-0.39790,...,12.447119,9.611568,10.434179,12.030646,7.766814,11.253443,13.427215,11.866759,11.416178,10.698735
01BR010-T,0.35560,-0.05965,-0.06194,-2.39742,,,,4.85220,1.09662,-0.41295,...,12.496828,9.760693,10.977785,11.414167,11.251199,11.315866,14.469199,11.537793,11.505077,10.341984
01BR015-T,0.95351,-0.37789,-0.68548,-3.96565,,,,,-5.95178,0.12743,...,13.132139,10.336607,11.524852,11.805481,9.179496,11.984678,13.306251,11.617746,12.390759,11.342688
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-04280-T,-0.54060,0.28620,0.56570,-0.29791,,,,,,0.45859,...,12.262253,9.877220,9.807525,11.786973,7.991356,12.073526,12.863836,12.216953,12.173269,13.029091
C3N-04282-T,-0.38949,0.11637,0.56524,-1.92610,,,,-1.70998,1.13998,-0.12112,...,11.298640,11.137093,11.290063,12.054070,6.366129,12.511467,12.954658,11.765840,12.684628,11.730873
C3N-04283-T,0.38921,-0.15259,-0.18576,1.35119,,,0.25874,0.90227,0.31181,-0.05971,...,12.329715,10.966991,11.735180,11.695404,6.729858,12.600089,12.528603,11.462524,12.674957,12.165704
C3N-04284-T,0.76318,0.31274,0.45277,-1.21128,,,,,0.54146,1.09926,...,11.254853,11.166731,11.488598,12.329690,6.564357,12.282929,12.292070,12.580099,12.646604,11.792286


In [120]:
combined = pd.merge(combined, methylation_st, left_index=True, right_index=True, how='left')
combined

Unnamed: 0,AAAS_proteome,AAK1_proteome,AATF_proteome,ABCA1_proteome,ABCA2_proteome,ABCB1_proteome,ABCB11_proteome,ABCC2_proteome,ABCC3_proteome,ABCE1_proteome,...,ZYX_expression,ZZEF1_expression,ZZZ3_expression,pk_expression,methylation_subtype_1,methylation_subtype_2,methylation_subtype_3,methylation_subtype_4,methylation_subtype_5,methylation_subtype_6
01BR001-T,0.35140,-0.00390,1.10105,0.98196,,,,,-3.91029,1.12252,...,13.320117,11.281892,11.415130,11.518754,,,,,,
01BR008-T,0.53624,-0.24927,1.54379,0.85463,,,,,-1.42230,0.51320,...,13.760682,11.407163,11.716684,10.209459,,,,,,
01BR009-T,-0.27902,-0.33482,0.69390,0.78852,,,,,-1.49698,-0.39790,...,13.427215,11.866759,11.416178,10.698735,,,,,,
01BR010-T,0.35560,-0.05965,-0.06194,-2.39742,,,,4.85220,1.09662,-0.41295,...,14.469199,11.537793,11.505077,10.341984,,,,,,
01BR015-T,0.95351,-0.37789,-0.68548,-3.96565,,,,,-5.95178,0.12743,...,13.306251,11.617746,12.390759,11.342688,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-04280-T,-0.54060,0.28620,0.56570,-0.29791,,,,,,0.45859,...,12.863836,12.216953,12.173269,13.029091,0.0,1.0,0.0,0.0,0.0,0.0
C3N-04282-T,-0.38949,0.11637,0.56524,-1.92610,,,,-1.70998,1.13998,-0.12112,...,12.954658,11.765840,12.684628,11.730873,,,,,,
C3N-04283-T,0.38921,-0.15259,-0.18576,1.35119,,,0.25874,0.90227,0.31181,-0.05971,...,12.528603,11.462524,12.674957,12.165704,,,,,,
C3N-04284-T,0.76318,0.31274,0.45277,-1.21128,,,,,0.54146,1.09926,...,12.292070,12.580099,12.646604,11.792286,,,,,,


In [121]:
combined = pd.merge(combined, immune_st, left_index=True, right_index=True, how='left')
combined

Unnamed: 0,AAAS_proteome,AAK1_proteome,AATF_proteome,ABCA1_proteome,ABCA2_proteome,ABCB1_proteome,ABCB11_proteome,ABCC2_proteome,ABCC3_proteome,ABCE1_proteome,...,methylation_subtype_2,methylation_subtype_3,methylation_subtype_4,methylation_subtype_5,methylation_subtype_6,immune_subtype_1,immune_subtype_2,immune_subtype_3,immune_subtype_4,immune_subtype_5
01BR001-T,0.35140,-0.00390,1.10105,0.98196,,,,,-3.91029,1.12252,...,,,,,,1.0,0.0,0.0,0.0,0.0
01BR008-T,0.53624,-0.24927,1.54379,0.85463,,,,,-1.42230,0.51320,...,,,,,,0.0,1.0,0.0,0.0,0.0
01BR009-T,-0.27902,-0.33482,0.69390,0.78852,,,,,-1.49698,-0.39790,...,,,,,,0.0,0.0,1.0,0.0,0.0
01BR010-T,0.35560,-0.05965,-0.06194,-2.39742,,,,4.85220,1.09662,-0.41295,...,,,,,,0.0,0.0,1.0,0.0,0.0
01BR015-T,0.95351,-0.37789,-0.68548,-3.96565,,,,,-5.95178,0.12743,...,,,,,,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-04280-T,-0.54060,0.28620,0.56570,-0.29791,,,,,,0.45859,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
C3N-04282-T,-0.38949,0.11637,0.56524,-1.92610,,,,-1.70998,1.13998,-0.12112,...,,,,,,0.0,1.0,0.0,0.0,0.0
C3N-04283-T,0.38921,-0.15259,-0.18576,1.35119,,,0.25874,0.90227,0.31181,-0.05971,...,,,,,,1.0,0.0,0.0,0.0,0.0
C3N-04284-T,0.76318,0.31274,0.45277,-1.21128,,,,,0.54146,1.09926,...,,,,,,0.0,1.0,0.0,0.0,0.0


In [122]:
combined = pd.merge(combined, clinical, left_index=True, right_index=True, how='left')
combined

Unnamed: 0,AAAS_proteome,AAK1_proteome,AATF_proteome,ABCA1_proteome,ABCA2_proteome,ABCB1_proteome,ABCB11_proteome,ABCC2_proteome,ABCC3_proteome,ABCE1_proteome,...,immune_subtype_3,immune_subtype_4,immune_subtype_5,clinical_age,clinical_is_female,clinical_predicted_ancestry_is_AFR,clinical_predicted_ancestry_is_AMR,clinical_predicted_ancestry_is_EAS,clinical_predicted_ancestry_is_EUR,clinical_predicted_ancestry_is_SAS
01BR001-T,0.35140,-0.00390,1.10105,0.98196,,,,,-3.91029,1.12252,...,0.0,0.0,0.0,55,1.0,1.0,0.0,0.0,0.0,0.0
01BR008-T,0.53624,-0.24927,1.54379,0.85463,,,,,-1.42230,0.51320,...,0.0,0.0,0.0,,,,,,,
01BR009-T,-0.27902,-0.33482,0.69390,0.78852,,,,,-1.49698,-0.39790,...,1.0,0.0,0.0,,,,,,,
01BR010-T,0.35560,-0.05965,-0.06194,-2.39742,,,,4.85220,1.09662,-0.41295,...,1.0,0.0,0.0,,,,,,,
01BR015-T,0.95351,-0.37789,-0.68548,-3.96565,,,,,-5.95178,0.12743,...,1.0,0.0,0.0,35,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-04280-T,-0.54060,0.28620,0.56570,-0.29791,,,,,,0.45859,...,0.0,0.0,0.0,65,0.0,0.0,0.0,0.0,1.0,0.0
C3N-04282-T,-0.38949,0.11637,0.56524,-1.92610,,,,-1.70998,1.13998,-0.12112,...,0.0,0.0,0.0,75,0.0,0.0,0.0,0.0,0.0,0.0
C3N-04283-T,0.38921,-0.15259,-0.18576,1.35119,,,0.25874,0.90227,0.31181,-0.05971,...,0.0,0.0,0.0,66,1.0,0.0,0.0,0.0,0.0,0.0
C3N-04284-T,0.76318,0.31274,0.45277,-1.21128,,,,,0.54146,1.09926,...,0.0,0.0,0.0,80,1.0,0.0,0.0,0.0,0.0,0.0


In [123]:
# only need to run if replacing in already created dataframe
# combined = combined[[c for c in combined.columns if '_phospho' not in c]]

In [124]:
combined = pd.merge(combined, phospho, left_index=True, right_index=True, how='left')
combined

Unnamed: 0,AAAS_proteome,AAK1_proteome,AATF_proteome,ABCA1_proteome,ABCA2_proteome,ABCB1_proteome,ABCB11_proteome,ABCC2_proteome,ABCC3_proteome,ABCE1_proteome,...,TOMM34_S186|TOMM34|NP_006800.2_S186s_1_1_186_186_phospho,XYLB_S354|XYLB|NP_001336107.1_S354s_1_1_354_354_phospho,NARS1_S88|NARS1|NP_004530.1_S88s_1_1_88_88_phospho,GPR65_S324|GPR65|NP_003599.2_S324s_1_1_324_324_phospho,DBI_S2|DBI|NP_001073331.1_S2s_1_1_2_2_phospho,SNRPD3_S2|SNRPD3|NP_001265585.1_S2s_1_1_2_2_phospho,RAB28_S207|RAB28|NP_001017979.1_S207s_1_1_207_207_phospho,FBXL7_S105|FBXL7|NP_036436.1_S105s_1_1_105_105_phospho,MAP3K14_S410|MAP3K14|NP_003945.2_S410s_1_1_410_410_phospho,UGGT2_S952|UGGT2|NP_064506.3_S952s_1_1_952_952_phospho
01BR001-T,0.35140,-0.00390,1.10105,0.98196,,,,,-3.91029,1.12252,...,-0.21957,-2.064230,-1.22276,-0.40442,0.54564,0.875650,3.109490,-3.34221,-2.561220,-0.143310
01BR008-T,0.53624,-0.24927,1.54379,0.85463,,,,,-1.42230,0.51320,...,0.42100,-1.774430,-1.45275,2.80914,-2.57344,-0.250530,0.409140,-1.38308,-0.397280,1.701790
01BR009-T,-0.27902,-0.33482,0.69390,0.78852,,,,,-1.49698,-0.39790,...,-0.41210,-1.341260,-2.09601,-0.53712,1.80276,0.476930,1.350520,-1.68391,-1.756450,2.011120
01BR010-T,0.35560,-0.05965,-0.06194,-2.39742,,,,4.85220,1.09662,-0.41295,...,-0.20548,-0.859580,-1.73799,-0.80625,-0.82664,0.986640,0.330970,1.13409,-0.707344,1.794460
01BR015-T,0.95351,-0.37789,-0.68548,-3.96565,,,,,-5.95178,0.12743,...,-0.10312,-1.371280,-1.56777,-0.28762,-0.17280,0.671700,0.437580,-1.53015,-1.262580,-2.419064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-04280-T,-0.54060,0.28620,0.56570,-0.29791,,,,,,0.45859,...,0.83508,-1.278484,-0.66452,-1.30025,0.42640,0.582466,1.899430,-0.80850,1.863990,-0.550450
C3N-04282-T,-0.38949,0.11637,0.56524,-1.92610,,,,-1.70998,1.13998,-0.12112,...,-1.51760,0.610720,1.00393,-1.01017,-0.07182,-0.015610,-0.499620,-2.86815,0.521480,-0.113980
C3N-04283-T,0.38921,-0.15259,-0.18576,1.35119,,,0.25874,0.90227,0.31181,-0.05971,...,-1.09135,0.213746,-0.55999,-2.71406,-0.63692,0.084090,3.855510,-1.65134,0.883820,-0.814058
C3N-04284-T,0.76318,0.31274,0.45277,-1.21128,,,,,0.54146,1.09926,...,-0.26326,0.822460,1.72946,-1.00419,-1.34322,0.639012,-2.874184,-0.22867,-0.249884,-2.721020


In [125]:
from collections import Counter
Counter(combined['disease'])

Counter({'BR': 120,
         'CO': 96,
         'OV': 82,
         'LUAD': 108,
         'ccRCC': 111,
         'UCEC': 96,
         'PDA': 141,
         'LSCC': 108,
         'GBM': 100,
         'HNSCC': 108})

In [126]:
combined.to_csv('../data/formatted/aggregated_01282022.txt.gz', sep='\t', )

###### sandbox

In [4]:
combined = pd.read_csv('../data/aggregated_08012021.txt.gz', sep='\t', index_col=0)
combined

Columns (14819,158933) have mixed types.Specify dtype option on import or set low_memory=False.


In [25]:
[c for c in combined.columns if 'methylation' in c]

['methylation_subtype_1',
 'methylation_subtype_2',
 'methylation_subtype_3',
 'methylation_subtype_4',
 'methylation_subtype_5',
 'methylation_subtype_6']

In [31]:
f = combined[[c for c in combined.columns if 'methylation' in c]]
f

Unnamed: 0_level_0,methylation_subtype_1,methylation_subtype_2,methylation_subtype_3,methylation_subtype_4,methylation_subtype_5,methylation_subtype_6
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
01BR001-T,,,,,,
01BR008-T,,,,,,
01BR009-T,,,,,,
01BR010-T,,,,,,
01BR015-T,,,,,,
...,...,...,...,...,...,...
TumorOnlyIR01-T,,,,,,
TumorOnlyIR03-T,,,,,,
TumorOnlyIR14-T,,,,,,
TumorOnlyIR21-T,,,,,,


In [33]:
f.loc['C3L-00001-T']

methylation_subtype_1    1.0
methylation_subtype_2    0.0
methylation_subtype_3    0.0
methylation_subtype_4    0.0
methylation_subtype_5    0.0
methylation_subtype_6    0.0
Name: C3L-00001-T, dtype: float64