In [1]:
import os

import pandas as pd
import numpy as np

import seaborn as sns
from sklearn.impute import SimpleImputer
import mgitools.os_helpers as os_helpers

In [2]:
# !pip install git+https://github.com/estorrs/mgitools
# !pip install sklearn

In [3]:
CANCER_TYPE_MAP = {
    'CPTAC2_BRCA_prospective': 'BRCA_prospective',
    'TCGA_BRCA_retrospective': 'BRCA_retrospective',
    'BR': 'BRCA_prospective',
    'BRCA': 'BRCA_prospective',
    'CPTAC2_CRC_prospective': 'CRC_prospective',
    'CO': 'CRC_prospective',
    'COADREAD': 'CRC_prospective',
    'CPTAC2_OV_prospective': 'OV_prospective',
    'TCGA_OV_retrospective': 'OV_retrospective',
    'OV': 'OV_prospective',
    'CPTAC3_CCRCC_discovery': 'CCRCC',
    'KIRC': 'CCRCC',
    'CPTAC3_GBM_discovery': 'GBM',
    'CPTAC3_HNSCC_discovery': 'HNSCC',
    'HNSC': 'HNSCC',
    'CPTAC3_LSCC_discovery': 'LSCC',
    'LUSC': 'LSCC',
    'CPTAC3_LUAD_discovery': 'LUAD',
    'CPTAC3_UCEC_discovery': 'UCEC'
}

## read in driver genes

In [4]:
genes = pd.read_csv('/data/driver_genes/bailey_smg_table.tsv', sep='\t')
genes

Unnamed: 0,Gene,Cancer
0,ABL1,PANCAN
1,ACVR1,UCEC
2,ACVR1B,PANCAN
3,ACVR2A,COADREAD
4,ACVR2A,LIHC
...,...,...
734,ZMYM3,PRAD
735,ZNF133,OV
736,ZNF750,PANCAN
737,ZNF750,ESCA


In [5]:
sorted(set(genes['Cancer']))

['ACC',
 'BLCA',
 'BRCA',
 'CESC',
 'CHOL',
 'COADREAD',
 'DLBC',
 'ESCA',
 'GBM',
 'HNSC',
 'KICH',
 'KIRC',
 'KIRP',
 'LAML',
 'LGG',
 'LIHC',
 'LUAD',
 'LUSC',
 'MESO',
 'OV',
 'PAAD',
 'PANCAN',
 'PCPG',
 'PRAD',
 'SARC',
 'SKCM',
 'STAD',
 'TGCT',
 'THCA',
 'THYM',
 'UCEC',
 'UCS',
 'UVM']

In [6]:
drivers = pd.read_csv('/data/driver_genes/199_driver_list.txt', sep='\t')
drivers

Unnamed: 0,Gene,Tumor suppressor or oncogene prediction (by 20/20+)
0,PHF6,possible tsg
1,ABL1,
2,ALK,
3,AR,
4,ARAF,
...,...,...
183,KMT2A,tsg
184,KMT2B,tsg
185,MAX,oncogene
186,MED12,oncogene


In [7]:
cancer_to_driver_genes = {CANCER_TYPE_MAP.get(c, c):genes[genes['Cancer']==c].set_index('Gene')
                          for c in sorted(set(genes['Cancer']))}
cancer_to_driver_genes.keys()

dict_keys(['ACC', 'BLCA', 'BRCA_prospective', 'CESC', 'CHOL', 'CRC_prospective', 'DLBC', 'ESCA', 'GBM', 'HNSCC', 'KICH', 'CCRCC', 'KIRP', 'LAML', 'LGG', 'LIHC', 'LUAD', 'LSCC', 'MESO', 'OV_prospective', 'PAAD', 'PANCAN', 'PCPG', 'PRAD', 'SARC', 'SKCM', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM'])

In [8]:
cancer_to_driver_genes['LSCC']

Unnamed: 0_level_0,Cancer
Gene,Unnamed: 1_level_1
ARHGAP35,LUSC
ARID1A,LUSC
CDKN2A,LUSC
CUL3,LUSC
EP300,LUSC
FAT1,LUSC
FBXW7,LUSC
FGFR2,LUSC
HLA-A,LUSC
HRAS,LUSC


In [9]:
# ucec_genes = genes[[True if c in ['UCEC'] else False for c in genes['Cancer']]]
# ucec_genes = pd.merge(drivers, ucec_genes, left_on='Gene', right_on='Gene')
# ucec_genes = ucec_genes.set_index('Gene')
# ucec_genes.index.name = 'gene'
# ucec_genes

In [10]:
# ucec_genes.to_csv('/data/driver_genes/ucec_drivers.tsv', sep='\t')

In [11]:
# drivers.to_csv('/data/driver_genes/expanded_drivers.tsv', sep='\t', index=False)

## read in somatic mutations

In [12]:
mutation_dir = '/data/somtatic_mutations/'
mutation_fps = sorted(os_helpers.listfiles(mutation_dir, regex=r'.maf$'))
mutation_fps

['/data/somtatic_mutations/BR.maf',
 '/data/somtatic_mutations/CCRCC.maf',
 '/data/somtatic_mutations/CO.maf',
 '/data/somtatic_mutations/GBM.maf',
 '/data/somtatic_mutations/HNSCC.maf',
 '/data/somtatic_mutations/LSCC.maf',
 '/data/somtatic_mutations/LUAD.maf',
 '/data/somtatic_mutations/OV.maf',
 '/data/somtatic_mutations/UCEC.maf']

In [13]:
cancer_to_maf = {}
for fp in mutation_fps:
    cancer = fp.split('/')[-1].split('.')[0]
    mutation_df = pd.read_csv(fp, sep='\t')
    mutation_df['sample_id'] = [x.split('_')[0] for x in mutation_df['Tumor_Sample_Barcode']]
    mutation_df = mutation_df[mutation_df['Variant_Classification']!='Silent']
    cancer_to_maf[CANCER_TYPE_MAP.get(cancer, cancer)] = mutation_df
cancer_to_maf.keys()

  interactivity=interactivity, compiler=compiler, result=result)


dict_keys(['BRCA_prospective', 'CCRCC', 'CRC_prospective', 'GBM', 'HNSCC', 'LSCC', 'LUAD', 'OV_prospective', 'UCEC'])

## read in expression

In [14]:
expression_dir = '/data/expression'
expression_fps = sorted(os_helpers.listfiles(expression_dir, regex=r'fpkm.tsv$'))
expression_fps

['/data/expression/AML_fpkm.tsv',
 '/data/expression/BR_fpkm.tsv',
 '/data/expression/CCRCC_fpkm.tsv',
 '/data/expression/CM_fpkm.tsv',
 '/data/expression/CO_fpkm.tsv',
 '/data/expression/EGAD-PDAC_fpkm.tsv',
 '/data/expression/GBM_fpkm.tsv',
 '/data/expression/HNSCC_fpkm.tsv',
 '/data/expression/LSCC_fpkm.tsv',
 '/data/expression/LUAD_fpkm.tsv',
 '/data/expression/OV_fpkm.tsv',
 '/data/expression/PDA_fpkm.tsv',
 '/data/expression/SAR_fpkm.tsv',
 '/data/expression/UCEC_fpkm.tsv',
 '/data/expression/eocg_fpkm.tsv',
 '/data/expression/hipsci_managed_fpkm.tsv',
 '/data/expression/hipsci_public_fpkm.tsv']

In [15]:
cancer_type_to_expression = {}
for fp in expression_fps:
    cancer = fp.split('/')[-1].replace('_fpkm.tsv', '')
    
    cancer_type_to_expression[CANCER_TYPE_MAP.get(cancer, cancer)] = pd.read_csv(fp, sep='\t', index_col='gene')
cancer_type_to_expression.keys()

dict_keys(['AML', 'BRCA_prospective', 'CCRCC', 'CM', 'CRC_prospective', 'EGAD-PDAC', 'GBM', 'HNSCC', 'LSCC', 'LUAD', 'OV_prospective', 'PDA', 'SAR', 'UCEC', 'eocg', 'hipsci_managed', 'hipsci_public'])

## read in raw protein data

In [16]:
fps = sorted(os_helpers.listfiles('/data/v2.0/', regex=r'.gct'))
fps

['/data/v2.0/CPTAC2_BRCA_prospective/CPTAC2_BRCA_prospective_broad_acetylome.v2.0.gct',
 '/data/v2.0/CPTAC2_BRCA_prospective/CPTAC2_BRCA_prospective_broad_phosphoproteome.v2.0.gct',
 '/data/v2.0/CPTAC2_BRCA_prospective/CPTAC2_BRCA_prospective_broad_proteome.v2.0.gct',
 '/data/v2.0/CPTAC2_CRC_prospective/CPTAC2_CRC_prospective_pnnl_phosphoproteome.v2.0.gct',
 '/data/v2.0/CPTAC2_CRC_prospective/CPTAC2_CRC_prospective_pnnl_proteome.v2.0.gct',
 '/data/v2.0/CPTAC2_OV_prospective/CPTAC2_OV_prospective_pnnl_phosphoproteome.v2.0.gct',
 '/data/v2.0/CPTAC2_OV_prospective/CPTAC2_OV_prospective_pnnl_proteome.v2.0.gct',
 '/data/v2.0/CPTAC3_CCRCC_discovery/CPTAC3_CCRCC_discovery_umich_phosphoproteome.v2.0.gct',
 '/data/v2.0/CPTAC3_CCRCC_discovery/CPTAC3_CCRCC_discovery_umich_proteome.v2.0.gct',
 '/data/v2.0/CPTAC3_GBM_discovery/CPTAC3_GBM_discovery_pnnl_acetylome.v2.0.gct',
 '/data/v2.0/CPTAC3_GBM_discovery/CPTAC3_GBM_discovery_pnnl_phosphoproteome.v2.0.gct',
 '/data/v2.0/CPTAC3_GBM_discovery/CPTAC3

In [17]:
# f = open('/data/v2.0/CPTAC3_UCEC_discovery/CPTAC3_UCEC_discovery_pnnl_phosphoproteome.v2.0.gct')
# for i, line in enumerate(f):
#     print(line)
#     if i % 20 == 19:
#         break
# f.close()

In [18]:
def convert_gct_to_df(fp):
    f = open(fp)
    f.readline()
    _, n_samples, n_feature_metadata, n_sample_metadata = [int(x) for x in f.readline().strip().split('\t')]
    df = pd.read_csv(f, sep='\t')
    df.columns = [c[1:] if c[0]=='X' else c for c in df.columns]
    
    ## do some renaming and split metadata into different table
    ids = df['id'].to_list()[:n_sample_metadata - 1]
    metadata = df.iloc[:n_sample_metadata - 1, -n_samples:].copy()
    metadata.index = ids
    metadata = metadata.transpose()
    
    
    df = df.iloc[n_sample_metadata:, :]
#     print(df.shape)
    df = df.set_index('id')
    df.index.name = 'site_id'
    
    return df, metadata


def change_sample_ids(df, metadata):
    site_cols = list(df.columns[:-metadata.shape[0]])
    ids = []
    for s_id in df.columns[-metadata.shape[0]:]:
        case = metadata.loc[s_id, 'case_id']
        case = case.replace('.', '-')
        identifier = '.T' if 'Tumor' in metadata.loc[s_id, 'sample_type'] else '.N'
        new = case + identifier
        ids.append(new)
    site_cols += ids
    df.columns = site_cols

    ids = []
    for s_id in metadata.index:
        case = metadata.loc[s_id, 'case_id']
        case = case.replace('.', '-')
        identifier = '.T' if 'Tumor' in metadata.loc[s_id, 'sample_type'] else '.N'
        new = case + identifier
        ids.append(new)
    metadata.index = ids
    
    return df, metadata

def get_cancer_to_dfs(fps):
    d = {}
    for fp in fps:
        cancer = fp.split('/')[3]
        cancer = CANCER_TYPE_MAP.get(cancer, cancer)
        filetype = fp.split('.v2.0')[-2].split('_')[-1]
        print(cancer, filetype)
        
        if cancer not in d:
            d[cancer] = {
                k:{} for k in ['acetylome', 'phosphoproteome', 'acetylome', 'ubiquitylome']
            }
        
        df, metadata = convert_gct_to_df(fp)
        
        ## make sample_type if not already in dataframe
        if cancer in ['LSCC', 'LUAD']:
            metadata['sample_type'] = ['Normal' if s[-2:]=='.N' else 'Tumor' for s in metadata.index]
            
        ## fix identifiers
        if cancer in ['LSCC', 'LUAD', 'CCRCC']:
            df, metadata = change_sample_ids(df, metadata)
        
        
        d[cancer][filetype] = {
            'metadata': metadata,
            'data': df
        }
    ## filter out empty datatypes
#     d = {for cancer, dicts in d.items() for dtypes, m in dicts.items() if len(m)}
    for cancer in list(d.keys()):
        for dtypes in list(d[cancer].keys()):
            if not len(d[cancer][dtypes]):
                d[cancer].pop(dtypes)
            
    return d

In [19]:
cancer_type_to_expression.keys()

dict_keys(['AML', 'BRCA_prospective', 'CCRCC', 'CM', 'CRC_prospective', 'EGAD-PDAC', 'GBM', 'HNSCC', 'LSCC', 'LUAD', 'OV_prospective', 'PDA', 'SAR', 'UCEC', 'eocg', 'hipsci_managed', 'hipsci_public'])

In [20]:
cancer_to_protein = get_cancer_to_dfs(fps)
cancer_to_protein.keys()

BRCA_prospective acetylome


  exec(code_obj, self.user_global_ns, self.user_ns)


BRCA_prospective phosphoproteome
BRCA_prospective proteome


  exec(code_obj, self.user_global_ns, self.user_ns)


CRC_prospective phosphoproteome


  exec(code_obj, self.user_global_ns, self.user_ns)


CRC_prospective proteome


  exec(code_obj, self.user_global_ns, self.user_ns)


OV_prospective phosphoproteome


  exec(code_obj, self.user_global_ns, self.user_ns)


OV_prospective proteome


  exec(code_obj, self.user_global_ns, self.user_ns)


CCRCC phosphoproteome


  exec(code_obj, self.user_global_ns, self.user_ns)


CCRCC proteome


  exec(code_obj, self.user_global_ns, self.user_ns)


GBM acetylome


  exec(code_obj, self.user_global_ns, self.user_ns)


GBM phosphoproteome
GBM proteome


  exec(code_obj, self.user_global_ns, self.user_ns)


HNSCC phosphoproteome


  exec(code_obj, self.user_global_ns, self.user_ns)


HNSCC proteome


  exec(code_obj, self.user_global_ns, self.user_ns)


LSCC acetylome


  exec(code_obj, self.user_global_ns, self.user_ns)


LSCC phosphoproteome
LSCC proteome


  exec(code_obj, self.user_global_ns, self.user_ns)


LSCC ubiquitylome


  exec(code_obj, self.user_global_ns, self.user_ns)


LUAD acetylome


  exec(code_obj, self.user_global_ns, self.user_ns)


LUAD phosphoproteome
LUAD proteome


  exec(code_obj, self.user_global_ns, self.user_ns)


UCEC acetylome


  exec(code_obj, self.user_global_ns, self.user_ns)


UCEC phosphoproteome
UCEC proteome


  exec(code_obj, self.user_global_ns, self.user_ns)


BRCA_retrospective phosphoproteome


  exec(code_obj, self.user_global_ns, self.user_ns)


BRCA_retrospective proteome


  exec(code_obj, self.user_global_ns, self.user_ns)


OV_retrospective phosphoproteome
OV_retrospective proteome


dict_keys(['BRCA_prospective', 'CRC_prospective', 'OV_prospective', 'CCRCC', 'GBM', 'HNSCC', 'LSCC', 'LUAD', 'UCEC', 'BRCA_retrospective', 'OV_retrospective'])

In [21]:
for k, d in cancer_to_protein.items():
    print(k, list(d.keys()))

BRCA_prospective ['acetylome', 'phosphoproteome', 'proteome']
CRC_prospective ['phosphoproteome', 'proteome']
OV_prospective ['phosphoproteome', 'proteome']
CCRCC ['phosphoproteome', 'proteome']
GBM ['acetylome', 'phosphoproteome', 'proteome']
HNSCC ['phosphoproteome', 'proteome']
LSCC ['acetylome', 'phosphoproteome', 'ubiquitylome', 'proteome']
LUAD ['acetylome', 'phosphoproteome', 'proteome']
UCEC ['acetylome', 'phosphoproteome', 'proteome']
BRCA_retrospective ['phosphoproteome', 'proteome']
OV_retrospective ['phosphoproteome', 'proteome']


In [22]:
cancer_to_protein['CCRCC']['phosphoproteome']['data']

Unnamed: 0_level_0,original_id,refseq_prot_id,symbol,phosphosites,peptide,peptide_start,peptide_end,refseq_tx_id,uniparc_id,hgnc_id,...,C3N-01646.T,C3N-01646.N,C3N-01648.T,C3N-01648.N,C3N-01649.T,C3N-01649.N,C3N-01651.T,C3N-01651.N,C3N-01808.T,C3N-01808.N
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG:NP_570602.2:S302,NP_570602.2_291_317_1_1_S302,NP_570602.2,A1BG,S302,LHDNQNGWSGDSAPVELILsDETLPAPEFSPEPESGR,283,319,NM_130786,UPI0000167B10,HGNC:5,...,,,,,,,,,,
A1BG:NP_570602.2:T171,NP_570602.2_171_184_1_1_T171,NP_570602.2,A1BG,T171,EGDHEFLEVPEAQEDVEAtFPVHQPGNYSCSYR,153,185,NM_130786,UPI0000167B10,HGNC:5,...,,,,,,,,,,
A1BG:NP_570602.2:T171;Y180;Y184,NP_570602.2_171_184_3_3_T171Y180Y184,NP_570602.2,A1BG,T171;Y180;Y184,REGDHEFLEVPEAQEDVEAtFPVHQPGNySCSyR,152,185,NM_130786,UPI0000167B10,HGNC:5,...,,,,,,,,,,
A1CF:NP_001185747.1:T491,NP_001185747.1_476_491_1_1_T491,NP_001185747.1,A1CF,T491,ITIPALASQNPAIHPFtPPK,475,494,NM_001198818,UPI000006EC73,HGNC:24086,...,,,-0.69413,0.117396,-0.633383,0.076533,0.467653,0.480063,0.464454,0.643927
A2M:NP_000005.2:S710,NP_000005.2_708_710_1_1_S710,NP_000005.2,A2M,S710,VGFYEsDVMGR,705,715,NM_000014,UPI0000155718,HGNC:7,...,0.888674,-0.7553,,,,,0.407549,-0.73731,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZZZ3:NP_056349.1:S468,NP_056349.1_468_468_1_1_S468,NP_056349.1,ZZZ3,S468,LNIGHLPsAK,461,470,NM_015534,UPI0000074256,HGNC:24523,...,,,,,-0.721719,0.372557,,,0.097481,0.241512
ZZZ3:NP_056349.1:S777,NP_056349.1_763_786_1_1_S777,NP_056349.1,ZZZ3,S777,SCFHSHMNTAVEDAsDDESIPIMYR,763,787,NM_015534,UPI0000074256,HGNC:24523,...,,,,,,,,,,
ZZZ3:NP_056349.1:S89,NP_056349.1_89_91_1_1_S89,NP_056349.1,ZZZ3,S89,RGLsSSEK,86,93,NM_015534,UPI0000074256,HGNC:24523,...,,,,,,,,,,
ZZZ3:NP_056349.1:Y399,NP_056349.1_396_434_1_1_Y399,NP_056349.1,ZZZ3,Y399,NSSPyRENGQFEENNLSPNETNATVSDNVSQSPTNPGEISQNEK,395,438,NM_015534,UPI0000074256,HGNC:24523,...,,,,,,,,,,


In [23]:
# cancer_to_protein['LSCC']['phosphoproteome']['metadata']

In [24]:


def filter_sites(df, metadata_df):
    ## filter sites with excessive missing NA
    sample_df = df[metadata_df.index]
    usable_count = [np.count_nonzero(~pd.isnull(ls)) for ls in sample_df.values]
#     sns.distplot(np.asarray(usable_count)/sample_df.shape[1])
    mask = [True if c / sample_df.shape[1] >= .75 else False for c in usable_count]
    df = df[mask]
    sample_df = sample_df[mask]
    
    sample_df = sample_df.transpose()
    
    ## impute nans
    sample_df = pd.DataFrame(data=SimpleImputer(strategy='median').fit_transform(sample_df.values),
                       index=sample_df.index, columns=sample_df.columns)
    
    return pd.concat((df.iloc[:, :-metadata_df.shape[0]], sample_df.transpose()), axis=1)
    

    
    

In [25]:
for cancer, dicts in cancer_to_protein.items():
    for dtype, d in dicts.items():
        print(cancer, dtype)
        d['processed_data'] = filter_sites(d['data'], d['metadata'])
        print(d['data'].shape, d['processed_data'].shape)

BRCA_prospective acetylome
(18328, 143) (3800, 143)
BRCA_prospective phosphoproteome
(63330, 143) (19502, 143)
BRCA_prospective proteome
(9764, 133) (8890, 133)
CRC_prospective phosphoproteome
(41891, 220) (5851, 220)
CRC_prospective proteome
(7402, 201) (5940, 201)
OV_prospective phosphoproteome
(48571, 126) (2422, 126)
OV_prospective proteome
(10095, 108) (8624, 108)
CCRCC phosphoproteome
(81780, 215) (15255, 215)
CCRCC proteome
(11355, 199) (8442, 199)
GBM acetylome
(18767, 130) (3388, 130)
GBM phosphoproteome
(56292, 130) (19501, 130)
GBM proteome
(11141, 113) (9799, 113)
HNSCC phosphoproteome
(66577, 207) (13008, 207)
HNSCC proteome
(11744, 189) (8845, 189)
LSCC acetylome
(15056, 228) (3047, 228)
LSCC phosphoproteome
(68544, 228) (23790, 228)
LSCC ubiquitylome
(25430, 168) (6827, 168)
LSCC proteome
(11117, 218) (9882, 218)
LUAD acetylome
(13368, 232) (2749, 232)
LUAD phosphoproteome
(64996, 232) (22344, 232)
LUAD proteome
(10305, 222) (8987, 222)
UCEC acetylome
(14184, 165) (2183,

In [26]:
cancer_to_protein['UCEC']['acetylome']['processed_data']

Unnamed: 0_level_0,original_id,refseq_prot_id,symbol,acetylsites,peptide,peptide_start,peptide_end,refseq_tx_id,uniparc_id,hgnc_id,...,S144,S145,S146,S147,S148,S149,S150,S151,S152,S153
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAGAB:NP_078942.3:K290,VHAEK#VAK,NP_078942.3,AAGAB,K290,VHAEkVAK,286,293,NM_024666,UPI000013D219,HGNC:25662,...,-0.428,-0.87300,-0.29500,0.15200,-0.53000,-0.75700,-0.72800,-1.16000,-0.59600,-0.22400
ABCE1:NP_001035809.1:K431,QLLHEK#IR,NP_001035809.1,ABCE1,K431,QLLHEkIR,426,433,NM_001040876,UPI0000001226,HGNC:69,...,0.149,-0.04345,-0.04345,-0.04345,-0.04345,-0.04345,-0.04345,-0.04345,-0.04345,-0.04345
ACAA2:NP_006102.2:K13,GVFVVAAK#R,NP_006102.2,ACAA2,K13,GVFVVAAkR,6,14,NM_006111,UPI000006FECE,HGNC:83,...,0.779,0.23200,0.08980,0.14800,0.19800,0.06040,-0.04920,0.26900,0.02100,0.05590
ACAA2:NP_006102.2:K137,FGTK#LGSDIK,NP_006102.2,ACAA2,K137,FGTkLGSDIK,134,143,NM_006111,UPI000006FECE,HGNC:83,...,0.257,0.37200,0.01440,0.18900,0.17800,0.20600,-0.13800,0.41600,0.28200,-0.22200
ACAA2:NP_006102.2:K234,QTMQVDEHARPQTTLEQLQK#LPPVFK,NP_006102.2,ACAA2,K234,QTMQVDEHARPQTTLEQLQkLPPVFK,215,240,NM_006111,UPI000006FECE,HGNC:83,...,1.080,0.21800,0.00190,-0.15900,0.06660,-0.55800,-0.22000,-0.23800,-0.03560,-0.59600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF703:NP_079345.1:K141,SAPGAASAAAALK#QLGDSPAEDK,NP_079345.1,ZNF703,K141,SAPGAASAAAALkQLGDSPAEDK,129,151,NM_025069,UPI0000073D30,HGNC:25883,...,0.194,0.71000,0.55000,0.25200,0.67200,0.10800,0.02580,0.06670,0.29800,-0.00305
ZRANB2:NP_976225.1:K54,AGGTEIGK#TLAEK,NP_976225.1,ZRANB2,K54,AGGTEIGkTLAEK,47,59,NM_203350,UPI000013CE63,HGNC:13058,...,-0.604,0.61200,0.44600,0.52400,0.64700,0.06380,0.11200,0.63000,0.69100,0.16700
ZSCAN18:NP_001139014.1:K422,GTAK#LGTK,NP_001139014.1,ZSCAN18,K422,GTAkLGTK,419,426,NM_001145542,UPI00017A6DAF,HGNC:21037,...,0.290,-0.38100,0.35200,-0.17200,-0.22400,-0.15000,-0.31400,-0.67600,-0.37900,0.04730
ZYX:NP_001010972.1:K25,PSPAISVSVSAPAFYAPQKK#,NP_001010972.1,ZYX,K25,PSPAISVSVSAPAFYAPQKk,6,25,NM_001010972,UPI00000424F2,HGNC:13200,...,0.445,0.82000,0.89600,0.46200,0.78300,0.00951,0.28700,1.06000,1.10000,0.56200


In [28]:
from collections import Counter
len(Counter([g.split(':')[0] for g in cancer_to_protein['UCEC']['acetylome']['processed_data'].index]
       ).most_common())

849

In [29]:
len(Counter([g.split(':')[0] for g in cancer_to_protein['UCEC']['phosphoproteome']['processed_data'].index]
       ).most_common())

4095

In [30]:
cancer_to_protein['CCRCC']['phosphoproteome']['metadata'].sort_values('case_id')

Unnamed: 0,case_id,sample_type,aliquot_id,tmt_experiment
C3L-00004.T,C3L-00004,Primary Tumor,CPT0001540009,6
C3L-00004.N,C3L-00004,Solid Tissue Normal,CPT0001550001,6
C3L-00010.T,C3L-00010,Primary Tumor,CPT0001220008,3
C3L-00010.N,C3L-00010,Solid Tissue Normal,CPT0001230001,3
C3L-00011.T,C3L-00011,Primary Tumor,CPT0001340003,11
...,...,...,...,...
C3N-01649.N,C3N-01649,Solid Tissue Normal,CPT0088640003,12
C3N-01651.T,C3N-01651,Primary Tumor,CPT0088690003,14
C3N-01651.N,C3N-01651,Solid Tissue Normal,CPT0088710001,14
C3N-01808.T,C3N-01808,Primary Tumor,CPT0089460004,12


In [31]:
cancer_to_protein['UCEC']['phosphoproteome']['metadata'].sort_values('case_id')

Unnamed: 0,case_id,sample_type,aliquot_ids,tmt_experiment,tmt_channel
S001,C3L-00006,Tumor,CPT0001460012,5,128N
S105,C3L-00006,Adjacent_normal,CPT0001470001,5,128C
S002,C3L-00008,Tumor,CPT0001300009,16,130N
S003,C3L-00032,Tumor,CPT0001420009,2,131
S005,C3L-00090,Tumor,CPT0001140003,12,129C
...,...,...,...,...,...
S140,NX5,Enriched_normal,CPT0189520002;CPT0189520003;CPT0189520004;CPT0...,15,127C
S141,NX6,Enriched_normal,CPT0189170002;CPT0189170003;CPT0189180002;CPT0...,15,128N
S142,NX7,Enriched_normal,CPT0183020002;CPT0183050002;CPT0183050003;CPT0...,15,131
S143,NX8,Enriched_normal,CPT0191190002;CPT0191190003;CPT0191200002;CPT0...,16,127C


In [32]:
cancer_to_protein['LSCC']['phosphoproteome']['metadata'].sort_values('case_id')

Unnamed: 0,case_id,aliquot_id,tmt_experiment,tmt_channel,sample_type
C3L-00081.T,C3L-00081,CPT0001020003,3,128C,Tumor
C3L-00081.N,C3L-00081,CPT0001030003,3,129N,Normal
C3L-00415.T,C3L-00415,CPT0010050003,16,130C,Tumor
C3L-00415.N,C3L-00415,CPT0010060003,16,131N,Normal
C3L-00445.N,C3L-00445,CPT0130440004,15,130N,Normal
...,...,...,...,...,...
C3N-04127.T,C3N-04127,CPT0238110004,10,128C,Tumor
C3N-04155.N,C3N-04155,CPT0223820003,11,130N,Normal
C3N-04155.T,C3N-04155,CPT0223790003,11,129C,Tumor
C3N-04162.N,C3N-04162,CPT0222740003,2,128N,Normal


In [33]:
for cancer, d in cancer_to_protein.items():
    print(cancer)
#     print(d['phosphoproteome']['metadata'].columns)
    if 'sample_type' in d['phosphoproteome']['metadata']:
        print(cancer, set(d['phosphoproteome']['metadata']['sample_type']))

BRCA_prospective
BRCA_prospective {'Primary Tumor'}
CRC_prospective
CRC_prospective {'Primary Tumor', 'Solid Tissue Normal'}
OV_prospective
OV_prospective {'Primary Tumor', 'Solid Tissue Normal'}
CCRCC
CCRCC {'Primary Tumor', 'Solid Tissue Normal'}
GBM
HNSCC
HNSCC {'Primary Tumor', 'Solid Tissue Normal'}
LSCC
LSCC {'Normal', 'Tumor'}
LUAD
LUAD {'Normal', 'Tumor'}
UCEC
UCEC {'Adjacent_normal', 'Enriched_normal', 'Myometrium_normal', 'Tumor'}
BRCA_retrospective
OV_retrospective


In [34]:
from collections import Counter
Counter(cancer_to_protein['UCEC']['acetylome']['metadata']['case_id']).most_common()

[('C3L-00006', 2),
 ('C3L-00361', 2),
 ('C3L-00563', 2),
 ('C3L-00586', 2),
 ('C3L-00601', 2),
 ('C3L-00605', 2),
 ('C3L-00769', 2),
 ('C3L-00770', 2),
 ('C3L-00771', 2),
 ('C3L-00932', 2),
 ('C3L-00947', 2),
 ('C3L-00963', 2),
 ('C3L-01246', 2),
 ('C3L-01249', 2),
 ('C3L-01252', 2),
 ('C3L-01256', 2),
 ('C3L-01257', 2),
 ('C3L-01282', 2),
 ('C3L-01304', 2),
 ('C3L-01307', 2),
 ('C3L-01311', 2),
 ('C3L-01744', 2),
 ('C3N-00200', 2),
 ('C3N-00333', 2),
 ('C3N-00383', 2),
 ('C3N-00729', 2),
 ('C3N-00858', 2),
 ('C3N-00866', 2),
 ('C3N-01211', 2),
 ('C3N-01346', 2),
 ('C3L-00008', 1),
 ('C3L-00032', 1),
 ('C3L-00090', 1),
 ('C3L-00098', 1),
 ('C3L-00136', 1),
 ('C3L-00137', 1),
 ('C3L-00139', 1),
 ('C3L-00143', 1),
 ('C3L-00145', 1),
 ('C3L-00156', 1),
 ('C3L-00161', 1),
 ('C3L-00358', 1),
 ('C3L-00362', 1),
 ('C3L-00413', 1),
 ('C3L-00449', 1),
 ('C3L-00767', 1),
 ('C3L-00780', 1),
 ('C3L-00781', 1),
 ('C3L-00905', 1),
 ('C3L-00918', 1),
 ('C3L-00921', 1),
 ('C3L-00942', 1),
 ('C3L-00946

In [35]:
## process for final output
def generate_model_input(data_dict):
    output = None
    
    for dtype, df_dict in data_dict.items():
        print(dtype)
        filtered = df_dict['processed_data'].copy()
#         filtered = df_dict['data'].copy()
        metadata = df_dict['metadata'].copy()
        filtered = filtered[metadata.index]
        filtered = filtered.transpose()

        ## add in metadata
        if 'sample_type' in metadata.columns:
            filtered['sample_type'] = ['normal' if 'tumor' not in x.lower() else 'tumor'
                                       for x in metadata['sample_type']]
        else:
            filtered['sample_type'] = ['tumor'] * metadata.shape[0]
            
        ## rename_columns
        filtered.columns = [f'{c}_{dtype}' for c in filtered.columns]
#         filtered = filtered.transpose()
        
        
        if output is None:
            output = filtered
        else:
            print(output.shape, filtered.shape, len(set(filtered.index).intersection(set(output.index))))
            output = pd.merge(output, filtered, right_index=True, left_index=True)
            
    ## normalize sample types
    df = output[[c for c in output.columns if 'sample_type' in c]]
    output = output[[c for c in output.columns if f'sample_type' not in c]]
    output['sample_type'] = df.iloc[:, 0].to_list()
    
    ## split into gene groupings
    ## groupby tissue
#     output = output.groupby('sample_type').mean()
#     output = output[[c for c in output.columns if '_proteome' in c]]
#     output = output.transpose()

    return output
    

In [36]:
cancer_to_model_inputs = {}
df = None
for cancer, data_dict in cancer_to_protein.items():
    print(cancer)
    output = generate_model_input(data_dict)
    cancer_to_model_inputs[cancer] = output

BRCA_prospective
acetylome
phosphoproteome
(122, 3801) (122, 19503) 122
proteome
(122, 23304) (122, 8891) 122
CRC_prospective
phosphoproteome
proteome
(197, 5852) (197, 5941) 197
OV_prospective
phosphoproteome
proteome
(103, 2423) (103, 8625) 103
CCRCC
phosphoproteome
proteome
(194, 15256) (194, 8443) 194
GBM
acetylome
phosphoproteome
(109, 3389) (109, 19502) 109
proteome
(109, 22891) (109, 9800) 109
HNSCC
phosphoproteome
proteome
(186, 13009) (186, 8846) 186
LSCC
acetylome
phosphoproteome
(207, 3048) (207, 23791) 207
ubiquitylome
(207, 26839) (147, 6828) 143
proteome
(143, 33667) (207, 9883) 143
LUAD
acetylome
phosphoproteome
(211, 2750) (211, 22345) 211
proteome
(211, 25095) (211, 8988) 211
UCEC
acetylome
phosphoproteome
(144, 2184) (144, 15983) 144
proteome
(144, 18167) (144, 9246) 144
BRCA_retrospective
phosphoproteome
proteome
(83, 15820) (83, 9211) 83
OV_retrospective
phosphoproteome
proteome
(69, 668) (169, 3355) 67


In [37]:
cancer_to_model_inputs['UCEC']

Unnamed: 0,AAGAB:NP_078942.3:K290_acetylome,ABCE1:NP_001035809.1:K431_acetylome,ACAA2:NP_006102.2:K13_acetylome,ACAA2:NP_006102.2:K137_acetylome,ACAA2:NP_006102.2:K234_acetylome,ACADM:NP_001272972.1:K312_acetylome,ACADVL:NP_001257376.1:K262_acetylome,ACADVL:NP_001257376.1:K299_acetylome,ACADVL:NP_001257376.1:K301_acetylome,ACAT1:NP_000010.1:K124_acetylome,...,ZSWIM8_proteome,ZW10_proteome,ZWILCH_proteome,ZWINT_proteome,ZXDC_proteome,ZYG11B_proteome,ZYX_proteome,ZZEF1_proteome,ZZZ3_proteome,sample_type
S001,0.46100,-0.22700,-1.0700,-1.030,-1.0400,2.41000,1.070,1.930,1.670,0.5430,...,-0.08770,0.0229,0.1090,-0.000315,-0.332,-0.43300,-1.020,-0.1230,-0.08590,tumor
S002,1.77000,0.56400,-1.0300,-0.906,-0.6040,-0.55700,-0.329,0.360,-0.586,0.1670,...,-0.03560,0.3630,1.0700,0.737000,-0.564,-0.00461,-1.130,-0.0757,-0.47300,tumor
S003,-0.81500,0.10500,0.0573,0.719,0.6040,0.65100,1.160,1.100,0.902,-0.6980,...,0.00112,0.0105,-0.1160,-0.000315,0.151,-0.07400,-0.540,0.3200,-0.41900,tumor
S005,-0.00334,-0.04860,-0.7900,-0.949,-0.3210,0.70400,0.934,1.250,1.190,-0.1600,...,0.07250,-0.0714,0.0933,0.156000,-0.398,-0.07520,-0.797,-0.0301,-0.46700,tumor
S006,0.20500,0.48900,0.6510,0.434,0.0320,-0.45000,0.111,0.247,-0.441,-0.0157,...,-0.17600,-1.2200,-0.5620,0.937000,-0.646,0.20700,-1.850,-0.1760,0.05130,tumor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
S149,-0.75700,-0.04345,0.0604,0.206,-0.5580,0.00757,-0.505,-0.715,-0.804,-0.0119,...,-0.20400,-0.1510,0.0552,0.105000,1.330,-0.67400,-0.840,-0.2690,0.10300,normal
S150,-0.72800,-0.04345,-0.0492,-0.138,-0.2200,0.00757,-0.116,0.117,-0.313,0.0169,...,-0.07820,0.1410,0.7260,0.868000,0.662,-0.57000,-0.399,-0.0959,-0.06350,normal
S151,-1.16000,-0.04345,0.2690,0.416,-0.2380,0.00757,-0.140,-0.170,-0.414,0.0519,...,0.12200,-0.3800,-0.2970,0.046200,0.045,-0.31900,0.835,0.1690,-0.00809,normal
S152,-0.59600,-0.04345,0.0210,0.282,-0.0356,0.00757,-0.517,-0.815,-0.752,-0.0125,...,0.06070,-0.1260,-0.0662,-0.010300,1.050,-0.35600,0.416,0.2730,-0.65800,normal


In [None]:
value, tumnorm_value_dif, value_type, phosph, phosph_diff, acet, acet_diff, prot, prot_dif, sample_type, cancer_type

rna_exp, rna_exp_diff, somatic_mutation_bool, gene_len, in_complex_withxxxxxx?

is_driver

In [38]:
import re
re.split(r':|_', 'll_00')

['ll', '00']

In [39]:
for cancer, exp in cancer_type_to_expression.items():
#     print(exp.columns)
    print(cancer, {c.split('.')[2] for c in exp.columns if len(c.split('.'))>2})
# {c.split('.')[2] for c in cancer_type_to_expression['UCEC'].columns}

AML {'Tbm', 'Tpb'}
BRCA_prospective {'T'}
CCRCC {'A', 'T'}
CM {'A', 'T'}
CRC_prospective {'T'}
EGAD-PDAC set()
GBM {'T'}
HNSCC {'A', 'T'}
LSCC {'A', 'T'}
LUAD {'A', 'T'}
OV_prospective {'T'}
PDA {'A', 'T'}
SAR {'A', 'T'}
UCEC {'A', 'prov', 'T'}
eocg set()
hipsci_managed set()
hipsci_public set()


In [40]:
## define cancers you want to keep
## here i filter by whether there is normal and tumor
to_keep = ['CCRCC', 'HNSCC', 'LSCC', 'LUAD', 'UCEC']
to_keep

['CCRCC', 'HNSCC', 'LSCC', 'LUAD', 'UCEC']

In [41]:
def normalize_gene_expression_columns(metadata, sample_list, tumor_identifiers=['.T.'],
                                      normal_identifiers=['.A.', '.N.']):
    tumor_mask = [True if 'Tumor' in st else False for st in metadata['sample_type']]
    nontumor_mask = [False if 'Tumor' in st else True for st in metadata['sample_type']]
    tumor = metadata[tumor_mask]
    non_tumor = metadata[nontumor_mask]
    
    new_ids = []
    for s_id in sample_list:
        case = s_id.split('.')[0]
#         if '.T.' in s_id:
        if any([i in s_id for i in tumor_identifiers]):
            filtered = tumor[tumor['case_id']==case]
        elif any([i in s_id for i in normal_identifiers]):
            filtered = non_tumor[non_tumor['case_id']==case]
        else:
            filtered = None
            
        if filtered is not None and filtered.shape[0]: new_ids.append(filtered.index[0])
        else: new_ids.append(s_id)

    return new_ids

def get_sample_to_mutation_status(metadata, maf_df):
    
    samples = sorted(set(maf_df['sample_id']))
    prot_to_case = {c:metadata.loc[c, 'case_id'] for i, c in enumerate(metadata.index)}
#     prot_to_case = {s1:s2 for s1, s2 in zip(new_ids, samples)}
    case_to_prot = {v:k for k, v in prot_to_case.items()}
    genes = sorted(set(maf_df['Hugo_Symbol']))
#     print(case_to_prot)

    sample_to_gene_to_mutation_status = {s:{} for s in prot_to_case.keys()}
    for sample in sample_to_gene_to_mutation_status.keys():
        maf_name = prot_to_case[sample]
        filtered = maf_df[maf_df['sample_id']==maf_name]
#         print(filtered.shape)
        filtered_genes = set(filtered['Hugo_Symbol'])
#         print(filtered.shape)
        for gene in genes:
#             gene_filtered = filtered[filtered['Hugo_Symbol']==gene]
            if gene in filtered_genes:
                sample_to_gene_to_mutation_status[sample][gene] = True
            else:
                sample_to_gene_to_mutation_status[sample][gene] = False
    return sample_to_gene_to_mutation_status
    
            

# def prepare_gene_level_aggregates(df, metadata, gene_expression_df):
#     sample_types = df['sample_type'].to_list()
#     dtypes = sorted(set([re.split(r'_', x)[-1] for x in df.columns if 'type' not in x]))
# #     print(dtypes)
#     combined = None
#     for dtype in dtypes:
#         filtered = df[[c for c in df.columns if dtype == c.split('_')[-1]]].copy()
#         filtered = filtered.transpose()
#         filtered['gene'] = [re.split(r':|_', x)[0] for x in filtered.index]
#         filtered = filtered.fillna(np.inf).groupby('gene').mean().replace(np.inf, np.nan)
#         filtered.columns = [f'{c}_{dtype}' for c in filtered.columns]
#         if combined is None:
#             combined = filtered.copy()
#         else:
#             combined = pd.merge(combined, filtered, how='outer', left_index=True, right_index=True)

#     combined = combined.transpose()
    
#     combined['dtype'] = [x.split('_')[-1] for x in combined.index]
    
#     overall = combined.groupby('dtype').mean()

#     ls = [df.loc[s.split('_')[0], 'sample_type'] for s in combined.index]
#     combined['dtype'] = [f'{d}_{s}' for d, s in zip(combined['dtype'], ls)]
#     grouped = combined.groupby('dtype').mean()
#     overall = pd.concat((overall, grouped), axis=0)
#     overall = overall.transpose()
    
#     for dtype in dtypes:
#         filtered = overall[[c for c in overall.columns if (dtype == c.split('_')[0] and ('normal' in c or 'tumor' in c))]]
#         print(filtered.columns)
#         filtered.columns = [c.split('_')[-1] for c in filtered.columns]
#         overall[f'{dtype}_tumor_normal_difference'] = filtered['tumor'] - filtered['normal']
        
# #     sample_to_pair = {}
# #     for sample, sample_type in zip(df.index, df['sample_type']):
# #         ## look for same case
# #         case = metadata.loc[sample, 'case_id']
# #         filtered = metadata[metadata['case_id']==case]
# #         filtered = filtered[filtered['sample_type']!='Tumor'] if sample_type == 'tumor' else filtered[filtered['sample_type']=='Tumor']
# #         if filtered.shape[0]:
# #             sample_to_pair[sample] = filtered.index[0]
# #     print(sample_to_pair)
    
#     data = []
#     tumor_mask = [True if '.T.' in t else False for t in gene_expression_df.columns]
#     non_tumor_mask = [True if not x else False for x in tumor_mask]
#     if gene_expression_df is not None:
#         expression = gene_expression_df.copy()
#         expression.columns = normalize_gene_expression_columns(metadata, expression.columns)
#         for gene in overall.index:
#             if gene in expression.index:
#                 filtered = expression.loc[gene, :]
#                 avg = np.mean(filtered.values.flatten())
#                 tumor_avg = np.mean([c for i, c in enumerate(filtered) if tumor_mask[i]])
#                 normal_avg = np.mean([c for i, c in enumerate(filtered) if non_tumor_mask[i]])
#                 data.append([avg, tumor_avg, normal_avg, tumor_avg / normal_avg])
#             else:
#                 data.append([np.nan, np.nan, np.nan])
#     temp = pd.DataFrame(data=data, index=overall.index, columns=['rna_expression_average',
#                                                                  'rna_expression_average_tumor',
#                                                                 'rna_expression_average_normal',
#                                                                 'rna_expression_tumor_normal_fc'])
#     overall = pd.merge(overall, temp, left_index=True, right_index=True)
#     return overall

def prepare_site_level_features(df, metadata, gene_expression_df, maf_df):
    dtypes = sorted(set([re.split(r'_', x)[-1] for x in df.columns if 'type' not in x]))
#     dtypes = [t for t in dtypes if t not in ['proteome']]
    print(dtypes)
    print(sorted(set(metadata['sample_type'])))
#     print(dtypes)
    sample_to_type = {s:t for s, t in zip(df.index, df['sample_type'])}
    sample_to_pair = {}
    for sample, sample_type in zip(df.index, df['sample_type']):
        ## look for same case
#         print(metadata.index)
#         print(gene_expression_df.columns)
#         print(sample)
        case = metadata.loc[sample, 'case_id']
#         print(case)
        filtered = metadata[metadata['case_id']==case]
        filtered = filtered[~filtered.index.duplicated(keep='first')]
#         print(sample, sample_type)
#         print(list(filtered.index), list(filtered['sample_type']))
        tumor_mask = [True if 'Tumor' in st else False for st in filtered['sample_type']]
        nontumor_mask = [False if 'Tumor' in st else True for st in filtered['sample_type']]
        filtered = filtered[nontumor_mask] if 'tumor' == sample_type else filtered[tumor_mask]
#         print(list(filtered.index), list(filtered['sample_type']))
#         print(filtered.index, filtered['sample_type'])

        if filtered.shape[0]:
            sample_to_pair[sample] = filtered.index[0]
#     print(len(sample_to_pair), df.shape)
    print(sample_to_pair)
            
    sample_to_gene_to_status = get_sample_to_mutation_status(metadata, maf_df)
        
    expression = gene_expression_df.copy()
    expression.columns = normalize_gene_expression_columns(metadata, expression.columns)
    expression = expression.loc[:,~expression.columns.duplicated()]
    expression = expression.loc[~expression.index.duplicated(), :]
    expressed_genes = set(expression.index)
    expressed_columns = set(expression.columns)
    
    print(expressed_columns)
    data = []
    ids = []
    for dtype in dtypes:
        cs = [c for c in df.columns if dtype == c.split('_')[-1]]
        cs += ['sample_type']
        filtered = df[cs].copy()
        print(dtype, filtered.shape)
#         print(filtered.columns[-5:])
#         print(list(expression.columns))
        for i, (sample, sample_type) in enumerate(zip(filtered.index, filtered['sample_type'])):
#             if i % 11 == 10:
#                 print(i, sample)
#                 break
            has_expression = True if sample in expression.columns else False
#             print(sample, has_expression, sample_to_pair.get(sample), sample_to_pair.get(sample) in expression.columns)
#             if sample in expression.columns:
#                 s_expression = expres
#             print('here', len(filtered.columns))
            paired_sample = sample_to_pair[sample] if sample in sample_to_pair else None
            in_filtered = paired_sample in filtered.index
            for feat in filtered.columns[:-1]:
                s = sample.split('_')[0]
                if dtype == 'proteome':
                    gene = feat.split('_')[0]
                else:
                    gene = feat.split(':')[0]
                ids.append(f'{s}_{dtype}_{feat}')
                val = float(filtered.loc[sample, feat])
#                 try:
#                 print(sample, sample_to_pair.get(sample))
                pair_val = float(filtered.loc[paired_sample, feat]) if in_filtered else np.nan
#                 except KeyError as e:
#                     print(sample, sample_to_pair[sample])
#                     print(filtered.shape, sample_to_pair[sample] in filtered.index)
#                     print(len(ids), i)
#                     print(e)
#                     raise RuntimeError()
                    #                print(val, pair_val)
                diff = np.nan if pd.isnull(pair_val) else val - pair_val
                
                ## do somatic mutation
                has_mutation = sample_to_gene_to_status[sample].get(gene, False)
                
                ## do expression
                if has_expression:
                    
                    exp = expression.loc[gene, sample] if gene in expressed_genes else np.nan
                    
                    pair_exp = expression.loc[gene, paired_sample] if paired_sample in expressed_columns and gene in expressed_genes else np.nan
#                     if sample in sample_to_pair:
#                         print(gene, gene in expressed_genes, exp)
#                         print(sample, sample_to_pair[sample], sample_to_pair[sample] in expressed_columns, pair_exp)
                    exp_diff = np.nan if pd.isnull(exp) or pd.isnull(pair_exp) else exp - pair_exp
                else:
                    exp, pair_exp, exp_diff = np.nan, np.nan, np.nan
                
                
                
                data.append([val, pair_val, diff, sample_type, dtype, exp, pair_exp, exp_diff, has_mutation,
                            gene])
    
#     print(len(data))     
    to_return = pd.DataFrame(data=data, index=ids,
                             columns=['value', 'paired_value', 'paired_difference', 'sample_type', 'data_type',
                                     'rna_expression', 'paired_rna_expression', 'difference_rna_expression',
                                     'has_mutation', 'gene'])
    return to_return
    
 
        
        
        

In [42]:
# cancer = 'LSCC'
# cancer_to_protein[cancer][metadata_key]['metadata']
# # cancer_type_to_expression[cancer]

In [43]:
# cancer_type_to_expression[cancer]

In [44]:
cancer_to_aggs = {}
for cancer, d in cancer_to_model_inputs.items():
    if cancer in to_keep:
        print(cancer)
        metadata_key = 'acetylome' if 'acetylome' in cancer_to_protein[cancer] else 'phosphoproteome'
        site_level_agg = prepare_site_level_features(d,
                               cancer_to_protein[cancer][metadata_key]['metadata'],
                                cancer_type_to_expression[cancer],
                                cancer_to_maf[cancer])
        
#         gene_level_agg = prepare_gene_level_aggregates(d,
#                               cancer_to_protein[cancer][metadata_key]['metadata'],
#                              cancer_type_to_expression[cancer])
        cancer_to_aggs[cancer] = {
            'site_level': site_level_agg,
#             'gene_level': gene_level_agg
        }

CCRCC
['phosphoproteome', 'proteome']
['Primary Tumor', 'Solid Tissue Normal']
{'C3L-00004.T': 'C3L-00004.N', 'C3L-00004.N': 'C3L-00004.T', 'C3L-00010.T': 'C3L-00010.N', 'C3L-00010.N': 'C3L-00010.T', 'C3L-00011.T': 'C3L-00011.N', 'C3L-00011.N': 'C3L-00011.T', 'C3L-00026.T': 'C3L-00026.N', 'C3L-00026.N': 'C3L-00026.T', 'C3L-00079.T': 'C3L-00079.N', 'C3L-00079.N': 'C3L-00079.T', 'C3L-00088.T': 'C3L-00088.N', 'C3L-00088.N': 'C3L-00088.T', 'C3L-00096.T': 'C3L-00096.N', 'C3L-00096.N': 'C3L-00096.T', 'C3L-00097.T': 'C3L-00097.N', 'C3L-00097.N': 'C3L-00097.T', 'C3L-00103.T': 'C3L-00103.N', 'C3L-00103.N': 'C3L-00103.T', 'C3L-00183.N': 'C3L-00183.T', 'C3L-00183.T': 'C3L-00183.N', 'C3L-00360.T': 'C3L-00360.N', 'C3L-00360.N': 'C3L-00360.T', 'C3L-00369.T': 'C3L-00369.N', 'C3L-00369.N': 'C3L-00369.T', 'C3L-00416.T': 'C3L-00416.N', 'C3L-00416.N': 'C3L-00416.T', 'C3L-00418.T': 'C3L-00418.N', 'C3L-00418.N': 'C3L-00418.T', 'C3L-00447.T': 'C3L-00447.N', 'C3L-00447.N': 'C3L-00447.T', 'C3L-00448.T': 'C3L-

proteome (194, 8443)
HNSCC
['phosphoproteome', 'proteome']
['Primary Tumor', 'Solid Tissue Normal']
{'C3L-00997-T': 'C3L-00997-N', 'C3N-03849-T': 'C3N-03849-N', 'C3N-01858-N': 'C3N-01858-T', 'C3L-00997-N': 'C3L-00997-T', 'C3N-01858-T': 'C3N-01858-N', 'C3N-03849-N': 'C3N-03849-T', 'C3N-01859-N': 'C3N-01859-T', 'C3N-03781-T': 'C3N-03781-N', 'C3N-03933-N': 'C3N-03933-T', 'C3L-00999-N': 'C3L-00999-T', 'C3N-03781-N': 'C3N-03781-T', 'C3L-00999-T': 'C3L-00999-N', 'C3N-01859-T': 'C3N-01859-N', 'C3N-03933-T': 'C3N-03933-N', 'C3N-03015-N': 'C3N-03015-T', 'C3N-03490-T': 'C3N-03490-N', 'C3N-03012-T': 'C3N-03012-N', 'C3N-01620-N': 'C3N-01620-T', 'C3N-03012-N': 'C3N-03012-T', 'C3N-03490-N': 'C3N-03490-T', 'C3N-03015-T': 'C3N-03015-N', 'C3N-01620-T': 'C3N-01620-N', 'C3N-01758-N': 'C3N-01758-T', 'C3N-04277-T': 'C3N-04277-N', 'C3N-03045-T': 'C3N-03045-N', 'C3N-02275-T': 'C3N-02275-N-need2remove', 'C3N-02275-N-need2remove': 'C3N-02275-T', 'C3N-01758-T': 'C3N-01758-N', 'C3N-03045-N': 'C3N-03045-T', 'C3N-

{'C3N-03880.T', 'C3L-01606.N', 'C3L-02669.N', 'C3L-02964.T', 'C3L-00993.N', 'C3N-01017.T', 'C3L-02891.N', 'C3L-00568.T', 'C3L-01884.T', 'C3N-04124.N', 'C3L-00603.T', 'C3L-03965.N', 'C3N-04162.T', 'C3N-02575.N', 'C3L-02168.N', 'C3L-04014.T', 'C3L-02669.T', 'C3L-02168.T', 'C3N-02375.T', 'C3L-04013.N', 'C3L-00081.N', 'C3N-01411.T', 'C3L-00904.T', 'C3L-02546.N', 'C3L-02951.N', 'C3N-02494.T', 'C3L-02968.T', 'C3N-03072.T', 'C3L-00503.RNA-Seq.T.hg38', 'C3L-02619.N', 'C3L-00445.T', 'C3L-01285.T', 'C3L-01838.T', 'C3N-01020.T', 'C3L-00415.T', 'C3L-02951.T', 'C3L-01663.T', 'C3N-02252.T', 'C3L-02127.N', 'C3N-01025.N', 'C3N-00221.T', 'C3N-03882.T', 'C3L-02164.T', 'C3L-02170.T', 'C3N-03441.T', 'C3N-00497.T', 'C3L-02349.T', 'C3L-02650.N', 'C3L-02625.T', 'C3N-03424.T', 'C3N-04127.T', 'C3N-01893.T', 'C3L-02649.T', 'C3N-03093.T', 'C3N-02523.T', 'C3N-03076.T', 'C3N-01892.N', 'C3N-03886.T', 'C3L-01606.T', 'C3N-04155.T', 'C3N-03875.T', 'C3N-01017.N', 'C3N-01028.T', 'C3N-02426.N', 'C3L-00927.T', 'C3N-01018.

{'C3L-00510.T', 'C3L-00368.N', 'C3N-03212.RNA-Seq.A.hg38', 'C3N-02242.RNA-Seq.A.hg38', 'C3N-01417.RNA-Seq.A.hg38', 'C3N-03233.RNA-Seq.A.hg38', 'C3N-01019.RNA-Seq.A.hg38', 'C3L-02169.RNA-Seq.A.hg38', 'C3N-00959.N', 'C3N-00169.T', 'C3L-00263.T', 'C3L-00001.N', 'C3N-00180.T', 'C3N-00550.N', 'C3N-02234.RNA-Seq.A.hg38', 'C3N-02149.N', 'C3N-02587.N', 'C3N-02240.RNA-Seq.A.hg38', 'C3N-00552.T', 'C3N-01024.N', 'C3N-00574.N', 'C3N-02002.N', 'C3N-00547.N', 'C3N-01419.RNA-Seq.T.hg38', 'C3N-02153.RNA-Seq.A.hg38', 'C3N-02672.RNA-Seq.T.hg38', 'C3N-02150.RNA-Seq.T.hg38', 'C3N-00572.T', 'C3N-01416.T', 'C3N-02281.RNA-Seq.T.hg38', 'C3N-02758.RNA-Seq.T.hg38', 'C3N-03205.RNA-Seq.T.hg38', 'C3L-00893.N', 'C3N-02234.RNA-Seq.T.hg38', 'C3N-00579.N', 'C3N-02144.RNA-Seq.T.hg38', '11LU013.T', 'C3L-00144.T', 'C3N-00559.N', 'C3N-01021.N', 'C3N-01409.RNA-Seq.A.hg38', 'C3N-02729.T', 'C3L-01683.T', 'C3N-01016.N', 'C3N-01022.RNA-Seq.T.hg38', 'C3N-02380.N', 'C3N-00578.N', 'C3L-01924.N', 'C3N-00551.T', 'C3L-00412.N', 'C3N

phosphoproteome (144, 15983)
proteome (144, 9246)


In [None]:
# cancer_to_protein['CCRCC']['phosphoproteome']['metadata'].loc[['CPT0001550001', 'CPT0001540009']]

In [45]:
cancer_to_protein['LSCC']['phosphoproteome']['metadata'].loc['C3N-03875.N']

case_id               C3N-03875
aliquot_id        CPT0237430003
tmt_experiment                2
tmt_channel                131N
sample_type              Normal
Name: C3N-03875.N, dtype: object

In [46]:
# cancer_to_protein['LSCC']['proteome']['processed_data'][['C3N.03875.N']]

In [47]:
cancer_to_aggs['LSCC']['site_level']

Unnamed: 0,value,paired_value,paired_difference,sample_type,data_type,rna_expression,paired_rna_expression,difference_rna_expression,has_mutation,gene
C3L-02665.T_acetylome_A2M:NP_000005.2:K664_acetylome,-0.0997,-1.7808,1.6811,tumor,acetylome,,,,False,A2M
C3L-02665.T_acetylome_AASS:NP_005754.2:K93_acetylome,0.1159,1.8238,-1.7079,tumor,acetylome,,,,False,AASS
C3L-02665.T_acetylome_ABCE1:NP_001035809.1:K210_acetylome,0.2270,-1.2052,1.4322,tumor,acetylome,,,,False,ABCE1
C3L-02665.T_acetylome_ABCE1:NP_001035809.1:K415_acetylome,1.1944,-0.5094,1.7038,tumor,acetylome,,,,False,ABCE1
C3L-02665.T_acetylome_ABCE1:NP_001035809.1:K431_acetylome,3.1825,-0.3380,3.5205,tumor,acetylome,,,,False,ABCE1
...,...,...,...,...,...,...,...,...,...,...
C3N-03851.N_ubiquitylome_ZSWIM9:NP_955373.3:K328_ubiquitylome,-1.6167,-1.8588,0.2421,normal,ubiquitylome,0.639734,1.565725,-0.925991,False,ZSWIM9
C3N-03851.N_ubiquitylome_ZUP1:NP_001348118.1:K276_ubiquitylome,0.6940,0.3584,0.3356,normal,ubiquitylome,6.038561,8.687788,-2.649227,False,ZUP1
C3N-03851.N_ubiquitylome_ZWINT:NP_008988.2:K111_ubiquitylome,0.2040,4.1551,-3.9511,normal,ubiquitylome,5.910446,74.706409,-68.795962,False,ZWINT
C3N-03851.N_ubiquitylome_ZZEF1:NP_055928.3:K2490_ubiquitylome,0.0106,0.0323,-0.0217,normal,ubiquitylome,11.741871,12.478456,-0.736585,False,ZZEF1


In [48]:
np.count_nonzero(pd.isnull(cancer_to_aggs['LSCC']['site_level']['paired_value']))

1175742

In [49]:
def combine_gene_site_level(gene_level_agg, site_level_agg):
    combined = pd.merge(site_level_agg, gene_level_agg, how='left', left_on='gene', right_index=True)
    return combined


def combine_cancer_types(cancer_to_aggs):
    df = None
    for cancer, d in cancer_to_aggs.items():
        data = d['site_level']
        data['cancer_type'] = [cancer] * data.shape[0]
        if df is None:
            df = data
        else:
            df = pd.concat((df, data), axis=0)
    return df
        

In [50]:
# combined = combine_gene_site_level(gene_level_agg, site_level_agg)
combined = combine_cancer_types(cancer_to_aggs)
combined

Unnamed: 0,value,paired_value,paired_difference,sample_type,data_type,rna_expression,paired_rna_expression,difference_rna_expression,has_mutation,gene,cancer_type
C3L-00004.T_phosphoproteome_A1CF:NP_001185747.1:T491_phosphoproteome,0.265550,-0.000111,0.265661,tumor,phosphoproteome,15.287382,8.086648,7.200734,False,A1CF,CCRCC
C3L-00004.T_phosphoproteome_AAAS:NP_001166937.1:S462_phosphoproteome,0.288098,0.087886,0.200212,tumor,phosphoproteome,8.623087,10.856750,-2.233663,False,AAAS,CCRCC
C3L-00004.T_phosphoproteome_AAAS:NP_001166937.1:Y452_phosphoproteome,0.540065,-0.229424,0.769489,tumor,phosphoproteome,8.623087,10.856750,-2.233663,False,AAAS,CCRCC
C3L-00004.T_phosphoproteome_AAED1:NP_714542.1:S12_phosphoproteome,0.785612,-0.366038,1.151650,tumor,phosphoproteome,,,,False,AAED1,CCRCC
C3L-00004.T_phosphoproteome_AAGAB:NP_001258814.1:S201;S202_phosphoproteome,0.307438,-0.163721,0.471159,tumor,phosphoproteome,10.663889,15.487737,-4.823848,False,AAGAB,CCRCC
...,...,...,...,...,...,...,...,...,...,...,...
S153_proteome_ZXDC_proteome,0.587000,,,normal,proteome,,,,False,ZXDC,UCEC
S153_proteome_ZYG11B_proteome,-0.477000,,,normal,proteome,,,,False,ZYG11B,UCEC
S153_proteome_ZYX_proteome,-0.422000,,,normal,proteome,,,,False,ZYX,UCEC
S153_proteome_ZZEF1_proteome,-0.093100,,,normal,proteome,,,,False,ZZEF1,UCEC


In [52]:
def add_drivers(combined):
    is_driver, is_expanded = [], []
    expanded_drivers = set(drivers['Gene'])
    for gene, cancer in zip(combined['gene'], combined['cancer_type']):
        is_driver.append(gene in cancer_to_driver_genes[cancer].index)
        is_expanded.append(gene in expanded_drivers)
    combined['driver'] = is_driver
    combined['expanded_driver'] = is_expanded
    return combined
        

In [53]:
## add drivers
# combined['driver'] = [0 if g not in ucec_genes.index else 1 for g in combined['gene']]
# combined['expanded_driver'] = [0 if g not in drivers['Gene'] else 1 for g in combined['gene']]
# combined
combined = add_drivers(combined)
combined

Unnamed: 0,value,paired_value,paired_difference,sample_type,data_type,rna_expression,paired_rna_expression,difference_rna_expression,has_mutation,gene,cancer_type,driver,expanded_driver
C3L-00004.T_phosphoproteome_A1CF:NP_001185747.1:T491_phosphoproteome,0.265550,-0.000111,0.265661,tumor,phosphoproteome,15.287382,8.086648,7.200734,False,A1CF,CCRCC,False,False
C3L-00004.T_phosphoproteome_AAAS:NP_001166937.1:S462_phosphoproteome,0.288098,0.087886,0.200212,tumor,phosphoproteome,8.623087,10.856750,-2.233663,False,AAAS,CCRCC,False,False
C3L-00004.T_phosphoproteome_AAAS:NP_001166937.1:Y452_phosphoproteome,0.540065,-0.229424,0.769489,tumor,phosphoproteome,8.623087,10.856750,-2.233663,False,AAAS,CCRCC,False,False
C3L-00004.T_phosphoproteome_AAED1:NP_714542.1:S12_phosphoproteome,0.785612,-0.366038,1.151650,tumor,phosphoproteome,,,,False,AAED1,CCRCC,False,False
C3L-00004.T_phosphoproteome_AAGAB:NP_001258814.1:S201;S202_phosphoproteome,0.307438,-0.163721,0.471159,tumor,phosphoproteome,10.663889,15.487737,-4.823848,False,AAGAB,CCRCC,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
S153_proteome_ZXDC_proteome,0.587000,,,normal,proteome,,,,False,ZXDC,UCEC,False,False
S153_proteome_ZYG11B_proteome,-0.477000,,,normal,proteome,,,,False,ZYG11B,UCEC,False,False
S153_proteome_ZYX_proteome,-0.422000,,,normal,proteome,,,,False,ZYX,UCEC,False,False
S153_proteome_ZZEF1_proteome,-0.093100,,,normal,proteome,,,,False,ZZEF1,UCEC,False,False


In [None]:
for c in combined.columns:
    for dtype in sorted(set(combined['data_type'])):
        f = combined[combined['data_type']==dtype]
        print(f.shape, dtype, c, np.count_nonzero(
            pd.isnull(f[c])) / f.shape[0])

In [54]:
## filter out samples with no paired value and only include tumor to avoid biased training
filtered_combined = combined[~pd.isnull(combined['paired_value'])]
filtered_combined = filtered_combined[filtered_combined['sample_type']=='tumor']
filtered_combined = filtered_combined[filtered_combined['data_type']!='ubiquitylome']
filtered_combined = filtered_combined[~pd.isnull(filtered_combined['paired_rna_expression'])]

to_keep = ['value', 'paired_value', 'paired_difference', 'sample_type', 'gene', 'data_type', 'cancer_type',
           'has_mutation', 'rna_expression', 'paired_rna_expression', 'difference_rna_expression',
           'driver', 'expanded_driver']

filtered_combined = filtered_combined[to_keep]
filtered_combined

Unnamed: 0,value,paired_value,paired_difference,sample_type,gene,data_type,cancer_type,has_mutation,rna_expression,paired_rna_expression,difference_rna_expression,driver,expanded_driver
C3L-00004.T_phosphoproteome_A1CF:NP_001185747.1:T491_phosphoproteome,0.265550,-0.000111,0.265661,tumor,A1CF,phosphoproteome,CCRCC,False,15.287382,8.086648,7.200734,False,False
C3L-00004.T_phosphoproteome_AAAS:NP_001166937.1:S462_phosphoproteome,0.288098,0.087886,0.200212,tumor,AAAS,phosphoproteome,CCRCC,False,8.623087,10.856750,-2.233663,False,False
C3L-00004.T_phosphoproteome_AAAS:NP_001166937.1:Y452_phosphoproteome,0.540065,-0.229424,0.769489,tumor,AAAS,phosphoproteome,CCRCC,False,8.623087,10.856750,-2.233663,False,False
C3L-00004.T_phosphoproteome_AAGAB:NP_001258814.1:S201;S202_phosphoproteome,0.307438,-0.163721,0.471159,tumor,AAGAB,phosphoproteome,CCRCC,False,10.663889,15.487737,-4.823848,False,False
C3L-00004.T_phosphoproteome_AAGAB:NP_001258814.1:S202_phosphoproteome,-0.294636,0.282808,-0.577444,tumor,AAGAB,phosphoproteome,CCRCC,False,10.663889,15.487737,-4.823848,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
S096_proteome_ZXDC_proteome,0.637000,0.420000,0.217000,tumor,ZXDC,proteome,UCEC,False,17.059799,9.967964,7.091835,False,False
S096_proteome_ZYG11B_proteome,0.537000,0.341000,0.196000,tumor,ZYG11B,proteome,UCEC,False,11.874810,15.654434,-3.779625,False,False
S096_proteome_ZYX_proteome,-0.338000,1.730000,-2.068000,tumor,ZYX,proteome,UCEC,False,19.686493,57.144577,-37.458084,False,False
S096_proteome_ZZEF1_proteome,-0.214000,-0.151000,-0.063000,tumor,ZZEF1,proteome,UCEC,False,7.169003,10.606031,-3.437028,False,False


In [55]:
def collapse_by_dtype_and_gene(df):
    dtypes = sorted(set(df['data_type']))
    

    categorical = ['cancer_type']
    
    
    df['collapse'] = [f'{dtype}_{gene}_{c}_{sample.split("_")[0]}' for dtype, gene, sample, c in zip(
                                                                               df['data_type'],
                                                                               df['gene'],
                                                                               df.index, df['cancer_type'])]
    collapse_to_cancer_type = {'_'.join(c.split('_')[1:]):cancer
                               for c, cancer in zip(df['collapse'], df['cancer_type'])}
    df = df.groupby('collapse').mean()
    shared_cols = ['rna_expression', 'paired_rna_expression', 'difference_rna_expression',
                   'has_mutation', 'driver', 'expanded_driver']
#     shared = df[shared_cols]
#     shared.index = ['_'.join(x.split('_')[1:]) for x in shared.index]
#     shared 

        
#     return df
#     print(Counter(df.index).most_common()[:5])
    dfs = []
    for dtype in dtypes:
        mask = [True if dtype in x else False for x in df.index]
        filtered = df[mask].copy()
        filtered.index = ['_'.join(x.split('_')[1:]) for x in filtered.index]
        filtered.columns = [f'{dtype}_{c}' if c not in shared_cols else c for c in filtered.columns]
        dfs.append(filtered)
    
    combined = None
    for i, df in enumerate(dfs):
#         print(dtypes[i], Counter(df.index).most_common()[:5])
        df = df[~df.index.duplicated(keep='first')]
        if combined is None:
            combined = df
        else:
#             print(df.index[:5], combined.index[:5])
#             print(len(set(df.index).intersection(set(combined.index))))
            combined = pd.merge(combined, df, right_index=True, left_index=True, how='inner')
#     print(Counter(combined.index).most_common()[:5])
#     return combined
    combined = combined[[c for c in combined.columns if c[-2:] not in ['_x', '_y']]]
    
    combined['cancer_type'] = [collapse_to_cancer_type[c] for c in combined.index]
    
#     return combined
    
    cat_columns = []
    cat_data = None
    for c in categorical:
        ls = combined[c]
        value_map = {c:i for i, c in enumerate(sorted(set(ls)))}
        print(value_map)
        r_value_map = {v:k for k, v in value_map.items()}
        zeros = np.zeros((combined.shape[0], len(value_map)))
        for i, val in enumerate(ls):
            zeros[i, value_map[val]] = 1
        
        if cat_data is None:
            cat_data = zeros
        else:
            cat_data = np.concatenate((cat_data, zeros), axis=1)
        cat_columns += sorted(set(ls))
    cat_df = pd.DataFrame(data=cat_data, columns=cat_columns, index=combined.index)
#     print(Counter(combined['LSCC']).most_common())
#     combined = combined[[c for c in combined.columns if c != 'cancer_type']]
    
    
    combined = pd.merge(left=combined, right=cat_df, right_index=True, left_index=True)
    combined = combined[[c for c in combined.columns if c != 'cancer_type']]

#     return combined
    print(combined.columns)
    ## filter down shared columns
#     cols = [c for c in combined.columns if c[:-2] not in shared_cols]
#     print(cols)
#     cols += [c for c in combined.columns if c[:-2] in shared_cols if c[-1]=='x']
#     print(cols)
#     combined = combined[cols]
#     combined.columns = [c if c[-1]!='x' else c[:-2] for c in combined.columns]

    return combined

In [56]:
fc = collapse_by_dtype_and_gene(filtered_combined)
fc

{'LSCC': 0, 'LUAD': 1, 'UCEC': 2}
Index(['acetylome_value', 'acetylome_paired_value',
       'acetylome_paired_difference', 'phosphoproteome_value',
       'phosphoproteome_paired_value', 'phosphoproteome_paired_difference',
       'proteome_value', 'proteome_paired_value', 'proteome_paired_difference',
       'has_mutation', 'rna_expression', 'paired_rna_expression',
       'difference_rna_expression', 'driver', 'expanded_driver', 'LSCC',
       'LUAD', 'UCEC'],
      dtype='object')


Unnamed: 0,acetylome_value,acetylome_paired_value,acetylome_paired_difference,phosphoproteome_value,phosphoproteome_paired_value,phosphoproteome_paired_difference,proteome_value,proteome_paired_value,proteome_paired_difference,has_mutation,rna_expression,paired_rna_expression,difference_rna_expression,driver,expanded_driver,LSCC,LUAD,UCEC
ABCE1_LUAD_C3L-00001.T,0.246850,-0.915100,1.16195,-0.719400,-1.255500,0.536100,-0.719400,-1.255500,0.536100,False,13.128813,11.484098,1.644715,False,False,0.0,1.0,0.0
ABCE1_LUAD_C3L-00009.T,-0.354350,-0.333900,-0.02045,0.213700,-3.734400,3.948100,0.213700,-3.734400,3.948100,False,21.503211,10.644732,10.858479,False,False,0.0,1.0,0.0
ABCE1_LUAD_C3L-00080.T,0.347850,-0.093750,0.44160,-0.791400,-2.024100,1.232700,-0.791400,-2.024100,1.232700,False,13.288162,11.205924,2.082239,False,False,0.0,1.0,0.0
ABCE1_LUAD_C3L-00083.T,0.221800,0.118350,0.10345,2.379900,1.897900,0.482000,2.379900,1.897900,0.482000,False,7.451822,10.310898,-2.859075,False,False,0.0,1.0,0.0
ABCE1_LUAD_C3L-00093.T,0.557200,0.734500,-0.17730,-0.312550,-0.312550,0.000000,-0.312550,-0.312550,0.000000,False,18.150735,14.897932,3.252803,False,False,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYX_UCEC_S067,0.003386,0.003386,0.00000,0.512067,0.950653,-0.438587,0.512067,0.950653,-0.438587,False,20.352733,32.856939,-12.504206,False,False,0.0,0.0,1.0
ZYX_UCEC_S075,0.019500,0.741000,-0.72150,-0.048744,0.950256,-0.999000,-0.048744,0.950256,-0.999000,False,22.233861,26.952750,-4.718888,False,False,0.0,0.0,1.0
ZYX_UCEC_S086,-0.521500,-0.190000,-0.33150,-0.269828,0.012692,-0.282520,-0.269828,0.012692,-0.282520,False,21.290153,18.449954,2.840199,False,False,0.0,0.0,1.0
ZYX_UCEC_S087,-0.040500,0.219000,-0.25950,-0.511296,0.331604,-0.842900,-0.511296,0.331604,-0.842900,False,17.309874,36.406614,-19.096740,False,False,0.0,0.0,1.0


In [None]:
# ## remove na over x percent and impute the rest
# remove = ['paired_value', 'paired_difference', 'paired_rna_expression', 'difference_rna_expression',
#          ]
# filtered_combined = combined[[c for c in combined.columns if c not in remove]]
# filtered_combined

In [None]:
# from sklearn.impute import SimpleImputer
# categorical = ['sample_type', 'data_type', 'has_mutation', 'gene']
# cont = [c for c in filtered_combined.columns if c not in categorical]
# filtered_combined = filtered_combined.replace(np.inf, np.nan)
# imputer = SimpleImputer(strategy='median')
# X = imputer.fit_transform(filtered_combined[cont])

# imputed_df = pd.DataFrame(data=X, index=filtered_combined.index, columns=cont)
# new = pd.concat((filtered_combined[categorical], imputed_df), axis=1)
# new

In [57]:
fc.to_csv('/data/results/all_test_v1.tsv', sep='\t')

In [None]:
# cancer_to_protein['CPTAC3_UCEC_discovery']['proteome']['metadata'].sort_values('case_id').to_csv(
#         '/data/results/ucec_metadata.tsv', sep='\t')

## differentials

In [None]:
fps = sorted(os_helpers.listfiles('/data/differentials/', regex='.tsv.gz'))
fps

In [None]:
fps = sorted(os_helpers.listfiles('/data/differentials/tumor_normal_enrichment/', regex='.tsv.gz'))
fps

In [None]:
cancer_to_site_differentials = {}
for fp in fps:
    
    if 'tumor_normal_diff' in fp:
        cancer = fp.split('/')[-1].split('.')[0]
        if cancer not in cancer_to_site_differentials:
            cancer_to_site_differentials[cancer] = {}
        df = pd.read_csv(fp, sep='\t', index_col='site_id')
        df = df[['tumor_normal_median_log2fc', 't_fdr']]
        if '/acetyl/' in fp: cancer_to_site_differentials[cancer]['acetyl'] = df
        if '/phospho/' in fp: cancer_to_site_differentials[cancer]['phospho'] = df
        if '/ubiquityl/' in fp: cancer_to_site_differentials[cancer]['ubiquityl'] = df

cancer_to_combined_site_diffs = {}
for cancer, diffs in cancer_to_site_differentials.items():
    combined = None
    for k, df in diffs.items():
        df.index = [f'{x}_{k}' for x in df.index]
        df.index.name = 'site_id'
        df['datatype'] = [k] * df.shape[0]
        if combined is None:
            combined = df
        else:
            combined = pd.concat((combined, df), axis=0)
    cancer_to_combined_site_diffs[cancer] = combined
 

In [None]:
cancer_to_combined_site_diffs['CPTAC3_UCEC_discovery']

In [None]:
cancer_to_combined_site_diffs['CPTAC3_UCEC_discovery'].to_csv('/data/results/ucec_differentials.tsv', sep='\t')

In [None]:
df = pd.read_csv('/data/differentials/tumor_normal_enrichment/acetyl/CPTAC3_UCEC_discovery.fisher_exact.v2.1.tsv.gz',
                sep='\t')
df

In [None]:
# df = pd.read_csv('/data/differentials/tumor_normal_diff/acetyl/CPTAC3_UCEC_discovery.fisher_exact.v2.1.tsv.gz',
#                 sep='\t')
# df

In [None]:
# df = pd.read_csv('/data/differentials/tumor_normal_diff/phospho/CPTAC3_UCEC_discovery.fisher_exact.v2.1.tsv.gz',
#                 sep='\t')
# df

In [None]:
# len(set(df['site_id']).intersection(set(cancer_to_protein['CPTAC3_UCEC_discovery']['acetylome']['processed_data'].index)))
# 

In [None]:
# df = pd.read_csv('/data/differentials/tumor_normal_diff/phospho/CPTAC3_GBM_discovery.fisher_exact.v2.1.tsv.gz',
#                 sep='\t')
# df

In [None]:
# df = pd.read_csv('/data/differentials/tumor_normal_enrichment/phospho/CPTAC3_GBM_discovery.fisher_exact.v2.1.tsv.gz',
#                 sep='\t')
# df

## corum

In [None]:
df = pd.read_csv('/data/uniprot_to_hugo_map.tsv', sep='\t', header=None)
uniprot_to_hugo = {k:v for k, v in zip(df[0], df[1])}
uniprot_to_hugo

In [None]:
df = pd.read_csv('/data/corum.tsv', sep='\t')
complexes = [[uniprot_to_hugo[p] for p in entry.split(';') if p in uniprot_to_hugo]
             for entry in df['subunits(UniProt IDs)']]
complexes = [c for c in complexes if len(c)>1]
complexes

In [None]:
import json
json.dump(complexes, open('/data/results/corum_complexes.json', 'w'))

In [None]:
df = pd.read_csv('/data/reactome.tsv', sep='\t', header=None)
df