In [264]:
import os

import pandas as pd
import numpy as np

import seaborn as sns
from sklearn.impute import SimpleImputer
import mgitools.os_helpers as os_helpers

In [3]:
# %pip install git+https://github.com/estorrs/mgitools
# %pip install scikit-learn

## read in driver genes

In [318]:
# Load the tab-separated driver-gene table (one Gene/Cancer pair per row) and display it.
genes = pd.read_csv('/data/driver_genes/bailey_smg_table.tsv', sep='\t')
genes

Unnamed: 0,Gene,Cancer
0,ABL1,PANCAN
1,ACVR1,UCEC
2,ACVR1B,PANCAN
3,ACVR2A,COADREAD
4,ACVR2A,LIHC
...,...,...
734,ZMYM3,PRAD
735,ZNF133,OV
736,ZNF750,PANCAN
737,ZNF750,ESCA


In [378]:
# Load the tab-separated driver list (gene plus its tsg/oncogene prediction) and display it.
drivers = pd.read_csv('/data/driver_genes/199_driver_list.txt', sep='\t')
drivers

Unnamed: 0,Gene,Tumor suppressor or oncogene prediction (by 20/20+)
0,PHF6,possible tsg
1,ABL1,
2,ALK,
3,AR,
4,ARAF,
...,...,...
183,KMT2A,tsg
184,KMT2B,tsg
185,MAX,oncogene
186,MED12,oncogene


In [323]:
# Restrict `genes` to the UCEC rows, inner-join them onto the driver list by
# gene symbol, and index the result by gene (index labelled 'gene').
ucec_genes = (
    drivers
    .merge(genes[genes['Cancer'].isin(['UCEC'])], on='Gene')
    .set_index('Gene')
    .rename_axis('gene')
)
ucec_genes

Unnamed: 0_level_0,Tumor suppressor or oncogene prediction (by 20/20+),Cancer
gene,Unnamed: 1_level_1,Unnamed: 2_level_1
TP53,tsg,UCEC
CTNNB1,oncogene,UCEC
KRAS,oncogene,UCEC
PIK3CA,oncogene,UCEC
EP300,tsg,UCEC
FBXW7,possible tsg,UCEC
PPP2R1A,oncogene,UCEC
NFE2L2,oncogene,UCEC
NRAS,oncogene,UCEC
PTEN,tsg,UCEC


In [352]:
# Persist the UCEC driver table as TSV (the 'gene' index is written as the first column).
ucec_genes.to_csv('/data/driver_genes/ucec_drivers.tsv', sep='\t')

In [379]:
# Persist the full driver list as TSV without the positional index.
drivers.to_csv('/data/driver_genes/expanded_drivers.tsv', sep='\t', index=False)

## read in raw protein data

In [61]:
# Collect, in sorted order, every path under /data/v2.0/ that matches the '.gct' pattern.
# NOTE(review): the '.' in the pattern is unescaped and the pattern is unanchored; if
# listfiles treats it as a real regular expression it also matches any "<char>gct"
# substring — confirm against the helper before tightening it.
fps = sorted(os_helpers.listfiles('/data/v2.0/', regex=r'.gct'))
fps

['/data/v2.0/CPTAC2_BRCA_prospective/CPTAC2_BRCA_prospective_broad_acetylome.v2.0.gct',
 '/data/v2.0/CPTAC2_BRCA_prospective/CPTAC2_BRCA_prospective_broad_phosphoproteome.v2.0.gct',
 '/data/v2.0/CPTAC2_BRCA_prospective/CPTAC2_BRCA_prospective_broad_proteome.v2.0.gct',
 '/data/v2.0/CPTAC2_CRC_prospective/CPTAC2_CRC_prospective_pnnl_phosphoproteome.v2.0.gct',
 '/data/v2.0/CPTAC2_CRC_prospective/CPTAC2_CRC_prospective_pnnl_proteome.v2.0.gct',
 '/data/v2.0/CPTAC2_OV_prospective/CPTAC2_OV_prospective_pnnl_phosphoproteome.v2.0.gct',
 '/data/v2.0/CPTAC2_OV_prospective/CPTAC2_OV_prospective_pnnl_proteome.v2.0.gct',
 '/data/v2.0/CPTAC3_CCRCC_discovery/CPTAC3_CCRCC_discovery_umich_phosphoproteome.v2.0.gct',
 '/data/v2.0/CPTAC3_CCRCC_discovery/CPTAC3_CCRCC_discovery_umich_proteome.v2.0.gct',
 '/data/v2.0/CPTAC3_GBM_discovery/CPTAC3_GBM_discovery_pnnl_acetylome.v2.0.gct',
 '/data/v2.0/CPTAC3_GBM_discovery/CPTAC3_GBM_discovery_pnnl_phosphoproteome.v2.0.gct',
 '/data/v2.0/CPTAC3_GBM_discovery/CPTAC3

In [251]:
# Print the first 20 lines of the raw UCEC phosphoproteome GCT file to inspect
# its layout. The context manager closes the handle even if printing fails.
with open('/data/v2.0/CPTAC3_UCEC_discovery/CPTAC3_UCEC_discovery_pnnl_phosphoproteome.v2.0.gct') as f:
    for i, line in enumerate(f):
        # `line` keeps its trailing newline, so each printed line is followed by a blank one.
        print(line)
        if i == 19:  # zero-based index of the 20th line
            break

#1.3

105878	144	21	6

id	original_id	refseq_prot_id	symbol	phosphosites	peptide	peptide_start	peptide_end	refseq_tx_id	uniparc_id	hgnc_id	entrez_gene_id	ensembl_gene_id	multi_genomic_loci	num_exons	aa_len	tx_len	cds_len	uniparc_crc64_checksum	uniparc_xref_refseq_prot_ids	uniparc_xref_ensembl_prot_ids	uniparc_xref_uniprot_ids	S001	S002	S003	S005	S006	S007	S008	S009	S010	S011	S012	S014	S016	S017	S018	S019	S020	S021	S022	S023	S024	S025	S026	S027	S028	S029	S030	S031	S032	S033	S034	S036	S037	S038	S039	S040	S041	S042	S044	S045	S046	S048	S049	S050	S051	S053	S054	S055	S056	S057	S058	S059	S060	S061	S062	S063	S064	S065	S066	S067	S068	S069	S070	S071	S072	S073	S074	S075	S076	S077	S078	S079	S080	S081	S082	S083	S084	S085	S086	S087	S088	S090	S091	S092	S093	S094	S095	S096	S097	S098	S099	S100	S101	S102	S103	S105	S106	S107	S108	S109	S110	S111	S112	S113	S114	S115	S116	S117	S118	S119	S120	S121	S122	S123	S124	S125	S126	S127	S128	S129	S130	S131	S132	S133	S134	S135	S136	S137	S138	S139	S140	S141	S142	S143	S1

In [259]:
def convert_gct_to_df(fp):
    """Parse a GCT file into a feature table and a per-sample metadata table.

    The first line of the file is skipped, the second line must hold four
    tab-separated integers (the first is ignored; the others are the number of
    samples, of feature-metadata columns and of sample-metadata rows), and the
    remainder is read as a tab-separated table with a header row.

    Parameters
    ----------
    fp : str
        Path to the GCT file.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``df``: the rows after the sample-metadata rows, indexed by the ``id``
        column (index named ``site_id``).
        ``metadata``: one row per sample (the last ``n_samples`` columns of the
        file), one column per retained sample-metadata row.

    Side effects
    ------------
    Prints the shape of the data table (before ``id`` becomes the index).
    """
    # The context manager guarantees the handle is released even if parsing fails.
    with open(fp) as f:
        f.readline()  # first line (e.g. "#1.3") is not needed
        _, n_samples, n_feature_metadata, n_sample_metadata = [
            int(x) for x in f.readline().strip().split('\t')
        ]
        df = pd.read_csv(f, sep='\t')

    # Drop a leading 'X' from column names.
    # NOTE(review): presumably undoes R-style name mangling of sample ids that
    # start with a digit — it also strips any genuine leading 'X'; confirm.
    df.columns = [c[1:] if c.startswith('X') else c for c in df.columns]

    ## do some renaming and split metadata into different table
    # NOTE(review): metadata keeps the first `n_sample_metadata - 1` rows while
    # the data table starts at row `n_sample_metadata`, so the row in between
    # ends up in neither table — confirm this is intentional.
    ids = df['id'].to_list()[:n_sample_metadata - 1]
    metadata = df.iloc[:n_sample_metadata - 1, -n_samples:].copy()
    metadata.index = ids
    metadata = metadata.transpose()

    df = df.iloc[n_sample_metadata:, :]
    print(df.shape)
    df = df.set_index('id')
    df.index.name = 'site_id'

    return df, metadata

def get_cancer_to_dfs(fps):
    """Load every GCT file into a nested ``{cancer: {filetype: tables}}`` dict.

    Parameters
    ----------
    fps : iterable of str
        GCT file paths. The cohort name is read from the fourth '/'-separated
        component, so paths are expected to look like
        '/data/v2.0/<cohort>/<cohort>_<lab>_<filetype>.v2.0.gct'.

    Returns
    -------
    dict
        ``d[cancer][filetype] = {'metadata': DataFrame, 'data': DataFrame}``.
        Datatypes for which a cohort has no file are not present.

    Side effects
    ------------
    Prints the cohort and filetype of every file (and, via
    ``convert_gct_to_df``, the shape of every parsed table).
    """
    d = {}
    for fp in fps:
        cancer = fp.split('/')[3]
        # '<...>_<filetype>.v2.0.gct' -> '<filetype>'
        filetype = fp.split('.v2.0')[-2].split('_')[-1]
        print(cancer, filetype)

        if cancer not in d:
            # Seed one empty slot per known datatype ('proteome' is listed
            # last, after the PTM datatypes).
            d[cancer] = {
                k: {} for k in ['acetylome', 'phosphoproteome', 'ubiquitylome', 'proteome']
            }

        df, metadata = convert_gct_to_df(fp)
        d[cancer][filetype] = {
            'metadata': metadata,
            'data': df
        }

    ## filter out empty datatypes
    for cancer in d:
        d[cancer] = {dtype: tables for dtype, tables in d[cancer].items() if len(tables)}

    return d

In [262]:
# Parse every GCT path in `fps` into nested {cohort: {datatype: {'metadata', 'data'}}} tables.
cancer_to_protein = get_cancer_to_dfs(fps)


CPTAC2_BRCA_prospective acetylome
(18328, 144)
CPTAC2_BRCA_prospective phosphoproteome
(63330, 144)
CPTAC2_BRCA_prospective proteome
(9764, 134)
CPTAC2_CRC_prospective phosphoproteome
(41891, 221)
CPTAC2_CRC_prospective proteome
(7402, 202)
CPTAC2_OV_prospective phosphoproteome
(48571, 127)
CPTAC2_OV_prospective proteome
(10095, 109)
CPTAC3_CCRCC_discovery phosphoproteome
(81780, 216)
CPTAC3_CCRCC_discovery proteome
(11355, 200)
CPTAC3_GBM_discovery acetylome
(18767, 131)
CPTAC3_GBM_discovery phosphoproteome
(56292, 131)
CPTAC3_GBM_discovery proteome
(11141, 114)
CPTAC3_HNSCC_discovery phosphoproteome
(66577, 208)
CPTAC3_HNSCC_discovery proteome
(11744, 190)
CPTAC3_LSCC_discovery acetylome
(15056, 229)
CPTAC3_LSCC_discovery phosphoproteome
(68544, 229)
CPTAC3_LSCC_discovery proteome
(11117, 219)
CPTAC3_LSCC_discovery ubiquitylome
(25430, 169)
CPTAC3_LUAD_discovery acetylome
(13368, 233)
CPTAC3_LUAD_discovery phosphoproteome
(64996, 233)
CPTAC3_LUAD_discovery proteome
(10305, 223)
CPTAC

In [131]:
# Inspect the parsed BRCA acetylome table (feature annotation columns followed by per-sample columns).
cancer_to_protein['CPTAC2_BRCA_prospective']['acetylome']['data']

Unnamed: 0_level_0,original_id,refseq_prot_id,symbol,acetylsites,peptide,peptide_start,peptide_end,refseq_tx_id,uniparc_id,hgnc_id,...,21BR002,01BR026,05BR004,03BR006,01BR023,01BR020,20BR006,09BR001,03BR011,01BR010
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG:NP_570602.2:K248,NP_570602.2_K248k _1_1_248_248,NP_570602.2,A1BG,K248,RGEkELLVPR,245,254,NM_130786,UPI0000167B10,HGNC:5,...,,,,,,,,,,
A1BG:NP_570602.2:K78,NP_570602.2_K78k _1_1_78_78,NP_570602.2,A1BG,K78,NGVAQEPVHLDSPAIk,63,78,NM_130786,UPI0000167B10,HGNC:5,...,,,,,,,,,,
A2M:NP_000005.2:K1019,NP_000005.2_K1019k _1_1_1019_1019,NP_000005.2,A2M,K1019,QLNYkHYDGSYSTFGER,1015,1031,NM_000014,UPI0000155718,HGNC:7,...,,,,,,,,,,
A2M:NP_000005.2:K1092,NP_000005.2_K1092k _1_1_1092_1092,NP_000005.2,A2M,K1092,SSGSLLNNAIk,1082,1092,NM_000014,UPI0000155718,HGNC:7,...,-1.3868,-0.401,-0.5238,-0.9945,,,,,,
A2M:NP_000005.2:K115,NP_000005.2_K115k _1_1_115_115,NP_000005.2,A2M,K115,GPTQEFkK,109,116,NM_000014,UPI0000155718,HGNC:7,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZZEF1:NP_055928.3:K2306,NP_055928.3_K2306k _1_1_2306_2306,NP_055928.3,ZZEF1,K2306,DYQLVQkGGGQECGDSR,2300,2316,NM_015113,UPI00004569F7,HGNC:29027,...,3.5776,-0.7349,3.218,-1.2904,,,,,,
ZZZ3:NP_056349.1:K117,NP_056349.1_K117k _1_1_117_117,NP_056349.1,ZZZ3,K117,RQTEPVSPVLkR,107,118,NM_015534,UPI0000074256,HGNC:24523,...,-1.0367,-0.6101,-0.5204,-1.5159,,,,,,
ZZZ3:NP_056349.1:K161,NP_056349.1_K161k _1_1_161_161,NP_056349.1,ZZZ3,K161,STVVDNDADFQGTkR,148,162,NM_015534,UPI0000074256,HGNC:24523,...,-0.5272,-0.2389,1.8514,-0.5683,0.4515,-0.0178,-0.7271,0.1358,-2.175,0.5956
ZZZ3:NP_056349.1:K701,NP_056349.1_K701k _1_1_701_701,NP_056349.1,ZZZ3,K701,VQkYFIK,699,705,NM_015534,UPI0000074256,HGNC:24523,...,-2.3985,0.0171,-0.2915,0.789,0.4362,0.8986,-0.4179,0.7768,-0.963,-1.2672


In [132]:
# Inspect the matching per-sample metadata table (one row per sample column).
cancer_to_protein['CPTAC2_BRCA_prospective']['acetylome']['metadata']

Unnamed: 0,case_id,aliquot_id,tmt_experiment,tmt_channel,sample_type,tumor_id,normal_id
11BR047,11BR047,[ef]-0be77a_D2,1,126,Primary Tumor,140_DM,140_DN
11BR043,11BR043,[6d]-316fca_D2,1,127N,Primary Tumor,139_DM,139_DN
11BR049,11BR049,[2e]-3eaf5a_D2,1,127C,Primary Tumor,297_DM,297_DN
11BR023,11BR023,[07]-48cfab_D2,1,128N,Primary Tumor,102_DM,102_DN
18BR010,18BR010,[0b]-a6643c_D2,1,128C,Primary Tumor,212_DM,212_DN
...,...,...,...,...,...,...,...
01BR020,01BR020,[3f]-2fca26_D1,17,127C,Primary Tumor,352-DM,352-DN
20BR006,20BR006,[3f]-e3d043_D2,17,128N,Primary Tumor,259_DM,259_DN
09BR001,09BR001,[20]-628918,17,129N,Primary Tumor,354-DM,354-DN
03BR011,03BR011,[60]-5fb803,17,129C,Primary Tumor,355-DM,355-DN


In [182]:
# Scratch check: a 2x2 float array filled entirely with NaN.
x = np.full((2, 2), np.nan)
x

array([[nan, nan],
       [nan, nan]])

In [183]:
# Column sums of the all-NaN array: NaN propagates through a plain sum.
x.sum(axis=0)

array([nan, nan])

In [217]:
# Wrap the all-NaN array in a two-column frame ('a', 'b') for the null-count check.
d = pd.DataFrame(x, columns=list('ab'))
d

Unnamed: 0,a,b
0,,
1,,


In [222]:
# Number of non-missing cells in `d`.
np.count_nonzero(d.notna())

0

In [267]:
# def na_mean(df):
# #     print(df.shape)
#     X = df.values[:, 1:]
#     X = X.astype(np.float32)
# #     X = df.values[]
# #     non_zeros = np.count_nonzero(df.values, axis=0)
# #     if not non_zeros or pd.isnull(non_zeros):
# #         return np.nan
# #     print(df.values.flatten())
# #     print(set(df.values.flatten(), axis=0))
# #     print(df.values)
# #     try:
# #     print(df.columns)
#     X = np.sum(X, axis=0) / np.count_nonzero(X, axis=0)
#     return pd.DataFrame(data=X, index=df['symbol'], columns=df.columns[1:])
# #     except:
# # #         print(list(np.sum(df.values, axis=0)))
# # #         raise RuntimeError('stop')
    
# #     return np.arange(df.shape[0])

def filter_sites(df, metadata_df, min_present_frac=.75):
    """Drop sparsely observed rows and median-impute the remaining gaps.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per site/feature; contains one column per sample named in
        ``metadata_df.index`` plus any number of annotation columns.
    metadata_df : pandas.DataFrame
        Per-sample metadata; only its index (the sample ids) is used.
    min_present_frac : float, default 0.75
        A row is kept when at least this fraction of its sample values is
        non-missing. Values low enough to let an all-missing row through will
        make the imputer drop that row's column and the rebuild below fail.

    Returns
    -------
    pandas.DataFrame
        The annotation columns of the kept rows followed by the sample columns
        (in ``metadata_df.index`` order), with each missing value replaced by
        the median of that row across samples.
    """
    sample_ids = list(metadata_df.index)
    sample_df = df[sample_ids]

    ## filter sites with excessive missing NA
    present_frac = sample_df.notna().sum(axis=1).to_numpy() / sample_df.shape[1]
    mask = present_frac >= min_present_frac
    df = df[mask]
    # Samples become rows so that the imputer works per site (column-wise).
    imputed = sample_df[mask].transpose()

    ## impute nans
    # Skipped when no site (or no sample) is left: the imputer rejects empty input.
    if imputed.size:
        imputed = pd.DataFrame(
            data=SimpleImputer(strategy='median').fit_transform(imputed.values),
            index=imputed.index,
            columns=imputed.columns,
        )

    # Select annotation columns by name rather than by a trailing positional
    # slice, which silently breaks when there are zero samples (`[:-0]`) or
    # when the sample columns are not the last columns of `df`.
    annotation = df.drop(columns=sample_ids)
    return pd.concat((annotation, imputed.transpose()), axis=1)
    

    
    

In [268]:
# Filter and impute every cohort/datatype table, storing the result next to the raw
# table under the 'processed_data' key (the nested dicts are updated in place).
# Note: the loop variable `d` rebinds the scratch DataFrame `d` created in an earlier cell.
for cancer, dicts in cancer_to_protein.items():
    for dtype, d in dicts.items():
        print(cancer, dtype)
        d['processed_data'] = filter_sites(d['data'], d['metadata'])
        # raw vs. filtered shape, to see how many rows survived
        print(d['data'].shape, d['processed_data'].shape)

CPTAC2_BRCA_prospective acetylome
(18328, 143) (3800, 143)
CPTAC2_BRCA_prospective phosphoproteome
(63330, 143) (19502, 143)
CPTAC2_BRCA_prospective proteome
(9764, 133) (8890, 133)
CPTAC2_CRC_prospective phosphoproteome
(41891, 220) (5851, 220)
CPTAC2_CRC_prospective proteome
(7402, 201) (5940, 201)
CPTAC2_OV_prospective phosphoproteome
(48571, 126) (2422, 126)
CPTAC2_OV_prospective proteome
(10095, 108) (8624, 108)
CPTAC3_CCRCC_discovery phosphoproteome
(81780, 215) (15255, 215)
CPTAC3_CCRCC_discovery proteome
(11355, 199) (8442, 199)
CPTAC3_GBM_discovery acetylome
(18767, 130) (3388, 130)
CPTAC3_GBM_discovery phosphoproteome
(56292, 130) (19501, 130)
CPTAC3_GBM_discovery proteome
(11141, 113) (9799, 113)
CPTAC3_HNSCC_discovery phosphoproteome
(66577, 207) (13008, 207)
CPTAC3_HNSCC_discovery proteome
(11744, 189) (8845, 189)
CPTAC3_LSCC_discovery acetylome
(15056, 228) (3047, 228)
CPTAC3_LSCC_discovery phosphoproteome
(68544, 228) (23790, 228)
CPTAC3_LSCC_discovery ubiquitylome
(2543

In [270]:
# Inspect the filtered and imputed UCEC acetylome table.
cancer_to_protein['CPTAC3_UCEC_discovery']['acetylome']['processed_data']

Unnamed: 0_level_0,original_id,refseq_prot_id,symbol,acetylsites,peptide,peptide_start,peptide_end,refseq_tx_id,uniparc_id,hgnc_id,...,S144,S145,S146,S147,S148,S149,S150,S151,S152,S153
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAGAB:NP_078942.3:K290,VHAEK#VAK,NP_078942.3,AAGAB,K290,VHAEkVAK,286,293,NM_024666,UPI000013D219,HGNC:25662,...,-0.428,-0.87300,-0.29500,0.15200,-0.53000,-0.75700,-0.72800,-1.16000,-0.59600,-0.22400
ABCE1:NP_001035809.1:K431,QLLHEK#IR,NP_001035809.1,ABCE1,K431,QLLHEkIR,426,433,NM_001040876,UPI0000001226,HGNC:69,...,0.149,-0.04345,-0.04345,-0.04345,-0.04345,-0.04345,-0.04345,-0.04345,-0.04345,-0.04345
ACAA2:NP_006102.2:K13,GVFVVAAK#R,NP_006102.2,ACAA2,K13,GVFVVAAkR,6,14,NM_006111,UPI000006FECE,HGNC:83,...,0.779,0.23200,0.08980,0.14800,0.19800,0.06040,-0.04920,0.26900,0.02100,0.05590
ACAA2:NP_006102.2:K137,FGTK#LGSDIK,NP_006102.2,ACAA2,K137,FGTkLGSDIK,134,143,NM_006111,UPI000006FECE,HGNC:83,...,0.257,0.37200,0.01440,0.18900,0.17800,0.20600,-0.13800,0.41600,0.28200,-0.22200
ACAA2:NP_006102.2:K234,QTMQVDEHARPQTTLEQLQK#LPPVFK,NP_006102.2,ACAA2,K234,QTMQVDEHARPQTTLEQLQkLPPVFK,215,240,NM_006111,UPI000006FECE,HGNC:83,...,1.080,0.21800,0.00190,-0.15900,0.06660,-0.55800,-0.22000,-0.23800,-0.03560,-0.59600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZNF703:NP_079345.1:K141,SAPGAASAAAALK#QLGDSPAEDK,NP_079345.1,ZNF703,K141,SAPGAASAAAALkQLGDSPAEDK,129,151,NM_025069,UPI0000073D30,HGNC:25883,...,0.194,0.71000,0.55000,0.25200,0.67200,0.10800,0.02580,0.06670,0.29800,-0.00305
ZRANB2:NP_976225.1:K54,AGGTEIGK#TLAEK,NP_976225.1,ZRANB2,K54,AGGTEIGkTLAEK,47,59,NM_203350,UPI000013CE63,HGNC:13058,...,-0.604,0.61200,0.44600,0.52400,0.64700,0.06380,0.11200,0.63000,0.69100,0.16700
ZSCAN18:NP_001139014.1:K422,GTAK#LGTK,NP_001139014.1,ZSCAN18,K422,GTAkLGTK,419,426,NM_001145542,UPI00017A6DAF,HGNC:21037,...,0.290,-0.38100,0.35200,-0.17200,-0.22400,-0.15000,-0.31400,-0.67600,-0.37900,0.04730
ZYX:NP_001010972.1:K25,PSPAISVSVSAPAFYAPQKK#,NP_001010972.1,ZYX,K25,PSPAISVSVSAPAFYAPQKk,6,25,NM_001010972,UPI00000424F2,HGNC:13200,...,0.445,0.82000,0.89600,0.46200,0.78300,0.00951,0.28700,1.06000,1.10000,0.56200


In [284]:
# UCEC acetylome sample metadata ordered by case, so samples from the same case sit together.
cancer_to_protein['CPTAC3_UCEC_discovery']['acetylome']['metadata'].sort_values('case_id')

Unnamed: 0,case_id,sample_type,aliquot_ids,tmt_experiment,tmt_channel
S001,C3L-00006,Tumor,CPT0001460012,5,128N
S105,C3L-00006,Adjacent_normal,CPT0001470001,5,128C
S002,C3L-00008,Tumor,CPT0001300009,16,130N
S003,C3L-00032,Tumor,CPT0001420009,2,131
S005,C3L-00090,Tumor,CPT0001140003,12,129C
...,...,...,...,...,...
S140,NX5,Enriched_normal,CPT0189520002;CPT0189520003;CPT0189520004;CPT0...,15,127C
S141,NX6,Enriched_normal,CPT0189170002;CPT0189170003;CPT0189180002;CPT0...,15,128N
S142,NX7,Enriched_normal,CPT0183020002;CPT0183050002;CPT0183050003;CPT0...,15,131
S143,NX8,Enriched_normal,CPT0191190002;CPT0191190003;CPT0191200002;CPT0...,16,127C


In [282]:
# Distinct sample_type labels present in the UCEC acetylome metadata.
set(cancer_to_protein['CPTAC3_UCEC_discovery']['acetylome']['metadata']['sample_type'])

{'Adjacent_normal', 'Enriched_normal', 'Myometrium_normal', 'Tumor'}

In [281]:
# Count how many samples (metadata rows) each case_id contributes, most frequent first.
from collections import Counter
Counter(cancer_to_protein['CPTAC3_UCEC_discovery']['acetylome']['metadata']['case_id']).most_common()

[('C3L-00006', 2),
 ('C3L-00361', 2),
 ('C3L-00563', 2),
 ('C3L-00586', 2),
 ('C3L-00601', 2),
 ('C3L-00605', 2),
 ('C3L-00769', 2),
 ('C3L-00770', 2),
 ('C3L-00771', 2),
 ('C3L-00932', 2),
 ('C3L-00947', 2),
 ('C3L-00963', 2),
 ('C3L-01246', 2),
 ('C3L-01249', 2),
 ('C3L-01252', 2),
 ('C3L-01256', 2),
 ('C3L-01257', 2),
 ('C3L-01282', 2),
 ('C3L-01304', 2),
 ('C3L-01307', 2),
 ('C3L-01311', 2),
 ('C3L-01744', 2),
 ('C3N-00200', 2),
 ('C3N-00333', 2),
 ('C3N-00383', 2),
 ('C3N-00729', 2),
 ('C3N-00858', 2),
 ('C3N-00866', 2),
 ('C3N-01211', 2),
 ('C3N-01346', 2),
 ('C3L-00008', 1),
 ('C3L-00032', 1),
 ('C3L-00090', 1),
 ('C3L-00098', 1),
 ('C3L-00136', 1),
 ('C3L-00137', 1),
 ('C3L-00139', 1),
 ('C3L-00143', 1),
 ('C3L-00145', 1),
 ('C3L-00156', 1),
 ('C3L-00161', 1),
 ('C3L-00358', 1),
 ('C3L-00362', 1),
 ('C3L-00413', 1),
 ('C3L-00449', 1),
 ('C3L-00767', 1),
 ('C3L-00780', 1),
 ('C3L-00781', 1),
 ('C3L-00905', 1),
 ('C3L-00918', 1),
 ('C3L-00921', 1),
 ('C3L-00942', 1),
 ('C3L-00946

In [374]:
## process for final output
def generate_model_input(data_dict):
    """Build the proteome model-input matrix for one cohort.

    Every datatype table is reduced to its sample columns, transposed so that
    samples are rows, suffixed with ``_<datatype>`` and inner-joined on the
    sample id. Only the ``*_proteome`` columns are kept, and the result is
    transposed back to features x samples.

    Parameters
    ----------
    data_dict : dict
        Maps datatype name -> ``{'processed_data': DataFrame, 'metadata':
        DataFrame}``. ``metadata`` is indexed by sample id and
        ``processed_data`` has one column per sample id. Inputs are not
        modified.

    Returns
    -------
    pandas.DataFrame
        Rows are ``<feature>_proteome``; columns are the samples present in
        *every* datatype of ``data_dict`` (inner join).

    Raises
    ------
    ValueError
        If ``data_dict`` is empty.
    """
    if not data_dict:
        raise ValueError('data_dict is empty: no datatype tables to combine')

    output = None
    for dtype, df_dict in data_dict.items():
        metadata = df_dict['metadata']
        # Column selection returns a new frame, so the stored table is untouched.
        filtered = df_dict['processed_data'][metadata.index].transpose()

        ## add in metadata
        if 'sample_type' in metadata.columns:
            # str()/lower() keeps missing or differently-cased labels from
            # raising or being misread; anything without 'normal' is 'tumor'.
            filtered['sample_type'] = [
                'normal' if 'normal' in str(x).lower() else 'tumor'
                for x in metadata['sample_type']
            ]
        else:
            filtered['sample_type'] = ['tumor'] * metadata.shape[0]

        ## rename_columns
        filtered.columns = [f'{c}_{dtype}' for c in filtered.columns]

        if output is None:
            output = filtered
        else:
            output = pd.merge(output, filtered, right_index=True, left_index=True)

    ## normalize sample types
    # Collapse the per-datatype sample_type columns into one (the first wins).
    # drop/assign build a new frame, avoiding assignment on a slice.
    sample_type_cols = [c for c in output.columns if 'sample_type' in c]
    sample_types = output[sample_type_cols].iloc[:, 0].to_list()
    output = output.drop(columns=sample_type_cols).assign(sample_type=sample_types)

    ## split into gene groupings
    ## groupby tissue
#     output = output.groupby('sample_type').mean()
    # NOTE(review): this filter also discards the unified 'sample_type' column
    # built just above, so tumor/normal labels never reach the returned matrix
    # — confirm that is intended.
    output = output[[c for c in output.columns if c.endswith('_proteome')]]
    return output.transpose()
    

In [375]:
# Build the model-input matrix for every cohort, keyed by cohort name.
cancer_to_model_inputs = {}
df = None  # resets the notebook-level `df`; it is not read anywhere in this cell
for cancer, data_dict in cancer_to_protein.items():
    print(cancer)
    output = generate_model_input(data_dict)
    cancer_to_model_inputs[cancer] = output

CPTAC2_BRCA_prospective
CPTAC2_CRC_prospective
CPTAC2_OV_prospective
CPTAC3_CCRCC_discovery
CPTAC3_GBM_discovery
CPTAC3_HNSCC_discovery
CPTAC3_LSCC_discovery
CPTAC3_LUAD_discovery
CPTAC3_UCEC_discovery
TCGA_BRCA_retrospective
TCGA_OV_retrospective


In [376]:
# Inspect the UCEC model-input matrix ('<feature>_proteome' rows x sample columns).
cancer_to_model_inputs['CPTAC3_UCEC_discovery']

Unnamed: 0,S001,S002,S003,S005,S006,S007,S008,S009,S010,S011,...,S144,S145,S146,S147,S148,S149,S150,S151,S152,S153
A1BG_proteome,-1.1800,-0.68500,-0.528,-1.6700,-0.3740,-1.0800,-1.32000,-0.4670,-1.1200,-0.716,...,0.899,1.6000,1.320000,1.4400,1.4600,0.650,0.4580,1.15000,0.547,0.9400
A2M_proteome,-0.8630,-1.07000,-1.320,-1.1900,-0.0206,-0.7080,-0.70800,0.3700,-1.3100,-0.885,...,0.136,0.7600,0.954000,1.7300,2.0000,0.227,0.5200,1.46000,1.270,0.9040
A2ML1_proteome,-0.8020,-0.68400,0.435,-0.4430,-0.5370,-0.1260,-0.80800,-0.3390,0.9120,2.820,...,-1.630,0.6880,-0.628000,-0.2210,-0.3940,1.930,-0.2910,-0.02290,-0.197,-0.0803
AAAS_proteome,0.2560,0.13500,-0.240,-0.0993,0.3750,-0.1140,0.13800,0.4340,-0.0768,0.147,...,-0.293,0.6440,1.020000,0.2750,0.5440,0.239,0.4770,0.25200,0.405,0.2990
AACS_proteome,0.6650,0.33400,1.040,0.7570,0.0131,-0.1110,0.65600,0.0358,0.8460,0.445,...,-1.100,-0.0780,-0.305000,-0.2700,-0.4080,-0.376,-0.1510,-0.23800,0.284,-0.1760
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC_proteome,-0.3320,-0.56400,0.151,-0.3980,-0.6460,-0.0229,-0.68100,0.4860,-0.2220,-0.350,...,-0.154,0.5540,1.490000,1.9000,1.0000,1.330,0.6620,0.04500,1.050,0.5870
ZYG11B_proteome,-0.4330,-0.00461,-0.074,-0.0752,0.2070,-0.3310,-0.28500,0.1690,-0.3200,-0.377,...,0.108,-0.4170,-0.623000,-0.4280,-0.3860,-0.674,-0.5700,-0.31900,-0.356,-0.4770
ZYX_proteome,-1.0200,-1.13000,-0.540,-0.7970,-1.8500,-1.1600,-0.56400,-0.6320,-0.6200,0.388,...,1.730,0.9550,-0.000941,0.1910,0.0358,-0.840,-0.3990,0.83500,0.416,-0.4220
ZZEF1_proteome,-0.1230,-0.07570,0.320,-0.0301,-0.1760,-0.1160,-0.08760,-0.2030,0.3630,0.011,...,0.167,0.0334,-0.164000,0.0493,-0.1420,-0.269,-0.0959,0.16900,0.273,-0.0931


In [377]:
# Persist the UCEC model-input matrix as TSV (row labels are written as the first column).
cancer_to_model_inputs['CPTAC3_UCEC_discovery'].to_csv('/data/results/ucec_proteome_inputs.tsv', sep='\t')

## differentials

In [326]:
# List, in sorted order, the differential-analysis result files under /data/differentials/.
# Note: this rebinds `fps`, which previously held the GCT paths.
fps = sorted(os_helpers.listfiles('/data/differentials/', regex='.tsv.gz'))
fps

['/data/differentials/across_cohort_tumor_enrichment/acetyl.chisq.v2.1.tsv.gz',
 '/data/differentials/across_cohort_tumor_enrichment/phospho.chisq.v2.1.tsv.gz',
 '/data/differentials/tumor_normal_diff/acetyl/CPTAC3_GBM_discovery.fisher_exact.v2.1.tsv.gz',
 '/data/differentials/tumor_normal_diff/acetyl/CPTAC3_LSCC_discovery.fisher_exact.v2.1.tsv.gz',
 '/data/differentials/tumor_normal_diff/acetyl/CPTAC3_LUAD_discovery.fisher_exact.v2.1.tsv.gz',
 '/data/differentials/tumor_normal_diff/acetyl/CPTAC3_UCEC_discovery.fisher_exact.v2.1.tsv.gz',
 '/data/differentials/tumor_normal_diff/phospho/CPTAC2_CRC_prospective.fisher_exact.v2.1.tsv.gz',
 '/data/differentials/tumor_normal_diff/phospho/CPTAC2_OV_prospective.fisher_exact.v2.1.tsv.gz',
 '/data/differentials/tumor_normal_diff/phospho/CPTAC3_CCRCC_discovery.fisher_exact.v2.1.tsv.gz',
 '/data/differentials/tumor_normal_diff/phospho/CPTAC3_GBM_discovery.fisher_exact.v2.1.tsv.gz',
 '/data/differentials/tumor_normal_diff/phospho/CPTAC3_HNSCC_discov

In [354]:
# Narrow the listing to the tumor_normal_enrichment results.
# NOTE(review): this rebinds `fps` once more; any cell that expects `fps` to contain
# tumor_normal_diff paths must run before this one or build its own listing.
fps = sorted(os_helpers.listfiles('/data/differentials/tumor_normal_enrichment/', regex='.tsv.gz'))
fps

['/data/differentials/tumor_normal_enrichment/acetyl/CPTAC3_GBM_discovery.fisher_exact.v2.1.tsv.gz',
 '/data/differentials/tumor_normal_enrichment/acetyl/CPTAC3_LSCC_discovery.fisher_exact.v2.1.tsv.gz',
 '/data/differentials/tumor_normal_enrichment/acetyl/CPTAC3_LUAD_discovery.fisher_exact.v2.1.tsv.gz',
 '/data/differentials/tumor_normal_enrichment/acetyl/CPTAC3_UCEC_discovery.fisher_exact.v2.1.tsv.gz',
 '/data/differentials/tumor_normal_enrichment/phospho/CPTAC2_CRC_prospective.fisher_exact.v2.1.tsv.gz',
 '/data/differentials/tumor_normal_enrichment/phospho/CPTAC2_OV_prospective.fisher_exact.v2.1.tsv.gz',
 '/data/differentials/tumor_normal_enrichment/phospho/CPTAC3_CCRCC_discovery.fisher_exact.v2.1.tsv.gz',
 '/data/differentials/tumor_normal_enrichment/phospho/CPTAC3_GBM_discovery.fisher_exact.v2.1.tsv.gz',
 '/data/differentials/tumor_normal_enrichment/phospho/CPTAC3_HNSCC_discovery.fisher_exact.v2.1.tsv.gz',
 '/data/differentials/tumor_normal_enrichment/phospho/CPTAC3_LSCC_discovery.

In [333]:
# Load the per-cohort tumor-vs-normal site differentials and stack the
# datatypes (acetyl / phospho / ubiquityl) of each cohort into one table.
#
# The file list is built here instead of being taken from the notebook-level
# `fps`, whose contents depend on which listing cell ran last; running the
# notebook top to bottom would otherwise leave this cell with no matching paths.
diff_fps = sorted(os_helpers.listfiles('/data/differentials/tumor_normal_diff/', regex='.tsv.gz'))

cancer_to_site_differentials = {}
for fp in diff_fps:
    if 'tumor_normal_diff' not in fp:
        continue
    # '<cohort>.fisher_exact.v2.1.tsv.gz' -> '<cohort>'
    cancer = fp.split('/')[-1].split('.')[0]
    if cancer not in cancer_to_site_differentials:
        cancer_to_site_differentials[cancer] = {}
    df = pd.read_csv(fp, sep='\t', index_col='site_id')
    # .copy() detaches the two-column slice so the in-place edits below act on
    # an independent frame (no SettingWithCopyWarning).
    df = df[['tumor_normal_median_log2fc', 't_fdr']].copy()
    for datatype in ('acetyl', 'phospho', 'ubiquityl'):
        if f'/{datatype}/' in fp:
            cancer_to_site_differentials[cancer][datatype] = df

cancer_to_combined_site_diffs = {}
for cancer, diffs in cancer_to_site_differentials.items():
    frames = []
    for k, df in diffs.items():
        # Suffix site ids with the datatype so ids stay unique after stacking.
        # The frames stored in cancer_to_site_differentials are edited in place.
        df.index = [f'{x}_{k}' for x in df.index]
        df.index.name = 'site_id'
        df['datatype'] = [k] * df.shape[0]
        frames.append(df)
    # One concat per cohort; None when a cohort has no recognised datatype.
    combined = pd.concat(frames, axis=0) if frames else None
    cancer_to_combined_site_diffs[cancer] = combined
 

In [334]:
# Inspect the stacked UCEC differentials (site ids carry a '_<datatype>' suffix).
cancer_to_combined_site_diffs['CPTAC3_UCEC_discovery']

Unnamed: 0_level_0,tumor_normal_median_log2fc,t_fdr,datatype
site_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A2M:NP_000005.2:K1176_acetyl,-0.430000,1.190734e-01,acetyl
AAGAB:NP_078942.3:K290_acetyl,1.057500,2.652271e-16,acetyl
AATF:NP_036270.1:K380_acetyl,0.308150,7.522213e-02,acetyl
ABCE1:NP_001035809.1:K121_acetyl,0.345400,2.379359e-03,acetyl
ABCE1:NP_001035809.1:K210_acetyl,-0.075950,2.882943e-02,acetyl
...,...,...,...
ZZZ3:NP_056349.1:S314_phospho,-0.170750,4.894967e-02,phospho
ZZZ3:NP_056349.1:S391_phospho,0.005791,4.639110e-01,phospho
ZZZ3:NP_056349.1:S397_phospho,0.085100,2.672347e-01,phospho
ZZZ3:NP_056349.1:S397;S426_phospho,0.163450,5.353662e-01,phospho


In [341]:
# Persist the stacked UCEC differentials as TSV (site_id index written as the first column).
cancer_to_combined_site_diffs['CPTAC3_UCEC_discovery'].to_csv('/data/results/ucec_differentials.tsv', sep='\t')

In [355]:
# Load the UCEC acetyl tumor/normal enrichment table for a quick look.
# Note: rebinds the notebook-level scratch name `df`.
df = pd.read_csv('/data/differentials/tumor_normal_enrichment/acetyl/CPTAC3_UCEC_discovery.fisher_exact.v2.1.tsv.gz',
                sep='\t')
df

Unnamed: 0,site_id,num_detected_tumor,num_na_tumor,num_detected_normal,num_na_normal,estimate,p.value,conf.low,conf.high,method,alternative,fdr
0,A2M:NP_000005.2:K1168,6,89,1,48,3.215097,0.422844,0.373229,151.795937,Fisher's Exact Test for Count Data,two.sided,1.000000
1,A2M:NP_000005.2:K1176,47,48,26,23,0.867053,0.727091,0.409360,1.828295,Fisher's Exact Test for Count Data,two.sided,1.000000
2,A2M:NP_000005.2:K135,6,89,3,46,1.033473,1.000000,0.209152,6.676366,Fisher's Exact Test for Count Data,two.sided,1.000000
3,A2M:NP_000005.2:K145,7,88,2,47,1.862025,0.718335,0.336304,19.069695,Fisher's Exact Test for Count Data,two.sided,1.000000
4,A2M:NP_000005.2:K516,23,72,7,42,1.908565,0.197663,0.714484,5.728291,Fisher's Exact Test for Count Data,two.sided,0.823151
...,...,...,...,...,...,...,...,...,...,...,...,...
14179,ZYX:NP_001010972.1:K265,6,89,2,47,1.579584,0.716248,0.268773,16.602600,Fisher's Exact Test for Count Data,two.sided,1.000000
14180,ZYX:NP_001010972.1:K272,4,91,2,47,1.032737,1.000000,0.142055,11.811098,Fisher's Exact Test for Count Data,two.sided,1.000000
14181,ZYX:NP_001010972.1:K279,85,10,44,5,0.966139,1.000000,0.243595,3.338163,Fisher's Exact Test for Count Data,two.sided,1.000000
14182,ZYX:NP_001010972.1:K533,5,90,3,46,0.852830,1.000000,0.157867,5.732517,Fisher's Exact Test for Count Data,two.sided,1.000000


In [336]:
# df = pd.read_csv('/data/differentials/tumor_normal_diff/acetyl/CPTAC3_UCEC_discovery.fisher_exact.v2.1.tsv.gz',
#                 sep='\t')
# df

In [337]:
# df = pd.read_csv('/data/differentials/tumor_normal_diff/phospho/CPTAC3_UCEC_discovery.fisher_exact.v2.1.tsv.gz',
#                 sep='\t')
# df

In [338]:
# len(set(df['site_id']).intersection(set(cancer_to_protein['CPTAC3_UCEC_discovery']['acetylome']['processed_data'].index)))
# 

In [339]:
# df = pd.read_csv('/data/differentials/tumor_normal_diff/phospho/CPTAC3_GBM_discovery.fisher_exact.v2.1.tsv.gz',
#                 sep='\t')
# df

In [340]:
# df = pd.read_csv('/data/differentials/tumor_normal_enrichment/phospho/CPTAC3_GBM_discovery.fisher_exact.v2.1.tsv.gz',
#                 sep='\t')
# df

## corum

In [344]:
# Header-less two-column TSV: build a lookup from column 0 (UniProt accession)
# to column 1 (gene symbol). If an accession repeats, the last row wins.
df = pd.read_csv('/data/uniprot_to_hugo_map.tsv', header=None, sep='\t')
uniprot_to_hugo = dict(zip(df[0], df[1]))
uniprot_to_hugo

{'P31946': 'YWHAB',
 'P62258': 'YWHAE',
 'Q04917': 'YWHAH',
 'P61981': 'YWHAG',
 'P31947': 'SFN',
 'P27348': 'YWHAQ',
 'P63104': 'YWHAZ',
 'P30443': 'HLA-A',
 'P01892': 'HLA-A',
 'P04439': 'HLA-A',
 'P13746': 'HLA-A',
 'Q96QU6': 'ACCS',
 'Q4AC99': 'ACCSL',
 'P30447': 'HLA-A',
 'P05534': 'HLA-A',
 'P18462': 'HLA-A',
 'P30450': 'HLA-A',
 'P30512': 'HLA-A',
 'P16188': 'HLA-A',
 'P16189': 'HLA-A',
 'P10314': 'HLA-A',
 'P16190': 'HLA-A',
 'P30453': 'HLA-A',
 'P30455': 'HLA-A',
 'P30456': 'HLA-A',
 'P30457': 'HLA-A',
 'P01891': 'HLA-A',
 'P10316': 'HLA-A',
 'P30459': 'HLA-A',
 'Q09160': 'HLA-A',
 'P01889': 'HLA-B',
 'P30460': 'HLA-B',
 'P30461': 'HLA-B',
 'P30462': 'HLA-B',
 'P30464': 'HLA-B',
 'P30466': 'HLA-B',
 'P03989': 'HLA-B',
 'P30685': 'HLA-B',
 'P18463': 'HLA-B',
 'Q95365': 'HLA-B',
 'P30475': 'HLA-B',
 'Q04826': 'HLA-B',
 'P30479': 'HLA-B',
 'P30480': 'HLA-B',
 'P30481': 'HLA-B',
 'P30483': 'HLA-B',
 'P30484': 'HLA-B',
 'P30485': 'HLA-B',
 'P30486': 'HLA-B',
 'P30487': 'HLA-B',
 'P

In [349]:
# Turn each CORUM complex into a list of gene symbols.
df = pd.read_csv('/data/corum.tsv', sep='\t')
# Subunits are ';'-separated UniProt ids; ids missing from uniprot_to_hugo are skipped.
complexes = [[uniprot_to_hugo[p] for p in entry.split(';') if p in uniprot_to_hugo]
             for entry in df['subunits(UniProt IDs)']]
# Keep only complexes with at least two mapped members (duplicate complexes are not removed).
complexes = [c for c in complexes if len(c)>1]
complexes

[['BCL6', 'HDAC4'],
 ['BCL6', 'HDAC5'],
 ['BCL6', 'HDAC7'],
 ['EP300', 'CREBBP', 'KAT2B', 'NCOA3'],
 ['SMC2', 'NCAPH', 'NCAPD2', 'NCAPG', 'SMC4'],
 ['HPS1', 'HPS4'],
 ['HPS6', 'HPS3', 'HPS5'],
 ['CDS1', 'MUS81'],
 ['HDAC3', 'TBL1X', 'NCOR1', 'GPS2', 'CORO2A', 'TBL1XR1'],
 ['SNAPIN',
  'BLOC1S1',
  'BLOC1S3',
  'BLOC1S2',
  'BLOC1S5',
  'DTNBP1',
  'BLOC1S4',
  'BLOC1S6'],
 ['ARPC1B', 'ARPC2', 'ARPC3', 'ARPC5', 'ARPC4', 'ACTR3', 'ACTR2'],
 ['PSME1', 'PSME2'],
 ['PSMD11',
  'PSMD12',
  'PSMD9',
  'PSMD14',
  'PSMD3',
  'PSMD10',
  'PSMC3',
  'PSMC2',
  'PSMC4',
  'PSMD8',
  'PSMD7',
  'PSMD4',
  'PSMC1',
  'PSMC5',
  'PSMC6',
  'PSMD2',
  'PSMD6',
  'PSMD5',
  'PSMD1',
  'PSMD13'],
 ['PFDN6', 'PFDN1', 'VBP1', 'PFDN5', 'PFDN4', 'PFDN2'],
 ['AP1G1', 'AP1G2', 'AP1S2', 'AP1S1', 'AP1B1', 'AP1S3', 'AP1M1', 'AP1M2'],
 ['MTA2', 'MBD3', 'BCL6', 'HDAC1', 'MTA3'],
 ['PSEN1', 'NCSTN', 'APH1A', 'PSENEN'],
 ['PSEN1', 'NCSTN', 'APH1A', 'PSENEN'],
 ['SMARCA5', 'KIF4A', 'SMC2', 'HDAC1', 'SIN3A', 'SMC4', 

In [351]:
import json
# Write the complexes as JSON. The context manager flushes and closes the file
# deterministically instead of leaving the handle to the garbage collector.
with open('/data/results/corum_complexes.json', 'w') as fh:
    json.dump(complexes, fh)

In [346]:
# Load the header-less Reactome table; each row is one '|'-separated list of
# prefixed ids (uniprot:, chebi:, ensembl:). Rebinds the scratch name `df`.
df = pd.read_csv('/data/reactome.tsv', sep='\t', header=None)
df

Unnamed: 0,0
0,uniprot:P08603|chebi:24505|chebi:28879
1,uniprot:Q16621|uniprot:Q9ULX9|uniprot:O15525|u...
2,uniprot:Q16621|uniprot:Q9ULX9|uniprot:O15525|u...
3,ensembl:ENSG00000171855|uniprot:P10914
4,uniprot:Q9H1K0|uniprot:Q9NZN4|uniprot:Q9NZN3|u...
...,...
11576,ensembl:ENSG00000037280|uniprot:Q06330|uniprot...
11577,uniprot:P34130|uniprot:P15209
11578,uniprot:Q3LS18|uniprot:Q9UP38|uniprot:P04628|u...
11579,uniprot:P04637|uniprot:Q96ST3|uniprot:P06876
