In [5]:
# Load libraries
import pandas as pd
import numpy as np
import os
import json
from glob import glob

In [3]:
# File locations used by the rest of the notebook.
# Everything under DE_DIR is built with os.path.join; the remaining paths are literals.
DE_DIR = "/home/yakiyama/DE_results/020222/"

# --- Differential-expression outputs: HRD vs HRP (global comparison)
HRD_HRP_DDR_DE_PATH = os.path.join(DE_DIR, "HRD_v3/HRD_Global_02/analysis_files/DDR_diffexp_results_v3.tsv")

# --- Differential-expression outputs: Group 3B vs 2A comparison
ACUTE_CHRONIC_FULL_DE_PATH = os.path.join(DE_DIR, "HRD_v3/Group3Bvs2A/analysis_files/full_diffexp_results.tsv")
ACUTE_CHRONIC_DDR_DE_PATH = os.path.join(DE_DIR, "HRD_v3/Group3Bvs2A/analysis_files/DDR_diffexp_results_v3.tsv")
ACUTE_CHRONIC_GSEA_PATH = os.path.join(DE_DIR, "HRD_v3/Group3Bvs2A/analysis_files/full_gsea.tsv")
ACUTE_CHRONIC_PTMSEA_PATH = os.path.join(DE_DIR, "HRD_v3/Group3Bvs2A/analysis_files/ptmsea_combined_results.tsv")

# --- Differential-expression outputs: MMRD vs MMRP (POLE/POLD excluded, per the directory name)
MMRD_FULL_DE_PATH = os.path.join(DE_DIR, "MMRD/MMRDvsMMRP_excludePOLEPOLD/analysis_files/full_diffexp_results.tsv")
MMRD_DDR_DE_PATH = os.path.join(DE_DIR, "MMRD/MMRDvsMMRP_excludePOLEPOLD/analysis_files/DDR_diffexp_results_v3.tsv")
MMRD_GSEA_PATH = os.path.join(DE_DIR, "MMRD/MMRDvsMMRP_excludePOLEPOLD/analysis_files/full_gsea.tsv")

# --- Mutational signatures (collapsed H matrix, uncollapsed H and W matrices)
COLLAPSED_SIGS_PATH = "/home/yakiyama/CPTAC_Signatures/results/Union/Full_H_compressed_v1.1.tsv"
H_MUT_COLLAPSED_PATH = "gs://cptac-pancan-getzlab/Freeze_061721/mutational_signatures/Full_H_compressed_v1.1.tsv"
H_MUT_FULL_PATH = "../results/mutational_signatures/H_uncollapsed_all.tsv"
W_MUT_FULL_PATH = "../results/mutational_signatures/W_uncollapsed_all.tsv"

# --- Sample ID mapping table and gene set definitions
SAMPLE_MAP_PATH = "gs://cptac-pancan-getzlab/mapping/PanCan_Participant_freeze_mapping.tsv"
GENESET_PATH = "/home/yakiyama/DE_data/genesets/092121/full_geneset_v3.json"

# --- CausalPath inputs: one data file and one parameter file per comparison
HRD_HRP_CP_DATA_PATH = "../supplemental_data/causalpath/HRD_Global_02_data.txt"
HRD_HRP_CP_PARAM_PATH = "../supplemental_data/causalpath/HRD_Global_02_parameters.txt"
ACUTE_CP_DATA_PATH = "../supplemental_data/causalpath/Group3Bvs2A_data.txt"
ACUTE_CP_PARAM_PATH = "../supplemental_data/causalpath/Group3Bvs2A_parameters.txt"
MMRD_MMRP_CP_DATA_PATH = "../supplemental_data/causalpath/MMRD_data.txt"
MMRD_MMRP_CP_PARAM_PATH = "../supplemental_data/causalpath/MMRD_parameters.txt"

# --- Directory holding the per-split Kinase Library enrichment tables
KINASE_LIBRARY_PATH = "/home/yakiyama/CPTAC_PanCan_2021/analysis_files/kinase_library_051022/2.Dendrogram/phosphoproteome_res/enrichment_results"


In [3]:
# Combined CausalPath data: the three per-comparison tables are joined on their
# row IDs, the annotation columns they share are collapsed to one copy each, and
# one value column per comparison is kept.
def _first_non_null(frame, label):
    """Return, for each row, the first non-null value among all columns named `label`.

    After the column-wise concat below every shared annotation column exists
    once per input table; a row that is absent from a table holds NaN in that
    table's copy. Rows where every copy is null yield NaN instead of raising.
    """
    copies = frame.loc[:, frame.columns == label]
    result = copies.iloc[:, 0]
    for position in range(1, copies.shape[1]):
        # Keep what we already have; fill the gaps from the next copy.
        result = result.where(result.notna(), copies.iloc[:, position])
    return result


hrd_hrp_cp_df = pd.read_csv(HRD_HRP_CP_DATA_PATH, sep='\t', index_col=0)
hypoxia_cp_df = pd.read_csv(ACUTE_CP_DATA_PATH, sep='\t', index_col=0)
mmrd_mmrp_cp_df = pd.read_csv(MMRD_MMRP_CP_DATA_PATH, sep='\t', index_col=0)

# `axis` is passed by keyword: pandas 2.x accepts only `objs` positionally.
merged_cp_df = pd.concat([hrd_hrp_cp_df, hypoxia_cp_df, mmrd_mmrp_cp_df], axis=1)

annotation_cols = ['Symbols', 'Sites', 'Feature', 'Effect']
# The original collapsed 'Effect' with `.unique()[0]` and no dropna(), so a row
# missing from the first table got NaN even if a later table carried a value;
# every annotation column now uses the same first-non-null rule.
collapsed = {col: _first_non_null(merged_cp_df, col) for col in annotation_cols}

# As before, 'Sites' is only kept for row IDs containing '-P-'; all other rows get NaN.
is_phospho_id = merged_cp_df.index.str.contains('-P-', regex=False, na=False)
collapsed['Sites'] = collapsed['Sites'].where(is_phospho_id)

merged_cp_df = (
    merged_cp_df
    .drop(columns=annotation_cols)  # removes every duplicated copy
    .assign(**collapsed)
    .loc[:, annotation_cols + ['HRD', 'HRD3Bvs2A_3B', 'MMRDvsMMRP_MMRD']]
    .rename(columns={'HRD3Bvs2A_3B': 'HRD_hypoxia_data',
                     'HRD': 'HRD_data',
                     'MMRDvsMMRP_MMRD': 'MMRD_data'})
)
merged_cp_df

Unnamed: 0,Symbols,Sites,Feature,Effect,HRD_data,HRD_hypoxia_data,MMRD_data
KDM4A-P-S410,KDM4A,S410,P,,-0.770617,0.440954,-0.602466
PARP4-P-S1434,PARP4,S1434,P,,0.766541,,
POLD3-P-T277,POLD3,T277,P,,0.010874,0.084983,0.722302
LRWD1-R,LRWD1,,R,,-0.996278,0.646388,-0.793394
BAZ1B-P-S330,BAZ1B,S330,P,,0.015007,0.059176,0.988916
...,...,...,...,...,...,...,...
PAXIP1-P-S227,PAXIP1,S227,P,,,,0.602466
ABRAXAS1-P-S387-T390,ABRAXAS1,S387|T390,P,,,,0.894740
POLE-P-S1297,POLE,S1297,P,,,,0.106377
ABL1-P-S917,ABL1,S917,P,,,,0.935834


In [4]:
# CausalPath parameter files shown side by side, one column per comparison.
# header=None makes every line of a parameter file a data row.
hrd_hrp_param_df = pd.read_csv(HRD_HRP_CP_PARAM_PATH, sep='\t', header=None)
hypoxia_param_df = pd.read_csv(ACUTE_CP_PARAM_PATH, sep='\t', header=None)
mmrd_param_df = pd.read_csv(MMRD_MMRP_CP_PARAM_PATH, sep='\t', header=None)

# `axis` is passed by keyword: pandas 2.x accepts only `objs` positionally.
param_df = pd.concat([hrd_hrp_param_df, hypoxia_param_df, mmrd_param_df], axis=1)
# Assumes each file parsed to exactly one column (no tab characters inside a line);
# otherwise this three-name assignment raises a length mismatch.
param_df.columns = ['HRD_params', 'HRD_hypoxia_params', 'MMRD_params']
param_df

Unnamed: 0,HRD_params,HRD_hypoxia_params,MMRD_params
0,proteomics-values-file = ../../data.txt,proteomics-values-file = ../../data.txt,proteomics-values-file = ../../data.txt
1,id-column = ID,id-column = ID,id-column = ID
2,symbols-column = Symbols,symbols-column = Symbols,symbols-column = Symbols
3,sites-column = Sites,sites-column = Sites,sites-column = Sites
4,feature-column = Feature,feature-column = Feature,feature-column = Feature
5,effect-column = Effect,effect-column = Effect,effect-column = Effect
6,value-transformation = signed-p-values,value-transformation = signed-p-values,value-transformation = signed-p-values
7,threshold-for-data-significance = 0.105 protein,threshold-for-data-significance = 0.105 protein,threshold-for-data-significance = 0.105 protein
8,threshold-for-data-significance = 0.105 phosph...,threshold-for-data-significance = 0.105 phosph...,threshold-for-data-significance = 0.105 phosph...
9,threshold-for-data-significance = 0.105 acetyl...,threshold-for-data-significance = 0.105 acetyl...,threshold-for-data-significance = 0.105 acetyl...


In [12]:
# Preview the signature sample index after translating WXS IDs to Proteome_Sample_ID.
# NOTE(review): in the visible notebook, H_mut_full_df and sample_df are first assigned
# in the data-loading cell that follows this one in file order, so this cell appears to
# rely on out-of-order execution — confirm, then move it after that cell or drop it.
H_mut_full_df.index.map(sample_df.set_index('WXS')['Proteome_Sample_ID'].to_dict())

Index(['C09CO015', 'C11CO070', 'C17OV002', 'C20CO003', 'C3L.00156.T',
       'C3L.00586.T', 'C3L.00601.T', 'C3L.00770.T', 'C3L.01311.T',
       'C3N.00151.T',
       ...
       'C3N.04155', 'C3N.04162', 'C3N.04273', 'C3N.04275', 'C3N.04276',
       'C3N.04277', 'C3N.04278', 'C3N.04279', 'C3N.04280', 'C3N.04611'],
      dtype='object', length=1069)

In [13]:
# Mutational signature data
H_mut_full_df = pd.read_csv(H_MUT_FULL_PATH, sep='\t', index_col=0)
W_mut_df = pd.read_csv(W_MUT_FULL_PATH, sep='\t', index_col=0)
H_mut_collapsed_df = pd.read_csv(H_MUT_COLLAPSED_PATH, sep='\t', index_col=0)
## Drop every column whose name contains 'UV' from the uncollapsed H and W matrices
## (original note: these columns come from runs that included the UV sample)
H_mut_full_df = H_mut_full_df[[col for col in H_mut_full_df.columns if 'UV' not in col]]
W_mut_df = W_mut_df[[col for col in W_mut_df.columns if 'UV' not in col]]

# Global DDR analyses
hrd_hrp_ddr_df = pd.read_csv(HRD_HRP_DDR_DE_PATH, sep='\t', index_col=0)
mmrd_ddr_df = pd.read_csv(MMRD_DDR_DE_PATH, sep='\t', index_col=0)
mmrd_gsea_df = pd.read_csv(MMRD_GSEA_PATH, sep='\t', index_col=0)

# HRD Subtype analyses
acute_full_de_df = pd.read_csv(ACUTE_CHRONIC_FULL_DE_PATH, sep='\t', index_col=0)
acute_ddr_de_df = pd.read_csv(ACUTE_CHRONIC_DDR_DE_PATH, sep='\t', index_col=0)
acute_chronic_gsea_df = pd.read_csv(ACUTE_CHRONIC_GSEA_PATH, sep='\t', index_col=0)
acute_ptmsea_df = pd.read_csv(ACUTE_CHRONIC_PTMSEA_PATH, sep='\t', index_col=0)

# MMRD analyses
mmrd_full_de_df = pd.read_csv(MMRD_FULL_DE_PATH, sep='\t', index_col=0)
# Same file as mmrd_ddr_df above; an independent copy is kept under the second
# name instead of parsing the file again. (mmrd_gsea_df was also loaded twice in
# the original; it is now read once, in the Global DDR section.)
mmrd_ddr_de_df = mmrd_ddr_df.copy()

# Geneset: the 'DDR' entry of the geneset JSON as a one-column table
with open(GENESET_PATH, "r") as geneset_file:
    gs_d = json.load(geneset_file)
ddr_gs_df = pd.DataFrame(pd.Series(gs_d['DDR']), columns=['Genes'])

# PTM-SEA Subset for just one side of the comparison
acute_ptmsea_df = acute_ptmsea_df[acute_ptmsea_df['id'] == 'HRD3Bvs2A_3B']

# Mutational signatures: standardize sample IDs to Proteome_Sample_ID.
# The lookup is built once and reused. Index labels that are not present in the
# WXS column map to NaN (Index.map with a dict), so unmapped samples lose their ID.
sample_df = pd.read_csv(SAMPLE_MAP_PATH, sep='\t')
wxs_to_proteome_id = sample_df.set_index('WXS')['Proteome_Sample_ID'].to_dict()
H_mut_full_df.index = H_mut_full_df.index.map(wxs_to_proteome_id)
H_mut_collapsed_df.index = H_mut_collapsed_df.index.map(wxs_to_proteome_id)

  interactivity=interactivity, compiler=compiler, result=result)


In [14]:
# Write Supplemental Table 3: one worksheet per result table.
# xlsxwriter workbook options: store strings as plain text rather than letting
# xlsxwriter convert them to formulas or hyperlinks.
options = {}
options['strings_to_formulas'] = False
options['strings_to_urls'] = False

# Sheet name -> table, in the order the sheets are written.
supp_table_3_sheets = {
    # Mutational signatures
    'Table 3A': W_mut_df,
    'Table 3B': H_mut_full_df,
    'Table 3C': H_mut_collapsed_df,
    # Geneset
    'Table 3D': ddr_gs_df,
    # Global DDR Analysis
    'Table 3E': hrd_hrp_ddr_df,
    'Table 3F': mmrd_full_de_df,
    'Table 3G': mmrd_ddr_df,
    'Table 3H': mmrd_gsea_df,
    # HRD Subtype analyses
    'Table 3I': acute_full_de_df,
    'Table 3J': acute_ddr_de_df,
    'Table 3K': acute_chronic_gsea_df,
    'Table 3L': acute_ptmsea_df,
    # CausalPath (currently not exported)
    # 'Table 3M': merged_cp_df,
    # 'Table 3N': param_df,
}

# The original spelled the keyword `endgine`, so the engine was never selected
# explicitly. Workbook options go through `engine_kwargs` (pandas >= 1.3); passing
# `options=` directly is no longer accepted by pandas 2.x.
with pd.ExcelWriter("Supplemental_Table_3.xlsx", engine='xlsxwriter',
                    engine_kwargs={'options': options}) as excel_writer:
    for sheet_name, sheet_df in supp_table_3_sheets.items():
        sheet_df.to_excel(excel_writer, sheet_name=sheet_name)
    # No explicit save(): leaving the `with` block closes and writes the workbook.


## Kinase Library results for Supplemental Table 2

In [27]:
# Collect every per-split Kinase Library enrichment table into one long table,
# tagging each row with the dendrogram split it came from.
dendro_frames = []
for dendro_file in glob(os.path.join(KINASE_LIBRARY_PATH, "*tsv")):
    # The split id is the part of the file name before the first '.', parsed as an int.
    dendrosplit_id = int(os.path.basename(dendro_file).split('.')[0])
    dendro_df = pd.read_csv(dendro_file, sep='\t')
    dendro_df['dendrogram_split'] = dendrosplit_id
    dendro_frames.append(dendro_df)

if not dendro_frames:
    # Fail with a clear message instead of a KeyError from sort_values on an empty frame.
    raise FileNotFoundError("No '*tsv' files found in {}".format(KINASE_LIBRARY_PATH))

# Concatenate once (growing a frame inside the loop copies all rows on every pass).
# The per-file row index is kept, as before. The stable sort keeps the rows of each
# split in their original file order.
full_dendro_df = (
    pd.concat(dendro_frames)
    .rename(columns={"Unnamed: 0": "geneSymbol"})
    .sort_values("dendrogram_split", kind="mergesort")
)

In [30]:
# Export the combined Kinase Library table as TSV and as a single-sheet workbook.
full_dendro_df.to_csv("../analysis_files/kinase_library_051022/full_dendrogram_KinaseLibrary_results.tsv",
                      sep='\t')

# xlsxwriter workbook options: store strings as plain text rather than letting
# xlsxwriter convert them to formulas or hyperlinks.
options = {}
options['strings_to_formulas'] = False
options['strings_to_urls'] = False

# The original spelled the keyword `endgine`, so the engine was never selected
# explicitly. Workbook options go through `engine_kwargs` (pandas >= 1.3); passing
# `options=` directly is no longer accepted by pandas 2.x.
with pd.ExcelWriter("../analysis_files/kinase_library_051022/full_dendrogram_KinaseLibrary_results.xlsx",
                    engine='xlsxwriter', engine_kwargs={'options': options}) as excel_writer:
    full_dendro_df.to_excel(excel_writer, sheet_name="KinaseLibrary")
    # No explicit save(): leaving the `with` block closes and writes the workbook.