In [1]:
import os
import numpy as np
import pandas as pd

import sys
sys.path.append('C:\\Users\\Dell\\Desktop\\CV Projects\\prostate_cancer_genomics')

from config import PROSTATE_DATA_PATH

In [2]:
data_dir = os.path.join(PROSTATE_DATA_PATH, 'raw_data')

#### Building mutations

In [3]:
filename = '41588_2018_78_MOESM4_ESM.txt'

In [4]:
df = pd.read_csv(os.path.join(data_dir, filename), sep='\t', low_memory=False, skiprows=1)
df.head()

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_position,Strand,Tumor_Sample_Barcode,Matched_Norm_Sample_Barcode,...,pos,type,classification,ref_allele,patient,Primary_Met,pair_id,individual_id,case_sample,Pair_Set_ID
0,A1BG,1,broad.mit.edu,37,19,58862934,58862934,+,TCGA-EJ-5499,PRAD-TCGA-EJ-5499-Normal-SM-1U3NZ,...,58862934,Missense_Mutation,SNP,G,PRAD-TCGA-EJ-5499-Tumor-SM-1U3IG,Primary,PRAD-TCGA-EJ-5499-TP-NB-SM-1U3IG-SM-1U3NZ,PRAD-TCGA-EJ-5499,PRAD-TCGA-EJ-5499-Tumor-SM-1U3IG,Prim_762017
1,A1BG,1,broad.mit.edu,37,19,58863660,58863660,+,MO_1012,MO_1012-Normal,...,58863660,Missense_Mutation,SNP,A,MO_1012-Tumor-Abdomen_wall_nodule,Metastasis,MO_1012_TM_NB_MO_1012-Tumor-Abdomen_wall_nodul...,MO_1012,MO_1012-Tumor-Abdomen_wall_nodule,Met_762017
2,A1BG,1,broad.mit.edu,37,19,58863782,58863782,+,TCGA-CH-5752,PRAD-TCGA-CH-5752-Normal-SM-1U3IX,...,58863782,Silent,SNP,C,PRAD-TCGA-CH-5752-Tumor-SM-1U3ID,Primary,PRAD-TCGA-CH-5752-TP-NB-SM-1U3ID-SM-1U3IX,PRAD-TCGA-CH-5752,PRAD-TCGA-CH-5752-Tumor-SM-1U3ID,Prim_762017
3,A1BG,1,broad.mit.edu,37,19,58864304,58864304,+,06-134H1_LN,06-134A1_NORMAL,...,58864304,Silent,SNP,C,06-134H1_LN,Metastasis,06-134H1_LN_06-134A1_NORMAL,06-134,06-134H1_LN,Met_762017
4,A1CF,29974,broad.mit.edu,37,10,52569681,52569681,+,SC_9126,SC_9126_Normal,...,52569681,Missense_Mutation,SNP,C,SC_9126_Tumor,Metastasis,SC_9126_TM_NB_SC_9126_Tumor_SC_9126_Normal,SC_9126,SC_9126_Tumor,Met_762017


In [5]:
df.shape

(95354, 56)

In [6]:
print('Mutation distribution')
df['Variant_Classification'].value_counts()

Mutation distribution


Missense_Mutation           51002
Silent                      21346
Intron                      11385
Nonsense_Mutation            2830
Frame_Shift_Del              2755
Splice_Site                  2695
3'UTR                        1200
Frame_Shift_Ins               915
In_Frame_Del                  585
5'UTR                         445
In_Frame_Ins                   79
RNA                            42
Nonstop_Mutation               40
Stop_Codon_Del                 13
Start_Codon_Del                11
De_novo_Start_OutOfFrame        5
De_novo_Start_InFrame           2
Start_Codon_Ins                 2
lincRNA                         1
Stop_Codon_Ins                  1
Name: Variant_Classification, dtype: int64

In [7]:
filter_silent_muts = False    #filter silent mutations
filter_missense_muts = False    #filter missense mutations
filter_introns_muts = False    #filter intron mutations
keep_important_only = True    #keep important mutations only
truncating_only = False

In [8]:
if filter_silent_muts:
    #filter silent mutations
    df = df[df['Variant_Classification'] != 'Silent'].copy()
if filter_missense_muts:
    #filter missense mutations
    df = df[df['Variant_Classification'] != 'Missense_Mutation'].copy()
if filter_introns_muts:
    #filter intron mutations
    df = df[df['Variant_Classification'] != 'Intron'].copy()


if keep_important_only:
    #keep important mutations only
    exclude = ['Silent', 'Intron', "3\'UTR", "5\'UTR", 'RNA', 'lincRNA']
    df = df[~df['Variant_Classification'].isin(exclude)].copy()
if truncating_only:
    include = ['Nonsense_Mutation', 'Frame_Shift_Del', 'Frame_Shift_Ins']
    df = df[df['Variant_Classification'].isin(include)].copy()

In [9]:
df.shape

(60935, 56)

In [10]:
df['Tumor_Sample_Barcode'].sample(5)

35050          SC_9142
73703          SC_9163
85330     TCGA-XK-AAIW
59443    11-028L1_LUNG
68525     TCGA-HC-7232
Name: Tumor_Sample_Barcode, dtype: object

In [11]:
df_table = pd.pivot_table(data=df, index='Tumor_Sample_Barcode', columns='Hugo_Symbol', 
               values='Variant_Classification',
                aggfunc='count')
df_table.head()

Hugo_Symbol,A1BG,A1CF,A2M,A2ML1,A4GALT,A4GNT,AAAS,AACS,AADAC,AADACL3,...,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
Tumor_Sample_Barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00-029N9_LN,,,,,,,,,,,...,,,,,,,,,,
01-087MM_BONE,,,,,,,,,,,...,,,,,,,,,,
01-095N1_LN,,,,,,,,,,,...,,,,,,,,,,
01-120A1_LIVER,,,,,,,,,,,...,,,,,,,,,,
02-083E1_LN,,,,,,,,,,,...,,,,,,,,,,


In [12]:
df_table = df_table.fillna(0)
df_table.head()

Hugo_Symbol,A1BG,A1CF,A2M,A2ML1,A4GALT,A4GNT,AAAS,AACS,AADAC,AADACL3,...,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1,ZZZ3
Tumor_Sample_Barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00-029N9_LN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01-087MM_BONE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01-095N1_LN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01-120A1_LIVER,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
02-083E1_LN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
total_number_of_mutations = df_table.sum().sum()
total_number_of_mutations

60935.0

In [14]:
number_samples = df_table.shape[0]
number_samples

1011

In [15]:
print('Number of mutations', total_number_of_mutations // (number_samples + 0.0))

Number of mutations 60.0


#### Building response

In [16]:
filename = '41588_2018_78_MOESM5_ESM.xlsx'

In [17]:
df = pd.read_excel(os.path.join(data_dir, filename), sheet_name='Supplementary_Table3.txt', skiprows=2)
df.head()

Unnamed: 0,Cancer.Type.Detailed,Patient.ID,Mutation_count,Mutation burden (Mutations per Megabase),Normal Coverage,Tumor Coverage,Fraction of genome altered,Ploidy,Purity,Sample.Type,Metastatic.Site,Radical.Prostatectomy.Gleason.Score.for.Prostate.Cancer,Reviewed.Gleason.Category,Fusion,Age,Data.Source
0,Prostate Adenocarcinoma,AAPC-STID0000011640-Tumor-SM-2XU1H,8,0.206976,87.13,86.01,0.010487,2.024373,0.216579,Primary,,7,3+4,,40.0,AAPC
1,Prostate Adenocarcinoma,AAPC-STID0000021561-Tumor-SM-3RVWB,35,0.838843,171.94,129.16,0.135831,1.969338,0.3,Primary,,7,3+4,,45.0,AAPC
2,Prostate Adenocarcinoma,AAPC-STID0000011949-Tumor-SM-2XU1I,23,0.433527,102.92,69.41,0.190097,2.072929,0.492816,Primary,,>=8,5+4,,46.0,AAPC
3,Prostate Adenocarcinoma,AAPC-STID0000021610-Tumor-SM-2XU13,19,0.364544,111.26,89.49,0.054238,2.091553,0.300396,Primary,,6,3+3,,48.0,AAPC
4,Prostate Adenocarcinoma,AAPC-STID0000021537-Tumor-SM-3RVW7,16,0.460031,106.96,108.37,0.054551,1.855555,0.3,Primary,,7,3+4,,48.0,AAPC


In [18]:
df['Sample.Type'].value_counts()

Primary       680
Metastasis    333
Name: Sample.Type, dtype: int64

#### Building copy number variants

In [19]:
filename = '41588_2018_78_MOESM10_ESM.txt'

In [20]:
df = pd.read_csv(os.path.join(data_dir, filename), sep='\t', low_memory=False, skiprows=1, index_col=0)
df.head()

Unnamed: 0_level_0,00-029N9_LN,01-087MM_BONE,01-095N1_LN,01-120A1_LIVER,02-083E1_LN,03-082H1_LIVER,03-130L_RETROPERITONEAL,03-139E3_RETROPERITONEAL,03-163S4_LIVER,03-192B_LUNG,...,TP_2034,TP_2054,TP_2060,TP_2061,TP_2064,TP_2069,TP_2077,TP_2078,TP_2079,PROS11496-6115321-SM-6CNQ5
Hugo_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PIK3CD,0,0,1.0,0,1,2,1,0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
MTOR,0,0,0.0,0,0,2,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
JUN,0,0,0.0,0,0,0,0,0,0,2,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
NRAS,0,0,0.0,0,0,0,0,1,2,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
NOTCH2,0,0,0.0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
df = df.T
df.head()

Hugo_Symbol,PIK3CD,MTOR,JUN,NRAS,NOTCH2,RIT1,NTRK1,DDR2,MDM4,PARP1,...,MCAM,RNF26,C1QTNF5,MFRP,USP2,LOC100499227,THY1,PVRL1,TRIM29,OAF
00-029N9_LN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01-087MM_BONE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
01-095N1_LN,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
01-120A1_LIVER,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
02-083E1_LN,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [22]:
df = df.fillna(0.)
df.head()

Hugo_Symbol,PIK3CD,MTOR,JUN,NRAS,NOTCH2,RIT1,NTRK1,DDR2,MDM4,PARP1,...,MCAM,RNF26,C1QTNF5,MFRP,USP2,LOC100499227,THY1,PVRL1,TRIM29,OAF
00-029N9_LN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01-087MM_BONE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
01-095N1_LN,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
01-120A1_LIVER,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0
02-083E1_LN,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


#### Building copy number variants burden

In [23]:
filename = '41588_2018_78_MOESM5_ESM.xlsx'

In [24]:
df = pd.read_excel(os.path.join(data_dir, filename), skiprows=2, index_col=1)
df.head()

Unnamed: 0_level_0,Cancer.Type.Detailed,Mutation_count,Mutation burden (Mutations per Megabase),Normal Coverage,Tumor Coverage,Fraction of genome altered,Ploidy,Purity,Sample.Type,Metastatic.Site,Radical.Prostatectomy.Gleason.Score.for.Prostate.Cancer,Reviewed.Gleason.Category,Fusion,Age,Data.Source
Patient.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
AAPC-STID0000011640-Tumor-SM-2XU1H,Prostate Adenocarcinoma,8,0.206976,87.13,86.01,0.010487,2.024373,0.216579,Primary,,7,3+4,,40.0,AAPC
AAPC-STID0000021561-Tumor-SM-3RVWB,Prostate Adenocarcinoma,35,0.838843,171.94,129.16,0.135831,1.969338,0.3,Primary,,7,3+4,,45.0,AAPC
AAPC-STID0000011949-Tumor-SM-2XU1I,Prostate Adenocarcinoma,23,0.433527,102.92,69.41,0.190097,2.072929,0.492816,Primary,,>=8,5+4,,46.0,AAPC
AAPC-STID0000021610-Tumor-SM-2XU13,Prostate Adenocarcinoma,19,0.364544,111.26,89.49,0.054238,2.091553,0.300396,Primary,,6,3+3,,48.0,AAPC
AAPC-STID0000021537-Tumor-SM-3RVW7,Prostate Adenocarcinoma,16,0.460031,106.96,108.37,0.054551,1.855555,0.3,Primary,,7,3+4,,48.0,AAPC


In [25]:
df['Fraction of genome altered']

Patient.ID
AAPC-STID0000011640-Tumor-SM-2XU1H    0.010487
AAPC-STID0000021561-Tumor-SM-3RVWB    0.135831
AAPC-STID0000011949-Tumor-SM-2XU1I    0.190097
AAPC-STID0000021610-Tumor-SM-2XU13    0.054238
AAPC-STID0000021537-Tumor-SM-3RVW7    0.054551
                                        ...   
TCGA-V1-A8MF                          0.088943
TCGA-V1-A8WV                          0.510004
TCGA-V1-A8WW                          0.159161
TCGA-XQ-A8TA                          0.351139
TCGA-YJ-A8SW                          0.499158
Name: Fraction of genome altered, Length: 1013, dtype: float64