## Survival analysis proof-of-concept

Much of the preprocessing code is based on: https://github.com/greenelab/mpmp/blob/master/mpmp/utilities/data_utilities.py#L510

In [1]:
import os
from pathlib import Path

import pandas as pd

### Get clinical endpoint info

In [2]:
# use TCGA clinical data downloaded in mpmp repo
# the location of the mpmp checkout can be overridden by setting the
# MPMP_LOCATION environment variable; when it is unset we fall back to
# the original hard-coded path, so existing setups keep working
mpmp_location = Path(
    os.environ.get('MPMP_LOCATION', '/home/jake/research/mpmp')
)
clinical_filename = (
    mpmp_location / 'data' / 'raw' / 'TCGA-CDR-SupplementalTableS1.xlsx'
)

In [3]:
# Read the 'TCGA-CDR' sheet with the 'bcr_patient_barcode' column as the
# index, then tidy the frame in a single chain:
#   - name the index 'sample_id'
#   - drop the numeric index column ('Unnamed: 0')
#   - shorten the age column name, since we want to use age as a covariate
clinical_df = (
    pd.read_excel(
        clinical_filename,
        sheet_name='TCGA-CDR',
        index_col='bcr_patient_barcode',
        engine='openpyxl',
    )
    .rename_axis('sample_id')
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'age_at_initial_pathologic_diagnosis': 'age'})
)

print(clinical_df.shape)
clinical_df.iloc[:5, :5]

(11160, 32)


Unnamed: 0_level_0,type,age,gender,race,ajcc_pathologic_tumor_stage
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TCGA-OR-A5J1,ACC,58.0,MALE,WHITE,Stage II
TCGA-OR-A5J2,ACC,44.0,FEMALE,WHITE,Stage IV
TCGA-OR-A5J3,ACC,23.0,FEMALE,WHITE,Stage III
TCGA-OR-A5J4,ACC,23.0,FEMALE,WHITE,Stage IV
TCGA-OR-A5J5,ACC,30.0,MALE,WHITE,Stage III


In [4]:
# we want to use overall survival as the target variable except for
# certain cancer types where progression-free intervals are typically
# used (since very few deaths are observed)
# this is recommended in https://doi.org/10.1016/j.cell.2018.02.052
pfi_cancer_types = [
    'BRCA', 'DLBC', 'LGG', 'PCPG', 'PRAD',
    'READ', 'TGCT', 'THCA', 'THYM'
]

# start from the overall survival (OS) columns for every sample
# NOTE(review): astype('bool') turns NaN into True; presumably samples with
# a missing OS/PFI flag also have a missing time and are dropped below --
# confirm against the data
clinical_df['time_in_days'] = clinical_df['OS.time']
clinical_df['status'] = clinical_df['OS'].astype('bool')

# overwrite with the progression-free interval (PFI) columns for samples
# whose cancer type is in pfi_cancer_types; use a single .loc lookup on
# both sides instead of chained indexing
pfi_samples = clinical_df['type'].isin(pfi_cancer_types)
clinical_df.loc[pfi_samples, 'time_in_days'] = (
    clinical_df.loc[pfi_samples, 'PFI.time']
)
clinical_df.loc[pfi_samples, 'status'] = (
    clinical_df.loc[pfi_samples, 'PFI'].astype('bool')
)

# clean up columns and drop samples with NA survival times
na_survival_times = clinical_df['time_in_days'].isna()
cols_to_keep = ['status', 'time_in_days', 'age', 'type']
clinical_df = clinical_df.loc[~na_survival_times, cols_to_keep].copy()

# mean impute missing age values
# assign the result back to the column: calling fillna(inplace=True) on
# clinical_df.age operates on an intermediate Series, which does not update
# the DataFrame when pandas Copy-on-Write is enabled
clinical_df['age'] = clinical_df['age'].fillna(clinical_df['age'].mean())

print(clinical_df.shape)
clinical_df.head()

(11094, 4)


Unnamed: 0_level_0,status,time_in_days,age,type
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TCGA-OR-A5J1,True,1355.0,58.0,ACC
TCGA-OR-A5J2,True,1677.0,44.0,ACC
TCGA-OR-A5J3,False,2091.0,23.0,ACC
TCGA-OR-A5J4,True,423.0,23.0,ACC
TCGA-OR-A5J5,True,365.0,30.0,ACC
