In [1]:
from pathlib import Path

import pandas as pd

# Keep printed frames compact: at most 10 rows and 10 columns, cell text
# truncated at 50 characters, and a total output width of 100 characters.
# `set_option` accepts several (option, value) pairs in a single call.
pd.set_option(
    "display.max_rows", 10,
    "display.max_columns", 10,
    "display.max_colwidth", 50,
    "display.width", 100,
)

In [2]:
# Directories, relative to the working directory:
#   data_dir_raw  - where the raw input tables are read from
#   data_dir_proc - where the processed tables are written to
data_dir_raw = "data/raw"
data_dir_proc = "data/processed"

# Make sure the output directory exists (creates missing parents; no error
# if it is already there).
Path(data_dir_proc).mkdir(exist_ok=True, parents=True)

# load raw data

In [3]:
# Read the tab-separated expression table; its first column becomes the index.
expression = pd.read_csv(
    Path(data_dir_raw, 'expression.tsv'),
    sep='\t',
    index_col=0,
)
# Preview the first rows, then the (rows, columns) shape on its own line.
print(expression.head(), expression.shape, sep='\n')

           TCGA-69-7978-01  TCGA-62-8399-01  TCGA-78-7539-01  TCGA-50-5931-11  TCGA-73-4658-01  \
sample                                                                                           
ARHGEF10L           9.9898          10.4257           9.6264           8.6835           9.2078   
HIF3A               4.2598          11.6239           9.1362           9.4824           5.0288   
RNF17               0.4181           0.0000           1.1231           0.8221           0.0000   
RNF10              10.3657          11.5489          11.6692          11.7341          11.6209   
RNF11              11.1718          11.0200          10.4679          11.6787          11.3414   

           ...  TCGA-55-7727-01  TCGA-91-6831-01  TCGA-MN-A4N4-01  TCGA-55-8302-01  \
sample     ...                                                                       
ARHGEF10L  ...           7.2428           8.8388           9.9341          10.1696   
HIF3A      ...           7.5416           3.5613       

In [4]:
# Read the tab-separated clinical table; its first column becomes the index.
clinical = pd.read_csv(
    Path(data_dir_raw, 'clinical.tsv'),
    sep='\t',
    index_col=0,
)
# Preview the first rows, then the (rows, columns) shape on its own line.
print(clinical.head(), clinical.shape, sep='\n')

          sampleID  ABSOLUTE_Ploidy  ABSOLUTE_Purity  AKT1 ALK_translocation  ...  \
0  TCGA-05-4244-01              NaN              NaN   NaN               NaN  ...   
1  TCGA-05-4249-01             3.77             0.46  none               NaN  ...   
2  TCGA-05-4250-01              NaN              NaN   NaN               NaN  ...   
3  TCGA-05-4382-01              NaN              NaN  none               NaN  ...   
4  TCGA-05-4384-01             2.04             0.48  none               NaN  ...   

             _GENOMIC_ID_TCGA_LUAD_RPPA _GENOMIC_ID_TCGA_LUAD_exp_HiSeqV2_percentile  \
0  59881fe3-70f4-4fa0-9355-f50250872752         e6a101b9-61f9-4ed1-a59f-d9db3fdb4555   
1  5b46a235-49a6-43ea-91b6-82a384702477         d1a8d88d-1708-4959-9695-6f2e67853bd5   
2  3f069d62-c371-4f30-bdb4-c58093bbb1d7         bba9333a-09f7-4585-b22e-e4ae4049f7da   
3                                   NaN         e4177b01-6898-4bb7-b38d-0c09f85c5668   
4  5e735c28-5f2b-47ec-98fb-2f57cc8a5d6a         7

In [5]:
# Read the tab-separated survival table; its first column becomes the index.
survival = pd.read_csv(
    Path(data_dir_raw, 'survival.tsv'),
    sep='\t',
    index_col=0,
)
# Preview the first rows, then the (rows, columns) shape on its own line.
print(survival.head(), survival.shape, sep='\n')

            sample      _PATIENT  OS  OS.time  DSS  ...  DFI  DFI.time  PFI  PFI.time  Redaction
0  TCGA-05-4244-01  TCGA-05-4244   0      0.0  0.0  ...  NaN       NaN    0       0.0        NaN
1  TCGA-05-4249-01  TCGA-05-4249   0   1523.0  0.0  ...  NaN       NaN    0    1523.0        NaN
2  TCGA-05-4250-01  TCGA-05-4250   1    121.0  NaN  ...  NaN       NaN    0     121.0        NaN
3  TCGA-05-4382-01  TCGA-05-4382   0    607.0  0.0  ...  1.0     334.0    1     334.0        NaN
4  TCGA-05-4384-01  TCGA-05-4384   0    426.0  0.0  ...  NaN       NaN    1     183.0        NaN

[5 rows x 11 columns]
(641, 11)


# reshape

In [6]:
# Flip the expression matrix so that each row is a sample and each column a gene.
# NOTE: re-running this cell on its own flips the matrix back again.
expression = expression.transpose()
# Clear both axis labels so no axis name is carried on the frame.
expression.columns.name = expression.index.name = None
print(expression.shape)

(576, 20530)


In [7]:
# Index the clinical table by its 'sampleID' column (the column is consumed
# by the index), then clear the names of both axes.
clinical = (
    clinical
    .set_index('sampleID')
    .rename_axis(index=None, columns=None)
)
print(clinical.shape)

(706, 147)


In [8]:
# Index the survival table by its 'sample' column (the column is consumed
# by the index), then clear the names of both axes.
survival = (
    survival
    .set_index('sample')
    .rename_axis(index=None, columns=None)
)
print(survival.shape)

(641, 10)


# drop rows

In [9]:
# Keep only the survival rows where both the status column (OS) and the
# time column (OS.time) are present; a row missing either one is removed.
survival = survival.loc[survival[['OS', 'OS.time']].notna().all(axis=1), :]
print(survival.shape)

(632, 10)


In [10]:
# Restrict the clinical table to rows whose informed_consent_verified value is
# exactly 'YES' (missing or any other value is excluded).
clinical = clinical[clinical['informed_consent_verified'].eq('YES')]
print(clinical.shape)

(642, 147)


In [11]:
# Tabulate how many clinical rows carry each (sample_type_id, sample_type)
# pair, ordered by the pair itself rather than by frequency.
print(
    clinical[['sample_type_id', 'sample_type']]
    .value_counts()
    .sort_index()
)

sample_type_id  sample_type        
1.0             FFPE Scrolls             1
                Primary Tumor          519
2.0             Recurrent Tumor          2
11.0            Solid Tissue Normal    120
Name: count, dtype: int64


See the TCGA sample type code definitions:

https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes

In [12]:
# Exclude samples whose sample_type_id is 11. Rows with a missing
# sample_type_id compare as "not equal" and are therefore kept.
sample_type_mask = clinical['sample_type_id'].ne(11.0)
clinical = clinical[sample_type_mask]
print(clinical.shape)

(522, 147)


In [13]:
# List every row whose patient_id occurs more than once (keep=False flags
# all members of a duplicated group, not just the repeats).
mask = clinical['patient_id'].duplicated(keep=False)
(
    clinical
    .loc[mask, ['patient_id', 'sample_type_id', 'sample_type']]
    .sort_index()
)

Unnamed: 0,patient_id,sample_type_id,sample_type
TCGA-50-5066-01,5066,1.0,Primary Tumor
TCGA-50-5066-02,5066,2.0,Recurrent Tumor
TCGA-50-5946-01,5946,1.0,Primary Tumor
TCGA-50-5946-02,5946,2.0,Recurrent Tumor


In [14]:
# Keep one sample per patient, preferring the lowest sample_type_id.
# Among the codes tabulated earlier in this notebook, 1 is the primary tumour
# and 2 the recurrent tumour, so this keeps the primary sample where a patient
# has both.
#
# A bare drop_duplicates(keep='first') would only do that if the raw file
# happened to list the primary sample before the recurrent one. Selecting the
# row to keep from a stable sort on sample_type_id removes that dependence on
# file order; ties still resolve to the first row in the existing order.
keep_ids = (
    clinical
    .sort_values('sample_type_id', kind='stable')
    .drop_duplicates('patient_id', keep='first')
    .index
)
# Filter rather than reassign the sorted frame so the row order is unchanged.
clinical = clinical.loc[clinical.index.isin(keep_ids), :]

# Exactly one row per patient must remain.
assert clinical['patient_id'].nunique() == clinical.shape[0]
print(clinical.value_counts('sample_type_id'))

sample_type_id
1.0    520
Name: count, dtype: int64


In [15]:
# Restrict all three tables to the sample IDs present in every one of them,
# and put the rows of each table in the same order.
samples = (
    expression.index
    .intersection(survival.index)
    .intersection(clinical.index)
)

expression = expression.loc[samples]
clinical = clinical.loc[samples]
survival = survival.loc[samples]

# Row i must refer to the same sample in every table.
assert (clinical.index == expression.index).all()
assert (survival.index == expression.index).all()

print(expression.shape, clinical.shape, survival.shape, sep='\n')

(506, 20530)
(506, 147)
(506, 10)


# drop columns

In [16]:
# Clinical columns to discard, grouped by the reason for discarding them.
# Names that are not present in `clinical` are ignored rather than raising.
cols_to_drop = [
    # -- the event column (OS) and time column (OS.time) were derived from these
    'vital_status',
    'days_to_death',
    'days_to_last_followup',

    # -- duplicates of age_at_initial_pathologic_diagnosis
    'days_to_birth',
    'years_to_birth',

    # -- not relevant to this analysis
    'new_tumor_event_after_initial_treatment',  # recurrence is not modelled
    'followup_case_report_form_submission_reason',  # recurrence is not modelled
    'is_ffpe',  # no molecular data sensitive to FFPE samples is analysed
    'tissue_prospective_collection_indicator',
    'tissue_retrospective_collection_indicator',

    # -- already used for filtering; nothing further to gain from them
    'informed_consent_verified',
    'sample_type_id',
    'sample_type',

    # -- administrative fields and identifiers
    'bcr_followup_barcode',
    'bcr_patient_barcode',
    'bcr_sample_barcode',
    'form_completion_date',
    'pathology_report_file_name',
    'patient_id',
    'system_version',
    'tissue_source_site',
    'vial_number',
    'WGS_as_of_20120731_0_no_1_yes',
    '_INTEGRATION',
    '_PATIENT',
    '_GENOMIC_ID_TCGA_LUAD_mutation',
    '_GENOMIC_ID_TCGA_LUAD_mutation_curated_broad_gene',
    '_GENOMIC_ID_TCGA_LUAD_PDMarray',
    '_GENOMIC_ID_TCGA_LUAD_exp_HiSeqV2',
    '_GENOMIC_ID_TCGA_LUAD_G4502A_07_3',
    '_GENOMIC_ID_TCGA_LUAD_hMethyl27',
    '_GENOMIC_ID_data/public/TCGA/LUAD/miRNA_GA_gene',
    '_GENOMIC_ID_TCGA_LUAD_gistic2',
    '_GENOMIC_ID_TCGA_LUAD_hMethyl450',
    '_GENOMIC_ID_TCGA_LUAD_PDMRNAseqCNV',
    '_GENOMIC_ID_TCGA_LUAD_gistic2thd',
    '_GENOMIC_ID_TCGA_LUAD_PDMarrayCNV',
    '_GENOMIC_ID_TCGA_LUAD_exp_HiSeqV2_exon',
    '_GENOMIC_ID_TCGA_LUAD_miRNA_HiSeq',
    '_GENOMIC_ID_TCGA_LUAD_RPPA_RBN',
    '_GENOMIC_ID_TCGA_LUAD_exp_HiSeqV2_PANCAN',
    '_GENOMIC_ID_TCGA_LUAD_PDMRNAseq',
    '_GENOMIC_ID_TCGA_LUAD_RPPA',
    '_GENOMIC_ID_TCGA_LUAD_exp_HiSeqV2_percentile',
    '_GENOMIC_ID_TCGA_LUAD_mutation_broad_gene',
    '_GENOMIC_ID_data/public/TCGA/LUAD/miRNA_HiSeq_gene',
    '_GENOMIC_ID_TCGA_LUAD_miRNA_GA',
]

# errors='ignore' skips any listed name that is absent from the frame.
clinical = clinical.drop(columns=cols_to_drop, errors='ignore')
print(clinical.shape)

(506, 101)


In [17]:
# Remove any survival column whose name also appears as a column of the
# expression table or of the clinical table.
column_mask = ~(
    survival.columns.isin(expression.columns)
    | survival.columns.isin(clinical.columns)
)
survival = survival.loc[:, column_mask]
print(survival.shape)

(506, 10)


In [18]:
# Remove any clinical column whose name also appears as a column of the
# expression table.
already_in_expression = clinical.columns.isin(expression.columns)
column_mask = ~already_in_expression
clinical = clinical.loc[:, column_mask]
print(clinical.shape)

(506, 86)


# save processed data

In [19]:
# Write the three aligned tables to the processed-data directory as
# tab-separated files (the index is written as the first column).
expression.to_csv(Path(data_dir_proc, 'expression.tsv'), sep='\t')
clinical.to_csv(Path(data_dir_proc, 'clinical.tsv'), sep='\t')
survival.to_csv(Path(data_dir_proc, 'survival.tsv'), sep='\t')