In [1]:
import os
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [2]:
# Locations of the raw inputs; both live under data/raw
raw_dir = os.path.join('data', 'raw')
rna_file = os.path.join(raw_dir, 'HiSeqV2')
# mut_file = os.path.join('data', 'raw', 'PANCAN_mutation')
clinical_file = os.path.join(raw_dir, 'samples.tsv')

In [3]:
# Destinations for the processed tables; all are written directly under data/
out_dir = 'data'
rna_out_file = os.path.join(out_dir, 'pancan_scaled_rnaseq.tsv')
rna_out_zeroone_file = os.path.join(out_dir, 'pancan_scaled_zeroone_rnaseq.tsv')
# mut_out_file = os.path.join('data', 'pancan_mutation.tsv')
clinical_processed_out_file = os.path.join(out_dir, 'clinical_data.tsv')

In [4]:
# Read the raw tab-separated inputs; the first column of each file becomes the row index
rnaseq_df = pd.read_csv(rna_file, sep='\t', index_col=0)
# mutation_df = pd.read_table(mut_file)
clinical_df = pd.read_csv(clinical_file, sep='\t', index_col=0, low_memory=False)

In [5]:
# Maps raw clinical column names (keys) to the short names used in the
# processed clinical table (values). The keys select which columns are kept
# and the whole dict is passed to `rename`, so anything not listed here is
# dropped from the output. Keep the entry order stable: the keys are handed to
# `DataFrame.filter(items=...)`, which may determine output column order.
# NOTE(review): 'gdc_cases.project.primary_site' is mapped to 'hospital', but
# the field name and the sample values in the rendered output (Stomach, Skin,
# Cervix, ...) look like an organ/tissue site rather than a hospital. Confirm
# the intended name before renaming, since it is written to the output file.
clinical_columns_dict = {
    'gdc_platform': 'platform',
    'gdc_center.short_name': 'analysis_center',
    'gdc_cases.submitter_id': 'sample_id',
    'gdc_cases.demographic.gender': 'gender',
    'gdc_cases.demographic.race': 'race',
    'gdc_cases.demographic.ethnicity': 'ethnicity',
    'gdc_cases.project.primary_site': 'hospital',
    'gdc_cases.project.project_id': 'acronym',
    'gdc_cases.tissue_source_site.project': 'disease',
    'gdc_cases.diagnoses.vital_status': 'vital_status',
    'gdc_cases.samples.sample_type': 'sample_type',
    'cgc_case_age_at_diagnosis': 'age_at_diagnosis',
    'cgc_portion_id': 'portion_id',
    'cgc_slide_percent_tumor_nuclei': 'percent_tumor_nuclei',
    'cgc_drug_therapy_drug_name': 'drug',
    'xml_year_of_initial_pathologic_diagnosis': 'year_of_diagnosis',
    'xml_stage_event_pathologic_stage': 'stage', 
}

In [6]:
# Tidy the RNAseq matrix.
# Row labels: keep only the text before the first '|'.
rnaseq_df.index = rnaseq_df.index.map(lambda gene_id: gene_id.split('|')[0])
# Column labels: keep only the first 15 characters.
rnaseq_df.columns = rnaseq_df.columns.str[:15]

# Remove the '?' rows, fill missing values with 0, order the columns, remove
# SLC35E2 (the gene is listed twice in the RNAseq data, so both occurrences
# are dropped), and finally transpose so the original columns become rows.
rnaseq_df = (
    rnaseq_df
    .drop('?')
    .fillna(0)
    .sort_index(axis=1)
    .drop('SLC35E2', axis=0)
    .T
)

In [7]:
# mutation_df.head()

In [8]:
# Mutation effect labels that the (currently commented-out) mutation
# processing below filters on when building its binary matrix
mutations = {
    # frameshift / in-frame indels
    'Frame_Shift_Del', 'Frame_Shift_Ins',
    'In_Frame_Del', 'In_Frame_Ins',
    # point-level coding changes
    'Missense_Mutation', 'Nonsense_Mutation', 'Nonstop_Mutation',
    # remaining categories
    'RNA', 'Splice_Site', 'Translation_Start_Site',
}

# Process synapse mutations
# mut_pivot = (mutation_df.query("effect in @mutations")
#                        .groupby(['#sample', 'chr',
#                                  'gene'])
#                        .apply(len).reset_index()
#                        .rename(columns={0: 'mutation'}))

# mut_pivot = (mut_pivot.pivot_table(index='#sample',
#                                   columns='gene', values='mutation',
#                                   fill_value=0)
#                      .astype(bool).astype(int))

In [9]:
# Rank genes (columns) by their mean absolute deviation across samples and
# keep the labels of the most variable ones.
#
# `DataFrame.mad` was deprecated in pandas 1.5 and removed in pandas 2.0, so
# the statistic is computed explicitly here. This is the same calculation
# `mad` performed: the mean of |x - column mean| per column (note: *mean*,
# not median, absolute deviation).
num_mad_genes = 5000  # number of top-ranked genes to keep

abs_deviation_df = (rnaseq_df - rnaseq_df.mean(axis=0)).abs()
mad_genes = abs_deviation_df.mean(axis=0).sort_values(ascending=False)
top_mad_genes = mad_genes.iloc[:num_mad_genes].index

In [10]:
# Restrict the expression matrix to the selected top-MAD gene columns
rnaseq_subset_df = rnaseq_df[top_mad_genes]

In [11]:
# Standardize every gene column with StandardScaler, re-attach the row and
# column labels (fit_transform returns a bare array), and write the result
# as a tab-separated file
rnaseq_scaled_df = pd.DataFrame(
    preprocessing.StandardScaler().fit_transform(rnaseq_subset_df),
    index=rnaseq_subset_df.index,
    columns=rnaseq_subset_df.columns,
)
rnaseq_scaled_df.to_csv(rna_out_file, sep='\t')

In [12]:
# Rescale every gene column with MinMaxScaler (default settings), re-attach
# the row and column labels (fit_transform returns a bare array), and write
# the result as a tab-separated file
rnaseq_scaled_zeroone_df = pd.DataFrame(
    preprocessing.MinMaxScaler().fit_transform(rnaseq_subset_df),
    index=rnaseq_subset_df.index,
    columns=rnaseq_subset_df.columns,
)
rnaseq_scaled_zeroone_df.to_csv(rna_out_zeroone_file, sep='\t')

In [13]:
# Keep only the mapped clinical columns, switch them to their short names,
# and index the table by sample_id (the column itself is removed)
clinical_sub_df = (
    clinical_df
    .filter(items=clinical_columns_dict.keys())
    .rename(columns=clinical_columns_dict)
    .set_index('sample_id')
)

# Strip the first five characters of the project id
# (presumably a 'TCGA-' prefix; verify against the raw data)
clinical_sub_df['acronym'] = clinical_sub_df['acronym'].str[5:]

clinical_sub_df.to_csv(clinical_processed_out_file, sep='\t')
clinical_sub_df.head()

Unnamed: 0_level_0,ethnicity,sample_type,age_at_diagnosis,acronym,drug,disease,percent_tumor_nuclei,vital_status,gender,portion_id,platform,race,hospital,stage,year_of_diagnosis,analysis_center
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
TCGA-CD-8534,not hispanic or latino,Primary Tumor,41.0,STAD,,Stomach adenocarcinoma,85.0,alive,male,TCGA-CD-8534-01A-11,Illumina HiSeq,asian,Stomach,Stage II,2011,BCGSC
TCGA-ER-A19A,not hispanic or latino,Metastatic,79.0,SKCM,,Skin Cutaneous Melanoma,85.0,alive,male,TCGA-ER-A19A-06A-21,Illumina HiSeq,white,Skin,Stage IV,2006,UNC
TCGA-C5-A1M8,not hispanic or latino,Primary Tumor,43.0,CESC,,Cervical squamous cell carcinoma and endocervi...,85.0,alive,female,TCGA-C5-A1M8-01A-21,Illumina HiSeq,white,Cervix,,2001,UNC
TCGA-D1-A0ZN,not hispanic or latino,Primary Tumor,60.0,UCEC,,Uterine Corpus Endometrial Carcinoma,60.0,alive,female,TCGA-D1-A0ZN-01A-11,Illumina GA,white,Uterus,,2009,UNC
TCGA-EM-A4FF,not reported,Primary Tumor,40.0,THCA,,Thyroid carcinoma,75.0,alive,female,TCGA-EM-A4FF-01A-11,Illumina HiSeq,not reported,Thyroid,Stage I,2006,UNC
