# Process TCGA PanCanAtlas Data

Retrieve the downloaded expression data, update gene identifiers to entrez, and curate sample IDs. The script will also identify a balanced hold-out test set to compare projection performance into learned latent spaces across algorithms.

In [1]:
import os
import random
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
random.seed(1234)

## Read TCGA Barcode Curation Information

Extract information from TCGA barcodes - `cancer-type` and `sample-type`. See https://github.com/cognoma/cancer-data for more details

In [3]:
# Commit from https://github.com/cognoma/cancer-data/
sample_commit = 'da832c5edc1ca4d3f665b038d15b19fced724f4c'

In [4]:
url = 'https://raw.githubusercontent.com/cognoma/cancer-data/{}/mapping/tcga_cancertype_codes.csv'.format(sample_commit)
cancer_types_df = pd.read_csv(url,
                              dtype='str',
                              keep_default_na=False)

cancertype_codes_dict = dict(zip(cancer_types_df['TSS Code'],
                                 cancer_types_df.acronym))
cancer_types_df.head(2)

Unnamed: 0,TSS Code,Source Site,Study Name,BCR,acronym
0,1,International Genomics Consortium,ovarian serous cystadenocarcinoma,IGC,OV
1,2,MD Anderson Cancer Center,glioblastoma multiforme,IGC,GBM


In [5]:
url = 'https://raw.githubusercontent.com/cognoma/cancer-data/{}/mapping/tcga_sampletype_codes.csv'.format(sample_commit)
sample_types_df = pd.read_csv(url, dtype='str')

sampletype_codes_dict = dict(zip(sample_types_df.Code,
                                 sample_types_df.Definition))
sample_types_df.head(2)

Unnamed: 0,Code,Definition,Short Letter Code
0,1,Primary Solid Tumor,TP
1,2,Recurrent Solid Tumor,TR


## Read Entrez ID Curation Information

Load curated gene names from versioned resource. See https://github.com/cognoma/genes for more details

In [6]:
# Commit from https://github.com/cognoma/genes
genes_commit = 'ad9631bb4e77e2cdc5413b0d77cb8f7e93fc5bee'

In [7]:
url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/genes.tsv'.format(genes_commit)
gene_df = pd.read_table(url)

# Only consider protein-coding genes
gene_df = (
    gene_df.query("gene_type == 'protein-coding'")
)

print(gene_df.shape)
gene_df.head(2)

(20395, 7)


Unnamed: 0,entrez_gene_id,symbol,description,chromosome,gene_type,synonyms,aliases
0,1,A1BG,alpha-1-B glycoprotein,19,protein-coding,A1B|ABG|GAB|HYST2477,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...
1,2,A2M,alpha-2-macroglobulin,12,protein-coding,A2MD|CPAMD5|FWP007|S863-7,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...


In [8]:
# Load gene updater - old to new Entrez gene identifiers
url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/updater.tsv'.format(genes_commit)
updater_df = pd.read_table(url)

old_to_new_entrez = dict(zip(updater_df.old_entrez_gene_id,
                             updater_df.new_entrez_gene_id))

## Read Gene Expression Data

In [9]:
file = os.path.join('download', 'EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2-v2.geneExp.tsv')
tcga_expr_df = pd.read_table(file, index_col=0)

print(tcga_expr_df.shape)
tcga_expr_df.head()

(20531, 11069)


Unnamed: 0_level_0,TCGA-OR-A5J1-01A-11R-A29S-07,TCGA-OR-A5J2-01A-11R-A29S-07,TCGA-OR-A5J3-01A-11R-A29S-07,TCGA-OR-A5J5-01A-11R-A29S-07,TCGA-OR-A5J6-01A-31R-A29S-07,TCGA-OR-A5J7-01A-11R-A29S-07,TCGA-OR-A5J8-01A-11R-A29S-07,TCGA-OR-A5J9-01A-11R-A29S-07,TCGA-OR-A5JA-01A-11R-A29S-07,TCGA-OR-A5JB-01A-11R-A29S-07,...,TCGA-CG-4449-01A-01R-1157-13,TCGA-CG-4462-01A-01R-1157-13,TCGA-CG-4465-01A-01R-1157-13,TCGA-CG-4466-01A-01R-1157-13,TCGA-CG-4469-01A-01R-1157-13,TCGA-CG-4472-01A-01R-1157-13,TCGA-CG-4474-01A-02R-1157-13,TCGA-CG-4475-01A-01R-1157-13,TCGA-CG-4476-01A-01R-1157-13,TCGA-CG-4477-01A-01R-1157-13
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
?|100130426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
?|100133144,3.2661,2.6815,1.7301,0.0,0.0,1.1673,1.4422,0.0,4.4556,7.1293,...,4.358154,5.676995,5.21935,14.846708,20.115492,6.997533,18.311906,12.057112,18.62874,17.874417
?|100134869,3.9385,8.9948,6.565,1.5492,4.4709,6.0529,2.2876,1.3599,5.0581,0.0,...,2.65636,3.342794,2.423442,5.055287,11.626054,13.654193,7.417109,11.585177,11.482418,14.919338
?|10357,149.135,81.0777,86.4879,53.9117,66.9063,103.506,94.9316,78.1955,69.2389,155.709,...,633.299781,294.018042,686.569179,563.573453,1039.307597,639.238135,742.479964,506.336449,712.452165,703.713324
?|10431,2034.1,1304.93,1054.66,2350.89,1257.99,1866.43,995.027,1762.12,1213.53,2005.57,...,1202.538277,644.002317,1181.884532,663.885074,647.530395,1297.152549,1152.909807,1375.495774,971.893874,1736.988111


## Process gene expression matrix

This involves updating Entrez gene ids, sorting and subsetting.

In [10]:
# Set index as entrez_gene_id
tcga_expr_df.index = tcga_expr_df.index.map(lambda x: x.split('|')[1])

In [11]:
tcga_expr_df = (tcga_expr_df
    .dropna(axis='rows')
    .rename(index=old_to_new_entrez)
    .groupby(level=0).mean()
    .transpose()
    .sort_index(axis='rows')
    .sort_index(axis='columns')
)

tcga_expr_df.index.rename('sample_id', inplace=True)

In [12]:
# Update sample IDs
tcga_expr_df.index = tcga_expr_df.index.str.slice(start=0, stop=15)
tcga_expr_df = tcga_expr_df.loc[~tcga_expr_df.index.duplicated(), :]

In [13]:
# Filter for valid Entrez gene identifiers
tcga_expr_df = tcga_expr_df.loc[:, tcga_expr_df.columns.isin(gene_df.entrez_gene_id.astype(str))]

In [14]:
print(tcga_expr_df.shape)
tcga_expr_df.head()

(11060, 16148)


gene_id,1,10,100,1000,10000,10001,10002,10003,100037417,10004,...,9987,9988,9989,999,9990,9991,9992,9993,9994,9997
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-02-0047-01,125.007,10.449,136.452,2302.47,1297.52,271.674,1.2293,8.6051,187.492,15.3662,...,4033.31,791.278,1810.14,264.913,684.225,1097.76,11.6783,4815.14,288.269,299.948
TCGA-02-0055-01,391.804,1.1212,222.004,1819.76,903.154,321.233,0.0,38.1219,426.853,44.8493,...,4431.67,921.424,2038.96,2.2425,466.534,1399.86,16.8185,2228.45,309.46,787.106
TCGA-02-2483-01,271.852,4.6438,255.831,2888.87,1319.68,458.048,4.6438,5.9103,260.002,29.1293,...,7839.58,846.708,2229.87,18.9974,565.184,1339.95,8.8654,2737.73,306.491,457.203
TCGA-02-2485-01,83.9429,20.0,129.048,6965.71,10136.2,418.571,5.2381,37.619,539.333,19.5238,...,6560.0,1592.74,1840.95,112.381,484.881,726.667,11.4286,2979.05,439.524,426.667
TCGA-02-2486-01,108.256,3.6585,205.488,2250.61,873.171,441.463,1.8293,83.5366,265.061,29.878,...,4330.49,816.335,1134.76,9.1463,397.677,1098.17,12.8049,1340.24,251.22,541.463


## Process TCGA cancer-type and sample-type info from barcodes

Cancer-type includes `OV`, `BRCA`, `LUSC`, `LUAD`, etc. while sample-type includes `Primary`, `Metastatic`, `Solid Tissue Normal`, etc.

See https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tissue-source-site-codes for more details.

The goal is to use this info to stratify training (90%) and testing (10%) balanced by cancer-type and sample-type. 

In [15]:
# Extract sample type in the order of the gene expression matrix
tcga_id = pd.DataFrame(tcga_expr_df.index)

# Extract the last two digits of the barcode and recode sample-type
tcga_id = tcga_id.assign(sample_type = tcga_id.sample_id.str[-2:])
tcga_id.sample_type = tcga_id.sample_type.replace(sampletype_codes_dict)

# Extract the first two ID numbers after `TCGA-` and recode cancer-type
tcga_id = tcga_id.assign(cancer_type = tcga_id.sample_id.str[5:7])
tcga_id.cancer_type = tcga_id.cancer_type.replace(cancertype_codes_dict)

# Append cancer-type with sample-type to generate stratification variable
tcga_id = tcga_id.assign(stratify_samples = tcga_id.cancer_type.str.cat(tcga_id.sample_type))

# Get stratification counts - function cannot work with singleton strats
stratify_counts = tcga_id.stratify_samples.value_counts().to_dict()

# Recode stratification variables if they are singletons
tcga_id = tcga_id.assign(stratify_samples_count = tcga_id.stratify_samples)
tcga_id.stratify_samples_count = tcga_id.stratify_samples_count.replace(stratify_counts)
tcga_id.loc[tcga_id.stratify_samples_count == 1, "stratify_samples"] = "other"

In [16]:
# Write out files for downstream use
file = os.path.join('data', 'tcga_sample_identifiers.tsv')

(
    tcga_id.drop(['stratify_samples', 'stratify_samples_count'], axis='columns')
    .to_csv(file, sep='\t', index=False)
)

print(tcga_id.shape)
tcga_id.head()

(11060, 5)


Unnamed: 0,sample_id,sample_type,cancer_type,stratify_samples,stratify_samples_count
0,TCGA-02-0047-01,Primary Solid Tumor,GBM,GBMPrimary Solid Tumor,154
1,TCGA-02-0055-01,Primary Solid Tumor,GBM,GBMPrimary Solid Tumor,154
2,TCGA-02-2483-01,Primary Solid Tumor,GBM,GBMPrimary Solid Tumor,154
3,TCGA-02-2485-01,Primary Solid Tumor,GBM,GBMPrimary Solid Tumor,154
4,TCGA-02-2486-01,Primary Solid Tumor,GBM,GBMPrimary Solid Tumor,154


In [17]:
cancertype_count_df = (
    pd.DataFrame(tcga_id.cancer_type.value_counts())
    .reset_index()
    .rename({'index': 'cancertype', 'cancer_type': 'n ='}, axis='columns')
)

file = os.path.join('data', 'tcga_sample_counts.tsv')
cancertype_count_df.to_csv(file, sep='\t', index=False)

cancertype_count_df

Unnamed: 0,cancertype,n =
0,BRCA,1218
1,KIRC,606
2,LUAD,576
3,THCA,572
4,UCEC,567
5,HNSC,566
6,LUSC,553
7,PRAD,550
8,LGG,530
9,COAD,495


## Stratify Balanced Training and Testing Sets in TCGA Gene Expression

Output training and testing gene expression datasets

In [18]:
train_df, test_df = train_test_split(tcga_expr_df,
                                     test_size=0.1,
                                     random_state=123,
                                     stratify=tcga_id.stratify_samples_count)

In [19]:
print(train_df.shape)
test_df.shape

(9954, 16148)


(1106, 16148)

In [20]:
train_file = os.path.join('data', 'train_tcga_expression_matrix_processed.tsv.gz')
train_df.to_csv(train_file, sep='\t', compression='gzip', float_format='%.3g')

In [21]:
test_file = os.path.join('data', 'test_tcga_expression_matrix_processed.tsv.gz')
test_df.to_csv(test_file, sep='\t', compression='gzip', float_format='%.3g')

## Sort genes based on median absolute deviation and output to file

In [22]:
# Determine most variably expressed genes and subset
mad_genes_df = pd.DataFrame(train_df.mad(axis=0).sort_values(ascending=False)).reset_index()
mad_genes_df.columns = ['gene_id', 'median_absolute_deviation']

file = os.path.join('data', 'tcga_mad_genes.tsv')
mad_genes_df.to_csv(file, sep='\t', index=False)