# Process TARGET PanCancer Data

Retrieve the downloaded expression data, update gene identifiers to Entrez IDs, and curate sample IDs. This notebook also identifies a balanced hold-out test set that is used to compare, across algorithms, how well held-out samples project into the learned latent spaces.

In [1]:
import os
import random
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Seed Python's built-in `random` module. The train/test split further down
# passes its own explicit `random_state`, so it does not depend on this seed.
random.seed(1234)

## Read Phenotype Information

In [3]:
# Phenotype annotations (one row per sample) from the download directory
path = os.path.join('download', 'TARGET_phenotype.gz')
pheno_df = pd.read_csv(path, sep='\t')

print(pheno_df.shape)
pheno_df.head(3)

(5958, 7)


Unnamed: 0,sample_id,primary_disease_code,_primary_disease,sample_type_code,_sample_type,_PATIENT,_cohort
0,TARGET-00-NAAEMA-20,Non cancerous tissue,Non cancerous tissue,CELLC,Control Analyte,NAAEMA,TARGET
1,TARGET-00-NAAEMB-20,Non cancerous tissue,Non cancerous tissue,CELLC,Control Analyte,NAAEMB,TARGET
2,TARGET-00-NAAEMC-20,Non cancerous tissue,Non cancerous tissue,CELLC,Control Analyte,NAAEMC,TARGET


## Read Entrez ID Curation Information

Load curated gene names from a versioned resource. See https://github.com/cognoma/genes for more details.

In [4]:
# Commit from https://github.com/cognoma/genes
# Pinning the hash keeps the gene tables fetched below identical across runs.
genes_commit = 'ad9631bb4e77e2cdc5413b0d77cb8f7e93fc5bee'

In [5]:
# Gene annotation table at the pinned cognoma/genes commit
url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/genes.tsv'.format(genes_commit)
gene_df = pd.read_csv(url, sep='\t')

# Keep protein-coding genes only
is_protein_coding = gene_df['gene_type'] == 'protein-coding'
gene_df = gene_df.loc[is_protein_coding]

print(gene_df.shape)
gene_df.head(2)

(20395, 7)


Unnamed: 0,entrez_gene_id,symbol,description,chromosome,gene_type,synonyms,aliases
0,1,A1BG,alpha-1-B glycoprotein,19,protein-coding,A1B|ABG|GAB|HYST2477,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...
1,2,A2M,alpha-2-macroglobulin,12,protein-coding,A2MD|CPAMD5|FWP007|S863-7,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...


In [6]:
# Gene updater table: maps old Entrez gene identifiers to their new ones
url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/updater.tsv'.format(genes_commit)
updater_df = pd.read_csv(url, sep='\t')

old_to_new_entrez = dict(
    zip(updater_df['old_entrez_gene_id'], updater_df['new_entrez_gene_id'])
)

## Read Probe Mapping Info

In [7]:
path = os.path.join('download', 'gencode.v23.annotation.gene.probeMap.gz')
probe_map_df = pd.read_csv(path, sep='\t')

# Attach Entrez annotations by gene symbol; the inner join keeps only probes
# whose symbol is present in the (protein-coding) gene table
probe_map_df = pd.merge(
    probe_map_df,
    gene_df,
    how='inner',
    left_on='gene',
    right_on='symbol',
)

# Ensembl id -> Entrez id lookup, used to relabel the expression matrix rows
ensembl_to_entrez = dict(zip(probe_map_df['id'], probe_map_df['entrez_gene_id']))

print(probe_map_df.shape)
probe_map_df.head(3)

(18855, 13)


Unnamed: 0,id,gene,chrom,chromStart,chromEnd,strand,entrez_gene_id,symbol,description,chromosome,gene_type,synonyms,aliases
0,ENSG00000186092.4,OR4F5,chr1,69091,70008,+,79501,OR4F5,olfactory receptor family 4 subfamily F member 5,1,protein-coding,,olfactory receptor 4F5
1,ENSG00000278566.1,OR4F29,chr1,450740,451678,-,729759,OR4F29,olfactory receptor family 4 subfamily F member 29,1,protein-coding,OR7-21,olfactory receptor 4F3/4F16/4F29|olfactory rec...
2,ENSG00000273547.1,OR4F16,chr1,685716,686654,-,81399,OR4F16,olfactory receptor family 4 subfamily F member 16,1,protein-coding,OR1-1|OR7-21,olfactory receptor 4F3/4F16/4F29|olfactory rec...


## Read Gene Expression Data

In [8]:
# Expression matrix as downloaded; the first column becomes the row index
file = os.path.join('download', 'target_RSEM_gene_fpkm.gz')
expr_df = pd.read_csv(file, sep='\t', index_col=0)

print(expr_df.shape)

(60498, 734)


## Process gene expression matrix

This involves subsetting to protein-coding genes, updating Entrez gene IDs, and sorting.

In [9]:
# Convert the raw (gene x sample) matrix into a (sample x gene) matrix indexed
# by up-to-date Entrez gene IDs.
#
# NOTE: this cell overwrites `expr_df`; re-run the loading cell above before
# re-running this one.
expr_df = (expr_df
    # Subset to the mapped probes. reindex() inserts all-NaN rows for any probe
    # id that is absent from the expression matrix ...
    .reindex(probe_map_df.id)
    # ... so drop incomplete genes only AFTER reindexing; doing it before would
    # let those NaN rows flow through to the output matrices.
    .dropna(axis='rows')
    # Ensembl -> Entrez, then old Entrez -> new Entrez
    .rename(index=ensembl_to_entrez)
    .rename(index=old_to_new_entrez)
    # Several probes can resolve to the same Entrez id; average them
    .groupby(level=0).mean()
    .transpose()
    .sort_index(axis='rows')
    .sort_index(axis='columns')
    .rename_axis('sample_id', axis='rows')
)

expr_df.head(2)

id,1,2,9,10,12,13,14,15,16,18,...,102724231,102724398,102724473,102724536,102724631,102724862,102724928,105375355,105378803,105378952
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TARGET-10-PAKSWW-03,5.3754,-1.1488,-1.4305,-9.9658,-7.76965,-9.9658,4.3786,-1.3183,2.0289,0.7321,...,-2.6349,-9.9658,-4.6082,0.1648,-9.9658,-5.0116,-9.9658,-0.8863,-9.9658,-9.9658
TARGET-10-PAMXHJ-09,4.9388,-1.2828,0.2881,-9.9658,-9.9658,-9.9658,3.866,1.4011,3.0791,2.6232,...,-2.2447,-4.2934,-4.6082,0.4016,-9.9658,-4.2934,-9.9658,-2.0529,-9.9658,-9.9658


## Stratify Balanced Training and Testing Sets in TARGET Gene Expression

Output training and testing gene expression datasets

In [10]:
# Disease code for every sample in the expression matrix, aligned to the
# matrix row order; used as the stratification variable for the split
strat = (
    pheno_df
    .set_index('sample_id')
    .reindex(expr_df.index)
    .primary_disease_code
)

# reindex() yields NaN for samples missing from the phenotype table; fail here
# with a clear message instead of handing missing labels to the stratified split
assert strat.notnull().all(), 'some samples in expr_df have no primary_disease_code'

In [11]:
# Number of samples per disease code (the classes used for stratification)
strat.value_counts()

AML       196
ALL       194
NBL       162
WT        132
AML-IF     32
CCSK       13
RT          5
Name: primary_disease_code, dtype: int64

In [12]:
# Hold out 10% of samples as the test set, stratified by disease code;
# the fixed random_state makes the partition reproducible
train_df, test_df = train_test_split(
    expr_df,
    stratify=strat,
    random_state=123,
    test_size=0.1,
)

In [13]:
# Dimensions (samples, genes) of the training and testing partitions
print(train_df.shape)
test_df.shape

(660, 18753)


(74, 18753)

In [14]:
# Write the training partition as a gzipped TSV (values kept to 3 significant digits)
train_file = os.path.join('data', 'train_target_expression_matrix_processed.tsv.gz')

# to_csv does not create missing directories; make sure the output folder
# exists so the notebook runs on a fresh checkout
os.makedirs(os.path.dirname(train_file), exist_ok=True)
train_df.to_csv(train_file, sep='\t', compression='gzip', float_format='%.3g')

In [15]:
# Write the held-out testing partition as a gzipped TSV (values kept to 3 significant digits)
test_file = os.path.join('data', 'test_target_expression_matrix_processed.tsv.gz')

# to_csv does not create missing directories; make sure the output folder
# exists so this cell also works when run on a fresh checkout
os.makedirs(os.path.dirname(test_file), exist_ok=True)
test_df.to_csv(test_file, sep='\t', compression='gzip', float_format='%.3g')