# Process GTEx Gene Expression Data

Retrieve the downloaded expression data, update gene identifiers to Entrez, and curate sample IDs. The script will also identify a balanced hold-out test set to compare projection performance into learned latent spaces across algorithms.

**Note:** GTEx version 7 was downloaded from https://www.gtexportal.org/home/datasets

In [1]:
import os
import random
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Fix the state of Python's built-in `random` generator for repeatability.
# (The train/test split further down is controlled by its own `random_state`.)
random.seed(a=1234)

## Read Phenotype Information

In [3]:
# GTEx v7 sample attribute annotations (tab-delimited); the SAMPID column is
# used later to line the annotations up with the expression samples
path = os.path.join('download', 'GTEx_v7_Annotations_SampleAttributesDS.txt')
pheno_df = pd.read_csv(path, sep='\t')

print(pheno_df.shape)
pheno_df.head(3)

(15598, 63)


Unnamed: 0,SAMPID,SMATSSCR,SMCENTER,SMPTHNTS,SMRIN,SMTS,SMTSD,SMUBRID,SMTSISCH,SMTSPAX,...,SME1ANTI,SMSPLTRD,SMBSMMRT,SME1SNSE,SME1PCTS,SMRRNART,SME1MPRT,SMNUM5CD,SMDPMPRT,SME2PCTS
0,GTEX-1117F-0003-SM-58Q7G,,B1,,,Blood,Whole Blood,13756,1188.0,,...,,,,,,,,,,
1,GTEX-1117F-0003-SM-5DWSB,,B1,,,Blood,Whole Blood,13756,1188.0,,...,,,,,,,,,,
2,GTEX-1117F-0003-SM-6WBT7,,B1,,,Blood,Whole Blood,13756,1188.0,,...,,,,,,,,,,


## Read Entrez ID Curation Information

Load curated gene names from a versioned resource. See https://github.com/cognoma/genes for more details.

In [4]:
# Commit hash of https://github.com/cognoma/genes used to build the raw file
# URLs below, so the gene tables are always fetched at the same fixed version
genes_commit = 'ad9631bb4e77e2cdc5413b0d77cb8f7e93fc5bee'

In [5]:
# Fetch the curated gene table at the pinned commit and keep only the rows
# annotated as protein-coding
url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/genes.tsv'.format(genes_commit)
gene_df = pd.read_csv(url, sep='\t').query("gene_type == 'protein-coding'")

print(gene_df.shape)
gene_df.head(2)

(20395, 7)


Unnamed: 0,entrez_gene_id,symbol,description,chromosome,gene_type,synonyms,aliases
0,1,A1BG,alpha-1-B glycoprotein,19,protein-coding,A1B|ABG|GAB|HYST2477,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...
1,2,A2M,alpha-2-macroglobulin,12,protein-coding,A2MD|CPAMD5|FWP007|S863-7,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...


In [6]:
# Build a lookup from old Entrez gene identifiers to their new replacements,
# using the updater table at the same pinned commit
url = 'https://raw.githubusercontent.com/cognoma/genes/{}/data/updater.tsv'.format(genes_commit)
updater_df = pd.read_csv(url, sep='\t')
old_to_new_entrez = dict(
    zip(updater_df['old_entrez_gene_id'], updater_df['new_entrez_gene_id'])
)

## Read Gene Expression Data

In [7]:
# GTEx v7 gene-level TPM matrix. The first two lines of the file are skipped,
# and the second column ('Description') becomes the row index; the 'Name'
# column is kept as a regular column.
file = os.path.join('download', 'GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm.gct.gz')
expr_df = pd.read_csv(file, sep='\t', skiprows=2, index_col=1)

print(expr_df.shape)

expr_df.head(2)

(56202, 11689)


Unnamed: 0_level_0,Name,GTEX-1117F-0226-SM-5GZZ7,GTEX-111CU-1826-SM-5GZYN,GTEX-111FC-0226-SM-5N9B8,GTEX-111VG-2326-SM-5N9BK,GTEX-111YS-2426-SM-5GZZQ,GTEX-1122O-2026-SM-5NQ91,GTEX-1128S-2126-SM-5H12U,GTEX-113IC-0226-SM-5HL5C,GTEX-117YX-2226-SM-5EGJJ,...,GTEX-ZVE2-0006-SM-51MRW,GTEX-ZVP2-0005-SM-51MRK,GTEX-ZVT2-0005-SM-57WBW,GTEX-ZVT3-0006-SM-51MT9,GTEX-ZVT4-0006-SM-57WB8,GTEX-ZVTK-0006-SM-57WBK,GTEX-ZVZP-0006-SM-51MSW,GTEX-ZVZQ-0006-SM-51MR8,GTEX-ZXES-0005-SM-57WCB,GTEX-ZXG5-0005-SM-57WCN
Description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DDX11L1,ENSG00000223972.4,0.1082,0.1158,0.02104,0.02329,0.0,0.04641,0.03076,0.09358,0.121,...,0.09012,0.1462,0.1045,0.0,0.6603,0.695,0.1213,0.4169,0.2355,0.145
WASH7P,ENSG00000227232.4,21.4,11.03,16.75,8.172,7.658,9.372,10.08,13.56,9.889,...,3.926,13.13,5.537,5.789,8.439,7.843,12.39,12.53,8.027,12.76


In [8]:
# One row per distinct 'Description' value, paired with its GTEx 'Name'
expr_gene_ids = (
    expr_df[['Name']]
    .reset_index()
    .drop_duplicates(subset='Description')
)

# Inner join on gene symbol: only genes found in both the expression data
# ('Description') and the curated gene table ('symbol') are retained
map_df = pd.merge(
    expr_gene_ids,
    gene_df,
    how='inner',
    left_on='Description',
    right_on='symbol',
)

# Lookup used to relabel expression rows from gene symbol to Entrez gene ID
symbol_to_entrez = dict(zip(map_df['symbol'], map_df['entrez_gene_id']))

## Process gene expression matrix

This involves updating Entrez gene IDs, sorting, and subsetting.

In [9]:
# Reshape the raw matrix into samples (rows) x genes (columns):
#   1. drop the 'Name' column so only expression values remain
#   2. drop genes with any missing value
#   3. average rows that share the same gene symbol
#   4. keep only the symbols present in the curated mapping
#   5. relabel symbols as Entrez IDs, then apply the old -> new ID updates
#   6. transpose and sort both axes
# NOTE: this cell overwrites `expr_df`; re-running it without first re-reading
# the raw file fails because the 'Name' column has already been dropped.
expr_df = (
    expr_df
    .drop(['Name'], axis='columns')
    .dropna(axis='rows')
    .groupby(level=0).mean()
    .reindex(map_df.symbol)
    .rename(index=symbol_to_entrez)
    .rename(index=old_to_new_entrez)
    .transpose()
    .sort_index(axis='rows')
    .sort_index(axis='columns')
)

# Name both axes for what they hold. The column axis would otherwise keep the
# name 'symbol' (inherited from the reindex) even though its labels are now
# Entrez gene IDs.
expr_df.index.name = 'sample_id'
expr_df.columns.name = 'entrez_gene_id'

expr_df.head(2)

symbol,1,2,9,10,12,13,14,15,16,18,...,101180976,101241878,101362076,101928601,102723547,102724231,102724473,102724928,105375355,105378803
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GTEX-1117F-0226-SM-5GZZ7,4.961,356.1,2.035,0.2791,48.12,1.401,174.4,0.4212,71.11,7.744,...,0.0,0.0,1.646,0.1025,0.0,2.521,0.0,0.0,10.53,0.03187
GTEX-1117F-0426-SM-5EGHI,0.6198,110.0,0.2491,0.0,5.08,0.1115,78.17,0.1274,53.43,6.029,...,0.0,0.0,0.4977,0.0,0.0,1.525,0.0,0.0,0.8182,0.0


## Stratify Balanced Training and Testing Sets in GTEx Gene Expression

Output training and testing gene expression datasets.

In [10]:
# SMTSD label for every expression sample, in the same row order as expr_df;
# used as the stratification variable for the train/test split
strat = pheno_df.set_index('SAMPID')['SMTSD'].reindex(expr_df.index)

In [11]:
# Number of samples per SMTSD label, i.e. the sizes of the strata used below
strat.value_counts()

Muscle - Skeletal                            564
Skin - Sun Exposed (Lower leg)               473
Thyroid                                      446
Adipose - Subcutaneous                       442
Artery - Tibial                              441
Lung                                         427
Nerve - Tibial                               414
Whole Blood                                  407
Esophagus - Mucosa                           407
Skin - Not Sun Exposed (Suprapubic)          387
Esophagus - Muscularis                       370
Adipose - Visceral (Omentum)                 355
Cells - Transformed fibroblasts              343
Heart - Left Ventricle                       303
Artery - Aorta                               299
Heart - Atrial Appendage                     297
Breast - Mammary Tissue                      290
Colon - Transverse                           274
Stomach                                      262
Testis                                       259
Pancreas            

In [12]:
# Hold out 10% of the samples as a test set. Stratifying on `strat` keeps the
# label proportions the same in both partitions, and the fixed random_state
# makes the partition repeatable.
train_df, test_df = train_test_split(
    expr_df,
    test_size=0.1,
    random_state=123,
    stratify=strat,
)

In [13]:
# Sanity check on the split: the training shape is printed, the testing shape
# is shown as the cell's result
print(train_df.shape)
test_df.shape

(10519, 18356)


(1169, 18356)

In [14]:
# Write the training partition as a gzip-compressed, tab-separated file with
# values formatted to 3 significant digits.
# Create the output directory first so the write does not fail on a fresh
# checkout where 'data' does not exist yet.
os.makedirs('data', exist_ok=True)
train_file = os.path.join('data', 'train_gtex_expression_matrix_processed.tsv.gz')
train_df.to_csv(train_file, sep='\t', compression='gzip', float_format='%.3g')

In [15]:
# Write the held-out testing partition as a gzip-compressed, tab-separated
# file with values formatted to 3 significant digits.
# Create the output directory first so the write does not fail on a fresh
# checkout where 'data' does not exist yet.
os.makedirs('data', exist_ok=True)
test_file = os.path.join('data', 'test_gtex_expression_matrix_processed.tsv.gz')
test_df.to_csv(test_file, sep='\t', compression='gzip', float_format='%.3g')