# 01: Data download and preprocessing

**Author:** Grace Akatsu

**Class:** CPBS 7602, Fall 2025

---
## Overview
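This notebook downloads the GTEx v8 bulk tissue expression data and annotation files required for assignment 2, then preprocesses them into a single cleaned dataset: standardized TPMs for the 5,000 most variable genes across samples from the 10 tissues with the most samples.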

## Table of Contents
*   [Import libraries](#import_libraries)
*   [Set paths](#set_paths)
*   [Download data](#download_data)
*   [Read in data](#read_data)
*   [Generate a unified, cleaned dataset](#unify_clean)
---

## Import libraries <a class="anchor" id="import_libraries"></a>


In [1]:
import os
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler

## Set paths <a class="anchor" id="set_paths"></a>

In [2]:
# Directory for the raw downloads and directory for the cleaned output
DOWNLOAD_DIR = "data"
CLEAN_DATA_DIR = "clean_data"

# Create both directories up front; exist_ok makes re-running this cell safe
for directory in (DOWNLOAD_DIR, CLEAN_DATA_DIR):
    os.makedirs(directory, exist_ok=True)


## Download data <a class="anchor" id="download_data"></a>

In [3]:
%%bash -s "$DOWNLOAD_DIR"

# Stop at the first failing command / unset variable so that a failed
# download makes the cell fail instead of passing silently.
set -euo pipefail

# Get variable from python
DOWNLOAD_DIR="$1"

echo "Using DOWNLOAD_DIR: $DOWNLOAD_DIR"

# Download a URL into $DOWNLOAD_DIR under its original file name,
# unless that file is already present.
#   $1: URL of the file to download
download_if_missing() {
    local url="$1"
    local dest="$DOWNLOAD_DIR/$(basename "$url")"

    if [ -f "$dest" ]; then
        return 0
    fi

    # --fail: an HTTP error returns a non-zero exit status instead of
    #         saving the server's error page as if it were the data file.
    # The transfer is written to a ".part" file and only renamed once it
    # has completed, so an interrupted download never satisfies the
    # "already present" check on the next run.
    if ! curl --fail --output "$dest.part" "$url"; then
        rm -f "$dest.part"
        return 1
    fi
    mv "$dest.part" "$dest"
}

# Gene TPMs
download_if_missing "https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz"

# A de-identified, open access version of the sample annotations available in dbGaP.
download_if_missing "https://storage.googleapis.com/adult-gtex/annotations/v8/metadata-files/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"

# A de-identified, open access version of the subject annotations available in dbGaP.
download_if_missing "https://storage.googleapis.com/adult-gtex/annotations/v8/metadata-files/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt"

Using DOWNLOAD_DIR: data


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1550M 100  1550M   0     0 60307k     0   0:00:26  0:00:26 --:--:-- 63630k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 11242k 100 11242k   0     0 25959k     0  --:--:-- --:--:-- --:--:-- 25964k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 20271 100 20271   0     0 120229     0  --:--:-- --:--:-- --:--:-- 120660


## Read in data <a class="anchor" id="read_data"></a>

In [4]:
# Load the sample annotation table (tab-delimited text file)
sample_metadata_path = (
    Path(DOWNLOAD_DIR) / "GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"
)
sample_metadata = pd.read_csv(sample_metadata_path, sep="\t")

# Preview the first rows
sample_metadata.head()

Unnamed: 0,SAMPID,SMATSSCR,SMCENTER,SMPTHNTS,SMRIN,SMTS,SMTSD,SMUBRID,SMTSISCH,SMTSPAX,...,SME1ANTI,SMSPLTRD,SMBSMMRT,SME1SNSE,SME1PCTS,SMRRNART,SME1MPRT,SMNUM5CD,SMDPMPRT,SME2PCTS
0,GTEX-1117F-0003-SM-58Q7G,,B1,,,Blood,Whole Blood,13756,1188.0,,...,,,,,,,,,,
1,GTEX-1117F-0003-SM-5DWSB,,B1,,,Blood,Whole Blood,13756,1188.0,,...,,,,,,,,,,
2,GTEX-1117F-0003-SM-6WBT7,,B1,,,Blood,Whole Blood,13756,1188.0,,...,,,,,,,,,,
3,GTEX-1117F-0011-R10a-SM-AHZ7F,,"B1, A1",,,Brain,Brain - Frontal Cortex (BA9),9834,1193.0,,...,,,,,,,,,,
4,GTEX-1117F-0011-R10b-SM-CYKQ8,,"B1, A1",,7.2,Brain,Brain - Frontal Cortex (BA9),9834,1193.0,,...,,,,,,,,,,


In [5]:
# Load the subject annotation table (tab-delimited text file)
subject_metadata_path = (
    Path(DOWNLOAD_DIR) / "GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt"
)
subject_metadata = pd.read_csv(subject_metadata_path, sep="\t")

# Preview the first rows
subject_metadata.head()

Unnamed: 0,SUBJID,SEX,AGE,DTHHRDY
0,GTEX-1117F,2,60-69,4.0
1,GTEX-111CU,1,50-59,0.0
2,GTEX-111FC,1,60-69,1.0
3,GTEX-111VG,1,60-69,3.0
4,GTEX-111YS,1,60-69,0.0


In [5]:
# We want to find which tissue of origin has the greatest number of samples.
# The "SMTSD" column of the sample metadata holds the tissue label of each sample;
# value_counts() tallies the samples per tissue, sorted from most to fewest.
tissue_counts = sample_metadata["SMTSD"].value_counts()

tissue_counts.head(n=10)

SMTSD
Whole Blood                            3288
Muscle - Skeletal                      1132
Lung                                    867
Skin - Sun Exposed (Lower leg)          849
Thyroid                                 812
Artery - Tibial                         770
Adipose - Subcutaneous                  763
Nerve - Tibial                          722
Heart - Left Ventricle                  689
Skin - Not Sun Exposed (Suprapubic)     638
Name: count, dtype: int64

In [6]:
# Get the sample IDs for the top 10 tissues with the most samples
top10_tissues = tissue_counts.head(n=10).index.tolist()

# Keep only the sample metadata rows whose tissue is one of those 10.
# .copy() gives an independent frame rather than a view of sample_metadata.
meta_subset = sample_metadata[sample_metadata["SMTSD"].isin(top10_tissues)].copy()
sample_ids = meta_subset["SAMPID"].tolist()

print("Number of samples in top 10 tissues: ", len(sample_ids), "\n")

meta_subset.head()

Number of samples in top 10 tissues:  10530 



Unnamed: 0,SAMPID,SMATSSCR,SMCENTER,SMPTHNTS,SMRIN,SMTS,SMTSD,SMUBRID,SMTSISCH,SMTSPAX,...,SME1ANTI,SMSPLTRD,SMBSMMRT,SME1SNSE,SME1PCTS,SMRRNART,SME1MPRT,SMNUM5CD,SMDPMPRT,SME2PCTS
0,GTEX-1117F-0003-SM-58Q7G,,B1,,,Blood,Whole Blood,13756,1188.0,,...,,,,,,,,,,
1,GTEX-1117F-0003-SM-5DWSB,,B1,,,Blood,Whole Blood,13756,1188.0,,...,,,,,,,,,,
2,GTEX-1117F-0003-SM-6WBT7,,B1,,,Blood,Whole Blood,13756,1188.0,,...,,,,,,,,,,
5,GTEX-1117F-0226-SM-5GZZ7,0.0,B1,"2 pieces, ~15% vessel stroma, rep delineated",6.8,Adipose Tissue,Adipose - Subcutaneous,2190,1214.0,1125.0,...,14648800.0,11999300.0,0.003158,14669500.0,50.0354,0.003105,0.99474,,0.0,50.1944
6,GTEX-1117F-0426-SM-5EGHI,0.0,B1,"2 pieces, !5% fibrous connective tissue, delin...",7.1,Muscle,Muscle - Skeletal,11907,1220.0,1119.0,...,13191500.0,11550200.0,0.003968,13340500.0,50.2809,0.006995,0.995041,,0.0,49.9455


In [7]:
# Parse just the header row of the gene TPM file to learn which sample IDs it contains.
# In this file, each column is a sample and each row is a gene.
gene_tpm_path = (
    Path(DOWNLOAD_DIR) / "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz"
)

gene_tpm_header = pd.read_csv(
    gene_tpm_path,
    sep="\t",
    compression="gzip",
    skiprows=2,  # the 2 GCT header lines come before the column names
    nrows=0,     # no data rows: column names only
)

gene_tpm_header.head()

Unnamed: 0,Name,Description,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,...,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O


In [8]:
# Find the intersection of sample IDs in metadata and gene TPM data:
# the top-10-tissue samples that also have a column in the expression file.
# Walking the expression columns (instead of converting a set to a list) keeps
# the IDs in file order, so the list is identical on every run of the notebook.
top10_sample_ids = set(sample_ids)
sample_id_intersection = [
    column for column in gene_tpm_header.columns if column in top10_sample_ids
]

print("Number of samples in expression data: ", len(sample_id_intersection))

Number of samples in expression data:  6471


In [9]:
# Load the gene TPMs for the selected samples only.
# The file also has "Name" (Ensembl gene IDs) and "Description" (gene names)
# columns; only "Name" is read, to serve as the unique gene identifier.
cols_to_use = ["Name", *sample_id_intersection]

gene_tpms_all_genes = (
    pd.read_csv(
        Path(DOWNLOAD_DIR) / "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz",
        sep="\t",
        compression="gzip",
        skiprows=2,  # skip GCT header lines
        usecols=cols_to_use,
    )
    .set_index("Name")
    .T  # transpose so that the samples are rows and the genes are columns
)

gene_tpms_all_genes.head()


Name,ENSG00000223972.5,ENSG00000227232.5,ENSG00000278267.1,ENSG00000243485.5,ENSG00000237613.2,ENSG00000268020.3,ENSG00000240361.1,ENSG00000186092.4,ENSG00000238009.6,ENSG00000233750.3,...,ENSG00000198886.2,ENSG00000210176.1,ENSG00000210184.1,ENSG00000210191.1,ENSG00000198786.2,ENSG00000198695.2,ENSG00000210194.1,ENSG00000198727.2,ENSG00000210195.2,ENSG00000210196.2
GTEX-1117F-0226-SM-5GZZ7,0.0,8.764,0.0,0.07187,0.0,0.0,0.06621,0.0,0.0,0.03595,...,12400.0,0.0,0.0,0.0,2928.0,3799.0,16.24,6938.0,0.943,0.0
GTEX-1117F-0426-SM-5EGHI,0.0,3.861,0.0,0.0,0.0,0.056,0.05004,0.1025,0.04574,0.01359,...,34030.0,0.0,0.0,0.0,10400.0,14750.0,44.31,26310.0,6.414,6.226
GTEX-1117F-0526-SM-5EGHJ,0.0,7.349,1.004,0.0,0.0,0.0,0.0,0.07434,0.09953,0.0,...,13820.0,0.9891,0.0,0.0,4471.0,6728.0,23.74,8455.0,1.034,1.004
GTEX-1117F-2926-SM-5GZYI,0.0,12.5,0.0,0.06265,0.0,0.1292,0.0,0.0591,0.05275,0.0,...,17220.0,0.0,0.9195,0.0,2754.0,3898.0,10.22,9159.0,0.0,0.0
GTEX-111CU-0226-SM-5GZXC,0.04667,1.823,0.0,0.04659,0.0331,0.0,0.0,0.04395,0.0,0.05827,...,29430.0,23.98,2.052,1.705,11920.0,16030.0,50.29,24690.0,0.6113,0.0


In [10]:
# Compute the variance of every gene (column) across these samples, then rank the
# genes from highest to lowest variance. This ranking is what the top 5,000 most
# variable genes are taken from.
per_gene_variance = gene_tpms_all_genes.var()
gene_variances = per_gene_variance.sort_values(ascending=False)

gene_variances.head()

Name
ENSG00000244734.3     1.097184e+10
ENSG00000188536.12    1.048930e+09
ENSG00000198804.2     4.801091e+08
ENSG00000198938.2     3.489685e+08
ENSG00000163220.10    3.411307e+08
dtype: float64

In [11]:
# gene_variances is sorted from highest to lowest variance, so its first
# 5,000 index labels are the 5,000 most variable genes
top_5000_genes = gene_variances.index[:5000].tolist()

# Keep only those gene columns, as an independent copy
gene_tpms_top5000 = gene_tpms_all_genes.loc[:, top_5000_genes].copy()

gene_tpms_top5000.head()



Name,ENSG00000244734.3,ENSG00000188536.12,ENSG00000198804.2,ENSG00000198938.2,ENSG00000163220.10,ENSG00000198899.2,ENSG00000198886.2,ENSG00000198712.1,ENSG00000143632.14,ENSG00000198888.2,...,ENSG00000261236.7,ENSG00000188112.8,ENSG00000170035.15,ENSG00000024862.17,ENSG00000213619.9,ENSG00000176087.14,ENSG00000115596.3,ENSG00000138386.16,ENSG00000182872.15,ENSG00000070669.16
GTEX-1117F-0226-SM-5GZZ7,452.7,102.4,10790.0,19890.0,130.9,13880.0,12400.0,11720.0,37.18,15870.0,...,25.65,14.93,63.58,51.73,47.52,48.9,3.654,18.63,47.41,23.79
GTEX-1117F-0426-SM-5EGHI,225.7,52.01,33610.0,62560.0,122.9,51690.0,34030.0,37500.0,20940.0,38030.0,...,63.09,0.2883,47.39,22.71,54.52,43.75,0.7182,2.547,40.68,1.912
GTEX-1117F-0526-SM-5EGHJ,269.6,62.95,9689.0,19450.0,115.3,16270.0,13820.0,12250.0,25.15,13850.0,...,20.25,1.614,48.1,30.11,35.85,32.95,2.124,20.79,71.29,25.65
GTEX-1117F-2926-SM-5GZYI,205.4,73.8,17290.0,20160.0,158.1,17630.0,17220.0,12910.0,38.72,9093.0,...,23.82,28.35,47.93,37.13,43.33,48.56,7.423,52.44,53.98,27.61
GTEX-111CU-0226-SM-5GZXC,16.91,5.469,27390.0,31960.0,329.4,39610.0,29430.0,33890.0,37.27,19620.0,...,51.65,15.8,57.24,56.41,51.67,92.06,0.9951,27.71,57.69,144.5


## Generate a unified, cleaned dataset <a class="anchor" id="unify_clean"></a>

In [12]:
# Build a lookup from sample ID to tissue of origin, restricted to the
# intersected sample IDs (the ones that ended up in the expression data)
has_expression = meta_subset["SAMPID"].isin(sample_id_intersection)

# Keep the ID and tissue columns, and index by sample ID for merging later
sample_tissue_key = (
    meta_subset
    .loc[has_expression, ["SAMPID", "SMTSD"]]
    .set_index("SAMPID")
)

sample_tissue_key.head()

Unnamed: 0_level_0,SMTSD
SAMPID,Unnamed: 1_level_1
GTEX-1117F-0226-SM-5GZZ7,Adipose - Subcutaneous
GTEX-1117F-0426-SM-5EGHI,Muscle - Skeletal
GTEX-1117F-0526-SM-5EGHJ,Artery - Tibial
GTEX-1117F-2926-SM-5GZYI,Skin - Not Sun Exposed (Suprapubic)
GTEX-111CU-0226-SM-5GZXC,Thyroid


In [13]:
# Standardize the TPMs of the top 5,000 most variable genes, one gene (column) at a time
scaler = StandardScaler()
scaler.fit(gene_tpms_top5000)
scaled_array = scaler.transform(gene_tpms_top5000)

# The scaler returns a bare NumPy array; re-attach the sample IDs (rows)
# and gene IDs (columns) from the unscaled frame
gene_tpms_top5000_standardized = pd.DataFrame(
    data=scaled_array,
    index=gene_tpms_top5000.index,
    columns=gene_tpms_top5000.columns,
)

gene_tpms_top5000_standardized.head()

Name,ENSG00000244734.3,ENSG00000188536.12,ENSG00000198804.2,ENSG00000198938.2,ENSG00000163220.10,ENSG00000198899.2,ENSG00000198886.2,ENSG00000198712.1,ENSG00000143632.14,ENSG00000198888.2,...,ENSG00000261236.7,ENSG00000188112.8,ENSG00000170035.15,ENSG00000024862.17,ENSG00000213619.9,ENSG00000176087.14,ENSG00000115596.3,ENSG00000138386.16,ENSG00000182872.15,ENSG00000070669.16
GTEX-1117F-0226-SM-5GZZ7,-0.317682,-0.320624,-0.822484,-0.623593,-0.278153,-0.845429,-0.920044,-0.94437,-0.332153,-0.258633,...,-0.259958,0.087855,0.714803,0.76994,0.143898,-0.47663,-0.245165,-0.547999,-0.027921,-0.037272
GTEX-1117F-0426-SM-5EGHI,-0.319849,-0.32218,0.219063,1.660759,-0.278587,1.28664,0.307966,0.663645,1.117576,1.577841,...,1.470513,-0.589025,-0.033767,-0.57209,0.467826,-0.715005,-0.381079,-1.292842,-0.339644,-1.050707
GTEX-1117F-0526-SM-5EGHJ,-0.31943,-0.321842,-0.872736,-0.647149,-0.278998,-0.710659,-0.839425,-0.911312,-0.332987,-0.426037,...,-0.509545,-0.527738,-0.000939,-0.229877,-0.396138,-1.214898,-0.315997,-0.447965,1.078165,0.048887
GTEX-1117F-2926-SM-5GZYI,-0.320043,-0.321507,-0.525812,-0.609139,-0.276681,-0.63397,-0.646396,-0.870145,-0.332046,-0.820266,...,-0.344541,0.708257,-0.008799,0.094763,-0.049997,-0.492368,-0.070677,1.017824,0.276392,0.139678
GTEX-111CU-0226-SM-5GZXC,-0.321842,-0.323617,-0.064829,0.022578,-0.267405,0.605461,0.046808,0.438473,-0.332147,0.052142,...,0.941758,0.128075,0.421663,0.986367,0.335941,1.52109,-0.36826,-0.127483,0.448233,5.554269


In [14]:
# Sanity check: the tissue key and the standardized expression data must
# cover exactly the same set of sample IDs
tissue_key_ids = set(sample_tissue_key.index)
expression_ids = set(gene_tpms_top5000_standardized.index)

assert tissue_key_ids == expression_ids, "Sample IDs do not match between metadata and expression!"

In [15]:
# Attach the standardized TPMs of the top 5,000 most variable genes to the
# sample tissue key, matching rows by sample ID (the index of both frames),
# and give the tissue column ("SMTSD") the friendlier name "Tissue"
final_dataset = (
    sample_tissue_key
    .join(gene_tpms_top5000_standardized, how="inner")
    .rename(columns={"SMTSD": "Tissue"})
)

final_dataset.head()

Unnamed: 0_level_0,Tissue,ENSG00000244734.3,ENSG00000188536.12,ENSG00000198804.2,ENSG00000198938.2,ENSG00000163220.10,ENSG00000198899.2,ENSG00000198886.2,ENSG00000198712.1,ENSG00000143632.14,...,ENSG00000261236.7,ENSG00000188112.8,ENSG00000170035.15,ENSG00000024862.17,ENSG00000213619.9,ENSG00000176087.14,ENSG00000115596.3,ENSG00000138386.16,ENSG00000182872.15,ENSG00000070669.16
SAMPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GTEX-1117F-0226-SM-5GZZ7,Adipose - Subcutaneous,-0.317682,-0.320624,-0.822484,-0.623593,-0.278153,-0.845429,-0.920044,-0.94437,-0.332153,...,-0.259958,0.087855,0.714803,0.76994,0.143898,-0.47663,-0.245165,-0.547999,-0.027921,-0.037272
GTEX-1117F-0426-SM-5EGHI,Muscle - Skeletal,-0.319849,-0.32218,0.219063,1.660759,-0.278587,1.28664,0.307966,0.663645,1.117576,...,1.470513,-0.589025,-0.033767,-0.57209,0.467826,-0.715005,-0.381079,-1.292842,-0.339644,-1.050707
GTEX-1117F-0526-SM-5EGHJ,Artery - Tibial,-0.31943,-0.321842,-0.872736,-0.647149,-0.278998,-0.710659,-0.839425,-0.911312,-0.332987,...,-0.509545,-0.527738,-0.000939,-0.229877,-0.396138,-1.214898,-0.315997,-0.447965,1.078165,0.048887
GTEX-1117F-2926-SM-5GZYI,Skin - Not Sun Exposed (Suprapubic),-0.320043,-0.321507,-0.525812,-0.609139,-0.276681,-0.63397,-0.646396,-0.870145,-0.332046,...,-0.344541,0.708257,-0.008799,0.094763,-0.049997,-0.492368,-0.070677,1.017824,0.276392,0.139678
GTEX-111CU-0226-SM-5GZXC,Thyroid,-0.321842,-0.323617,-0.064829,0.022578,-0.267405,0.605461,0.046808,0.438473,-0.332147,...,0.941758,0.128075,0.421663,0.986367,0.335941,1.52109,-0.36826,-0.127483,0.448233,5.554269


In [16]:
# Save the final cleaned dataset as a CSV file in the clean data directory
final_dataset_path = (
    Path(CLEAN_DATA_DIR) / "gtex_top10_tissues_top5000_variable_genes_standardized.csv"
)
final_dataset.to_csv(final_dataset_path)