# 01: Data download and preprocessing

**Author:** Grace Akatsu

**Class:** CPBS 7602, Fall 2025

---
## Overview
This notebook downloads and preprocesses the GTEx bulk tissue expression data required for assignment 2.

## Table of Contents
*   [Import libraries](#import_libraries)
*   [Set paths](#set_paths)
*   [Download data](#download_data)
*   [Read in TPMs](#read_data)
*   [Most variable genes](#variable)
*   [Add age and sex variables](#age_sex)
*   [Standardize and save cleaned dataset](#save)
---

## Import libraries <a class="anchor" id="import_libraries"></a>


In [79]:
import os
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler

## Set paths <a class="anchor" id="set_paths"></a>

In [80]:
# Directory for the raw GTEx downloads and directory for the cleaned outputs
DOWNLOAD_DIR = "data"
CLEAN_DATA_DIR = "clean_data"

# Create both directories up front; directories that already exist are left alone
for directory in (DOWNLOAD_DIR, CLEAN_DATA_DIR):
    os.makedirs(directory, exist_ok=True)


## Download data <a class="anchor" id="download_data"></a>

In [81]:
%%bash -s "$DOWNLOAD_DIR"

# Download the GTEx v8 files used by this notebook into DOWNLOAD_DIR.
# Files that are already present are not downloaded again.

# Stop at the first failing command so a failed download is reported by the
# cell instead of being silently ignored.
set -euo pipefail

# Get variable from python
DOWNLOAD_DIR="$1"

echo "Using DOWNLOAD_DIR: $DOWNLOAD_DIR"

# download_if_missing URL
#   Fetch URL into "$DOWNLOAD_DIR/<basename of URL>" unless that file exists.
#   --fail makes curl exit non-zero on HTTP errors instead of saving the error
#   page, and the data is written to a ".part" file that is only renamed after
#   a successful transfer, so an interrupted or failed download is never
#   mistaken for a finished one on the next run.
download_if_missing() {
    local url="$1"
    local dest
    dest="$DOWNLOAD_DIR/$(basename "$url")"

    if [ ! -f "$dest" ]; then
        curl --fail --location --output "$dest.part" "$url"
        mv "$dest.part" "$dest"
    fi
}

#   Gene TPMs
download_if_missing "https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz"

#   A de-identified, open access version of the sample annotations available in dbGaP.
download_if_missing "https://storage.googleapis.com/adult-gtex/annotations/v8/metadata-files/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"

#   A de-identified, open access version of the subject annotations available in dbGaP.
download_if_missing "https://storage.googleapis.com/adult-gtex/annotations/v8/metadata-files/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt"

Using DOWNLOAD_DIR: data


## Read in TPMs <a class="anchor" id="read_data"></a>

In [82]:
# Load the tab-separated GTEx sample attribute table into a DataFrame
sample_metadata_path = Path(DOWNLOAD_DIR) / "GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"
sample_metadata = pd.read_csv(sample_metadata_path, sep="\t")

# Preview the first rows
sample_metadata.head()

Unnamed: 0,SAMPID,SMATSSCR,SMCENTER,SMPTHNTS,SMRIN,SMTS,SMTSD,SMUBRID,SMTSISCH,SMTSPAX,...,SME1ANTI,SMSPLTRD,SMBSMMRT,SME1SNSE,SME1PCTS,SMRRNART,SME1MPRT,SMNUM5CD,SMDPMPRT,SME2PCTS
0,GTEX-1117F-0003-SM-58Q7G,,B1,,,Blood,Whole Blood,13756,1188.0,,...,,,,,,,,,,
1,GTEX-1117F-0003-SM-5DWSB,,B1,,,Blood,Whole Blood,13756,1188.0,,...,,,,,,,,,,
2,GTEX-1117F-0003-SM-6WBT7,,B1,,,Blood,Whole Blood,13756,1188.0,,...,,,,,,,,,,
3,GTEX-1117F-0011-R10a-SM-AHZ7F,,"B1, A1",,,Brain,Brain - Frontal Cortex (BA9),9834,1193.0,,...,,,,,,,,,,
4,GTEX-1117F-0011-R10b-SM-CYKQ8,,"B1, A1",,7.2,Brain,Brain - Frontal Cortex (BA9),9834,1193.0,,...,,,,,,,,,,


In [83]:
# Keep only the metadata rows whose tissue type (SMTS) is blood or brain,
# and collect the sample IDs of those rows
tissues_of_interest = ["Blood", "Brain"]
is_blood_or_brain = sample_metadata["SMTS"].isin(tissues_of_interest)

meta_subset = sample_metadata.loc[is_blood_or_brain].copy()
sample_ids = meta_subset["SAMPID"].tolist()

print("Number of samples originating from blood and brain tissues: ", len(sample_ids), "\n")

meta_subset.head()

Number of samples originating from blood and brain tissues:  6806 



Unnamed: 0,SAMPID,SMATSSCR,SMCENTER,SMPTHNTS,SMRIN,SMTS,SMTSD,SMUBRID,SMTSISCH,SMTSPAX,...,SME1ANTI,SMSPLTRD,SMBSMMRT,SME1SNSE,SME1PCTS,SMRRNART,SME1MPRT,SMNUM5CD,SMDPMPRT,SME2PCTS
0,GTEX-1117F-0003-SM-58Q7G,,B1,,,Blood,Whole Blood,13756,1188.0,,...,,,,,,,,,,
1,GTEX-1117F-0003-SM-5DWSB,,B1,,,Blood,Whole Blood,13756,1188.0,,...,,,,,,,,,,
2,GTEX-1117F-0003-SM-6WBT7,,B1,,,Blood,Whole Blood,13756,1188.0,,...,,,,,,,,,,
3,GTEX-1117F-0011-R10a-SM-AHZ7F,,"B1, A1",,,Brain,Brain - Frontal Cortex (BA9),9834,1193.0,,...,,,,,,,,,,
4,GTEX-1117F-0011-R10b-SM-CYKQ8,,"B1, A1",,7.2,Brain,Brain - Frontal Cortex (BA9),9834,1193.0,,...,,,,,,,,,,


In [84]:
# Parse only the header row of the gene TPM matrix, without loading any data.
# In this file the columns are samples and the rows are genes, so the column
# names are the sample IDs.
gene_tpm_path = Path(DOWNLOAD_DIR) / "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz"

header_read_options = dict(
    sep="\t",
    skiprows=2,          # skip the 2 GCT header lines
    compression="gzip",
    nrows=0,             # only read column names
)
gene_tpm_header = pd.read_csv(gene_tpm_path, **header_read_options)

gene_tpm_header.head()

Unnamed: 0,Name,Description,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,...,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O


In [85]:
# Find the blood/brain sample IDs that also have a column in the gene TPM data.
# The IDs are kept in the order in which they appear in the expression file
# header; building the list from a set intersection would give an order that
# changes between kernel restarts (string hashing is randomised per process).
wanted_sample_ids = set(sample_ids)
sample_id_intersection = [
    column for column in gene_tpm_header.columns if column in wanted_sample_ids
]

print("Number of samples in expression data: ", len(sample_id_intersection))

Number of samples in expression data:  3571


In [86]:
# Load the TPM values for the selected samples only.
# The file has "Name" (Ensembl gene ID) and "Description" (gene name) columns;
# the Ensembl gene IDs in "Name" are used as the unique gene identifiers.
gene_tpm_path = Path(DOWNLOAD_DIR) / "GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz"
cols_to_use = ["Name", *sample_id_intersection]

gene_tpms_all_genes = (
    pd.read_csv(
        gene_tpm_path,
        sep="\t",
        skiprows=2,          # skip GCT header lines
        compression="gzip",
        usecols=cols_to_use,
    )
    .set_index("Name")
    .transpose()             # samples become rows, genes become columns
)

gene_tpms_all_genes.head()


Name,ENSG00000223972.5,ENSG00000227232.5,ENSG00000278267.1,ENSG00000243485.5,ENSG00000237613.2,ENSG00000268020.3,ENSG00000240361.1,ENSG00000186092.4,ENSG00000238009.6,ENSG00000233750.3,...,ENSG00000198886.2,ENSG00000210176.1,ENSG00000210184.1,ENSG00000210191.1,ENSG00000198786.2,ENSG00000198695.2,ENSG00000210194.1,ENSG00000198727.2,ENSG00000210195.2,ENSG00000210196.2
GTEX-1117F-3226-SM-5N9CT,0.01776,6.892,0.0,0.0,0.0,0.03656,0.03267,0.1004,0.02986,0.0,...,56930.0,4.451,2.082,5.624,8380.0,7000.0,25.82,31410.0,5.584,5.42
GTEX-111FC-3126-SM-5GZZ2,0.0,4.225,0.4912,0.07713,0.0,0.0,0.1066,0.03638,0.03247,0.02894,...,48610.0,5.809,6.227,4.704,7620.0,6380.0,21.78,32050.0,2.53,5.894
GTEX-111FC-3326-SM-5GZYV,0.0,7.778,0.771,0.0,0.0,0.0,0.0,0.05711,0.02549,0.04543,...,45150.0,0.0,0.0,2.215,6256.0,6063.0,19.0,30980.0,3.178,2.313
GTEX-111YS-0006-SM-5NQBE,0.02171,1.555,0.0,0.0,0.0,0.0,0.03994,0.0,0.0365,1.139,...,4380.0,0.5441,0.0,0.0,1067.0,1972.0,6.529,3069.0,0.0,0.0
GTEX-1122O-0003-SM-5Q5DL,0.0,1.53,0.0,0.0,0.0,0.0,0.0,0.03751,0.1004,0.01989,...,17590.0,0.4991,0.5836,2.425,6048.0,6354.0,23.95,8713.0,4.696,0.5064


## Most variable genes <a class="anchor" id="variable"></a>

In [87]:
# Compute the variance of every gene (column) across the selected samples and
# order the genes from highest to lowest variance; the top 5,000 are picked
# from this ranking in the next step.
per_gene_variance = gene_tpms_all_genes.var(axis=0)
gene_variances = per_gene_variance.sort_values(ascending=False)

gene_variances.head()

Name
ENSG00000244734.3     1.831438e+10
ENSG00000188536.12    1.750638e+09
ENSG00000210082.2     1.381559e+09
ENSG00000198804.2     9.267500e+08
ENSG00000198712.1     8.103378e+08
dtype: float64

In [88]:
# gene_variances is sorted from highest to lowest variance, so its first 5,000
# labels are the 5,000 most variable genes; keep only those TPM columns.
top_5000_genes = gene_variances.index[:5000].tolist()
gene_tpms_top5000 = gene_tpms_all_genes.loc[:, top_5000_genes].copy()

gene_tpms_top5000.head()

Name,ENSG00000244734.3,ENSG00000188536.12,ENSG00000210082.2,ENSG00000198804.2,ENSG00000198712.1,ENSG00000198938.2,ENSG00000198899.2,ENSG00000163220.10,ENSG00000198886.2,ENSG00000198763.3,...,ENSG00000141404.15,ENSG00000224531.5,ENSG00000126705.13,ENSG00000064012.21,ENSG00000182698.11,ENSG00000029363.16,ENSG00000249915.7,ENSG00000107862.4,ENSG00000272849.1,ENSG00000197114.11
GTEX-1117F-3226-SM-5N9CT,314.6,115.8,78170.0,76160.0,58320.0,65070.0,56680.0,17.46,56930.0,35030.0,...,4.842,6.48,16.09,0.3956,0.2936,13.91,24.56,30.52,11.03,21.09
GTEX-111FC-3126-SM-5GZZ2,69.26,26.66,56000.0,51600.0,59480.0,43790.0,57190.0,9.551,48610.0,32840.0,...,9.479,20.92,20.89,0.717,0.9976,27.85,39.35,37.57,11.99,26.41
GTEX-111FC-3326-SM-5GZYV,142.7,47.58,32100.0,27340.0,49380.0,37730.0,50440.0,10.45,45150.0,25620.0,...,7.798,39.66,60.37,0.893,0.2506,43.82,55.93,64.52,44.03,63.75
GTEX-111YS-0006-SM-5NQBE,234900.0,68780.0,3955.0,4220.0,7411.0,4811.0,5066.0,62080.0,4380.0,2812.0,...,0.8287,0.6487,0.6872,48.07,0.04486,11.87,29.13,12.34,9.853,33.77
GTEX-1122O-0003-SM-5Q5DL,3.198,1.48,8032.0,19860.0,31750.0,19360.0,18980.0,2.566,17590.0,6057.0,...,0.7487,8.623,8.247,8.605,0.0,66.93,69.87,66.12,11.03,27.78


Now, add in information about tissue of origin.

In [89]:
# Build a lookup from sample ID to tissue of origin (SMTS), restricted to the
# samples that were actually read from the expression data. The sample ID
# becomes the index so the table can be joined to the TPMs later.
in_expression_data = meta_subset["SAMPID"].isin(sample_id_intersection)

sample_tissue_key = (
    meta_subset
    .loc[in_expression_data, ["SAMPID", "SMTS"]]
    .set_index("SAMPID")
)

sample_tissue_key.head()

Unnamed: 0_level_0,SMTS
SAMPID,Unnamed: 1_level_1
GTEX-1117F-3226-SM-5N9CT,Brain
GTEX-111FC-3126-SM-5GZZ2,Brain
GTEX-111FC-3326-SM-5GZYV,Brain
GTEX-111YS-0006-SM-5NQBE,Blood
GTEX-1122O-0003-SM-5Q5DL,Blood


In [90]:
# Sanity check: the tissue key and the expression data must cover exactly the
# same set of sample IDs before they are joined.
tissue_key_ids = set(sample_tissue_key.index)
expression_ids = set(gene_tpms_top5000.index)

assert tissue_key_ids == expression_ids, \
    "Sample IDs do not match between metadata and expression!"

In [91]:
# Attach the tissue of origin to the TPMs of the 5,000 most variable genes by
# joining on the sample ID index, then rename the "SMTS" column to "Tissue".
final_TPMs = (
    sample_tissue_key
    .join(gene_tpms_top5000, how="inner")
    .rename(columns={"SMTS": "Tissue"})
)

final_TPMs.head()

Unnamed: 0_level_0,Tissue,ENSG00000244734.3,ENSG00000188536.12,ENSG00000210082.2,ENSG00000198804.2,ENSG00000198712.1,ENSG00000198938.2,ENSG00000198899.2,ENSG00000163220.10,ENSG00000198886.2,...,ENSG00000141404.15,ENSG00000224531.5,ENSG00000126705.13,ENSG00000064012.21,ENSG00000182698.11,ENSG00000029363.16,ENSG00000249915.7,ENSG00000107862.4,ENSG00000272849.1,ENSG00000197114.11
SAMPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GTEX-1117F-3226-SM-5N9CT,Brain,314.6,115.8,78170.0,76160.0,58320.0,65070.0,56680.0,17.46,56930.0,...,4.842,6.48,16.09,0.3956,0.2936,13.91,24.56,30.52,11.03,21.09
GTEX-111FC-3126-SM-5GZZ2,Brain,69.26,26.66,56000.0,51600.0,59480.0,43790.0,57190.0,9.551,48610.0,...,9.479,20.92,20.89,0.717,0.9976,27.85,39.35,37.57,11.99,26.41
GTEX-111FC-3326-SM-5GZYV,Brain,142.7,47.58,32100.0,27340.0,49380.0,37730.0,50440.0,10.45,45150.0,...,7.798,39.66,60.37,0.893,0.2506,43.82,55.93,64.52,44.03,63.75
GTEX-111YS-0006-SM-5NQBE,Blood,234900.0,68780.0,3955.0,4220.0,7411.0,4811.0,5066.0,62080.0,4380.0,...,0.8287,0.6487,0.6872,48.07,0.04486,11.87,29.13,12.34,9.853,33.77
GTEX-1122O-0003-SM-5Q5DL,Blood,3.198,1.48,8032.0,19860.0,31750.0,19360.0,18980.0,2.566,17590.0,...,0.7487,8.623,8.247,8.605,0.0,66.93,69.87,66.12,11.03,27.78


## Add age and sex variables <a class="anchor" id="age_sex"></a>

In [92]:
# Load the tab-separated GTEx subject phenotype table into a DataFrame
subject_metadata_path = Path(DOWNLOAD_DIR) / "GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt"
subject_metadata = pd.read_csv(subject_metadata_path, sep="\t")

# Preview the first rows
subject_metadata.head()

Unnamed: 0,SUBJID,SEX,AGE,DTHHRDY
0,GTEX-1117F,2,60-69,4.0
1,GTEX-111CU,1,50-59,0.0
2,GTEX-111FC,1,60-69,1.0
3,GTEX-111VG,1,60-69,3.0
4,GTEX-111YS,1,60-69,0.0


In the `SEX` column, 1 = male and 2 = female.

In [93]:
# Each sample ID starts with its subject ID: the subject ID is the first two
# dash-separated fields (e.g. sample "GTEX-1117F-0003-SM-58Q7G" belongs to
# subject "GTEX-1117F").
# The subject ID is NOT a fixed number of characters: "GTEX-1117F" has 10, but
# "GTEX-ZZPU" has only 9, so slicing the first 10 characters would produce
# "GTEX-ZZPU-", which matches no SUBJID and leaves AGE/SEX missing after the
# merge. Splitting on "-" works for every width.

TPMS_with_subject_ID = final_TPMs.copy()

subject_ids = [
    "-".join(sample_id.split("-")[:2])
    for sample_id in TPMS_with_subject_ID.index
]

# Insert this column to the leftmost position
TPMS_with_subject_ID.insert(0, "SUBJID", subject_ids)

TPMS_with_subject_ID.head()


Unnamed: 0_level_0,SUBJID,Tissue,ENSG00000244734.3,ENSG00000188536.12,ENSG00000210082.2,ENSG00000198804.2,ENSG00000198712.1,ENSG00000198938.2,ENSG00000198899.2,ENSG00000163220.10,...,ENSG00000141404.15,ENSG00000224531.5,ENSG00000126705.13,ENSG00000064012.21,ENSG00000182698.11,ENSG00000029363.16,ENSG00000249915.7,ENSG00000107862.4,ENSG00000272849.1,ENSG00000197114.11
SAMPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GTEX-1117F-3226-SM-5N9CT,GTEX-1117F,Brain,314.6,115.8,78170.0,76160.0,58320.0,65070.0,56680.0,17.46,...,4.842,6.48,16.09,0.3956,0.2936,13.91,24.56,30.52,11.03,21.09
GTEX-111FC-3126-SM-5GZZ2,GTEX-111FC,Brain,69.26,26.66,56000.0,51600.0,59480.0,43790.0,57190.0,9.551,...,9.479,20.92,20.89,0.717,0.9976,27.85,39.35,37.57,11.99,26.41
GTEX-111FC-3326-SM-5GZYV,GTEX-111FC,Brain,142.7,47.58,32100.0,27340.0,49380.0,37730.0,50440.0,10.45,...,7.798,39.66,60.37,0.893,0.2506,43.82,55.93,64.52,44.03,63.75
GTEX-111YS-0006-SM-5NQBE,GTEX-111YS,Blood,234900.0,68780.0,3955.0,4220.0,7411.0,4811.0,5066.0,62080.0,...,0.8287,0.6487,0.6872,48.07,0.04486,11.87,29.13,12.34,9.853,33.77
GTEX-1122O-0003-SM-5Q5DL,GTEX-1122O,Blood,3.198,1.48,8032.0,19860.0,31750.0,19360.0,18980.0,2.566,...,0.7487,8.623,8.247,8.605,0.0,66.93,69.87,66.12,11.03,27.78


In [94]:
# Now merge the age and sex subject metadata with the final dataset.
# validate="many_to_one" makes pandas raise if a SUBJID occurs more than once
# in the subject metadata. That guarantees the left merge returns exactly one
# row per sample, in the original sample order, which is what makes restoring
# the sample ID index by position (below) correct.
final_dataset_unstandardized = TPMS_with_subject_ID.merge(
    subject_metadata[["SUBJID", "AGE", "SEX"]],
    on="SUBJID",
    how="left",
    validate="many_to_one",
)

# Move AGE column to the left (position 1)
final_dataset_unstandardized.insert(1, "AGE", final_dataset_unstandardized.pop("AGE"))

# Move SEX column to the left (position 2)
final_dataset_unstandardized.insert(2, "SEX", final_dataset_unstandardized.pop("SEX"))

# merge() discards the left frame's index, so add the labeled sample ID index again
final_dataset_unstandardized.index = TPMS_with_subject_ID.index

# Rename the "SEX" column values from numeric codes to string labels
# 1 = male, 2 = female.
final_dataset_unstandardized["SEX"] = final_dataset_unstandardized["SEX"].replace({1: "M", 2: "F"})

final_dataset_unstandardized.head()

Unnamed: 0_level_0,SUBJID,AGE,SEX,Tissue,ENSG00000244734.3,ENSG00000188536.12,ENSG00000210082.2,ENSG00000198804.2,ENSG00000198712.1,ENSG00000198938.2,...,ENSG00000141404.15,ENSG00000224531.5,ENSG00000126705.13,ENSG00000064012.21,ENSG00000182698.11,ENSG00000029363.16,ENSG00000249915.7,ENSG00000107862.4,ENSG00000272849.1,ENSG00000197114.11
SAMPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GTEX-1117F-3226-SM-5N9CT,GTEX-1117F,60-69,F,Brain,314.6,115.8,78170.0,76160.0,58320.0,65070.0,...,4.842,6.48,16.09,0.3956,0.2936,13.91,24.56,30.52,11.03,21.09
GTEX-111FC-3126-SM-5GZZ2,GTEX-111FC,60-69,M,Brain,69.26,26.66,56000.0,51600.0,59480.0,43790.0,...,9.479,20.92,20.89,0.717,0.9976,27.85,39.35,37.57,11.99,26.41
GTEX-111FC-3326-SM-5GZYV,GTEX-111FC,60-69,M,Brain,142.7,47.58,32100.0,27340.0,49380.0,37730.0,...,7.798,39.66,60.37,0.893,0.2506,43.82,55.93,64.52,44.03,63.75
GTEX-111YS-0006-SM-5NQBE,GTEX-111YS,60-69,M,Blood,234900.0,68780.0,3955.0,4220.0,7411.0,4811.0,...,0.8287,0.6487,0.6872,48.07,0.04486,11.87,29.13,12.34,9.853,33.77
GTEX-1122O-0003-SM-5Q5DL,GTEX-1122O,60-69,F,Blood,3.198,1.48,8032.0,19860.0,31750.0,19360.0,...,0.7487,8.623,8.247,8.605,0.0,66.93,69.87,66.12,11.03,27.78


## Standardize and save cleaned dataset <a class="anchor" id="save"></a>

In [95]:
# Standardize the gene TPM columns with StandardScaler. The annotation columns
# are set aside first so that only expression values are scaled.
annotation_columns = ['SUBJID','AGE','SEX','Tissue']
unscaled_tpms = final_dataset_unstandardized.drop(columns=annotation_columns)

scaler = StandardScaler()
scaled_array = scaler.fit_transform(unscaled_tpms)

# fit_transform returns a NumPy array; wrap it in a DataFrame again, reusing
# the sample IDs as the index and the gene IDs as the column labels
standardized_tpms = pd.DataFrame(
    scaled_array,
    index=unscaled_tpms.index,
    columns=unscaled_tpms.columns,
)

standardized_tpms.head()

Unnamed: 0_level_0,ENSG00000244734.3,ENSG00000188536.12,ENSG00000210082.2,ENSG00000198804.2,ENSG00000198712.1,ENSG00000198938.2,ENSG00000198899.2,ENSG00000163220.10,ENSG00000198886.2,ENSG00000198763.3,...,ENSG00000141404.15,ENSG00000224531.5,ENSG00000126705.13,ENSG00000064012.21,ENSG00000182698.11,ENSG00000029363.16,ENSG00000249915.7,ENSG00000107862.4,ENSG00000272849.1,ENSG00000197114.11
SAMPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GTEX-1117F-3226-SM-5N9CT,-0.442027,-0.443194,0.652588,1.073375,0.289145,0.827291,0.516296,-0.36488,0.702961,0.355638,...,-0.37137,-0.674648,0.012171,-0.410176,-0.437162,-0.560999,-0.555407,0.288839,-0.141724,-0.216464
GTEX-111FC-3126-SM-5GZZ2,-0.44384,-0.445324,0.056045,0.266497,0.3299,0.001732,0.536849,-0.365206,0.347274,0.231717,...,-0.122493,0.100503,0.269929,-0.392914,-0.399349,0.187837,0.239327,0.667693,-0.090124,0.069512
GTEX-111FC-3326-SM-5GZYV,-0.443297,-0.444824,-0.587049,-0.530525,-0.024953,-0.233366,0.264824,-0.365169,0.199356,-0.176829,...,-0.212716,1.106481,2.389987,-0.383462,-0.439472,1.045721,1.130245,2.115937,1.63205,2.076722
GTEX-111YS-0006-SM-5NQBE,1.29164,1.198125,-1.344365,-1.290095,-1.499493,-1.510461,-1.563751,2.198348,-1.543597,-1.467425,...,-0.586773,-0.987677,-0.814953,2.150325,-0.450522,-0.670584,-0.30984,-0.688121,-0.204989,0.465148
GTEX-1122O-0003-SM-5Q5DL,-0.444328,-0.445926,-1.234662,-0.776268,-0.644366,-0.946031,-1.003016,-0.365495,-0.978858,-1.283806,...,-0.591067,-0.55961,-0.408995,0.030735,-0.452932,2.287155,1.879304,2.201918,-0.141724,0.143156


In [96]:
# Put the subject ID, age, sex, and tissue columns back in front of the
# standardized TPMs by joining on the sample ID index
sample_annotations = final_dataset_unstandardized[['SUBJID','AGE','SEX','Tissue']]
final_dataset = sample_annotations.join(standardized_tpms, how="inner")

final_dataset.head()

Unnamed: 0_level_0,SUBJID,AGE,SEX,Tissue,ENSG00000244734.3,ENSG00000188536.12,ENSG00000210082.2,ENSG00000198804.2,ENSG00000198712.1,ENSG00000198938.2,...,ENSG00000141404.15,ENSG00000224531.5,ENSG00000126705.13,ENSG00000064012.21,ENSG00000182698.11,ENSG00000029363.16,ENSG00000249915.7,ENSG00000107862.4,ENSG00000272849.1,ENSG00000197114.11
SAMPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GTEX-1117F-3226-SM-5N9CT,GTEX-1117F,60-69,F,Brain,-0.442027,-0.443194,0.652588,1.073375,0.289145,0.827291,...,-0.37137,-0.674648,0.012171,-0.410176,-0.437162,-0.560999,-0.555407,0.288839,-0.141724,-0.216464
GTEX-111FC-3126-SM-5GZZ2,GTEX-111FC,60-69,M,Brain,-0.44384,-0.445324,0.056045,0.266497,0.3299,0.001732,...,-0.122493,0.100503,0.269929,-0.392914,-0.399349,0.187837,0.239327,0.667693,-0.090124,0.069512
GTEX-111FC-3326-SM-5GZYV,GTEX-111FC,60-69,M,Brain,-0.443297,-0.444824,-0.587049,-0.530525,-0.024953,-0.233366,...,-0.212716,1.106481,2.389987,-0.383462,-0.439472,1.045721,1.130245,2.115937,1.63205,2.076722
GTEX-111YS-0006-SM-5NQBE,GTEX-111YS,60-69,M,Blood,1.29164,1.198125,-1.344365,-1.290095,-1.499493,-1.510461,...,-0.586773,-0.987677,-0.814953,2.150325,-0.450522,-0.670584,-0.30984,-0.688121,-0.204989,0.465148
GTEX-1122O-0003-SM-5Q5DL,GTEX-1122O,60-69,F,Blood,-0.444328,-0.445926,-1.234662,-0.776268,-0.644366,-0.946031,...,-0.591067,-0.55961,-0.408995,0.030735,-0.452932,2.287155,1.879304,2.201918,-0.141724,0.143156


In [97]:
# Save the final cleaned dataset as a CSV file in the clean data directory
# (the sample ID index is written as the first column)
output_csv_path = Path(CLEAN_DATA_DIR) / "gtex_blood_brain_top5000_variable_genes_standardized.csv"
final_dataset.to_csv(output_csv_path)