# Description

This notebook reads the GTEx v8 sample attributes and subject phenotypes, merges them into a single per-sample metadata table, and saves that table to a pickle file.

# Modules

In [1]:
import re

import pandas as pd

from clustermatch import conf

# Settings

# Paths

In [2]:
# Directory with the per-tissue data files; it must already exist.
TISSUE_DIR = conf.GTEX["DATA_DIR"] / "data_by_tissue"
assert TISSUE_DIR.exists()

In [3]:
# Results are written to the GTEx data directory itself; create it if it is missing.
OUTPUT_DIR = conf.GTEX["DATA_DIR"]
display(OUTPUT_DIR)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

PosixPath('/opt/data/data/gtex_v8')

# Data

## GTEx samples info

In [None]:
# Fail early if the sample attributes file is missing, and report which path
# was checked so the failure is actionable.
sample_attrs_file = conf.GTEX["SAMPLE_ATTRS_FILE"]
assert (
    sample_attrs_file.exists()
), f"Sample attributes file does not exist: {sample_attrs_file}"

In [4]:
# Load the tab-separated sample attributes table, indexed by sample ID (SAMPID).
gtex_samples = pd.read_csv(
    conf.GTEX["SAMPLE_ATTRS_FILE"],
    sep="\t",
    index_col="SAMPID",
)

In [5]:
# Show the table dimensions and make sure no sample ID is repeated in the index.
display(gtex_samples.shape)
assert gtex_samples.index.is_unique

(22951, 62)

In [6]:
# Preview the first rows of the sample attributes table.
gtex_samples.head()

Unnamed: 0_level_0,SMATSSCR,SMCENTER,SMPTHNTS,SMRIN,SMTS,SMTSD,SMUBRID,SMTSISCH,SMTSPAX,SMNABTCH,...,SME1ANTI,SMSPLTRD,SMBSMMRT,SME1SNSE,SME1PCTS,SMRRNART,SME1MPRT,SMNUM5CD,SMDPMPRT,SME2PCTS
SAMPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GTEX-1117F-0003-SM-58Q7G,,B1,,,Blood,Whole Blood,13756,1188.0,,BP-38516,...,,,,,,,,,,
GTEX-1117F-0003-SM-5DWSB,,B1,,,Blood,Whole Blood,13756,1188.0,,BP-38516,...,,,,,,,,,,
GTEX-1117F-0003-SM-6WBT7,,B1,,,Blood,Whole Blood,13756,1188.0,,BP-38516,...,,,,,,,,,,
GTEX-1117F-0011-R10a-SM-AHZ7F,,"B1, A1",,,Brain,Brain - Frontal Cortex (BA9),9834,1193.0,,,...,,,,,,,,,,
GTEX-1117F-0011-R10b-SM-CYKQ8,,"B1, A1",,7.2,Brain,Brain - Frontal Cortex (BA9),9834,1193.0,,BP-42319,...,,,,,,,,,,


## GTEx subject phenotypes

In [None]:
# Fail early if the subject phenotypes file is missing, and report which path
# was checked so the failure is actionable.
subjects_attrs_file = conf.GTEX["SUBJECTS_ATTRS_FILE"]
assert (
    subjects_attrs_file.exists()
), f"Subject attributes file does not exist: {subjects_attrs_file}"

In [7]:
# Load the tab-separated subject phenotypes table. No index column is set, so
# SUBJID stays a regular column.
gtex_phenotypes = pd.read_csv(
    conf.GTEX["SUBJECTS_ATTRS_FILE"],
    sep="\t",
)

In [8]:
# Dimensions of the subject phenotypes table (rows, columns).
gtex_phenotypes.shape

(980, 4)

In [9]:
# Preview the first rows of the subject phenotypes table.
gtex_phenotypes.head()

Unnamed: 0,SUBJID,SEX,AGE,DTHHRDY
0,GTEX-1117F,2,60-69,4.0
1,GTEX-111CU,1,50-59,0.0
2,GTEX-111FC,1,60-69,1.0
3,GTEX-111VG,1,60-69,3.0
4,GTEX-111YS,1,60-69,0.0


## GTEx gene expression sample

In [10]:
# Preview one per-tissue pickle file from TISSUE_DIR.
# The candidates are sorted so the same file is previewed on every run
# (directory iteration order is not guaranteed), and an empty directory fails
# with a clear message instead of a bare StopIteration.
tissue_files = sorted(TISSUE_DIR.glob("*.pkl"))
assert tissue_files, f"No .pkl files found in {TISSUE_DIR}"
pd.read_pickle(tissue_files[0]).head()

Unnamed: 0_level_0,GTEX-111FC-0826-SM-5GZWO,GTEX-111YS-0426-SM-5987O,GTEX-1122O-0826-SM-5GICV,GTEX-117YW-0326-SM-5N9CY,GTEX-117YX-1126-SM-5H128,GTEX-11DXX-0326-SM-5PNWC,GTEX-11DXY-0826-SM-5EGGR,GTEX-11DXZ-0626-SM-5GU77,GTEX-11EM3-0626-SM-5H12Z,GTEX-11EMC-0726-SM-5EGJO,...,GTEX-ZTPG-1226-SM-4YCDO,GTEX-ZUA1-0826-SM-4YCDL,GTEX-ZV7C-0326-SM-57WB1,GTEX-ZVT2-1026-SM-5GU55,GTEX-ZVZP-0226-SM-5NQ73,GTEX-ZYFC-1126-SM-5E44W,GTEX-ZYFG-0426-SM-5E43M,GTEX-ZYT6-1726-SM-5E44P,GTEX-ZYW4-0926-SM-59HJS,GTEX-ZZPU-0926-SM-5GZYT
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000223972.5,0.0,0.0,0.01863,0.0132,0.0,0.01595,0.0,0.02215,0.0,0.0,...,0.03078,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01152,0.0
ENSG00000227232.5,2.458,0.719,0.776,1.289,0.3104,0.4154,0.4755,2.682,0.6632,1.421,...,1.002,0.5498,0.9787,0.6925,0.6903,1.869,0.3865,0.6712,0.6447,1.016
ENSG00000278267.1,0.0,0.0,0.0,0.0,0.0,0.4056,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4723
ENSG00000243485.5,0.03116,0.03063,0.0,0.0527,0.0,0.03185,0.02352,0.08846,0.0,0.0,...,0.06145,0.0,0.03193,0.06436,0.03308,0.0,0.0,0.05938,0.06897,0.0
ENSG00000237613.2,0.0,0.02176,0.0,0.0,0.0,0.02263,0.0,0.0,0.0,0.0,...,0.04366,0.0,0.02269,0.0,0.0,0.03842,0.0,0.01406,0.0,0.0


## Gene Ensembl ID -> Symbol mapping

In [11]:
# Load the pickled Ensembl gene ID / gene symbol mapping table.
gene_map = pd.read_pickle(conf.GTEX["DATA_DIR"] / "gtex_gene_id_symbol_mappings.pkl")

In [12]:
# Turn the mapping table into a plain dict: Ensembl gene ID -> gene symbol.
# Note: this rebinds `gene_map` from the loaded table to a dict, so this cell
# cannot be re-run without re-running the load cell first.
gene_map = gene_map.set_index("gene_ens_id")["gene_symbol"].to_dict()

In [13]:
# Spot-check one known Ensembl ID -> symbol pair.
assert gene_map["ENSG00000145309.5"] == "CABS1"

# Get GTEx sample metadata

In [14]:
# Collect every sample ID (the index of the sample attributes table) and show the first five.
gtex_samples_ids = gtex_samples.index.to_list()
display(gtex_samples_ids[:5])

['GTEX-1117F-0003-SM-58Q7G',
 'GTEX-1117F-0003-SM-5DWSB',
 'GTEX-1117F-0003-SM-6WBT7',
 'GTEX-1117F-0011-R10a-SM-AHZ7F',
 'GTEX-1117F-0011-R10b-SM-CYKQ8']

In [15]:
# Wrap the sample IDs in a Series named SAMPID (default integer index).
gtex_samples_ids = pd.Series(gtex_samples_ids, name="SAMPID")

In [16]:
# Inspect the SAMPID series.
gtex_samples_ids

0             GTEX-1117F-0003-SM-58Q7G
1             GTEX-1117F-0003-SM-5DWSB
2             GTEX-1117F-0003-SM-6WBT7
3        GTEX-1117F-0011-R10a-SM-AHZ7F
4        GTEX-1117F-0011-R10b-SM-CYKQ8
                     ...              
22946                   K-562-SM-E9EZC
22947                   K-562-SM-E9EZI
22948                   K-562-SM-E9EZO
22949                   K-562-SM-E9EZT
22950                   K-562-SM-E9EZZ
Name: SAMPID, Length: 22951, dtype: object

In [17]:
# Derive the subject ID from each sample ID: the pattern captures the first two
# dash-separated tokens (e.g. "GTEX-1117F-0003-SM-58Q7G" -> "GTEX-1117F").
subject_id_matches = gtex_samples_ids.str.extract(
    r"([\w\d]+\-[\w\d]+)",
    flags=re.IGNORECASE,
    expand=True,
)
# The pattern has a single capture group, so the result has one column labelled 0.
gtex_subjects_ids = subject_id_matches[0].rename("SUBJID")

In [18]:
# Inspect the derived SUBJID series.
gtex_subjects_ids

0        GTEX-1117F
1        GTEX-1117F
2        GTEX-1117F
3        GTEX-1117F
4        GTEX-1117F
            ...    
22946         K-562
22947         K-562
22948         K-562
22949         K-562
22950         K-562
Name: SUBJID, Length: 22951, dtype: object

In [19]:
# Place sample IDs and their derived subject IDs side by side as columns SAMPID and SUBJID.
gtex_metadata = pd.concat([gtex_samples_ids, gtex_subjects_ids], axis=1)

In [20]:
# Inspect the SAMPID/SUBJID table before phenotypes are attached.
gtex_metadata

Unnamed: 0,SAMPID,SUBJID
0,GTEX-1117F-0003-SM-58Q7G,GTEX-1117F
1,GTEX-1117F-0003-SM-5DWSB,GTEX-1117F
2,GTEX-1117F-0003-SM-6WBT7,GTEX-1117F
3,GTEX-1117F-0011-R10a-SM-AHZ7F,GTEX-1117F
4,GTEX-1117F-0011-R10b-SM-CYKQ8,GTEX-1117F
...,...,...
22946,K-562-SM-E9EZC,K-562
22947,K-562-SM-E9EZI,K-562
22948,K-562-SM-E9EZO,K-562
22949,K-562-SM-E9EZT,K-562


In [21]:
# Inspect the subject phenotypes table that is merged in the next step.
gtex_phenotypes

Unnamed: 0,SUBJID,SEX,AGE,DTHHRDY
0,GTEX-1117F,2,60-69,4.0
1,GTEX-111CU,1,50-59,0.0
2,GTEX-111FC,1,60-69,1.0
3,GTEX-111VG,1,60-69,3.0
4,GTEX-111YS,1,60-69,0.0
...,...,...,...,...
975,GTEX-ZYY3,2,60-69,4.0
976,GTEX-ZZ64,1,20-29,0.0
977,GTEX-ZZPT,1,50-59,4.0
978,GTEX-ZZPU,2,50-59,0.0


In [22]:
# Attach the subject-level phenotype columns to every sample, then index the
# result by sample ID.
# - on="SUBJID": pins the join key instead of relying on whichever column
#   names the two tables happen to share.
# - how="inner": pandas' default, spelled out; samples whose subject has no
#   phenotype row are dropped.
# - validate="many_to_one": raises pandas.errors.MergeError if a SUBJID occurs
#   more than once in gtex_phenotypes, which would otherwise silently
#   duplicate sample rows.
gtex_metadata = pd.merge(
    gtex_metadata,
    gtex_phenotypes,
    on="SUBJID",
    how="inner",
    validate="many_to_one",
).set_index("SAMPID")

In [23]:
# Inspect the table after the phenotype merge (now indexed by SAMPID).
gtex_metadata

Unnamed: 0_level_0,SUBJID,SEX,AGE,DTHHRDY
SAMPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GTEX-1117F-0003-SM-58Q7G,GTEX-1117F,2,60-69,4.0
GTEX-1117F-0003-SM-5DWSB,GTEX-1117F,2,60-69,4.0
GTEX-1117F-0003-SM-6WBT7,GTEX-1117F,2,60-69,4.0
GTEX-1117F-0011-R10a-SM-AHZ7F,GTEX-1117F,2,60-69,4.0
GTEX-1117F-0011-R10b-SM-CYKQ8,GTEX-1117F,2,60-69,4.0
...,...,...,...,...
K-562-SM-E9EZC,K-562,2,50-59,
K-562-SM-E9EZI,K-562,2,50-59,
K-562-SM-E9EZO,K-562,2,50-59,
K-562-SM-E9EZT,K-562,2,50-59,


In [24]:
# Append the per-sample attribute columns, matching rows on the index of both
# tables (inner join, the pandas default).
gtex_metadata = gtex_metadata.merge(
    gtex_samples,
    how="inner",
    left_index=True,
    right_index=True,
)

In [25]:
# Recode the numeric SEX column into readable labels; other columns are untouched.
sex_labels = {1: "Male", 2: "Female"}
gtex_metadata = gtex_metadata.replace({"SEX": sex_labels})

In [26]:
# Order rows by sample ID (the index).
gtex_metadata = gtex_metadata.sort_index()

In [27]:
# Preview the merged sample/subject metadata table.
gtex_metadata.head()

Unnamed: 0_level_0,SUBJID,SEX,AGE,DTHHRDY,SMATSSCR,SMCENTER,SMPTHNTS,SMRIN,SMTS,SMTSD,...,SME1ANTI,SMSPLTRD,SMBSMMRT,SME1SNSE,SME1PCTS,SMRRNART,SME1MPRT,SMNUM5CD,SMDPMPRT,SME2PCTS
SAMPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GTEX-1117F-0003-SM-58Q7G,GTEX-1117F,Female,60-69,4.0,,B1,,,Blood,Whole Blood,...,,,,,,,,,,
GTEX-1117F-0003-SM-5DWSB,GTEX-1117F,Female,60-69,4.0,,B1,,,Blood,Whole Blood,...,,,,,,,,,,
GTEX-1117F-0003-SM-6WBT7,GTEX-1117F,Female,60-69,4.0,,B1,,,Blood,Whole Blood,...,,,,,,,,,,
GTEX-1117F-0011-R10a-SM-AHZ7F,GTEX-1117F,Female,60-69,4.0,,"B1, A1",,,Brain,Brain - Frontal Cortex (BA9),...,,,,,,,,,,
GTEX-1117F-0011-R10b-SM-CYKQ8,GTEX-1117F,Female,60-69,4.0,,"B1, A1",,7.2,Brain,Brain - Frontal Cortex (BA9),...,,,,,,,,,,


# Testing

In [28]:
# Dimensions of the merged metadata table (rows, columns).
gtex_metadata.shape

(22951, 66)

In [29]:
# Every sample must have a subject ID.
assert gtex_metadata["SUBJID"].notna().all()

In [30]:
# Neither tissue column (SMTS, SMTSD) may contain missing values.
for tissue_column in ("SMTS", "SMTSD"):
    assert gtex_metadata[tissue_column].notna().all()

In [31]:
# SEX must be fully populated and contain exactly the two recoded labels.
sex_column = gtex_metadata["SEX"]
assert sex_column.notna().all()

observed_sex_values = sex_column.unique()
assert len(observed_sex_values) == 2
assert set(observed_sex_values) == {"Female", "Male"}

# Save

In [32]:
# Destination file for the merged metadata table.
output_filename = OUTPUT_DIR / "gtex_v8-sample_metadata.pkl"
display(output_filename)

PosixPath('/opt/data/data/gtex_v8/gtex_v8-sample_metadata.pkl')

In [33]:
# Write the merged sample/subject metadata table to disk as a pickle.
gtex_metadata.to_pickle(output_filename)