# Description

This notebook reads the GTEx v8 sample attributes and subject phenotypes, merges them into a single table indexed by sample ID, and saves the result as a pickle file.

# Modules

In [1]:
import re

import pandas as pd

from clustermatch import conf

# Settings

# Paths

In [2]:
# Directory with the per-tissue .pkl files previewed later; it must already exist.
TISSUE_DIR = conf.GTEX["DATA_DIR"].joinpath("data_by_tissue")
assert TISSUE_DIR.exists()

In [3]:
# The merged metadata file is written directly into the GTEx data directory.
OUTPUT_DIR = conf.GTEX["DATA_DIR"]
display(OUTPUT_DIR)
# Create the directory (and any missing parents); no error if it already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

PosixPath('/opt/data/data/gtex_v8')

# Data

## GTEx samples info

In [4]:
# Fail early if the sample attributes file is missing, and name the path
# that was checked so the failure is actionable.
assert conf.GTEX["SAMPLE_ATTRS_FILE"].exists(), (
    f"Sample attributes file does not exist: {conf.GTEX['SAMPLE_ATTRS_FILE']}"
)

In [5]:
# Load the tab-separated sample attributes, indexed by sample ID (SAMPID).
sample_attrs_path = conf.GTEX["SAMPLE_ATTRS_FILE"]
gtex_samples = pd.read_csv(sample_attrs_path, index_col="SAMPID", sep="\t")

In [6]:
# Show the table dimensions and make sure no sample ID is repeated in the index.
display(gtex_samples.shape)
assert not gtex_samples.index.has_duplicates

(22951, 62)

In [7]:
# Preview the first rows of the sample attributes table.
gtex_samples.head()

Unnamed: 0_level_0,SMATSSCR,SMCENTER,SMPTHNTS,SMRIN,SMTS,SMTSD,SMUBRID,SMTSISCH,SMTSPAX,SMNABTCH,...,SME1ANTI,SMSPLTRD,SMBSMMRT,SME1SNSE,SME1PCTS,SMRRNART,SME1MPRT,SMNUM5CD,SMDPMPRT,SME2PCTS
SAMPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GTEX-1117F-0003-SM-58Q7G,,B1,,,Blood,Whole Blood,13756,1188.0,,BP-38516,...,,,,,,,,,,
GTEX-1117F-0003-SM-5DWSB,,B1,,,Blood,Whole Blood,13756,1188.0,,BP-38516,...,,,,,,,,,,
GTEX-1117F-0003-SM-6WBT7,,B1,,,Blood,Whole Blood,13756,1188.0,,BP-38516,...,,,,,,,,,,
GTEX-1117F-0011-R10a-SM-AHZ7F,,"B1, A1",,,Brain,Brain - Frontal Cortex (BA9),9834,1193.0,,,...,,,,,,,,,,
GTEX-1117F-0011-R10b-SM-CYKQ8,,"B1, A1",,7.2,Brain,Brain - Frontal Cortex (BA9),9834,1193.0,,BP-42319,...,,,,,,,,,,


## GTEx subject phenotypes

In [8]:
# Fail early if the subject phenotypes file is missing, and name the path
# that was checked so the failure is actionable.
assert conf.GTEX["SUBJECTS_ATTRS_FILE"].exists(), (
    f"Subject attributes file does not exist: {conf.GTEX['SUBJECTS_ATTRS_FILE']}"
)

In [9]:
# Load the tab-separated subject phenotypes; SUBJID stays a regular column
# because it is used later as the merge key.
subject_attrs_path = conf.GTEX["SUBJECTS_ATTRS_FILE"]
gtex_phenotypes = pd.read_csv(subject_attrs_path, sep="\t")

In [10]:
# Dimensions (rows, columns) of the subject phenotypes table.
gtex_phenotypes.shape

(980, 4)

In [11]:
# Preview the first rows of the subject phenotypes table.
gtex_phenotypes.head()

Unnamed: 0,SUBJID,SEX,AGE,DTHHRDY
0,GTEX-1117F,2,60-69,4.0
1,GTEX-111CU,1,50-59,0.0
2,GTEX-111FC,1,60-69,1.0
3,GTEX-111VG,1,60-69,3.0
4,GTEX-111YS,1,60-69,0.0


## GTEx gene expression data (example tissue file)

In [12]:
# Preview one per-tissue expression file. glob() yields paths in an arbitrary,
# filesystem-dependent order, so sort them to always preview the same file, and
# fail with a clear message (instead of a bare StopIteration) when none exist.
tissue_files = sorted(TISSUE_DIR.glob("*.pkl"))
assert tissue_files, f"No .pkl files found in {TISSUE_DIR}"
pd.read_pickle(tissue_files[0]).head()

Unnamed: 0_level_0,GTEX-OIZF-1926-SM-7PBZS,GTEX-P44H-2226-SM-E9U4P,GTEX-QEL4-1826-SM-EZ6KU,GTEX-RN64-2426-SM-EZ6L2,GTEX-RU72-2526-SM-EWRML,GTEX-S32W-1126-SM-4AD5V,GTEX-S33H-1926-SM-EYYVH,GTEX-S3XE-1226-SM-4AD4L,GTEX-S4Q7-0926-SM-4AD5D,GTEX-S4UY-0926-SM-4AD6O,...,GTEX-SE5C-1026-SM-4BRUG,GTEX-SNMC-0826-SM-4DM66,GTEX-SNOS-0526-SM-4DM54,GTEX-T2YK-2326-SM-EZ6LA,GTEX-T5JW-1026-SM-EZ6LR,GTEX-T6MN-2226-SM-EVYAM,GTEX-TMMY-1526-SM-4DXST,GTEX-U3ZM-0826-SM-4DXU6,GTEX-U3ZN-1226-SM-4DXUD,GTEX-U4B1-1226-SM-4DXT7
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000223972.5,0.01741,0.0,0.0,0.0,0.01921,0.0,0.0,0.0,0.0,0.0,...,0.02747,0.0,0.0,0.02358,0.01574,0.0,0.0,0.0,0.0,0.02943
ENSG00000227232.5,3.083,7.071,2.603,8.501,1.026,3.699,2.418,5.452,5.164,6.111,...,5.901,7.256,12.58,7.554,3.095,5.93,5.136,5.515,3.736,4.023
ENSG00000278267.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000243485.5,0.03476,0.0,0.0344,0.04011,0.03836,0.0,0.0,0.04644,0.0,0.0,...,0.0,0.0,0.07844,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000237613.2,0.0,0.0,0.04888,0.0285,0.0,0.0,0.0,0.0,0.04808,0.0,...,0.0,0.0,0.0,0.0,0.0,0.03106,0.0,0.0,0.0,0.0


## Gene Ensembl ID -> Symbol mapping

In [13]:
# Load the gene ID/symbol mapping table stored next to the GTEx data.
gene_mappings_path = conf.GTEX["DATA_DIR"].joinpath("gtex_gene_id_symbol_mappings.pkl")
gene_map = pd.read_pickle(gene_mappings_path)

In [14]:
# Turn the mapping table into a plain dict keyed by Ensembl gene ID.
symbol_by_ens_id = gene_map.set_index("gene_ens_id")["gene_symbol"]
gene_map = symbol_by_ens_id.to_dict()

In [15]:
# Spot-check one known Ensembl ID -> symbol pair to confirm the mapping loaded correctly.
assert gene_map["ENSG00000145309.5"] == "CABS1"

# Get GTEx sample metadata

In [16]:
# Collect every sample ID from the sample table's index and show the first few.
gtex_samples_ids = gtex_samples.index.tolist()
display(gtex_samples_ids[:5])

['GTEX-1117F-0003-SM-58Q7G',
 'GTEX-1117F-0003-SM-5DWSB',
 'GTEX-1117F-0003-SM-6WBT7',
 'GTEX-1117F-0011-R10a-SM-AHZ7F',
 'GTEX-1117F-0011-R10b-SM-CYKQ8']

In [17]:
# Wrap the IDs in a Series named SAMPID so the name becomes a column later.
gtex_samples_ids = pd.Series(gtex_samples_ids, name="SAMPID")

In [18]:
# Display the sample ID Series.
gtex_samples_ids

0             GTEX-1117F-0003-SM-58Q7G
1             GTEX-1117F-0003-SM-5DWSB
2             GTEX-1117F-0003-SM-6WBT7
3        GTEX-1117F-0011-R10a-SM-AHZ7F
4        GTEX-1117F-0011-R10b-SM-CYKQ8
                     ...              
22946                   K-562-SM-E9EZC
22947                   K-562-SM-E9EZI
22948                   K-562-SM-E9EZO
22949                   K-562-SM-E9EZT
22950                   K-562-SM-E9EZZ
Name: SAMPID, Length: 22951, dtype: object

In [19]:
# The subject ID is the first two dash-separated tokens of the sample ID
# (e.g. "GTEX-1117F-0003-SM-58Q7G" -> "GTEX-1117F").
subject_id_pattern = r"([\w\d]+\-[\w\d]+)"
extracted_ids = gtex_samples_ids.str.extract(
    subject_id_pattern,
    flags=re.IGNORECASE,
    expand=True,
)
# Column 0 holds the single capture group.
gtex_subjects_ids = extracted_ids[0].rename("SUBJID")

In [20]:
# Display the subject IDs extracted from the sample IDs.
gtex_subjects_ids

0        GTEX-1117F
1        GTEX-1117F
2        GTEX-1117F
3        GTEX-1117F
4        GTEX-1117F
            ...    
22946         K-562
22947         K-562
22948         K-562
22949         K-562
22950         K-562
Name: SUBJID, Length: 22951, dtype: object

In [21]:
# Put sample IDs and their subject IDs side by side as two columns.
id_columns = [gtex_samples_ids, gtex_subjects_ids]
gtex_metadata = pd.concat(id_columns, axis="columns")

In [22]:
# Display the SAMPID / SUBJID table.
gtex_metadata

Unnamed: 0,SAMPID,SUBJID
0,GTEX-1117F-0003-SM-58Q7G,GTEX-1117F
1,GTEX-1117F-0003-SM-5DWSB,GTEX-1117F
2,GTEX-1117F-0003-SM-6WBT7,GTEX-1117F
3,GTEX-1117F-0011-R10a-SM-AHZ7F,GTEX-1117F
4,GTEX-1117F-0011-R10b-SM-CYKQ8,GTEX-1117F
...,...,...
22946,K-562-SM-E9EZC,K-562
22947,K-562-SM-E9EZI,K-562
22948,K-562-SM-E9EZO,K-562
22949,K-562-SM-E9EZT,K-562


In [23]:
# Display the subject phenotypes table that is merged in the next cell.
gtex_phenotypes

Unnamed: 0,SUBJID,SEX,AGE,DTHHRDY
0,GTEX-1117F,2,60-69,4.0
1,GTEX-111CU,1,50-59,0.0
2,GTEX-111FC,1,60-69,1.0
3,GTEX-111VG,1,60-69,3.0
4,GTEX-111YS,1,60-69,0.0
...,...,...,...,...
975,GTEX-ZYY3,2,60-69,4.0
976,GTEX-ZZ64,1,20-29,0.0
977,GTEX-ZZPT,1,50-59,4.0
978,GTEX-ZZPU,2,50-59,0.0


In [24]:
# Attach subject-level phenotypes to every sample. The join key and join type
# are spelled out rather than inferred from the shared column names, and
# validate="many_to_one" makes the merge fail if a subject appears more than
# once in the phenotypes table (which would silently duplicate samples).
gtex_metadata = pd.merge(
    gtex_metadata,
    gtex_phenotypes,
    on="SUBJID",
    how="inner",
    validate="many_to_one",
).set_index("SAMPID")

In [25]:
# Display the sample table after adding the subject phenotypes.
gtex_metadata

Unnamed: 0_level_0,SUBJID,SEX,AGE,DTHHRDY
SAMPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GTEX-1117F-0003-SM-58Q7G,GTEX-1117F,2,60-69,4.0
GTEX-1117F-0003-SM-5DWSB,GTEX-1117F,2,60-69,4.0
GTEX-1117F-0003-SM-6WBT7,GTEX-1117F,2,60-69,4.0
GTEX-1117F-0011-R10a-SM-AHZ7F,GTEX-1117F,2,60-69,4.0
GTEX-1117F-0011-R10b-SM-CYKQ8,GTEX-1117F,2,60-69,4.0
...,...,...,...,...
K-562-SM-E9EZC,K-562,2,50-59,
K-562-SM-E9EZI,K-562,2,50-59,
K-562-SM-E9EZO,K-562,2,50-59,
K-562-SM-E9EZT,K-562,2,50-59,


In [26]:
# Add the per-sample attributes, matching rows on the SAMPID index of both tables.
gtex_metadata = gtex_metadata.merge(
    gtex_samples,
    left_index=True,
    right_index=True,
)

In [27]:
# Recode the numeric SEX values as readable labels (1 -> Male, 2 -> Female).
sex_labels = {1: "Male", 2: "Female"}
gtex_metadata = gtex_metadata.replace({"SEX": sex_labels})

In [28]:
# Order the rows by their index (SAMPID).
gtex_metadata = gtex_metadata.sort_index()

In [29]:
# Preview the first rows of the merged metadata table.
gtex_metadata.head()

Unnamed: 0_level_0,SUBJID,SEX,AGE,DTHHRDY,SMATSSCR,SMCENTER,SMPTHNTS,SMRIN,SMTS,SMTSD,...,SME1ANTI,SMSPLTRD,SMBSMMRT,SME1SNSE,SME1PCTS,SMRRNART,SME1MPRT,SMNUM5CD,SMDPMPRT,SME2PCTS
SAMPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GTEX-1117F-0003-SM-58Q7G,GTEX-1117F,Female,60-69,4.0,,B1,,,Blood,Whole Blood,...,,,,,,,,,,
GTEX-1117F-0003-SM-5DWSB,GTEX-1117F,Female,60-69,4.0,,B1,,,Blood,Whole Blood,...,,,,,,,,,,
GTEX-1117F-0003-SM-6WBT7,GTEX-1117F,Female,60-69,4.0,,B1,,,Blood,Whole Blood,...,,,,,,,,,,
GTEX-1117F-0011-R10a-SM-AHZ7F,GTEX-1117F,Female,60-69,4.0,,"B1, A1",,,Brain,Brain - Frontal Cortex (BA9),...,,,,,,,,,,
GTEX-1117F-0011-R10b-SM-CYKQ8,GTEX-1117F,Female,60-69,4.0,,"B1, A1",,7.2,Brain,Brain - Frontal Cortex (BA9),...,,,,,,,,,,


# Testing

In [30]:
# Dimensions (rows, columns) of the merged metadata table.
gtex_metadata.shape

(22951, 66)

In [31]:
# Every sample must have a subject ID.
assert gtex_metadata["SUBJID"].notna().all()

In [32]:
# Neither tissue column (SMTS, SMTSD) may contain missing values.
for tissue_column in ("SMTS", "SMTSD"):
    assert gtex_metadata[tissue_column].notna().all()

In [33]:
# SEX must be fully populated and contain exactly the two recoded labels.
sex_column = gtex_metadata["SEX"]
assert sex_column.notna().all()
observed_sex_values = sex_column.unique()
assert observed_sex_values.shape[0] == 2
assert set(observed_sex_values) == {"Female", "Male"}

# Save

In [34]:
# Destination of the merged metadata pickle inside OUTPUT_DIR.
output_filename = OUTPUT_DIR.joinpath("gtex_v8-sample_metadata.pkl")
display(output_filename)

PosixPath('/opt/data/data/gtex_v8/gtex_v8-sample_metadata.pkl')

In [35]:
# Write the merged sample/subject metadata table to disk as a pickle file.
gtex_metadata.to_pickle(output_filename)