# Description

This notebook reads the GTEx v8 sample attributes and subject phenotypes, merges them into a single metadata table indexed by sample ID, and saves that table as a pickle file.

# Modules

In [1]:
import re

import pandas as pd

from ccc import conf

# Settings

# Paths

In [2]:
# Folder holding the per-tissue data files (one of them is previewed later in
# this notebook); it must already exist.
TISSUE_DIR = conf.GTEX["DATA_DIR"].joinpath("data_by_tissue")
assert TISSUE_DIR.exists()

In [3]:
# The metadata file is written directly into the GTEx data folder.
OUTPUT_DIR = conf.GTEX["DATA_DIR"]
display(OUTPUT_DIR)
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

PosixPath('/opt/data/data/gtex_v8')

# Data

## GTEx samples info

In [4]:
# Fail early, naming the missing path, if the sample attributes file is absent.
assert conf.GTEX["SAMPLE_ATTRS_FILE"].exists(), (
    f"Sample attributes file does not exist: {conf.GTEX['SAMPLE_ATTRS_FILE']}"
)

In [5]:
# Load the tab-separated sample attributes, keyed by sample ID.
# `read_table` uses a tab separator by default.
gtex_samples = pd.read_table(
    conf.GTEX["SAMPLE_ATTRS_FILE"],
    index_col="SAMPID",
)

In [6]:
display(gtex_samples.shape)
# Sample IDs form the index and must not repeat.
assert not gtex_samples.index.has_duplicates

(22951, 62)

In [7]:
# Preview the first rows of the sample attributes table.
gtex_samples.head()

Unnamed: 0_level_0,SMATSSCR,SMCENTER,SMPTHNTS,SMRIN,SMTS,SMTSD,SMUBRID,SMTSISCH,SMTSPAX,SMNABTCH,...,SME1ANTI,SMSPLTRD,SMBSMMRT,SME1SNSE,SME1PCTS,SMRRNART,SME1MPRT,SMNUM5CD,SMDPMPRT,SME2PCTS
SAMPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GTEX-1117F-0003-SM-58Q7G,,B1,,,Blood,Whole Blood,13756,1188.0,,BP-38516,...,,,,,,,,,,
GTEX-1117F-0003-SM-5DWSB,,B1,,,Blood,Whole Blood,13756,1188.0,,BP-38516,...,,,,,,,,,,
GTEX-1117F-0003-SM-6WBT7,,B1,,,Blood,Whole Blood,13756,1188.0,,BP-38516,...,,,,,,,,,,
GTEX-1117F-0011-R10a-SM-AHZ7F,,"B1, A1",,,Brain,Brain - Frontal Cortex (BA9),9834,1193.0,,,...,,,,,,,,,,
GTEX-1117F-0011-R10b-SM-CYKQ8,,"B1, A1",,7.2,Brain,Brain - Frontal Cortex (BA9),9834,1193.0,,BP-42319,...,,,,,,,,,,


## GTEx subject phenotypes

In [8]:
# Fail early, naming the missing path, if the subject phenotypes file is absent.
assert conf.GTEX["SUBJECTS_ATTRS_FILE"].exists(), (
    f"Subject attributes file does not exist: {conf.GTEX['SUBJECTS_ATTRS_FILE']}"
)

In [9]:
# Load the tab-separated subject phenotypes; SUBJID stays a regular column.
# `read_table` uses a tab separator by default.
gtex_phenotypes = pd.read_table(conf.GTEX["SUBJECTS_ATTRS_FILE"])

In [10]:
# Dimensions (rows, columns) of the subject phenotype table.
gtex_phenotypes.shape

(980, 4)

In [11]:
# Preview the first rows of the subject phenotype table.
gtex_phenotypes.head()

Unnamed: 0,SUBJID,SEX,AGE,DTHHRDY
0,GTEX-1117F,2,60-69,4.0
1,GTEX-111CU,1,50-59,0.0
2,GTEX-111FC,1,60-69,1.0
3,GTEX-111VG,1,60-69,3.0
4,GTEX-111YS,1,60-69,0.0


## GTEx gene expression (example tissue file)

In [12]:
# Preview one per-tissue expression table. `Path.glob` yields files in an
# arbitrary, filesystem-dependent order, so sort the matches to always show the
# same file, and fail with a clear message if the folder has no pickle files.
tissue_files = sorted(TISSUE_DIR.glob("*.pkl"))
assert tissue_files, f"No .pkl files found in {TISSUE_DIR}"
pd.read_pickle(tissue_files[0]).head()

Unnamed: 0_level_0,GTEX-111CU-0126-SM-5GZWZ,GTEX-111YS-0126-SM-5987T,GTEX-1122O-0326-SM-5H124,GTEX-117YX-0126-SM-5EGH5,GTEX-11DXX-0126-SM-5EGH7,GTEX-11DXY-1626-SM-5H12L,GTEX-11DXZ-0226-SM-5EGGZ,GTEX-11EM3-0326-SM-5A5KJ,GTEX-11EMC-0526-SM-5EGJN,GTEX-11EQ9-0126-SM-5986I,...,GTEX-ZT9W-0126-SM-4YCFD,GTEX-ZT9X-0126-SM-4YCFC,GTEX-ZTSS-0326-SM-5987M,GTEX-ZUA1-1926-SM-5E45E,GTEX-ZVP2-0126-SM-5NQ7D,GTEX-ZVT2-0826-SM-5GIEO,GTEX-ZY6K-0126-SM-5SIAM,GTEX-ZYFG-0926-SM-5BC5U,GTEX-ZYVF-1626-SM-5N9EH,GTEX-ZZPU-1226-SM-5N9CK
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000223972.5,0.03757,0.0,0.01742,0.0,0.02762,0.04834,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0473,0.0,0.0,0.0,0.07783,0.0,0.0,0.03629
ENSG00000227232.5,0.9295,1.396,1.52,1.565,1.546,5.758,3.878,1.553,6.861,1.84,...,1.407,1.127,3.203,2.636,2.536,1.168,5.776,1.916,5.968,2.268
ENSG00000278267.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.396,0.0,0.0,0.0,0.0,0.0,0.0
ENSG00000243485.5,0.0,0.0793,0.0,0.0,0.0,0.04825,0.09592,0.0,0.0384,0.0,...,0.0415,0.03456,0.0,0.06219,0.0405,0.0,0.0,0.06296,0.02164,0.0
ENSG00000237613.2,0.0,0.0,0.0,0.04263,0.07835,0.03428,0.03407,0.0,0.0,0.05345,...,0.0,0.0,0.0,0.02209,0.0,0.0,0.0368,0.01491,0.0,0.0


# Get GTEx sample metadata

In [13]:
# Collect all sample IDs (the index of the sample attributes table) as a list
# and show the first few.
gtex_samples_ids = gtex_samples.index.tolist()
display(gtex_samples_ids[:5])

['GTEX-1117F-0003-SM-58Q7G',
 'GTEX-1117F-0003-SM-5DWSB',
 'GTEX-1117F-0003-SM-6WBT7',
 'GTEX-1117F-0011-R10a-SM-AHZ7F',
 'GTEX-1117F-0011-R10b-SM-CYKQ8']

In [14]:
# Turn the list of sample IDs into a Series named after its future column.
gtex_samples_ids = pd.Series(gtex_samples_ids, name="SAMPID")

In [15]:
# Show the sample IDs, now a Series named SAMPID.
gtex_samples_ids

0             GTEX-1117F-0003-SM-58Q7G
1             GTEX-1117F-0003-SM-5DWSB
2             GTEX-1117F-0003-SM-6WBT7
3        GTEX-1117F-0011-R10a-SM-AHZ7F
4        GTEX-1117F-0011-R10b-SM-CYKQ8
                     ...              
22946                   K-562-SM-E9EZC
22947                   K-562-SM-E9EZI
22948                   K-562-SM-E9EZO
22949                   K-562-SM-E9EZT
22950                   K-562-SM-E9EZZ
Name: SAMPID, Length: 22951, dtype: object

In [16]:
# Derive the subject ID from each sample ID: the capture group takes the first
# "<token>-<token>" match (e.g. "GTEX-1117F" or "K-562").
# With a single group and expand=False, extract() returns a Series directly.
gtex_subjects_ids = gtex_samples_ids.str.extract(
    r"([\w\d]+\-[\w\d]+)",
    flags=re.IGNORECASE,
    expand=False,
).rename("SUBJID")

In [17]:
# Check the subject IDs extracted from the sample IDs.
gtex_subjects_ids

0        GTEX-1117F
1        GTEX-1117F
2        GTEX-1117F
3        GTEX-1117F
4        GTEX-1117F
            ...    
22946         K-562
22947         K-562
22948         K-562
22949         K-562
22950         K-562
Name: SUBJID, Length: 22951, dtype: object

In [18]:
# Place sample IDs and their subject IDs side by side (two columns).
gtex_metadata = pd.concat(
    [gtex_samples_ids, gtex_subjects_ids],
    axis="columns",
)

In [19]:
# Two-column table: each SAMPID with its SUBJID.
gtex_metadata

Unnamed: 0,SAMPID,SUBJID
0,GTEX-1117F-0003-SM-58Q7G,GTEX-1117F
1,GTEX-1117F-0003-SM-5DWSB,GTEX-1117F
2,GTEX-1117F-0003-SM-6WBT7,GTEX-1117F
3,GTEX-1117F-0011-R10a-SM-AHZ7F,GTEX-1117F
4,GTEX-1117F-0011-R10b-SM-CYKQ8,GTEX-1117F
...,...,...
22946,K-562-SM-E9EZC,K-562
22947,K-562-SM-E9EZI,K-562
22948,K-562-SM-E9EZO,K-562
22949,K-562-SM-E9EZT,K-562


In [20]:
# Subject phenotype table, shown again before it is merged into the sample table.
gtex_phenotypes

Unnamed: 0,SUBJID,SEX,AGE,DTHHRDY
0,GTEX-1117F,2,60-69,4.0
1,GTEX-111CU,1,50-59,0.0
2,GTEX-111FC,1,60-69,1.0
3,GTEX-111VG,1,60-69,3.0
4,GTEX-111YS,1,60-69,0.0
...,...,...,...,...
975,GTEX-ZYY3,2,60-69,4.0
976,GTEX-ZZ64,1,20-29,0.0
977,GTEX-ZZPT,1,50-59,4.0
978,GTEX-ZZPU,2,50-59,0.0


In [21]:
# Attach each subject's phenotypes (SEX, AGE, DTHHRDY) to its samples.
# The join key is spelled out instead of relying on pandas picking the shared
# column, and `validate` makes the merge fail loudly if a subject appears more
# than once in the phenotype table (which would silently duplicate samples).
# NOTE: this is an inner join, so samples whose SUBJID has no phenotype entry
# are dropped.
gtex_metadata = pd.merge(
    gtex_metadata,
    gtex_phenotypes,
    on="SUBJID",
    how="inner",
    validate="many_to_one",
).set_index("SAMPID")

In [22]:
# Samples with subject phenotype columns attached, indexed by SAMPID.
gtex_metadata

Unnamed: 0_level_0,SUBJID,SEX,AGE,DTHHRDY
SAMPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GTEX-1117F-0003-SM-58Q7G,GTEX-1117F,2,60-69,4.0
GTEX-1117F-0003-SM-5DWSB,GTEX-1117F,2,60-69,4.0
GTEX-1117F-0003-SM-6WBT7,GTEX-1117F,2,60-69,4.0
GTEX-1117F-0011-R10a-SM-AHZ7F,GTEX-1117F,2,60-69,4.0
GTEX-1117F-0011-R10b-SM-CYKQ8,GTEX-1117F,2,60-69,4.0
...,...,...,...,...
K-562-SM-E9EZC,K-562,2,50-59,
K-562-SM-E9EZI,K-562,2,50-59,
K-562-SM-E9EZO,K-562,2,50-59,
K-562-SM-E9EZT,K-562,2,50-59,


In [23]:
# Bring in the per-sample attributes. Both tables are indexed by SAMPID, so the
# (default inner) join is done index-to-index.
gtex_metadata = gtex_metadata.merge(
    gtex_samples,
    left_index=True,
    right_index=True,
)

In [24]:
# Recode the numeric SEX codes into labels; the nested mapping restricts the
# replacement to the SEX column.
gtex_metadata = gtex_metadata.replace({"SEX": {1: "Male", 2: "Female"}})

In [25]:
# Sort rows by sample ID (the index).
gtex_metadata = gtex_metadata.sort_index()

In [26]:
# Preview the final, sorted metadata table.
gtex_metadata.head()

Unnamed: 0_level_0,SUBJID,SEX,AGE,DTHHRDY,SMATSSCR,SMCENTER,SMPTHNTS,SMRIN,SMTS,SMTSD,...,SME1ANTI,SMSPLTRD,SMBSMMRT,SME1SNSE,SME1PCTS,SMRRNART,SME1MPRT,SMNUM5CD,SMDPMPRT,SME2PCTS
SAMPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GTEX-1117F-0003-SM-58Q7G,GTEX-1117F,Female,60-69,4.0,,B1,,,Blood,Whole Blood,...,,,,,,,,,,
GTEX-1117F-0003-SM-5DWSB,GTEX-1117F,Female,60-69,4.0,,B1,,,Blood,Whole Blood,...,,,,,,,,,,
GTEX-1117F-0003-SM-6WBT7,GTEX-1117F,Female,60-69,4.0,,B1,,,Blood,Whole Blood,...,,,,,,,,,,
GTEX-1117F-0011-R10a-SM-AHZ7F,GTEX-1117F,Female,60-69,4.0,,"B1, A1",,,Brain,Brain - Frontal Cortex (BA9),...,,,,,,,,,,
GTEX-1117F-0011-R10b-SM-CYKQ8,GTEX-1117F,Female,60-69,4.0,,"B1, A1",,7.2,Brain,Brain - Frontal Cortex (BA9),...,,,,,,,,,,


# Testing

In [27]:
# Final dimensions: one row per sample; columns are SUBJID, the subject
# phenotypes and the sample attributes.
gtex_metadata.shape

(22951, 66)

In [28]:
# Every sample must carry a subject ID.
assert not gtex_metadata["SUBJID"].isna().any()

# Both merges are inner joins, so a sample without a matching subject would be
# dropped silently rather than show up as a missing value above. Check that
# every sample from the attributes file made it into the final table exactly once.
assert gtex_metadata.index.is_unique
assert gtex_metadata.shape[0] == gtex_samples.shape[0], (
    f"Expected {gtex_samples.shape[0]} samples, got {gtex_metadata.shape[0]}"
)

In [29]:
# Neither tissue column (SMTS, e.g. "Blood"; SMTSD, e.g. "Whole Blood") may
# contain missing values.
assert gtex_metadata["SMTS"].notna().all()
assert gtex_metadata["SMTSD"].notna().all()

In [30]:
# SEX must be fully populated and contain exactly the two recoded labels.
assert gtex_metadata["SEX"].notna().all()
assert gtex_metadata["SEX"].nunique(dropna=False) == 2
assert set(gtex_metadata["SEX"].unique()) == {"Female", "Male"}

# Save

In [31]:
# Destination of the merged metadata table inside the output folder.
output_filename = OUTPUT_DIR.joinpath("gtex_v8-sample_metadata.pkl")
display(output_filename)

PosixPath('/opt/data/data/gtex_v8/gtex_v8-sample_metadata.pkl')

In [32]:
# Persist the merged metadata table as a pickle at `output_filename`.
gtex_metadata.to_pickle(output_filename)