# Description

It splits gene expression data from GTEx v8 by tissue and saves a gene id/symbol mapping file.

# Modules

In [1]:
import pandas as pd
from tqdm import tqdm

from clustermatch.utils import simplify_string
from clustermatch import conf

# Settings

In [2]:
OUTPUT_DIR = conf.GTEX["DATA_DIR"] / "data_by_tissue"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/data/gtex_v8/data_by_tissue')

# Data loading

## GTEx v8

### Sample metadata

In [3]:
gtex_sample_attrs = pd.read_csv(
    conf.GTEX["SAMPLE_ATTRS_FILE"],
    sep="\t",
)

In [4]:
gtex_sample_attrs.shape

(22951, 63)

In [5]:
gtex_sample_attrs.head()

Unnamed: 0,SAMPID,SMATSSCR,SMCENTER,SMPTHNTS,SMRIN,SMTS,SMTSD,SMUBRID,SMTSISCH,SMTSPAX,...,SME1ANTI,SMSPLTRD,SMBSMMRT,SME1SNSE,SME1PCTS,SMRRNART,SME1MPRT,SMNUM5CD,SMDPMPRT,SME2PCTS
0,GTEX-1117F-0003-SM-58Q7G,,B1,,,Blood,Whole Blood,13756,1188.0,,...,,,,,,,,,,
1,GTEX-1117F-0003-SM-5DWSB,,B1,,,Blood,Whole Blood,13756,1188.0,,...,,,,,,,,,,
2,GTEX-1117F-0003-SM-6WBT7,,B1,,,Blood,Whole Blood,13756,1188.0,,...,,,,,,,,,,
3,GTEX-1117F-0011-R10a-SM-AHZ7F,,"B1, A1",,,Brain,Brain - Frontal Cortex (BA9),9834,1193.0,,...,,,,,,,,,,
4,GTEX-1117F-0011-R10b-SM-CYKQ8,,"B1, A1",,7.2,Brain,Brain - Frontal Cortex (BA9),9834,1193.0,,...,,,,,,,,,,


# Get tissues names

In [6]:
gtex_tissues = gtex_sample_attrs["SMTSD"].unique()
display(len(gtex_tissues))
display(gtex_tissues)

55

array(['Whole Blood', 'Brain - Frontal Cortex (BA9)',
       'Adipose - Subcutaneous', 'Muscle - Skeletal', 'Artery - Tibial',
       'Artery - Coronary', 'Heart - Atrial Appendage',
       'Adipose - Visceral (Omentum)', 'Ovary', 'Uterus', 'Vagina',
       'Breast - Mammary Tissue', 'Skin - Not Sun Exposed (Suprapubic)',
       'Minor Salivary Gland', 'Brain - Cortex', 'Adrenal Gland',
       'Thyroid', 'Lung', 'Spleen', 'Pancreas', 'Esophagus - Muscularis',
       'Esophagus - Mucosa', 'Esophagus - Gastroesophageal Junction',
       'Stomach', 'Colon - Sigmoid', 'Small Intestine - Terminal Ileum',
       'Colon - Transverse', 'Prostate', 'Testis',
       'Skin - Sun Exposed (Lower leg)', 'Nerve - Tibial',
       'Heart - Left Ventricle', 'Pituitary', 'Brain - Cerebellum',
       'Cells - Cultured fibroblasts', 'Artery - Aorta',
       'Cells - EBV-transformed lymphocytes',
       'Brain - Cerebellar Hemisphere', 'Brain - Caudate (basal ganglia)',
       'Brain - Nucleus accumbens (ba

# Get sample IDs for each tissue

In [7]:
# first, get all sample IDs with expression data
gtex_all_sample_ids_with_expr_data = set(
    pd.read_csv(
        conf.GTEX["DATA_TPM_GCT_FILE"],
        sep="\t",
        skiprows=2,
        nrows=1,
        usecols=lambda x: x not in ("Name", "Description"),
    ).columns
)

In [8]:
len(gtex_all_sample_ids_with_expr_data)

17382

In [9]:
list(gtex_all_sample_ids_with_expr_data)[:10]

['GTEX-12WSJ-1726-SM-5LZVV',
 'GTEX-144GL-1426-SM-79OMA',
 'GTEX-11DXZ-0006-SM-5LZZH',
 'GTEX-R55C-1926-SM-2TF4K',
 'GTEX-TKQ1-1326-SM-4DXU7',
 'GTEX-1399T-0526-SM-5J1NC',
 'GTEX-ZPCL-1226-SM-4WWFF',
 'GTEX-1F48J-1626-SM-9WPOE',
 'GTEX-P4QS-0926-SM-48TBS',
 'GTEX-RNOR-0011-R8A-SM-EZ6MN']

In [10]:
# get sample IDs by tissue
sample_ids_by_tissue = {
    tissue_name: sorted(
        list(
            gtex_all_sample_ids_with_expr_data.intersection(
                set(
                    gtex_sample_attrs[gtex_sample_attrs["SMTSD"] == tissue_name][
                        "SAMPID"
                    ].tolist()
                )
            )
        )
    )
    for tissue_name in gtex_tissues
}

In [11]:
assert len(gtex_tissues) == len(sample_ids_by_tissue)

In [12]:
sample_ids_by_tissue["Whole Blood"][:10]

['GTEX-111YS-0006-SM-5NQBE',
 'GTEX-1122O-0005-SM-5O99J',
 'GTEX-1128S-0005-SM-5P9HI',
 'GTEX-113IC-0006-SM-5NQ9C',
 'GTEX-113JC-0006-SM-5O997',
 'GTEX-117XS-0005-SM-5PNU6',
 'GTEX-117YW-0005-SM-5NQ8Z',
 'GTEX-1192W-0005-SM-5NQBQ',
 'GTEX-1192X-0005-SM-5NQC3',
 'GTEX-11DXW-0006-SM-5NQ7Y']

In [13]:
# all IDs are unique
assert all(
    [
        len(sample_ids_by_tissue[tissue_name])
        == len(set(sample_ids_by_tissue[tissue_name]))
        for tissue_name in sample_ids_by_tissue.keys()
    ]
)

## Show sample size by tissue

In [14]:
tissue_sample_size = pd.DataFrame(
    [{"tissue": k, "sample_size": len(v)} for k, v in sample_ids_by_tissue.items()]
)

In [15]:
tissue_sample_size = tissue_sample_size.sort_values("sample_size", ascending=False)
display(tissue_sample_size)

Unnamed: 0,tissue,sample_size
3,Muscle - Skeletal,803
0,Whole Blood,755
29,Skin - Sun Exposed (Lower leg),701
2,Adipose - Subcutaneous,663
4,Artery - Tibial,663
16,Thyroid,653
30,Nerve - Tibial,619
12,Skin - Not Sun Exposed (Suprapubic),604
17,Lung,578
21,Esophagus - Mucosa,555


In [16]:
# some testing
_tmp = tissue_sample_size.set_index("tissue").squeeze()
assert _tmp.loc["Muscle - Skeletal"] == 803
assert _tmp.loc["Whole Blood"] == 755
assert _tmp.loc["Skin - Not Sun Exposed (Suprapubic)"] == 604
assert _tmp.loc["Kidney - Medulla"] == 4

These numbers match those you can find here: https://gtexportal.org/home/tissueSummaryPage#sampleCountsPerTissue

# Split expression data by tissue

In [17]:
pbar = tqdm(tissue_sample_size["tissue"])

gene_id_symbol_map_tuples = set()

for tissue_name in pbar:
    pbar.set_description(tissue_name)

    tissue_ids = sample_ids_by_tissue[tissue_name]
    if len(tissue_ids) == 0:
        continue

    tissue_data = pd.read_csv(
        conf.GTEX["DATA_TPM_GCT_FILE"],
        sep="\t",
        skiprows=2,
        usecols=["Name", "Description"] + tissue_ids,
    )

    tissue_data = tissue_data.rename(
        columns={
            "Name": "gene_ens_id",
            "Description": "gene_symbol",
        }
    )

    # add gene id / gene symbol to mapping variable
    gene_id_symbol_map_tuples.update(
        tissue_data[["gene_ens_id", "gene_symbol"]].itertuples(index=False)
    )

    tissue_data = tissue_data.drop(columns=["gene_symbol"]).set_index("gene_ens_id")

    assert not tissue_data.isna().any().any()
    assert tissue_data.index.is_unique
    assert tissue_data.columns.is_unique

    # save
    tissue_name_simple = simplify_string(simplify_string(tissue_name.lower()))
    tissue_data.to_pickle(path=OUTPUT_DIR / f"gtex_v8_data_{tissue_name_simple}.pkl")

Cells - Leukemia cell line (CML): 100%|██████████| 55/55 [47:45<00:00, 52.10s/it]


## Testing

Here I take a random tissue file that was just generate it, read it, and check the expected number of samples, and that the expected sample IDs are there. I also check that same expression values for gene/sample pairs are correct (I read those from a different console, just to make sure I'm not making a mistake here).

In [18]:
_tmp = pd.read_pickle(OUTPUT_DIR / "gtex_v8_data_brain_cerebellar_hemisphere.pkl")

In [19]:
# taken from GTEx webpage (see above)
assert _tmp.shape[1] == 215

In [20]:
assert "GTEX-11DXY-0011-R11a-SM-DNZZN" in _tmp.columns
assert "GTEX-WL46-0011-R11A-SM-3MJFT" in _tmp.columns
assert "GTEX-ZF28-0011-R11a-SM-4WWEI" in _tmp.columns

In [21]:
_v = _tmp.loc["ENSG00000223972.5", "GTEX-11DXY-0011-R11a-SM-DNZZN"]
assert _v == 0.04045, _v
_v = _tmp.loc["ENSG00000278267.1", "GTEX-11DXY-0011-R11a-SM-DNZZN"]
assert _v == 0.0, _v

_v = _tmp.loc["ENSG00000233327.10", "GTEX-WL46-0011-R11A-SM-3MJFT"]
assert _v == 146.4000, _v
_v = _tmp.loc["ENSG00000237118.2", "GTEX-WL46-0011-R11A-SM-3MJFT"]
assert _v == 0.3357, _v

_v = _tmp.loc["ENSG00000233327.10", "GTEX-ZF28-0011-R11a-SM-4WWEI"]
assert _v == 30.7200, _v
_v = _tmp.loc["ENSG00000186907.7", "GTEX-ZF28-0011-R11a-SM-4WWEI"]
assert _v == 0.94720, _v

# Save gene mappings

In [22]:
list(gene_id_symbol_map_tuples)[:5]

[Pandas(gene_ens_id='ENSG00000145309.5', gene_symbol='CABS1'),
 Pandas(gene_ens_id='ENSG00000175820.3', gene_symbol='CCDC168'),
 Pandas(gene_ens_id='ENSG00000083454.21', gene_symbol='P2RX5'),
 Pandas(gene_ens_id='ENSG00000071794.15', gene_symbol='HLTF'),
 Pandas(gene_ens_id='ENSG00000211918.1', gene_symbol='IGHD2-15')]

In [23]:
gene_mappings = pd.DataFrame(gene_id_symbol_map_tuples)

In [24]:
gene_mappings.shape

(56200, 2)

In [25]:
gene_mappings.head()

Unnamed: 0,gene_ens_id,gene_symbol
0,ENSG00000145309.5,CABS1
1,ENSG00000175820.3,CCDC168
2,ENSG00000083454.21,P2RX5
3,ENSG00000071794.15,HLTF
4,ENSG00000211918.1,IGHD2-15


## Save

In [26]:
output_filename = conf.GTEX["DATA_DIR"] / "gtex_gene_id_symbol_mappings.pkl"
display(output_filename)

PosixPath('/opt/data/data/gtex_v8/gtex_gene_id_symbol_mappings.pkl')

In [27]:
gene_mappings.to_pickle(output_filename)

## Testing

In [28]:
gene_mappings = pd.read_pickle(output_filename)

In [29]:
# no null
assert gene_mappings.dropna(how="any").shape == gene_mappings.shape

In [30]:
# no duplicates
assert gene_mappings.drop_duplicates().shape == gene_mappings.shape

In [31]:
# check gene id and gene symbol lengths (check no empty entries)
_tmp = gene_mappings.copy()
_tmp = _tmp.assign(id_len=gene_mappings["gene_ens_id"].apply(len))
_tmp = _tmp.assign(symbol_len=gene_mappings["gene_symbol"].apply(len))

In [32]:
_tmp_unique = _tmp["id_len"].unique()
display(_tmp_unique)

array([17, 18, 23, 24])

In [33]:
_tmp.drop_duplicates(subset=["id_len"])

Unnamed: 0,gene_ens_id,gene_symbol,id_len,symbol_len
0,ENSG00000145309.5,CABS1,17,5
2,ENSG00000083454.21,P2RX5,18,5
836,ENSG00000230542.6_PAR_Y,LINC00102,23,9
4631,ENSG00000169084.13_PAR_Y,DHRSX,24,5


Unique gene id lengths seem to be valid

In [34]:
assert set(_tmp_unique) == set([17, 18, 24, 23])

In [35]:
_tmp_unique = _tmp["symbol_len"].unique()
display(_tmp_unique)

array([ 5,  7,  4,  8,  6, 12, 13,  9, 11,  3, 10, 15,  2, 14, 16, 17,  1,
       18, 19])

No gene symbol is empty, that's good

In [36]:
assert (_tmp_unique > 0).all()

In [37]:
assert _tmp_unique.min() == 1
assert _tmp_unique.max() == 19

In [38]:
# show how different gene symbol's lengths look like
_tmp.drop_duplicates(subset=["symbol_len"]).sort_values("symbol_len")

Unnamed: 0,gene_ens_id,gene_symbol,id_len,symbol_len
5157,ENSG00000164458.9,T,17,1
152,ENSG00000274210.1,U1,17,2
44,ENSG00000198805.11,PNP,18,3
3,ENSG00000071794.15,HLTF,18,4
0,ENSG00000145309.5,CABS1,17,5
5,ENSG00000256812.1,CAPNS2,17,6
1,ENSG00000175820.3,CCDC168,17,7
4,ENSG00000211918.1,IGHD2-15,17,8
10,ENSG00000199691.1,RN7SKP173,17,9
46,ENSG00000271977.1,AC226119.4,17,10


Unique gene symbol lengths seem to be valid

In [39]:
assert gene_mappings["gene_ens_id"].unique().shape[0] == gene_mappings.shape[0]

In [40]:
# some gene symbols map to multiple gene ids
display(gene_mappings["gene_symbol"].unique().shape)
assert gene_mappings["gene_symbol"].unique().shape[0] < gene_mappings.shape[0]

(54592,)

In [41]:
# show some duplicated gene symbols
gene_mappings[gene_mappings["gene_symbol"].duplicated(keep=False)].sort_values(
    "gene_symbol"
)

Unnamed: 0,gene_ens_id,gene_symbol
30745,ENSG00000252830.2,5S_rRNA
31643,ENSG00000274759.1,5S_rRNA
3514,ENSG00000276861.1,5S_rRNA
37081,ENSG00000201285.1,5S_rRNA
38398,ENSG00000283433.1,5S_rRNA
...,...,...
55070,ENSG00000277306.1,uc_338
44089,ENSG00000275414.1,uc_338
25910,ENSG00000276673.1,uc_338
17324,ENSG00000277948.1,uc_338


In [42]:
_tmp = gene_mappings.set_index("gene_ens_id").squeeze()

In [43]:
assert _tmp.loc["ENSG00000223972.5"] == "DDX11L1"
assert _tmp.loc["ENSG00000243485.5"] == "MIR1302-2HG"
assert _tmp.loc["ENSG00000274059.1"] == "5S_rRNA"  # repeated gene
assert _tmp.loc["ENSG00000275305.1"] == "5S_rRNA"  # repeated gene