# Description

This notebook splits the GTEx v8 gene expression data by tissue (one output file per tissue) and saves a gene ID/symbol mapping file.

# Modules

In [None]:
import pickle
from pathlib import Path

import pandas as pd
from tqdm.notebook import tqdm

from clustermatch.utils import simplify_string
from clustermatch import conf

# Settings

In [None]:
# Folder that receives the per-tissue output files; it is created (including
# any missing parents) if it does not exist yet, then shown for reference.
OUTPUT_DIR = conf.GTEX["BASE_DIR"] / "data_by_tissue"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

# Data loading

## GTEx v8

### Sample metadata

In [None]:
# Load the tab-separated sample attributes table; the "SMTSD" and "SAMPID"
# columns are used below to group sample IDs by tissue.
gtex_sample_attrs = pd.read_csv(
    conf.GTEX["SAMPLE_ATTRS_FILE"],
    sep="\t",
)

In [None]:
# (rows, columns) of the sample attributes table
gtex_sample_attrs.shape

In [None]:
# preview the first rows of the sample attributes table
gtex_sample_attrs.head()

# Get tissue names

In [None]:
# Distinct values of the "SMTSD" column, in order of first appearance;
# show how many there are and what they are.
gtex_tissues = gtex_sample_attrs["SMTSD"].unique()
display(len(gtex_tissues), gtex_tissues)

# Get sample IDs for each tissue

In [None]:
# First, collect every sample ID that has expression data. Only the column
# names are needed, so just the header and one data row are parsed; the two
# lines before the header are skipped (presumably GCT preamble lines -- TODO
# confirm) and the two gene annotation columns are excluded.
_expr_header = pd.read_csv(
    conf.GTEX["DATA_TPM_GCT_FILE"],
    sep="\t",
    skiprows=2,
    nrows=1,
    usecols=lambda column_name: column_name not in ("Name", "Description"),
)
gtex_all_sample_ids_with_expr_data = set(_expr_header.columns)

In [None]:
# number of samples that have expression data
len(gtex_all_sample_ids_with_expr_data)

In [None]:
# peek at a few sample IDs (this is a set, so which ones are shown is arbitrary)
list(gtex_all_sample_ids_with_expr_data)[:10]

In [None]:
# get sample IDs by tissue
def _expr_sample_ids_for_tissue(tissue_name):
    """Return the sorted sample IDs of `tissue_name` that have expression data."""
    is_tissue = gtex_sample_attrs["SMTSD"] == tissue_name
    tissue_sample_ids = set(gtex_sample_attrs[is_tissue]["SAMPID"].tolist())
    return sorted(gtex_all_sample_ids_with_expr_data & tissue_sample_ids)


# tissue name -> sorted list of sample IDs (an empty list if the tissue has
# no sample with expression data)
sample_ids_by_tissue = {
    tissue_name: _expr_sample_ids_for_tissue(tissue_name)
    for tissue_name in gtex_tissues
}

In [None]:
# every tissue name must have produced exactly one dictionary entry
assert len(gtex_tissues) == len(sample_ids_by_tissue)

In [None]:
# first (sorted) sample IDs for one example tissue
sample_ids_by_tissue["Whole Blood"][:10]

In [None]:
# within each tissue, no sample ID appears more than once
assert all(
    len(tissue_ids) == len(set(tissue_ids))
    for tissue_ids in sample_ids_by_tissue.values()
)

## Show sample size by tissue

In [None]:
# one record per tissue with the number of samples that have expression data
_sample_size_records = [
    dict(tissue=tissue, sample_size=len(tissue_ids))
    for tissue, tissue_ids in sample_ids_by_tissue.items()
]
tissue_sample_size = pd.DataFrame(_sample_size_records)

In [None]:
# largest tissues first; this order is also the processing order of the
# split loop below, which iterates over the "tissue" column
tissue_sample_size = tissue_sample_size.sort_values("sample_size", ascending=False)
display(tissue_sample_size)

In [None]:
# some testing: compare a few tissues against known sample counts
_tmp = tissue_sample_size.set_index("tissue").squeeze()
_expected_sample_sizes = {
    "Muscle - Skeletal": 803,
    "Whole Blood": 755,
    "Skin - Not Sun Exposed (Suprapubic)": 604,
    "Kidney - Medulla": 4,
}
for _tissue, _expected_size in _expected_sample_sizes.items():
    assert _tmp.loc[_tissue] == _expected_size

These numbers match the per-tissue sample counts reported on the GTEx portal: https://gtexportal.org/home/tissueSummaryPage#sampleCountsPerTissue

# Split expression data by tissue

In [None]:
# (gene id, gene symbol) pairs seen across all tissues; a set, so pairs that
# repeat in every tissue are stored only once
gene_id_symbol_map_tuples = set()

pbar = tqdm(tissue_sample_size["tissue"])
for tissue_name in pbar:
    pbar.set_description(tissue_name)

    tissue_ids = sample_ids_by_tissue[tissue_name]
    if not tissue_ids:
        # no sample with expression data for this tissue: nothing to save
        continue

    # read only the gene annotation columns plus this tissue's samples (the
    # expression file is read again for every tissue), then rename the
    # annotation columns
    tissue_data = pd.read_csv(
        conf.GTEX["DATA_TPM_GCT_FILE"],
        sep="\t",
        skiprows=2,
        usecols=["Name", "Description", *tissue_ids],
    ).rename(columns={"Name": "gene_ens_id", "Description": "gene_symbol"})

    # add gene id / gene symbol to mapping variable
    gene_id_symbol_map_tuples.update(
        tissue_data[["gene_ens_id", "gene_symbol"]].itertuples(index=False)
    )

    # keep only expression values, indexed by gene id (genes x samples)
    tissue_data = tissue_data.set_index("gene_ens_id").drop(columns=["gene_symbol"])

    assert tissue_data.index.is_unique
    assert tissue_data.columns.is_unique

    # save
    tissue_name_simple = simplify_string(simplify_string(tissue_name.lower()))
    output_file = OUTPUT_DIR / f"gtex_v8_data_{tissue_name_simple}.pkl"
    tissue_data.to_pickle(path=output_file)

## Testing

In [None]:
# reload one of the files written above to check its content
_tmp = pd.read_pickle(OUTPUT_DIR / "gtex_v8_data_brain_cerebellar_hemisphere.pkl")

In [None]:
# expected number of samples (columns) for this tissue,
# taken from GTEx webpage (see above)
assert _tmp.shape[1] == 215

In [None]:
# a few sample IDs that must be present as columns for this tissue
for _sample_id in (
    "GTEX-11DXY-0011-R11a-SM-DNZZN",
    "GTEX-WL46-0011-R11A-SM-3MJFT",
    "GTEX-ZF28-0011-R11a-SM-4WWEI",
):
    assert _sample_id in _tmp.columns

In [None]:
# spot-check individual expression values: (gene id, sample id, expected value)
_expected_values = [
    ("ENSG00000223972.5", "GTEX-11DXY-0011-R11a-SM-DNZZN", 0.04045),
    ("ENSG00000278267.1", "GTEX-11DXY-0011-R11a-SM-DNZZN", 0.0),
    ("ENSG00000233327.10", "GTEX-WL46-0011-R11A-SM-3MJFT", 146.4000),
    ("ENSG00000237118.2", "GTEX-WL46-0011-R11A-SM-3MJFT", 0.3357),
    ("ENSG00000233327.10", "GTEX-ZF28-0011-R11a-SM-4WWEI", 30.7200),
    ("ENSG00000186907.7", "GTEX-ZF28-0011-R11a-SM-4WWEI", 0.94720),
]
for _gene_id, _sample_id, _expected in _expected_values:
    _v = _tmp.loc[_gene_id, _sample_id]
    # on failure, report the value actually stored
    assert _v == _expected, _v

# Save gene mappings

In [None]:
# peek at a few (gene id, gene symbol) pairs (set order is arbitrary)
list(gene_id_symbol_map_tuples)[:5]

In [None]:
# One row per (gene id, gene symbol) pair. The pairs come from
# `itertuples(index=False)` over the "gene_ens_id"/"gene_symbol" columns;
# rows follow the set's iteration order, so their order is arbitrary.
_gene_id_symbol_pairs = list(gene_id_symbol_map_tuples)
gene_mappings = pd.DataFrame(_gene_id_symbol_pairs)

In [None]:
# (number of gene id/symbol pairs, number of columns)
gene_mappings.shape

In [None]:
# preview the first rows of the mapping table
gene_mappings.head()

## Save

In [None]:
# destination of the gene id/symbol mapping table
output_filename = conf.GTEX["BASE_DIR"] / "gtex_gene_id_symbol_mappings.pkl"
display(output_filename)

In [None]:
# write the mapping table to disk as a pickle
gene_mappings.to_pickle(output_filename)

## Testing

In [None]:
# reload the file just written so the checks below run on what was saved
gene_mappings = pd.read_pickle(output_filename)

In [None]:
# no null values in any column
assert not gene_mappings.isna().any().any()

In [None]:
# no fully duplicated rows
assert not gene_mappings.duplicated().any()

In [None]:
# add the string length of each gene id and gene symbol as new columns
# (used below to check that there are no empty entries)
_tmp = gene_mappings.assign(
    id_len=gene_mappings["gene_ens_id"].map(len),
    symbol_len=gene_mappings["gene_symbol"].map(len),
)

In [None]:
# distinct gene id lengths, in order of first appearance in the table
_tmp_unique = _tmp["id_len"].unique()
display(_tmp_unique)

In [None]:
# one example row for each distinct gene id length
_tmp.drop_duplicates(subset=["id_len"])

Unique gene id lengths seem to be valid

In [None]:
# The mapping table is built from a set, so its row order (and therefore the
# order in which `unique()` first encounters each length) is not stable across
# runs. Compare the lengths independently of their order of appearance.
assert sorted(_tmp_unique.tolist()) == [17, 18, 23, 24]

In [None]:
# distinct gene symbol lengths, in order of first appearance in the table
_tmp_unique = _tmp["symbol_len"].unique()
display(_tmp_unique)

No gene symbol is empty, that's good

In [None]:
# every gene symbol has at least one character
assert all(symbol_len > 0 for symbol_len in _tmp_unique)

In [None]:
# shortest and longest gene symbol lengths found in the table
_shortest, _longest = _tmp_unique.min(), _tmp_unique.max()
assert (_shortest, _longest) == (1, 19)

In [None]:
# show one example gene symbol for each distinct symbol length
_tmp.drop_duplicates(subset=["symbol_len"]).sort_values("symbol_len")

Unique gene symbol lengths seem to be valid

In [None]:
# each gene id appears in exactly one row
assert gene_mappings["gene_ens_id"].is_unique

In [None]:
# some gene symbols map to multiple gene ids, so there are fewer distinct
# symbols than rows
_unique_symbols_shape = gene_mappings["gene_symbol"].unique().shape
display(_unique_symbols_shape)
assert _unique_symbols_shape[0] < gene_mappings.shape[0]

In [None]:
# show the rows whose gene symbol is shared with at least one other row
_shared_symbol = gene_mappings["gene_symbol"].duplicated(keep=False)
gene_mappings[_shared_symbol].sort_values("gene_symbol")

In [None]:
# index by gene id and squeeze the remaining single column, so that
# `_tmp.loc[gene_id]` gives the gene symbol
_tmp = gene_mappings.set_index("gene_ens_id").squeeze()

In [None]:
# spot-check a few gene id -> gene symbol mappings
_expected_symbols = {
    "ENSG00000223972.5": "DDX11L1",
    "ENSG00000243485.5": "MIR1302-2HG",
    "ENSG00000274059.1": "5S_rRNA",  # repeated gene
    "ENSG00000275305.1": "5S_rRNA",  # repeated gene
}
for _gene_id, _expected_symbol in _expected_symbols.items():
    assert _tmp.loc[_gene_id] == _expected_symbol