# Description

According to the settings specified below, this notebook:
 1. reads the data from one source (GTEx, recount2, etc.) for the tissues listed in `TISSUES`, according to the gene selection method (`GENE_SELECTION_STRATEGY`),
 2. runs a quick performance test using the specified correlation coefficient (`CORRELATION_METHOD`), and
 3. for each tissue, computes the correlation matrix across all the genes using the specified correlation coefficient and saves it to disk.

# Modules

In [1]:
import pandas as pd
from tqdm import tqdm

from clustermatch import conf
from clustermatch.utils import simplify_string
from clustermatch.corr import clustermatch

# Settings

In [2]:
# Gene selection strategy; it is the file name suffix used below to find the
# input files (`*-{GENE_SELECTION_STRATEGY}.pkl`).
GENE_SELECTION_STRATEGY = "var_raw"

In [3]:
# For clustermatch, only the top 5 tissues (according to sample size, see
# nbs/05_preprocessing/00-gtex_v8-split_by_tissue.ipynb) are processed.
# Each name is later lowercased and passed through `simplify_string` to match
# the input file names.
TISSUES = [
    "Muscle - Skeletal",
    "Whole Blood",
    "Skin - Sun Exposed (Lower leg)",
    "Adipose - Subcutaneous",
    "Artery - Tibial",
]

In [4]:
def clustermatch_k2(data):
    """
    Run `clustermatch` on `data` using a single internal number of clusters (k=2).

    Args:
        data: object forwarded unchanged to `clustermatch`; in this notebook it
            is a DataFrame with genes in rows.

    Returns:
        Whatever `clustermatch` returns when called with
        `internal_n_clusters=[2]`.
    """
    return clustermatch(data, internal_n_clusters=[2])


# Correlation method used for both the performance test and the full run.
CORRELATION_METHOD = clustermatch_k2

# The function's name is appended to the output file names.
method_name = CORRELATION_METHOD.__name__
display(method_name)

'clustermatch_k2'

In [5]:
# Number of genes (rows) taken from the first input file for the quick
# performance test.
PERFORMANCE_TEST_N_TOP_GENES = 500

# Paths

In [6]:
# Directory searched below for the gene selection pickle files.
INPUT_DIR = conf.GTEX["GENE_SELECTION_DIR"]
display(INPUT_DIR)

# Fail early if the input directory is missing.
assert INPUT_DIR.exists()

PosixPath('/opt/data/results/gtex_v8/gene_selection')

In [7]:
# Directory where the similarity matrices are saved; it is created (including
# parents) if it does not exist yet.
OUTPUT_DIR = conf.GTEX["SIMILARITY_MATRICES_DIR"]
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/results/gtex_v8/similarity_matrices')

# Data loading

In [8]:
# File name fragment expected for each tissue, presumably like
# "_data_whole_blood-" for "Whole Blood" (judging from the input file names;
# `simplify_string` is defined elsewhere). The "_data_" prefix and the trailing
# "-" delimit the tissue name within the file name.
tissue_in_file_names = [f"_data_{simplify_string(t.lower())}-" for t in TISSUES]

In [9]:
# All files generated with the selected gene selection strategy, sorted by path...
input_files = sorted(INPUT_DIR.glob(f"*-{GENE_SELECTION_STRATEGY}.pkl"))
# ...keeping only those whose name contains one of the selected tissues.
input_files = [
    f for f in input_files if any(tn in f.name for tn in tissue_in_file_names)
]
display(len(input_files))

# One file per selected tissue is expected; on failure, report what was
# actually found so the mismatch can be diagnosed.
assert len(input_files) == len(TISSUES), (
    f"Expected {len(TISSUES)} input files (one per tissue), "
    f"found {len(input_files)}: {[f.name for f in input_files]}"
)
display(input_files)

5

[PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_adipose_subcutaneous-var_raw.pkl'),
 PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_artery_tibial-var_raw.pkl'),
 PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_muscle_skeletal-var_raw.pkl'),
 PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_skin_sun_exposed_lower_leg-var_raw.pkl'),
 PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_whole_blood-var_raw.pkl')]

# Compute similarity

## Performance test

In [10]:
display(input_files[0])
# Keep only the first PERFORMANCE_TEST_N_TOP_GENES rows (genes) of the first
# input file, so the performance test stays quick.
test_data = pd.read_pickle(input_files[0]).iloc[:PERFORMANCE_TEST_N_TOP_GENES]

PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_adipose_subcutaneous-var_raw.pkl')

In [11]:
test_data.shape

(500, 663)

In [12]:
test_data.head()

Unnamed: 0_level_0,GTEX-1117F-0226-SM-5GZZ7,GTEX-111CU-1826-SM-5GZYN,GTEX-111FC-0226-SM-5N9B8,GTEX-111VG-2326-SM-5N9BK,GTEX-111YS-2426-SM-5GZZQ,GTEX-1122O-2026-SM-9YFMG,GTEX-1128S-2126-SM-5H12U,GTEX-113IC-0226-SM-5HL5C,GTEX-117YX-2226-SM-5EGJJ,GTEX-11DXW-0326-SM-5H11W,...,GTEX-ZXES-2026-SM-5NQ6R,GTEX-ZXG5-0226-SM-59HJI,GTEX-ZYFC-0326-SM-5NQ7H,GTEX-ZYFD-0226-SM-5NQ86,GTEX-ZYT6-0326-SM-7LG5R,GTEX-ZYVF-0226-SM-5GIEG,GTEX-ZYW4-0226-SM-5E44M,GTEX-ZYY3-0226-SM-5E45M,GTEX-ZZ64-1626-SM-5E43W,GTEX-ZZPU-2726-SM-5NQ8O
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000198938.2,19890.0,22670.0,31580.0,35680.0,43170.0,43400.0,23850.0,24240.0,33190.0,38500.0,...,38340.0,28750.0,45570.0,62970.0,37120.0,25490.0,38000.0,38310.0,28740.0,37650.0
ENSG00000198886.2,12400.0,22230.0,27330.0,18780.0,51290.0,44250.0,36100.0,19370.0,43040.0,39050.0,...,39140.0,19910.0,34190.0,22900.0,26770.0,22350.0,20110.0,36860.0,35940.0,29870.0
ENSG00000198899.2,13880.0,29860.0,30250.0,25050.0,54710.0,42500.0,35230.0,15870.0,44020.0,42060.0,...,47150.0,27420.0,24630.0,34790.0,29440.0,28640.0,26590.0,40390.0,38170.0,35170.0
ENSG00000198804.2,10790.0,11170.0,21520.0,20030.0,31460.0,32570.0,21820.0,18570.0,20670.0,20220.0,...,32630.0,21210.0,22990.0,25390.0,24380.0,13940.0,13980.0,18500.0,20760.0,31970.0
ENSG00000198888.2,15870.0,18770.0,25570.0,19760.0,38440.0,34070.0,27410.0,21560.0,29330.0,36640.0,...,37960.0,24190.0,39950.0,31940.0,25410.0,15760.0,26500.0,28270.0,30060.0,31440.0


This is a quick performance test of the correlation measure. The following cell (`_tmp = ...`) is setup code: if the correlation method was optimized using `numba`, it needs to be compiled once before performing the test; otherwise the compilation time would be included in the measurement.

In [13]:
# Warm-up call on the first 3 genes only: it triggers any pending compilation
# (e.g. numba) so that it is not part of the timing below, and it shows what
# the output looks like.
_tmp = CORRELATION_METHOD(test_data.iloc[:3])

display(_tmp.shape)
display(_tmp)

(3, 3)

gene_ens_id,ENSG00000198938.2,ENSG00000198886.2,ENSG00000198899.2
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000198938.2,1.0,0.115891,0.173306
ENSG00000198886.2,0.115891,1.0,0.394675
ENSG00000198899.2,0.173306,0.394675,1.0


In [14]:
%timeit CORRELATION_METHOD(test_data)

1.35 s ± 3.97 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Run

In [15]:
# Full run: for every selected input file, compute the correlation matrix and
# save it to OUTPUT_DIR.
pbar = tqdm(input_files, ncols=100)

for tissue_data_file in pbar:
    # show the file being processed in the progress bar
    pbar.set_description(tissue_data_file.stem)

    # read the data for this tissue
    data = pd.read_pickle(tissue_data_file)

    # compute correlations on the whole file (no row subsetting here, unlike
    # the performance test)
    data_corrs = CORRELATION_METHOD(data)

    # save as "<input file stem>-<method name>.pkl" in OUTPUT_DIR
    output_filename = f"{tissue_data_file.stem}-{method_name}.pkl"
    data_corrs.to_pickle(path=OUTPUT_DIR / output_filename)

gtex_v8_data_whole_blood-var_raw: 100%|██████████████████████████████| 5/5 [10:58<00:00, 131.62s/it]
