# Description

According to the settings specified below, this notebook:
 1. reads all the data from one source (GTEx, recount2, etc.) according to the gene selection method (`GENE_SELECTION_STRATEGY`),
 2. runs a quick performance test using the correlation coefficient specified (`CORRELATION_METHOD`), and
 3. computes, for each input file (tissue), the correlation matrix across all the genes using the correlation coefficient specified, and saves it to disk.

# Modules

In [1]:
import pandas as pd
from tqdm import tqdm

from clustermatch import conf
from clustermatch.corr import pearson

# Settings

In [2]:
# Gene selection strategy; input files are matched by the suffix "-<strategy>.pkl"
GENE_SELECTION_STRATEGY = "var_pc_log2"

In [3]:
# Correlation function used for both the performance test and the full run
CORRELATION_METHOD = pearson

# The function's name is appended to the name of each output file
method_name = CORRELATION_METHOD.__name__
display(method_name)

'pearson'

In [4]:
# Number of rows (genes) taken from the top of the first input file for the performance test
PERFORMANCE_TEST_N_TOP_GENES = 500

# Paths

In [5]:
# Directory containing the gene selection results read by this notebook
INPUT_DIR = conf.GTEX["GENE_SELECTION_DIR"]
display(INPUT_DIR)

# Fail early if the input directory is missing
assert INPUT_DIR.exists()

PosixPath('/opt/data/results/gtex_v8/gene_selection')

In [6]:
# Directory where the correlation matrices are saved; created (with parents) if missing
OUTPUT_DIR = conf.GTEX["SIMILARITY_MATRICES_DIR"]
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/results/gtex_v8/similarity_matrices')

# Data loading

In [7]:
# Collect, in sorted order, every file produced with the selected strategy
input_file_pattern = f"*-{GENE_SELECTION_STRATEGY}.pkl"
input_files = sorted(INPUT_DIR.glob(input_file_pattern))
display(len(input_files))

# There must be exactly one input file per tissue
assert len(input_files) == conf.GTEX["N_TISSUES"], len(input_files)
display(input_files[:5])

54

[PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_adipose_subcutaneous-var_pc_log2.pkl'),
 PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_adipose_visceral_omentum-var_pc_log2.pkl'),
 PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_adrenal_gland-var_pc_log2.pkl'),
 PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_artery_aorta-var_pc_log2.pkl'),
 PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_artery_coronary-var_pc_log2.pkl')]

# Compute similarity

## Performance test

In [8]:
# The performance test uses only the first input file
test_data_file = input_files[0]
display(test_data_file)

# Keep just the first PERFORMANCE_TEST_N_TOP_GENES rows to make the test quick
test_data_full = pd.read_pickle(test_data_file)
test_data = test_data_full.iloc[:PERFORMANCE_TEST_N_TOP_GENES]

PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_adipose_subcutaneous-var_pc_log2.pkl')

In [9]:
# (number of genes kept for the test, number of samples)
test_data.shape

(500, 663)

In [10]:
# Preview the first rows of the test data
test_data.head()

Unnamed: 0_level_0,GTEX-1117F-0226-SM-5GZZ7,GTEX-111CU-1826-SM-5GZYN,GTEX-111FC-0226-SM-5N9B8,GTEX-111VG-2326-SM-5N9BK,GTEX-111YS-2426-SM-5GZZQ,GTEX-1122O-2026-SM-9YFMG,GTEX-1128S-2126-SM-5H12U,GTEX-113IC-0226-SM-5HL5C,GTEX-117YX-2226-SM-5EGJJ,GTEX-11DXW-0326-SM-5H11W,...,GTEX-ZXES-2026-SM-5NQ6R,GTEX-ZXG5-0226-SM-59HJI,GTEX-ZYFC-0326-SM-5NQ7H,GTEX-ZYFD-0226-SM-5NQ86,GTEX-ZYT6-0326-SM-7LG5R,GTEX-ZYVF-0226-SM-5GIEG,GTEX-ZYW4-0226-SM-5E44M,GTEX-ZYY3-0226-SM-5E45M,GTEX-ZZ64-1626-SM-5E43W,GTEX-ZZPU-2726-SM-5NQ8O
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000129824.15,0.9301,270.4,345.8,242.2,180.8,0.1904,0.1864,294.4,332.6,216.0,...,0.2916,311.7,230.0,302.2,316.5,0.2567,340.2,0.6962,269.1,0.1917
ENSG00000149968.11,100.4,785.1,14.0,44.48,17.96,2.567,0.1841,46.91,13.95,0.1953,...,3.948,108.8,3.823,6.617,11.44,7.619,27.43,21.02,122.6,10.82
ENSG00000134184.12,75.68,0.2281,0.207,0.0,36.93,0.1651,0.06058,174.0,0.1429,56.47,...,54.92,46.23,0.1468,0.04705,67.24,0.4005,0.5507,56.06,0.4048,0.0
ENSG00000224114.1,1.15,0.8068,0.5856,74.28,0.314,0.6115,49.95,139.6,83.54,1.364,...,0.1915,5.593,60.02,61.04,1.596,81.6,149.4,0.2032,1.227,0.5036
ENSG00000173432.10,2765.0,241.1,1.522,99.98,921.4,2859.0,3874.0,21.41,1338.0,61.48,...,1666.0,165.3,81.97,180.9,24.65,276.2,53.54,1436.0,140.5,756.3


This is a quick performance test of the correlation measure. The following line (`_tmp = ...`) is the setup code, which is needed in case the correlation method was optimized using `numba` and needs to be compiled before performing the test.

In [11]:
# Warm-up call on the first three rows, so that any compilation needed by the
# correlation method happens before it is timed
_tmp = CORRELATION_METHOD(test_data.iloc[:3])

# Show the shape and content of the result as a quick sanity check
display(_tmp.shape)
display(_tmp)

(3, 3)

gene_ens_id,ENSG00000129824.15,ENSG00000149968.11,ENSG00000134184.12
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000129824.15,1.0,0.142708,0.044838
ENSG00000149968.11,0.142708,1.0,-0.007867
ENSG00000134184.12,0.044838,-0.007867,1.0


In [12]:
# Time the correlation method on the performance-test subset
%timeit CORRELATION_METHOD(test_data)

48.8 ms ± 288 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Run

In [13]:
# Compute and save one correlation matrix per input file
pbar = tqdm(input_files, ncols=100)

for tissue_data_file in pbar:
    # show the file currently being processed in the progress bar
    pbar.set_description(tissue_data_file.stem)

    # load the data stored in this input file
    data = pd.read_pickle(tissue_data_file)

    # correlation matrix as returned by the selected method
    data_corrs = CORRELATION_METHOD(data)

    # output name: input file stem plus the correlation method name
    output_filename = f"{tissue_data_file.stem}-{method_name}.pkl"
    output_file = OUTPUT_DIR / output_filename
    data_corrs.to_pickle(path=output_file)

gtex_v8_data_whole_blood-var_pc_log2: 100%|█████████████████████████| 54/54 [03:17<00:00,  3.66s/it]
