# Description

According to the settings specified below, this notebook:
 1. reads the data from one source (GTEx, recount2, etc.) for the tissues listed in `TISSUES`, according to the gene selection method (`GENE_SELECTION_STRATEGY`),
 2. runs a quick performance test using the correlation coefficient specified (`CORRELATION_METHOD`), and
 3. computes the correlation matrix across all the genes using the correlation coefficient specified.

# Modules

In [1]:
from time import perf_counter, time

import pandas as pd

from ccc import conf
from ccc.utils import simplify_string
from ccc.corr import mic

# Settings

In [2]:
# gene selection strategy; input files are matched by the suffix
# "-{GENE_SELECTION_STRATEGY}.pkl" in the "Data loading" section
GENE_SELECTION_STRATEGY = "var_pc_log2"

In [3]:
# candidate tissues: the top 5 according to sample size (see
# nbs/05_preprocessing/00-gtex_v8-split_by_tissue.ipynb).
# Only the uncommented entries are processed by this notebook; currently that
# is "Whole Blood" alone.
TISSUES = [
    # "Muscle - Skeletal",
    "Whole Blood",
    # "Skin - Sun Exposed (Lower leg)",
    # "Adipose - Subcutaneous",
    # "Artery - Tibial",
]

In [4]:
# show the configured value that is passed as `n_jobs` to the correlation method
conf.GENERAL["N_JOBS"]

20

In [5]:
def mic_parallel(data):
    """
    Run `mic` on `data` using the "mic_e" estimator.

    The value of `conf.GENERAL["N_JOBS"]` is read at call time and passed as
    the `n_jobs` argument.

    Args:
        data: input forwarded unchanged to `mic`. In this notebook it is a
            DataFrame with genes in rows and samples in columns.

    Returns:
        The result of `mic`, unmodified.
    """
    n_jobs = conf.GENERAL["N_JOBS"]
    return mic(data, estimator="mic_e", n_jobs=n_jobs)


# correlation function used for both the performance test and the full run
CORRELATION_METHOD = mic_parallel

# the function's name is used as a suffix of the output file names
method_name = CORRELATION_METHOD.__name__
display(method_name)

'mic_parallel'

In [6]:
# number of leading rows (genes) of the first input file used in the
# performance test
PERFORMANCE_TEST_N_TOP_GENES = 500

# Paths

In [7]:
# directory that holds the gene selection pickle files read in "Data loading"
INPUT_DIR = conf.GTEX["GENE_SELECTION_DIR"]
display(INPUT_DIR)

# fail early if the input directory is missing
assert INPUT_DIR.exists()

PosixPath('/opt/data/results/gtex_v8/gene_selection')

In [8]:
# directory where the correlation matrices are saved; it is created (including
# parents) if it does not exist yet
OUTPUT_DIR = conf.GTEX["SIMILARITY_MATRICES_DIR"]
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/results/gtex_v8/similarity_matrices')

# Data loading

In [9]:
# substring expected in the file name of each selected tissue,
# in the form "_data_<simplified lowercase tissue name>-"
tissue_in_file_names = [
    f"_data_{simplify_string(tissue_name.lower())}-" for tissue_name in TISSUES
]

In [10]:
# files of the chosen gene selection strategy (in sorted order), keeping only
# those whose name contains the tag of one of the selected tissues
input_files = [
    candidate_file
    for candidate_file in sorted(INPUT_DIR.glob(f"*-{GENE_SELECTION_STRATEGY}.pkl"))
    if any(tissue_tag in candidate_file.name for tissue_tag in tissue_in_file_names)
]
display(len(input_files))

# exactly one input file is expected per selected tissue
assert len(input_files) == len(TISSUES), len(TISSUES)
display(input_files)

1

[PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_whole_blood-var_pc_log2.pkl')]

# Compute similarity

## Performance test

In [11]:
# the performance test only uses the first input file
display(input_files[0])
# keep the first PERFORMANCE_TEST_N_TOP_GENES rows (genes) so the test stays
# much smaller than the full run
test_data = pd.read_pickle(input_files[0]).iloc[:PERFORMANCE_TEST_N_TOP_GENES]

PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_whole_blood-var_pc_log2.pkl')

In [12]:
# (number of genes, number of samples) in the test subset
test_data.shape

(500, 755)

In [13]:
# preview the first rows of the test subset
test_data.head()

Unnamed: 0_level_0,GTEX-111YS-0006-SM-5NQBE,GTEX-1122O-0005-SM-5O99J,GTEX-1128S-0005-SM-5P9HI,GTEX-113IC-0006-SM-5NQ9C,GTEX-113JC-0006-SM-5O997,GTEX-117XS-0005-SM-5PNU6,GTEX-117YW-0005-SM-5NQ8Z,GTEX-1192W-0005-SM-5NQBQ,GTEX-1192X-0005-SM-5NQC3,GTEX-11DXW-0006-SM-5NQ7Y,...,GTEX-ZVE2-0006-SM-51MRW,GTEX-ZVP2-0005-SM-51MRK,GTEX-ZVT2-0005-SM-57WBW,GTEX-ZVT3-0006-SM-51MT9,GTEX-ZVT4-0006-SM-57WB8,GTEX-ZVTK-0006-SM-57WBK,GTEX-ZVZP-0006-SM-51MSW,GTEX-ZVZQ-0006-SM-51MR8,GTEX-ZXES-0005-SM-57WCB,GTEX-ZXG5-0005-SM-57WCN
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000169429.10,0.5623,0.8067,116.9,4.047,211.0,58.11,68.38,249.5,5.095,295.9,...,39.96,0.1393,0.2238,245.0,513.6,1626.0,0.5633,515.7,1.194,1163.0
ENSG00000135245.9,0.6529,1.385,199.2,2.266,116.7,192.3,161.5,263.5,23.54,251.9,...,114.3,1.833,0.4115,149.0,935.3,233.6,0.8882,134.0,1.12,295.7
ENSG00000163631.16,1.848,0.2503,0.08429,1.251,1348.0,9.971,101.3,95.09,1.264,119.3,...,2.092,2.11,0.03588,171.8,107.1,71.25,1.772,309.6,0.07361,17.75
ENSG00000277632.1,1.696,1.345,235.1,11.77,141.7,199.1,525.5,659.9,10.91,209.3,...,61.34,2.25,0.7231,261.2,400.0,288.5,2.696,287.5,3.323,618.9
ENSG00000239839.6,185.2,1.779,694.3,23.84,297.3,3122.0,2521.0,1504.0,80.06,652.0,...,1010.0,253.8,94.52,6083.0,2768.0,52.06,34.57,17.36,352.3,63.85


This is a quick performance test of the correlation measure. The following line (`_tmp = ...`) is the setup code, which is needed in case the correlation method was optimized using `numba` and needs to be compiled before performing the test.

In [14]:
# warm-up call on the first three genes only, so any one-time setup cost
# is paid before the timed run below
_tmp = CORRELATION_METHOD(test_data.iloc[:3])

# the result is displayed only as a quick sanity check
display(_tmp.shape)
display(_tmp)

(3, 3)

gene_ens_id,ENSG00000169429.10,ENSG00000135245.9,ENSG00000163631.16
gene_ens_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000169429.10,1.0,0.898121,0.559862
ENSG00000135245.9,0.898121,1.0,0.600884
ENSG00000163631.16,0.559862,0.600884,1.0


In [15]:
# -r1: a single repetition is enough here, since one pass over the test data
# takes several minutes
%timeit -r1 CORRELATION_METHOD(test_data)

7min 53s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## Run

In [16]:
# For each selected input file: load the data, compute the full correlation
# matrix with CORRELATION_METHOD, report the elapsed time and save the result.
for tissue_data_file in input_files:
    display(tissue_data_file.stem)

    # read
    # NOTE: pickle files must come from a trusted source (here they are read
    # from this project's gene selection directory)
    data = pd.read_pickle(tissue_data_file)

    # compute correlations
    # perf_counter() is a monotonic clock, so the measured interval is not
    # affected by system clock adjustments during this long-running step
    start_time = perf_counter()

    data_corrs = CORRELATION_METHOD(data)

    end_time = perf_counter()
    elapsed_time = end_time - start_time  # in seconds
    display(elapsed_time)

    # save as "<input file stem>-<method name>.pkl" inside OUTPUT_DIR
    output_filename = f"{tissue_data_file.stem}-{method_name}.pkl"
    data_corrs.to_pickle(path=OUTPUT_DIR / output_filename)

'gtex_v8_data_whole_blood-var_pc_log2'

43487.54033827782