# Description

According to the settings specified below, this notebook:
 1. reads all the data from one source (GTEx, recount2, etc.) according to the gene selection method (`GENE_SELECTION_STRATEGY`),
 2. runs a quick performance test using the correlation coefficient specified (`CORRELATION_METHOD`), and
 3. computes the correlation matrix across all the genes using the correlation coefficient specified.

# Modules

In [1]:
import pandas as pd

from clustermatch import conf
from clustermatch.corr import spearman

# Settings

In [2]:
# Gene selection strategy; this value is interpolated into the input file name.
GENE_SELECTION_STRATEGY = "var_pc_log2"

In [3]:
# Correlation function used for both the performance test and the full run.
CORRELATION_METHOD = spearman

# The function's name is reused as a suffix of the output file name.
method_name = CORRELATION_METHOD.__name__
display(method_name)

'spearman'

In [4]:
# Number of genes used in the performance test. Despite the "TOP" in the name,
# the genes are drawn at random with `DataFrame.sample` (fixed seed), not taken
# from the top of the matrix.
PERFORMANCE_TEST_N_TOP_GENES = 50

# Paths

In [5]:
# Input pickle for the chosen gene selection strategy, located in the
# gene selection results directory from the project configuration.
INPUT_FILE = conf.RECOUNT2FULL["GENE_SELECTION_DIR"].joinpath(
    f"recount2_rpkm-{GENE_SELECTION_STRATEGY}.pkl"
)
display(INPUT_FILE)

# stop right away if the expected input is not on disk
assert INPUT_FILE.exists()

PosixPath('/opt/data/results/recount2full/gene_selection/recount2_rpkm-var_pc_log2.pkl')

In [6]:
# Directory where the similarity matrix is written; it is created (including
# any missing parents) if it does not exist yet.
OUTPUT_DIR = conf.RECOUNT2FULL["SIMILARITY_MATRICES_DIR"]
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/results/recount2full/similarity_matrices')

# Data loading

In [7]:
# Load the expression matrix; per the outputs shown in this notebook, rows are
# genes (Ensembl IDs) and columns are samples.
# NOTE(review): unpickling can execute arbitrary code; INPUT_FILE is presumably
# produced by this project's own pipeline - confirm before pointing it elsewhere.
data = pd.read_pickle(INPUT_FILE)

In [8]:
# (number of rows, number of columns) of the loaded matrix
data.shape

(5000, 37032)

In [9]:
# preview the first rows of the loaded matrix
data.head()

Unnamed: 0,SRP000599.SRR013549,SRP000599.SRR013550,SRP000599.SRR013551,SRP000599.SRR013552,SRP000599.SRR013553,SRP000599.SRR013554,SRP000599.SRR013555,SRP000599.SRR013556,SRP000599.SRR013557,SRP000599.SRR013558,...,SRP035599.SRR1139372,SRP035599.SRR1139393,SRP035599.SRR1139388,SRP035599.SRR1139378,SRP035599.SRR1139399,SRP035599.SRR1139386,SRP035599.SRR1139375,SRP035599.SRR1139382,SRP035599.SRR1139356,SRP035599.SRR1139370
ENSG00000283293,45.340502,2.334996,822.911013,0.0,0.0,59.262734,89.232042,52.689774,84.642278,76.550334,...,107.729307,212.671129,40.966736,90.314617,341.722891,182.996937,399.501963,223.983651,150.315163,46.134643
ENSG00000202198,45.141293,2.313833,816.267243,0.0,0.0,59.119741,88.423293,52.212223,84.268908,76.021789,...,106.752908,210.743596,40.595436,89.496055,338.625704,181.339452,395.881099,221.954266,148.95279,45.716504
ENSG00000277027,2892.902222,64.923624,2325.178875,2040.439754,737.33598,750.130979,897.111915,872.432824,828.456318,775.575618,...,14.262494,100.647909,7.627791,10.965602,130.146293,25.545523,105.272058,71.787249,45.647575,10.079155
ENSG00000269900,2929.378999,65.629851,2316.634319,2063.037408,758.072935,759.542097,908.185083,884.121024,839.435983,783.798951,...,14.049621,99.146533,7.515042,10.801936,128.203811,25.164247,103.700833,70.717475,44.967907,9.92872
ENSG00000274012,4.345167,4.489634,2414.223064,229.321454,251.869422,118.716381,991.603565,841.500002,923.032654,919.186612,...,4189.905669,813.318752,237.672987,3093.486852,1069.247832,3972.654129,1120.269285,1177.947546,784.626799,1120.676727


# Compute similarity

## Performance test

In [10]:
# randomly sample PERFORMANCE_TEST_N_TOP_GENES rows (genes); the fixed
# random_state makes the test subset reproducible across runs
test_data = data.sample(n=PERFORMANCE_TEST_N_TOP_GENES, random_state=0)

In [11]:
# the row count should equal PERFORMANCE_TEST_N_TOP_GENES
test_data.shape

(50, 37032)

In [12]:
# preview the first rows of the sampled subset
test_data.head()

Unnamed: 0,SRP000599.SRR013549,SRP000599.SRR013550,SRP000599.SRR013551,SRP000599.SRR013552,SRP000599.SRR013553,SRP000599.SRR013554,SRP000599.SRR013555,SRP000599.SRR013556,SRP000599.SRR013557,SRP000599.SRR013558,...,SRP035599.SRR1139372,SRP035599.SRR1139393,SRP035599.SRR1139388,SRP035599.SRR1139378,SRP035599.SRR1139399,SRP035599.SRR1139386,SRP035599.SRR1139375,SRP035599.SRR1139382,SRP035599.SRR1139356,SRP035599.SRR1139370
ENSG00000264281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,895.257951,1178.463237,1545.326624,944.996223,1438.281708,1080.115776,1715.689533,1559.217986,1132.025119,1879.164263
ENSG00000110104,0.0,0.568504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,24.224516,29.376698,32.23091,32.778868,33.383575,19.41177,32.288124,39.91743,30.828314,40.215054
ENSG00000160883,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.123248,0.013679,0.018089,0.025117,0.029487,0.0,0.0,1.501656,0.0,0.117545
ENSG00000080822,0.0,0.0,0.0,0.879132,2.014272,0.183379,0.0,2.229092,0.0,0.241166,...,20.742053,21.185725,18.188702,20.825416,15.496153,22.311505,17.215879,22.453095,19.407953,8.222836
ENSG00000199990,8961.747789,35.74634,422.171726,13326.124486,0.0,170.868658,188.025434,218.618043,147.456667,98.354233,...,0.0,0.369069,0.544603,0.72493,0.0,0.0,0.0,0.127091,0.0,0.752035


This is a quick performance test of the correlation measure. The next cell (`_tmp = ...`) is a warm-up call: in case the correlation method was optimized using `numba`, it needs to be compiled before performing the test, so that compilation time is not included in the measurement.

In [13]:
# Warm-up call on the first three sampled genes, so that any numba compilation
# (if the method uses it) happens before the timed run.
_tmp = CORRELATION_METHOD(test_data.iloc[:3])

# show the size and content of the small result as a quick sanity check
display(_tmp.shape)
display(_tmp)

(3, 3)

Unnamed: 0,ENSG00000264281,ENSG00000110104,ENSG00000160883
ENSG00000264281,1.0,0.482396,0.142065
ENSG00000110104,0.482396,1.0,0.044178
ENSG00000160883,0.142065,0.044178,1.0


In [14]:
# time the correlation method on the sampled subset of genes
%timeit CORRELATION_METHOD(test_data)

187 ms ± 985 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Run

In [15]:
# compute the correlation matrix across all rows (genes) of `data`
# with the configured correlation method
data_corrs = CORRELATION_METHOD(data)

In [16]:
display(data_corrs.shape)

# The result must be a square genes x genes matrix whose row and column labels
# are the input genes in their original order; checking only the row count
# would let a non-square or mislabelled matrix be saved.
assert data_corrs.shape == (data.shape[0], data.shape[0]), (
    f"Expected a square matrix with {data.shape[0]} rows/columns, "
    f"got {data_corrs.shape}"
)
assert data_corrs.index.equals(data.index), (
    "Row labels of the correlation matrix differ from the input genes"
)
assert data_corrs.columns.equals(data.index), (
    "Column labels of the correlation matrix differ from the input genes"
)

(5000, 5000)

In [17]:
# preview the first rows of the correlation matrix
data_corrs.head()

Unnamed: 0,ENSG00000283293,ENSG00000202198,ENSG00000277027,ENSG00000269900,ENSG00000274012,ENSG00000276168,ENSG00000222328,ENSG00000277209,ENSG00000199153,ENSG00000198975,...,ENSG00000091140,ENSG00000271029,ENSG00000053372,ENSG00000113845,ENSG00000231864,ENSG00000141574,ENSG00000169750,ENSG00000129559,ENSG00000237441,ENSG00000204103
ENSG00000283293,1.0,0.999995,0.775942,0.775672,0.746934,0.742736,0.597757,0.754145,0.184315,0.18393,...,-0.065746,-0.000966,0.04388,-0.056071,0.163562,0.167481,0.201552,-0.104922,0.120857,0.186545
ENSG00000202198,0.999995,1.0,0.776191,0.775924,0.746909,0.74266,0.598109,0.754395,0.185232,0.184681,...,-0.066264,-0.001348,0.043305,-0.056482,0.16285,0.166897,0.201062,-0.105502,0.120152,0.186211
ENSG00000277027,0.775942,0.776191,1.0,0.99999,0.745275,0.738482,0.660559,0.815825,0.334711,0.326562,...,-0.117128,0.033452,0.049611,-0.086283,0.188504,0.142549,0.18812,-0.154408,0.048148,0.130833
ENSG00000269900,0.775672,0.775924,0.99999,1.0,0.745103,0.738208,0.660972,0.816017,0.336193,0.327756,...,-0.117902,0.032839,0.048756,-0.087053,0.187588,0.141678,0.187281,-0.155177,0.047189,0.130198
ENSG00000274012,0.746934,0.746909,0.745275,0.745103,1.0,0.970719,0.607713,0.721553,0.217818,0.215162,...,-0.119052,0.01708,0.023516,-0.08992,0.211201,0.193238,0.214187,-0.132113,0.129397,0.199827


## Save

In [18]:
# Output name = input file stem + correlation method name, placed in OUTPUT_DIR.
output_filename = OUTPUT_DIR / f"{INPUT_FILE.stem}-{method_name}.pkl"
display(output_filename)

PosixPath('/opt/data/results/recount2full/similarity_matrices/recount2_rpkm-var_pc_log2-spearman.pkl')

In [19]:
# save the full correlation matrix as a pickle at `output_filename`
data_corrs.to_pickle(output_filename)