# Description

According to the settings specified below, this notebook:
 1. reads all the data from one source (GTEx, recount2, etc.; recount2 in this notebook) according to the gene selection method (`GENE_SELECTION_STRATEGY`; not used here, because there are no gene subsets for recount2),
 2. runs a quick performance test using the correlation coefficient specified (`CORRELATION_METHOD`), and
 3. computes the correlation matrix across all the genes using the correlation coefficient specified.

# Modules

In [1]:
import pandas as pd

from clustermatch import conf
from clustermatch.corr import clustermatch

# Settings

In [2]:
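# we don't have gene subsets for recount2, so no gene selection strategy is
# set in this notebook; the setting is kept here (disabled) for reference:
# GENE_SELECTION_STRATEGY = "var_raw"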

In [3]:
def clustermatch_k2(data):
    """
    Run `clustermatch` on `data` with the internal number of clusters fixed to 2.

    This thin wrapper gives the configuration its own function name, which the
    notebook reads (via `__name__`) to label the results.

    Args:
        data: input forwarded unchanged to `clustermatch`.

    Returns:
        Whatever `clustermatch` returns when called with
        `internal_n_clusters=[2]`.
    """
    return clustermatch(data, internal_n_clusters=[2])


# correlation function applied both in the performance test and in the full run
CORRELATION_METHOD = clustermatch_k2

# name of the function, reused later as a suffix of the output filename
method_name = CORRELATION_METHOD.__name__
display(method_name)

'clustermatch_k2'

In [4]:
# number of genes (rows) used in the quick performance test; despite the "TOP"
# in the name, these genes are drawn at random (no ranking is applied)
PERFORMANCE_TEST_N_TOP_GENES = 500

# Paths

In [5]:
# input data file, taken from the recount2 section of the project configuration
INPUT_FILE = conf.RECOUNT2["DATA_FILE"]
display(INPUT_FILE)

# fail early with a message naming the missing path (useful when the notebook
# is executed non-interactively and only the error is reported)
assert INPUT_FILE.exists(), f"Input file not found: {INPUT_FILE}"

PosixPath('/opt/data/data/recount2/recount_data_prep_PLIER.pkl')

In [6]:
# directory where the similarity matrix is written; it is created (including
# missing parents) if it does not exist yet
OUTPUT_DIR = conf.RECOUNT2["SIMILARITY_MATRICES_DIR"]
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/results/recount2/similarity_matrices')

# Data loading

In [7]:
# NOTE(review): unpickling can execute arbitrary code, so INPUT_FILE must come
# from a trusted source
data = pd.read_pickle(INPUT_FILE)

In [8]:
# (n_rows, n_columns); rows are genes (see the preview below), columns
# presumably samples -- TODO confirm
data.shape

(6750, 37032)

In [9]:
# preview the first rows of the input data
data.head()

Unnamed: 0,SRP000599.SRR013549,SRP000599.SRR013550,SRP000599.SRR013551,SRP000599.SRR013552,SRP000599.SRR013553,SRP000599.SRR013554,SRP000599.SRR013555,SRP000599.SRR013556,SRP000599.SRR013557,SRP000599.SRR013558,...,SRP035599.SRR1139372,SRP035599.SRR1139393,SRP035599.SRR1139388,SRP035599.SRR1139378,SRP035599.SRR1139399,SRP035599.SRR1139386,SRP035599.SRR1139375,SRP035599.SRR1139382,SRP035599.SRR1139356,SRP035599.SRR1139370
GAS6,-0.3125,-0.312931,-0.312931,-0.312931,-0.312931,-0.308253,-0.312931,-0.312931,-0.312931,-0.312931,...,-0.301711,-0.305581,-0.303344,-0.2978,-0.307122,-0.285499,-0.309599,-0.30022,-0.297667,-0.310151
MMP14,-0.328279,-0.328279,-0.328279,-0.328279,-0.328279,-0.328279,-0.328279,-0.328279,-0.328279,-0.32514,...,-0.314587,-0.322952,-0.326439,-0.325994,-0.326272,-0.322523,-0.326375,-0.326339,-0.322127,-0.327438
DSP,-0.286319,-0.286859,-0.286859,-0.286859,-0.286859,-0.286859,-0.277195,-0.256862,-0.27879,-0.269701,...,-0.286859,-0.286859,-0.286745,-0.286688,-0.286725,-0.286529,-0.286859,-0.286671,-0.286859,-0.28674
MARCKSL1,-0.536646,-0.536646,-0.536646,-0.536646,-0.536646,-0.536646,-0.536646,-0.536646,-0.536646,-0.536646,...,0.807663,1.294564,1.527655,1.404788,1.047931,0.892119,1.507099,2.458255,2.919662,1.410846
SPARC,-0.370498,-0.370498,-0.369171,-0.370498,-0.370498,-0.370498,-0.370498,-0.370498,-0.370498,-0.370498,...,-0.345409,-0.31075,-0.34812,-0.356938,-0.355206,-0.366197,-0.351174,-0.363703,-0.350825,-0.360762


# Compute similarity

## Performance test

In [10]:
# draw a reproducible (fixed seed) random sample of genes (rows) for the
# performance test; the sample size is capped at the number of available rows
# so that inputs smaller than the requested size do not raise an error
test_data = data.sample(
    n=min(PERFORMANCE_TEST_N_TOP_GENES, data.shape[0]),
    random_state=0,
)

In [11]:
# the sample keeps every column of `data` and only a subset of its rows
test_data.shape

(500, 37032)

In [12]:
# preview the first sampled genes
test_data.head()

Unnamed: 0,SRP000599.SRR013549,SRP000599.SRR013550,SRP000599.SRR013551,SRP000599.SRR013552,SRP000599.SRR013553,SRP000599.SRR013554,SRP000599.SRR013555,SRP000599.SRR013556,SRP000599.SRR013557,SRP000599.SRR013558,...,SRP035599.SRR1139372,SRP035599.SRR1139393,SRP035599.SRR1139388,SRP035599.SRR1139378,SRP035599.SRR1139399,SRP035599.SRR1139386,SRP035599.SRR1139375,SRP035599.SRR1139382,SRP035599.SRR1139356,SRP035599.SRR1139370
S1PR5,-0.266852,-0.266852,-0.266852,-0.266852,-0.266852,-0.266852,-0.266852,-0.266852,-0.266852,-0.266852,...,-0.266852,-0.266852,-0.261464,-0.262521,-0.263571,-0.232207,-0.263313,-0.263926,-0.266852,-0.266852
NDUFA12,-0.38902,-0.38902,-0.38902,-0.38902,-0.38902,-0.38902,-0.38902,-0.38902,-0.38902,-0.38902,...,0.264052,0.141801,-0.101283,0.016574,0.054545,0.109291,-0.080455,-0.039797,-0.171493,-0.039619
EXOSC10,-0.775256,-0.781191,-0.781191,-0.781191,-0.781191,-0.781191,-0.781191,-0.781191,-0.781191,-0.781191,...,-0.151619,-0.190753,-0.263083,0.006409,-0.090697,-0.19675,-0.039938,0.155149,0.076707,0.201055
ALOX15B,-0.104175,-0.104175,-0.104175,-0.104175,-0.104175,-0.104175,-0.104175,-0.104175,-0.104175,-0.104175,...,-0.102583,-0.104175,-0.104175,-0.090938,-0.104175,-0.091455,-0.097841,-0.097053,-0.098525,-0.10249
CACNB1,-0.412386,-0.412386,-0.412386,-0.412386,-0.412386,-0.412386,-0.412386,-0.412386,-0.412386,-0.412386,...,-0.002829,-0.17621,-0.124728,-0.034,-0.170579,-0.134903,-0.050322,-0.160912,-0.034017,-0.001866


This is a quick performance test of the correlation measure. The following cell (`_tmp = ...`) is setup code that runs the method once on a very small subset of the data; this is needed in case the correlation method was optimized using `numba` and has to be compiled before the test, so that compilation time is not included in the measurement.

In [13]:
# warm-up call on the first three sampled genes, so that any compilation of
# the correlation method happens before the timed run
_tmp = CORRELATION_METHOD(test_data.head(3))

# show the shape first, then the resulting matrix
display(_tmp.shape, _tmp)

(3, 3)

Unnamed: 0,S1PR5,NDUFA12,EXOSC10
S1PR5,1.0,0.003439,0.047912
NDUFA12,0.003439,1.0,0.082032
EXOSC10,0.047912,0.082032,1.0


In [14]:
# time the correlation method on the sampled genes (the warm-up call in the
# previous cell already ran the method once)
%timeit CORRELATION_METHOD(test_data)

1min 21s ± 78.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Run

In [15]:
# compute the correlation matrix across all the genes (rows of `data`)
# NOTE(review): likely long-running, given the timing measured above on the
# much smaller sample of genes
data_corrs = CORRELATION_METHOD(data)

In [16]:
display(data_corrs.shape)

# the result must be a square gene-by-gene matrix whose rows and columns are
# labeled with the same genes, in the same order, as the rows of the input
assert data_corrs.shape == (data.shape[0], data.shape[0])
assert data_corrs.index.equals(data.index)
assert data_corrs.columns.equals(data.index)

(6750, 6750)

In [17]:
# preview the first rows of the correlation matrix
data_corrs.head()

Unnamed: 0,GAS6,MMP14,DSP,MARCKSL1,SPARC,CTSD,EPAS1,PALLD,PHC2,LGALS3BP,...,LDHB,LDHC,ACAP2,ACAP3,CFL2,CFL1,NFIB,PLEKHG6,GNGT2,SERPINH1
GAS6,1.0,0.359879,0.161606,0.069152,0.179718,0.241732,0.273178,0.240247,0.22299,0.220751,...,0.008947,0.065011,0.020923,0.244929,0.129664,0.067795,0.106808,0.090599,0.099318,0.255519
MMP14,0.359879,1.0,0.144209,0.0602,0.271937,0.192011,0.258037,0.207851,0.187494,0.193339,...,0.011178,0.075366,0.033851,0.134849,0.096144,0.051084,0.068585,0.110153,0.111159,0.264664
DSP,0.161606,0.144209,1.0,0.042226,0.062718,0.048719,0.207457,0.277144,0.018145,0.147427,...,0.006362,0.060892,0.005772,0.027932,0.135008,0.003201,0.142329,0.143227,-2.7e-05,0.312919
MARCKSL1,0.069152,0.0602,0.042226,1.0,0.041695,0.066842,0.021648,0.039905,0.100069,0.043523,...,0.099796,0.05016,0.036485,0.13652,0.028806,0.131773,0.02184,0.044066,0.057527,0.099114
SPARC,0.179718,0.271937,0.062718,0.041695,1.0,0.067178,0.123818,0.169873,0.081537,0.117885,...,0.034611,0.033891,0.00314,0.040078,0.128344,0.009827,0.104421,0.013683,0.072254,0.13684


In [18]:
# output file: "<input file stem>-<correlation method name>.pkl" in OUTPUT_DIR
output_filename = OUTPUT_DIR.joinpath(f"{INPUT_FILE.stem}-{method_name}.pkl")
display(output_filename)

PosixPath('/opt/data/results/recount2/similarity_matrices/recount_data_prep_PLIER-clustermatch_k2.pkl')

In [19]:
# save the full correlation matrix as a pickle file
data_corrs.to_pickle(output_filename)