# Description

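Computes the Maximal Information Coefficient (MIC) for the sampled gene pairs in each intersection group of the Clustermatch, Pearson and Spearman coefficients (GTEx v8, whole blood), and saves the results to a pickle file.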

# Modules

In [1]:
import pandas as pd
import numpy as np

from clustermatch import conf

# Settings

In [2]:
# dataset configuration; its entries are used below to build the input paths
DATASET_CONFIG = conf.GTEX
# tissue and gene selection strategy; both are part of the input file names
GTEX_TISSUE = "whole_blood"
GENE_SEL_STRATEGY = "var_pc_log2"

In [3]:
# this cell has the "parameters" tag

# size of the gene pair chunks (passed to `chunker`); each chunk is submitted
# to the process pool as a single task
CHUNK_SIZE = 50

# Paths

In [4]:
# gene expression data for the selected tissue and gene selection strategy
INPUT_GENE_EXPR_FILE = (
    DATASET_CONFIG["GENE_SELECTION_DIR"]
    / f"gtex_v8_data_{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
)
display(INPUT_GENE_EXPR_FILE)

# fail early, naming the missing file, instead of failing later on read
assert (
    INPUT_GENE_EXPR_FILE.exists()
), f"Gene expression file not found: {INPUT_GENE_EXPR_FILE}"

PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_whole_blood-var_pc_log2.pkl')

In [5]:
# sampled gene pairs for each intersection group
INPUT_GENE_PAIRS_INTERSECTIONS_FILE = (
    DATASET_CONFIG["GENE_PAIR_INTERSECTIONS"]
    / f"gene_pair_intersections-gtex_v8-{GTEX_TISSUE}-{GENE_SEL_STRATEGY}-sample.pkl"
)
display(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)

# fail early, naming the missing file, instead of failing later on read
assert (
    INPUT_GENE_PAIRS_INTERSECTIONS_FILE.exists()
), f"Gene pair intersections file not found: {INPUT_GENE_PAIRS_INTERSECTIONS_FILE}"

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2-sample.pkl')

In [6]:
# results are written next to the gene pair intersections file, using the same
# file name with a "-mic" suffix appended to its stem
OUTPUT_FILE = (
    INPUT_GENE_PAIRS_INTERSECTIONS_FILE.parent
    / f"{INPUT_GENE_PAIRS_INTERSECTIONS_FILE.stem}-mic.pkl"
)

display(OUTPUT_FILE)

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2-sample-mic.pkl')

# Data

## Gene expression

In [7]:
# transpose so each gene becomes a column, then build a dictionary that maps
# each gene ID to a pandas Series with its expression values across samples
gene_expr_dict = pd.read_pickle(INPUT_GENE_EXPR_FILE).T.to_dict(orient="series")

In [8]:
# number of genes loaded
len(gene_expr_dict)

5000

In [9]:
# preview the expression values of the first gene in the dictionary
next(iter(gene_expr_dict.values()))

GTEX-111YS-0006-SM-5NQBE       0.5623
GTEX-1122O-0005-SM-5O99J       0.8067
GTEX-1128S-0005-SM-5P9HI     116.9000
GTEX-113IC-0006-SM-5NQ9C       4.0470
GTEX-113JC-0006-SM-5O997     211.0000
                              ...    
GTEX-ZVTK-0006-SM-57WBK     1626.0000
GTEX-ZVZP-0006-SM-51MSW        0.5633
GTEX-ZVZQ-0006-SM-51MR8      515.7000
GTEX-ZXES-0005-SM-57WCB        1.1940
GTEX-ZXG5-0005-SM-57WCN     1163.0000
Name: ENSG00000169429.10, Length: 755, dtype: float64

## Gene pairs intersection

In [10]:
# mapping from intersection group name to a table of gene pairs
# (columns gene0 and gene1, as shown in the preview below)
intersections = pd.read_pickle(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)

In [11]:
# number of intersection groups
len(intersections)

16

In [12]:
# preview the gene pairs of one intersection group
intersections["Clustermatch (high), Pearson (high), Spearman (high)"]

Unnamed: 0,gene0,gene1
0,ENSG00000125726.10,ENSG00000076604.14
1,ENSG00000182541.17,ENSG00000233214.1
2,ENSG00000250966.2,ENSG00000185963.13
3,ENSG00000135930.13,ENSG00000111897.6
4,ENSG00000169891.17,ENSG00000225889.7
...,...,...
995,ENSG00000112658.7,ENSG00000274536.6
996,ENSG00000120306.9,ENSG00000151948.11
997,ENSG00000256006.1,ENSG00000162711.16
998,ENSG00000121931.15,ENSG00000138434.16


# Compute Maximal Information Coefficient (MIC)

## Functions

In [13]:
import warnings
from sklearn.metrics import pairwise_distances
from minepy.mine import MINE

In [14]:
def _mic(x, y):
    """
    Compute the Maximal Information Coefficient (MIC) between two samples.

    A new MINE estimator (alpha=0.6, c=15, est="mic_approx") is created on
    every call. DeprecationWarnings raised while the estimator is built and
    run are suppressed.

    FIXME: move to library

    Args:
        x: first sample, passed as is to MINE.compute_score.
        y: second sample, passed as is to MINE.compute_score.

    Returns:
        The value returned by MINE.mic() after computing the score on x and y.
    """
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        estimator = MINE(alpha=0.6, c=15, est="mic_approx")
        estimator.compute_score(x, y)
        mic_value = estimator.mic()

    return mic_value

In [15]:
# smoke test on random data (no seed is set here, so the value changes between runs)
_mic(np.random.rand(10), np.random.rand(10))

0.3958156020033588

## Run

In [16]:
from concurrent.futures import ProcessPoolExecutor, as_completed
from collections import defaultdict

from tqdm import tqdm

from clustermatch.utils import chunker

In [17]:
def _compute_mic(gene_sets):
    """
    Compute the MIC for each gene pair in a chunk.

    Expression values are read from the module-level `gene_expr_dict`.

    Args:
        gene_sets: list of gene pairs; the first two elements of each item
            are gene IDs present in `gene_expr_dict`.

    Returns:
        A pandas Series with the MIC value of each gene pair, indexed by
        `gene_sets`.
    """
    mic_by_pair = {}

    for pair in gene_sets:
        first_gene, second_gene = pair[0], pair[1]

        mic_by_pair[(first_gene, second_gene)] = _mic(
            gene_expr_dict[first_gene].to_numpy(),
            gene_expr_dict[second_gene].to_numpy(),
        )

    return pd.Series(mic_by_pair, index=gene_sets)

In [18]:
# testing: `_compute_mic` must return one value per gene pair, in input order
gene_set_key = "Clustermatch (high), Pearson (high), Spearman (high)"
# fixed seed so the same gene pairs are tested on every run
gene_set = intersections[gene_set_key].sample(n=10, random_state=0)

_res = _compute_mic(list(gene_set.itertuples(index=False)))
assert _res.shape[0] == gene_set.shape[0]
assert _res.index.to_list() == list(gene_set.itertuples(index=False, name=None))

In [19]:
# Split the gene pairs of every intersection group into chunks (sized by
# CHUNK_SIZE). Each (group name, chunk) entry becomes one parallel task.
# To debug on a single group, iterate over a one-element list of keys instead
# of `intersections.keys()`.
all_chunks = []

for gene_set_key in intersections.keys():
    # plain tuples (gene0, gene1), so they can be used as index values later
    gene_set = list(intersections[gene_set_key].itertuples(index=False, name=None))

    for chunk in chunker(gene_set, CHUNK_SIZE):
        all_chunks.append((gene_set_key, chunk))

In [20]:
# total number of tasks that will be submitted to the process pool
len(all_chunks)

246

In [21]:
# preview the first (group name, chunk of gene pairs) entry
all_chunks[:1]

[('Clustermatch (high), Pearson (high), Spearman (high)',
  [('ENSG00000125726.10', 'ENSG00000076604.14'),
   ('ENSG00000182541.17', 'ENSG00000233214.1'),
   ('ENSG00000250966.2', 'ENSG00000185963.13'),
   ('ENSG00000135930.13', 'ENSG00000111897.6'),
   ('ENSG00000169891.17', 'ENSG00000225889.7'),
   ('ENSG00000204681.10', 'ENSG00000275857.1'),
   ('ENSG00000167895.14', 'ENSG00000168229.3'),
   ('ENSG00000177406.4', 'ENSG00000133985.2'),
   ('ENSG00000082898.16', 'ENSG00000136816.15'),
   ('ENSG00000082258.12', 'ENSG00000186908.14'),
   ('ENSG00000118900.14', 'ENSG00000136040.8'),
   ('ENSG00000197956.9', 'ENSG00000111261.13'),
   ('ENSG00000173020.10', 'ENSG00000261512.2'),
   ('ENSG00000274717.1', 'ENSG00000166501.12'),
   ('ENSG00000154978.12', 'ENSG00000260822.1'),
   ('ENSG00000068912.13', 'ENSG00000145416.13'),
   ('ENSG00000087152.15', 'ENSG00000165801.9'),
   ('ENSG00000127586.16', 'ENSG00000277734.7'),
   ('ENSG00000058799.14', 'ENSG00000267670.1'),
   ('ENSG00000157150.4', 'E

In [22]:
# Submit one task per (gene set, chunk) and remember the position of each chunk
# in `all_chunks`. Futures complete in arbitrary order, so the results are put
# back in submission order afterwards; this keeps the output identical across
# runs.
chunk_results = {}

with ProcessPoolExecutor(max_workers=conf.GENERAL["N_JOBS"]) as executor:
    tasks = {
        executor.submit(_compute_mic, chunk): chunk_idx
        for chunk_idx, (gene_set_key, chunk) in enumerate(all_chunks)
    }

    pbar = tqdm(as_completed(tasks), total=len(all_chunks), ncols=100)

    for future in pbar:
        # .result() re-raises any exception raised in the worker process
        chunk_results[tasks[future]] = future.result()

# group the per-chunk Series by gene set, following the order of `all_chunks`
_tmp = defaultdict(list)

for chunk_idx, (gene_set_key, chunk) in enumerate(all_chunks):
    _tmp[gene_set_key].append(chunk_results[chunk_idx])

# one Series per gene set, indexed by gene pair
all_results = {k: pd.concat(v) for k, v in _tmp.items()}

100%|█████████████████████████████████████████████████████████████| 246/246 [06:12<00:00,  1.51s/it]


In [23]:
# every intersection group must have results...
assert len(all_results) == len(
    intersections.keys()
), "Some intersection groups have no MIC results"

# ...and exactly one MIC value per gene pair in the group
assert all(
    len(all_results[k]) == len(intersections[k]) for k in intersections.keys()
), "Number of MIC values differs from the number of gene pairs in some group"

# Save

In [24]:
import pickle

In [25]:
# persist the dictionary of MIC results (one entry per intersection group)
with OUTPUT_FILE.open("wb") as handle:
    pickle.dump(all_results, handle)