# Description

TODO

# Modules

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

from clustermatch import conf

# Settings

In [2]:
DATASET_CONFIG = conf.GTEX
GTEX_TISSUE = "whole_blood"
GENE_SEL_STRATEGY = "var_pc_log2"

METHOD_NAME = "mic"

In [3]:
# this cell has the "parameters" tag

# size of gene pair groups to process in parallel
CHUNK_SIZE = 100

# Paths

In [4]:
INPUT_GENE_EXPR_FILE = (
    DATASET_CONFIG["GENE_SELECTION_DIR"]
    / f"gtex_v8_data_{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
)
display(INPUT_GENE_EXPR_FILE)

assert INPUT_GENE_EXPR_FILE.exists()

PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_whole_blood-var_pc_log2.pkl')

In [5]:
GENE_PAIRS_FILE_SUFFIX = (
    DATASET_CONFIG["GENE_SELECTION_DIR"]
    / f"gtex_v8_data_{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
)
display(GENE_PAIRS_FILE_SUFFIX)

assert GENE_PAIRS_FILE_SUFFIX.exists()

PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_whole_blood-var_pc_log2.pkl')

In [6]:
INPUT_DIR = GENE_PAIRS_FILE_SUFFIX.parent / "samples"
display(INPUT_DIR)

PosixPath('/opt/data/results/gtex_v8/gene_selection/samples')

In [7]:
INPUT_GENE_PAIRS_FILE = INPUT_DIR / (
    f"{GENE_PAIRS_FILE_SUFFIX.stem}-gene_pairs-sample_" + "{sample_id}" + ".pkl"
)
display(INPUT_GENE_PAIRS_FILE)

INPUT_GENE_PAIRS_FILE_TEMPLATE = str(INPUT_GENE_PAIRS_FILE)
display(INPUT_GENE_PAIRS_FILE_TEMPLATE)

PosixPath('/opt/data/results/gtex_v8/gene_selection/samples/gtex_v8_data_whole_blood-var_pc_log2-gene_pairs-sample_{sample_id}.pkl')

'/opt/data/results/gtex_v8/gene_selection/samples/gtex_v8_data_whole_blood-var_pc_log2-gene_pairs-sample_{sample_id}.pkl'

In [8]:
OUTPUT_DIR = DATASET_CONFIG["RESULTS_DIR"] / "comparison_others" / METHOD_NAME
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/results/gtex_v8/comparison_others/mic')

In [9]:
OUTPUT_FILE_TEMPLATE = str(
    OUTPUT_DIR / (INPUT_GENE_PAIRS_FILE.name[:-4] + f"-{METHOD_NAME}.pkl")
)

display(OUTPUT_FILE_TEMPLATE)

'/opt/data/results/gtex_v8/comparison_others/mic/gtex_v8_data_whole_blood-var_pc_log2-gene_pairs-sample_{sample_id}-mic.pkl'

# Data

## Gene expression

In [10]:
gene_expr_dict = pd.read_pickle(INPUT_GENE_EXPR_FILE).T.to_dict(orient="series")

In [11]:
len(gene_expr_dict)

5000

In [12]:
gene_expr_dict[list(gene_expr_dict.keys())[0]]

GTEX-111YS-0006-SM-5NQBE       0.5623
GTEX-1122O-0005-SM-5O99J       0.8067
GTEX-1128S-0005-SM-5P9HI     116.9000
GTEX-113IC-0006-SM-5NQ9C       4.0470
GTEX-113JC-0006-SM-5O997     211.0000
                              ...    
GTEX-ZVTK-0006-SM-57WBK     1626.0000
GTEX-ZVZP-0006-SM-51MSW        0.5633
GTEX-ZVZQ-0006-SM-51MR8      515.7000
GTEX-ZXES-0005-SM-57WCB        1.1940
GTEX-ZXG5-0005-SM-57WCN     1163.0000
Name: ENSG00000169429.10, Length: 755, dtype: float64

## Gene pairs intersection

In [13]:
# intersections = pd.read_pickle(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)

In [14]:
# len(intersections)

In [15]:
# intersections["Clustermatch (high), Pearson (high), Spearman (high)"]

# Compute Maximal Information Coefficient (MIC)

## Functions

In [16]:
import warnings
from sklearn.metrics import pairwise_distances
from minepy.mine import MINE

In [17]:
def _mic(x, y):
    """
    FIXME: move to library
    """
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        mine = MINE(alpha=0.6, c=15, est="mic_approx")
        mine.compute_score(x, y)
        return mine.mic()

In [18]:
_mic(np.random.rand(10), np.random.rand(10))

0.2364527976600289

## Get all sample files

In [19]:
all_sample_files = []

sample_id = 0
sample_file = Path(INPUT_GENE_PAIRS_FILE_TEMPLATE.format(sample_id=sample_id))

while sample_file.exists():
    all_sample_files.append((sample_id, sample_file))

    sample_id += 1
    sample_file = Path(INPUT_GENE_PAIRS_FILE_TEMPLATE.format(sample_id=sample_id))

In [20]:
# all_sample_files = sorted(
#     list(
#         INPUT_GENE_PAIRS_FILE.parent.glob(INPUT_GENE_PAIRS_FILE.name.format(sample_id="*"))
#     )
# )

In [21]:
len(all_sample_files)

1

In [22]:
all_sample_files[:3]

[(0,
  PosixPath('/opt/data/results/gtex_v8/gene_selection/samples/gtex_v8_data_whole_blood-var_pc_log2-gene_pairs-sample_0.pkl'))]

## Run

In [23]:
from concurrent.futures import ProcessPoolExecutor, as_completed
from collections import defaultdict

from tqdm import tqdm

from clustermatch.utils import chunker

In [24]:
def _compute_mic(gene_sets: list):
    res = [
        _mic(gene_expr_dict[gs[0]].to_numpy(), gene_expr_dict[gs[1]].to_numpy())
        for gs in gene_sets
    ]

    return pd.Series(res, index=pd.MultiIndex.from_tuples(gene_sets))

In [25]:
# testing
# gene_set_key = "Clustermatch (high), Pearson (high), Spearman (high)"
gene_set = pd.read_pickle(all_sample_files[0][1]).sample(n=10)
display(gene_set)

_res = _compute_mic(list(gene_set.itertuples(index=False)))
display(_res.shape)
display(_res.head())

# make sure order is preserved
assert _res.index.to_list() == list(gene_set.itertuples(index=False, name=None))

Unnamed: 0,gene0,gene1
4261563,ENSG00000132965.9,ENSG00000215717.5
9231599,ENSG00000160305.17,ENSG00000146285.13
2220463,ENSG00000198851.9,ENSG00000076382.16
8687958,ENSG00000277443.2,ENSG00000211883.1
636227,ENSG00000127954.12,ENSG00000111652.9
11430922,ENSG00000112406.4,ENSG00000141497.13
4920103,ENSG00000163586.9,ENSG00000130066.16
4989708,ENSG00000160808.9,ENSG00000116209.11
5538692,ENSG00000280088.1,ENSG00000110031.12
7697744,ENSG00000204103.3,ENSG00000168887.10


(10,)

ENSG00000132965.9   ENSG00000215717.5     0.320085
ENSG00000160305.17  ENSG00000146285.13    0.377725
ENSG00000198851.9   ENSG00000076382.16    0.348644
ENSG00000277443.2   ENSG00000211883.1     0.149447
ENSG00000127954.12  ENSG00000111652.9     0.606960
dtype: float64

In [26]:
all_chunks = []

for (sample_id, sample_file) in all_sample_files:
    gene_pairs_df = pd.read_pickle(sample_file)
    gene_pairs_subset = list(gene_pairs_df.itertuples(index=False, name=None))

    for chunk in list(chunker(gene_pairs_subset, CHUNK_SIZE)):
        all_chunks.append((sample_id, chunk))

In [27]:
len(all_chunks)

330

In [28]:
all_chunks[:2]

[(0,
  [('ENSG00000161217.11', 'ENSG00000248527.1'),
   ('ENSG00000106927.11', 'ENSG00000136929.12'),
   ('ENSG00000177156.10', 'ENSG00000156711.16'),
   ('ENSG00000197965.11', 'ENSG00000272155.1'),
   ('ENSG00000278330.1', 'ENSG00000179387.9'),
   ('ENSG00000275993.2', 'ENSG00000119929.12'),
   ('ENSG00000143771.11', 'ENSG00000213341.10'),
   ('ENSG00000173391.8', 'ENSG00000110422.11'),
   ('ENSG00000138326.18', 'ENSG00000242473.1'),
   ('ENSG00000268433.1', 'ENSG00000187808.4'),
   ('ENSG00000117616.17', 'ENSG00000173020.10'),
   ('ENSG00000211959.2', 'ENSG00000008083.13'),
   ('ENSG00000161835.10', 'ENSG00000013306.15'),
   ('ENSG00000079432.7', 'ENSG00000025770.18'),
   ('ENSG00000124102.4', 'ENSG00000124357.12'),
   ('ENSG00000168903.8', 'ENSG00000204394.12'),
   ('ENSG00000101096.19', 'ENSG00000160683.4'),
   ('ENSG00000115541.10', 'ENSG00000267990.1'),
   ('ENSG00000187808.4', 'ENSG00000179869.14'),
   ('ENSG00000168010.10', 'ENSG00000197579.7'),
   ('ENSG00000136872.17', 'ENSG0

In [29]:
all_results = defaultdict(list)

with ProcessPoolExecutor(max_workers=conf.GENERAL["N_JOBS"]) as executor:
    tasks = {
        executor.submit(_compute_mic, chunk): sample_id
        for sample_id, chunk in all_chunks
    }

    pbar = tqdm(as_completed(tasks), total=len(all_chunks), ncols=100)

    for future in pbar:
        sample_id = tasks[future]
        sample_file_mic = future.result()

        all_results[sample_id].append(sample_file_mic)

100%|█████████████████████████████████████████████████████████████| 330/330 [16:40<00:00,  3.03s/it]


# Save for each sample file

In [30]:
for sample_id in all_results.keys():
    sample_file_all_results_df = pd.concat(all_results[sample_id]).sort_index()
    assert not sample_file_all_results_df.isna().any()
    sample_file_all_results_gene_pairs_set = set(sample_file_all_results_df.index)

    # testing: load input gene pairs
    sample_file_df = pd.read_pickle(
        INPUT_GENE_PAIRS_FILE_TEMPLATE.format(sample_id=sample_id)
    )
    assert sample_file_df.drop_duplicates().shape[0] == sample_file_df.shape[0]

    # testing: number of gene pairs are the same in input data and in results
    assert sample_file_df.shape[0] == sample_file_all_results_df.shape[0]

    # testing: make sure gene ids are the same in results as in input gene pairs
    sample_file_gene_pairs_list = list(
        sample_file_df.itertuples(index=False, name=None)
    )
    sample_file_gene_pairs_set = set(sample_file_gene_pairs_list)
    assert len(sample_file_gene_pairs_set) == len(
        sample_file_gene_pairs_set.intersection(sample_file_all_results_gene_pairs_set)
    )

    # save results with same order (in gene pairs) as input sample data
    sample_file_all_results_df = sample_file_all_results_df.loc[
        sample_file_gene_pairs_list
    ]
    sample_file_all_results_df.to_pickle(
        OUTPUT_FILE_TEMPLATE.format(sample_id=sample_id)
    )

In [31]:
# show how one result set looks like
display(sample_file_all_results_df.shape)
display(sample_file_all_results_df.head())

(33000,)

ENSG00000161217.11  ENSG00000248527.1     0.186836
ENSG00000106927.11  ENSG00000136929.12    0.225252
ENSG00000177156.10  ENSG00000156711.16    0.406688
ENSG00000197965.11  ENSG00000272155.1     0.593150
ENSG00000278330.1   ENSG00000179387.9     0.363143
dtype: float64