# Description

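Computes the Maximal Information Coefficient (MIC) for the sampled gene pairs of the GTEx v8 gene expression data (whole blood), and saves the results to one file per gene pairs sample file.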

# Modules

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

from clustermatch import conf

# Settings

In [2]:
# dataset configuration; its entries provide the input/output directories below
DATASET_CONFIG = conf.GTEX
# tissue and gene selection strategy; both are part of the input file names
GTEX_TISSUE = "whole_blood"
GENE_SEL_STRATEGY = "var_pc_log2"

# names the output directory and is appended to the output file names
METHOD_NAME = "mic"

In [3]:
# this cell has the "parameters" tag
# NOTE(review): presumably so a notebook runner (e.g. papermill) can override
# the values below -- confirm

# size of gene pair groups to process in parallel
# (each group is submitted as a single task to a worker process)
CHUNK_SIZE = 100

# Paths

In [4]:
# gene expression data file for the tissue and gene selection strategy set
# above (read below with pd.read_pickle)
INPUT_GENE_EXPR_FILE = (
    DATASET_CONFIG["GENE_SELECTION_DIR"]
    / f"gtex_v8_data_{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
)
display(INPUT_GENE_EXPR_FILE)

# fail early if the input file does not exist
assert INPUT_GENE_EXPR_FILE.exists()

PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_whole_blood-var_pc_log2.pkl')

In [5]:
# NOTE(review): despite its name, this is a full path, built exactly like
# INPUT_GENE_EXPR_FILE. Below, only its parent directory and its stem are used
# to derive the location and names of the gene pair sample files.
GENE_PAIRS_FILE_SUFFIX = (
    DATASET_CONFIG["GENE_SELECTION_DIR"]
    / f"gtex_v8_data_{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
)
display(GENE_PAIRS_FILE_SUFFIX)

assert GENE_PAIRS_FILE_SUFFIX.exists()

PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_whole_blood-var_pc_log2.pkl')

In [6]:
# directory with the gene pair sample files: a "samples" folder next to the
# gene expression file
INPUT_DIR = GENE_PAIRS_FILE_SUFFIX.parent / "samples"
display(INPUT_DIR)

PosixPath('/opt/data/results/gtex_v8/gene_selection/samples')

In [7]:
# path template of the gene pair sample files; "{sample_id}" is kept as a
# literal placeholder (it is outside the f-string) to be filled in later
INPUT_GENE_PAIRS_FILE = INPUT_DIR / (
    f"{GENE_PAIRS_FILE_SUFFIX.stem}-all-gene_pairs-sample_" + "{sample_id}" + ".pkl"
)
display(INPUT_GENE_PAIRS_FILE)

# same template as a string, so .format(sample_id=...) can be called on it
INPUT_GENE_PAIRS_FILE_TEMPLATE = str(INPUT_GENE_PAIRS_FILE)
display(INPUT_GENE_PAIRS_FILE_TEMPLATE)

PosixPath('/opt/data/results/gtex_v8/gene_selection/samples/gtex_v8_data_whole_blood-var_pc_log2-all-gene_pairs-sample_{sample_id}.pkl')

'/opt/data/results/gtex_v8/gene_selection/samples/gtex_v8_data_whole_blood-var_pc_log2-all-gene_pairs-sample_{sample_id}.pkl'

In [8]:
# results of this notebook go to a folder named after the method;
# it is created here if it does not exist yet
OUTPUT_DIR = DATASET_CONFIG["RESULTS_DIR"] / "comparison_others" / METHOD_NAME
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/results/gtex_v8/comparison_others/mic')

In [9]:
# output path template: the input sample file name (which still contains the
# "{sample_id}" placeholder) without its extension, followed by the method name.
# Path.stem is used instead of slicing a fixed number of characters so this
# does not depend on the length of the file extension.
OUTPUT_FILE_TEMPLATE = str(
    OUTPUT_DIR / f"{INPUT_GENE_PAIRS_FILE.stem}-{METHOD_NAME}.pkl"
)

display(OUTPUT_FILE_TEMPLATE)

'/opt/data/results/gtex_v8/comparison_others/mic/gtex_v8_data_whole_blood-var_pc_log2-all-gene_pairs-sample_{sample_id}-mic.pkl'

# Data

## Gene expression

In [10]:
# transpose the data and convert it into a dictionary that maps each column
# label (a gene ID, see below) to a pd.Series with that gene's values
gene_expr_dict = pd.read_pickle(INPUT_GENE_EXPR_FILE).T.to_dict(orient="series")

In [11]:
# number of entries (genes) in the dictionary
len(gene_expr_dict)

5000

In [12]:
# show the series stored for the first key, as an example
gene_expr_dict[list(gene_expr_dict.keys())[0]]

GTEX-111YS-0006-SM-5NQBE       0.5623
GTEX-1122O-0005-SM-5O99J       0.8067
GTEX-1128S-0005-SM-5P9HI     116.9000
GTEX-113IC-0006-SM-5NQ9C       4.0470
GTEX-113JC-0006-SM-5O997     211.0000
                              ...    
GTEX-ZVTK-0006-SM-57WBK     1626.0000
GTEX-ZVZP-0006-SM-51MSW        0.5633
GTEX-ZVZQ-0006-SM-51MR8      515.7000
GTEX-ZXES-0005-SM-57WCB        1.1940
GTEX-ZXG5-0005-SM-57WCN     1163.0000
Name: ENSG00000169429.10, Length: 755, dtype: float64

## Gene pairs intersection

In [13]:
# intersections = pd.read_pickle(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)

In [14]:
# len(intersections)

In [15]:
# intersections["Clustermatch (high), Pearson (high), Spearman (high)"]

# Compute Maximal Information Coefficient (MIC)

## Functions

In [16]:
import warnings
from sklearn.metrics import pairwise_distances
from minepy.mine import MINE

In [17]:
def _mic(x, y, alpha=0.6, c=15, est="mic_approx"):
    """
    Computes the Maximal Information Coefficient (MIC) between x and y using
    minepy's MINE.

    FIXME: move to library

    Args:
        x: numerical values of the first variable (in this notebook, a 1d
            numpy array).
        y: numerical values of the second variable (in this notebook, a 1d
            numpy array with the same length as x).
        alpha: forwarded to MINE as its `alpha` argument.
        c: forwarded to MINE as its `c` argument.
        est: forwarded to MINE as its `est` argument.

    Returns:
        The value returned by MINE.mic() after computing the score for x and y.
    """
    with warnings.catch_warnings():
        # DeprecationWarnings raised inside this block are not shown
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        mine = MINE(alpha=alpha, c=c, est=est)
        mine.compute_score(x, y)
        return mine.mic()

In [18]:
# quick check on random data (no seed is set, so the value changes between runs)
_mic(np.random.rand(10), np.random.rand(10))

0.12451124978365385

## Get all sample files

In [19]:
all_sample_files = []

# sample files are looked up by consecutive ids, starting at 0
sample_id = 0
sample_file = Path(INPUT_GENE_PAIRS_FILE_TEMPLATE.format(sample_id=sample_id))
display(sample_file)

# collect (sample id, path) pairs until the first id with no file on disk
while True:
    if not sample_file.exists():
        break

    all_sample_files.append((sample_id, sample_file))

    sample_id = sample_id + 1
    sample_file = Path(INPUT_GENE_PAIRS_FILE_TEMPLATE.format(sample_id=sample_id))

PosixPath('/opt/data/results/gtex_v8/gene_selection/samples/gtex_v8_data_whole_blood-var_pc_log2-all-gene_pairs-sample_0.pkl')

In [20]:
# all_sample_files = sorted(
#     list(
#         INPUT_GENE_PAIRS_FILE.parent.glob(INPUT_GENE_PAIRS_FILE.name.format(sample_id="*"))
#     )
# )

In [21]:
# at least one sample file is needed to continue
display(len(all_sample_files))
assert len(all_sample_files) > 0

1

In [22]:
# first (sample id, sample file path) entries
all_sample_files[:3]

[(0,
  PosixPath('/opt/data/results/gtex_v8/gene_selection/samples/gtex_v8_data_whole_blood-var_pc_log2-all-gene_pairs-sample_0.pkl'))]

## Run

In [23]:
from concurrent.futures import ProcessPoolExecutor, as_completed
from collections import defaultdict

from tqdm import tqdm

from clustermatch.utils import chunker

In [24]:
def _compute_mic(gene_sets: list):
    res = [
        _mic(gene_expr_dict[gs[0]].to_numpy(), gene_expr_dict[gs[1]].to_numpy())
        for gs in gene_sets
    ]

    return pd.Series(res, index=pd.MultiIndex.from_tuples(gene_sets))

In [25]:
# testing
# gene_set_key = "Clustermatch (high), Pearson (high), Spearman (high)"
gene_set = pd.read_pickle(all_sample_files[0][1]).sample(n=10)
display(gene_set)

# gene pairs as plain tuples (name=None), which is the same format given to
# _compute_mic when all sample files are processed
gene_set_pairs = list(gene_set.itertuples(index=False, name=None))

_res = _compute_mic(gene_set_pairs)
display(_res.shape)
display(_res.head())

# make sure order is preserved
assert _res.index.to_list() == gene_set_pairs

Unnamed: 0,gene0,gene1
1692344,ENSG00000100941.8,ENSG00000137575.11
8616440,ENSG00000182220.14,ENSG00000225101.4
8056444,ENSG00000174600.13,ENSG00000175274.18
736268,ENSG00000067066.16,ENSG00000110852.4
11383571,ENSG00000251578.1,ENSG00000198189.10
1617963,ENSG00000100483.13,ENSG00000274180.1
10506160,ENSG00000222112.1,ENSG00000079277.19
12222845,ENSG00000276597.1,ENSG00000255163.1
10048076,ENSG00000211725.3,ENSG00000204622.11
50513,ENSG00000004700.15,ENSG00000096433.10


(10,)

ENSG00000100941.8   ENSG00000137575.11    0.368722
ENSG00000182220.14  ENSG00000225101.4     0.253846
ENSG00000174600.13  ENSG00000175274.18    0.174016
ENSG00000067066.16  ENSG00000110852.4     0.240572
ENSG00000251578.1   ENSG00000198189.10    0.180615
dtype: float64

In [26]:
all_chunks = []

# split the gene pairs of each sample file into groups of CHUNK_SIZE, keeping
# track of the sample file each group comes from
for sample_id, sample_file in all_sample_files:
    gene_pairs_df = pd.read_pickle(sample_file)
    gene_pairs_subset = list(gene_pairs_df.itertuples(index=False, name=None))

    for chunk in chunker(gene_pairs_subset, CHUNK_SIZE):
        all_chunks.append((sample_id, chunk))

In [27]:
# total number of chunks (tasks) across all sample files
len(all_chunks)

1000

In [28]:
# first two entries; each one is a (sample id, list of gene pairs) tuple
all_chunks[:2]

[(0,
  [('ENSG00000181192.11', 'ENSG00000144579.7'),
   ('ENSG00000111641.11', 'ENSG00000258476.5'),
   ('ENSG00000172081.13', 'ENSG00000130429.12'),
   ('ENSG00000105255.10', 'ENSG00000231721.6'),
   ('ENSG00000153310.19', 'ENSG00000241657.1'),
   ('ENSG00000175063.16', 'ENSG00000178607.15'),
   ('ENSG00000143401.14', 'ENSG00000159588.14'),
   ('ENSG00000058668.14', 'ENSG00000162366.7'),
   ('ENSG00000169967.16', 'ENSG00000205423.11'),
   ('ENSG00000166145.14', 'ENSG00000230076.1'),
   ('ENSG00000144040.12', 'ENSG00000224383.7'),
   ('ENSG00000069399.14', 'ENSG00000149489.8'),
   ('ENSG00000117697.14', 'ENSG00000120694.19'),
   ('ENSG00000279430.1', 'ENSG00000211943.2'),
   ('ENSG00000013563.13', 'ENSG00000211885.1'),
   ('ENSG00000096088.16', 'ENSG00000110315.6'),
   ('ENSG00000140443.13', 'ENSG00000122861.15'),
   ('ENSG00000103111.14', 'ENSG00000234506.5'),
   ('ENSG00000168297.15', 'ENSG00000169045.17'),
   ('ENSG00000221988.12', 'ENSG00000277978.1'),
   ('ENSG00000099308.10', 'EN

In [29]:
# sample id -> list with the results (pd.Series) of each of its chunks
all_results = defaultdict(list)

with ProcessPoolExecutor(max_workers=conf.GENERAL["N_JOBS"]) as executor:
    # submit one task per chunk, remembering the sample id of each task
    tasks = {}
    for sample_id, chunk in all_chunks:
        tasks[executor.submit(_compute_mic, chunk)] = sample_id

    # tasks are collected as they finish, so chunk results of a sample file
    # are stored in completion order
    for future in tqdm(as_completed(tasks), total=len(all_chunks), ncols=100):
        sample_id = tasks[future]
        sample_file_mic = future.result()

        all_results[sample_id].append(sample_file_mic)

100%|███████████████████████████████████████████████████████████| 1000/1000 [50:41<00:00,  3.04s/it]


# Save for each sample file

In [30]:
for sample_id, sample_chunk_results in all_results.items():
    # merge the results of all chunks of this sample file into a single series
    sample_file_all_results_df = pd.concat(sample_chunk_results).sort_index()
    assert not sample_file_all_results_df.isna().any()
    sample_file_all_results_gene_pairs_set = set(sample_file_all_results_df.index)

    # testing: load input gene pairs, which must have no duplicated rows
    sample_file_df = pd.read_pickle(
        INPUT_GENE_PAIRS_FILE_TEMPLATE.format(sample_id=sample_id)
    )
    assert sample_file_df.shape[0] == sample_file_df.drop_duplicates().shape[0]

    # testing: input data and results have the same number of gene pairs
    assert sample_file_all_results_df.shape[0] == sample_file_df.shape[0]

    # testing: every input gene pair is present in the results
    sample_file_gene_pairs_list = list(
        sample_file_df.itertuples(index=False, name=None)
    )
    sample_file_gene_pairs_set = set(sample_file_gene_pairs_list)
    assert sample_file_gene_pairs_set.issubset(sample_file_all_results_gene_pairs_set)

    # reorder results so gene pairs follow the order of the input sample data,
    # then save them
    sample_file_all_results_df = sample_file_all_results_df.loc[
        sample_file_gene_pairs_list
    ]
    sample_file_all_results_df.to_pickle(
        OUTPUT_FILE_TEMPLATE.format(sample_id=sample_id)
    )

In [31]:
# show what one result set looks like (the one saved last in the loop above)
display(sample_file_all_results_df.shape)
display(sample_file_all_results_df.head())

(100000,)

ENSG00000181192.11  ENSG00000144579.7     0.630906
ENSG00000111641.11  ENSG00000258476.5     0.210311
ENSG00000172081.13  ENSG00000130429.12    0.643822
ENSG00000105255.10  ENSG00000231721.6     0.385814
ENSG00000153310.19  ENSG00000241657.1     0.164555
dtype: float64