# Description

This notebook combines the coefficient values (Clustermatch, Pearson, Spearman and MIC) computed for one tissue (see `Settings` below) into a single dataframe for easier processing later.

This notebook also incorporates the MIC results; MIC was computed only on a subset of gene pairs because of its computational complexity.

# Modules loading

In [None]:
import pandas as pd

from clustermatch import conf
from clustermatch.utils import get_upper_triag

# Settings

In [None]:
# Dataset configuration (directories and filename templates) taken from the project's conf module
DATASET_CONFIG = conf.GTEX
# whole blood by default, but this is a parameters cell, so it can be changed when running papermill
GTEX_TISSUE = "whole_blood"
# gene selection strategy; used in the input file template and in the output file name
GENE_SEL_STRATEGY = "var_pc_log2"

# name of the method-specific subfolder used to build INPUT_DIR
METHOD_NAME = "mic"

# Paths

In [None]:
# The `comparison_others` folder under the dataset's results directory;
# the combined output file is written here
COMPARISONS_DIR = DATASET_CONFIG["RESULTS_DIR"] / "comparison_others"
display(COMPARISONS_DIR)

In [None]:
# Method-specific subfolder of COMPARISONS_DIR.
# NOTE(review): INPUT_DIR is not referenced again in the visible part of this
# notebook — confirm whether it is still needed.
INPUT_DIR = COMPARISONS_DIR / METHOD_NAME
display(INPUT_DIR)

In [None]:
# Path template of the similarity matrix files; its `tissue`, `gene_sel_strategy`
# and `corr_method` placeholders are filled in with str.format when each matrix is loaded
INPUT_CORR_FILE_TEMPLATE = (
    DATASET_CONFIG["SIMILARITY_MATRICES_DIR"]
    / DATASET_CONFIG["SIMILARITY_MATRIX_FILENAME_TEMPLATE"]
)
display(INPUT_CORR_FILE_TEMPLATE)

In [None]:
# NOTE(review): this cell repeats, statement for statement, the cell right before it
# (same assignment to INPUT_CORR_FILE_TEMPLATE, same display). It is harmless but
# redundant — consider removing one of the two cells.
INPUT_CORR_FILE_TEMPLATE = (
    DATASET_CONFIG["SIMILARITY_MATRICES_DIR"]
    / DATASET_CONFIG["SIMILARITY_MATRIX_FILENAME_TEMPLATE"]
)
display(INPUT_CORR_FILE_TEMPLATE)

In [None]:
# Destination of the combined dataframe; the file name encodes the tissue and
# the gene selection strategy
OUTPUT_FILE = (
    COMPARISONS_DIR / f"gtex_v8_data_{GTEX_TISSUE}-{GENE_SEL_STRATEGY}-all.pkl"
)
display(OUTPUT_FILE)

# Load data

## Clustermatch

In [None]:
# Resolve the Clustermatch similarity matrix file for the selected tissue and
# gene selection strategy, then load it.
clustermatch_input_file = str(INPUT_CORR_FILE_TEMPLATE).format(
    tissue=GTEX_TISSUE,
    gene_sel_strategy=GENE_SEL_STRATEGY,
    corr_method="clustermatch",
)
clustermatch_df = pd.read_pickle(clustermatch_input_file)

In [None]:
# Dimensions of the Clustermatch data right after loading
clustermatch_df.shape

In [None]:
# Preview the first rows of the loaded Clustermatch data
clustermatch_df.head()

In [None]:
# Apply get_upper_triag (from clustermatch.utils). Presumably this keeps only the
# upper triangle of the matrix so each pair appears once — confirm in the helper.
clustermatch_df = get_upper_triag(clustermatch_df)

In [None]:
# Flatten the matrix into a series with an unnamed two-level index
clustermatch_df = clustermatch_df.unstack().rename_axis((None, None))
# discard the entries without a value and order the remaining ones by index
clustermatch_df = clustermatch_df.dropna().sort_index()
# the series name becomes the column name when the coefficients are merged
clustermatch_df = clustermatch_df.rename("clustermatch")

In [None]:
# Number of entries left after flattening and dropping missing values
clustermatch_df.shape

In [None]:
# Preview the first entries of the flattened Clustermatch series
clustermatch_df.head()

## Pearson

In [None]:
# Resolve the Pearson similarity matrix file for the selected tissue and
# gene selection strategy, then load it.
pearson_input_file = str(INPUT_CORR_FILE_TEMPLATE).format(
    tissue=GTEX_TISSUE,
    gene_sel_strategy=GENE_SEL_STRATEGY,
    corr_method="pearson",
)
pearson_df = pd.read_pickle(pearson_input_file)

In [None]:
# Dimensions of the Pearson data right after loading
pearson_df.shape

In [None]:
# Preview the first rows of the loaded Pearson data
pearson_df.head()

In [None]:
# Apply get_upper_triag (from clustermatch.utils). Presumably this keeps only the
# upper triangle of the matrix so each pair appears once — confirm in the helper.
pearson_df = get_upper_triag(pearson_df)

In [None]:
# Flatten the matrix into a series with an unnamed two-level index
pearson_df = pearson_df.unstack().rename_axis((None, None))
# discard the entries without a value, then keep only the magnitude of the
# coefficient (its sign is dropped)
pearson_df = pearson_df.dropna().abs()
# order by index and set the name used as column name when merging
pearson_df = pearson_df.sort_index().rename("pearson")

In [None]:
# Number of entries left after flattening and dropping missing values
pearson_df.shape

In [None]:
# Preview the first entries of the flattened Pearson series
pearson_df.head()

## Spearman

In [None]:
# Resolve the Spearman similarity matrix file for the selected tissue and
# gene selection strategy, then load it.
spearman_input_file = str(INPUT_CORR_FILE_TEMPLATE).format(
    tissue=GTEX_TISSUE,
    gene_sel_strategy=GENE_SEL_STRATEGY,
    corr_method="spearman",
)
spearman_df = pd.read_pickle(spearman_input_file)

In [None]:
# Dimensions of the Spearman data right after loading
spearman_df.shape

In [None]:
# Preview the first rows of the loaded Spearman data
spearman_df.head()

In [None]:
# Apply get_upper_triag (from clustermatch.utils). Presumably this keeps only the
# upper triangle of the matrix so each pair appears once — confirm in the helper.
spearman_df = get_upper_triag(spearman_df)

In [None]:
# Flatten the matrix into a series with an unnamed two-level index
spearman_df = spearman_df.unstack().rename_axis((None, None))
# discard the entries without a value, then keep only the magnitude of the
# coefficient (its sign is dropped)
spearman_df = spearman_df.dropna().abs()
# order by index and set the name used as column name when merging
spearman_df = spearman_df.sort_index().rename("spearman")

In [None]:
# Number of entries left after flattening and dropping missing values
spearman_df.shape

In [None]:
# Preview the first entries of the flattened Spearman series
spearman_df.head()

## MIC

In [None]:
# Resolve the MIC similarity matrix file for the selected tissue and gene
# selection strategy (stored under the "mic_parallel" method name), then load it.
mic_input_file = str(INPUT_CORR_FILE_TEMPLATE).format(
    tissue=GTEX_TISSUE,
    gene_sel_strategy=GENE_SEL_STRATEGY,
    corr_method="mic_parallel",
)
mic_df = pd.read_pickle(mic_input_file)

In [None]:
# Dimensions of the MIC data right after loading
mic_df.shape

In [None]:
# Preview the first rows of the loaded MIC data
mic_df.head()

In [None]:
# Apply get_upper_triag (from clustermatch.utils). Presumably this keeps only the
# upper triangle of the matrix so each pair appears once — confirm in the helper.
mic_df = get_upper_triag(mic_df)

In [None]:
# Flatten the matrix into a series with an unnamed two-level index
mic_df = mic_df.unstack().rename_axis((None, None))
# discard the entries without a value and order the remaining ones by index
mic_df = mic_df.dropna().sort_index()
# the series name becomes the column name when the coefficients are merged
mic_df = mic_df.rename("mic")

In [None]:
# Number of entries left after flattening and dropping missing values
mic_df.shape

In [None]:
# Preview the first entries of the flattened MIC series
mic_df.head()

## Checks

In [None]:
# The distinct index entries shared with MIC must be as many as the Clustermatch
# entries, i.e. every Clustermatch entry is also present in the MIC series
assert len(set(clustermatch_df.index) & set(mic_df.index)) == len(
    clustermatch_df.index
)

In [None]:
# The distinct index entries shared with Clustermatch must be as many as the MIC
# entries, i.e. every MIC entry is also present in the Clustermatch series
assert len(set(clustermatch_df.index) & set(mic_df.index)) == len(mic_df.index)

In [None]:
# Every MIC index entry must also be present in the Pearson series
assert len(set(pearson_df.index) & set(mic_df.index)) == len(mic_df.index)

In [None]:
# Every MIC index entry must also be present in the Spearman series
assert len(set(spearman_df.index) & set(mic_df.index)) == len(mic_df.index)

## Merge

In [None]:
# Put the four coefficients side by side, one column per series, keeping only
# the index entries that are present in all of them
df = pd.concat(
    [clustermatch_df, pearson_df, spearman_df, mic_df],
    axis="columns",
    join="inner",
)
df = df.sort_index()

In [None]:
display(df.shape)
# the inner join must not have dropped any of the MIC entries
assert len(df) == len(mic_df)

In [None]:
# the merged dataframe must not contain any missing value
assert df.notna().all().all()

In [None]:
# Preview the first rows of the merged dataframe (one column per coefficient)
df.head()

# Save

In [None]:
# Write the merged dataframe to OUTPUT_FILE in pickle format
df.to_pickle(OUTPUT_FILE)