# Description

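This notebook reads the gene pair intersections computed for a GTEx tissue (groups of gene pairs where the Clustermatch, Pearson and Spearman coefficients agree or disagree on being high or low), takes a random sample of at most `MAX_SAMPLE_SIZE` gene pairs from each group, and saves the sampled gene pairs to a pickle file.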

# Modules

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from clustermatch.plots import jointplot
from clustermatch import conf

# Settings

In [2]:
# dataset configuration (from clustermatch.conf); it is indexed below to
# resolve the folder with the gene pair intersections
DATASET_CONFIG = conf.GTEX
# tissue and gene selection strategy; both are interpolated into the name of
# the input file
GTEX_TISSUE = "whole_blood"
GENE_SEL_STRATEGY = "var_pc_log2"

In [3]:
# maximum number of gene pairs to sample from each intersection group; groups
# with fewer gene pairs than this are kept whole
MAX_SAMPLE_SIZE = 1000

# Paths

In [4]:
# Pickle file with the gene pair intersections for the selected tissue and
# gene selection strategy.
INPUT_GENE_PAIRS_INTERSECTIONS_FILE = DATASET_CONFIG[
    "GENE_PAIR_INTERSECTIONS"
].joinpath(
    f"gene_pair_intersections-gtex_v8-{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
)
display(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)

# stop here if the input file is missing
assert INPUT_GENE_PAIRS_INTERSECTIONS_FILE.exists()

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2.pkl')

In [5]:
# The output is written next to the input file, with the "-sample" suffix
# appended to the input file name.
OUTPUT_FILE = INPUT_GENE_PAIRS_INTERSECTIONS_FILE.with_name(
    f"{INPUT_GENE_PAIRS_INTERSECTIONS_FILE.stem}-sample.pkl"
)

display(OUTPUT_FILE)

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2-sample.pkl')

# Data

## Gene pairs intersection

In [6]:
# load the dataframe with the gene pair intersections
# NOTE(review): pickle files can execute arbitrary code when loaded; only read
# files produced by a trusted pipeline
df_plot = pd.read_pickle(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)

In [7]:
# (rows, columns); each row is indexed by a pair of genes
df_plot.shape

(12497500, 9)

In [8]:
# preview: boolean "(high)"/"(low)" group flags plus the coefficient values
df_plot.head()

Unnamed: 0,Unnamed: 1,Pearson (high),Pearson (low),Spearman (high),Spearman (low),Clustermatch (high),Clustermatch (low),clustermatch,pearson,spearman
ENSG00000000419.12,ENSG00000002834.17,True,False,True,False,True,False,0.418721,0.681847,0.786595
ENSG00000000419.12,ENSG00000002919.14,True,False,True,False,True,False,0.40509,0.734699,0.816991
ENSG00000000419.12,ENSG00000002933.7,False,True,False,True,False,True,0.007466,0.013825,0.004128
ENSG00000000419.12,ENSG00000003402.19,True,False,True,False,True,False,0.391683,0.727347,0.803653
ENSG00000000419.12,ENSG00000004478.7,False,True,False,False,False,False,0.099013,0.094147,0.231269


# CHANGE - Look at specific gene pair cases

In [9]:
# alias (not a copy) of the intersections dataframe; get_gene_pairs reads the
# global name "df_r_data"
df_r_data = df_plot

In [10]:
# same content as df_plot, since df_r_data is just another name for it
df_r_data.head()

Unnamed: 0,Unnamed: 1,Pearson (high),Pearson (low),Spearman (high),Spearman (low),Clustermatch (high),Clustermatch (low),clustermatch,pearson,spearman
ENSG00000000419.12,ENSG00000002834.17,True,False,True,False,True,False,0.418721,0.681847,0.786595
ENSG00000000419.12,ENSG00000002919.14,True,False,True,False,True,False,0.40509,0.734699,0.816991
ENSG00000000419.12,ENSG00000002933.7,False,True,False,True,False,True,0.007466,0.013825,0.004128
ENSG00000000419.12,ENSG00000003402.19,True,False,True,False,True,False,0.391683,0.727347,0.803653
ENSG00000000419.12,ENSG00000004478.7,False,True,False,False,False,False,0.099013,0.094147,0.231269


In [11]:
# names of the boolean group columns, i.e. those tagged with " (high)" or
# " (low)"
df_r_data_boolean_cols = {
    col for col in df_r_data.columns if " (high)" in col or " (low)" in col
}

In [12]:
# show the names of the boolean group columns
df_r_data_boolean_cols

{'Clustermatch (high)',
 'Clustermatch (low)',
 'Pearson (high)',
 'Pearson (low)',
 'Spearman (high)',
 'Spearman (low)'}

## Functions

In [13]:
def get_gene_pairs(first_coef, query_set, data=None, boolean_cols=None):
    """
    FIXME: move this function to the library

    Selects the gene pairs that belong exactly to the groups in a query set.

    It queries a dataframe with the intersections of different groups (i.e.,
    clustermatch high, pearson low, etc) given a query set. A gene pair (row)
    is selected when all the boolean columns listed in query_set are True and
    all the remaining boolean columns are False.

    By default, the function accesses the global variables "df_r_data" (the
    dataframe with the intersections between coefficients) and
    "df_r_data_boolean_cols" (the names of its boolean group columns).

    Args:
        first_coef: the main coefficient ("clustermatch", "pearson" or "spearman")
            of interest. The final dataframe will be sorted according to this
            coefficient (in descending order).
        query_set: a collection of strings (tuple, list or set) that specifies
            a query. For example ("Clustermatch (high)", "Pearson (low)") would
            select all gene pairs for which clustermatch is high and pearson is
            low (and no other group applies). It can be empty (selects gene
            pairs that belong to no group) or contain all the boolean columns.
        data: optional dataframe to query. If None, the global variable
            "df_r_data" is used.
        boolean_cols: optional collection with the names of the boolean group
            columns. If None, the global variable "df_r_data_boolean_cols" is
            used when data is None; otherwise the names are taken from the
            columns of data that contain " (high)" or " (low)".

    Returns:
        A slice of the queried dataframe where the conditions specified in
        query_set apply, sorted by first_coef and without the boolean group
        columns.

    Raises:
        AssertionError: if query_set contains a name that is not a boolean
            group column.
    """
    if data is None:
        data = df_r_data
        if boolean_cols is None:
            boolean_cols = df_r_data_boolean_cols
    elif boolean_cols is None:
        boolean_cols = [
            c for c in data.columns if " (high)" in c or " (low)" in c
        ]

    required_cols = list(query_set)
    assert all(
        c in boolean_cols for c in required_cols
    ), "query_set contains names that are not boolean group columns"

    # Start by selecting every row and narrow the selection column by column;
    # unlike np.concatenate, this also works when either list of columns is
    # empty.
    selected = np.ones(data.shape[0], dtype=bool)

    # columns that have to be true
    for col in required_cols:
        selected &= data[col].to_numpy(dtype=bool)

    # rest of the columns, that have to be false
    for col in boolean_cols:
        if col not in required_cols:
            selected &= ~data[col].to_numpy(dtype=bool)

    _tmp_df = data[selected]

    # sort by first_coef value
    _tmp_df = _tmp_df.sort_values(first_coef, ascending=False)

    # keep only the coefficient columns (drop the boolean group flags)
    _tmp_df = _tmp_df[
        [x for x in _tmp_df.columns if "(high)" not in x and "(low)" not in x]
    ]

    return _tmp_df

# Get intersections dataframe

In [14]:
# maps a query string (group names joined by ", ") to a dataframe with the
# sampled gene pairs (columns "gene0" and "gene1")
intersections = {}

## Agreements

In [15]:
# Each query lists the groups a gene pair must belong to; get_gene_pairs also
# requires the gene pair to be outside every group that is not listed.
_queries = [
    ["Clustermatch (high)", "Pearson (high)", "Spearman (high)"],
    ["Clustermatch (high)", "Pearson (high)"],
    ["Clustermatch (high)", "Spearman (high)"],
    ["Pearson (high)", "Spearman (high)"],
    ["Clustermatch (low)", "Pearson (low)"],
    ["Clustermatch (low)", "Spearman (low)"],
    ["Pearson (low)", "Spearman (low)"],
    ["Clustermatch (low)", "Pearson (low)", "Spearman (low)"],
]

for idx, _query in enumerate(_queries):
    # the key is the list of group names joined by ", "
    _query_str = ", ".join(_query)
    assert _query_str not in intersections

    _tmp_df = get_gene_pairs("clustermatch", set(_query))

    # sample at most MAX_SAMPLE_SIZE gene pairs (the position of the query is
    # used as random seed) and keep only the gene identifiers from the index
    intersections[_query_str] = _tmp_df.sample(
        n=min(len(_tmp_df), MAX_SAMPLE_SIZE),
        random_state=idx,
    ).index.to_frame(index=False, name=["gene0", "gene1"])

In [16]:
# one entry per agreement query
assert len(intersections) == 8

In [17]:
# collect the distinct number of sampled gene pairs across the groups
_sizes = set()
for _query_str in intersections:
    _sizes.add(len(intersections[_query_str]))

In [18]:
# every agreement group had enough gene pairs to reach the maximum sample size
assert _sizes == {MAX_SAMPLE_SIZE}

In [19]:
# _query_str still holds the last key visited by the loop in the cell that
# computes _sizes
intersections[_query_str].head()

Unnamed: 0,gene0,gene1
0,ENSG00000073849.14,ENSG00000165131.6
1,ENSG00000112137.17,ENSG00000119686.9
2,ENSG00000157150.4,ENSG00000232656.7
3,ENSG00000138835.22,ENSG00000229391.7
4,ENSG00000161055.3,ENSG00000169926.10


## Disagreements

In [20]:
# Each query lists the groups a gene pair must belong to; get_gene_pairs also
# requires the gene pair to be outside every group that is not listed.
_queries = [
    ["Clustermatch (high)", "Spearman (high)", "Pearson (low)"],
    ["Clustermatch (high)", "Pearson (low)"],
    ["Clustermatch (high)", "Spearman (low)"],
    ["Clustermatch (high)", "Pearson (low)", "Spearman (low)"],
    ["Pearson (high)", "Clustermatch (low)"],
    ["Pearson (high)", "Spearman (low)"],
    ["Pearson (high)", "Clustermatch (low)", "Spearman (low)"],
    ["Spearman (high)", "Pearson (low)"],
]

for idx, _query in enumerate(_queries):
    # the key is the list of group names joined by ", "
    _query_str = ", ".join(_query)
    assert _query_str not in intersections

    _tmp_df = get_gene_pairs("clustermatch", set(_query))

    # sample at most MAX_SAMPLE_SIZE gene pairs (the position of the query is
    # used as random seed) and keep only the gene identifiers from the index
    intersections[_query_str] = _tmp_df.sample(
        n=min(len(_tmp_df), MAX_SAMPLE_SIZE),
        random_state=idx,
    ).index.to_frame(index=False, name=["gene0", "gene1"])

In [21]:
# 8 agreement queries plus 8 disagreement queries
assert len(intersections) == 16

In [22]:
# collect the distinct number of sampled gene pairs across all the groups
_sizes = set()
for _query_str in intersections:
    _sizes.add(len(intersections[_query_str]))

In [23]:
# distinct sample sizes found across the groups
_sizes

{8, 28, 87, 527, 531, 1000}

In [24]:
# expected sample sizes for this input; sizes below MAX_SAMPLE_SIZE come from
# groups with fewer gene pairs than the maximum, which are kept whole
assert _sizes == {MAX_SAMPLE_SIZE, 28, 8, 87, 531, 527}

In [25]:
# _query_str still holds the last key visited by the loop in the cell that
# computes _sizes
intersections[_query_str].head()

Unnamed: 0,gene0,gene1
0,ENSG00000115866.10,ENSG00000139194.7
1,ENSG00000133657.14,ENSG00000163739.4
2,ENSG00000166579.15,ENSG00000163739.4
3,ENSG00000108515.17,ENSG00000283063.1
4,ENSG00000177463.15,ENSG00000163739.4


# Save

In [26]:
import pickle

In [27]:
# serialize the dictionary with the sampled gene pairs of every group
with OUTPUT_FILE.open(mode="wb") as handle:
    pickle.dump(intersections, handle)