# Description

From the intersection groups (visualized in the UpSet plot), this notebook samples gene pairs from the "Agreements", that is, only those intersection groups where the methods (Clustermatch, Pearson and Spearman) agree.

# Modules

In [1]:
import pandas as pd
import numpy as np

from clustermatch import conf

# Settings

In [2]:
# configuration of the dataset; the input directory is looked up from it below
DATASET_CONFIG = conf.GTEX
# tissue and gene selection strategy; both are part of the input file name
GTEX_TISSUE = "whole_blood"
GENE_SEL_STRATEGY = "var_pc_log2"

In [3]:
# amount of gene pairs to sample
SAMPLE_SIZE = 33000

# number of samples to take; sample ids go from 0 to N_SAMPLES - 1 and each id is
# also used as the random seed of its sample
# NOTE(review): the stored output of the sampling cell lists sample ids 0-9, which
# does not match this value -- confirm the intended number and re-run
N_SAMPLES = 1

# Paths

In [4]:
# input file with the gene pair intersections; its name encodes the tissue and
# the gene selection strategy chosen in the settings
INPUT_GENE_PAIRS_INTERSECTIONS_FILE = (
    DATASET_CONFIG["GENE_PAIR_INTERSECTIONS"]
    / f"gene_pair_intersections-gtex_v8-{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
)
display(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)

# fail early if the input file is missing
assert INPUT_GENE_PAIRS_INTERSECTIONS_FILE.exists()

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2.pkl')

In [5]:
# samples are written to a "samples" folder next to the input file;
# the folder is created if it does not exist yet
OUTPUT_DIR = INPUT_GENE_PAIRS_INTERSECTIONS_FILE.parent / "samples"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/results/gtex_v8/gene_pair_intersections/samples')

In [6]:
# output file name: input file stem + sample suffix. The "{sample_id}" placeholder
# is a plain string (not part of the f-string), so it stays in the template and is
# filled in later with str.format, once per sample
OUTPUT_FILE_TEMPLATE = str(
    OUTPUT_DIR
    / (
        f"{INPUT_GENE_PAIRS_INTERSECTIONS_FILE.stem}-agreements_sample_"
        + "{sample_id}"
        + ".pkl"
    )
)

display(OUTPUT_FILE_TEMPLATE)

'/opt/data/results/gtex_v8/gene_pair_intersections/samples/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2-agreements_sample_{sample_id}.pkl'

# Data

## Gene pairs intersection data

In [7]:
def is_intersection_column(column_name):
    """
    Tells whether a column name refers to an intersection group.

    Args:
        column_name: name of a column.

    Returns:
        True if the name contains the " (high)" or the " (low)" marker,
        False otherwise.
    """
    return any(marker in column_name for marker in (" (high)", " (low)"))

In [8]:
# load the intersections dataframe from the pickle file
# NOTE: unpickling can execute arbitrary code, so only read files from trusted sources
gene_pairs_intersections = pd.read_pickle(INPUT_GENE_PAIRS_INTERSECTIONS_FILE)

In [9]:
# keep only the intersection group columns ("... (high)" / "... (low)")
gene_pairs_intersections = gene_pairs_intersections[
    list(filter(is_intersection_column, gene_pairs_intersections.columns))
]

In [10]:
gene_pairs_intersections.shape

(12497500, 6)

In [11]:
gene_pairs_intersections.head()

Unnamed: 0,Unnamed: 1,Pearson (high),Pearson (low),Spearman (high),Spearman (low),Clustermatch (high),Clustermatch (low)
ENSG00000000419.12,ENSG00000002834.17,True,False,True,False,True,False
ENSG00000000419.12,ENSG00000002919.14,True,False,True,False,True,False
ENSG00000000419.12,ENSG00000002933.7,False,True,False,True,False,True
ENSG00000000419.12,ENSG00000003402.19,True,False,True,False,True,False
ENSG00000000419.12,ENSG00000004478.7,False,True,False,False,False,False


## Gene pairs intersection - gene pairs

In [12]:
# gene_pairs_df = gene_pairs_intersections.rename_axis(("gene0", "gene1")).index.to_frame(index=False)

In [13]:
# gene_pairs_df.shape

In [14]:
# gene_pairs_df.head()

# Functions

In [15]:
# df_r_data = df_plot

In [16]:
# df_r_data.head()

In [17]:
# df_r_data_boolean_cols = set(
#     [x for x in df_r_data.columns if " (high)" in x or " (low)" in x]
# )

In [18]:
# df_r_data_boolean_cols

In [19]:
def get_gene_pairs(gene_pairs_intersections, query_set):
    """
    Returns the gene pairs that belong to exactly one intersection group.

    A gene pair (row) is selected when all the columns listed in query_set are
    True and all the remaining columns of the dataframe are False.

    Args:
        gene_pairs_intersections: a dataframe with gene pairs in rows (two-level
            index) and intersection groups as columns (which are boolean). No
            other columns are allowed.
        query_set: a collection (set, tuple or list) of column names that
            specifies a query. For example
            ("Clustermatch (high)", "Pearson (low)") would select all gene pairs
            for which clustermatch is high, pearson is low and every other
            column is False.

    Returns:
        A new dataframe with two columns, "gene0" and "gene1", taken from the
        index of the rows where the conditions specified in query_set apply.
        It has one row per selected gene pair and a fresh integer index.

    Raises:
        AssertionError: if query_set contains names that are not columns of
            the input dataframe.
    """
    unknown_columns = [
        x for x in query_set if x not in gene_pairs_intersections.columns
    ]
    assert not unknown_columns, f"Columns not found: {unknown_columns}"

    columns_true = [c for c in gene_pairs_intersections.columns if c in query_set]
    columns_false = [
        c for c in gene_pairs_intersections.columns if c not in query_set
    ]

    # Selecting an empty list of columns gives an (n_rows, 0) array, for which
    # all(axis=1) is True and any(axis=1) is False for every row. This way an
    # empty query, or a query that covers all columns, works without special
    # cases. Values are cast to bool so the negation below is always a logical
    # "not", even if a column is stored with object dtype.
    values_true = gene_pairs_intersections[columns_true].to_numpy(dtype=bool)
    values_false = gene_pairs_intersections[columns_false].to_numpy(dtype=bool)

    query = values_true.all(axis=1) & ~values_false.any(axis=1)

    # only the index (the gene pairs) is needed, not the boolean columns
    return (
        gene_pairs_intersections.index[query]
        .set_names(["gene0", "gene1"])
        .to_frame(index=False)
    )

In [20]:
# sanity check of get_gene_pairs: gene pairs where the three "(high)" columns
# are True and all the "(low)" columns are False
_tmp = get_gene_pairs(
    gene_pairs_intersections,
    {
        "Clustermatch (high)",
        "Pearson (high)",
        "Spearman (high)",
    },
)

display(_tmp.dtypes)
display(_tmp.shape)
display(_tmp.head())

# this group is expected to have more than 3.12 million gene pairs
assert _tmp.shape[0] > int(3.12e6)

gene0    object
gene1    object
dtype: object

(3120576, 2)

Unnamed: 0,gene0,gene1
0,ENSG00000000419.12,ENSG00000002834.17
1,ENSG00000000419.12,ENSG00000002919.14
2,ENSG00000000419.12,ENSG00000003402.19
3,ENSG00000000419.12,ENSG00000004660.14
4,ENSG00000000419.12,ENSG00000005020.12


# Get intersections dataframe

## Agreements

In [21]:
# Each entry lists the columns that must be True for a gene pair to be part of
# the group; get_gene_pairs also requires all the other columns to be False, so
# a gene pair cannot match more than one entry. Commented-out names are the
# columns left out of each group.
_queries = [
    [
        "Clustermatch (high)",
        "Pearson (high)",
        "Spearman (high)",
        # "Clustermatch (low)",
        # "Pearson (low)",
        # "Spearman (low)",
    ],
    [
        "Clustermatch (high)",
        "Pearson (high)",
        # "Spearman (high)",
        # "Clustermatch (low)",
        # "Pearson (low)",
        # "Spearman (low)",
    ],
    [
        "Clustermatch (high)",
        # "Pearson (high)",
        "Spearman (high)",
        # "Clustermatch (low)",
        # "Pearson (low)",
        # "Spearman (low)",
    ],
    [
        # "Clustermatch (high)",
        "Pearson (high)",
        "Spearman (high)",
        # "Clustermatch (low)",
        # "Pearson (low)",
        # "Spearman (low)",
    ],
    [
        # "Clustermatch (high)",
        # "Pearson (high)",
        # "Spearman (high)",
        "Clustermatch (low)",
        "Pearson (low)",
        # "Spearman (low)",
    ],
    [
        # "Clustermatch (high)",
        # "Pearson (high)",
        # "Spearman (high)",
        "Clustermatch (low)",
        # "Pearson (low)",
        "Spearman (low)",
    ],
    [
        # "Clustermatch (high)",
        # "Pearson (high)",
        # "Spearman (high)",
        # "Clustermatch (low)",
        "Pearson (low)",
        "Spearman (low)",
    ],
    [
        # "Clustermatch (high)",
        # "Pearson (high)",
        # "Spearman (high)",
        "Clustermatch (low)",
        "Pearson (low)",
        "Spearman (low)",
    ],
]

In [22]:
display(len(_queries))
assert len(_queries) == 8

8

In [23]:
# gene pairs of every agreement group, stacked into a single dataframe.
# The index is not reset on concatenation, so index labels repeat across groups
# (each group comes with its own 0..n-1 index).
gene_pairs_df = pd.concat(
    [get_gene_pairs(gene_pairs_intersections, set(_query)) for _query in _queries]
)

In [24]:
display(gene_pairs_df.shape)
# a gene pair must not show up in more than one group
assert not gene_pairs_df.duplicated().any()
# expected total number of gene pairs across all agreement groups
assert 7.3e6 < gene_pairs_df.shape[0] < 7.4e6

(7379251, 2)

In [25]:
gene_pairs_df.head()

Unnamed: 0,gene0,gene1
0,ENSG00000000419.12,ENSG00000002834.17
1,ENSG00000000419.12,ENSG00000002919.14
2,ENSG00000000419.12,ENSG00000003402.19
3,ENSG00000000419.12,ENSG00000004660.14
4,ENSG00000000419.12,ENSG00000005020.12


# Create samples and save

In [26]:
for sample_id in range(N_SAMPLES):
    # the sample id is also the random seed, which makes each sample reproducible
    data_sample = gene_pairs_df.sample(n=SAMPLE_SIZE, random_state=sample_id)

    # one output file per sample, named after the sample id
    output_filepath = OUTPUT_FILE_TEMPLATE.format(sample_id=sample_id)
    display(output_filepath)

    data_sample.to_pickle(output_filepath)

'/opt/data/results/gtex_v8/gene_pair_intersections/samples/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2-agreements_sample_0.pkl'

'/opt/data/results/gtex_v8/gene_pair_intersections/samples/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2-agreements_sample_1.pkl'

'/opt/data/results/gtex_v8/gene_pair_intersections/samples/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2-agreements_sample_2.pkl'

'/opt/data/results/gtex_v8/gene_pair_intersections/samples/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2-agreements_sample_3.pkl'

'/opt/data/results/gtex_v8/gene_pair_intersections/samples/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2-agreements_sample_4.pkl'

'/opt/data/results/gtex_v8/gene_pair_intersections/samples/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2-agreements_sample_5.pkl'

'/opt/data/results/gtex_v8/gene_pair_intersections/samples/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2-agreements_sample_6.pkl'

'/opt/data/results/gtex_v8/gene_pair_intersections/samples/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2-agreements_sample_7.pkl'

'/opt/data/results/gtex_v8/gene_pair_intersections/samples/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2-agreements_sample_8.pkl'

'/opt/data/results/gtex_v8/gene_pair_intersections/samples/gene_pair_intersections-gtex_v8-whole_blood-var_pc_log2-agreements_sample_9.pkl'

In [27]:
# data_sample holds the last sample drawn by the loop in the previous cell
display(data_sample.dtypes)
display(data_sample.shape)
display(data_sample.head())

gene0    object
gene1    object
dtype: object

(33000, 2)

Unnamed: 0,gene0,gene1
2617724,ENSG00000205268.10,ENSG00000091039.16
2407828,ENSG00000267102.1,ENSG00000167775.10
958146,ENSG00000121691.4,ENSG00000255197.5
165792,ENSG00000165732.12,ENSG00000065243.19
411050,ENSG00000211860.1,ENSG00000127184.12
