# Description

It draws random samples from the universe of all gene pairs among the top genes initially selected (the 5,000 genes with maximum variance).

# Modules

In [None]:
import itertools

import numpy as np
import pandas as pd

from clustermatch import conf

# Settings

In [None]:
# Configuration entry for the GTEx dataset; it is indexed below to locate
# the gene-selection directory.
DATASET_CONFIG = conf.GTEX
# Tissue and gene-selection strategy; both are embedded in the input file name.
GTEX_TISSUE = "whole_blood"
GENE_SEL_STRATEGY = "var_pc_log2"

In [None]:
# number of gene pairs drawn in each sample
SAMPLE_SIZE = 10000

# number of samples to generate (one output file per sample)
N_SAMPLES = 10

# Paths

In [None]:
# Pickled dataset for the configured tissue and gene-selection strategy.
INPUT_FILE = (
    DATASET_CONFIG["GENE_SELECTION_DIR"]
    / f"gtex_v8_data_{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
)
display(INPUT_FILE)

# Fail early and name the missing path instead of raising a bare AssertionError.
assert INPUT_FILE.exists(), f"Input file not found: {INPUT_FILE}"

In [None]:
# Samples are written to a "samples" subfolder next to the input file; the
# folder (and any missing parents) is created if it does not exist yet.
OUTPUT_DIR = INPUT_FILE.parent / "samples"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

In [None]:
# Path template for the per-sample output files. The "{sample_id}" piece is a
# plain (non-f) string on purpose: it must survive as a literal placeholder so
# it can be filled in later with str.format(sample_id=...).
OUTPUT_FILE_TEMPLATE = str(
    OUTPUT_DIR.joinpath(
        f"{INPUT_FILE.stem}-gene_pairs-sample_" + "{sample_id}" + ".pkl"
    )
)

display(OUTPUT_FILE_TEMPLATE)

# Data

## Gene IDs universe

In [None]:
# The gene universe is the index of the pickled object, materialized as a list
# (assumes the pickle holds a pandas object with genes as its index -- TODO confirm).
genes_ids = pd.read_pickle(INPUT_FILE).index.tolist()

In [None]:
# number of genes in the universe
len(genes_ids)

In [None]:
# peek at the first ten gene IDs
genes_ids[:10]

# Create list of gene pairs

In [None]:
# Enumerate every unordered pair of distinct genes. itertools.combinations
# yields (genes_ids[i], genes_ids[j]) for i < j in the same order as a nested
# index loop, but runs in C and avoids millions of Python-level appends.
gene_pairs = list(itertools.combinations(genes_ids, 2))

# One row per pair; the column order follows the gene order in genes_ids.
gene_pairs_df = pd.DataFrame(data=gene_pairs, columns=["gene0", "gene1"])

In [None]:
# Sanity check: n genes produce n * (n - 1) / 2 unordered pairs. n * (n - 1) is
# always even, so integer division is exact and avoids an int-vs-float comparison.
assert gene_pairs_df.shape[0] == len(genes_ids) * (len(genes_ids) - 1) // 2
display(gene_pairs_df.shape)

In [None]:
# dimensions of the gene-pair table: one row per pair, columns gene0 and gene1
gene_pairs_df.shape

In [None]:
# preview the first rows of the gene-pair table
gene_pairs_df.head()

# Create samples and save

In [None]:
# Draw N_SAMPLES random subsets of SAMPLE_SIZE gene pairs and pickle each one.
# The sample index doubles as the random seed, so every output file can be
# regenerated exactly.
for sample_id in range(N_SAMPLES):
    output_filepath = OUTPUT_FILE_TEMPLATE.format(sample_id=sample_id)
    data_sample = gene_pairs_df.sample(n=SAMPLE_SIZE, random_state=sample_id)

    # show the destination before writing
    display(output_filepath)
    data_sample.to_pickle(output_filepath)

In [None]:
# data_sample still holds the last sample drawn by the loop; inspect its shape
# and first rows as a quick sanity check.
display(data_sample.shape)
display(data_sample.head())