# Description

This notebook draws a random sample from the universe of all gene pairs among the top genes initially selected (the 5,000 genes with the highest variance). The sample is needed to compare Clustermatch with other, slower coefficients, such as the Maximal Information Coefficient (MIC).

# Modules

In [1]:
import pandas as pd

from clustermatch import conf

# Settings

In [2]:
# Dataset configuration taken from clustermatch.conf; the cells below read
# directories and file name templates from it.
DATASET_CONFIG = conf.GTEX

# Values substituted into the `{tissue}` and `{gene_sel_strategy}` placeholders
# of the similarity-matrix file name template.
GTEX_TISSUE = "whole_blood"
GENE_SEL_STRATEGY = "var_pc_log2"

In [3]:
# How many gene pairs each sample contains.
SAMPLE_SIZE = 100_000

# How many independent samples to draw (one output file per sample).
N_SAMPLES = 1

# Paths

In [4]:
# Full path template of the similarity-matrix files: the configured directory
# joined with the configured (still unformatted) file name template.
INPUT_CORR_FILE_TEMPLATE = DATASET_CONFIG["SIMILARITY_MATRICES_DIR"].joinpath(
    DATASET_CONFIG["SIMILARITY_MATRIX_FILENAME_TEMPLATE"]
)
display(INPUT_CORR_FILE_TEMPLATE)

PosixPath('/opt/data/results/gtex_v8/similarity_matrices/gtex_v8_data_{tissue}-{gene_sel_strategy}-{corr_method}.pkl')

In [5]:
# This file is read only to get the gene pairs from its index, which guarantees
# that the gene pair order is exactly the same as in the similarity matrices.
#
# INPUT_CORR_FILE_TEMPLATE already includes SIMILARITY_MATRICES_DIR, so the
# directory must not be prepended a second time: doing so only worked because
# pathlib discards the left operand when the right one is an absolute path, and
# it would duplicate the directory if the configured path were relative.
# Only the file name carries placeholders, so only the file name is formatted.
INPUT_FILE = INPUT_CORR_FILE_TEMPLATE.parent / INPUT_CORR_FILE_TEMPLATE.name.format(
    tissue=GTEX_TISSUE,
    gene_sel_strategy=GENE_SEL_STRATEGY,
    corr_method="all",
)
display(INPUT_FILE)

# Fail early, with the resolved path in the message, if the file is missing.
assert INPUT_FILE.exists(), f"Input file not found: {INPUT_FILE}"

PosixPath('/opt/data/results/gtex_v8/similarity_matrices/gtex_v8_data_whole_blood-var_pc_log2-all.pkl')

In [6]:
# INPUT_FILE = (
#     DATASET_CONFIG["GENE_SELECTION_DIR"]
#     / f"gtex_v8_data_{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
# )
# display(INPUT_FILE)

# assert INPUT_FILE.exists()

In [7]:
# Samples are written to a "samples" subfolder of the gene selection directory;
# the folder (and any missing parents) is created if it does not exist yet.
OUTPUT_DIR = DATASET_CONFIG["GENE_SELECTION_DIR"].joinpath("samples")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/results/gtex_v8/gene_selection/samples')

In [8]:
# Output path template as a plain string. The doubled braces leave a literal
# `{sample_id}` placeholder in place, to be filled in later with str.format().
OUTPUT_FILE_TEMPLATE = str(
    OUTPUT_DIR / f"{INPUT_FILE.stem}-gene_pairs-sample_{{sample_id}}.pkl"
)

display(OUTPUT_FILE_TEMPLATE)

'/opt/data/results/gtex_v8/gene_selection/samples/gtex_v8_data_whole_blood-var_pc_log2-all-gene_pairs-sample_{sample_id}.pkl'

# Data

## Gene pairs universe

In [9]:
# Build the universe of gene pairs from the index of the pickled object:
# the two index levels are named gene0/gene1 and turned into regular columns
# of a new data frame with a default integer index.
gene_pairs_df = (
    pd.read_pickle(INPUT_FILE)
    .index
    .set_names(["gene0", "gene1"])
    .to_frame(index=False)
)

In [10]:
# (number of gene pairs, number of columns); for the 5,000 selected genes the
# expected number of unordered pairs is 5000 * 4999 / 2 = 12,497,500.
gene_pairs_df.shape

(12497500, 2)

In [11]:
# Preview the first gene pairs (columns gene0 and gene1).
gene_pairs_df.head()

Unnamed: 0,gene0,gene1
0,ENSG00000000419.12,ENSG00000002834.17
1,ENSG00000000419.12,ENSG00000002919.14
2,ENSG00000000419.12,ENSG00000002933.7
3,ENSG00000000419.12,ENSG00000003402.19
4,ENSG00000000419.12,ENSG00000004478.7


# Create samples and save

In [12]:
# Draw N_SAMPLES random samples of SAMPLE_SIZE gene pairs each and pickle
# every sample to its own file.
for sample_id in range(N_SAMPLES):
    # destination of this sample: fill the `{sample_id}` placeholder
    output_filepath = OUTPUT_FILE_TEMPLATE.format(sample_id=sample_id)

    # the sample id doubles as the random seed, so each sample is reproducible
    # and different sample ids use different seeds
    data_sample = gene_pairs_df.sample(
        n=SAMPLE_SIZE,
        random_state=sample_id,
    )

    display(output_filepath)
    data_sample.to_pickle(output_filepath)

'/opt/data/results/gtex_v8/gene_selection/samples/gtex_v8_data_whole_blood-var_pc_log2-all-gene_pairs-sample_0.pkl'

In [13]:
# Quick look at the last sample produced by the loop above: column dtypes,
# shape, and the first rows (rendered one after the other).
display(
    data_sample.dtypes,
    data_sample.shape,
    data_sample.head(),
)

gene0    object
gene1    object
dtype: object

(100000, 2)

Unnamed: 0,gene0,gene1
8548968,ENSG00000181192.11,ENSG00000144579.7
2585413,ENSG00000111641.11,ENSG00000258476.5
7848759,ENSG00000172081.13,ENSG00000130429.12
2095670,ENSG00000105255.10,ENSG00000231721.6
6038958,ENSG00000153310.19,ENSG00000241657.1
