# Description

This notebook draws random samples of gene pairs from the universe of all possible pairs among the top genes initially selected (the 5,000 genes with the highest variance).

# Modules

In [1]:
from itertools import combinations

import numpy as np
import pandas as pd

from clustermatch import conf

# Settings

In [2]:
# Dataset configuration (GTEx entry of clustermatch's conf module); it is used
# below to look up the gene selection directory.
DATASET_CONFIG = conf.GTEX
# Tissue and gene selection strategy; both are embedded in the input file name.
GTEX_TISSUE = "whole_blood"
GENE_SEL_STRATEGY = "var_pc_log2"

In [3]:
# number of gene pairs to draw in each sample
SAMPLE_SIZE = 33000

# number of samples to generate (one output file is written per sample)
N_SAMPLES = 1

# Paths

In [4]:
# Input pickle for the chosen tissue and gene selection strategy.
INPUT_FILE = DATASET_CONFIG["GENE_SELECTION_DIR"].joinpath(
    f"gtex_v8_data_{GTEX_TISSUE}-{GENE_SEL_STRATEGY}.pkl"
)
display(INPUT_FILE)

# Fail early if the input file is missing.
assert INPUT_FILE.exists()

PosixPath('/opt/data/results/gtex_v8/gene_selection/gtex_v8_data_whole_blood-var_pc_log2.pkl')

In [5]:
# Samples are written to a "samples" folder located next to the input file.
OUTPUT_DIR = INPUT_FILE.parent.joinpath("samples")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/results/gtex_v8/gene_selection/samples')

In [6]:
# Path template for the sample files. The "{sample_id}" placeholder is kept
# outside the f-string so it stays a literal and can be filled in later with
# str.format().
OUTPUT_FILE_TEMPLATE = str(
    OUTPUT_DIR / (f"{INPUT_FILE.stem}-gene_pairs-sample_" + "{sample_id}" + ".pkl")
)

display(OUTPUT_FILE_TEMPLATE)

'/opt/data/results/gtex_v8/gene_selection/samples/gtex_v8_data_whole_blood-var_pc_log2-gene_pairs-sample_{sample_id}.pkl'

# Data

## Gene IDs universe

In [7]:
# Only the index of the input data (the gene IDs) is kept, as a plain list.
# NOTE(review): unpickling can execute arbitrary code; INPUT_FILE must come
# from a trusted source.
genes_ids = pd.read_pickle(INPUT_FILE).index.tolist()

In [8]:
# number of genes in the universe
len(genes_ids)

5000

In [9]:
# peek at the first ten gene IDs
genes_ids[:10]

['ENSG00000169429.10',
 'ENSG00000135245.9',
 'ENSG00000163631.16',
 'ENSG00000277632.1',
 'ENSG00000239839.6',
 'ENSG00000186652.9',
 'ENSG00000129824.15',
 'ENSG00000152463.14',
 'ENSG00000123689.5',
 'ENSG00000012223.12']

# Create list of gene pairs

In [10]:
# Enumerate every unordered pair of distinct genes. combinations() yields the
# pairs (genes_ids[i], genes_ids[j]) with i < j, in the same order a nested
# index loop would, but without the Python-level loop and repeated appends.
gene_pairs = list(combinations(genes_ids, 2))

gene_pairs_df = pd.DataFrame(data=gene_pairs, columns=["gene0", "gene1"])

In [11]:
# With n genes there must be exactly n * (n - 1) / 2 unordered pairs.
assert len(gene_pairs_df) == (len(genes_ids) - 1) * len(genes_ids) / 2
display(gene_pairs_df.shape)

(12497500, 2)

In [12]:
# NOTE(review): this repeats the shape already displayed by the previous cell.
gene_pairs_df.shape

(12497500, 2)

In [13]:
# preview the first gene pairs
gene_pairs_df.head()

Unnamed: 0,gene0,gene1
0,ENSG00000169429.10,ENSG00000135245.9
1,ENSG00000169429.10,ENSG00000163631.16
2,ENSG00000169429.10,ENSG00000277632.1
3,ENSG00000169429.10,ENSG00000239839.6
4,ENSG00000169429.10,ENSG00000186652.9


# Create samples and save

In [14]:
# Draw N_SAMPLES random samples of SAMPLE_SIZE gene pairs each and save every
# sample to its own pickle file.
for sample_id in range(N_SAMPLES):
    # The sample id doubles as the random seed, so each sample is reproducible.
    data_sample = gene_pairs_df.sample(n=SAMPLE_SIZE, random_state=sample_id)

    # Fill the "{sample_id}" placeholder to get this sample's output path.
    output_filepath = OUTPUT_FILE_TEMPLATE.format(sample_id=sample_id)
    display(output_filepath)

    data_sample.to_pickle(output_filepath)

'/opt/data/results/gtex_v8/gene_selection/samples/gtex_v8_data_whole_blood-var_pc_log2-gene_pairs-sample_0.pkl'

In [15]:
# Inspect the sample left over from the last loop iteration: column dtypes,
# shape and first rows.
display(data_sample.dtypes, data_sample.shape, data_sample.head())

gene0    object
gene1    object
dtype: object

(33000, 2)

Unnamed: 0,gene0,gene1
8548968,ENSG00000161217.11,ENSG00000248527.1
2585413,ENSG00000106927.11,ENSG00000136929.12
7848759,ENSG00000177156.10,ENSG00000156711.16
2095670,ENSG00000197965.11,ENSG00000272155.1
6038958,ENSG00000278330.1,ENSG00000179387.9
