# Description

This notebook uses the same strategy that was applied to GTEx (`00-gtex_v8-split_by_tissue.ipynb`) to select the most variable genes in recount2.

# Modules

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from clustermatch import conf

# Settings

In [None]:
# Number of top-ranked genes (by decreasing variance) kept by each selection strategy.
N_TOP_GENES_MAX_VARIANCE = 5_000

# Paths

In [None]:
# Path to the pickled recount2 input (presumably RPKM values, going by the file name).
# NOTE: despite the "_DIR" suffix, this is the path of a single file, not a directory.
INPUT_FILE_DIR = conf.RECOUNT2FULL["DATA_DIR"].joinpath("recount2_rpkm.pkl")
display(INPUT_FILE_DIR)

In [None]:
# Folder where the gene-selection pickles are written; it is created, together with
# any missing parent folders, and an already existing folder is not an error.
OUTPUT_DIR = conf.RECOUNT2FULL["GENE_SELECTION_DIR"]
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
display(OUTPUT_DIR)

# Select top genes

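Based on the previous findings, I select genes with both strategies: `var_raw` (genes ranked by the variance of the raw values) and `var_pc_log2` (genes ranked by the variance of the `log2(x + 1)`-transformed values).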

Then, for each strategy, I save the raw data of the selected genes (note that the strategies are only used to select genes; the saved data is not log-transformed).

In [None]:
# Files to process, kept as a sorted list so the run loop can iterate over it
# (there is currently a single input file).
input_files = [INPUT_FILE_DIR]
input_files.sort()

# preview at most the first five entries
display(input_files[:5])

## Run

In [None]:
# Gene-selection strategies. Each key is the suffix used in the output file name;
# each value is the transform applied to the data *only* to rank genes by
# variance. The saved data is always the raw (untransformed) one.
variance_strategies = {
    # rank by the variance of the raw values
    "var_raw": lambda expr: expr,
    # rank by the variance of log2-transformed values, with a pseudocount of 1
    "var_pc_log2": lambda expr: np.log2(expr + 1),
}

pbar = tqdm(input_files, ncols=100)

for data_file in pbar:
    pbar.set_description(data_file.stem)

    # NOTE: unpickling can execute arbitrary code; only load files produced by
    # this project's own pipeline.
    data = pd.read_pickle(data_file)

    for strategy_name, transform in variance_strategies.items():
        # per-row variance (rows are presumably genes), largest first, keeping
        # only the top N_TOP_GENES_MAX_VARIANCE rows
        top_genes_var = (
            transform(data)
            .var(axis=1)
            .sort_values(ascending=False)
            .head(N_TOP_GENES_MAX_VARIANCE)
        )

        # save the same raw data, but with genes selected by the current strategy
        selected_data = data.loc[top_genes_var.index]

        output_filename = f"{data_file.stem}-{strategy_name}.pkl"
        selected_data.to_pickle(path=OUTPUT_DIR / output_filename)