# Environment variables

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import conf

In [None]:
# Parallelism setting taken from the project configuration; the next cell
# exports it as the thread limit of the numerical libraries.
N_JOBS = conf.GENERAL["N_JOBS"]
display(N_JOBS)

In [None]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

# Modules

In [None]:
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from gls import GLSPhenoplier

# Settings

In [None]:
# Folder that receives the GLS results written by this notebook.
OUTPUT_DIR = conf.RESULTS["GLS"]
display(OUTPUT_DIR)

# Create the folder (and any missing parents); a pre-existing folder is fine.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# Load data

## PhenomeXcan (S-MultiXcan)

In [None]:
# Name of the data-transformation subfolder; also the prefix of the input filename.
INPUT_SUBSET = "z_score_std"

In [None]:
# Second part of the input filename, appended to INPUT_SUBSET with a dash.
INPUT_STEM = "projection-smultixcan-efo_partial-mashr-zscores"

In [None]:
# Absolute path of the input pickle:
#   <DATA_TRANSFORMATIONS_DIR>/<INPUT_SUBSET>/<INPUT_SUBSET>-<INPUT_STEM>.pkl
input_filepath = (
    Path(conf.RESULTS["DATA_TRANSFORMATIONS_DIR"])
    / INPUT_SUBSET
    / f"{INPUT_SUBSET}-{INPUT_STEM}.pkl"
).resolve()

In [None]:
# Load the input table; its index holds the trait identifiers used further
# below when traits are selected by cluster.
data = pd.read_pickle(input_filepath)

In [None]:
data.shape

In [None]:
data.head()

## Clustering results

In [None]:
# Absolute path of the "consensus_clustering" folder inside the clustering results.
CONSENSUS_CLUSTERING_DIR = (
    Path(conf.RESULTS["CLUSTERING_DIR"]) / "consensus_clustering"
).resolve()

display(CONSENSUS_CLUSTERING_DIR)

In [None]:
# Pickle with the best partition found for each k.
input_file = (CONSENSUS_CLUSTERING_DIR / "best_partitions_by_k.pkl").resolve()
display(input_file)

In [None]:
# Later cells look rows up by partition size k and read their "partition" column.
best_partitions = pd.read_pickle(input_file)

In [None]:
best_partitions.shape

In [None]:
best_partitions.head()

## MultiPLIER summary

In [None]:
# MultiPLIER model summary; the "FDR", "AUC" and "LV index" columns are used below.
multiplier_model_summary = pd.read_pickle(conf.MULTIPLIER["MODEL_SUMMARY_FILE"])

In [None]:
multiplier_model_summary.shape

In [None]:
multiplier_model_summary.head()

In [None]:
# Keep the summary rows that satisfy at least one criterion:
# FDR below 0.05, or AUC of 0.75 or more.
well_aligned_lvs = multiplier_model_summary.loc[
    lambda summary: (summary["FDR"] < 0.05) | (summary["AUC"] >= 0.75)
]

display(well_aligned_lvs.shape)
display(well_aligned_lvs.head())

In [None]:
# Unique LV codes ("LV" + index) of the rows selected above.
well_aligned_lv_codes = {
    f"LV{lv_index}" for lv_index in well_aligned_lvs["LV index"]
}

In [None]:
len(well_aligned_lv_codes)

In [None]:
list(well_aligned_lv_codes)[:5]

# Select LVs from CRISPR analysis

In [None]:
# FIXME: the location of the fgsea results is hardcoded here.
deg_enrich = pd.read_csv(
    (
        Path(conf.RESULTS["BASE_DIR"]) / "crispr_analyses" / "fgsea-all_lvs.tsv"
    ).resolve(),
    sep="\t",
)

In [None]:
deg_enrich.shape

In [None]:
deg_enrich.head()

In [None]:
# For every (lv, pathway) pair, the index label of the row with the largest `padj`.
# NOTE(review): taking the maximum keeps the least significant entry of each
# pair; presumably a deliberately conservative choice -- confirm.
deg_enrich_max_idx = deg_enrich.groupby(["lv", "pathway"])["padj"].idxmax()

In [None]:
# Reduce the table to the selected rows (one per lv/pathway pair) and renumber
# the index from zero.
deg_enrich = deg_enrich.loc[deg_enrich_max_idx].reset_index(drop=True)
display(deg_enrich.shape)
display(deg_enrich.head())

## Lipids-increasing gene sets

In [None]:
# Rows of the "gene_set_increase" pathway with padj < 0.05, smallest padj first.
deg_increase = deg_enrich.loc[
    lambda enrich: (enrich["padj"] < 0.05)
    & enrich["pathway"].isin(("gene_set_increase",))
].sort_values("padj")

In [None]:
deg_increase.shape

In [None]:
deg_increase.head()

In [None]:
# Distinct LV identifiers among the significant "gene_set_increase" rows.
lvs_increase = deg_increase["lv"].unique()

In [None]:
lvs_increase.shape

In [None]:
lvs_increase

## Lipids-decreasing gene sets

In [None]:
# Rows of the "gene_set_decrease" pathway with padj < 0.05, smallest padj first.
deg_decrease = deg_enrich.loc[
    lambda enrich: (enrich["padj"] < 0.05)
    & enrich["pathway"].isin(("gene_set_decrease",))
].sort_values("padj")

In [None]:
deg_decrease.shape

In [None]:
deg_decrease.head()

In [None]:
# Distinct LV identifiers among the significant "gene_set_decrease" rows.
lvs_decrease = deg_decrease["lv"].unique()

In [None]:
lvs_decrease.shape

In [None]:
lvs_decrease

## Merge final

In [None]:
# One row per LV, tagged with the gene set it was selected from; the scalar
# label is broadcast to every row.
_tmp0 = pd.DataFrame({"lv": lvs_increase, "lv_set": "lipids-increasing"})

_tmp1 = pd.DataFrame({"lv": lvs_decrease, "lv_set": "lipids-decreasing"})

In [None]:
# Stack both sets into a single table. No de-duplication happens here, so an LV
# selected in both sets appears twice, once per `lv_set` label.
gls_selected_lvs = pd.concat([_tmp0, _tmp1], ignore_index=True)

In [None]:
gls_selected_lvs.shape

In [None]:
gls_selected_lvs.head()

# Select traits from specific partition/cluster

In [None]:
# Clusters whose traits are analyzed, given as (partition k, cluster id) pairs:
# the first value selects a row of `best_partitions`, the second a cluster
# label inside that partition.
PHENOTYPES_CONFIG = [
    # cardiovascular
    (29, 14),
    (29, 16),
    (29, 11),
    (29, 21),
    (29, 17),
]

# GLSPhenoplier

## Get list of phenotypes/lvs pairs

In [None]:
# Build every (phenotype, LV) combination to test: each trait belonging to one
# of the clusters in PHENOTYPES_CONFIG is paired with each selected LV.
phenotypes_lvs_pairs = []

# (lv, lv_set) tuples are extracted once, instead of iterating the LV table
# again for every trait.
selected_lvs = list(zip(gls_selected_lvs["lv"], gls_selected_lvs["lv_set"]))

for part_k, cluster_id in PHENOTYPES_CONFIG:
    part = best_partitions.loc[part_k, "partition"]

    # Traits assigned to this cluster.
    # NOTE(review): assumes `part` holds one label per row of `data`, in the
    # same order as `data.index` -- confirm against the clustering output.
    cluster_traits = data.index[part == cluster_id]

    phenotypes_lvs_pairs.extend(
        {
            "phenotype_part_k": part_k,
            "phenotype_cluster_id": cluster_id,
            "phenotype": phenotype_code,
            "lv": lv_code,
            "lv_set": lv_set,
        }
        for phenotype_code in cluster_traits
        for lv_code, lv_set in selected_lvs
    )

# The columns are given explicitly so that an empty selection still yields a
# frame with the expected columns; otherwise the sort in the next cell would
# fail with a KeyError.
phenotypes_lvs_pairs = pd.DataFrame(
    phenotypes_lvs_pairs,
    columns=[
        "phenotype_part_k",
        "phenotype_cluster_id",
        "phenotype",
        "lv",
        "lv_set",
    ],
).drop_duplicates()

In [None]:
# Sort by phenotype and LV, then renumber the index from zero: the run loop
# below uses the index value to decide when to write a checkpoint.
phenotypes_lvs_pairs = phenotypes_lvs_pairs.sort_values(
    ["phenotype", "lv"]
).reset_index(drop=True)

In [None]:
phenotypes_lvs_pairs.shape

In [None]:
phenotypes_lvs_pairs.head()

## Run

In [None]:
# Results file; the run loop also writes its intermediate checkpoints here.
output_file = OUTPUT_DIR / "gls_phenotypes-crispr_lvs.pkl"
display(output_file)

In [None]:
# Fit one GLS model per (phenotype, LV) pair and collect the statistics of the
# "lv" term of each fit.
results = []

# The context manager closes the progress bar even if one of the fits raises,
# so an aborted run does not leave a dangling bar behind.
with tqdm(total=phenotypes_lvs_pairs.shape[0]) as pbar:
    for idx, row in phenotypes_lvs_pairs.iterrows():
        phenotype_code = row["phenotype"]
        lv_code = row["lv"]

        pbar.set_description(f"{phenotype_code} - {lv_code}")

        # A fresh model is built for every pair.
        # NOTE(review): it may be possible to reuse one instance across
        # iterations, but that depends on GLSPhenoplier internals not visible
        # here.
        gls_model = GLSPhenoplier(
            smultixcan_result_set_filepath=conf.PHENOMEXCAN[
                "SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"
            ]
        )
        gls_model.fit_named(lv_code, phenotype_code)
        res = gls_model.results

        results.append(
            {
                "part_k": row["phenotype_part_k"],
                "cluster_id": row["phenotype_cluster_id"],
                "phenotype": phenotype_code,
                "lv": lv_code,
                "lv_set": row["lv_set"],
                # whether the LV is among the well-aligned MultiPLIER LVs
                "lv_with_pathway": lv_code in well_aligned_lv_codes,
                "coef": res.params.loc["lv"],
                "pvalue": res.pvalues.loc["lv"],
                "summary": gls_model.results_summary,
            }
        )

        # Checkpoint every 10th pair (the index was reset to 0..n-1 above) so
        # that a crash does not lose everything; the complete table is saved
        # again at the end of the notebook.
        if (idx % 10) == 0:
            pd.DataFrame(results).to_pickle(output_file)

        pbar.update(1)

In [None]:
# Turn the list of per-pair dicts into a table (one row per phenotype/LV pair).
results = pd.DataFrame(results)

In [None]:
results.shape

In [None]:
results.head()

In [None]:
results.sort_values("pvalue").head(10)

## Save

In [None]:
# Final write of the complete table; replaces the last checkpoint in the same file.
results.to_pickle(output_file)