# Environment variables

In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import conf

In [None]:
# Number of parallel jobs, read from the project configuration. It is used
# further down to set the thread-count environment variables.
N_JOBS = conf.GENERAL["N_JOBS"]
display(N_JOBS)

In [None]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

# Modules

In [None]:
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from gls import GLSPhenoplier

# Settings

In [None]:
# Directory where this notebook writes its GLS results.
OUTPUT_DIR = conf.RESULTS["GLS"]
display(OUTPUT_DIR)

# Create it together with any missing parents; an existing directory is fine.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Load data

## PhenomeXcan (S-MultiXcan)

In [None]:
# INPUT_SUBSET = "z_score_std"

In [None]:
# INPUT_STEM = "projection-smultixcan-efo_partial-mashr-zscores"

In [None]:
# input_filepath = Path(
#     conf.RESULTS["DATA_TRANSFORMATIONS_DIR"],
#     INPUT_SUBSET,
#     f"{INPUT_SUBSET}-{INPUT_STEM}.pkl",
# ).resolve()

In [None]:
# data = pd.read_pickle(input_filepath)

In [None]:
# data.shape

In [None]:
# data.head()

## MultiPLIER summary

In [None]:
# Load the MultiPLIER model summary from the pickle file named in `conf`.
multiplier_model_summary = pd.read_pickle(conf.MULTIPLIER["MODEL_SUMMARY_FILE"])

In [None]:
# Dimensions of the loaded summary.
multiplier_model_summary.shape

In [None]:
# First rows, for a quick visual check.
multiplier_model_summary.head()

In [None]:
# Keep the summary rows that pass either criterion: FDR below 0.05 or AUC of
# at least 0.75.
# NOTE(review): the two criteria are OR-ed, not AND-ed — confirm this is the
# intended definition of "well aligned".
passes_fdr = multiplier_model_summary["FDR"] < 0.05
passes_auc = multiplier_model_summary["AUC"] >= 0.75
well_aligned_lvs = multiplier_model_summary[passes_fdr | passes_auc]

display(well_aligned_lvs.shape)
display(well_aligned_lvs.head())

In [None]:
# Build the set of LV codes ("LV" followed by the LV index) directly with a
# set comprehension instead of materializing a throwaway list first.
well_aligned_lv_codes = {f"LV{lvi}" for lvi in well_aligned_lvs["LV index"]}

In [None]:
# Number of distinct well-aligned LV codes.
len(well_aligned_lv_codes)

In [None]:
# Up to five sample codes; sets are unordered, so which ones show is arbitrary.
list(well_aligned_lv_codes)[:5]

## eMERGE traits info

In [None]:
# FIXME: hardcoded absolute location under a user home directory; this cell
# only works where that directory exists.
emerge_data_dir = Path(
    "/home/miltondp/projects/labs/greenelab/phenoplier/base/data/emerge"
)
input_filepath = (
    emerge_data_dir
    / "eMERGE_III_PMBB_GSA_v2_2020_phecode_AFR_EUR_cc50_counts_w_dictionary.txt"
).resolve()
display(input_filepath)

In [None]:
# Read only the three needed columns from the tab-separated traits file.
# "phecode" is forced to str; presumably so the codes are kept verbatim rather
# than parsed as numbers — TODO confirm.
emerge_traits_df = pd.read_csv(
    input_filepath,
    sep="\t",
    dtype={"phecode": str},
    usecols=["phecode", "phenotype", "category"],
)

In [None]:
# Give the descriptive columns a "phecode_" prefix; "phecode" itself is kept.
phecode_column_names = {
    "phenotype": "phecode_phenotype",
    "category": "phecode_category",
}
emerge_traits_df = emerge_traits_df.rename(columns=phecode_column_names)

In [None]:
# Dimensions of the traits table after renaming.
emerge_traits_df.shape

In [None]:
# First rows, for a quick visual check.
emerge_traits_df.head()

## eMERGE (S-MultiXcan)

In [None]:
# FIXME: path hardcoded
# Absolute path to the eMERGE S-MultiXcan z-scores pickle (going by the file
# name). The run loop further down passes this path to GLSPhenoplier.
emerge_smultixcan_zscores_filepath = Path(
    "/home/miltondp/projects/labs/greenelab/phenoplier/base/data/emerge/gene_assoc/emerge-smultixcan-mashr-zscores.pkl"
).resolve()

display(emerge_smultixcan_zscores_filepath)

In [None]:
# Load the z-scores file only to inspect it here. The run loop hands the file
# path, not this object, to GLSPhenoplier.
_tmp = pd.read_pickle(emerge_smultixcan_zscores_filepath)

In [None]:
# Dimensions of the loaded object.
_tmp.shape

In [None]:
# First rows, for a quick visual check.
_tmp.head()

## GLS results on PhenomeXcan

In [None]:
# NOTE: this rebinds `input_filepath`, which until now pointed to the eMERGE
# traits file.
input_filepath = conf.RESULTS["GLS"] / "gls_phenotypes-crispr_lvs.pkl"
display(input_filepath)

In [None]:
# Load the pickled GLS results stored at that path.
gls_phenomexcan_crispr = pd.read_pickle(input_filepath)

In [None]:
# Dimensions of the loaded results.
gls_phenomexcan_crispr.shape

In [None]:
# First rows, for a quick visual check.
gls_phenomexcan_crispr.head()

## GLS results on eMERGE

In [None]:
# input_filepath = conf.RESULTS["GLS"] / "gls_phenotypes-emerge.pkl"
# display(input_filepath)

In [None]:
# gls_emerge = pd.read_pickle(input_filepath)

In [None]:
# gls_emerge.shape

In [None]:
# gls_emerge.head()

# Select LVs from previous GLS run on PhenomeXcan

In [None]:
# Unique (lv, lv_set) combinations found in the previous results, with the
# index renumbered from zero.
lv_columns = gls_phenomexcan_crispr[["lv", "lv_set"]]
gls_phenomexcan_lvs = lv_columns.drop_duplicates().reset_index(drop=True)

In [None]:
# Number of unique (lv, lv_set) rows.
gls_phenomexcan_lvs.shape

In [None]:
# First rows, for a quick visual check.
gls_phenomexcan_lvs.head()

# Select traits from previous GLS run on eMERGE

In [None]:
# List the distinct category values; the next cell filters on a subset of them.
emerge_traits_df["phecode_category"].unique()

In [None]:
# Categories whose traits are included in this run.
selected_categories = [
    #     "hematopoietic",
    "circulatory system",
    "endocrine/metabolic",
]

# Unique phecodes of the traits that belong to one of those categories.
in_selected_category = emerge_traits_df["phecode_category"].isin(selected_categories)
gls_traits = emerge_traits_df.loc[in_selected_category, "phecode"].unique()

In [None]:
# Number of selected phecodes.
gls_traits.shape

In [None]:
# The selected phecodes themselves.
gls_traits

# GLSPhenoplier

## Get list of phenotype/LV pairs

In [None]:
# Pair every selected LV with every selected phenotype: LVs vary in the outer
# loop, phenotypes in the inner one. Exact duplicate rows are then dropped.
phenotypes_lvs_pairs = pd.DataFrame(
    [
        {
            "phenotype": trait_code,
            "lv": lv_info["lv"],
            "lv_set": lv_info["lv_set"],
        }
        for _, lv_info in gls_phenomexcan_lvs.iterrows()
        for trait_code in gls_traits
    ]
).drop_duplicates()

In [None]:
# Order the pairs by phenotype and renumber the index from zero. The run loop
# uses that index (idx % 10) to decide when to write a checkpoint.
phenotypes_lvs_pairs = phenotypes_lvs_pairs.sort_values("phenotype").reset_index(
    drop=True
)

In [None]:
# Total number of phenotype/LV pairs to fit.
phenotypes_lvs_pairs.shape

In [None]:
# First rows, for a quick visual check.
phenotypes_lvs_pairs.head()

## Run

In [None]:
# Pickle file that receives both the periodic checkpoints and the final results.
output_file = OUTPUT_DIR / "gls_phenotypes-emerge-crispr_lvs.pkl"
display(output_file)

In [None]:
# Fit one GLS model per phenotype/LV pair and collect the LV coefficient,
# its p-value and the model summary for each pair.
results = []

# The progress bar is used as a context manager so that it is closed even if
# a model fit raises part-way through the loop.
with tqdm(total=phenotypes_lvs_pairs.shape[0]) as pbar:
    for idx, row in phenotypes_lvs_pairs.iterrows():
        phenotype_code = row["phenotype"]
        lv_code = row["lv"]

        pbar.set_description(f"{phenotype_code} - {lv_code}")

        # A new model object is created for every pair.
        # NOTE(review): could probably be hoisted out of the loop if
        # GLSPhenoplier keeps no per-fit state — confirm before changing.
        gls_model = GLSPhenoplier(
            smultixcan_result_set_filepath=emerge_smultixcan_zscores_filepath
        )
        gls_model.fit_named(lv_code, phenotype_code)
        res = gls_model.results

        results.append(
            {
                "phenotype": phenotype_code,
                "lv": lv_code,
                "lv_set": row["lv_set"],
                # Whether this LV is among the well-aligned LV codes.
                "lv_with_pathway": lv_code in well_aligned_lv_codes,
                "coef": res.params.loc["lv"],
                "pvalue": res.pvalues.loc["lv"],
                "summary": gls_model.results_summary,
            }
        )

        # Checkpoint: every tenth pair (by index label) the results collected
        # so far are written to the output file.
        if (idx % 10) == 0:
            pd.DataFrame(results).to_pickle(output_file)

        pbar.update(1)

In [None]:
# Turn the list of per-pair result dicts into a DataFrame (rebinds `results`).
results = pd.DataFrame(results)

In [None]:
# Dimensions of the results table.
results.shape

In [None]:
# First rows, for a quick visual check.
results.head()

In [None]:
# The ten rows with the smallest p-values.
results.sort_values("pvalue").head(10)

## Save

In [None]:
# Final save to the same file used for the periodic checkpoints during the
# run, replacing the last checkpoint with the complete results.
results.to_pickle(output_file)