# Environment variables

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before executing each cell, so edits to project code are picked up.
%load_ext autoreload
%autoreload 2

In [None]:
import conf

In [None]:
# Number of jobs/threads, taken from the project configuration; used below to
# set the thread-count environment variables.
N_JOBS = conf.GENERAL["N_JOBS"]
display(N_JOBS)

In [None]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

# Modules

In [None]:
from pathlib import Path

import statsmodels.api as sm
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
from tqdm import tqdm

from gls import GLSPhenoplier

# Settings

In [None]:
# Directory where this notebook writes its GLS results
OUTPUT_DIR = conf.RESULTS["GLS"]
display(OUTPUT_DIR)

# Create the directory (and any missing parents); no error if it already exists
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# Load data

## eMERGE traits info

In [None]:
# FIXME: hardcoded
# Absolute, machine-specific path to the eMERGE phecode file
input_filepath = Path(
    "/home/miltondp/projects/labs/greenelab/phenoplier/base/data/emerge",
    "eMERGE_III_PMBB_GSA_v2_2020_phecode_AFR_EUR_cc50_counts_w_dictionary.txt",
).resolve()
display(input_filepath)

In [None]:
# Tab-separated file; only three columns are read, and phecodes are kept as
# strings so they are not parsed as numbers
emerge_traits_df = pd.read_csv(
    input_filepath,
    sep="\t",
    dtype={"phecode": str},
    usecols=["phecode", "phenotype", "category"],
)

In [None]:
# Prefix the descriptive columns with "phecode_"
emerge_traits_df = emerge_traits_df.rename(
    columns={
        "phenotype": "phecode_phenotype",
        "category": "phecode_category",
    }
)

In [None]:
emerge_traits_df.shape

In [None]:
emerge_traits_df.head()

## eMERGE to PhenomeXcan maps

In [None]:
# FIXME: hardcoded
# Absolute, machine-specific path to the phecode <-> PhenomeXcan mapping file
emerge_phenomexcan_maps_filepath = Path(
    "/home/miltondp/projects/labs/greenelab/phenoplier/base/data/emerge",
    "phecodes_phenomexcan_maps.tsv",
).resolve()
display(emerge_phenomexcan_maps_filepath)

In [None]:
# Tab-separated file; phecodes are kept as strings
emerge_phenomexcan_maps = pd.read_csv(
    emerge_phenomexcan_maps_filepath, sep="\t", dtype={"phecode": str}
)

In [None]:
# Discard rows where either the phecode or the PhenomeXcan trait is missing
emerge_phenomexcan_maps = emerge_phenomexcan_maps.dropna(
    subset=["phecode", "phenomexcan"], how="any"
)

In [None]:
emerge_phenomexcan_maps.shape

In [None]:
emerge_phenomexcan_maps.head()

## eMERGE (S-MultiXcan) projection

In [None]:
# FIXME hardcoded
# Absolute, machine-specific path to the pickled projection of the eMERGE
# S-MultiXcan results (this reuses the `input_filepath` name from above)
input_filepath = Path(
    "/home/miltondp/projects/labs/greenelab/phenoplier/base/results/projections",
    "projection-emerge-smultixcan-mashr-zscores.pkl",
).resolve()
display(input_filepath)

In [None]:
# Later indexed by LV code with `.loc[lv]`; the labels of each selected row
# are used as phenotype codes
emerge_projection = pd.read_pickle(input_filepath)

In [None]:
emerge_projection.shape

In [None]:
emerge_projection.head()

## eMERGE (S-MultiXcan) gene associations

In [None]:
# FIXME: path hardcoded
# This path is handed to GLSPhenoplier in the "Run" section below
emerge_smultixcan_projection_filepath = Path(
    "/home/miltondp/projects/labs/greenelab/phenoplier/base/data/emerge/gene_assoc/emerge-smultixcan-mashr-zscores.pkl"
).resolve()

display(emerge_smultixcan_projection_filepath)

In [None]:
# Loaded into a temporary variable for inspection only; the cells of this
# notebook do not use `_tmp` beyond the two displays below
_tmp = pd.read_pickle(emerge_smultixcan_projection_filepath)

In [None]:
_tmp.shape

In [None]:
_tmp.head()

## GLS results on PhenomeXcan

In [None]:
# Results of the previous GLS run, read from the same results directory this
# notebook writes to (this reuses the `input_filepath` name once more)
input_filepath = conf.RESULTS["GLS"] / "gls_phenotypes.pkl"
display(input_filepath)

In [None]:
# The "lv" and "phenotype" columns of this dataframe drive the selections below
gls_phenomexcan = pd.read_pickle(input_filepath)

In [None]:
gls_phenomexcan.shape

In [None]:
gls_phenomexcan.head()

## MultiPLIER summary

In [None]:
# MultiPLIER model summary; its "FDR", "AUC" and "LV index" columns are used
# in the following cells
multiplier_model_summary = pd.read_pickle(conf.MULTIPLIER["MODEL_SUMMARY_FILE"])

In [None]:
multiplier_model_summary.shape

In [None]:
multiplier_model_summary.head()

In [None]:
# Keep the summary rows that pass either threshold:
# FDR below 0.05, or AUC of at least 0.75
fdr_below_threshold = multiplier_model_summary["FDR"] < 0.05
auc_above_threshold = multiplier_model_summary["AUC"] >= 0.75
well_aligned_lvs = multiplier_model_summary[fdr_below_threshold | auc_above_threshold]

display(well_aligned_lvs.shape)
display(well_aligned_lvs.head())

In [None]:
# Codes of the form "LV<index>" for the rows selected above, stored as a set
# because they are only used for membership tests
well_aligned_lv_codes = {f"LV{lvi}" for lvi in well_aligned_lvs["LV index"]}

In [None]:
# Number of distinct LV codes selected
len(well_aligned_lv_codes)

In [None]:
# Peek at a few codes (a set has no meaningful order)
list(well_aligned_lv_codes)[:5]

# Select LV from previous GLS run on PhenomeXcan

In [None]:
# Distinct LVs present in the previous GLS results
gls_phenomexcan_lvs = gls_phenomexcan["lv"].unique()

In [None]:
gls_phenomexcan_lvs.shape

In [None]:
gls_phenomexcan_lvs

# Select eMERGE traits

In [None]:
# Distinct phenotypes present in the previous GLS results
gls_phenomexcan_traits = gls_phenomexcan["phenotype"].unique()

In [None]:
gls_phenomexcan_traits.shape

In [None]:
# Mapping rows whose "efo" or "phenomexcan" value is one of the phenotypes
# of the previous GLS run
gls_phenomexcan_in_emerge = emerge_phenomexcan_maps[
    (emerge_phenomexcan_maps["efo"].isin(gls_phenomexcan_traits))
    | (emerge_phenomexcan_maps["phenomexcan"].isin(gls_phenomexcan_traits))
]

In [None]:
gls_phenomexcan_in_emerge

In [None]:
# Distinct phecodes of the matched mapping rows, as a plain list so it can be
# concatenated with other lists when building the phenotype/LV pairs
gls_emerge_phecodes = gls_phenomexcan_in_emerge["phecode"].unique().tolist()

In [None]:
# these are the mapped traits from PhenomeXcan to phecodes
gls_emerge_phecodes

In [None]:
# phecode_to_desc_map = emerge_traits_df[["phecode", "phecode_phenotype"]].set_index("phecode").squeeze().to_dict()

# GLSPhenoplier

## Get list of phenotypes/lvs pairs

In [None]:
# For every LV of the previous PhenomeXcan run, pair it with:
#   * its (at most) 20 eMERGE traits with the largest positive projection values
#   * all the phecodes mapped from PhenomeXcan traits
pair_records = []

for lv in gls_phenomexcan_lvs:
    lv_values = emerge_projection.loc[lv]
    top_traits = (
        lv_values[lv_values > 0.0]
        .sort_values(ascending=False)
        .head(20)
        .index.tolist()
    )

    # the set removes phecodes that appear in both lists
    pair_records.extend(
        {"phenotype": phenotype_code, "lv": lv}
        for phenotype_code in set(top_traits + gls_emerge_phecodes)
    )

phenotypes_lvs_pairs = pd.DataFrame(pair_records).drop_duplicates()

In [None]:
# Sort by phenotype and renumber the rows 0..n-1; the run loop below uses this
# index to decide when to write a checkpoint
phenotypes_lvs_pairs = phenotypes_lvs_pairs.sort_values("phenotype").reset_index(
    drop=True
)

In [None]:
phenotypes_lvs_pairs.shape

In [None]:
phenotypes_lvs_pairs.head()

## Run

In [None]:
# Pickle file that receives both the periodic checkpoints and the final results
output_file = OUTPUT_DIR / "gls_phenotypes-emerge.pkl"
display(output_file)

In [None]:
# Fit one GLS model per phenotype/LV pair and collect the coefficient and
# p-value of the "lv" term, plus the model summary.
results = []

# Using tqdm as a context manager guarantees the progress bar is closed even
# if a model fit raises in the middle of the run.
with tqdm(total=phenotypes_lvs_pairs.shape[0]) as pbar:
    for idx, row in phenotypes_lvs_pairs.iterrows():
        phenotype_code = row["phenotype"]
        lv_code = row["lv"]

        pbar.set_description(f"{phenotype_code} - {lv_code}")

        # a fresh model is created for every pair
        gls_model = GLSPhenoplier(
            smultixcan_result_set_filepath=emerge_smultixcan_projection_filepath
        )
        gls_model.fit_named(lv_code, phenotype_code)
        res = gls_model.results

        results.append(
            {
                "phenotype": phenotype_code,
                "lv": lv_code,
                "lv_with_pathway": lv_code in well_aligned_lv_codes,
                "coef": res.params.loc["lv"],
                "pvalue": res.pvalues.loc["lv"],
                "summary": gls_model.results_summary,
            }
        )

        # checkpoint the partial results every 10 pairs (idx is the 0..n-1
        # index set by reset_index), so an interrupted run keeps its progress
        if (idx % 10) == 0:
            pd.DataFrame(results).to_pickle(output_file)

        pbar.update(1)

In [None]:
# Turn the list of per-pair records into a dataframe (one row per pair)
results = pd.DataFrame(results)

In [None]:
results.shape

In [None]:
results.head()

In [None]:
# Pairs with the smallest p-values first
results.sort_values("pvalue").head(10)

## Save

In [None]:
# Final save; this overwrites the last checkpoint written during the run
results.to_pickle(output_file)