# Description

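This notebook runs `GLSPhenoplier` on null phenotypes built from real data. It randomly picks `N_SIMULATED_PHENOTYPES` phenotypes from the S-MultiXcan association results, shuffles the values of each one to obtain a simulated (null) phenotype, fits one GLS model per simulated phenotype/LV pair, and saves the LV coefficient and one-sided p-value of every model to a pickle file.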

# Environment variables

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import conf

In [None]:
# Value substituted into the *_NUM_THREADS environment variables set in the
# next cell. The value from the project configuration is left commented out
# and a fixed value is used instead.
# N_JOBS = conf.GENERAL["N_JOBS"]
# set N_JOBS manually
N_JOBS = 2
display(N_JOBS)

In [None]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

# Modules

In [None]:
import itertools

import numpy as np
import pandas as pd
from tqdm import tqdm

from gls import GLSPhenoplier

# Settings

In [None]:
# Number of real phenotypes that are sampled and shuffled into simulated ones
N_SIMULATED_PHENOTYPES = 100

In [None]:
# Directory that receives this notebook's output, taken from the project configuration
OUTPUT_DIR = conf.RESULTS["GLS"]
display(OUTPUT_DIR)

# Create the directory (and any missing parents); no error if it already exists
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

In [None]:
# Pickle file the results are written to: periodically while the models are
# being fitted and once more at the end of the notebook
OUTPUT_FILENAME = OUTPUT_DIR / "gls-null_simulations-real_data.pkl"
display(OUTPUT_FILENAME)

# Load data

## MultiPLIER Z matrix

In [None]:
# Read the pickled Z matrix from the path given in the project configuration.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
multiplier_z_matrix = pd.read_pickle(conf.MULTIPLIER["MODEL_Z_MATRIX_FILE"])

In [None]:
multiplier_z_matrix.shape

In [None]:
multiplier_z_matrix.head()

In [None]:
# The column labels of the Z matrix are used below as the LV identifiers,
# one model being fitted per (phenotype, LV) pair
lv_codes = list(multiplier_z_matrix.columns)
display(lv_codes[:5])

# GLSPhenoplier

## Functions

In [None]:
def get_df_from_results(results_list):
    """Convert a list of per-model result records into a dataframe.

    Parameters
    ----------
    results_list : list of dict
        One record per fitted model. Every record must provide the
        "phenotype" and "lv" keys; any other key becomes a regular column.

    Returns
    -------
    pandas.DataFrame
        One row per record, with the "phenotype" and "lv" columns stored
        as categoricals. An empty `results_list` yields an empty dataframe
        that has just those two categorical columns.
    """
    categorical_dtypes = {
        "phenotype": "category",
        "lv": "category",
    }

    df = pd.DataFrame(results_list)

    # Without any record there are no columns at all, and astype() would
    # raise KeyError for the missing "phenotype"/"lv" columns.
    if df.shape[1] == 0:
        df = pd.DataFrame(columns=list(categorical_dtypes))

    return df.astype(categorical_dtypes)

## Load `phenotype_assocs` and `lv_weights`

In [None]:
# _get_data is a private helper of GLSPhenoplier. Its first returned element
# is dropped by the [1:] slice, and the slice must contain exactly two items
# for the unpacking below to succeed.
# NOTE(review): the meaning of the dropped element is not visible here — confirm in gls.py
phenotype_assocs, lv_weights = GLSPhenoplier._get_data(
    conf.PHENOMEXCAN["SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"]
)[1:]

In [None]:
phenotype_assocs.shape

In [None]:
phenotype_assocs.head()

In [None]:
# The columns of phenotype_assocs are the candidate phenotypes to sample from
phenotype_list = list(phenotype_assocs.columns)
display(phenotype_list[:5])

In [None]:
lv_weights.shape

In [None]:
lv_weights.head()

## Generate simulated phenotypes

In [None]:
# Single seeded generator shared by the phenotype sampling, the per-phenotype
# shuffling and the shuffling of the run order below; because its state is
# consumed in sequence, re-running one of those cells alone changes the draws.
rs = np.random.RandomState(0)

In [None]:
# Draw, without replacement, the real phenotypes that will be turned into
# simulated ones
phenotype_codes = rs.choice(phenotype_list, size=N_SIMULATED_PHENOTYPES, replace=False)
display(phenotype_codes[:3])
display(len(phenotype_codes))
assert len(phenotype_codes) == N_SIMULATED_PHENOTYPES

simulated_phenotypes = {}

for phenotype_code in phenotype_codes:
    real_phenotype = phenotype_assocs[phenotype_code]

    # Shuffle a copy of the underlying array instead of the Series itself.
    # Shuffling a Series makes numpy swap elements one by one through
    # Series[int] lookups, which relies on the deprecated positional fallback
    # when the index is not integer-based and is much slower.
    shuffled_values = real_phenotype.to_numpy(copy=True)
    rs.shuffle(shuffled_values)

    # Same index and name as the real phenotype, only the values are permuted
    phenotype = pd.Series(
        shuffled_values,
        index=real_phenotype.index,
        name=real_phenotype.name,
    )

    simulated_phenotypes[phenotype_code] = phenotype

In [None]:
display(len(simulated_phenotypes))
assert len(simulated_phenotypes) == N_SIMULATED_PHENOTYPES

In [None]:
simulated_phenotypes[list(simulated_phenotypes.keys())[0]]

In [None]:
simulated_phenotypes = pd.DataFrame(simulated_phenotypes)

In [None]:
simulated_phenotypes.shape

In [None]:
simulated_phenotypes.head()

In [None]:
simulated_phenotypes.describe()

## Merge simulated phenotypes and LVs into one dataframe

In [None]:
# Every simulated phenotype is paired with every LV: phenotypes vary slowest,
# LVs fastest, giving one row per model to fit.
phenotype_lv_pairs = [
    (phenotype_code, lv_code)
    for phenotype_code in list(simulated_phenotypes.columns)
    for lv_code in lv_codes
]

run_confs = pd.DataFrame(data=phenotype_lv_pairs, columns=["phenotype", "lv"])

In [None]:
display(run_confs)
assert run_confs.shape[0] == int(N_SIMULATED_PHENOTYPES * len(lv_codes))

## Run

In [None]:
results = []

# The (phenotype, LV) pairs are visited in shuffled order; the shuffle draws
# from the shared `rs` generator.
shuffled_run_confs = run_confs.sample(frac=1, random_state=rs)

# Using tqdm as a context manager closes the progress bar even when fitting
# one of the models raises.
with tqdm(total=run_confs.shape[0]) as pbar:
    for phenotype_code, lv_code in shuffled_run_confs.itertuples(
        name=None, index=False
    ):
        pbar.set_description(f"{phenotype_code} - {lv_code}")

        phenotype = simulated_phenotypes[phenotype_code]

        # A new model object is created for every (phenotype, LV) pair
        gls_model = GLSPhenoplier(
            smultixcan_result_set_filepath=conf.PHENOMEXCAN[
                "SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"
            ]
        )
        gls_model.fit_named(lv_code, phenotype)
        res = gls_model.results

        # Only the "lv" term of the fitted model is kept
        results.append(
            {
                "phenotype": phenotype_code,
                "lv": lv_code,
                "coef": res.params.loc["lv"],
                "pvalue": res.pvalues_onesided.loc["lv"],
                # optional fields, currently disabled:
                #                 "pvalue_twosided": res.pvalues.loc["lv"],
                #                 "summary": gls_model.results_summary,
            }
        )

        # save results every 10 models trained, so partial results survive
        # an interrupted run (the file is overwritten each time)
        if (len(results) % 10) == 0:
            get_df_from_results(results).to_pickle(OUTPUT_FILENAME)

        pbar.update(1)

In [None]:
results = get_df_from_results(results)

In [None]:
results.shape

In [None]:
results.head()

In [None]:
results.sort_values("pvalue").head(10)

## Save

In [None]:
results.to_pickle(OUTPUT_FILENAME)