# Description

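Runs null simulations for `GLSPhenoplier`: it generates random phenotypes (absolute values of standard normal draws), fits a GLS model for each simulated phenotype code against every latent variable (LV), and saves the coefficient, one-sided and two-sided p-values, and model summary of each fit to a pickle file.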

# Environment variables

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell execution so edits to local modules are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
import conf

In [3]:
# Read from the project configuration; used below as the thread count for
# the numerical libraries' *_NUM_THREADS environment variables.
N_JOBS = conf.GENERAL["N_JOBS"]
display(N_JOBS)

3

In [4]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

env: MKL_NUM_THREADS=3
env: OPEN_BLAS_NUM_THREADS=3
env: NUMEXPR_NUM_THREADS=3
env: OMP_NUM_THREADS=3


# Modules

In [5]:
from pathlib import Path

import statsmodels.api as sm
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
from tqdm import tqdm

from gls import GLSPhenoplier

# Settings

In [6]:
# Number of random phenotypes to simulate; each one is tested against every LV.
N_SIMULATED_PHENOTYPES = 50

In [7]:
# Directory where the GLS results are written (taken from the project config).
OUTPUT_DIR = conf.RESULTS["GLS"]

# Create the directory, including missing parents; a no-op if it already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

display(OUTPUT_DIR)

PosixPath('/opt/data/results/gls')

In [8]:
# Pickle file that receives both the periodic checkpoints and the final results.
OUTPUT_FILENAME = OUTPUT_DIR.joinpath("gls-null_simulations.pkl")
display(OUTPUT_FILENAME)

PosixPath('/opt/data/results/gls/gls-null_simulations.pkl')

# Load data

## MultiPLIER Z matrix

In [9]:
# Load the MultiPLIER Z matrix from the pickle file configured in `conf`.
# NOTE: unpickling executes arbitrary code; only load files from trusted sources.
multiplier_z_matrix = pd.read_pickle(conf.MULTIPLIER["MODEL_Z_MATRIX_FILE"])

In [10]:
# (number of rows, number of columns) of the Z matrix
multiplier_z_matrix.shape

(6750, 987)

In [11]:
# Preview the first rows; columns are the LV codes (LV1, LV2, ...).
multiplier_z_matrix.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
GAS6,0.0,0.0,0.039438,0.0,0.050476,0.0,0.0,0.0,0.590949,0.0,...,0.050125,0.0,0.033407,0.0,0.0,0.005963,0.347362,0.0,0.0,0.0
MMP14,0.0,0.0,0.0,0.0,0.070072,0.0,0.0,0.004904,1.720179,2.423595,...,0.0,0.0,0.001007,0.0,0.035747,0.0,0.0,0.0,0.014978,0.0
DSP,0.0,0.0,0.0,0.0,0.0,0.041697,0.0,0.005718,0.0,0.0,...,0.020853,0.0,0.0,0.0,0.0,0.005774,0.0,0.0,0.0,0.416405
MARCKSL1,0.305212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161843,0.149471,...,0.027134,0.05272,0.0,0.030189,0.060884,0.0,0.0,0.0,0.0,0.44848
SPARC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.067779,0.0,0.122417,0.062665


In [12]:
# Column labels of the Z matrix are the LV identifiers to iterate over later.
lv_codes = multiplier_z_matrix.columns.tolist()

# Show only a small sample of the codes.
display(lv_codes[:5])

['LV1', 'LV2', 'LV3', 'LV4', 'LV5']

## PhenomeXcan (S-MultiXcan)

In [13]:
# INPUT_SUBSET = "z_score_std"

In [14]:
# INPUT_STEM = "projection-smultixcan-efo_partial-mashr-zscores"

In [15]:
# input_filepath = Path(
#     conf.RESULTS["DATA_TRANSFORMATIONS_DIR"],
#     INPUT_SUBSET,
#     f"{INPUT_SUBSET}-{INPUT_STEM}.pkl",
# ).resolve()

In [16]:
# data = pd.read_pickle(input_filepath)

In [17]:
# data.shape

In [18]:
# data.head()

# GLSPhenoplier

## Load `lv_weights`

In [19]:
# `_get_data` returns a sequence; its third element (index 2) is kept as the
# LV weights used to size and index the simulated phenotypes.
smultixcan_zscores_file = conf.PHENOMEXCAN["SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"]
lv_weights = GLSPhenoplier._get_data(smultixcan_zscores_file)[2]

In [20]:
# (number of rows, number of columns) of the LV weights
lv_weights.shape

(6450, 987)

In [21]:
# Preview the first rows of the LV weights.
lv_weights.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
GAS6,0.0,0.0,0.039438,0.0,0.050476,0.0,0.0,0.0,0.590949,0.0,...,0.050125,0.0,0.033407,0.0,0.0,0.005963,0.347362,0.0,0.0,0.0
MMP14,0.0,0.0,0.0,0.0,0.070072,0.0,0.0,0.004904,1.720179,2.423595,...,0.0,0.0,0.001007,0.0,0.035747,0.0,0.0,0.0,0.014978,0.0
DSP,0.0,0.0,0.0,0.0,0.0,0.041697,0.0,0.005718,0.0,0.0,...,0.020853,0.0,0.0,0.0,0.0,0.005774,0.0,0.0,0.0,0.416405
MARCKSL1,0.305212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161843,0.149471,...,0.027134,0.05272,0.0,0.030189,0.060884,0.0,0.0,0.0,0.0,0.44848
SPARC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.067779,0.0,0.122417,0.062665


## Run

In [22]:
# Fixed-seed random generator so the simulated phenotypes are reproducible.
rs = np.random.RandomState(seed=0)

In [None]:
# Null simulation: fit one GLSPhenoplier model per (simulated phenotype, LV)
# pair and collect the coefficient, p-values and summary of each fit.
results = []

# total number of models to fit, used to size the progress bar
n_models = N_SIMULATED_PHENOTYPES * len(lv_codes)

# the context manager closes the progress bar even if a fit raises
with tqdm(total=n_models) as pbar:
    for idx in range(N_SIMULATED_PHENOTYPES):
        # generate a random phenotype ONCE per index, so that every result row
        # labelled with this phenotype code was fitted against the same
        # simulated vector (previously a new vector was drawn for every LV)
        phenotype_code = f"random_normal-{idx}"

        phenotype = pd.Series(
            # use abs to simulate MultiPLIER z-scores (always positives)
            np.abs(rs.normal(size=lv_weights.shape[0])),
            index=lv_weights.index.copy(),
            name=phenotype_code,
        )

        # compute an association for all LVs
        for lv_code in lv_codes:
            pbar.set_description(f"{phenotype_code} - {lv_code}")

            gls_model = GLSPhenoplier(
                smultixcan_result_set_filepath=conf.PHENOMEXCAN[
                    "SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"
                ]
            )
            # pass a copy: how fit_named treats its input is not visible from
            # this notebook, and the phenotype is now shared across all LVs
            gls_model.fit_named(lv_code, phenotype.copy())
            res = gls_model.results

            results.append(
                {
                    "phenotype": phenotype_code,
                    "lv": lv_code,
                    "coef": res.params.loc["lv"],
                    "pvalue": res.pvalues_onesided.loc["lv"],
                    "pvalue_twosided": res.pvalues.loc["lv"],
                    "summary": gls_model.results_summary,
                }
            )

            # save results every 10 models trained (checkpoint for long runs)
            if (len(results) % 10) == 0:
                pd.DataFrame(results).to_pickle(OUTPUT_FILENAME)

            pbar.update(1)

random_normal-0 - LV67:   0%|          | 66/49350 [13:04<220:47:54, 16.13s/it]

In [None]:
# Rebinds `results` from the list of per-model dicts to a DataFrame
# (one row per fitted model).
results = pd.DataFrame(results)

In [None]:
# (number of fitted models, number of recorded fields)
results.shape

In [None]:
# Preview the first result rows.
results.head()

In [None]:
# Ten fits with the smallest one-sided p-values ("pvalue" column).
results.sort_values("pvalue").head(10)

## Save

In [None]:
# Final save; overwrites the periodic checkpoint written to the same path
# during the simulation loop.
results.to_pickle(OUTPUT_FILENAME)