# Description

* reads MultiXcan results on a random phenotype file (using Elastic Net models)
* runs PhenoPLIER on all LVs to compute the null

# Environment variables

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import conf

In [None]:
# N_JOBS = conf.GENERAL["N_JOBS"]
# set N_JOBS manually, because we are parallelizing outside:
# N_JOBS here only caps the thread count exported to the numerical libraries
# (the *_NUM_THREADS variables set in the next cell). The parallelism itself
# comes from the process pool in the "Run" section, which is sized with
# conf.GENERAL["N_JOBS"].
N_JOBS = 1
display(N_JOBS)

In [None]:
%env MKL_NUM_THREADS=$N_JOBS
%env OPEN_BLAS_NUM_THREADS=$N_JOBS
%env NUMEXPR_NUM_THREADS=$N_JOBS
%env OMP_NUM_THREADS=$N_JOBS

# Modules

In [None]:
import itertools
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
from scipy import stats
import pandas as pd
from tqdm import tqdm

from utils import chunker
from entity import Gene
from gls import GLSPhenoplier

# Settings

In [None]:
# Total number of phenotypes in the simulation, counting the observed
# (unshuffled) one; with a value of 1 no shuffled phenotypes are generated.
N_SIMULATED_PHENOTYPES = 1  # disable generation of derived random phenotypes
# Size argument passed to `chunker` when the run configurations are split
# (presumably rows per chunk -- confirm against utils.chunker).
CHUNK_SIZE = 50
# Value passed as `model_type` to GLSPhenoplier.
EQTL_MODEL = "ELASTIC_NET"

# Paths

In [None]:
OUTPUT_DIR = conf.RESULTS["GLS"] / "null_simulations"
display(OUTPUT_DIR)

OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

In [None]:
OUTPUT_FILENAME = OUTPUT_DIR / "en-null_simulations.pkl"
display(OUTPUT_FILENAME)

# Load data

## MultiXcan on random phenotype

This result was downloaded from the MultiXcan paper repository: https://github.com/hakyimlab/multixcan-paper

In [None]:
multixcan_random_phenotype = pd.read_csv(
    conf.PHENOMEXCAN["BASE_DIR"] / "random__ccn30__mt_results.txt",
    sep="\t",
    usecols=["gene", "pvalue"],
)

In [None]:
multixcan_random_phenotype.shape

In [None]:
multixcan_random_phenotype.head()

In [None]:
# Keep only the part of each gene ID that precedes the first "." character.
_gene_ids = multixcan_random_phenotype["gene"]
multixcan_random_phenotype["gene"] = _gene_ids.str.split(".", n=1).str[0]

In [None]:
multixcan_random_phenotype = multixcan_random_phenotype.set_index("gene")

In [None]:
multixcan_random_phenotype.head()

In [None]:
assert multixcan_random_phenotype.index.is_unique

## MultiPLIER Z matrix

In [None]:
# multiplier_z_matrix = pd.read_pickle(conf.MULTIPLIER["MODEL_Z_MATRIX_FILE"])

In [None]:
# multiplier_z_matrix.shape

In [None]:
# multiplier_z_matrix.head()

In [None]:
# lv_codes = list(multiplier_z_matrix.columns)
# display(lv_codes[:5])

# Preprocess MultiXcan results

## Convert gene IDs to Gene names

In [None]:
smultixcan_results = multixcan_random_phenotype.rename(index=Gene.GENE_ID_TO_NAME_MAP)

In [None]:
smultixcan_results.shape

In [None]:
smultixcan_results.head()

## Check for duplicated gene entries

In [None]:
_tmp = smultixcan_results.index[smultixcan_results.index.duplicated(keep="first")]
assert _tmp.shape[0] == 0

## Convert p-values to z-scores

In [None]:
# Map each p-value to an unsigned z-score: the absolute value of the standard
# normal quantile at pvalue / 2.
_pvalues = smultixcan_results["pvalue"].to_numpy()
_zscores = np.abs(stats.norm.ppf(_pvalues / 2))
smultixcan_results = smultixcan_results.assign(zscore=_zscores)

In [None]:
smultixcan_results = smultixcan_results.drop(columns="pvalue").squeeze()

In [None]:
smultixcan_results.head()

In [None]:
smultixcan_results.describe()

## Some checks

In [None]:
# the data should have no NaN values
assert smultixcan_results.shape == smultixcan_results.dropna(how="any").shape

# GLSPhenoplier

## Identify clusters of non-related genes

In [None]:
# The first object returned by `_get_data` is kept as the gene-gene correlation
# matrix. Use the EQTL_MODEL setting (instead of a hard-coded model name) so
# this matrix always matches the model type used for `lv_weights` and for the
# GLSPhenoplier fits further below.
en_gene_corr = GLSPhenoplier._get_data(
    conf.PHENOMEXCAN["SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"],
    model_type=EQTL_MODEL,
)[0]

In [None]:
_comm_genes = en_gene_corr.index.intersection(smultixcan_results.index)

In [None]:
en_gene_corr = en_gene_corr.loc[_comm_genes, _comm_genes]

In [None]:
en_gene_corr.shape

In [None]:
en_gene_corr.head()

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
# The "distance" between two genes is their absolute correlation, with the
# diagonal forced to zero. The diagonal is filled on an explicit NumPy copy
# rather than through `DataFrame.values`: writing through `.values` only works
# while pandas hands back a writable view of the data, which is not guaranteed
# (for instance, it is read-only when copy-on-write is enabled).
_abs_corr = en_gene_corr.abs().to_numpy(copy=True)
np.fill_diagonal(_abs_corr, 0.0)
en_gene_dist = pd.DataFrame(
    _abs_corr, index=en_gene_corr.index, columns=en_gene_corr.columns
)

In [None]:
en_gene_dist

In [None]:
_tmp = en_gene_dist.unstack()
_tmp = _tmp[(_tmp > 0.0) & (_tmp < 1.0)]

In [None]:
_tmp.sort_values()

In [None]:
# Complete-linkage clustering on a precomputed matrix (fitted below on
# `en_gene_dist`, the absolute gene-gene correlations with a zero diagonal).
# With n_clusters=None the tree is cut at `distance_threshold`; with a threshold
# this close to zero, two groups can only be merged when the largest pairwise
# value between their genes is (numerically) zero, so every resulting cluster
# is a set of mutually uncorrelated genes.
# NOTE(review): newer scikit-learn releases rename `affinity` to `metric` --
# confirm against the scikit-learn version pinned for this project.
ac = AgglomerativeClustering(
    n_clusters=None,
    compute_full_tree=True,
    linkage="complete",
    affinity="precomputed",
    distance_threshold=1e-100,
)

In [None]:
ac.fit(en_gene_dist)

In [None]:
gene_part = pd.Series(ac.labels_)
display(gene_part.value_counts())

In [None]:
en_gene_dist.loc[(ac.labels_ == 141), (ac.labels_ == 141)]

In [None]:
# Map each cluster label to the z-scores of the genes assigned to it. Clusters
# are visited in the order given by value_counts() (largest cluster first).
phenotype_gene_clusters = {}
for _cluster_label in gene_part.value_counts().index:
    _cluster_genes = en_gene_dist.index[gene_part == _cluster_label]
    phenotype_gene_clusters[_cluster_label] = smultixcan_results.loc[_cluster_genes]

In [None]:
phenotype_gene_clusters[141]

## Functions

In [None]:
rs = np.random.RandomState(0)

In [None]:
def get_shuffled_phenotype():
    """
    Build a new phenotype by shuffling gene associations within each cluster.

    For every cluster in the module-level `phenotype_gene_clusters`, the values
    are randomly permuted (using the module-level random state `rs`) while the
    gene labels keep their positions; the per-cluster results are then
    concatenated in the iteration order of `phenotype_gene_clusters`. The
    stored clusters are not modified.

    Returns:
        A pandas Series with the same index labels as the concatenated
        clusters and the within-cluster-shuffled values.
    """
    shuffled_gene_clusters = []
    for gene_assoc_cluster in phenotype_gene_clusters.values():
        # Permute the underlying NumPy values instead of calling
        # `rs.shuffle` on the Series itself: shuffling a Series goes through
        # NumPy's generic (non-ndarray) path, which swaps elements using
        # integer keys on a label-indexed Series.
        shuffled_values = rs.permutation(gene_assoc_cluster.to_numpy())
        shuffled_gene_clusters.append(
            pd.Series(
                shuffled_values,
                index=gene_assoc_cluster.index,
                name=gene_assoc_cluster.name,
            )
        )

    return pd.concat(shuffled_gene_clusters)

In [None]:
def get_df_from_results(results_list):
    """
    Turn a list of result records into a dataframe.

    Args:
        results_list: records accepted by the pandas DataFrame constructor;
            they must provide the "phenotype" and "lv" columns.

    Returns:
        A dataframe with one row per record, where the "phenotype" and "lv"
        columns are stored with the "category" dtype.
    """
    categorical_columns = ("phenotype", "lv")
    frame = pd.DataFrame(results_list)
    return frame.astype(dict.fromkeys(categorical_columns, "category"))

## Load `lv_weights`

In [None]:
lv_weights = GLSPhenoplier._get_data(
    conf.PHENOMEXCAN["SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"],
    model_type=EQTL_MODEL,
)[2]

In [None]:
lv_weights.shape

In [None]:
lv_weights.head()

## Generate simulated phenotypes

In [None]:
# phenotype_codes = rs.choice(phenotype_list, size=N_SIMULATED_PHENOTYPES, replace=False)
# display(phenotype_codes[:3])
# display(len(phenotype_codes))
# assert len(phenotype_codes) == N_SIMULATED_PHENOTYPES

# Phenotype 0 is the observed z-score vector, restricted to (and ordered like)
# the genes present in the distance matrix.
_observed_phenotype = smultixcan_results.loc[en_gene_dist.index]
simulated_phenotypes = {"smultixcan phenotype 0": _observed_phenotype}

# Every additional phenotype is produced by get_shuffled_phenotype().
for _sim_idx in tqdm(range(1, N_SIMULATED_PHENOTYPES)):
    _sim_name = f"smultixcan phenotype {_sim_idx}"
    simulated_phenotypes[_sim_name] = get_shuffled_phenotype()

In [None]:
display(len(simulated_phenotypes))
assert len(simulated_phenotypes) == N_SIMULATED_PHENOTYPES

In [None]:
simulated_phenotypes[list(simulated_phenotypes.keys())[0]]

In [None]:
simulated_phenotypes = pd.DataFrame(simulated_phenotypes)

In [None]:
simulated_phenotypes.shape

In [None]:
simulated_phenotypes.head()

In [None]:
simulated_phenotypes.describe()

## Merge simulated phenotypes and LVs into one dataframe

In [None]:
# smultixcan_results = smultixcan_results.loc[smultixcan_results.index.intersection(lv_weights.index)]

In [None]:
# smultixcan_results.shape

In [None]:
# smultixcan_results.head()

In [None]:
# assert not smultixcan_results.isna().any()

In [None]:
# simulated_phenotypes = pd.DataFrame({"smultixcan_random_phenotype": smultixcan_results})

In [None]:
# simulated_phenotypes.shape

In [None]:
# simulated_phenotypes.head()

In [None]:
# One row per (phenotype, LV) combination: the Cartesian product of the
# simulated phenotype names and the LV names.
_phenotype_names = list(simulated_phenotypes.columns)
_lv_names = list(lv_weights.columns)
run_confs = pd.DataFrame(
    data=list(itertools.product(_phenotype_names, _lv_names)),
    columns=["phenotype", "lv"],
)

In [None]:
run_confs

## Split run configurations

In [None]:
run_confs_chunks = chunker(run_confs.sample(frac=1, random_state=rs), CHUNK_SIZE)

## Run

In [None]:
def run(run_confs_subset):
    """
    Fit a GLSPhenoplier model for every (phenotype, LV) pair of a chunk.

    Args:
        run_confs_subset: dataframe whose rows are unpacked positionally as
            (phenotype code, LV code); the phenotype code must be a column of
            the module-level `simulated_phenotypes`.

    Returns:
        The dataframe built by `get_df_from_results`, with one row per pair
        holding the phenotype code, the LV code, the fitted "lv" coefficient
        and its one-sided p-value.
    """
    records = []

    for pheno_name, lv_name in run_confs_subset.itertuples(name=None, index=False):
        pheno_values = simulated_phenotypes[pheno_name]

        # a fresh model is created for each pair
        model = GLSPhenoplier(
            smultixcan_result_set_filepath=conf.PHENOMEXCAN[
                "SMULTIXCAN_EFO_PARTIAL_MASHR_ZSCORES_FILE"
            ],
            model_type=EQTL_MODEL,
        )
        model.fit_named(lv_name, pheno_values)
        fit = model.results

        record = {
            "phenotype": pheno_name,
            "lv": lv_name,
            "coef": fit.params.loc["lv"],
            "pvalue": fit.pvalues_onesided.loc["lv"],
        }
        records.append(record)

    return get_df_from_results(records)

In [None]:
# Submit every chunk to a pool of worker processes and collect the per-chunk
# dataframes as they finish. A snapshot of what has been collected so far is
# pickled every conf.GENERAL["N_JOBS"] completed chunks.
all_results = []

with tqdm(total=run_confs.shape[0]) as pbar, ProcessPoolExecutor(
    max_workers=conf.GENERAL["N_JOBS"]
) as executor:
    pending = [executor.submit(run, conf_chunk) for conf_chunk in run_confs_chunks]

    for finished in as_completed(pending):
        chunk_results = finished.result()
        all_results.append(chunk_results)

        if not len(all_results) % conf.GENERAL["N_JOBS"]:
            snapshot = pd.concat(all_results, ignore_index=True)
            snapshot.to_pickle(OUTPUT_FILENAME)

        # the progress bar counts run configurations, not chunks
        pbar.update(chunk_results.shape[0])

In [None]:
all_results = pd.concat(all_results, ignore_index=True)

In [None]:
# results = get_df_from_results(results)

In [None]:
all_results.shape

In [None]:
all_results.head()

In [None]:
all_results.sort_values("pvalue").head(10)

## Save

In [None]:
all_results.to_pickle(OUTPUT_FILENAME)