# Description

(Please, take a look at the README.md file in this directory for instructions on how to run this notebook)

It computes an LV-specific correlation matrix by using the top genes in that LV only.

It has specific parameters for papermill (see under `Settings` below).

This notebook is not directly run. See README.md.

# Modules

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
from scipy.spatial.distance import squareform
from scipy import sparse
import pandas as pd
from tqdm import tqdm

import conf
from utils import chunker
from entity import Gene
from gls import GLSPhenoplier

# Settings

In [3]:
# a cohort name (it could be something like UK_BIOBANK, etc)
COHORT_NAME = None

# reference panel such as 1000G or GTEX_V8
REFERENCE_PANEL = "GTEX_V8"

# predictions models such as MASHR or ELASTIC_NET
EQTL_MODEL = "MASHR"

# A range of LVs in the format X-Y, such as 1-50 (from LV1 to LV50).
# If None, all LVs will be processed.
LV_RANGE = None

# A number from 0.0 to 1.0 indicating the top percentile of the genes in the LV to keep.
# A value of 0.01 would take the top 1% of the genes in the LV.
# If zero or None, then all nonzero weighted genes in the LV will be kept.
LV_PERCENTILE = None

# Base directory for the results; files are read from and written to
# <OUTPUT_DIR_BASE>/gene_corrs/<cohort name>.
# It has no usable default: declaring it as None here lets the validation below
# fail with its own message instead of a NameError when it is not provided.
OUTPUT_DIR_BASE = None

In [4]:
# Parameters
# NOTE(review): this looks like the cell injected by papermill for one specific run;
# the values below override the defaults declared in the Settings cell.
# LV_PERCENTILE arrives as a string here and is converted to float in a later cell.
PHENOPLIER_NOTEBOOK_FILEPATH = (
    "projects/asthma-copd/nbs/20_gene_corrs/jobs/15-create_corr_mat_per_lv.ipynb"
)
COHORT_NAME = "asthma_only"
LV_RANGE = "151-200"
LV_PERCENTILE = "0.01"
OUTPUT_DIR_BASE = "/opt/data/projects/asthma-copd/results/gls_phenoplier"


In [5]:
# number of worker processes given to the ProcessPoolExecutor that processes the LV chunks
N_JOBS = 1

In [6]:
# the cohort name is mandatory: fail early if it was not provided
assert COHORT_NAME is not None and len(COHORT_NAME) > 0, "A cohort name must be given"

# normalize to lower case; the cohort name becomes part of the output path
COHORT_NAME = COHORT_NAME.lower()
display(f"Cohort name: {COHORT_NAME}")

'Cohort name: asthma_only'

In [7]:
# the reference panel is mandatory; its name is stored later in the output metadata
assert (
    REFERENCE_PANEL is not None and len(REFERENCE_PANEL) > 0
), "A reference panel must be given"

display(f"Reference panel: {REFERENCE_PANEL}")

'Reference panel: GTEX_V8'

In [8]:
# the prediction model is mandatory; its name is stored later in the output metadata
assert (
    EQTL_MODEL is not None and len(EQTL_MODEL) > 0
), "A prediction/eQTL model must be given"

# the files prefix of the model is looked up in the project configuration under
# the key "<EQTL_MODEL>_PREFIX"
EQTL_MODEL_FILES_PREFIX = conf.PHENOMEXCAN["PREDICTION_MODELS"][f"{EQTL_MODEL}_PREFIX"]
display(f"eQTL model: {EQTL_MODEL} / {EQTL_MODEL_FILES_PREFIX}")

'eQTL model: MASHR / mashr_'

In [9]:
# LV_PERCENTILE may be given as a string (for instance, when passed as a notebook
# parameter), so it is converted to float before being used.
if LV_PERCENTILE is not None:
    LV_PERCENTILE = float(LV_PERCENTILE)

    # the setting is documented as a proportion from 0.0 to 1.0; reject anything
    # else here (this also rejects NaN) instead of failing later in the computation
    assert (
        0.0 <= LV_PERCENTILE <= 1.0
    ), "LV_PERCENTILE must be a number from 0.0 to 1.0"

display(f"LV percentile: {LV_PERCENTILE}")

'LV percentile: 0.01'

In [10]:
# an output directory is mandatory
assert (
    OUTPUT_DIR_BASE is not None and len(OUTPUT_DIR_BASE) > 0
), "Output directory path must be given"

# NOTE: OUTPUT_DIR_BASE is rebound here from the given base path (a string) to the
# derived, cohort-specific Path <base>/gene_corrs/<COHORT_NAME>. Re-running this cell
# alone therefore fails at the assertion above, since a Path object has no len().
OUTPUT_DIR_BASE = (Path(OUTPUT_DIR_BASE) / "gene_corrs" / COHORT_NAME).resolve()

OUTPUT_DIR_BASE.mkdir(parents=True, exist_ok=True)

display(f"Using output dir base: {OUTPUT_DIR_BASE}")

'Using output dir base: /opt/data/projects/asthma-copd/results/gls_phenoplier/gene_corrs/asthma_only'

# Load data

## Gene correlations

In [11]:
# collect the gene correlation files available for this cohort; the list is displayed
# before the check so it is visible (even if empty) when the assertion fails
input_files = list(OUTPUT_DIR_BASE.glob("gene_corrs-symbols*.pkl"))
display(input_files)
assert len(input_files) > 0, "No input correlation files"

[PosixPath('/opt/data/projects/asthma-copd/results/gls_phenoplier/gene_corrs/asthma_only/gene_corrs-symbols.pkl')]

In [12]:
# load correlation matrix (one entry per input file, keyed by file name)
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source
gene_corrs_dict = {f.name: pd.read_pickle(f) for f in input_files}

In [13]:
# file name (key in gene_corrs_dict) of the correlation matrix used as reference
# in the checks below
orig_corr_name = "gene_corrs-symbols.pkl"

In [14]:
# dimensions of the reference correlation matrix
gene_corrs_dict[orig_corr_name].shape

(6443, 6443)

In [15]:
# preview the first rows of the reference correlation matrix
gene_corrs_dict[orig_corr_name].head()

Unnamed: 0,NOC2L,HES4,ISG15,AGRN,TNFRSF18,TNFRSF4,B3GALT6,UBE2J2,ACAP3,TAS1R3,...,PLXNB2,ADM2,MIOX,SCO2,TYMP,CPT1B,CHKB,MAPK8IP2,ARSA,SHANK3
NOC2L,1.0,0.12059,0.177063,0.085746,0.006792,0.007685,0.004673,0.010734,0.0112,0.007529,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HES4,0.12059,1.0,0.67206,0.391127,0.011695,0.009275,0.00597,0.007385,0.002641,0.004767,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ISG15,0.177063,0.67206,1.0,0.424696,0.013929,0.013311,0.008466,0.010608,0.006689,0.011659,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AGRN,0.085746,0.391127,0.424696,1.0,0.005309,0.011848,0.002161,0.002535,0.005359,0.007012,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TNFRSF18,0.006792,0.011695,0.013929,0.005309,1.0,0.338085,0.550817,0.154394,0.193627,0.101004,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# every loaded correlation matrix must have exactly the same row labels (same
# genes, same order) as the reference one
current_index = gene_corrs_dict[orig_corr_name].index
assert all(
    current_index.equals(corr_df.index) for corr_df in gene_corrs_dict.values()
), "Correlation matrices are not compatible"

## MultiPLIER Z

In [17]:
# load the MultiPLIER Z matrix from the path given in the project configuration
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source
multiplier_z = pd.read_pickle(conf.MULTIPLIER["MODEL_Z_MATRIX_FILE"])

In [18]:
# dimensions of the Z matrix (the preview below shows genes in rows, LVs in columns)
multiplier_z.shape

(6750, 987)

In [19]:
# preview the first rows of the Z matrix
multiplier_z.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
GAS6,0.0,0.0,0.039438,0.0,0.050476,0.0,0.0,0.0,0.590949,0.0,...,0.050125,0.0,0.033407,0.0,0.0,0.005963,0.347362,0.0,0.0,0.0
MMP14,0.0,0.0,0.0,0.0,0.070072,0.0,0.0,0.004904,1.720179,2.423595,...,0.0,0.0,0.001007,0.0,0.035747,0.0,0.0,0.0,0.014978,0.0
DSP,0.0,0.0,0.0,0.0,0.0,0.041697,0.0,0.005718,0.0,0.0,...,0.020853,0.0,0.0,0.0,0.0,0.005774,0.0,0.0,0.0,0.416405
MARCKSL1,0.305212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161843,0.149471,...,0.027134,0.05272,0.0,0.030189,0.060884,0.0,0.0,0.0,0.0,0.44848
SPARC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.067779,0.0,0.122417,0.062665


# Compute the correlation matrix and the inverse of its Cholesky factor for each LV

In [20]:
def exists_df(output_dir, base_filename):
    """
    Tell whether `<base_filename>.npz` is already present in `output_dir`.

    Args:
        output_dir: a pathlib.Path pointing to the directory to look into.
        base_filename: file name without the ".npz" extension.

    Returns:
        True if the file exists, False otherwise.
    """
    candidate = output_dir.joinpath(base_filename + ".npz")
    return candidate.exists()

In [21]:
def store_df(output_dir, nparray, base_filename):
    """
    Write `nparray` to `<output_dir>/<base_filename>.npz`.

    The "metadata" and "gene_names" files are written as compressed numpy
    archives with the array stored under the key "data". Any other file is
    converted to a scipy CSC sparse matrix and written uncompressed.

    Args:
        output_dir: a pathlib.Path pointing to the destination directory.
        nparray: the array to save.
        base_filename: file name without the ".npz" extension.
    """
    target = output_dir / (base_filename + ".npz")

    if base_filename not in ("metadata", "gene_names"):
        # numeric matrices: keep them in sparse format
        sparse.save_npz(target, sparse.csc_matrix(nparray), compressed=False)
        return

    # auxiliary arrays go into a regular numpy archive
    np.savez_compressed(target, data=nparray)

In [22]:
def get_output_dir(gene_corr_filename):
    """
    Build the per-LV output directory path for a gene correlation file.

    The correlation file must exist under OUTPUT_DIR_BASE. The returned path is
    that file's path with its last suffix replaced by ".per_lv" (for instance,
    "gene_corrs-symbols.pkl" gives "gene_corrs-symbols.per_lv"). The directory
    itself is not created here.

    Args:
        gene_corr_filename: file name of the gene correlation file.

    Returns:
        A pathlib.Path for the output directory.
    """
    corr_file = OUTPUT_DIR_BASE.joinpath(gene_corr_filename)
    assert corr_file.exists()
    return corr_file.with_suffix(".per_lv")

In [23]:
def compute_chol_inv(lv_codes):
    """
    Compute and store, for each LV, its correlation sub-matrix and the inverse of
    the Cholesky factor of that sub-matrix.

    For every correlation matrix in `gene_corrs_dict`, the results are written to
    the directory given by `get_output_dir` (created if needed):
      * `<lv_code>_corr_mat.npz`: the LV-specific correlation sub-matrix.
      * `<lv_code>.npz`: the inverse of the lower-triangular Cholesky factor of
        that sub-matrix.
      * `metadata.npz` and `gene_names.npz`: written only if they do not exist yet.

    Args:
        lv_codes: iterable with LV names; each one must be a column of
            `multiplier_z` (for instance, "LV1").

    Raises:
        numpy.linalg.LinAlgError: if the decomposition or the inversion fails for
            an LV. The message includes the LV and the correlation file name.
    """
    for gene_corr_filename, gene_corrs in gene_corrs_dict.items():
        output_dir = get_output_dir(gene_corr_filename)
        output_dir.mkdir(parents=True, exist_ok=True)
        display(f"Output dir: {output_dir}")

        # save LV chol inverse
        for lv_code in lv_codes:
            lv_data = multiplier_z[lv_code]

            # get_sub_mat is defined outside this notebook; presumably it selects the
            # sub-matrix for the top genes of the LV according to LV_PERCENTILE
            corr_mat_sub = GLSPhenoplier.get_sub_mat(gene_corrs, lv_data, LV_PERCENTILE)
            store_df(output_dir, corr_mat_sub.to_numpy(), f"{lv_code}_corr_mat")

            try:
                chol_mat = np.linalg.cholesky(corr_mat_sub)
                chol_inv = np.linalg.inv(chol_mat)
            except np.linalg.LinAlgError as e:
                # a chunk can take several minutes and contain many LVs: report which
                # one failed (the exception type is kept for callers)
                raise np.linalg.LinAlgError(
                    f"{lv_code} ({gene_corr_filename}): {e}"
                ) from e

            store_df(output_dir, chol_inv, lv_code)

        # save metadata
        if not exists_df(output_dir, "metadata"):
            metadata = np.array([REFERENCE_PANEL, EQTL_MODEL])
            store_df(output_dir, metadata, "metadata")
        else:
            display("Metadata file already exists")

        # save gene names
        if not exists_df(output_dir, "gene_names"):
            gene_names = np.array(gene_corrs.index.tolist())
            store_df(output_dir, gene_names, "gene_names")
        else:
            display("Gene names file already exists")

In [24]:
# Build the list of LV chunks to process; each chunk is handled by one worker task.
if LV_RANGE is None:
    # divide LVs in chunks for parallel processing
    display("LV_RANGE was not given")

    lvs_chunks = list(chunker(list(multiplier_z.columns), 50))
else:
    display("LV_RANGE was given")

    assert "-" in LV_RANGE, "LV_RANGE has no '-'"
    # more than one '-' would otherwise fail below with an unclear unpacking error
    assert LV_RANGE.count("-") == 1, "LV_RANGE must have the format X-Y"
    lv_min, lv_max = LV_RANGE.split("-")
    lv_min, lv_max = int(lv_min), int(lv_max)
    assert lv_min <= lv_max, "LV_RANGE is incorrect"

    # create a single chunk in this case
    lvs_chunks = [[f"LV{i}" for i in range(lv_min, lv_max + 1)]]

    # fail early if the range includes LVs that are not in the Z matrix, instead
    # of raising a KeyError later inside a worker process
    _known_lvs = set(multiplier_z.columns)
    _unknown_lvs = [lv for lv in lvs_chunks[0] if lv not in _known_lvs]
    assert (
        len(_unknown_lvs) == 0
    ), f"LV_RANGE includes LVs not present in the Z matrix: {_unknown_lvs[:5]}"

'LV_RANGE was given'

In [25]:
display(f"# of chunks: {len(lvs_chunks)}")
# only the first chunk is measured here; the size of other chunks is not inspected
display(f"# of LVs in each chunk: {len(lvs_chunks[0])}")

'# of chunks: 1'

'# of LVs in each chunk: 50'

In [26]:
# Process each chunk of LVs in a separate task, using up to N_JOBS worker
# processes; the progress bar advances once per finished chunk.
with ProcessPoolExecutor(max_workers=N_JOBS) as executor, tqdm(
    total=len(lvs_chunks), ncols=100
) as pbar:
    tasks = [executor.submit(compute_chol_inv, chunk) for chunk in lvs_chunks]
    for future in as_completed(tasks):
        # compute_chol_inv returns nothing; result() is called only so that an
        # exception raised in the worker is re-raised here instead of being lost
        future.result()
        pbar.update(1)

100%|████████████████████████████████████████████████████████████████| 1/1 [14:35<00:00, 875.95s/it]


## Some checks

In [27]:
def load_df(output_dir, base_filename):
    """
    Read `<output_dir>/<base_filename>.npz` back into a numpy array.

    The "metadata" and "gene_names" files are regular numpy archives with the
    array stored under the key "data". Any other file is a scipy sparse matrix,
    which is returned as a dense array.

    Args:
        output_dir: a pathlib.Path pointing to the directory with the file.
        base_filename: file name without the ".npz" extension.

    Returns:
        A numpy.ndarray with the stored data.
    """
    full_filepath = output_dir / (base_filename + ".npz")

    if base_filename in ("metadata", "gene_names"):
        # np.load keeps the archive open until it is closed; the context manager
        # releases the file handle once the array has been read into memory
        with np.load(full_filepath) as npz_file:
            return npz_file["data"]

    return sparse.load_npz(full_filepath).toarray()

In [28]:
# read back the gene names stored next to the per-LV results of the reference matrix
_genes = load_df(get_output_dir(orig_corr_name), "gene_names")

In [29]:
display(len(_genes))
assert len(_genes) == gene_corrs_dict[orig_corr_name].index.shape[0]

# the gene names file is not rewritten when it already exists, so a file left by a
# previous run could have the right length but different genes: compare the names
# (and their order) too
assert (
    _genes.tolist() == gene_corrs_dict[orig_corr_name].index.tolist()
), "Stored gene names do not match the correlation matrix"

6443

In [30]:
# read back the metadata stored next to the per-LV results of the reference matrix
_metadata = load_df(get_output_dir(orig_corr_name), "metadata")

In [31]:
display(_metadata)
# metadata is stored as [reference panel, eQTL model], in that order; the file is not
# rewritten when it already exists, so this also catches one left by a different run
assert _metadata[0] == REFERENCE_PANEL
assert _metadata[1] == EQTL_MODEL

array(['GTEX_V8', 'MASHR'], dtype='<U7')