# Description

**TODO UPDATE**

(Please, take a look at the README.md file in this directory for instructions on how to run this notebook)

This notebook reads all gene correlations across all chromosomes and computes a single correlation matrix by assembling a big correlation matrix with all genes.

# Modules

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
from scipy.spatial.distance import squareform
from scipy import sparse
import pandas as pd
from tqdm import tqdm

import conf
from utils import chunker
from entity import Gene
from gls import GLSPhenoplier

# Settings

In [3]:
# a cohort name (it could be something like UK_BIOBANK, etc)
COHORT_NAME = None

# reference panel such as 1000G or GTEX_V8
REFERENCE_PANEL = None

# predictions models such as MASHR or ELASTIC_NET
EQTL_MODEL = None

LV_CODE = None

# A number from 0.0 to 1.0 indicating the top percentile of the genes in the LV to keep.
# A value of 0.01 would take the top 1% of the genes in the LV.
# If zero or None, then all nonzero weighted genes in the LV will be kept.
LV_PERCENTILE = None

In [4]:
# Parameters
COHORT_NAME = "phenomexcan_rapid_gwas"
REFERENCE_PANEL = "GTEX_V8"
EQTL_MODEL = "MASHR"
LV_CODE = "LV100"
LV_PERCENTILE = "0.01"


In [5]:
N_JOBS = 1

In [6]:
assert COHORT_NAME is not None and len(COHORT_NAME) > 0, "A cohort name must be given"

COHORT_NAME = COHORT_NAME.lower()
display(f"Cohort name: {COHORT_NAME}")

'Cohort name: phenomexcan_rapid_gwas'

In [7]:
assert (
    REFERENCE_PANEL is not None and len(REFERENCE_PANEL) > 0
), "A reference panel must be given"

display(f"Reference panel: {REFERENCE_PANEL}")

'Reference panel: GTEX_V8'

In [8]:
assert (
    EQTL_MODEL is not None and len(EQTL_MODEL) > 0
), "A prediction/eQTL model must be given"

EQTL_MODEL_FILES_PREFIX = conf.PHENOMEXCAN["PREDICTION_MODELS"][f"{EQTL_MODEL}_PREFIX"]
display(f"eQTL model: {EQTL_MODEL} / {EQTL_MODEL_FILES_PREFIX}")

'eQTL model: MASHR / mashr_'

In [9]:
assert LV_CODE is not None and len(LV_CODE) > 0, "An LV code must be given"

display(f"LV code: {LV_CODE}")

'LV code: LV100'

In [10]:
if LV_PERCENTILE is not None:
    LV_PERCENTILE = float(LV_PERCENTILE)

display(f"LV percentile: {LV_PERCENTILE}")

'LV percentile: 0.01'

In [11]:
OUTPUT_DIR_BASE = (
    conf.RESULTS["GLS"]
    / "gene_corrs"
    / "cohorts"
    / COHORT_NAME.lower()
    / REFERENCE_PANEL.lower()
    / EQTL_MODEL.lower()
)
OUTPUT_DIR_BASE.mkdir(parents=True, exist_ok=True)

display(f"Using output dir base: {OUTPUT_DIR_BASE}")

'Using output dir base: /project/ritchie20/projects/phenoplier/base/results/gls/gene_corrs/cohorts/phenomexcan_rapid_gwas/gtex_v8/mashr'

# Load data

## Gene correlations

In [12]:
input_files = list(OUTPUT_DIR_BASE.glob("gene_corrs-symbols*.pkl"))
display(input_files)
assert len(input_files) > 0, "No input correlation files"

[PosixPath('/project/ritchie20/projects/phenoplier/base/results/gls/gene_corrs/cohorts/phenomexcan_rapid_gwas/gtex_v8/mashr/gene_corrs-symbols.pkl'),
 PosixPath('/project/ritchie20/projects/phenoplier/base/results/gls/gene_corrs/cohorts/phenomexcan_rapid_gwas/gtex_v8/mashr/gene_corrs-symbols-within_distance_10mb.pkl'),
 PosixPath('/project/ritchie20/projects/phenoplier/base/results/gls/gene_corrs/cohorts/phenomexcan_rapid_gwas/gtex_v8/mashr/gene_corrs-symbols-within_distance_5mb.pkl'),
 PosixPath('/project/ritchie20/projects/phenoplier/base/results/gls/gene_corrs/cohorts/phenomexcan_rapid_gwas/gtex_v8/mashr/gene_corrs-symbols-within_distance_2mb.pkl')]

In [13]:
# load correlation matrix
gene_corrs_dict = {f.name: pd.read_pickle(f) for f in input_files}

In [14]:
orig_corr_name = "gene_corrs-symbols.pkl"

In [15]:
gene_corrs_dict[orig_corr_name].shape

(6428, 6428)

In [16]:
gene_corrs_dict[orig_corr_name].head()

Unnamed: 0,NOC2L,HES4,ISG15,AGRN,TNFRSF18,TNFRSF4,B3GALT6,UBE2J2,ACAP3,TAS1R3,...,PLXNB2,ADM2,MIOX,SCO2,TYMP,CPT1B,CHKB,MAPK8IP2,ARSA,SHANK3
NOC2L,1.0,0.118397,0.103852,0.09198,0.006284,0.007338,0.004868,0.043019,0.012004,0.006924,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HES4,0.118397,1.0,0.849549,0.402466,0.011284,0.008897,0.005317,0.010216,0.002582,0.005373,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ISG15,0.103852,0.849549,1.0,0.398077,0.011218,0.011083,0.006313,0.012211,0.003459,0.009198,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AGRN,0.09198,0.402466,0.398077,1.0,0.005065,0.011509,0.001824,0.011029,0.005019,0.006681,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TNFRSF18,0.006284,0.011284,0.011218,0.005065,1.0,0.337836,0.551124,0.10746,0.193531,0.101072,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
current_index = gene_corrs_dict[orig_corr_name].index
assert all(
    [current_index.equals(gc.index) for k, gc in gene_corrs_dict.items()]
), "Correlation matrices are not compatible"

## MultiPLIER Z

In [18]:
multiplier_z = pd.read_pickle(conf.MULTIPLIER["MODEL_Z_MATRIX_FILE"])

In [19]:
multiplier_z.shape

(6750, 987)

In [20]:
multiplier_z.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
GAS6,0.0,0.0,0.039438,0.0,0.050476,0.0,0.0,0.0,0.590949,0.0,...,0.050125,0.0,0.033407,0.0,0.0,0.005963,0.347362,0.0,0.0,0.0
MMP14,0.0,0.0,0.0,0.0,0.070072,0.0,0.0,0.004904,1.720179,2.423595,...,0.0,0.0,0.001007,0.0,0.035747,0.0,0.0,0.0,0.014978,0.0
DSP,0.0,0.0,0.0,0.0,0.0,0.041697,0.0,0.005718,0.0,0.0,...,0.020853,0.0,0.0,0.0,0.0,0.005774,0.0,0.0,0.0,0.416405
MARCKSL1,0.305212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161843,0.149471,...,0.027134,0.05272,0.0,0.030189,0.060884,0.0,0.0,0.0,0.0,0.44848
SPARC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.067779,0.0,0.122417,0.062665


# Compute inverse correlation matrix for each LV

In [21]:
def exists_df(output_dir, base_filename):
    full_filepath = output_dir / (base_filename + ".npz")

    return full_filepath.exists()

In [22]:
def store_df(output_dir, nparray, base_filename):
    if base_filename in ("metadata", "gene_names"):
        np.savez_compressed(output_dir / (base_filename + ".npz"), data=nparray)
    else:
        sparse.save_npz(
            output_dir / (base_filename + ".npz"),
            sparse.csc_matrix(nparray),
            compressed=False,
        )

In [23]:
def get_output_dir(gene_corr_filename):
    path = OUTPUT_DIR_BASE / gene_corr_filename
    assert path.exists()
    return path.with_suffix(".per_lv")

In [24]:
def compute_chol_inv(lv_codes):
    for gene_corr_filename, gene_corrs in gene_corrs_dict.items():
        output_dir = get_output_dir(gene_corr_filename)
        output_dir.mkdir(parents=True, exist_ok=True)
        display(f"Output dir: {str(output_dir)}")

        # save LV chol inverse
        for lv_code in lv_codes:
            lv_data = multiplier_z[lv_code]

            corr_mat_sub = GLSPhenoplier.get_sub_mat(gene_corrs, lv_data, LV_PERCENTILE)
            store_df(output_dir, corr_mat_sub.to_numpy(), f"{lv_code}_corr_mat")

            chol_mat = np.linalg.cholesky(corr_mat_sub)
            chol_inv = np.linalg.inv(chol_mat)

            store_df(output_dir, chol_inv, lv_code)

        # save metadata
        if not exists_df(output_dir, "metadata"):
            metadata = np.array([REFERENCE_PANEL, EQTL_MODEL])
            store_df(output_dir, metadata, "metadata")
        else:
            display("Metadata file already exists")

        # save gene names
        if not exists_df(output_dir, "gene_names"):
            gene_names = np.array(gene_corrs.index.tolist())
            store_df(output_dir, gene_names, "gene_names")
        else:
            display("Gene names file already exists")

In [25]:
# divide LVs in chunks for parallel processing
# lvs_chunks = list(chunker(list(multiplier_z.columns), 50))
lvs_chunks = [[LV_CODE]]

In [26]:
with ProcessPoolExecutor(max_workers=N_JOBS) as executor, tqdm(
    total=len(lvs_chunks), ncols=100
) as pbar:
    tasks = [executor.submit(compute_chol_inv, chunk) for chunk in lvs_chunks]
    for future in as_completed(tasks):
        res = future.result()
        pbar.update(1)

  0%|                                                                         | 0/1 [00:00<?, ?it/s][2022-08-29 16:07:58,921 - numexpr.utils] INFO: Note: NumExpr detected 28 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.


'Metadata file already exists'

'Gene names file already exists'

'Output dir: /project/ritchie20/projects/phenoplier/base/results/gls/gene_corrs/cohorts/phenomexcan_rapid_gwas/gtex_v8/mashr/gene_corrs-symbols-within_distance_10mb.per_lv'

'Metadata file already exists'

'Gene names file already exists'

'Output dir: /project/ritchie20/projects/phenoplier/base/results/gls/gene_corrs/cohorts/phenomexcan_rapid_gwas/gtex_v8/mashr/gene_corrs-symbols-within_distance_5mb.per_lv'

'Metadata file already exists'

'Gene names file already exists'

'Output dir: /project/ritchie20/projects/phenoplier/base/results/gls/gene_corrs/cohorts/phenomexcan_rapid_gwas/gtex_v8/mashr/gene_corrs-symbols-within_distance_2mb.per_lv'

'Metadata file already exists'

'Gene names file already exists'

100%|█████████████████████████████████████████████████████████████████| 1/1 [01:31<00:00, 91.86s/it]


## Some checks

In [27]:
def load_df(output_dir, base_filename):
    full_filepath = output_dir / (base_filename + ".npz")

    if base_filename in ("metadata", "gene_names"):
        return np.load(full_filepath)["data"]
    else:
        return sparse.load_npz(full_filepath).toarray()

In [28]:
_genes = load_df(get_output_dir(orig_corr_name), "gene_names")

In [29]:
display(len(_genes))
assert len(_genes) == gene_corrs_dict[orig_corr_name].index.shape[0]

6428

In [30]:
_metadata = load_df(get_output_dir(orig_corr_name), "metadata")

In [31]:
display(_metadata)
assert _metadata[0] == REFERENCE_PANEL
assert _metadata[1] == EQTL_MODEL

array(['GTEX_V8', 'MASHR'], dtype='<U7')

In [32]:
all_lvs_inv = {}
lv_prev = None

for gene_corr_filename, _ in gene_corrs_dict.items():
    output_dir = get_output_dir(gene_corr_filename)

    lv_data = load_df(output_dir, LV_CODE)
    display(lv_data)

    if lv_prev is not None:
        assert lv_data.shape == lv_prev.shape
        assert not np.allclose(lv_data, lv_prev)

    lv_prev = lv_data

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])