# Description

**TODO UPDATE**

(Please, take a look at the README.md file in this directory for instructions on how to run this notebook)

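This notebook loads the gene correlation matrix previously assembled across all chromosomes (with all genes) and, for each LV in the MultiPLIER Z matrix, builds an LV-specific gene correlation matrix, computes the inverse of its Cholesky factor, and stores it in a separate file.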

# Modules

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
from scipy.spatial.distance import squareform
from scipy import sparse
import pandas as pd
from tqdm import tqdm

import conf
from utils import chunker
from entity import Gene

# Settings

In [3]:
# Name of the cohort (it could be something like UK_BIOBANK, etc).
COHORT_NAME = None

# Reference panel, such as 1000G or GTEX_V8.
REFERENCE_PANEL = None

# Prediction models, such as MASHR or ELASTIC_NET.
EQTL_MODEL = None

# Path to one S-MultiXcan results file on the same target cohort.
# Genes are read from this file to align the correlation matrices.
SMULTIXCAN_RESULTS_TEMPLATE = (
    conf.RESULTS["GLS_NULL_SIMS"]
    / "twas"
    / "smultixcan"
    / "random.pheno0-gtex_v8-mashr-smultixcan.txt"
)

In [4]:
# FIXME: remove later
# Parameters
COHORT_NAME = "1000G_EUR"
REFERENCE_PANEL = "1000G"
EQTL_MODEL = "MASHR"

In [5]:
# a cohort name is mandatory: fail early if it is missing or empty
assert COHORT_NAME is not None and len(COHORT_NAME) > 0, "A cohort name must be given"

# keep the cohort name in lowercase from here on
COHORT_NAME = COHORT_NAME.lower()
display(f"Cohort name: {COHORT_NAME}")

'Cohort name: 1000g_eur'

In [6]:
# a reference panel is mandatory: fail early if it is missing or empty
assert (
    REFERENCE_PANEL is not None and len(REFERENCE_PANEL) > 0
), "A reference panel must be given"

display(f"Reference panel: {REFERENCE_PANEL}")

'Reference panel: 1000G'

In [7]:
# a prediction/eQTL model is mandatory: fail early if it is missing or empty
assert (
    EQTL_MODEL is not None and len(EQTL_MODEL) > 0
), "A prediction/eQTL model must be given"

# prefix configured for the selected model, looked up under the
# "<EQTL_MODEL>_PREFIX" key of the prediction models settings
EQTL_MODEL_FILES_PREFIX = conf.PHENOMEXCAN["PREDICTION_MODELS"][f"{EQTL_MODEL}_PREFIX"]
display(f"eQTL model: {EQTL_MODEL} / {EQTL_MODEL_FILES_PREFIX}")

'eQTL model: MASHR) / mashr_'

In [8]:
# the S-MultiXcan results file must be set and must exist on disk
assert (SMULTIXCAN_RESULTS_TEMPLATE is not None) and (
    SMULTIXCAN_RESULTS_TEMPLATE.exists()
), "You have to provide the path to a S-MultiXcan results file"

In [9]:
# Base output directory, laid out as:
#   <GLS results>/gene_corrs/cohorts/<cohort>/<reference panel>/<eQTL model>/all_genes
# where the cohort, reference panel and eQTL model names are lowercased.
OUTPUT_DIR_BASE = conf.RESULTS["GLS"].joinpath(
    "gene_corrs",
    "cohorts",
    COHORT_NAME.lower(),
    REFERENCE_PANEL.lower(),
    EQTL_MODEL.lower(),
    "all_genes",  # FIXME: remove this later
)
OUTPUT_DIR_BASE.mkdir(exist_ok=True, parents=True)

display(f"Using output dir base: {OUTPUT_DIR_BASE}")

'Using output dir base: /opt/data/results/gls/gene_corrs/cohorts/1000g_eur/1000g/mashr/all_genes'

# Load data

## S-MultiXcan genes

In [10]:
smultixcan_df = pd.read_csv(SMULTIXCAN_RESULTS_TEMPLATE, sep="\t")

In [11]:
smultixcan_df.shape

(22317, 18)

In [12]:
smultixcan_df.head()

Unnamed: 0,gene,gene_name,pvalue,n,n_indep,p_i_best,t_i_best,p_i_worst,t_i_worst,eigen_max,eigen_min,eigen_min_kept,z_min,z_max,z_mean,z_sd,tmi,status
0,ENSG00000131941.7,RHPN2,4e-05,48.0,3.0,0.000213947,Artery_Tibial,0.990132,Brain_Nucleus_accumbens_basal_ganglia,36.556432,7.692089e-16,2.519701,-2.721185,3.701952,1.283152,1.825567,3.0,0
1,ENSG00000076650.6,GPATCH1,7.8e-05,40.0,3.0,0.000453439,Brain_Cerebellum,0.817384,Brain_Frontal_Cortex_BA9,29.990208,2.086487e-15,1.815203,-3.506853,2.383485,-2.016745,1.715495,3.0,0
2,ENSG00000100906.10,NFKBIA,9.6e-05,1.0,1.0,9.591208e-05,Brain_Frontal_Cortex_BA9,9.6e-05,Brain_Frontal_Cortex_BA9,1.0,1.0,1.0,-3.900707,-3.900707,-3.900707,,1.0,0
3,ENSG00000136319.11,TTC5,0.000109,47.0,5.0,0.001402826,Brain_Hippocampus,0.961887,Colon_Sigmoid,21.272442,8.142339e-16,0.732606,-3.194069,1.397514,-0.916662,1.068989,5.0,0
4,ENSG00000152990.13,ADGRA3,0.000135,41.0,12.0,3.211289e-07,Heart_Atrial_Appendage,0.653657,Whole_Blood,12.988248,3.499412e-16,0.444682,-5.110605,3.59941,-0.464735,2.316607,12.0,0


In [13]:
# the S-MultiXcan results must not contain infinite values
assert not smultixcan_df.isin([np.inf, -np.inf]).any().any()

In [14]:
# drop rows whose p-value is missing (NaN)
smultixcan_df = smultixcan_df.dropna(subset=["pvalue"])
display(smultixcan_df.shape)

(22314, 18)

In [15]:
smultixcan_genes = set(smultixcan_df["gene_name"].tolist())

In [16]:
len(smultixcan_genes)

22308

In [17]:
sorted(list(smultixcan_genes))[:5]

['A1BG', 'A1CF', 'A2M', 'A2ML1', 'A3GALT2']

## Gene correlations

In [18]:
# gene correlation matrix to load; it must already exist in the base output dir
input_file = OUTPUT_DIR_BASE / "gene_corrs-symbols.pkl"
display(input_file)
assert input_file.exists()

PosixPath('/opt/data/results/gls/gene_corrs/cohorts/1000g_eur/1000g/mashr/all_genes/gene_corrs-symbols.pkl')

In [19]:
# load correlation matrix
gene_corrs = pd.read_pickle(input_file)

In [20]:
gene_corrs.shape

(6442, 6442)

In [21]:
gene_corrs.head()

Unnamed: 0,NOC2L,HES4,ISG15,AGRN,TNFRSF18,TNFRSF4,B3GALT6,UBE2J2,ACAP3,TAS1R3,...,PLXNB2,ADM2,MIOX,SCO2,TYMP,CPT1B,CHKB,MAPK8IP2,ARSA,SHANK3
NOC2L,1.0,0.115011,0.173138,0.056096,0.008032,0.008727,0.006797,0.004533,0.00735,0.010391,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HES4,0.115011,1.0,0.681368,0.360588,0.011545,0.010729,0.003577,0.01023,0.010747,0.008769,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ISG15,0.173138,0.681368,1.0,0.381394,0.011774,0.012527,0.003754,0.012096,0.012679,0.010442,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AGRN,0.056096,0.360588,0.381394,1.0,0.013005,0.015775,0.006184,0.006813,0.010775,0.009189,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TNFRSF18,0.008032,0.011545,0.011774,0.013005,1.0,0.356676,0.45401,0.137643,0.20034,0.09321,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Define output dir (based on gene correlation's file)

In [22]:
# output directory: same path as the gene correlations file, but with its
# suffix replaced by ".per_lv"; the .npz files generated below (one per LV,
# plus metadata and gene names) are written here
output_dir = Path(input_file).with_suffix(".per_lv")
output_dir.mkdir(parents=True, exist_ok=True)

display(output_dir)

PosixPath('/opt/data/results/gls/gene_corrs/cohorts/1000g_eur/1000g/mashr/all_genes/gene_corrs-symbols.per_lv')

## MultiPLIER Z

In [23]:
multiplier_z = pd.read_pickle(conf.MULTIPLIER["MODEL_Z_MATRIX_FILE"])

In [24]:
multiplier_z.shape

(6750, 987)

In [25]:
multiplier_z.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
GAS6,0.0,0.0,0.039438,0.0,0.050476,0.0,0.0,0.0,0.590949,0.0,...,0.050125,0.0,0.033407,0.0,0.0,0.005963,0.347362,0.0,0.0,0.0
MMP14,0.0,0.0,0.0,0.0,0.070072,0.0,0.0,0.004904,1.720179,2.423595,...,0.0,0.0,0.001007,0.0,0.035747,0.0,0.0,0.0,0.014978,0.0
DSP,0.0,0.0,0.0,0.0,0.0,0.041697,0.0,0.005718,0.0,0.0,...,0.020853,0.0,0.0,0.0,0.0,0.005774,0.0,0.0,0.0,0.416405
MARCKSL1,0.305212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161843,0.149471,...,0.027134,0.05272,0.0,0.030189,0.060884,0.0,0.0,0.0,0.0,0.44848
SPARC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.067779,0.0,0.122417,0.062665


## Common genes

In [26]:
# sorted list of the genes present in all three sources: the S-MultiXcan
# results, the MultiPLIER Z matrix and the gene correlation matrix
common_genes = sorted(
    smultixcan_genes.intersection(multiplier_z.index, gene_corrs.index)
)

In [27]:
len(common_genes)

6442

In [28]:
common_genes[:5]

['A2M', 'AAAS', 'AANAT', 'AARS', 'AARS2']

# Compute the inverse of the Cholesky factor of the correlation matrix for each LV

In [29]:
def store_df(nparray, base_filename):
    """
    Save an array in the output directory as "<base_filename>.npz".

    The special entries "metadata" and "gene_names" are written as compressed
    NumPy archives, with the array stored under the "data" key. Any other
    entry is converted to a SciPy CSC sparse matrix and written uncompressed.

    Args:
        nparray: array to save.
        base_filename: file name (without the ".npz" extension) to use inside
            `output_dir`.
    """
    target_path = output_dir / (base_filename + ".npz")

    if base_filename not in ("metadata", "gene_names"):
        sparse.save_npz(target_path, sparse.csc_matrix(nparray), compressed=False)
        return

    np.savez_compressed(target_path, data=nparray)

In [30]:
def compute_chol_inv(lv_codes):
    """
    Compute and store the inverse of the Cholesky factor of the gene
    correlation matrix of each given LV.

    For every LV, the correlation matrix covers all genes in `common_genes`.
    It starts as the identity matrix, and the entries among the genes with a
    value greater than zero for that LV in `multiplier_z` are replaced by
    their values in `gene_corrs`. The Cholesky factor of this matrix is
    inverted and saved with `store_df`, using the LV code as file name.

    Args:
        lv_codes: iterable with column names of `multiplier_z`.
    """
    n_genes = len(common_genes)

    for lv_code in lv_codes:
        lv_weights = multiplier_z[lv_code]

        # identity matrix: by default, genes are uncorrelated with each other
        lv_corrs = pd.DataFrame(
            np.identity(n_genes),
            index=list(common_genes),
            columns=list(common_genes),
        )

        # genes with a positive value in this LV that are also in the matrix
        lv_genes = lv_weights[lv_weights > 0].index.intersection(lv_corrs.index)

        # only these genes keep their actual correlations
        lv_corrs.loc[lv_genes, lv_genes] = gene_corrs.loc[lv_genes, lv_genes]

        lv_chol = np.linalg.cholesky(lv_corrs)
        store_df(np.linalg.inv(lv_chol), lv_code)

In [31]:
# divide LVs in chunks for parallel processing: each chunk is submitted below
# as a single task (presumably up to 50 LVs per chunk; see utils.chunker)
lvs_chunks = list(chunker(list(multiplier_z.columns), 50))

In [32]:
# metadata: reference panel and eQTL model, stored next to the per-LV files
metadata = np.array([REFERENCE_PANEL, EQTL_MODEL])
store_df(metadata, "metadata")

# gene names, in the same order used for the rows/columns of each LV matrix
gene_names = np.array(common_genes)
store_df(gene_names, "gene_names")

# process the chunks of LVs in parallel; the progress bar advances once per
# finished chunk
with ProcessPoolExecutor(max_workers=conf.GENERAL["N_JOBS"]) as executor, tqdm(
    total=len(lvs_chunks), ncols=100
) as pbar:
    tasks = [executor.submit(compute_chol_inv, chunk) for chunk in lvs_chunks]
    for future in as_completed(tasks):
        # compute_chol_inv returns nothing: result() is called only so that an
        # exception raised in a worker process is re-raised here
        future.result()
        pbar.update(1)

100%|████████████████████████████████████████████████████████████| 20/20 [1:40:49<00:00, 302.49s/it]


## Some checks

In [33]:
def load_df(base_filename):
    """
    Load an array from "<base_filename>.npz" in the output directory.

    The special entries "metadata" and "gene_names" are read from the "data"
    key of a NumPy archive. Any other entry is read as a SciPy sparse matrix
    and converted to a dense array.

    Args:
        base_filename: file name (without the ".npz" extension) inside
            `output_dir`.

    Returns:
        A NumPy array with the stored data.
    """
    full_filepath = output_dir / (base_filename + ".npz")

    if base_filename in ("metadata", "gene_names"):
        # np.load returns an NpzFile that keeps the file open; the context
        # manager closes it as soon as the array has been read into memory
        with np.load(full_filepath) as npz_file:
            return npz_file["data"]

    return sparse.load_npz(full_filepath).toarray()

In [34]:
_genes = load_df("gene_names")

In [35]:
display(len(_genes))
assert len(_genes) == len(common_genes)

6442

In [36]:
_metadata = load_df("metadata")

In [37]:
display(_metadata)
assert _metadata[0] == REFERENCE_PANEL
assert _metadata[1] == EQTL_MODEL

array(['1000G', 'MASHR'], dtype='<U5')

In [38]:
lv1_inv = load_df("LV1")

In [39]:
lv2_inv = load_df("LV2")

In [40]:
lv_last_inv = load_df("LV987")

In [41]:
assert lv1_inv.shape == lv2_inv.shape

In [42]:
assert not np.allclose(lv1_inv, lv2_inv)

In [43]:
assert not np.allclose(lv1_inv, lv_last_inv)

In [44]:
assert not np.allclose(lv2_inv, lv_last_inv)