# Description

(Please, take a look at the README.md file in this directory for instructions on how to run this notebook)

This notebook reads the previously generated gene correlation matrix and creates new matrices using different "within distances" across genes.
For example, it generates a new correlation matrix that keeps only the correlations between genes within a distance of 10 Mb.

# Modules

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pickle

import numpy as np
from scipy.spatial.distance import squareform
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import conf
from entity import Gene
from correlations import (
    check_pos_def,
    compare_matrices,
    correct_corr_mat,
    adjust_non_pos_def,
)

# Settings

In [3]:
# Default settings. The three string settings below must be given a non-empty
# value before the rest of the notebook runs (they are asserted further down).

# a cohort name (it could be something like UK_BIOBANK, etc)
COHORT_NAME = None

# reference panel such as 1000G or GTEX_V8
REFERENCE_PANEL = None

# prediction models such as MASHR or ELASTIC_NET
EQTL_MODEL = None

# a list of "within distances" to generate; one output matrix is written per
# value, and the value is used as the "<N>mb" suffix of the output filename
DISTANCES = [10, 5, 2]

In [4]:
# Parameters
# NOTE(review): this cell looks like it was injected by a notebook
# parameterization tool (e.g. papermill) — confirm. It overrides the default
# (None) values assigned in the settings cell.
COHORT_NAME = "phenomexcan_astle"
REFERENCE_PANEL = "GTEX_V8"
EQTL_MODEL = "MASHR"


In [5]:
# The cohort name is mandatory: stop here if it is missing or empty
cohort_name_given = COHORT_NAME is not None and len(COHORT_NAME) > 0
assert cohort_name_given, "A cohort name must be given"

# the cohort name is normalized to lowercase before it is used in paths
COHORT_NAME = COHORT_NAME.lower()
display(f"Cohort name: {COHORT_NAME}")

'Cohort name: phenomexcan_astle'

In [6]:
# The reference panel is mandatory: stop here if it is missing or empty
reference_panel_given = REFERENCE_PANEL is not None and len(REFERENCE_PANEL) > 0
assert reference_panel_given, "A reference panel must be given"

display(f"Reference panel: {REFERENCE_PANEL}")

'Reference panel: GTEX_V8'

In [7]:
# The prediction/eQTL model is mandatory: stop here if it is missing or empty
assert (
    EQTL_MODEL is not None and len(EQTL_MODEL) > 0
), "A prediction/eQTL model must be given"

display(f"eQTL model: {EQTL_MODEL}")

'eQTL model: MASHR)'

In [8]:
# Directory with the gene correlation results for this specific
# cohort / reference panel / eQTL model combination
OUTPUT_DIR_BASE = (
    conf.RESULTS["GLS"]
    / "gene_corrs"
    / "cohorts"
    / COHORT_NAME
    / REFERENCE_PANEL.lower()
    / EQTL_MODEL.lower()
)
# the input matrix is read from this directory, so it must already exist;
# report the offending path to make a failure actionable
assert (
    OUTPUT_DIR_BASE.exists()
), f"Output directory does not exist: {OUTPUT_DIR_BASE}"

display(f"Using output dir base: {OUTPUT_DIR_BASE}")

'Using output dir base: /project/ritchie20/projects/phenoplier/base/results/gls/gene_corrs/cohorts/phenomexcan_astle/gtex_v8/mashr'

# Load data

## Gene correlations

In [9]:
# load the full gene correlation matrix stored (pickled) in the output
# directory of this cohort / reference panel / eQTL model
gene_corrs = pd.read_pickle(OUTPUT_DIR_BASE / "gene_corrs-symbols.pkl")

In [10]:
# quick look at the dimensions of the loaded matrix
gene_corrs.shape

(6450, 6450)

In [11]:
# preview the first rows of the loaded matrix
gene_corrs.head()

Unnamed: 0,NOC2L,HES4,ISG15,AGRN,TNFRSF18,TNFRSF4,B3GALT6,UBE2J2,ACAP3,TAS1R3,...,PLXNB2,ADM2,MIOX,SCO2,TYMP,CPT1B,CHKB,MAPK8IP2,ARSA,SHANK3
NOC2L,1.0,0.111648,0.187064,0.083062,0.006659,0.006887,0.004603,0.011059,0.011335,0.007667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HES4,0.111648,1.0,0.760347,0.394843,0.011284,0.008897,0.005317,0.006876,0.002582,0.005084,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ISG15,0.187064,0.760347,1.0,0.348268,0.011228,0.010204,0.006492,0.010893,0.003592,0.009454,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AGRN,0.083062,0.394843,0.348268,1.0,0.006882,0.013548,0.001546,0.002285,0.003513,0.00851,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TNFRSF18,0.006659,0.011284,0.011228,0.006882,1.0,0.337836,0.551124,0.154373,0.193531,0.090832,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Per-gene (column-wise) sum of correlations; a gene is counted as "included"
# when its sum exceeds 1.0
genes_corrs_sum = gene_corrs.sum()
n_genes_included = int((genes_corrs_sum > 1.0).sum())
display(f"Number of genes with correlations with other genes: {n_genes_included}")

'Number of genes with correlations with other genes: 6450'

In [13]:
# count the cells holding a strictly positive correlation (diagonal included)
positive_cells = gene_corrs > 0.0
genes_corrs_nonzero_sum = positive_cells.to_numpy().sum()
display(f"Number of nonzero cells: {genes_corrs_nonzero_sum}")

'Number of nonzero cells: 2363924'

## Get gene objects

In [14]:
# one Gene object per row label of the correlation matrix, in the same order
gene_objs = [Gene(name=symbol) for symbol in gene_corrs.index]

In [15]:
# number of Gene objects created (one per row of the correlation matrix)
display(len(gene_objs))

6450

# Subset full correlation matrix using different "within distances" across genes

In [16]:
# For each configured distance: build a boolean gene-by-gene "within distance"
# mask, zero out the correlations outside the mask, make sure the resulting
# matrix is positive definite (adjusting it if needed), show summary stats and
# save the matrix to a pickle file named after the distance.

# number of genes; loop-invariant, so it is computed once
n_genes = len(gene_objs)

for full_distance in DISTANCES:
    # NOTE(review): only half of the configured distance is passed to
    # Gene.within_distance, whereas the output filename below uses the full
    # value — presumably within_distance applies it as a per-side window; confirm.
    distance = full_distance / 2.0
    print(f"Using within distance: {distance}", flush=True)

    # assumes within_distance expects the distance in base pairs — TODO confirm
    distance_bp = distance * 1e6

    # compute a boolean within distance matrix using the given distance;
    # the diagonal starts as True (each gene paired with itself) and only the
    # upper triangle is evaluated, then mirrored to keep the matrix symmetric
    genes_within_distance = np.eye(n_genes, dtype=bool)
    for g0_idx in range(n_genes - 1):
        g0_obj = gene_objs[g0_idx]

        for g1_idx in range(g0_idx + 1, n_genes):
            g1_obj = gene_objs[g1_idx]

            g0_g1_wd = g0_obj.within_distance(g1_obj, distance_bp)

            genes_within_distance[g0_idx, g1_idx] = g0_g1_wd
            genes_within_distance[g1_idx, g0_idx] = g0_g1_wd

    genes_within_distance = pd.DataFrame(
        genes_within_distance,
        index=gene_corrs.index.copy(),
        columns=gene_corrs.columns.copy(),
    )

    # subset full correlation matrix: cells outside the mask become NaN and
    # are then replaced with zero
    gene_corrs_within_distance = gene_corrs[genes_within_distance].fillna(0.0)
    # the subset is expected to differ from the full matrix
    assert not gene_corrs_within_distance.equals(gene_corrs)
    assert not np.allclose(gene_corrs_within_distance.to_numpy(), gene_corrs.to_numpy())
    display(gene_corrs_within_distance)

    # check if the new matrix is positive definite
    is_pos_def = check_pos_def(gene_corrs_within_distance)

    if is_pos_def:
        print("all good.", flush=True, end="\n")
    else:
        print("not positive definite, fixing... ", flush=True, end="")
        corr_data_adjusted = adjust_non_pos_def(gene_corrs_within_distance)

        is_pos_def = check_pos_def(corr_data_adjusted)
        assert is_pos_def, "Could not adjust gene correlation matrix"

        print("fixed! comparing...", flush=True, end="\n")
        compare_matrices(gene_corrs_within_distance, corr_data_adjusted)

        # save
        gene_corrs_within_distance = corr_data_adjusted

    # checks
    # (reduce through NumPy: DataFrame.any only accepts `axis` as a keyword
    # in pandas >= 2.0, so a positional `any(None)` would raise a TypeError)
    assert not gene_corrs_within_distance.isna().to_numpy().any()
    assert not np.isinf(gene_corrs_within_distance.to_numpy()).any()
    assert not np.iscomplex(gene_corrs_within_distance.to_numpy()).any()

    # show stats
    genes_corrs_sum = gene_corrs_within_distance.sum()
    n_genes_included = genes_corrs_sum[genes_corrs_sum > 1.0].shape[0]
    display(f"Number of genes with correlations with other genes: {n_genes_included}")

    genes_corrs_nonzero_sum = (gene_corrs_within_distance > 0.0).astype(int).sum().sum()
    display(f"Number of nonzero cells: {genes_corrs_nonzero_sum}")

    # hide the upper triangle and the diagonal so that each gene pair is
    # described once; the mask is built directly as bool to avoid a full
    # float64 temporary of the same shape
    corr_matrix_flat = gene_corrs_within_distance.mask(
        np.triu(np.ones(gene_corrs_within_distance.shape, dtype=bool))
    ).stack()
    display(corr_matrix_flat.describe().apply(str))

    # save file
    output_filepath = (
        OUTPUT_DIR_BASE
        / f"gene_corrs-symbols-within_distance_{int(full_distance)}mb.pkl"
    )
    display(output_filepath)

    gene_corrs_within_distance.to_pickle(output_filepath)

    print("\n")

Using within distance: 5.0


Unnamed: 0,NOC2L,HES4,ISG15,AGRN,TNFRSF18,TNFRSF4,B3GALT6,UBE2J2,ACAP3,TAS1R3,...,PLXNB2,ADM2,MIOX,SCO2,TYMP,CPT1B,CHKB,MAPK8IP2,ARSA,SHANK3
NOC2L,1.000000,0.111648,0.187064,0.083062,0.006659,0.006887,0.004603,0.011059,0.011335,0.007667,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
HES4,0.111648,1.000000,0.760347,0.394843,0.011284,0.008897,0.005317,0.006876,0.002582,0.005084,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ISG15,0.187064,0.760347,1.000000,0.348268,0.011228,0.010204,0.006492,0.010893,0.003592,0.009454,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
AGRN,0.083062,0.394843,0.348268,1.000000,0.006882,0.013548,0.001546,0.002285,0.003513,0.008510,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
TNFRSF18,0.006659,0.011284,0.011228,0.006882,1.000000,0.337836,0.551124,0.154373,0.193531,0.090832,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CPT1B,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.018582,0.046440,0.013164,0.256685,0.217213,1.000000,0.735570,0.180608,0.035560,0.009659
CHKB,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.030880,0.047012,0.013125,0.208761,0.205824,0.735570,1.000000,0.249788,0.041717,0.020105
MAPK8IP2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.044312,0.093409,0.066327,0.052375,0.125790,0.180608,0.249788,1.000000,0.205484,0.040795
ARSA,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.012015,0.027591,0.021492,0.027293,0.035763,0.035560,0.041717,0.205484,1.000000,0.013757


'Number of negative eigenvalues: 0'

'Negative eigenvalues:\n[]'

Works! (statsmodels.GLS)
Works!
all good.


'Number of genes with correlations with other genes: 6450'

'Number of nonzero cells: 446888'

count               20798025.0
mean     0.0001912420209368383
std       0.006869588775880235
min                        0.0
25%                        0.0
50%                        0.0
75%                        0.0
max         0.9999975042224853
dtype: object

PosixPath('/project/ritchie20/projects/phenoplier/base/results/gls/gene_corrs/cohorts/phenomexcan_astle/gtex_v8/mashr/gene_corrs-symbols-within_distance_10mb.pkl')



Using within distance: 2.5


Unnamed: 0,NOC2L,HES4,ISG15,AGRN,TNFRSF18,TNFRSF4,B3GALT6,UBE2J2,ACAP3,TAS1R3,...,PLXNB2,ADM2,MIOX,SCO2,TYMP,CPT1B,CHKB,MAPK8IP2,ARSA,SHANK3
NOC2L,1.000000,0.111648,0.187064,0.083062,0.006659,0.006887,0.004603,0.011059,0.011335,0.007667,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
HES4,0.111648,1.000000,0.760347,0.394843,0.011284,0.008897,0.005317,0.006876,0.002582,0.005084,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ISG15,0.187064,0.760347,1.000000,0.348268,0.011228,0.010204,0.006492,0.010893,0.003592,0.009454,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
AGRN,0.083062,0.394843,0.348268,1.000000,0.006882,0.013548,0.001546,0.002285,0.003513,0.008510,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
TNFRSF18,0.006659,0.011284,0.011228,0.006882,1.000000,0.337836,0.551124,0.154373,0.193531,0.090832,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CPT1B,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.018582,0.046440,0.013164,0.256685,0.217213,1.000000,0.735570,0.180608,0.035560,0.009659
CHKB,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.030880,0.047012,0.013125,0.208761,0.205824,0.735570,1.000000,0.249788,0.041717,0.020105
MAPK8IP2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.044312,0.093409,0.066327,0.052375,0.125790,0.180608,0.249788,1.000000,0.205484,0.040795
ARSA,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.012015,0.027591,0.021492,0.027293,0.035763,0.035560,0.041717,0.205484,1.000000,0.013757


'Number of negative eigenvalues: 0'

'Negative eigenvalues:\n[]'

Works! (statsmodels.GLS)
Works!
all good.


'Number of genes with correlations with other genes: 6449'

'Number of nonzero cells: 262770'

count                20798025.0
mean     0.00016517902985301234
std        0.006855778293494092
min                         0.0
25%                         0.0
50%                         0.0
75%                         0.0
max          0.9999975042224853
dtype: object

PosixPath('/project/ritchie20/projects/phenoplier/base/results/gls/gene_corrs/cohorts/phenomexcan_astle/gtex_v8/mashr/gene_corrs-symbols-within_distance_5mb.pkl')



Using within distance: 1.0


Unnamed: 0,NOC2L,HES4,ISG15,AGRN,TNFRSF18,TNFRSF4,B3GALT6,UBE2J2,ACAP3,TAS1R3,...,PLXNB2,ADM2,MIOX,SCO2,TYMP,CPT1B,CHKB,MAPK8IP2,ARSA,SHANK3
NOC2L,1.000000,0.111648,0.187064,0.083062,0.006659,0.006887,0.004603,0.011059,0.011335,0.007667,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
HES4,0.111648,1.000000,0.760347,0.394843,0.011284,0.008897,0.005317,0.006876,0.002582,0.005084,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ISG15,0.187064,0.760347,1.000000,0.348268,0.011228,0.010204,0.006492,0.010893,0.003592,0.009454,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
AGRN,0.083062,0.394843,0.348268,1.000000,0.006882,0.013548,0.001546,0.002285,0.003513,0.008510,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
TNFRSF18,0.006659,0.011284,0.011228,0.006882,1.000000,0.337836,0.551124,0.154373,0.193531,0.090832,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CPT1B,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.018582,0.046440,0.013164,0.256685,0.217213,1.000000,0.735570,0.180608,0.035560,0.009659
CHKB,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.030880,0.047012,0.013125,0.208761,0.205824,0.735570,1.000000,0.249788,0.041717,0.020105
MAPK8IP2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.044312,0.093409,0.066327,0.052375,0.125790,0.180608,0.249788,1.000000,0.205484,0.040795
ARSA,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.012015,0.027591,0.021492,0.027293,0.035763,0.035560,0.041717,0.205484,1.000000,0.013757


'Number of negative eigenvalues: 0'

'Negative eigenvalues:\n[]'

Works! (statsmodels.GLS)
Works!
all good.


'Number of genes with correlations with other genes: 6436'

'Number of nonzero cells: 131028'

count                20798025.0
mean     0.00014532124999310308
std        0.006840571937983463
min                         0.0
25%                         0.0
50%                         0.0
75%                         0.0
max          0.9999975042224853
dtype: object

PosixPath('/project/ritchie20/projects/phenoplier/base/results/gls/gene_corrs/cohorts/phenomexcan_astle/gtex_v8/mashr/gene_corrs-symbols-within_distance_2mb.pkl')



