# Description

(Please, take a look at the README.md file in this directory for instructions on how to run this notebook)

This notebook computes predicted expression correlations between all genes in the MultiPLIER models.

It also exposes a papermill parameter that restricts the run to a single chromosome, so that chromosomes can be processed in parallel (see `Settings` below).

This notebook is not directly run. See README.md.

# Modules

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from scipy.spatial.distance import squareform
import pandas as pd
from tqdm import tqdm

import conf
from entity import Gene

# Settings

In [3]:
# Reference panel: selects the `<REFERENCE_PANEL>_GENOTYPE_DIR` entry read from
# conf.py and names the first level of the output folder.
REFERENCE_PANEL = "GTEX_V8"
# REFERENCE_PANEL = "1000G"

# Prediction models: EQTL_MODEL is used to look up model-specific entries in
# conf.py and names the second level of the output folder.
## mashr
EQTL_MODEL = "MASHR"
EQTL_MODEL_FILES_PREFIX = "mashr_"

# ## elastic net
# EQTL_MODEL = "ELASTIC_NET"
# EQTL_MODEL_FILES_PREFIX = "en_"

# Setting the prefix to None overrides the value assigned above and makes the
# notebook read it from conf.py instead (key `<EQTL_MODEL>_PREFIX`).
EQTL_MODEL_FILES_PREFIX = None

# Chromosome to process (can be set with papermill). Give a single chromosome
# value to run only on that chromosome; the default, "all", runs on all of them.
chromosome = "all"

In [4]:
# No explicit prefix was given: use the one configured in conf.py for the
# selected prediction model.
if EQTL_MODEL_FILES_PREFIX is None:
    _prediction_models_conf = conf.PHENOMEXCAN["PREDICTION_MODELS"]
    EQTL_MODEL_FILES_PREFIX = _prediction_models_conf[f"{EQTL_MODEL}_PREFIX"]

In [5]:
display(f"Using eQTL model: {EQTL_MODEL} / {EQTL_MODEL_FILES_PREFIX}")

'Using eQTL model: MASHR / mashr_'

In [6]:
REFERENCE_PANEL_DIR = conf.PHENOMEXCAN["LD_BLOCKS"][f"{REFERENCE_PANEL}_GENOTYPE_DIR"]

In [7]:
display(f"Using reference panel folder: {str(REFERENCE_PANEL_DIR)}")

'Using reference panel folder: /opt/data/data/phenomexcan/ld_blocks/reference_panel_gtex_v8'

In [8]:
# Results are grouped first by reference panel and then by prediction model,
# both in lower case.
_gene_corrs_root = conf.PHENOMEXCAN["LD_BLOCKS"]["GENE_CORRS_DIR"]
OUTPUT_DIR_BASE = _gene_corrs_root / REFERENCE_PANEL.lower() / EQTL_MODEL.lower()
OUTPUT_DIR_BASE.mkdir(parents=True, exist_ok=True)

In [9]:
display(f"Using output dir base: {OUTPUT_DIR_BASE}")

'Using output dir base: /opt/data/data/phenomexcan/ld_blocks/gene_corrs/gtex_v8/mashr'

In [10]:
# When no single chromosome was selected, every chromosome is processed
# serially in this notebook. Print a warning and pause for 20 seconds so the
# user has a chance to interrupt the run before it starts.
if chromosome == "all":
    from time import sleep

    message = """
    WARNING: you are going to compute correlations of gene predicted expression across all chromosomes without parallelism.
    It is recommended that you look at the README.md file in this subfolder (nbs/08_gsa_gls/README.md) to know how to do that.
    
    It will continue in 20 seconds.
    """
    print(message)
    sleep(20)

# Load data

## MultiPLIER Z

In [11]:
# Only the row labels (gene names) of the MultiPLIER Z matrix are needed here;
# the matrix itself is not kept in memory.
multiplier_z_genes = pd.read_pickle(
    conf.MULTIPLIER["MODEL_Z_MATRIX_FILE"]
).index.tolist()

In [12]:
len(multiplier_z_genes)

6750

In [13]:
multiplier_z_genes[:10]

['GAS6',
 'MMP14',
 'DSP',
 'MARCKSL1',
 'SPARC',
 'CTSD',
 'EPAS1',
 'PALLD',
 'PHC2',
 'LGALS3BP']

## Get gene objects

In [14]:
# Map each MultiPLIER gene name to its Gene object, leaving out the names that
# have no entry in Gene.GENE_NAME_TO_ID_MAP.
multiplier_gene_obj = {}
for gene_name in multiplier_z_genes:
    if gene_name not in Gene.GENE_NAME_TO_ID_MAP:
        continue

    multiplier_gene_obj[gene_name] = Gene(name=gene_name)

In [15]:
len(multiplier_gene_obj)

6454

In [16]:
multiplier_gene_obj["GAS6"].ensembl_id

'ENSG00000183087'

In [17]:
_gene_obj = list(multiplier_gene_obj.values())

# One row per gene; each column is filled from the Gene attribute paired with
# it below (column name, attribute name).
genes_info = pd.DataFrame(
    {
        column: [getattr(gene, attribute) for gene in _gene_obj]
        for column, attribute in (
            ("name", "name"),
            ("id", "ensembl_id"),
            ("chr", "chromosome"),
            ("band", "band"),
        )
    }
)

In [18]:
genes_info.shape

(6454, 4)

In [19]:
genes_info.head()

Unnamed: 0,name,id,chr,band
0,GAS6,ENSG00000183087,13,13q34
1,MMP14,ENSG00000157227,14,14q11.2
2,DSP,ENSG00000096696,6,6p24.3
3,MARCKSL1,ENSG00000175130,1,1p35.1
4,SPARC,ENSG00000113140,5,5q33.1


## Get tissue names

In [20]:
tissues = conf.PHENOMEXCAN["PREDICTION_MODELS"][f"{EQTL_MODEL}_TISSUES"].split(" ")

In [21]:
tissues[:5]

['Skin_Not_Sun_Exposed_Suprapubic',
 'Cells_EBV-transformed_lymphocytes',
 'Brain_Frontal_Cortex_BA9',
 'Kidney_Cortex',
 'Brain_Substantia_nigra']

In [22]:
assert len(tissues) == 49

# Test

In [None]:
genes_info[genes_info["chr"] == "13"]

In [None]:
# Small, hand-picked set of genes used to time the pairwise correlation
# computation in the next cell.
_gene_list = [
    Gene(ensembl_id)
    for ensembl_id in (
        "ENSG00000134871",
        "ENSG00000187498",
        "ENSG00000183087",
        "ENSG00000073910",
        "ENSG00000133101",
        "ENSG00000122025",
        "ENSG00000120659",
        "ENSG00000133116",
    )
]

tissue = "Whole_Blood"

In [None]:
%%timeit
# Visit every unordered pair of genes in `_gene_list` once (first gene always
# precedes the second one in the list) and print their correlation.
for first_pos, first_gene in enumerate(_gene_list[:-1]):
    for second_gene in _gene_list[first_pos + 1 :]:
        corr = first_gene.get_ssm_correlation(
            second_gene,
        )

        print(f"{first_gene.name} / {second_gene.name}: {corr}")

# Compute correlation per chromosome

In [23]:
import warnings

# Turn every warning into an exception, so that warnings raised while
# computing a gene pair correlation can be caught and reported by the
# `except Warning` handler of the per-chromosome loop.
warnings.filterwarnings("error")

In [24]:
# Chromosomes that have at least one gene in `genes_info` (missing values are
# ignored); exactly 22 of them are expected.
all_chrs = genes_info["chr"].dropna().unique()
assert len(all_chrs) == 22

# A specific chromosome was requested: validate it and process only that one.
if chromosome != "all":
    chromosome = str(chromosome)
    assert chromosome in all_chrs

    all_chrs = [chromosome]

# # For testing purposes
# all_chrs = ["13"]
# # tissues = ["Whole_Blood"]
# genes_info = genes_info[genes_info["id"].isin(["ENSG00000134871", "ENSG00000187498", "ENSG00000183087", "ENSG00000073910"])]


import traceback

# Every per-chromosome result file is written to the same folder.
output_dir = OUTPUT_DIR_BASE / "by_chr"

# For each chromosome: compute the correlation of every pair of genes located
# in it and save the result as a square, symmetric gene x gene DataFrame.
for chr_num in all_chrs:
    print(f"Chromosome {chr_num}", flush=True)

    # skip chromosomes for which a non-empty results file already exists
    output_file = output_dir / f"gene_corrs-chr{chr_num}.pkl"

    if output_file.exists():
        _tmp_data = pd.read_pickle(output_file)

        if _tmp_data.shape[0] > 0:
            print("Already run, stopping.")
            continue

    genes_chr = genes_info[genes_info["chr"] == chr_num]
    print(f"Genes in chromosome{genes_chr.shape}", flush=True)

    gene_chr_objs = [Gene(ensembl_id=gene_id) for gene_id in genes_chr["id"]]
    gene_chr_ids = [g.ensembl_id for g in gene_chr_objs]

    # number of unordered gene pairs, which is also the length that the
    # condensed correlation vector must have for `squareform` below
    n = len(gene_chr_objs)
    n_comb = n * (n - 1) // 2
    print(f"Number of gene combinations: {n_comb}", flush=True)

    gene_corrs = []

    # the context manager closes the progress bar even if the loop is
    # interrupted or fails
    with tqdm(ncols=100, total=n_comb) as pbar:
        for gene_idx1 in range(n - 1):
            gene_obj1 = gene_chr_objs[gene_idx1]

            # FIXME: get tissues for which we have results for gene_obj1 only from S-PrediXcan

            for gene_idx2 in range(gene_idx1 + 1, n):
                gene_obj2 = gene_chr_objs[gene_idx2]

                pbar.set_description(
                    f"{gene_obj1.ensembl_id} / {gene_obj2.ensembl_id}"
                )

                # FIXME: get tissues for which we have results for gene_obj2 only from S-PrediXcan

                # Exactly one value is stored per gene pair, whatever happens:
                # a pair that fails (warning or exception) is recorded as NaN.
                # Skipping the pair instead would shift all following entries
                # and leave the vector with the wrong length.
                try:
                    pair_corr = gene_obj1.get_ssm_correlation(
                        other_gene=gene_obj2,
                        # tissues=tissues, FIXME
                        reference_panel=REFERENCE_PANEL,
                        model_type=EQTL_MODEL,
                    )
                except Warning:
                    # must come before `except Exception`: Warning is a
                    # subclass of Exception
                    print(
                        f"RuntimeWarning for genes {gene_obj1.ensembl_id} and {gene_obj2.ensembl_id}",
                        flush=True,
                    )
                    print(traceback.format_exc(), flush=True)
                    pair_corr = np.nan
                except Exception:
                    print(
                        f"Exception for genes {gene_obj1.ensembl_id} and {gene_obj2.ensembl_id}",
                        flush=True,
                    )
                    print(traceback.format_exc(), flush=True)
                    pair_corr = np.nan

                gene_corrs.append(pair_corr)
                pbar.update(1)

    # every gene pair must have contributed exactly one entry
    assert len(gene_corrs) == n_comb

    # testing
    gene_corrs_flat = pd.Series(gene_corrs)
    print(f"Min/max values: {gene_corrs_flat.min()} / {gene_corrs_flat.max()}")
    assert gene_corrs_flat.min() >= -1.001
    assert gene_corrs_flat.max() <= 1.001

    # save
    # FIXME: consider saving only the condensed matrix here. See here for
    # more details: https://github.com/greenelab/phenoplier/pull/38#discussion_r634600813
    gene_corrs_data = squareform(np.array(gene_corrs, dtype=np.float64))
    np.fill_diagonal(gene_corrs_data, 1.0)

    gene_corrs_df = pd.DataFrame(
        data=gene_corrs_data,
        index=gene_chr_ids,
        columns=gene_chr_ids,
    )

    # FIXME: all values should be between 1.0 and -1.0 (change then if not)

    output_dir.mkdir(exist_ok=True, parents=True)
    display(output_file)

    gene_corrs_df.to_pickle(output_file)

Chromosome 22
Genes in chromosome(170, 4)
Number of gene combinations: 14365


ENSG00000184983 / ENSG00000100345:  80%|██████████████▎   | 11427/14365 [6:06:14<2:17:26,  2.81s/it]Exception ignored in: <function Socket.__del__ at 0x7f9fb9d9d5e0>
Traceback (most recent call last):
  File "/opt/conda/envs/phenoplier/lib/python3.8/site-packages/zmq/sugar/socket.py", line 97, in __del__


KeyboardInterrupt: 

In [None]:
gene_corrs_df.shape

In [None]:
gene_corrs_df

# Testing

In [None]:
# data = pd.read_pickle(
#     conf.PHENOMEXCAN["LD_BLOCKS"]["BASE_DIR"] / "gene_corrs" / "Whole_Blood" / "gene_corrs-Whole_Blood-chr13.pkl"
# )

In [None]:
# assert data.loc["ENSG00000134871", "ENSG00000187498"] > 0.97