# Description

This notebook computes predicted expression correlations between all genes in the MultiPLIER models.

# Modules

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell runs, so edits to project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import conf
from entity import Gene

# Load data

## SNP covariance

In [None]:
# Location of the HDF5 file with the SNP covariance data, which lives in the
# LD blocks base directory configured for PhenomeXcan.
_ld_blocks_dir = conf.PHENOMEXCAN["LD_BLOCKS"]["BASE_DIR"]
snps_covar_file = _ld_blocks_dir / "mashr_snps_chr_blocks_cov.h5"
display(snps_covar_file)

In [None]:
# Read the "metadata" table from the store (opened read-only), keep a single
# row per variant ID and index the result by "varID".
with pd.HDFStore(snps_covar_file, mode="r") as store:
    snps_covar_metadata = (
        store["metadata"]
        .drop_duplicates(subset=["varID"])
        .set_index("varID")
    )

# Sanity check: after dropping duplicates every variant ID appears once.
assert snps_covar_metadata.index.is_unique

In [None]:
snps_covar_metadata.shape

In [None]:
snps_covar_metadata.head()

## MultiPLIER Z

In [None]:
# The row index of the pickled MultiPLIER Z matrix is used as the gene list.
_z_matrix_file = conf.MULTIPLIER["MODEL_Z_MATRIX_FILE"]
multiplier_z_genes = pd.read_pickle(_z_matrix_file).index.tolist()

In [None]:
len(multiplier_z_genes)

In [None]:
multiplier_z_genes[:10]

## Get gene objects

In [None]:
# Map gene name -> Gene object for every MultiPLIER gene whose name is
# present in Gene.GENE_NAME_TO_ID_MAP; genes without a mapping are skipped.
multiplier_gene_obj = {}
for gene_name in multiplier_z_genes:
    if gene_name not in Gene.GENE_NAME_TO_ID_MAP:
        continue
    multiplier_gene_obj[gene_name] = Gene(name=gene_name)

In [None]:
len(multiplier_gene_obj)

In [None]:
# Spot check: show the Ensembl ID resolved for one known gene (GAS6).
multiplier_gene_obj["GAS6"].ensembl_id

In [None]:
# Tabulate name, Ensembl ID and chromosome of every Gene object.
_gene_obj = list(multiplier_gene_obj.values())

# Output column -> Gene attribute that fills it (insertion order = column order).
_column_to_attr = {"name": "name", "id": "ensembl_id", "chr": "chromosome"}

genes_info = pd.DataFrame({
    column: [getattr(g, attr) for g in _gene_obj]
    for column, attr in _column_to_attr.items()
})

In [None]:
genes_info.shape

In [None]:
genes_info.head()

## Get tissue names

In [None]:
# Collect the prediction model databases (*.db files) from the MASHR models
# directory. glob() yields entries in whatever order the filesystem lists
# them, so the result is sorted to make positional access (e.g. the first
# file/tissue) reproducible across machines and runs.
db_files = sorted(conf.PHENOMEXCAN["PREDICTION_MODELS"]["MASHR"].glob("*.db"))

In [None]:
# Sanity check: exactly 49 model files are expected
# (presumably one per tissue -- TODO confirm the source of this number).
assert len(db_files) == 49

In [None]:
# Tissue name = the part of the file name between "mashr_" and ".db".
# Only the file name is parsed (not the full path), so a parent directory
# whose name happens to contain "mashr_" or ".db" cannot corrupt the result.
# NOTE(review): assumes the entries of db_files are pathlib.Path objects, as
# returned by Path.glob -- confirm.
tissues = [f.name.split("mashr_")[1].split(".db")[0] for f in db_files]

In [None]:
tissues[:5]

# Test: compute correlations for one chromosome

In [None]:
# Keep only the genes located on chromosome "1"
# (the chromosome label is compared as a string).
_is_chr1 = genes_info["chr"] == "1"
genes_chr = genes_info.loc[_is_chr1]

In [None]:
genes_chr.shape

In [None]:
genes_chr.head()

In [None]:
# One Gene object per Ensembl ID of the selected chromosome, in table order.
gene_chr_objs = [
    Gene(ensembl_id=ensembl_id)
    for ensembl_id in genes_chr["id"]
]

In [None]:
len(gene_chr_objs)

In [None]:
# Names of the first two genes of the selected chromosome.
gene_chr_objs[0].name, gene_chr_objs[1].name

In [None]:
# First tissue in the list; it is the one used by the test cells below.
tissues[0]

In [None]:
# Spot check: predicted expression variance of one gene in the first tissue.
# NOTE(review): the ID is passed positionally here, unlike the keyword calls
# (name=..., ensembl_id=...) used elsewhere in this notebook; presumably the
# first parameter of Gene is the Ensembl ID -- confirm.
Gene("ENSG00000134686").get_pred_expression_variance(tissues[0])

In [None]:
# Spot check: predicted expression variance of a second gene in the first tissue.
# NOTE(review): the ID is passed positionally; presumably the first parameter
# of Gene is the Ensembl ID -- confirm.
Gene("ENSG00000163221").get_pred_expression_variance(tissues[0])

In [None]:
# Compute the expression correlation, in the first tissue, for every
# unordered pair of genes of the selected chromosome. Results are appended to
# gene_corrs in pair order (gene 0 vs 1, 0 vs 2, ..., 1 vs 2, ...); pairs that
# fail with TypeError are reported and skipped, so gene_corrs may end up with
# fewer than n_comb entries.
from itertools import combinations

gene_corrs = []

n = len(gene_chr_objs)
# Number of unordered gene pairs, computed with integer arithmetic.
n_comb = n * (n - 1) // 2
display(n_comb)

# The context manager closes the progress bar even if an unexpected
# exception interrupts the loop.
with tqdm(ncols=100, total=n_comb) as pbar:
    for gene_obj1, gene_obj2 in combinations(gene_chr_objs, 2):
        try:
            gene_corrs.append(
                gene_obj1.get_expression_correlation(gene_obj2, tissues[0])
            )
        except TypeError:
            # Report the pair that could not be processed and keep going.
            # NOTE(review): TypeError presumably means one of the genes has
            # no usable prediction data in this tissue -- confirm.
            print((gene_obj1.ensembl_id, gene_obj2.ensembl_id))

        # Advance for every processed pair (failed ones included) so the bar
        # reaches its total of n_comb.
        pbar.update(1)