# Description

(Please take a look at the README.md file in this directory for instructions on how to run this notebook.)

This notebook computes the covariance of SNPs for each chromosome.

# Modules

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import gc
import sqlite3
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

import conf
from entity import Gene

# Settings

In [None]:
# Notebook settings.
# NOTE(review): this looks like a parameters cell meant to be overridden by an
# external runner (see README.md) -- TODO confirm.

# reference panel: used to build the "<REFERENCE_PANEL>_GENOTYPE_DIR" key that
# is looked up in conf.PHENOMEXCAN["LD_BLOCKS"]
REFERENCE_PANEL = "GTEX_V8"
# REFERENCE_PANEL = "1000G"

# prediction models: EQTL_MODEL is used as a key (and as the "<EQTL_MODEL>_PREFIX"
# key) in conf.PHENOMEXCAN["PREDICTION_MODELS"]
## mashr
EQTL_MODEL = "MASHR"
# NOTE: this value is overwritten with None a few lines below
EQTL_MODEL_FILES_PREFIX = "mashr_"

# ## elastic net
# EQTL_MODEL = "ELASTIC_NET"
# EQTL_MODEL_FILES_PREFIX = "en_"

# make it read the prefix from conf.py: when this is None, a later cell
# replaces it with conf.PHENOMEXCAN["PREDICTION_MODELS"]["<EQTL_MODEL>_PREFIX"]
EQTL_MODEL_FILES_PREFIX = None

# the numpy dtype used for the covariance matrix
#  either "float64" or "float32" (for huge matrices), given as a string;
#  None means that float64 is used
COVARIANCE_MATRIX_DTYPE = None

In [None]:
# An explicitly given prefix is kept as is; otherwise fall back to the prefix
# registered in conf.py for the selected eQTL model.
EQTL_MODEL_FILES_PREFIX = (
    conf.PHENOMEXCAN["PREDICTION_MODELS"][f"{EQTL_MODEL}_PREFIX"]
    if EQTL_MODEL_FILES_PREFIX is None
    else EQTL_MODEL_FILES_PREFIX
)

In [None]:
display(f"Using eQTL model: {EQTL_MODEL} / {EQTL_MODEL_FILES_PREFIX}")

In [None]:
REFERENCE_PANEL_DIR = conf.PHENOMEXCAN["LD_BLOCKS"][f"{REFERENCE_PANEL}_GENOTYPE_DIR"]

In [None]:
display(f"Using reference panel folder: {str(REFERENCE_PANEL_DIR)}")

In [None]:
# Outputs are grouped by reference panel and by eQTL model (both lowercased)
# under the GLS results folder; the folder is created if it does not exist.
OUTPUT_DIR_BASE = conf.RESULTS["GLS"].joinpath(
    "gene_corrs",
    "reference_panels",
    REFERENCE_PANEL.lower(),
    EQTL_MODEL.lower(),
)
OUTPUT_DIR_BASE.mkdir(parents=True, exist_ok=True)

In [None]:
display(f"Using output dir base: {OUTPUT_DIR_BASE}")

In [None]:
# Maps the COVARIANCE_MATRIX_DTYPE setting (a string) to the numpy dtype used
# when computing the SNP covariance matrices.
cov_dtype_dict = {
    "float32": np.float32,
    "float64": np.float64,
}

if COVARIANCE_MATRIX_DTYPE is None:
    # no explicit choice was made: default to double precision
    COV_DTYPE = np.float64
elif COVARIANCE_MATRIX_DTYPE in cov_dtype_dict:
    COV_DTYPE = cov_dtype_dict[COVARIANCE_MATRIX_DTYPE]
else:
    # fail loudly instead of silently computing with float64 when the setting
    # has a typo or is not one of the supported strings
    raise ValueError(
        f"Invalid COVARIANCE_MATRIX_DTYPE: {COVARIANCE_MATRIX_DTYPE!r}. "
        f"Valid values are None or one of: {sorted(cov_dtype_dict)}"
    )

display(f"Covariance matrix dtype used: {str(COV_DTYPE)}")

# Load data

## Functions

In [None]:
def get_reference_panel_file(directory: Path, file_pattern: str) -> Path:
    """
    Returns the single parquet file in a directory whose name contains a
    given pattern.

    Args:
        directory: folder where the file is searched (not recursively).
        file_pattern: text that must be part of the file name; the files are
            searched with the glob expression "*{file_pattern}*.parquet".

    Returns:
        The path of the only file that matches the pattern.

    Raises:
        AssertionError: if no file matches the pattern, or if more than one
            file does.
    """
    files = list(directory.glob(f"*{file_pattern}*.parquet"))

    # AssertionError is raised explicitly (instead of using "assert") so the
    # checks are not removed when Python runs with optimizations (-O)
    if len(files) == 0:
        raise AssertionError(
            f"No file was found in '{directory}' "
            f"with pattern '*{file_pattern}*.parquet'"
        )

    if len(files) > 1:
        raise AssertionError(f"More than one file was found: {files}")

    return files[0]

In [None]:
# testing
# Checks get_reference_panel_file against the file names expected in the
# genotype folders of both reference panels.

# GTEx v8
_tmp = get_reference_panel_file(
    conf.PHENOMEXCAN["LD_BLOCKS"]["GTEX_V8_GENOTYPE_DIR"], "chr1.variants"
)
assert _tmp is not None
assert (
    _tmp.name
    == "gtex_v8_eur_filtered_maf0.01_monoallelic_variants.chr1.variants.parquet"
)

_tmp = get_reference_panel_file(
    conf.PHENOMEXCAN["LD_BLOCKS"]["GTEX_V8_GENOTYPE_DIR"], "_metadata"
)
assert _tmp is not None
assert (
    _tmp.name
    == "gtex_v8_eur_filtered_maf0.01_monoallelic_variants.variants_metadata.parquet"
)

# 1000G
_tmp = get_reference_panel_file(
    conf.PHENOMEXCAN["LD_BLOCKS"]["1000G_GENOTYPE_DIR"], "chr1.variants"
)
assert _tmp is not None
assert _tmp.name == "chr1.variants.parquet"

_tmp = get_reference_panel_file(
    conf.PHENOMEXCAN["LD_BLOCKS"]["1000G_GENOTYPE_DIR"], "_metadata"
)
assert _tmp is not None
assert _tmp.name == "variant_metadata.parquet"

# pattern matches more than one file: an AssertionError is expected
try:
    get_reference_panel_file(
        conf.PHENOMEXCAN["LD_BLOCKS"]["1000G_GENOTYPE_DIR"], "chr1"
    )
except AssertionError as e:
    assert "More than one file was found" in str(e)
else:
    # raised in the "else" clause so it is not caught by the handler above,
    # which would replace this message with an uninformative assertion failure
    raise AssertionError("Exception was not raised")

## SNPs in prediction models

In [None]:
# NOTE: despite the variable name, these are the model files (*.db) of
# whichever EQTL_MODEL is selected in the settings.
# Path.glob yields files in an arbitrary order, so they are sorted to process
# the models (and build the tables derived from them) in a reproducible order.
mashr_models_db_files = sorted(
    conf.PHENOMEXCAN["PREDICTION_MODELS"][EQTL_MODEL].glob("*.db")
)

In [None]:
assert len(mashr_models_db_files) == 49

In [None]:
# Collects, for each prediction model file (one per tissue), the gene/variant
# pairs stored in its "weights" table.
all_variants_ids = []

for m in mashr_models_db_files:
    print(f"Processing {m.name}")
    # the tissue name is the part of the file name between the model prefix
    # and the ".db" extension
    tissue = m.name.split(EQTL_MODEL_FILES_PREFIX)[1].split(".db")[0]

    # sqlite3's connection context manager only commits/rolls back the
    # transaction, it does not close the connection; close it explicitly so
    # no database handle is left open for each model file
    conn = sqlite3.connect(m)
    try:
        df = pd.read_sql("select gene, varID from weights", conn)
    finally:
        conn.close()

    # keep only the text before the first "." in the gene ID
    df["gene"] = df["gene"].apply(lambda x: x.split(".")[0])
    df = df.assign(tissue=tissue)

    all_variants_ids.append(df)

In [None]:
all_gene_snps = pd.concat(all_variants_ids, ignore_index=True)

In [None]:
all_gene_snps.shape

In [None]:
all_gene_snps.head()

In [None]:
all_snps_in_models = set(all_gene_snps["varID"].unique())

## MultiPLIER Z

In [None]:
# MultiPLIER's Z matrix; its index is used later in this notebook as the list
# of gene names.
# NOTE(review): pd.read_pickle unpickles the file, which can execute arbitrary
# code -- make sure this file comes from a trusted source.
multiplier_z = pd.read_pickle(conf.MULTIPLIER["MODEL_Z_MATRIX_FILE"])

In [None]:
multiplier_z.shape

In [None]:
multiplier_z.head()

## Reference panel variants metadata

In [None]:
input_file = get_reference_panel_file(REFERENCE_PANEL_DIR, "_metadata")
display(input_file)

In [None]:
variants_metadata = pd.read_parquet(input_file, columns=["id"])

In [None]:
variants_metadata.shape

In [None]:
variants_metadata.head()

In [None]:
variants_ids_with_genotype = set(variants_metadata["id"])

In [None]:
len(variants_ids_with_genotype)

In [None]:
list(variants_ids_with_genotype)[:10]

In [None]:
del variants_metadata

# How many variants in prediction models are present in the reference panel?

In [None]:
n_snps_in_models = len(all_snps_in_models)
display(n_snps_in_models)

In [None]:
n_snps_in_ref_panel = len(all_snps_in_models.intersection(variants_ids_with_genotype))
display(n_snps_in_ref_panel)

In [None]:
n_snps_in_ref_panel / n_snps_in_models

# Get final list of genes in MultiPLIER

In [None]:
# Maps each gene name in the index of MultiPLIER's Z matrix to the value of
# Gene(...).ensembl_id; gene names that are not in Gene.GENE_NAME_TO_ID_MAP
# are skipped.
# NOTE(review): presumably skipped names are those without a known Ensembl ID
# -- confirm against the Gene class.
genes_in_z = [
    Gene(name=gene_name).ensembl_id
    for gene_name in multiplier_z.index
    if gene_name in Gene.GENE_NAME_TO_ID_MAP
]

In [None]:
len(genes_in_z)

In [None]:
genes_in_z[:5]

In [None]:
genes_in_z = set(genes_in_z)

In [None]:
len(genes_in_z)

In [None]:
# Restrict the gene/variant table to genes that are part of MultiPLIER,
# showing the table size before and after the filter.
display(all_gene_snps.shape)

all_gene_snps = all_gene_snps.loc[lambda tbl: tbl["gene"].isin(genes_in_z)]

display(all_gene_snps.shape)

# (For MultiPLIER genes): How many variants in prediction models are present in the reference panel?

In [None]:
all_snps_in_models_multiplier = set(all_gene_snps["varID"])

n_snps_in_models = len(all_snps_in_models_multiplier)
display(n_snps_in_models)

In [None]:
n_snps_in_ref_panel = len(
    all_snps_in_models_multiplier.intersection(variants_ids_with_genotype)
)
display(n_snps_in_ref_panel)

In [None]:
n_snps_in_ref_panel / n_snps_in_models

## Preprocess SNPs data

In [None]:
variants_ld_block_df = all_gene_snps[["varID"]].drop_duplicates()

In [None]:
variants_ld_block_df.shape

In [None]:
variants_ld_block_df.head()

In [None]:
variants_info = variants_ld_block_df["varID"].str.split("_", expand=True)

In [None]:
variants_info.shape

In [None]:
assert variants_ld_block_df.shape[0] == variants_info.shape[0]

In [None]:
variants_ld_block_df = variants_ld_block_df.join(variants_info)[["varID", 0, 1, 2, 3]]

In [None]:
assert variants_ld_block_df.shape[0] == variants_info.shape[0]

In [None]:
variants_ld_block_df.head()

In [None]:
variants_ld_block_df = variants_ld_block_df.rename(
    columns={
        0: "chr",
        1: "position",
        2: "ref_allele",
        3: "eff_allele",
    }
)

In [None]:
variants_ld_block_df["chr"] = variants_ld_block_df["chr"].apply(lambda x: int(x[3:]))

In [None]:
variants_ld_block_df["position"] = variants_ld_block_df["position"].astype(int)

In [None]:
variants_ld_block_df.shape

In [None]:
variants_ld_block_df.head()

In [None]:
variants_ld_block_df.dtypes

# Covariance for each chromosome block

## Functions

In [None]:
def covariance(df, dtype):
    """
    Computes the sample covariance matrix between the columns of a dataframe.

    Args:
        df: dataframe with observations in rows and variables in columns.
        dtype: numpy dtype the mean-centered data is cast to before the
            matrix product is computed.

    Returns:
        A square dataframe (columns x columns) with the covariances, using
        n - 1 in the denominator, where n is the number of rows of df.
    """
    n_observations = df.shape[0]

    # center each column around its mean (computed in df's original dtype),
    # then cast to the requested precision
    column_means = df.mean()
    centered = df.sub(column_means, axis=1).astype(dtype)

    cross_products = centered.T.dot(centered)
    return cross_products / (n_observations - 1)

In [None]:
# testing: covariance() has to agree with pandas' own covariance estimator
rs = np.random.RandomState(0)

_test_data = pd.DataFrame(
    rs.normal(size=(50, 5)),
    columns=["c0", "c1", "c2", "c3", "c4"],
)

# double precision: the result must match almost exactly, dtype included
_observed_float64 = covariance(_test_data, np.float64)
pd.testing.assert_frame_equal(
    _observed_float64,
    _test_data.cov(),
    check_dtype=True,
    rtol=1e-10,
    atol=1e-10,
)

# single precision: looser relative tolerance, and the dtype is not compared
_observed_float32 = covariance(_test_data, np.float32)
pd.testing.assert_frame_equal(
    _observed_float32,
    _test_data.cov(),
    check_dtype=False,
    rtol=1e-5,
    atol=1e-8,
)

del _test_data, _observed_float64, _observed_float32

In [None]:
def compute_snps_cov(snps_df):
    """
    Computes the covariance matrix of the genotypes of a set of SNPs that are
    all in the same chromosome.

    It reads the notebook-level variables variants_ids_with_genotype,
    REFERENCE_PANEL_DIR and COV_DTYPE.

    Args:
        snps_df: dataframe with at least columns "chr" and "varID"; all rows
            must have the same value in "chr".

    Returns:
        The result of covariance() on the genotypes of the SNPs in snps_df
        that are also in variants_ids_with_genotype. SNPs are requested from
        the genotype file in the order they first appear in snps_df (the
        resulting row/column order assumes the parquet reader honors the
        requested column order).

    Raises:
        AssertionError: if snps_df has SNPs from more than one chromosome (or
            none), or if the genotype file of the chromosome is not found
            exactly once.
    """
    chromosomes = snps_df["chr"].unique()
    assert chromosomes.shape[0] == 1
    chromosome = chromosomes[0]

    # keep variants only present in genotype; dict.fromkeys removes duplicates
    # while keeping the order of snps_df, so the order of the matrix does not
    # depend on the arbitrary iteration order of a set
    snps_ids = [
        snp_id
        for snp_id in dict.fromkeys(snps_df["varID"])
        if snp_id in variants_ids_with_genotype
    ]

    chromosome_file = get_reference_panel_file(
        REFERENCE_PANEL_DIR, f"chr{chromosome}.variants"
    )
    snps_genotypes = pd.read_parquet(chromosome_file, columns=snps_ids)

    return covariance(snps_genotypes, COV_DTYPE)

In [None]:
# testing
_tmp_snps = variants_ld_block_df[variants_ld_block_df["chr"] == 22]
assert _tmp_snps.shape[0] > 0

In [None]:
_tmp_snps.shape

In [None]:
n_expected = len(set(_tmp_snps["varID"]).intersection(variants_ids_with_genotype))
display(n_expected)

In [None]:
_tmp = compute_snps_cov(_tmp_snps)

In [None]:
assert _tmp.shape == (n_expected, n_expected)
assert not _tmp.isna().any().any()

In [None]:
del _tmp_snps, _tmp

## Compute covariance and save

In [None]:
output_file_name_template = conf.PHENOMEXCAN["LD_BLOCKS"][
    "GENE_CORRS_FILE_NAME_TEMPLATES"
]["SNPS_COVARIANCE"]

output_file = OUTPUT_DIR_BASE / output_file_name_template.format(
    prefix="",
    suffix="",
)
display(output_file)

In [None]:
# Writes one covariance matrix per chromosome (key "chr<N>") into a single
# HDF5 file, plus the table of variants under the key "metadata".
with pd.HDFStore(output_file, mode="w", complevel=4) as hdf_store:
    chromosome_groups = tqdm(
        variants_ld_block_df.groupby("chr"),
        total=len(variants_ld_block_df["chr"].unique()),
        ncols=100,
    )

    hdf_store.put("metadata", variants_ld_block_df)

    for chromosome, chromosome_snps in chromosome_groups:
        chromosome_groups.set_description(f"{chromosome} {chromosome_snps.shape}")

        chromosome_cov = compute_snps_cov(chromosome_snps)
        # a covariance matrix with missing values must never be saved
        assert chromosome_cov.notna().all().all()
        hdf_store.put(f"chr{chromosome}", chromosome_cov)

        # release the matrix before processing the next chromosome
        del chromosome_cov
        hdf_store.flush()

        gc.collect()

# Testing

In [None]:
_tmp = variants_ld_block_df[variants_ld_block_df["chr"] == 1]

In [None]:
_tmp.shape

In [None]:
assert _tmp.shape[0] > 0

In [None]:
n_expected = len(set(_tmp["varID"]).intersection(variants_ids_with_genotype))
display(n_expected)
assert n_expected > 0

In [None]:
with pd.HDFStore(output_file, mode="r") as store:
    df = store["chr1"]
    assert df.shape == (n_expected, n_expected)
    assert not df.isna().any().any()