# Description

(Please, take a look at the README.md file in this directory for instructions on how to run this notebook)

This notebook computes the covariance of SNPs for each chr.

# Modules

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import gc
import sqlite3
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

import conf
from entity import Gene

# Settings

In [3]:
COV_DTYPE = np.float32

In [4]:
# reference panel
REFERENCE_PANEL = "GTEX_V8"
# REFERENCE_PANEL = "1000G"

# prediction models
## mashr
EQTL_MODEL = "MASHR"
EQTL_MODEL_FILES_PREFIX = "mashr_"

# ## elastic net
# EQTL_MODEL = "ELASTIC_NET"
# EQTL_MODEL_FILES_PREFIX = "en_"

# make it read the prefix from conf.py
EQTL_MODEL_FILES_PREFIX = None

In [5]:
# Parameters
REFERENCE_PANEL = "GTEX_V8"
EQTL_MODEL = "ELASTIC_NET"


In [6]:
if EQTL_MODEL_FILES_PREFIX is None:
    EQTL_MODEL_FILES_PREFIX = conf.PHENOMEXCAN["PREDICTION_MODELS"][
        f"{EQTL_MODEL}_PREFIX"
    ]

In [7]:
display(f"Using eQTL model: {EQTL_MODEL} / {EQTL_MODEL_FILES_PREFIX}")

'Using eQTL model: ELASTIC_NET / en_'

In [8]:
REFERENCE_PANEL_DIR = conf.PHENOMEXCAN["LD_BLOCKS"][f"{REFERENCE_PANEL}_GENOTYPE_DIR"]

In [9]:
display(f"Using reference panel folder: {str(REFERENCE_PANEL_DIR)}")

'Using reference panel folder: /opt/data/data/phenomexcan/ld_blocks/reference_panel_gtex_v8'

In [10]:
OUTPUT_DIR_BASE = (
    conf.PHENOMEXCAN["LD_BLOCKS"][f"GENE_CORRS_DIR"]
    / REFERENCE_PANEL.lower()
    / EQTL_MODEL.lower()
)
OUTPUT_DIR_BASE.mkdir(parents=True, exist_ok=True)

In [11]:
display(f"Using output dir base: {OUTPUT_DIR_BASE}")

'Using output dir base: /opt/data/data/phenomexcan/ld_blocks/gene_corrs/gtex_v8/elastic_net'

# Load data

## Functions

In [12]:
def get_reference_panel_file(directory: Path, file_pattern: str) -> Path:
    files = list(directory.glob(f"*{file_pattern}*.parquet"))
    assert len(files) == 1, f"More than one file was found: {files}"
    return files[0]

In [13]:
# testing
_tmp = get_reference_panel_file(
    conf.PHENOMEXCAN["LD_BLOCKS"]["GTEX_V8_GENOTYPE_DIR"], "chr1.variants"
)
assert _tmp is not None
assert (
    _tmp.name
    == "gtex_v8_eur_filtered_maf0.01_monoallelic_variants.chr1.variants.parquet"
)

_tmp = get_reference_panel_file(
    conf.PHENOMEXCAN["LD_BLOCKS"]["GTEX_V8_GENOTYPE_DIR"], "_metadata"
)
assert _tmp is not None
assert (
    _tmp.name
    == "gtex_v8_eur_filtered_maf0.01_monoallelic_variants.variants_metadata.parquet"
)

# 1000G
_tmp = get_reference_panel_file(
    conf.PHENOMEXCAN["LD_BLOCKS"]["1000G_GENOTYPE_DIR"], "chr1.variants"
)
assert _tmp is not None
assert _tmp.name == "chr1.variants.parquet"

_tmp = get_reference_panel_file(
    conf.PHENOMEXCAN["LD_BLOCKS"]["1000G_GENOTYPE_DIR"], "_metadata"
)
assert _tmp is not None
assert _tmp.name == "variant_metadata.parquet"

# pattern matches more than one file
try:
    get_reference_panel_file(
        conf.PHENOMEXCAN["LD_BLOCKS"]["1000G_GENOTYPE_DIR"], "chr1"
    )
    raise AssertionError("Exception was not raised")
except AssertionError as e:
    assert "More than one file was found" in str(e)

## SNPs in predictions models

In [14]:
mashr_models_db_files = list(
    conf.PHENOMEXCAN["PREDICTION_MODELS"][EQTL_MODEL].glob("*.db")
)

In [15]:
assert len(mashr_models_db_files) == 49

In [16]:
all_variants_ids = []

for m in mashr_models_db_files:
    print(f"Processing {m.name}")
    tissue = m.name.split(EQTL_MODEL_FILES_PREFIX)[1].split(".db")[0]

    with sqlite3.connect(m) as conn:
        df = pd.read_sql("select gene, varID from weights", conn)
        df["gene"] = df["gene"].apply(lambda x: x.split(".")[0])
        df = df.assign(tissue=tissue)

        all_variants_ids.append(df)

Processing en_Vagina.db
Processing en_Colon_Sigmoid.db
Processing en_Brain_Caudate_basal_ganglia.db
Processing en_Stomach.db
Processing en_Skin_Not_Sun_Exposed_Suprapubic.db
Processing en_Artery_Tibial.db
Processing en_Brain_Hippocampus.db
Processing en_Esophagus_Gastroesophageal_Junction.db
Processing en_Esophagus_Muscularis.db
Processing en_Cells_Cultured_fibroblasts.db
Processing en_Brain_Nucleus_accumbens_basal_ganglia.db
Processing en_Brain_Frontal_Cortex_BA9.db
Processing en_Brain_Hypothalamus.db
Processing en_Breast_Mammary_Tissue.db
Processing en_Colon_Transverse.db
Processing en_Lung.db
Processing en_Esophagus_Mucosa.db
Processing en_Brain_Cerebellum.db
Processing en_Brain_Putamen_basal_ganglia.db
Processing en_Heart_Atrial_Appendage.db
Processing en_Heart_Left_Ventricle.db
Processing en_Brain_Anterior_cingulate_cortex_BA24.db
Processing en_Nerve_Tibial.db
Processing en_Prostate.db
Processing en_Adrenal_Gland.db
Processing en_Minor_Salivary_Gland.db
Processing en_Testis.db
Pro

In [17]:
all_gene_snps = pd.concat(all_variants_ids, ignore_index=True)

In [18]:
all_gene_snps.shape

(8558894, 3)

In [19]:
all_gene_snps.head()

Unnamed: 0,gene,varID,tissue
0,ENSG00000272983,chr10_37251057_C_T_b38,Vagina
1,ENSG00000272983,chr10_37307461_G_T_b38,Vagina
2,ENSG00000272983,chr10_37535769_T_C_b38,Vagina
3,ENSG00000272983,chr10_37537564_T_C_b38,Vagina
4,ENSG00000272983,chr10_37544467_T_C_b38,Vagina


In [20]:
all_snps_in_models = set(all_gene_snps["varID"].unique())

## MultiPLIER Z

In [21]:
multiplier_z = pd.read_pickle(conf.MULTIPLIER["MODEL_Z_MATRIX_FILE"])

In [22]:
multiplier_z.shape

(6750, 987)

In [23]:
multiplier_z.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
GAS6,0.0,0.0,0.039438,0.0,0.050476,0.0,0.0,0.0,0.590949,0.0,...,0.050125,0.0,0.033407,0.0,0.0,0.005963,0.347362,0.0,0.0,0.0
MMP14,0.0,0.0,0.0,0.0,0.070072,0.0,0.0,0.004904,1.720179,2.423595,...,0.0,0.0,0.001007,0.0,0.035747,0.0,0.0,0.0,0.014978,0.0
DSP,0.0,0.0,0.0,0.0,0.0,0.041697,0.0,0.005718,0.0,0.0,...,0.020853,0.0,0.0,0.0,0.0,0.005774,0.0,0.0,0.0,0.416405
MARCKSL1,0.305212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161843,0.149471,...,0.027134,0.05272,0.0,0.030189,0.060884,0.0,0.0,0.0,0.0,0.44848
SPARC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.067779,0.0,0.122417,0.062665


## Reference panel variants metadata

In [24]:
input_file = get_reference_panel_file(REFERENCE_PANEL_DIR, "_metadata")
display(input_file)

PosixPath('/opt/data/data/phenomexcan/ld_blocks/reference_panel_gtex_v8/gtex_v8_eur_filtered_maf0.01_monoallelic_variants.variants_metadata.parquet')

In [25]:
variants_metadata = pd.read_parquet(input_file, columns=["id"])

In [26]:
variants_metadata.shape

(8880842, 1)

In [27]:
variants_metadata.head()

Unnamed: 0,id
0,chr1_13550_G_A_b38
1,chr1_14671_G_C_b38
2,chr1_14677_G_A_b38
3,chr1_14933_G_A_b38
4,chr1_16841_G_T_b38


In [28]:
variants_ids_with_genotype = set(variants_metadata["id"])

In [29]:
len(variants_ids_with_genotype)

8880842

In [30]:
list(variants_ids_with_genotype)[:10]

['chr5_23730990_A_T_b38',
 'chr2_230563662_T_C_b38',
 'chr5_124951706_C_T_b38',
 'chr3_191464512_G_A_b38',
 'chr5_6936248_C_T_b38',
 'chr5_133597437_T_C_b38',
 'chr5_153503821_A_C_b38',
 'chr7_31296335_C_A_b38',
 'chr2_41777897_C_T_b38',
 'chr7_34234636_G_A_b38']

In [31]:
del variants_metadata

# How many variants in predictions models are present in the reference panel?

In [32]:
n_snps_in_models = len(all_snps_in_models)
display(n_snps_in_models)

1657598

In [33]:
n_snps_in_ref_panel = len(all_snps_in_models.intersection(variants_ids_with_genotype))
display(n_snps_in_ref_panel)

1657598

In [34]:
n_snps_in_ref_panel / n_snps_in_models

1.0

# Get final list of genes in MultiPLIER

In [35]:
genes_in_z = [
    Gene(name=gene_name).ensembl_id
    for gene_name in multiplier_z.index
    if gene_name in Gene.GENE_NAME_TO_ID_MAP
]

In [36]:
len(genes_in_z)

6454

In [37]:
genes_in_z[:5]

['ENSG00000183087',
 'ENSG00000157227',
 'ENSG00000096696',
 'ENSG00000175130',
 'ENSG00000113140']

In [38]:
genes_in_z = set(genes_in_z)

In [39]:
len(genes_in_z)

6454

In [40]:
# keep genes in MultiPLIER only
display(all_gene_snps.shape)

all_gene_snps = all_gene_snps[all_gene_snps["gene"].isin(genes_in_z)]

display(all_gene_snps.shape)

(8558894, 3)

(2826818, 3)

# (For MultiPLIER genes): How many variants in predictions models are present in the reference panel?

In [41]:
all_snps_in_models_multiplier = set(all_gene_snps["varID"])

n_snps_in_models = len(all_snps_in_models_multiplier)
display(n_snps_in_models)

949171

In [42]:
n_snps_in_ref_panel = len(
    all_snps_in_models_multiplier.intersection(variants_ids_with_genotype)
)
display(n_snps_in_ref_panel)

949171

In [43]:
n_snps_in_ref_panel / n_snps_in_models

1.0

## Preprocess SNPs data

In [44]:
variants_ld_block_df = all_gene_snps[["varID"]].drop_duplicates()

In [45]:
variants_ld_block_df.shape

(949171, 1)

In [46]:
variants_ld_block_df.head()

Unnamed: 0,varID
58,chr10_42564881_C_T_b38
59,chr10_42567504_T_C_b38
60,chr10_42568462_G_T_b38
61,chr10_42569289_A_G_b38
62,chr10_42570991_C_T_b38


In [47]:
variants_info = variants_ld_block_df["varID"].str.split("_", expand=True)

In [48]:
variants_info.shape

(949171, 5)

In [49]:
assert variants_ld_block_df.shape[0] == variants_info.shape[0]

In [50]:
variants_ld_block_df = variants_ld_block_df.join(variants_info)[["varID", 0, 1, 2, 3]]

In [51]:
assert variants_ld_block_df.shape[0] == variants_info.shape[0]

In [52]:
variants_ld_block_df.head()

Unnamed: 0,varID,0,1,2,3
58,chr10_42564881_C_T_b38,chr10,42564881,C,T
59,chr10_42567504_T_C_b38,chr10,42567504,T,C
60,chr10_42568462_G_T_b38,chr10,42568462,G,T
61,chr10_42569289_A_G_b38,chr10,42569289,A,G
62,chr10_42570991_C_T_b38,chr10,42570991,C,T


In [53]:
variants_ld_block_df = variants_ld_block_df.rename(
    columns={
        0: "chr",
        1: "position",
        2: "ref_allele",
        3: "eff_allele",
    }
)

In [54]:
variants_ld_block_df["chr"] = variants_ld_block_df["chr"].apply(lambda x: int(x[3:]))

In [55]:
variants_ld_block_df["position"] = variants_ld_block_df["position"].astype(int)

In [56]:
variants_ld_block_df.shape

(949171, 5)

In [57]:
variants_ld_block_df.head()

Unnamed: 0,varID,chr,position,ref_allele,eff_allele
58,chr10_42564881_C_T_b38,10,42564881,C,T
59,chr10_42567504_T_C_b38,10,42567504,T,C
60,chr10_42568462_G_T_b38,10,42568462,G,T
61,chr10_42569289_A_G_b38,10,42569289,A,G
62,chr10_42570991_C_T_b38,10,42570991,C,T


In [58]:
variants_ld_block_df.dtypes

varID         object
chr            int64
position       int64
ref_allele    object
eff_allele    object
dtype: object

# Covariance for each chromosome block

## Functions

In [59]:
def covariance(df, dtype):
    n = df.shape[0]
    df = df.sub(df.mean(), axis=1).astype(dtype)
    return df.T.dot(df) / (n - 1)

In [60]:
# testing
rs = np.random.RandomState(0)

_test_data = pd.DataFrame(rs.normal(size=(50, 5)), columns=[f"c{i}" for i in range(5)])

# float64
pd.testing.assert_frame_equal(
    covariance(_test_data, np.float64),
    _test_data.cov(),
    rtol=1e-10,
    atol=1e-10,
    check_dtype=True,
)

# float32
pd.testing.assert_frame_equal(
    covariance(_test_data, np.float32),
    _test_data.cov(),
    rtol=1e-5,
    atol=1e-8,
    check_dtype=False,
)

del _test_data

In [61]:
def compute_snps_cov(snps_df):
    assert snps_df["chr"].unique().shape[0] == 1
    chromosome = snps_df["chr"].unique()[0]

    # keep variants only present in genotype
    snps_ids = list(set(snps_df["varID"]).intersection(variants_ids_with_genotype))

    chromosome_file = get_reference_panel_file(
        REFERENCE_PANEL_DIR, f"chr{chromosome}.variants"
    )
    snps_genotypes = pd.read_parquet(chromosome_file, columns=snps_ids)

    return covariance(snps_genotypes, COV_DTYPE)

In [62]:
# testing
_tmp_snps = variants_ld_block_df[variants_ld_block_df["chr"] == 22]
assert _tmp_snps.shape[0] > 0

In [63]:
_tmp_snps.shape

(20774, 5)

In [64]:
n_expected = len(set(_tmp_snps["varID"]).intersection(variants_ids_with_genotype))
display(n_expected)

20774

In [65]:
_tmp = compute_snps_cov(_tmp_snps)

In [66]:
assert _tmp.shape == (n_expected, n_expected)
assert not _tmp.isna().any().any()

In [67]:
del _tmp_snps, _tmp

## Compute covariance and save

In [68]:
output_file_name_template = conf.PHENOMEXCAN["LD_BLOCKS"][
    "GENE_CORRS_FILE_NAME_TEMPLATES"
]["SNPS_COVARIANCE"]

output_file = OUTPUT_DIR_BASE / output_file_name_template.format(
    prefix="",
    suffix="",
)
display(output_file)

PosixPath('/opt/data/data/phenomexcan/ld_blocks/gene_corrs/gtex_v8/elastic_net/snps_chr_blocks_cov.h5')

In [69]:
with pd.HDFStore(output_file, mode="w", complevel=4) as store:
    pbar = tqdm(
        variants_ld_block_df.groupby("chr"),
        ncols=100,
        total=variants_ld_block_df["chr"].unique().shape[0],
    )

    store["metadata"] = variants_ld_block_df

    for grp_name, grp_data in pbar:
        pbar.set_description(f"{grp_name} {grp_data.shape}")
        snps_cov = compute_snps_cov(grp_data)  # .astype(COV_DTYPE)
        assert not snps_cov.isna().any().any()
        store[f"chr{grp_name}"] = snps_cov

        del snps_cov
        store.flush()

        gc.collect()

22 (20774, 5): 100%|████████████████████████████████████████████| 22/22 [8:47:02<00:00, 1437.38s/it]


# Testing

In [70]:
_tmp = variants_ld_block_df[variants_ld_block_df["chr"] == 1]

In [71]:
_tmp.shape

(91597, 5)

In [72]:
assert _tmp.shape[0] > 0

In [73]:
n_expected = len(set(_tmp["varID"]).intersection(variants_ids_with_genotype))
display(n_expected)
assert n_expected > 0

91597

In [74]:
with pd.HDFStore(output_file, mode="r") as store:
    df = store["chr1"]
    assert df.shape == (n_expected, n_expected)
    assert not df.isna().any().any()