# Description

(Please take a look at the README.md file in this directory for instructions on how to run this notebook.)

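This notebook computes, separately for each chromosome, the covariance matrix of the SNPs that are used by the prediction models (restricted to genes present in MultiPLIER) and have genotypes in the reference panel.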

# Modules

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import gc
import sqlite3
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

import conf
from entity import Gene

# Settings

In [3]:
# Default settings. The "# Parameters" cell that follows this one can
# override any of these values.

# reference panel: "GTEX_V8" or "1000G"
REFERENCE_PANEL = "GTEX_V8"

# prediction models: "MASHR" or "ELASTIC_NET"
EQTL_MODEL = "MASHR"

# prefix of the prediction model file names (e.g. "mashr_" or "en_");
# None makes the notebook read the prefix from conf.py using EQTL_MODEL
EQTL_MODEL_FILES_PREFIX = None

# the numpy dtype used for the covariance matrix:
#  either "float64" or "float32" (for huge matrices); None means float64
COVARIANCE_MATRIX_DTYPE = None

In [4]:
# Parameters
REFERENCE_PANEL = "1000G"
EQTL_MODEL = "MASHR"


In [5]:
# when no explicit prefix was given in the settings, use the one stored in
# conf.py under "<EQTL_MODEL>_PREFIX"
if EQTL_MODEL_FILES_PREFIX is None:
    _prefix_key = f"{EQTL_MODEL}_PREFIX"
    EQTL_MODEL_FILES_PREFIX = conf.PHENOMEXCAN["PREDICTION_MODELS"][_prefix_key]

In [6]:
display(f"Using eQTL model: {EQTL_MODEL} / {EQTL_MODEL_FILES_PREFIX}")

'Using eQTL model: MASHR / mashr_'

In [7]:
REFERENCE_PANEL_DIR = conf.PHENOMEXCAN["LD_BLOCKS"][f"{REFERENCE_PANEL}_GENOTYPE_DIR"]

In [8]:
display(f"Using reference panel folder: {str(REFERENCE_PANEL_DIR)}")

'Using reference panel folder: /opt/data/data/phenomexcan/ld_blocks/reference_panel_1000G'

In [9]:
# results are stored in one subfolder per reference panel and eQTL model
OUTPUT_DIR_BASE = (
    conf.PHENOMEXCAN["LD_BLOCKS"]["GENE_CORRS_DIR"]
    / REFERENCE_PANEL.lower()
    / EQTL_MODEL.lower()
)
OUTPUT_DIR_BASE.mkdir(parents=True, exist_ok=True)

In [10]:
display(f"Using output dir base: {OUTPUT_DIR_BASE}")

'Using output dir base: /opt/data/data/phenomexcan/ld_blocks/gene_corrs/1000g/mashr'

In [11]:
# maps the names accepted in COVARIANCE_MATRIX_DTYPE to numpy dtypes
cov_dtype_dict = {
    "float32": np.float32,
    "float64": np.float64,
}

if COVARIANCE_MATRIX_DTYPE is None:
    # no dtype requested: default to double precision
    COV_DTYPE = np.float64
elif COVARIANCE_MATRIX_DTYPE in cov_dtype_dict:
    COV_DTYPE = cov_dtype_dict[COVARIANCE_MATRIX_DTYPE]
else:
    # fail loudly instead of silently falling back to float64, which would
    # hide a mistyped dtype name
    raise ValueError(
        f"Invalid COVARIANCE_MATRIX_DTYPE: {COVARIANCE_MATRIX_DTYPE!r}; "
        f"expected one of {sorted(cov_dtype_dict)} or None"
    )

display(f"Covariance matrix dtype used: {str(COV_DTYPE)}")

"Covariance matrix dtype used: <class 'numpy.float64'>"

# Load data

## Functions

In [12]:
def get_reference_panel_file(directory: Path, file_pattern: str) -> Path:
    """
    Returns the single parquet file in a directory whose name contains a pattern.

    Files are searched (non-recursively) with the glob
    "*<file_pattern>*.parquet".

    Args:
        directory: folder where the parquet files are searched.
        file_pattern: text that the file name must contain.

    Returns:
        The path to the only file that matches the pattern.

    Raises:
        AssertionError: if no file, or more than one file, matches the pattern.
    """
    # sorted so the list shown in the error message has a stable order
    files = sorted(directory.glob(f"*{file_pattern}*.parquet"))

    if not files:
        raise AssertionError(
            f"No file matching '*{file_pattern}*.parquet' was found in {directory}"
        )

    if len(files) > 1:
        raise AssertionError(f"More than one file was found: {files}")

    return files[0]

In [13]:
# testing
_tmp = get_reference_panel_file(
    conf.PHENOMEXCAN["LD_BLOCKS"]["GTEX_V8_GENOTYPE_DIR"], "chr1.variants"
)
assert _tmp is not None
assert (
    _tmp.name
    == "gtex_v8_eur_filtered_maf0.01_monoallelic_variants.chr1.variants.parquet"
)

_tmp = get_reference_panel_file(
    conf.PHENOMEXCAN["LD_BLOCKS"]["GTEX_V8_GENOTYPE_DIR"], "_metadata"
)
assert _tmp is not None
assert (
    _tmp.name
    == "gtex_v8_eur_filtered_maf0.01_monoallelic_variants.variants_metadata.parquet"
)

# 1000G
_tmp = get_reference_panel_file(
    conf.PHENOMEXCAN["LD_BLOCKS"]["1000G_GENOTYPE_DIR"], "chr1.variants"
)
assert _tmp is not None
assert _tmp.name == "chr1.variants.parquet"

_tmp = get_reference_panel_file(
    conf.PHENOMEXCAN["LD_BLOCKS"]["1000G_GENOTYPE_DIR"], "_metadata"
)
assert _tmp is not None
assert _tmp.name == "variant_metadata.parquet"

# pattern matches more than one file
try:
    get_reference_panel_file(
        conf.PHENOMEXCAN["LD_BLOCKS"]["1000G_GENOTYPE_DIR"], "chr1"
    )
    raise AssertionError("Exception was not raised")
except AssertionError as e:
    assert "More than one file was found" in str(e)

## SNPs in prediction models

In [14]:
mashr_models_db_files = list(
    conf.PHENOMEXCAN["PREDICTION_MODELS"][EQTL_MODEL].glob("*.db")
)

In [15]:
assert len(mashr_models_db_files) == 49

In [16]:
# one dataframe per model file with columns: gene, varID, tissue
all_variants_ids = []

for m in mashr_models_db_files:
    print(f"Processing {m.name}")
    # the tissue name is the file name without the model prefix and ".db"
    tissue = m.name.split(EQTL_MODEL_FILES_PREFIX)[1].split(".db")[0]

    # sqlite3's connection context manager only commits or rolls back the
    # transaction; it does not close the connection, so close it explicitly
    conn = sqlite3.connect(m)
    try:
        df = pd.read_sql("select gene, varID from weights", conn)
    finally:
        conn.close()

    # keep only the part of the gene id before the first "."
    df["gene"] = df["gene"].str.split(".", n=1).str[0]
    df = df.assign(tissue=tissue)

    all_variants_ids.append(df)

Processing mashr_Skin_Not_Sun_Exposed_Suprapubic.db
Processing mashr_Cells_EBV-transformed_lymphocytes.db
Processing mashr_Brain_Frontal_Cortex_BA9.db
Processing mashr_Kidney_Cortex.db
Processing mashr_Brain_Substantia_nigra.db
Processing mashr_Spleen.db
Processing mashr_Colon_Transverse.db
Processing mashr_Heart_Left_Ventricle.db
Processing mashr_Lung.db
Processing mashr_Muscle_Skeletal.db
Processing mashr_Brain_Hypothalamus.db
Processing mashr_Brain_Cortex.db
Processing mashr_Brain_Amygdala.db
Processing mashr_Esophagus_Mucosa.db
Processing mashr_Adrenal_Gland.db
Processing mashr_Uterus.db
Processing mashr_Prostate.db
Processing mashr_Whole_Blood.db
Processing mashr_Pituitary.db
Processing mashr_Esophagus_Gastroesophageal_Junction.db
Processing mashr_Stomach.db
Processing mashr_Heart_Atrial_Appendage.db
Processing mashr_Brain_Cerebellum.db
Processing mashr_Breast_Mammary_Tissue.db
Processing mashr_Artery_Tibial.db
Processing mashr_Artery_Aorta.db
Processing mashr_Small_Intestine_Term

In [17]:
all_gene_snps = pd.concat(all_variants_ids, ignore_index=True)

In [18]:
all_gene_snps.shape

(1132714, 3)

In [19]:
all_gene_snps.head()

Unnamed: 0,gene,varID,tissue
0,ENSG00000169583,chr9_136996001_G_A_b38,Skin_Not_Sun_Exposed_Suprapubic
1,ENSG00000107331,chr9_137029055_C_CA_b38,Skin_Not_Sun_Exposed_Suprapubic
2,ENSG00000107331,chr9_137029407_T_G_b38,Skin_Not_Sun_Exposed_Suprapubic
3,ENSG00000180549,chr9_137031950_T_C_b38,Skin_Not_Sun_Exposed_Suprapubic
4,ENSG00000180549,chr9_137032610_A_G_b38,Skin_Not_Sun_Exposed_Suprapubic


In [20]:
all_snps_in_models = set(all_gene_snps["varID"].unique())

## MultiPLIER Z

In [21]:
multiplier_z = pd.read_pickle(conf.MULTIPLIER["MODEL_Z_MATRIX_FILE"])

In [22]:
multiplier_z.shape

(6750, 987)

In [23]:
multiplier_z.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
GAS6,0.0,0.0,0.039438,0.0,0.050476,0.0,0.0,0.0,0.590949,0.0,...,0.050125,0.0,0.033407,0.0,0.0,0.005963,0.347362,0.0,0.0,0.0
MMP14,0.0,0.0,0.0,0.0,0.070072,0.0,0.0,0.004904,1.720179,2.423595,...,0.0,0.0,0.001007,0.0,0.035747,0.0,0.0,0.0,0.014978,0.0
DSP,0.0,0.0,0.0,0.0,0.0,0.041697,0.0,0.005718,0.0,0.0,...,0.020853,0.0,0.0,0.0,0.0,0.005774,0.0,0.0,0.0,0.416405
MARCKSL1,0.305212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161843,0.149471,...,0.027134,0.05272,0.0,0.030189,0.060884,0.0,0.0,0.0,0.0,0.44848
SPARC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.067779,0.0,0.122417,0.062665


## Reference panel variants metadata

In [24]:
input_file = get_reference_panel_file(REFERENCE_PANEL_DIR, "_metadata")
display(input_file)

PosixPath('/opt/data/data/phenomexcan/ld_blocks/reference_panel_1000G/variant_metadata.parquet')

In [25]:
variants_metadata = pd.read_parquet(input_file, columns=["id"])

In [26]:
variants_metadata.shape

(27731498, 1)

In [27]:
variants_metadata.head()

Unnamed: 0,id
0,chr1_17496_AC_A_b38
1,chr1_54490_G_A_b38
2,chr1_55330_G_A_b38
3,chr1_55416_G_A_b38
4,chr1_62157_G_A_b38


In [28]:
variants_ids_with_genotype = set(variants_metadata["id"])

In [29]:
len(variants_ids_with_genotype)

27731498

In [30]:
list(variants_ids_with_genotype)[:10]

['chr1_37985530_C_G_b38',
 'chr10_15569395_T_C_b38',
 'chr8_55098244_G_A_b38',
 'chr5_120588383_C_T_b38',
 'chr2_149192685_C_T_b38',
 'chr2_80027582_T_C_b38',
 'chr5_149757891_C_A_b38',
 'chr15_61600701_C_G_b38',
 'chr8_73519344_TC_T_b38',
 'chr13_42027366_GT_G_b38']

In [31]:
del variants_metadata

# How many variants in the prediction models are present in the reference panel?

In [32]:
n_snps_in_models = len(all_snps_in_models)
display(n_snps_in_models)

237405

In [33]:
n_snps_in_ref_panel = len(all_snps_in_models.intersection(variants_ids_with_genotype))
display(n_snps_in_ref_panel)

222720

In [34]:
n_snps_in_ref_panel / n_snps_in_models

0.9381436785240412

# Get final list of genes in MultiPLIER

In [35]:
genes_in_z = [
    Gene(name=gene_name).ensembl_id
    for gene_name in multiplier_z.index
    if gene_name in Gene.GENE_NAME_TO_ID_MAP
]

In [36]:
len(genes_in_z)

6454

In [37]:
genes_in_z[:5]

['ENSG00000183087',
 'ENSG00000157227',
 'ENSG00000096696',
 'ENSG00000175130',
 'ENSG00000113140']

In [38]:
genes_in_z = set(genes_in_z)

In [39]:
len(genes_in_z)

6454

In [40]:
# keep genes in MultiPLIER only
display(all_gene_snps.shape)

all_gene_snps = all_gene_snps[all_gene_snps["gene"].isin(genes_in_z)]

display(all_gene_snps.shape)

(1132714, 3)

(396890, 3)

# (For MultiPLIER genes): How many variants in the prediction models are present in the reference panel?

In [41]:
all_snps_in_models_multiplier = set(all_gene_snps["varID"])

n_snps_in_models = len(all_snps_in_models_multiplier)
display(n_snps_in_models)

84708

In [42]:
n_snps_in_ref_panel = len(
    all_snps_in_models_multiplier.intersection(variants_ids_with_genotype)
)
display(n_snps_in_ref_panel)

80339

In [43]:
n_snps_in_ref_panel / n_snps_in_models

0.9484228172073476

## Preprocess SNPs data

In [44]:
variants_ld_block_df = all_gene_snps[["varID"]].drop_duplicates()

In [45]:
variants_ld_block_df.shape

(84708, 1)

In [46]:
variants_ld_block_df.head()

Unnamed: 0,varID
0,chr9_136996001_G_A_b38
1,chr9_137029055_C_CA_b38
2,chr9_137029407_T_G_b38
10,chr9_137084985_C_T_b38
11,chr9_137102549_TACAC_T_b38


In [47]:
variants_info = variants_ld_block_df["varID"].str.split("_", expand=True)

In [48]:
variants_info.shape

(84708, 5)

In [49]:
assert variants_ld_block_df.shape[0] == variants_info.shape[0]

In [50]:
variants_ld_block_df = variants_ld_block_df.join(variants_info)[["varID", 0, 1, 2, 3]]

In [51]:
assert variants_ld_block_df.shape[0] == variants_info.shape[0]

In [52]:
variants_ld_block_df.head()

Unnamed: 0,varID,0,1,2,3
0,chr9_136996001_G_A_b38,chr9,136996001,G,A
1,chr9_137029055_C_CA_b38,chr9,137029055,C,CA
2,chr9_137029407_T_G_b38,chr9,137029407,T,G
10,chr9_137084985_C_T_b38,chr9,137084985,C,T
11,chr9_137102549_TACAC_T_b38,chr9,137102549,TACAC,T


In [53]:
variants_ld_block_df = variants_ld_block_df.rename(
    columns={
        0: "chr",
        1: "position",
        2: "ref_allele",
        3: "eff_allele",
    }
)

In [54]:
# strip the "chr" prefix and convert to integer; going through astype(str)
# keeps this cell safe to re-run once the column has already been converted
variants_ld_block_df["chr"] = (
    variants_ld_block_df["chr"]
    .astype(str)
    .str.replace("^chr", "", regex=True)
    .astype(int)
)

In [55]:
variants_ld_block_df["position"] = variants_ld_block_df["position"].astype(int)

In [56]:
variants_ld_block_df.shape

(84708, 5)

In [57]:
variants_ld_block_df.head()

Unnamed: 0,varID,chr,position,ref_allele,eff_allele
0,chr9_136996001_G_A_b38,9,136996001,G,A
1,chr9_137029055_C_CA_b38,9,137029055,C,CA
2,chr9_137029407_T_G_b38,9,137029407,T,G
10,chr9_137084985_C_T_b38,9,137084985,C,T
11,chr9_137102549_TACAC_T_b38,9,137102549,TACAC,T


In [58]:
variants_ld_block_df.dtypes

varID         object
chr            int64
position       int64
ref_allele    object
eff_allele    object
dtype: object

# Covariance for each chromosome block

## Functions

In [59]:
def covariance(df, dtype):
    """
    Computes the covariance matrix between the columns of a dataframe.

    Each column is centered on its mean, the centered data is cast to `dtype`
    and the cross-product of the columns is divided by (number of rows - 1).

    Args:
        df: dataframe with observations in rows and variables in columns.
        dtype: numpy dtype the centered data is cast to before the product.

    Returns:
        A dataframe with one row and one column per column of `df`.
    """
    n_rows = df.shape[0]
    centered = (df - df.mean()).astype(dtype)
    cross_products = centered.T @ centered
    return cross_products / (n_rows - 1)

In [60]:
# testing
rs = np.random.RandomState(0)

_test_data = pd.DataFrame(rs.normal(size=(50, 5)), columns=[f"c{i}" for i in range(5)])

# float64
pd.testing.assert_frame_equal(
    covariance(_test_data, np.float64),
    _test_data.cov(),
    rtol=1e-10,
    atol=1e-10,
    check_dtype=True,
)

# float32
pd.testing.assert_frame_equal(
    covariance(_test_data, np.float32),
    _test_data.cov(),
    rtol=1e-5,
    atol=1e-8,
    check_dtype=False,
)

del _test_data

In [61]:
def compute_snps_cov(snps_df):
    """
    Computes the covariance matrix of the SNPs of a single chromosome.

    Only the SNPs that have genotypes in the reference panel (those in
    variants_ids_with_genotype) are used. Their genotypes are read from the
    chromosome's parquet file in REFERENCE_PANEL_DIR.

    Args:
        snps_df: dataframe with columns "chr" and "varID"; all rows must
            belong to the same chromosome.

    Returns:
        A dataframe with the covariance between SNPs (see covariance()),
        computed with dtype COV_DTYPE.

    Raises:
        AssertionError: if snps_df contains SNPs from zero or several
            chromosomes.
    """
    chromosomes = snps_df["chr"].unique()
    assert (
        chromosomes.shape[0] == 1
    ), f"Expected SNPs from exactly one chromosome, got: {chromosomes}"
    chromosome = chromosomes[0]

    # keep variants only present in genotype; iterate over the unique ids in
    # order of first appearance (instead of over a set, whose iteration order
    # for strings can differ between Python processes) so the column order of
    # the result is reproducible
    snps_ids = [
        snp_id
        for snp_id in snps_df["varID"].unique()
        if snp_id in variants_ids_with_genotype
    ]

    chromosome_file = get_reference_panel_file(
        REFERENCE_PANEL_DIR, f"chr{chromosome}.variants"
    )
    snps_genotypes = pd.read_parquet(chromosome_file, columns=snps_ids)

    return covariance(snps_genotypes, COV_DTYPE)

In [62]:
# testing
_tmp_snps = variants_ld_block_df[variants_ld_block_df["chr"] == 22]
assert _tmp_snps.shape[0] > 0

In [63]:
_tmp_snps.shape

(2687, 5)

In [64]:
n_expected = len(set(_tmp_snps["varID"]).intersection(variants_ids_with_genotype))
display(n_expected)

2484

In [65]:
_tmp = compute_snps_cov(_tmp_snps)

In [66]:
assert _tmp.shape == (n_expected, n_expected)
assert not _tmp.isna().any().any()

In [67]:
del _tmp_snps, _tmp

## Compute covariance and save

In [68]:
output_file_name_template = conf.PHENOMEXCAN["LD_BLOCKS"][
    "GENE_CORRS_FILE_NAME_TEMPLATES"
]["SNPS_COVARIANCE"]

output_file = OUTPUT_DIR_BASE / output_file_name_template.format(
    prefix="",
    suffix="",
)
display(output_file)

PosixPath('/opt/data/data/phenomexcan/ld_blocks/gene_corrs/1000g/mashr/snps_chr_blocks_cov.h5')

In [69]:
# the output file gets a "metadata" key with the variants table and one
# "chr<N>" key per chromosome with the SNPs covariance matrix
with pd.HDFStore(output_file, mode="w", complevel=4) as store:
    store["metadata"] = variants_ld_block_df

    n_chromosomes = variants_ld_block_df["chr"].unique().shape[0]
    pbar = tqdm(
        variants_ld_block_df.groupby("chr"),
        ncols=100,
        total=n_chromosomes,
    )

    for chromosome, chromosome_snps in pbar:
        pbar.set_description(f"{chromosome} {chromosome_snps.shape}")

        snps_cov = compute_snps_cov(chromosome_snps)
        assert not snps_cov.isna().any().any()
        store[f"chr{chromosome}"] = snps_cov

        # drop the reference to the matrix and write pending data before
        # moving on to the next chromosome
        del snps_cov
        store.flush()

        gc.collect()

22 (2687, 5): 100%|██████████████████████████████████████████████| 22/22 [1:07:16<00:00, 183.48s/it]


# Testing

In [70]:
_tmp = variants_ld_block_df[variants_ld_block_df["chr"] == 1]

In [71]:
_tmp.shape

(8130, 5)

In [72]:
assert _tmp.shape[0] > 0

In [73]:
n_expected = len(set(_tmp["varID"]).intersection(variants_ids_with_genotype))
display(n_expected)
assert n_expected > 0

7701

In [74]:
with pd.HDFStore(output_file, mode="r") as store:
    df = store["chr1"]
    assert df.shape == (n_expected, n_expected)
    assert not df.isna().any().any()