# Description

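This notebook computes, for each chromosome, the covariance matrix of the genotypes (from the 1000 Genomes reference panel) of the SNPs used by the gene expression prediction models, restricted to genes present in MultiPLIER.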

# Modules

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import gc
import sqlite3
from contextlib import closing

import numpy as np
import pandas as pd
from tqdm import tqdm

import conf
from entity import Gene

# Settings

In [3]:
COV_DTYPE = np.float32

In [4]:
# mashr
EQTL_MODEL = "MASHR"
EQTL_MODEL_FILES_PREFIX = "mashr_"

# # elastic net
# EQTL_MODEL = "ELASTIC_NET"
# EQTL_MODEL_FILES_PREFIX = "en_"

# make it read the prefix from conf.py
EQTL_MODEL_FILES_PREFIX = None

In [5]:
# Parameters
EQTL_MODEL = "MASHR"


In [6]:
if EQTL_MODEL_FILES_PREFIX is None:
    EQTL_MODEL_FILES_PREFIX = conf.PHENOMEXCAN["PREDICTION_MODELS_PREFIXES"][EQTL_MODEL]

In [7]:
display(f"Using eQTL model: {EQTL_MODEL} / {EQTL_MODEL_FILES_PREFIX}")

'Using eQTL model: MASHR / mashr_'

# Load data

## SNPs in prediction models

In [8]:
mashr_models_db_files = list(
    conf.PHENOMEXCAN["PREDICTION_MODELS"][EQTL_MODEL].glob("*.db")
)

In [9]:
assert len(mashr_models_db_files) == 49

In [10]:
# one data frame (gene, varID, tissue) per prediction model database
all_variants_ids = []

for m in mashr_models_db_files:
    print(f"Processing {m.name}")
    # the tissue name is what remains of the file name after removing the
    # model prefix and the ".db" extension
    tissue = m.name.split(EQTL_MODEL_FILES_PREFIX)[1].split(".db")[0]

    # sqlite3's connection context manager only commits/rolls back the
    # transaction and leaves the connection open; closing() makes sure each
    # database file handle is released before moving on to the next one
    with closing(sqlite3.connect(m)) as conn:
        df = pd.read_sql("select gene, varID from weights", conn)

    # keep only the part of the gene id before the first "." (presumably the
    # Ensembl version suffix -- TODO confirm)
    df["gene"] = df["gene"].str.split(".").str[0]
    df = df.assign(tissue=tissue)

    all_variants_ids.append(df)

Processing mashr_Skin_Not_Sun_Exposed_Suprapubic.db
Processing mashr_Cells_EBV-transformed_lymphocytes.db
Processing mashr_Brain_Frontal_Cortex_BA9.db
Processing mashr_Kidney_Cortex.db
Processing mashr_Brain_Substantia_nigra.db
Processing mashr_Spleen.db
Processing mashr_Colon_Transverse.db
Processing mashr_Heart_Left_Ventricle.db
Processing mashr_Lung.db
Processing mashr_Muscle_Skeletal.db
Processing mashr_Brain_Hypothalamus.db
Processing mashr_Brain_Cortex.db
Processing mashr_Brain_Amygdala.db
Processing mashr_Esophagus_Mucosa.db
Processing mashr_Adrenal_Gland.db
Processing mashr_Uterus.db
Processing mashr_Prostate.db
Processing mashr_Whole_Blood.db
Processing mashr_Pituitary.db
Processing mashr_Esophagus_Gastroesophageal_Junction.db
Processing mashr_Stomach.db
Processing mashr_Heart_Atrial_Appendage.db
Processing mashr_Brain_Cerebellum.db
Processing mashr_Breast_Mammary_Tissue.db
Processing mashr_Artery_Tibial.db
Processing mashr_Artery_Aorta.db
Processing mashr_Small_Intestine_Term

In [11]:
all_gene_snps = pd.concat(all_variants_ids, ignore_index=True)

In [12]:
all_gene_snps.shape

(1132714, 3)

In [13]:
all_gene_snps.head()

Unnamed: 0,gene,varID,tissue
0,ENSG00000169583,chr9_136996001_G_A_b38,Skin_Not_Sun_Exposed_Suprapubic
1,ENSG00000107331,chr9_137029055_C_CA_b38,Skin_Not_Sun_Exposed_Suprapubic
2,ENSG00000107331,chr9_137029407_T_G_b38,Skin_Not_Sun_Exposed_Suprapubic
3,ENSG00000180549,chr9_137031950_T_C_b38,Skin_Not_Sun_Exposed_Suprapubic
4,ENSG00000180549,chr9_137032610_A_G_b38,Skin_Not_Sun_Exposed_Suprapubic


In [14]:
all_snps_in_models = set(all_gene_snps["varID"].unique())

## MultiPLIER Z

In [15]:
multiplier_z = pd.read_pickle(conf.MULTIPLIER["MODEL_Z_MATRIX_FILE"])

In [16]:
multiplier_z.shape

(6750, 987)

In [17]:
multiplier_z.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
GAS6,0.0,0.0,0.039438,0.0,0.050476,0.0,0.0,0.0,0.590949,0.0,...,0.050125,0.0,0.033407,0.0,0.0,0.005963,0.347362,0.0,0.0,0.0
MMP14,0.0,0.0,0.0,0.0,0.070072,0.0,0.0,0.004904,1.720179,2.423595,...,0.0,0.0,0.001007,0.0,0.035747,0.0,0.0,0.0,0.014978,0.0
DSP,0.0,0.0,0.0,0.0,0.0,0.041697,0.0,0.005718,0.0,0.0,...,0.020853,0.0,0.0,0.0,0.0,0.005774,0.0,0.0,0.0,0.416405
MARCKSL1,0.305212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161843,0.149471,...,0.027134,0.05272,0.0,0.030189,0.060884,0.0,0.0,0.0,0.0,0.44848
SPARC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.067779,0.0,0.122417,0.062665


## 1000G variants metadata

In [18]:
input_file = (
    conf.PHENOMEXCAN["LD_BLOCKS"]["1000G_GENOTYPE_DIR"] / "variant_metadata.parquet"
)
display(input_file)

PosixPath('/opt/data/data/phenomexcan/ld_blocks/reference_panel_1000G/variant_metadata.parquet')

In [19]:
variants_metadata = pd.read_parquet(input_file, columns=["id"])

In [20]:
variants_metadata.shape

(27731498, 1)

In [21]:
variants_metadata.head()

Unnamed: 0,id
0,chr1_17496_AC_A_b38
1,chr1_54490_G_A_b38
2,chr1_55330_G_A_b38
3,chr1_55416_G_A_b38
4,chr1_62157_G_A_b38


In [22]:
variants_ids_with_genotype = set(variants_metadata["id"])

In [23]:
len(variants_ids_with_genotype)

27731498

In [24]:
list(variants_ids_with_genotype)[:10]

['chr1_203148767_C_T_b38',
 'chr5_101741893_G_T_b38',
 'chr5_147235562_G_A_b38',
 'chr7_121742746_C_G_b38',
 'chr8_24511427_A_G_b38',
 'chr11_110424325_C_A_b38',
 'chr14_69322186_A_G_b38',
 'chr21_23076762_T_G_b38',
 'chr15_31670819_G_A_b38',
 'chr2_31191964_G_A_b38']

In [25]:
del variants_metadata

# How many variants in prediction models are present in 1000G?

In [26]:
n_snps_in_models = len(all_snps_in_models)
display(n_snps_in_models)

237405

In [27]:
n_snps_in_1000g = len(all_snps_in_models.intersection(variants_ids_with_genotype))
display(n_snps_in_1000g)

222720

In [28]:
n_snps_in_1000g / n_snps_in_models

0.9381436785240412

# Get final list of genes in MultiPLIER

In [29]:
genes_in_z = [
    Gene(name=gene_name).ensembl_id
    for gene_name in multiplier_z.index
    if gene_name in Gene.GENE_NAME_TO_ID_MAP
]

In [30]:
len(genes_in_z)

6454

In [31]:
genes_in_z[:5]

['ENSG00000183087',
 'ENSG00000157227',
 'ENSG00000096696',
 'ENSG00000175130',
 'ENSG00000113140']

In [32]:
genes_in_z = set(genes_in_z)

In [33]:
len(genes_in_z)

6454

In [34]:
# keep genes in MultiPLIER only
display(all_gene_snps.shape)

all_gene_snps = all_gene_snps[all_gene_snps["gene"].isin(genes_in_z)]

display(all_gene_snps.shape)

(1132714, 3)

(396890, 3)

# (For MultiPLIER genes): How many variants in prediction models are present in 1000G?

In [35]:
all_snps_in_models_multiplier = set(all_gene_snps["varID"])

n_snps_in_models = len(all_snps_in_models_multiplier)
display(n_snps_in_models)

84708

In [36]:
n_snps_in_1000g = len(
    all_snps_in_models_multiplier.intersection(variants_ids_with_genotype)
)
display(n_snps_in_1000g)

80339

In [37]:
n_snps_in_1000g / n_snps_in_models

0.9484228172073476

## Preprocess SNPs data

In [38]:
variants_ld_block_df = all_gene_snps[["varID"]].drop_duplicates()

In [39]:
variants_ld_block_df.shape

(84708, 1)

In [40]:
variants_ld_block_df.head()

Unnamed: 0,varID
0,chr9_136996001_G_A_b38
1,chr9_137029055_C_CA_b38
2,chr9_137029407_T_G_b38
10,chr9_137084985_C_T_b38
11,chr9_137102549_TACAC_T_b38


In [41]:
variants_info = variants_ld_block_df["varID"].str.split("_", expand=True)

In [42]:
variants_info.shape

(84708, 5)

In [43]:
assert variants_ld_block_df.shape[0] == variants_info.shape[0]

In [44]:
variants_ld_block_df = variants_ld_block_df.join(variants_info)[["varID", 0, 1, 2, 3]]

In [45]:
assert variants_ld_block_df.shape[0] == variants_info.shape[0]

In [46]:
variants_ld_block_df.head()

Unnamed: 0,varID,0,1,2,3
0,chr9_136996001_G_A_b38,chr9,136996001,G,A
1,chr9_137029055_C_CA_b38,chr9,137029055,C,CA
2,chr9_137029407_T_G_b38,chr9,137029407,T,G
10,chr9_137084985_C_T_b38,chr9,137084985,C,T
11,chr9_137102549_TACAC_T_b38,chr9,137102549,TACAC,T


In [47]:
variants_ld_block_df = variants_ld_block_df.rename(
    columns={
        0: "chr",
        1: "position",
        2: "ref_allele",
        3: "eff_allele",
    }
)

In [48]:
variants_ld_block_df["chr"] = variants_ld_block_df["chr"].apply(lambda x: int(x[3:]))

In [49]:
variants_ld_block_df["position"] = variants_ld_block_df["position"].astype(int)

In [50]:
variants_ld_block_df.shape

(84708, 5)

In [51]:
variants_ld_block_df.head()

Unnamed: 0,varID,chr,position,ref_allele,eff_allele
0,chr9_136996001_G_A_b38,9,136996001,G,A
1,chr9_137029055_C_CA_b38,9,137029055,C,CA
2,chr9_137029407_T_G_b38,9,137029407,T,G
10,chr9_137084985_C_T_b38,9,137084985,C,T
11,chr9_137102549_TACAC_T_b38,9,137102549,TACAC,T


In [52]:
variants_ld_block_df.dtypes

varID         object
chr            int64
position       int64
ref_allele    object
eff_allele    object
dtype: object

# Covariance for each chromosome block

## Functions

In [53]:
def covariance(df, dtype):
    """
    Compute the sample covariance matrix between the columns of a data frame.

    Each column is centered on its mean (computed before any casting), the
    centered data is cast to `dtype`, and the cross-product of the centered
    data is divided by the number of rows minus one.

    Args:
        df: pandas DataFrame with observations in rows and variables in
            columns.
        dtype: numpy dtype the centered data is cast to before the matrix
            product is computed.

    Returns:
        A square pandas DataFrame with the columns of `df` as both index and
        columns.
    """
    n_samples = df.shape[0]
    centered = (df - df.mean()).astype(dtype)
    return centered.T.dot(centered) / (n_samples - 1)

In [54]:
# testing
rs = np.random.RandomState(0)

_test_data = pd.DataFrame(rs.normal(size=(50, 5)), columns=[f"c{i}" for i in range(5)])

# float64
pd.testing.assert_frame_equal(
    covariance(_test_data, np.float64),
    _test_data.cov(),
    rtol=1e-10,
    atol=1e-10,
    check_dtype=True,
)

# float32
pd.testing.assert_frame_equal(
    covariance(_test_data, np.float32),
    _test_data.cov(),
    rtol=1e-5,
    atol=1e-8,
    check_dtype=False,
)

del _test_data

In [55]:
def compute_snps_cov(snps_df):
    """
    Compute the covariance matrix of the genotypes of a chromosome's variants.

    Args:
        snps_df: data frame with a "chr" column (which must contain a single
            distinct value) and a "varID" column with variant ids.

    Returns:
        The output of `covariance` (using COV_DTYPE) on the genotype columns
        of the variants in `snps_df` that are also present in
        `variants_ids_with_genotype`. Variants are requested in order of
        first appearance in `snps_df`.

    Raises:
        AssertionError: if `snps_df` has zero or more than one chromosome.
    """
    chromosomes = snps_df["chr"].unique()
    assert chromosomes.shape[0] == 1
    chromosome = chromosomes[0]

    # keep only variants present in the genotype data; walk the deduplicated
    # input column instead of materializing a set intersection so the order
    # of the variants (and thus of the rows/columns of the result) does not
    # depend on Python's per-process string hash randomization
    snps_ids = [
        variant_id
        for variant_id in snps_df["varID"].drop_duplicates()
        if variant_id in variants_ids_with_genotype
    ]

    chromosome_file = (
        conf.PHENOMEXCAN["LD_BLOCKS"]["1000G_GENOTYPE_DIR"]
        / f"chr{chromosome}.variants.parquet"
    )
    # read only the genotype columns of the selected variants
    snps_genotypes = pd.read_parquet(chromosome_file, columns=snps_ids)

    return covariance(snps_genotypes, COV_DTYPE)

In [56]:
# testing
_tmp_snps = variants_ld_block_df[variants_ld_block_df["chr"] == 22]
assert _tmp_snps.shape[0] > 0

In [57]:
_tmp_snps.shape

(2687, 5)

In [58]:
n_expected = len(set(_tmp_snps["varID"]).intersection(variants_ids_with_genotype))
display(n_expected)

2484

In [59]:
_tmp = compute_snps_cov(_tmp_snps)

In [60]:
assert _tmp.shape == (n_expected, n_expected)
assert not _tmp.isna().any().any()

In [61]:
del _tmp_snps, _tmp

## Compute covariance and save

In [62]:
output_file = conf.PHENOMEXCAN["LD_BLOCKS"][EQTL_MODEL]["SNPS_COVARIANCE_FILE"]
display(output_file)

PosixPath('/opt/data/data/phenomexcan/ld_blocks/mashr_snps_chr_blocks_cov.h5')

In [63]:
with pd.HDFStore(output_file, mode="w", complevel=4) as store:
    # keep the variants' metadata in the same file as the covariance matrices
    store["metadata"] = variants_ld_block_df

    n_chromosomes = variants_ld_block_df["chr"].nunique()
    pbar = tqdm(variants_ld_block_df.groupby("chr"), ncols=100, total=n_chromosomes)

    for grp_name, grp_data in pbar:
        pbar.set_description(f"{grp_name} {grp_data.shape}")

        snps_cov = compute_snps_cov(grp_data)
        assert not snps_cov.isna().any().any()

        # one key per chromosome (for instance, "chr22")
        store[f"chr{grp_name}"] = snps_cov
        store.flush()

        # drop the matrix before the next chromosome is loaded to keep the
        # memory usage down
        del snps_cov
        gc.collect()

22 (2687, 5): 100%|██████████████████████████████████████████████| 22/22 [1:06:35<00:00, 181.62s/it]


# Testing

In [64]:
_tmp = variants_ld_block_df[variants_ld_block_df["chr"] == 1]

In [65]:
_tmp.shape

(8130, 5)

In [66]:
assert _tmp.shape[0] > 0

In [67]:
n_expected = len(set(_tmp["varID"]).intersection(variants_ids_with_genotype))
display(n_expected)
assert n_expected > 0

7701

In [68]:
with pd.HDFStore(output_file, mode="r") as store:
    df = store["chr1"]
    assert df.shape == (n_expected, n_expected)
    assert not df.isna().any().any()