# Description

This notebook computes the covariance of SNPs for each chr.

# Modules

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sqlite3

import numpy as np
import pandas as pd
from tqdm import tqdm

import conf
from entity import Gene

# Load data

## SNPs in predictions models

In [3]:
mashr_models_db_files = list(
    conf.PHENOMEXCAN["PREDICTION_MODELS"]["MASHR"].glob("*.db")
)

In [4]:
assert len(mashr_models_db_files) == 49

In [5]:
all_variants_ids = []

for m in mashr_models_db_files:
    print(f"Processing {m.name}")
    tissue = m.name.split("mashr_")[1].split(".db")[0]

    with sqlite3.connect(m) as conn:
        df = pd.read_sql("select gene, varID from weights", conn)
        df["gene"] = df["gene"].apply(lambda x: x.split(".")[0])
        df = df.assign(tissue=tissue)

        all_variants_ids.append(df)

Processing mashr_Skin_Not_Sun_Exposed_Suprapubic.db
Processing mashr_Cells_EBV-transformed_lymphocytes.db
Processing mashr_Brain_Frontal_Cortex_BA9.db
Processing mashr_Kidney_Cortex.db
Processing mashr_Brain_Substantia_nigra.db
Processing mashr_Spleen.db
Processing mashr_Colon_Transverse.db


Processing mashr_Heart_Left_Ventricle.db
Processing mashr_Lung.db
Processing mashr_Muscle_Skeletal.db
Processing mashr_Brain_Hypothalamus.db
Processing mashr_Brain_Cortex.db
Processing mashr_Brain_Amygdala.db
Processing mashr_Esophagus_Mucosa.db


Processing mashr_Adrenal_Gland.db
Processing mashr_Uterus.db
Processing mashr_Prostate.db
Processing mashr_Whole_Blood.db
Processing mashr_Pituitary.db
Processing mashr_Esophagus_Gastroesophageal_Junction.db
Processing mashr_Stomach.db


Processing mashr_Heart_Atrial_Appendage.db
Processing mashr_Brain_Cerebellum.db
Processing mashr_Breast_Mammary_Tissue.db
Processing mashr_Artery_Tibial.db
Processing mashr_Artery_Aorta.db
Processing mashr_Small_Intestine_Terminal_Ileum.db
Processing mashr_Brain_Hippocampus.db


Processing mashr_Testis.db
Processing mashr_Brain_Putamen_basal_ganglia.db
Processing mashr_Pancreas.db
Processing mashr_Adipose_Subcutaneous.db
Processing mashr_Brain_Cerebellar_Hemisphere.db
Processing mashr_Colon_Sigmoid.db
Processing mashr_Minor_Salivary_Gland.db


Processing mashr_Cells_Cultured_fibroblasts.db
Processing mashr_Brain_Nucleus_accumbens_basal_ganglia.db
Processing mashr_Brain_Anterior_cingulate_cortex_BA24.db
Processing mashr_Vagina.db
Processing mashr_Ovary.db
Processing mashr_Skin_Sun_Exposed_Lower_leg.db
Processing mashr_Esophagus_Muscularis.db


Processing mashr_Brain_Spinal_cord_cervical_c-1.db
Processing mashr_Artery_Coronary.db
Processing mashr_Thyroid.db
Processing mashr_Brain_Caudate_basal_ganglia.db
Processing mashr_Adipose_Visceral_Omentum.db
Processing mashr_Nerve_Tibial.db
Processing mashr_Liver.db


In [6]:
all_gene_snps = pd.concat(all_variants_ids, ignore_index=True)

In [7]:
all_gene_snps.shape

(1132714, 3)

In [8]:
all_gene_snps.head()

Unnamed: 0,gene,varID,tissue
0,ENSG00000169583,chr9_136996001_G_A_b38,Skin_Not_Sun_Exposed_Suprapubic
1,ENSG00000107331,chr9_137029055_C_CA_b38,Skin_Not_Sun_Exposed_Suprapubic
2,ENSG00000107331,chr9_137029407_T_G_b38,Skin_Not_Sun_Exposed_Suprapubic
3,ENSG00000180549,chr9_137031950_T_C_b38,Skin_Not_Sun_Exposed_Suprapubic
4,ENSG00000180549,chr9_137032610_A_G_b38,Skin_Not_Sun_Exposed_Suprapubic


In [9]:
all_snps_in_models = set(all_gene_snps["varID"].unique())

## MultiPLIER Z

In [10]:
multiplier_z = pd.read_pickle(conf.MULTIPLIER["MODEL_Z_MATRIX_FILE"])

In [11]:
multiplier_z.shape

(6750, 987)

In [12]:
multiplier_z.head()

Unnamed: 0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,...,LV978,LV979,LV980,LV981,LV982,LV983,LV984,LV985,LV986,LV987
GAS6,0.0,0.0,0.039438,0.0,0.050476,0.0,0.0,0.0,0.590949,0.0,...,0.050125,0.0,0.033407,0.0,0.0,0.005963,0.347362,0.0,0.0,0.0
MMP14,0.0,0.0,0.0,0.0,0.070072,0.0,0.0,0.004904,1.720179,2.423595,...,0.0,0.0,0.001007,0.0,0.035747,0.0,0.0,0.0,0.014978,0.0
DSP,0.0,0.0,0.0,0.0,0.0,0.041697,0.0,0.005718,0.0,0.0,...,0.020853,0.0,0.0,0.0,0.0,0.005774,0.0,0.0,0.0,0.416405
MARCKSL1,0.305212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161843,0.149471,...,0.027134,0.05272,0.0,0.030189,0.060884,0.0,0.0,0.0,0.0,0.44848
SPARC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.067779,0.0,0.122417,0.062665


## 1000G variants metadata

In [13]:
input_file = (
    conf.PHENOMEXCAN["LD_BLOCKS"]["1000G_GENOTYPE_DIR"] / "variant_metadata.parquet"
)
display(input_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/ld_blocks/reference_panel_1000G/variant_metadata.parquet')

In [14]:
variants_metadata = pd.read_parquet(input_file, columns=["id"])

In [15]:
variants_metadata.shape

(27731498, 1)

In [16]:
variants_metadata.head()

Unnamed: 0,id
0,chr1_17496_AC_A_b38
1,chr1_54490_G_A_b38
2,chr1_55330_G_A_b38
3,chr1_55416_G_A_b38
4,chr1_62157_G_A_b38


In [17]:
variants_ids_with_genotype = set(variants_metadata["id"])

In [18]:
len(variants_ids_with_genotype)

27731498

In [19]:
list(variants_ids_with_genotype)[:10]

['chr11_45002488_A_T_b38',
 'chr16_81462547_A_G_b38',
 'chr12_99930541_C_T_b38',
 'chr20_44176423_G_T_b38',
 'chr7_113665978_C_CTTA_b38',
 'chr12_103775295_C_T_b38',
 'chr11_83121405_T_C_b38',
 'chr4_87809126_G_A_b38',
 'chr5_161603025_T_G_b38',
 'chr11_23978558_G_T_b38']

In [20]:
del variants_metadata

# How many variants in predictions models are present in 1000G?

In [21]:
n_snps_in_models = len(all_snps_in_models)
display(n_snps_in_models)

237405

In [22]:
n_snps_in_1000g = len(all_snps_in_models.intersection(variants_ids_with_genotype))
display(n_snps_in_1000g)

222720

In [23]:
n_snps_in_1000g / n_snps_in_models

0.9381436785240412

# Get final list of genes in MultiPLIER

In [24]:
genes_in_z = [
    Gene(name=gene_name).ensembl_id
    for gene_name in multiplier_z.index
    if gene_name in Gene.GENE_NAME_TO_ID_MAP
]

In [25]:
len(genes_in_z)

6454

In [26]:
genes_in_z[:5]

['ENSG00000183087',
 'ENSG00000157227',
 'ENSG00000096696',
 'ENSG00000175130',
 'ENSG00000113140']

In [27]:
genes_in_z = set(genes_in_z)

In [28]:
len(genes_in_z)

6454

In [29]:
# keep genes in MultiPLIER only
display(all_gene_snps.shape)

all_gene_snps = all_gene_snps[all_gene_snps["gene"].isin(genes_in_z)]

display(all_gene_snps.shape)

(1132714, 3)

(396890, 3)

# (For MultiPLIER genes): How many variants in predictions models are present in 1000G?

In [30]:
all_snps_in_models_multiplier = set(all_gene_snps["varID"])

n_snps_in_models = len(all_snps_in_models_multiplier)
display(n_snps_in_models)

84708

In [31]:
n_snps_in_1000g = len(
    all_snps_in_models_multiplier.intersection(variants_ids_with_genotype)
)
display(n_snps_in_1000g)

80339

In [32]:
n_snps_in_1000g / n_snps_in_models

0.9484228172073476

## Preprocess SNPs data

In [33]:
variants_ld_block_df = all_gene_snps[["varID"]].drop_duplicates()

In [34]:
variants_ld_block_df.shape

(84708, 1)

In [35]:
variants_ld_block_df.head()

Unnamed: 0,varID
0,chr9_136996001_G_A_b38
1,chr9_137029055_C_CA_b38
2,chr9_137029407_T_G_b38
10,chr9_137084985_C_T_b38
11,chr9_137102549_TACAC_T_b38


In [36]:
variants_info = variants_ld_block_df["varID"].str.split("_", expand=True)

In [37]:
variants_info.shape

(84708, 5)

In [38]:
assert variants_ld_block_df.shape[0] == variants_info.shape[0]

In [39]:
variants_ld_block_df = variants_ld_block_df.join(variants_info)[["varID", 0, 1, 2, 3]]

In [40]:
assert variants_ld_block_df.shape[0] == variants_info.shape[0]

In [41]:
variants_ld_block_df.head()

Unnamed: 0,varID,0,1,2,3
0,chr9_136996001_G_A_b38,chr9,136996001,G,A
1,chr9_137029055_C_CA_b38,chr9,137029055,C,CA
2,chr9_137029407_T_G_b38,chr9,137029407,T,G
10,chr9_137084985_C_T_b38,chr9,137084985,C,T
11,chr9_137102549_TACAC_T_b38,chr9,137102549,TACAC,T


In [42]:
variants_ld_block_df = variants_ld_block_df.rename(
    columns={
        0: "chr",
        1: "position",
        2: "ref_allele",
        3: "eff_allele",
    }
)

In [43]:
variants_ld_block_df["chr"] = variants_ld_block_df["chr"].apply(lambda x: int(x[3:]))

In [44]:
variants_ld_block_df["position"] = variants_ld_block_df["position"].astype(int)

In [45]:
variants_ld_block_df.shape

(84708, 5)

In [46]:
variants_ld_block_df.head()

Unnamed: 0,varID,chr,position,ref_allele,eff_allele
0,chr9_136996001_G_A_b38,9,136996001,G,A
1,chr9_137029055_C_CA_b38,9,137029055,C,CA
2,chr9_137029407_T_G_b38,9,137029407,T,G
10,chr9_137084985_C_T_b38,9,137084985,C,T
11,chr9_137102549_TACAC_T_b38,9,137102549,TACAC,T


In [47]:
variants_ld_block_df.dtypes

varID         object
chr            int64
position       int64
ref_allele    object
eff_allele    object
dtype: object

# Covariance for each chromosome block

## Functions

In [48]:
def compute_snps_cov(snps_df):
    assert snps_df["chr"].unique().shape[0]
    chromosome = snps_df["chr"].unique()[0]

    # keep variants only present in genotype
    snps_ids = list(set(snps_df["varID"]).intersection(variants_ids_with_genotype))

    chromosome_file = (
        conf.PHENOMEXCAN["LD_BLOCKS"]["1000G_GENOTYPE_DIR"]
        / f"chr{chromosome}.variants.parquet"
    )
    snps_genotypes = pd.read_parquet(chromosome_file, columns=snps_ids)

    return snps_genotypes.cov()

In [49]:
# testing
_tmp_snps = variants_ld_block_df[variants_ld_block_df["chr"] == 22]
assert _tmp_snps.shape[0] > 0

In [50]:
_tmp_snps.shape

(2687, 5)

In [51]:
n_expected = len(set(_tmp_snps["varID"]).intersection(variants_ids_with_genotype))
display(n_expected)

2484

In [52]:
_tmp = compute_snps_cov(_tmp_snps)

In [53]:
assert _tmp.shape == (n_expected, n_expected)
assert not _tmp.isna().any().any()

## Compute covariance and save

In [54]:
output_file = conf.PHENOMEXCAN["LD_BLOCKS"]["BASE_DIR"] / "mashr_snps_chr_blocks_cov.h5"
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/ld_blocks/mashr_snps_chr_blocks_cov.h5')

In [55]:
with pd.HDFStore(output_file, mode="w", complevel=4) as store:
    pbar = tqdm(
        variants_ld_block_df.groupby("chr"),
        ncols=100,
        total=variants_ld_block_df["chr"].unique().shape[0],
    )

    store["metadata"] = variants_ld_block_df

    for grp_name, grp_data in pbar:
        pbar.set_description(f"{grp_name} {grp_data.shape}")
        snps_cov = compute_snps_cov(grp_data).astype(np.float32)
        assert not snps_cov.isna().any().any()
        store[f"chr{grp_name}"] = snps_cov

  0%|                                                                        | 0/22 [00:00<?, ?it/s]

1 (8130, 5):   0%|                                                           | 0/22 [00:00<?, ?it/s]

1 (8130, 5):   5%|██▏                                             | 1/22 [07:33<2:38:47, 453.69s/it]

2 (5857, 5):   5%|██▏                                             | 1/22 [07:33<2:38:47, 453.69s/it]

2 (5857, 5):   9%|████▎                                           | 2/22 [13:28<2:21:20, 424.04s/it]

3 (4816, 5):   9%|████▎                                           | 2/22 [13:28<2:21:20, 424.04s/it]

3 (4816, 5):  14%|██████▌                                         | 3/22 [17:29<1:56:53, 369.14s/it]

4 (3302, 5):  14%|██████▌                                         | 3/22 [17:29<1:56:53, 369.14s/it]

4 (3302, 5):  18%|████████▋                                       | 4/22 [20:35<1:34:14, 314.12s/it]

5 (4056, 5):  18%|████████▋                                       | 4/22 [20:35<1:34:14, 314.12s/it]

5 (4056, 5):  23%|██████████▉                                     | 5/22 [23:41<1:18:09, 275.83s/it]

6 (4517, 5):  23%|██████████▉                                     | 5/22 [23:41<1:18:09, 275.83s/it]

6 (4517, 5):  27%|█████████████                                   | 6/22 [27:15<1:08:35, 257.20s/it]

7 (3751, 5):  27%|█████████████                                   | 6/22 [27:15<1:08:35, 257.20s/it]

7 (3751, 5):  32%|███████████████▉                                  | 7/22 [29:58<57:14, 228.99s/it]

8 (3141, 5):  32%|███████████████▉                                  | 7/22 [29:58<57:14, 228.99s/it]

8 (3141, 5):  36%|██████████████████▏                               | 8/22 [32:12<46:44, 200.30s/it]

9 (3323, 5):  36%|██████████████████▏                               | 8/22 [32:12<46:44, 200.30s/it]

9 (3323, 5):  41%|████████████████████▍                             | 9/22 [34:00<37:24, 172.62s/it]

10 (3619, 5):  41%|████████████████████                             | 9/22 [34:00<37:24, 172.62s/it]

10 (3619, 5):  45%|█████████████████████▊                          | 10/22 [36:28<33:04, 165.40s/it]

11 (4706, 5):  45%|█████████████████████▊                          | 10/22 [36:28<33:04, 165.40s/it]

11 (4706, 5):  50%|████████████████████████                        | 11/22 [39:39<31:44, 173.13s/it]

12 (4508, 5):  50%|████████████████████████                        | 11/22 [39:39<31:44, 173.13s/it]

12 (4508, 5):  55%|██████████████████████████▏                     | 12/22 [42:34<28:57, 173.72s/it]

13 (1718, 5):  55%|██████████████████████████▏                     | 12/22 [42:34<28:57, 173.72s/it]

13 (1718, 5):  59%|████████████████████████████▎                   | 13/22 [43:38<21:06, 140.69s/it]

14 (2667, 5):  59%|████████████████████████████▎                   | 13/22 [43:38<21:06, 140.69s/it]

14 (2667, 5):  64%|██████████████████████████████▌                 | 14/22 [45:12<16:53, 126.67s/it]

15 (2611, 5):  64%|██████████████████████████████▌                 | 14/22 [45:12<16:53, 126.67s/it]

15 (2611, 5):  68%|████████████████████████████████▋               | 15/22 [46:31<13:06, 112.29s/it]

16 (3635, 5):  68%|████████████████████████████████▋               | 15/22 [46:31<13:06, 112.29s/it]

16 (3635, 5):  73%|██████████████████████████████████▉             | 16/22 [48:10<10:51, 108.50s/it]

17 (5121, 5):  73%|██████████████████████████████████▉             | 16/22 [48:10<10:51, 108.50s/it]

17 (5121, 5):  77%|█████████████████████████████████████           | 17/22 [50:04<09:10, 110.01s/it]

18 (1493, 5):  77%|█████████████████████████████████████           | 17/22 [50:04<09:10, 110.01s/it]

18 (1493, 5):  82%|████████████████████████████████████████         | 18/22 [51:15<06:32, 98.23s/it]

19 (7329, 5):  82%|████████████████████████████████████████         | 18/22 [51:15<06:32, 98.23s/it]

19 (7329, 5):  86%|█████████████████████████████████████████▍      | 19/22 [53:17<05:16, 105.34s/it]

20 (2479, 5):  86%|█████████████████████████████████████████▍      | 19/22 [53:17<05:16, 105.34s/it]

20 (2479, 5):  91%|████████████████████████████████████████████▌    | 20/22 [54:22<03:06, 93.36s/it]

21 (1242, 5):  91%|████████████████████████████████████████████▌    | 20/22 [54:22<03:06, 93.36s/it]

21 (1242, 5):  95%|██████████████████████████████████████████████▊  | 21/22 [55:01<01:17, 77.08s/it]

22 (2687, 5):  95%|██████████████████████████████████████████████▊  | 21/22 [55:01<01:17, 77.08s/it]

22 (2687, 5): 100%|█████████████████████████████████████████████████| 22/22 [55:41<00:00, 65.95s/it]

22 (2687, 5): 100%|████████████████████████████████████████████████| 22/22 [55:41<00:00, 151.89s/it]




# Testing

In [56]:
_tmp = variants_ld_block_df[variants_ld_block_df["chr"] == 1]

In [57]:
_tmp.shape

(8130, 5)

In [58]:
assert _tmp.shape[0] > 0

In [59]:
n_expected = len(set(_tmp["varID"]).intersection(variants_ids_with_genotype))
display(n_expected)
assert n_expected > 0

7701

In [60]:
with pd.HDFStore(output_file, mode="r") as store:
    df = store["chr1"]
    assert df.shape == (n_expected, n_expected)
    assert not df.isna().any().any()