# Description

This notebook computes the covariance matrix of the SNP genotypes within each LD block and saves one matrix per LD block to an HDF5 file.

# Modules

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

# import rpy2.robjects as ro
# from rpy2.robjects import pandas2ri
# from rpy2.robjects.conversion import localconverter

import conf

# Load data

## SNPs per LD block data

In [None]:
# Path to the pickled table with the variants and their LD block
input_file = conf.PHENOMEXCAN["LD_BLOCKS"]["BASE_DIR"] / "mashr_snps_ld_blocks.pkl"
display(input_file)

In [None]:
# NOTE(review): unpickling can execute arbitrary code; this presumably reads a
# file produced by this project's own pipeline -- confirm it is trusted.
variants_ld_block_df = pd.read_pickle(input_file)

In [None]:
# (rows, columns) of the LD block table
variants_ld_block_df.shape

In [None]:
# Preview the first rows of the LD block table
variants_ld_block_df.head()

## 1000G variants metadata

In [None]:
# Path to the variant metadata stored next to the 1000G genotype files
input_file = (
    conf.PHENOMEXCAN["LD_BLOCKS"]["1000G_GENOTYPE_DIR"] / "variant_metadata.parquet"
)
display(input_file)

In [None]:
# Only the "id" column is loaded; it is the only one used below
variants_metadata = pd.read_parquet(input_file, columns=["id"])

In [None]:
# (rows, columns) of the variant metadata table
variants_metadata.shape

In [None]:
# Preview the first rows of the variant metadata table
variants_metadata.head()

In [None]:
# Set of variant IDs listed in the genotype metadata; used later to keep only
# the variants of each LD block that have genotype data
variants_ids_with_genotype = set(variants_metadata["id"])

In [None]:
# Number of unique variant IDs
len(variants_ids_with_genotype)

In [None]:
from itertools import islice

# Peek at a few variant IDs without first copying the whole set into a list
list(islice(variants_ids_with_genotype, 10))

In [None]:
# The IDs are already in the set above; drop the dataframe reference so its
# memory can be reclaimed
del variants_metadata

# Compute covariance for each LD block

In [None]:
def compute_snps_cov(snps_df):
    """
    Compute the genotype covariance matrix for the variants of one LD block.

    Args:
        snps_df: dataframe indexed by variant ID with a "chr" column. All rows
            must belong to the same chromosome, since genotypes are read from
            a single per-chromosome parquet file.

    Returns:
        A dataframe with the pairwise covariance (pandas.DataFrame.cov) of
        the genotype columns of the variants in snps_df that are also in
        variants_ids_with_genotype.

    Raises:
        AssertionError: if snps_df does not contain exactly one chromosome.
    """
    chromosomes = snps_df["chr"].unique()
    assert (
        chromosomes.shape[0] == 1
    ), f"Expected exactly one chromosome, found {chromosomes.shape[0]}"
    chromosome = chromosomes[0]

    # keep only variants present in the genotype data; iterate over the
    # (de-duplicated) index instead of intersecting sets so the requested
    # column order is deterministic across runs
    snps_ids = [
        snp_id
        for snp_id in snps_df.index.unique()
        if snp_id in variants_ids_with_genotype
    ]

    chromosome_file = (
        conf.PHENOMEXCAN["LD_BLOCKS"]["1000G_GENOTYPE_DIR"]
        / f"chr{chromosome}.variants.parquet"
    )
    # load only the genotype columns of the selected variants
    snps_genotypes = pd.read_parquet(chromosome_file, columns=snps_ids)

    return snps_genotypes.cov()

In [None]:
# HDF5 file where the per-LD-block covariance matrices are written
output_file = conf.PHENOMEXCAN["LD_BLOCKS"]["BASE_DIR"] / "mashr_snps_ld_blocks_cov.h5"
display(output_file)

In [None]:
# Write the LD block table plus one covariance matrix per LD block into a
# single HDF5 file (mode="w" starts from a fresh file).
with pd.HDFStore(output_file, mode="w", complevel=4) as store:
    n_ld_blocks = variants_ld_block_df["ld_block"].unique().shape[0]
    pbar = tqdm(
        variants_ld_block_df.groupby("ld_block"), ncols=100, total=n_ld_blocks
    )

    # keep the input table next to the results, under the "metadata" key
    store["metadata"] = variants_ld_block_df

    for grp_name, grp_data in pbar:
        # show the current LD block and its size in the progress bar
        pbar.set_description(f"{grp_name} {grp_data.shape}")

        snps_cov = compute_snps_cov(grp_data)
        snps_cov = snps_cov.astype(np.float32)

        # fail fast if any covariance entry is missing (NaN)
        assert not snps_cov.isna().any().any()

        # each matrix is stored under the name of its LD block
        store[grp_name] = snps_cov

# Testing

In [None]:
# Variants of a single LD block ("chr10_10"), used to spot-check the output
_tmp = variants_ld_block_df[variants_ld_block_df["ld_block"] == "chr10_10"]

In [None]:
# (rows, columns) of the selected LD block
_tmp.shape

In [None]:
# Number of variants in this LD block that are also in the genotype data;
# the stored covariance matrix should be square with this size
n_expected = len(set(_tmp.index).intersection(variants_ids_with_genotype))
display(n_expected)

In [None]:
# Read back the matrix of one LD block and verify its size and completeness
with pd.HDFStore(output_file, mode="r") as store:
    df = store["chr10_10"]

    # square matrix, one row/column per variant with genotype data
    expected_shape = (n_expected, n_expected)
    assert df.shape == expected_shape

    # no entry may be missing
    assert not df.isna().any().any()