# Description

This notebook separates all SNPs in prediction models into "genes" (SNPs are grouped according to whethere they are predictors for a gene's expresssion).

# Modules

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import sqlite3

import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

import conf

In [3]:
load_rda = ro.r["load"]

# Load data

## LD blocks

In [4]:
input_file = str(conf.PHENOMEXCAN["LD_BLOCKS"]["LD_BLOCKS_FILE"])
display(input_file)

'/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/ld_blocks/ld_block_pickrell_eur_b38.rda'

In [5]:
load_rda(input_file)

0
'ld_block_pickrell_eur_b38'


In [6]:
ro.r["ld_block_pickrell_eur_b38"]

0,1
ld_block,[RTYPES.VECSXP]
meta_information,[RTYPES.VECSXP]


### Show metadata

In [7]:
ro.r["ld_block_pickrell_eur_b38"][1]

0,1
genome_assembly_version,[RTYPES.STRSXP]
notes,[RTYPES.STRSXP]


In [8]:
np.array(ro.r["ld_block_pickrell_eur_b38"][1][0])

array(['hg38'], dtype='<U4')

In [9]:
np.array(ro.r["ld_block_pickrell_eur_b38"][1][1])

array(['LD block annotation (for European population) obtained from Berisa et al (https://academic.oup.com/bioinformatics/article/32/2/283/1743626). Formatted by Alvaro Barbeira'],
      dtype='<U169')

### Load LD blocks

In [10]:
ld_block_r = ro.r["ld_block_pickrell_eur_b38"][0]

In [11]:
ld_block_r.rownames

0,1,2,3,4,5,6
'1','2','3',...,'1700','1701','1702'


In [12]:
ld_block_r.colnames

0,1,2,3
'region_na...,'chromosome','start','end'


In [13]:
with localconverter(ro.default_converter + pandas2ri.converter):
    ld_block_df = ro.conversion.rpy2py(ld_block_r)

In [14]:
ld_block_df

Unnamed: 0,region_name,chromosome,start,end
1,chr1_1,chr1,10583,1961168
2,chr1_2,chr1,1961168,3666172
3,chr1_3,chr1,3666172,4320751
4,chr1_4,chr1,4320751,5853833
5,chr1_5,chr1,5853833,7187275
...,...,...,...,...
1698,chr22_20,chr22,44599428,46074615
1699,chr22_21,chr22,46074615,47200568
1700,chr22_22,chr22,47200568,48507891
1701,chr22_23,chr22,48507891,49430885


In [15]:
ld_block_df.dtypes

region_name    object
chromosome     object
start           int32
end             int32
dtype: object

### Save in tsv

In [16]:
output_file = conf.PHENOMEXCAN["LD_BLOCKS"]["LD_BLOCKS_FILE"].parent / (
    conf.PHENOMEXCAN["LD_BLOCKS"]["LD_BLOCKS_FILE"].stem + ".tsv"
)
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/ld_blocks/ld_block_pickrell_eur_b38.tsv')

In [17]:
ld_block_df.to_csv(output_file, sep="\t", index=False)

## SNPs in predictions models

In [18]:
mashr_models_db_files = list(
    conf.PHENOMEXCAN["PREDICTION_MODELS"]["MASHR"].glob("*.db")
)

In [19]:
assert len(mashr_models_db_files) == 49

In [20]:
all_variants_ids = []

for m in mashr_models_db_files:
    print(f"Processing {m.name}")
    tissue = m.name.split("mashr_")[1].split(".db")[0]

    with sqlite3.connect(m) as conn:
        df = pd.read_sql("select gene, varID from weights", conn)
        df["gene"] = df["gene"].apply(lambda x: x.split(".")[0])
        df = df.assign(tissue=tissue)
        
        all_variants_ids.append(df)

Processing mashr_Brain_Cerebellar_Hemisphere.db
Processing mashr_Adrenal_Gland.db
Processing mashr_Spleen.db
Processing mashr_Minor_Salivary_Gland.db
Processing mashr_Brain_Anterior_cingulate_cortex_BA24.db
Processing mashr_Brain_Frontal_Cortex_BA9.db
Processing mashr_Stomach.db
Processing mashr_Cells_EBV-transformed_lymphocytes.db
Processing mashr_Uterus.db
Processing mashr_Esophagus_Muscularis.db
Processing mashr_Heart_Left_Ventricle.db
Processing mashr_Nerve_Tibial.db
Processing mashr_Breast_Mammary_Tissue.db
Processing mashr_Cells_Cultured_fibroblasts.db
Processing mashr_Prostate.db
Processing mashr_Brain_Substantia_nigra.db
Processing mashr_Brain_Putamen_basal_ganglia.db
Processing mashr_Esophagus_Mucosa.db
Processing mashr_Adipose_Subcutaneous.db
Processing mashr_Artery_Aorta.db
Processing mashr_Whole_Blood.db
Processing mashr_Pituitary.db
Processing mashr_Brain_Nucleus_accumbens_basal_ganglia.db
Processing mashr_Brain_Caudate_basal_ganglia.db
Processing mashr_Brain_Cortex.db
Pro

In [21]:
all_gene_snps = pd.concat(all_variants_ids, ignore_index=True)

In [22]:
all_gene_snps.shape

(1132714, 3)

In [23]:
all_gene_snps.head()

Unnamed: 0,gene,varID,tissue
0,ENSG00000107331,chr9_137029407_T_G_b38,Brain_Cerebellar_Hemisphere
1,ENSG00000180549,chr9_137032610_A_G_b38,Brain_Cerebellar_Hemisphere
2,ENSG00000180549,chr9_137032730_G_A_b38,Brain_Cerebellar_Hemisphere
3,ENSG00000107281,chr9_137046201_C_A_b38,Brain_Cerebellar_Hemisphere
4,ENSG00000054179,chr9_137048360_C_A_b38,Brain_Cerebellar_Hemisphere


# Create variant block dataframe

## Preprocess SNPs data

In [24]:
variants_ld_block_df = all_gene_snps[["varID", "gene"]]

In [25]:
variants_ld_block_df.shape

(1132714, 2)

In [26]:
variants_info = variants_ld_block_df["varID"].str.split("_", expand=True)

In [27]:
variants_info.shape

(1132714, 5)

In [28]:
assert variants_ld_block_df.shape[0] == variants_info.shape[0]

In [29]:
variants_ld_block_df = variants_ld_block_df.join(variants_info)[["varID", 0, 1, 2, 3, "gene"]]

In [30]:
assert variants_ld_block_df.shape[0] == variants_info.shape[0]

In [31]:
variants_ld_block_df.head()

Unnamed: 0,varID,0,1,2,3,gene
0,chr9_137029407_T_G_b38,chr9,137029407,T,G,ENSG00000107331
1,chr9_137032610_A_G_b38,chr9,137032610,A,G,ENSG00000180549
2,chr9_137032730_G_A_b38,chr9,137032730,G,A,ENSG00000180549
3,chr9_137046201_C_A_b38,chr9,137046201,C,A,ENSG00000107281
4,chr9_137048360_C_A_b38,chr9,137048360,C,A,ENSG00000054179


In [32]:
variants_ld_block_df = variants_ld_block_df.rename(
    columns={
        0: "chr",
        1: "position",
        2: "ref_allele",
        3: "eff_allele",
    }
)

In [33]:
variants_ld_block_df["chr"] = variants_ld_block_df["chr"].apply(lambda x: int(x[3:]))

In [34]:
variants_ld_block_df["position"] = variants_ld_block_df["position"].astype(int)

In [35]:
variants_ld_block_df.shape

(1132714, 6)

In [36]:
variants_ld_block_df.head()

Unnamed: 0,varID,chr,position,ref_allele,eff_allele,gene
0,chr9_137029407_T_G_b38,9,137029407,T,G,ENSG00000107331
1,chr9_137032610_A_G_b38,9,137032610,A,G,ENSG00000180549
2,chr9_137032730_G_A_b38,9,137032730,G,A,ENSG00000180549
3,chr9_137046201_C_A_b38,9,137046201,C,A,ENSG00000107281
4,chr9_137048360_C_A_b38,9,137048360,C,A,ENSG00000054179


In [37]:
variants_ld_block_df.dtypes

varID         object
chr            int64
position       int64
ref_allele    object
eff_allele    object
gene          object
dtype: object

# Testing

In [38]:
_unique_chr_per_ld_block = variants_ld_block_df.groupby("gene").apply(
    lambda x: x["chr"].unique().shape[0]
)
display(_unique_chr_per_ld_block)

gene
ENSG00000000419    1
ENSG00000000457    1
ENSG00000000460    1
ENSG00000000938    1
ENSG00000000971    1
                  ..
ENSG00000284430    1
ENSG00000284452    1
ENSG00000284513    1
ENSG00000284526    1
ENSG00000284552    1
Length: 22535, dtype: int64

In [39]:
display(_unique_chr_per_ld_block.unique())
assert _unique_chr_per_ld_block.unique().shape[0] == 1
assert _unique_chr_per_ld_block.unique()[0] == 1

array([1])

# Save

In [40]:
variants_ld_block_df.head()

Unnamed: 0,varID,chr,position,ref_allele,eff_allele,gene
0,chr9_137029407_T_G_b38,9,137029407,T,G,ENSG00000107331
1,chr9_137032610_A_G_b38,9,137032610,A,G,ENSG00000180549
2,chr9_137032730_G_A_b38,9,137032730,G,A,ENSG00000180549
3,chr9_137046201_C_A_b38,9,137046201,C,A,ENSG00000107281
4,chr9_137048360_C_A_b38,9,137048360,C,A,ENSG00000054179


In [41]:
output_file = conf.PHENOMEXCAN["LD_BLOCKS"]["BASE_DIR"] / "mashr_snps_gene_blocks.pkl"
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/ld_blocks/mashr_snps_gene_blocks.pkl')

In [42]:
variants_ld_block_df.to_pickle(output_file)