# Description

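This notebook assigns each SNP used in the MASHR prediction models to an LD block. The LD block annotation (European population, hg38) was obtained from Berisa et al. (https://academic.oup.com/bioinformatics/article/32/2/283/1743626).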

# Modules

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import sqlite3

import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

import conf

In [3]:
# Fetch R's `load` function through rpy2; it is used below to read the .rda file.
load_rda = ro.r["load"]

# Load data

## LD blocks

In [4]:
# Path of the .rda file with the LD block annotation, taken from the project
# configuration and converted to a plain string before it is handed to R.
input_file = str(conf.PHENOMEXCAN["LD_BLOCKS"]["LD_BLOCKS_FILE"])
display(input_file)

'/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/ld_blocks/ld_block_pickrell_eur_b38.rda'

In [5]:
# Load the file into the embedded R session; the displayed return value is the
# name of the R object that was created.
load_rda(input_file)

0
'ld_block_pickrell_eur_b38'


In [6]:
# The loaded R object has two elements: `ld_block` and `meta_information`.
ro.r["ld_block_pickrell_eur_b38"]

0,1
ld_block,[RTYPES.VECSXP]
meta_information,[RTYPES.VECSXP]


### Show metadata

In [7]:
# Second element (index 1) of the loaded object: `meta_information`.
ro.r["ld_block_pickrell_eur_b38"][1]

0,1
genome_assembly_version,[RTYPES.STRSXP]
notes,[RTYPES.STRSXP]


In [8]:
# First metadata field (`genome_assembly_version`), converted to a NumPy array for display.
np.array(ro.r["ld_block_pickrell_eur_b38"][1][0])

array(['hg38'], dtype='<U4')

In [9]:
# Second metadata field (`notes`), converted to a NumPy array for display.
np.array(ro.r["ld_block_pickrell_eur_b38"][1][1])

array(['LD block annotation (for European population) obtained from Berisa et al (https://academic.oup.com/bioinformatics/article/32/2/283/1743626). Formatted by Alvaro Barbeira'],
      dtype='<U169')

### Load LD blocks

In [10]:
# First element (index 0) of the loaded object: the `ld_block` table, still as an R object.
ld_block_r = ro.r["ld_block_pickrell_eur_b38"][0]

In [11]:
# Row labels of the R table.
ld_block_r.rownames

0,1,2,3,4,5,6
'1','2','3',...,'1700','1701','1702'


In [12]:
# Column names of the R table.
ld_block_r.colnames

0,1,2,3
'region_na...,'chromosome','start','end'


In [13]:
# Convert the R table into a pandas DataFrame; the pandas converter is only
# active inside this `with` block.
with localconverter(ro.default_converter + pandas2ri.converter):
    ld_block_df = ro.conversion.rpy2py(ld_block_r)

In [14]:
# Inspect the converted table: one row per LD block (region name, chromosome, start, end).
ld_block_df

Unnamed: 0,region_name,chromosome,start,end
1,chr1_1,chr1,10583,1961168
2,chr1_2,chr1,1961168,3666172
3,chr1_3,chr1,3666172,4320751
4,chr1_4,chr1,4320751,5853833
5,chr1_5,chr1,5853833,7187275
...,...,...,...,...
1698,chr22_20,chr22,44599428,46074615
1699,chr22_21,chr22,46074615,47200568
1700,chr22_22,chr22,47200568,48507891
1701,chr22_23,chr22,48507891,49430885


In [15]:
# Column dtypes after the conversion from R.
ld_block_df.dtypes

region_name    object
chromosome     object
start           int32
end             int32
dtype: object

### Save in tsv

In [16]:
# The TSV is written next to the source .rda file, keeping its base name and
# only swapping the extension.
_ld_blocks_rda_path = conf.PHENOMEXCAN["LD_BLOCKS"]["LD_BLOCKS_FILE"]
output_file = _ld_blocks_rda_path.with_suffix(".tsv")
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/ld_blocks/ld_block_pickrell_eur_b38.tsv')

In [17]:
# Tab-separated output; the DataFrame index (the R row labels) is not written.
ld_block_df.to_csv(output_file, sep="\t", index=False)

## SNPs in prediction models

In [18]:
# All model databases (*.db) in the MASHR prediction models folder.
# `glob` yields entries in arbitrary, filesystem-dependent order, so the list is
# sorted to make the processing order (and the log printed below) reproducible.
mashr_models_db_files = sorted(
    conf.PHENOMEXCAN["PREDICTION_MODELS"]["MASHR"].glob("*.db")
)

In [19]:
# Sanity check: exactly 49 model databases are expected in the folder.
assert len(mashr_models_db_files) == 49

In [20]:
# Accumulates the distinct variant IDs found across all model databases.
all_variants_ids = set()

In [21]:
# Read the `varID` column of the `weights` table from every model database and
# add the IDs to the global set.
for m in mashr_models_db_files:
    print(f"Processing {m.name}")

    # Using a sqlite3 connection as a context manager only commits or rolls back
    # the transaction; it does NOT close the connection. Close it explicitly so
    # no database handle is left open after each file has been read.
    conn = sqlite3.connect(m)
    try:
        df = pd.read_sql("select varID from weights", conn)["varID"]
        all_variants_ids.update(set(df.values))
    finally:
        conn.close()

Processing mashr_Skin_Not_Sun_Exposed_Suprapubic.db
Processing mashr_Cells_EBV-transformed_lymphocytes.db
Processing mashr_Brain_Frontal_Cortex_BA9.db
Processing mashr_Kidney_Cortex.db
Processing mashr_Brain_Substantia_nigra.db
Processing mashr_Spleen.db
Processing mashr_Colon_Transverse.db
Processing mashr_Heart_Left_Ventricle.db
Processing mashr_Lung.db
Processing mashr_Muscle_Skeletal.db


Processing mashr_Brain_Hypothalamus.db
Processing mashr_Brain_Cortex.db
Processing mashr_Brain_Amygdala.db
Processing mashr_Esophagus_Mucosa.db
Processing mashr_Adrenal_Gland.db
Processing mashr_Uterus.db
Processing mashr_Prostate.db
Processing mashr_Whole_Blood.db
Processing mashr_Pituitary.db
Processing mashr_Esophagus_Gastroesophageal_Junction.db


Processing mashr_Stomach.db
Processing mashr_Heart_Atrial_Appendage.db
Processing mashr_Brain_Cerebellum.db
Processing mashr_Breast_Mammary_Tissue.db
Processing mashr_Artery_Tibial.db
Processing mashr_Artery_Aorta.db
Processing mashr_Small_Intestine_Terminal_Ileum.db
Processing mashr_Brain_Hippocampus.db
Processing mashr_Testis.db
Processing mashr_Brain_Putamen_basal_ganglia.db
Processing mashr_Pancreas.db


Processing mashr_Adipose_Subcutaneous.db
Processing mashr_Brain_Cerebellar_Hemisphere.db
Processing mashr_Colon_Sigmoid.db
Processing mashr_Minor_Salivary_Gland.db
Processing mashr_Cells_Cultured_fibroblasts.db
Processing mashr_Brain_Nucleus_accumbens_basal_ganglia.db
Processing mashr_Brain_Anterior_cingulate_cortex_BA24.db
Processing mashr_Vagina.db
Processing mashr_Ovary.db
Processing mashr_Skin_Sun_Exposed_Lower_leg.db


Processing mashr_Esophagus_Muscularis.db
Processing mashr_Brain_Spinal_cord_cervical_c-1.db
Processing mashr_Artery_Coronary.db
Processing mashr_Thyroid.db
Processing mashr_Brain_Caudate_basal_ganglia.db
Processing mashr_Adipose_Visceral_Omentum.db
Processing mashr_Nerve_Tibial.db
Processing mashr_Liver.db


In [22]:
# Number of distinct variants across all models.
len(all_variants_ids)

237405

In [23]:
# Peek at a few IDs (a set has no defined order, so which ones are shown may vary).
# IDs are underscore-separated: chromosome, position, two alleles and a build tag.
list(all_variants_ids)[:10]

['chr4_152680407_C_A_b38',
 'chr3_39281932_C_G_b38',
 'chr17_3661000_T_C_b38',
 'chr12_120906687_G_T_b38',
 'chr10_43575139_A_G_b38',
 'chr12_230789_A_G_b38',
 'chr14_23960726_G_A_b38',
 'chr13_48658462_T_G_b38',
 'chr12_9990298_G_A_b38',
 'chr20_59958704_G_T_b38']

# Assign each variant to an LD block

## Preprocess SNPs data

In [24]:
# One row per distinct variant ID.
variants_ld_block_df = pd.DataFrame({"varID": list(all_variants_ids)})

In [25]:
# Expect as many rows as distinct variant IDs and a single column.
variants_ld_block_df.shape

(237405, 1)

In [26]:
# Split every ID on "_" into its components, one integer-labelled column per component.
variants_info = variants_ld_block_df["varID"].str.split("_", expand=True)

In [27]:
# Every ID should yield the same number of components (five columns here).
variants_info.shape

(237405, 5)

In [28]:
# The split must not add or drop rows.
assert variants_ld_block_df.shape[0] == variants_info.shape[0]

In [29]:
# Attach the parsed components (aligned on the row index) and keep only the
# first four of them (columns 0-3); the fifth component (column 4) is not kept.
variants_ld_block_df = variants_ld_block_df.join(variants_info).loc[
    :, ["varID", 0, 1, 2, 3]
]

In [30]:
# The join must not add or drop rows either.
assert variants_ld_block_df.shape[0] == variants_info.shape[0]

In [31]:
# Components are still in integer-labelled columns at this point.
variants_ld_block_df.head()

Unnamed: 0,varID,0,1,2,3
0,chr4_152680407_C_A_b38,chr4,152680407,C,A
1,chr3_39281932_C_G_b38,chr3,39281932,C,G
2,chr17_3661000_T_C_b38,chr17,3661000,T,C
3,chr12_120906687_G_T_b38,chr12,120906687,G,T
4,chr10_43575139_A_G_b38,chr10,43575139,A,G


In [32]:
# Give the positional columns produced by the split meaningful names
# (column 0 -> "chr", 1 -> "position", 2 -> "ref_allele", 3 -> "eff_allele").
_component_names = ["chr", "position", "ref_allele", "eff_allele"]
variants_ld_block_df = variants_ld_block_df.rename(
    columns=dict(enumerate(_component_names))
)

In [33]:
# Drop the first three characters (the "chr" prefix) and store the chromosome
# as an integer.
variants_ld_block_df["chr"] = variants_ld_block_df["chr"].str[3:].astype(int)

In [34]:
# Positions were parsed as strings; convert them to integers so they can be
# compared against the LD block boundaries.
variants_ld_block_df["position"] = variants_ld_block_df["position"].astype(int)

In [35]:
# Row count must be unchanged by the preprocessing above.
variants_ld_block_df.shape

(237405, 5)

In [36]:
# Preprocessed table: named columns, numeric chromosome and position.
variants_ld_block_df.head()

Unnamed: 0,varID,chr,position,ref_allele,eff_allele
0,chr4_152680407_C_A_b38,4,152680407,C,A
1,chr3_39281932_C_G_b38,3,39281932,C,G
2,chr17_3661000_T_C_b38,17,3661000,T,C
3,chr12_120906687_G_T_b38,12,120906687,G,T
4,chr10_43575139_A_G_b38,10,43575139,A,G


In [37]:
# `chr` and `position` should now be integer columns.
variants_ld_block_df.dtypes

varID         object
chr            int64
position       int64
ref_allele    object
eff_allele    object
dtype: object

## Assign LD blocks

In [38]:
# Number of LD blocks (rows) to assign variants to.
ld_block_df.shape

(1702, 4)

In [39]:
# Reminder of the LD block table layout; `chromosome` still carries the "chr" prefix.
ld_block_df.head()

Unnamed: 0,region_name,chromosome,start,end
1,chr1_1,chr1,10583,1961168
2,chr1_2,chr1,1961168,3666172
3,chr1_3,chr1,3666172,4320751
4,chr1_4,chr1,4320751,5853833
5,chr1_5,chr1,5853833,7187275


In [40]:
# For every LD block, collect the variants located inside it. One frame
# (varID, ld_block) is appended per block, even when it is empty.
snps_ld_blocks = []

for block in ld_block_df.itertuples(index=False):
    # Block chromosomes look like "chr1"; variants store the chromosome as an integer.
    block_chr = int(block.chromosome[3:])

    # A variant belongs to the block when it is on the same chromosome and its
    # position lies in the half-open interval [start, end).
    in_block = (
        (variants_ld_block_df["chr"] == block_chr)
        & (variants_ld_block_df["position"] >= block.start)
        & (variants_ld_block_df["position"] < block.end)
    )

    snps_ld_blocks.append(
        variants_ld_block_df.loc[in_block, ["varID"]].assign(
            ld_block=block.region_name
        )
    )

In [41]:
# Exactly one frame must have been collected per LD block.
_n_collected = len(snps_ld_blocks)
display(_n_collected)
assert _n_collected == len(ld_block_df)

1702

In [42]:
# Stack the per-block frames into a single (varID, ld_block) table with a fresh index.
all_snps_ld_blocks = pd.concat(snps_ld_blocks, ignore_index=True)

In [43]:
# Number of variants that were assigned to an LD block; variants outside
# every block do not appear here.
all_snps_ld_blocks.shape

(237364, 2)

In [44]:
# Rows are ordered by LD block, in the order the blocks were processed.
all_snps_ld_blocks.head()

Unnamed: 0,varID,ld_block
0,chr1_975014_C_T_b38,chr1_1
1,chr1_1673818_A_C_b38,chr1_1
2,chr1_1070689_G_A_b38,chr1_1
3,chr1_966426_G_C_b38,chr1_1
4,chr1_975029_T_C_b38,chr1_1


In [45]:
# Total number of variants, for comparison with the number of assigned ones.
variants_ld_block_df.shape

(237405, 5)

In [46]:
# Index both tables by variant ID. Each ID must occur only once in either
# table; for the second one this means no variant was assigned to more than
# one LD block.
_tmp0 = variants_ld_block_df.set_index("varID")
assert _tmp0.index.is_unique

_tmp1 = all_snps_ld_blocks.set_index("varID")
assert _tmp1.index.is_unique

# Inner join on the index: variants without an LD block assignment are dropped.
_tmp_df = _tmp0.merge(_tmp1, how="inner", left_index=True, right_index=True)

In [47]:
# Every variant that got an LD block assignment must be present after the merge.
display(_tmp_df.shape)
assert len(_tmp_df) == len(all_snps_ld_blocks)

(237364, 5)

In [48]:
# Merged table: indexed by varID, with the variant info plus its `ld_block`.
_tmp_df.head()

Unnamed: 0_level_0,chr,position,ref_allele,eff_allele,ld_block
varID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
chr4_152680407_C_A_b38,4,152680407,C,A,chr4_98
chr3_39281932_C_G_b38,3,39281932,C,G,chr3_28
chr17_3661000_T_C_b38,17,3661000,T,C,chr17_3
chr12_120906687_G_T_b38,12,120906687,G,T,chr12_74
chr10_43575139_A_G_b38,10,43575139,A,G,chr10_30


In [49]:
# From here on `variants_ld_block_df` is the merged table: it is indexed by
# varID, has the extra `ld_block` column and no longer contains the variants
# that could not be assigned to any LD block.
variants_ld_block_df = _tmp_df

# Testing

In [50]:
# Number of distinct chromosomes among the variants of each LD block.
# Selecting the column and using the built-in `nunique` avoids calling a Python
# lambda on every group frame (which also triggers a deprecation warning about
# grouping columns in recent pandas versions).
_unique_chr_per_ld_block = variants_ld_block_df.groupby("ld_block")["chr"].nunique()
display(_unique_chr_per_ld_block)

ld_block
chr10_1     1
chr10_10    1
chr10_11    1
chr10_12    1
chr10_13    1
           ..
chr9_72     1
chr9_73     1
chr9_74     1
chr9_8      1
chr9_9      1
Length: 1668, dtype: int64

In [51]:
# Every LD block must hold variants from exactly one chromosome, i.e. the only
# distinct per-block count is 1.
_distinct_counts = _unique_chr_per_ld_block.unique()
display(_distinct_counts)
assert _distinct_counts.tolist() == [1]

array([1])

# Save

In [52]:
# Final look at the table that is about to be saved.
variants_ld_block_df.head()

Unnamed: 0_level_0,chr,position,ref_allele,eff_allele,ld_block
varID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
chr4_152680407_C_A_b38,4,152680407,C,A,chr4_98
chr3_39281932_C_G_b38,3,39281932,C,G,chr3_28
chr17_3661000_T_C_b38,17,3661000,T,C,chr17_3
chr12_120906687_G_T_b38,12,120906687,G,T,chr12_74
chr10_43575139_A_G_b38,10,43575139,A,G,chr10_30


In [53]:
# Destination of the variant -> LD block table, inside the LD blocks base
# directory from the project configuration. Note that this reuses (overwrites)
# the `output_file` name used earlier for the TSV path.
output_file = conf.PHENOMEXCAN["LD_BLOCKS"]["BASE_DIR"] / "mashr_snps_ld_blocks.pkl"
display(output_file)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/base/data/phenomexcan/ld_blocks/mashr_snps_ld_blocks.pkl')

In [54]:
# Save the table (indexed by varID) as a pickle file.
variants_ld_block_df.to_pickle(output_file)