LDpred-2 is one of the dedicated PRS programs which is an R
package that uses a Bayesian approach to polygenic risk scoring.
!!! note The script used here is based on LDpred 2 implemented under bigsnpr version 1.4.7
!!! note For more details, please refer to LDpred 2's homepage
You can install LDpred
and its dependencies in R
with the following command:
install.packages("remotes")
library(remotes)
remotes::install_github("https://github.com/privefl/bigsnpr.git")
!!! note For mac users, you might need to follow the guide here to be able to install LDpred2
We assume that you have the following files (or you can download it from here):
File Name | Description |
---|---|
Height.QC.gz | The post-QCed summary statistic |
EUR.QC.bed | The genotype file after performing some basic filtering |
EUR.QC.bim | This file contains the SNPs that passed the basic filtering |
EUR.QC.fam | This file contains the samples that passed the basic filtering |
EUR.height | This file contains the phenotype of the samples |
EUR.cov | This file contains the covariates of the samples |
EUR.eigenvec | This file contains the PCs of the samples |
!!! warning While we do provide a rough guide on how to perform LDpred on bed files separated into individual chromosomes, this script is untested and extra caution is required
On some server, you might need to first use the following code in order to run LDpred with multi-thread
=== "prepare workspace and load bigsnpr"
```R
library(bigsnpr)
options(bigstatsr.check.parallel.blas = FALSE)
options(default.nproc.blas = NULL)
```
=== "read in phenotype and covariates"
```R
library(data.table)
library(magrittr)
phenotype <- fread("EUR.height")
covariate <- fread("EUR.cov")
pcs <- fread("EUR.eigenvec")
# rename columns
colnames(pcs) <- c("FID","IID", paste0("PC",1:6))
# generate required table
pheno <- merge(phenotype, covariate) %>%
merge(., pcs)
```
LDpred2 authors recommend restricting the analysis to only the HapMap3 SNPs
=== "load HapMap3 SNPs"
```R
info <- readRDS(runonce::download_file(
"https://ndownloader.figshare.com/files/25503788",
fname = "map_hm3_ldpred2.rds"))
```
=== "Load summary statistic file"
``` R
# Read in the summary statistic file
sumstats <- bigreadr::fread2("Height.QC.gz")
# LDpred 2 require the header to follow the exact naming
names(sumstats) <-
c("chr",
"pos",
"rsid",
"a1",
"a0",
"n_eff",
"beta_se",
"p",
"OR",
"INFO",
"MAF")
# Transform the OR into log(OR)
sumstats$beta <- log(sumstats$OR)
# Filter out hapmap SNPs
sumstats <- sumstats[sumstats$rsid%in% info$rsid,]
```
!!! Warning
Here, we know the exact ordering of the summary statistics file.
However, in many cases, the ordering of the summary statistics differ,
thus one must rename the columns according to their actual ordering
=== "Genome Wide bed file"
```R
# Get maximum amount of cores
NCORES <- nb_cores()
# Open a temporary file
tmp <- tempfile(tmpdir = "tmp-data")
on.exit(file.remove(paste0(tmp, ".sbk")), add = TRUE)
# Initialize variables for storing the LD score and LD matrix
corr <- NULL
ld <- NULL
# We want to know the ordering of samples in the bed file
fam.order <- NULL
# preprocess the bed file (only need to do once for each data set)
snp_readBed("EUR.QC.bed")
# now attach the genotype object
obj.bigSNP <- snp_attach("EUR.QC.rds")
# extract the SNP information from the genotype
map <- obj.bigSNP$map[-3]
names(map) <- c("chr", "rsid", "pos", "a1", "a0")
# perform SNP matching
info_snp <- snp_match(sumstats, map)
# Assign the genotype to a variable for easier downstream analysis
genotype <- obj.bigSNP$genotypes
# Rename the data structures
CHR <- map$chr
POS <- map$pos
# get the CM information from 1000 Genome
# will download the 1000G file to the current directory (".")
POS2 <- snp_asGeneticPos(CHR, POS, dir = ".")
# calculate LD
for (chr in 1:22) {
# Extract SNPs that are included in the chromosome
ind.chr <- which(info_snp$chr == chr)
ind.chr2 <- info_snp$`_NUM_ID_`[ind.chr]
# Calculate the LD
corr0 <- snp_cor(
genotype,
ind.col = ind.chr2,
ncores = NCORES,
infos.pos = POS2[ind.chr2],
size = 3 / 1000
)
if (chr == 1) {
ld <- Matrix::colSums(corr0^2)
corr <- as_SFBM(corr0, tmp)
} else {
ld <- c(ld, Matrix::colSums(corr0^2))
corr$add_columns(corr0, nrow(corr))
}
}
# We assume the fam order is the same across different chromosomes
fam.order <- as.data.table(obj.bigSNP$fam)
# Rename fam order
setnames(fam.order,
c("family.ID", "sample.ID"),
c("FID", "IID"))
```
=== "Chromosome separated bed files"
```R
# Get maximum amount of cores
NCORES <- nb_cores()
# Open a temporary file
tmp <- tempfile(tmpdir = "tmp-data")
on.exit(file.remove(paste0(tmp, ".sbk")), add = TRUE)
# Initialize variables for storing the LD score and LD matrix
corr <- NULL
ld <- NULL
# We want to know the ordering of samples in the bed file
info_snp <- NULL
fam.order <- NULL
for (chr in 1:22) {
# preprocess the bed file (only need to do once for each data set)
# Assuming the file naming is EUR_chr#.bed
snp_readBed(paste0("EUR_chr",chr,".bed"))
# now attach the genotype object
obj.bigSNP <- snp_attach(paste0("EUR_chr",chr,".rds"))
# extract the SNP information from the genotype
map <- obj.bigSNP$map[-3]
names(map) <- c("chr", "rsid", "pos", "a1", "a0")
# perform SNP matching
tmp_snp <- snp_match(sumstats[sumstats$chr==chr,], map)
info_snp <- rbind(info_snp, tmp_snp)
# Assign the genotype to a variable for easier downstream analysis
genotype <- obj.bigSNP$genotypes
# Rename the data structures
CHR <- map$chr
POS <- map$pos
# get the CM information from 1000 Genome
# will download the 1000G file to the current directory (".")
POS2 <- snp_asGeneticPos(CHR, POS, dir = ".")
# calculate LD
# Extract SNPs that are included in the chromosome
ind.chr <- which(tmp_snp$chr == chr)
ind.chr2 <- tmp_snp$`_NUM_ID_`[ind.chr]
# Calculate the LD
corr0 <- snp_cor(
genotype,
ind.col = ind.chr2,
ncores = NCORES,
infos.pos = POS2[ind.chr2],
size = 3 / 1000
)
if (chr == 1) {
ld <- Matrix::colSums(corr0^2)
corr <- as_SFBM(corr0, tmp)
} else {
ld <- c(ld, Matrix::colSums(corr0^2))
corr$add_columns(corr0, nrow(corr))
}
# We assume the fam order is the same across different chromosomes
if(is.null(fam.order)){
fam.order <- as.data.table(obj.bigSNP$fam)
}
}
# Rename fam order
setnames(fam.order,
c("family.ID", "sample.ID"),
c("FID", "IID"))
```
=== "Perform LD score regression"
```R
df_beta <- info_snp[,c("beta", "beta_se", "n_eff", "_NUM_ID_")]
ldsc <- snp_ldsc( ld,
length(ld),
chi2 = (df_beta$beta / df_beta$beta_se)^2,
sample_size = df_beta$n_eff,
blocks = NULL)
h2_est <- ldsc[["h2"]]
```
=== "Calculate the null R2 (quantitative trait)"
```R
# Reformat the phenotype file such that y is of the same order as the
# sample ordering in the genotype file
y <- pheno[fam.order, on = c("FID", "IID")]
# Calculate the null R2
# use glm for binary trait
# (will also need the fmsb package to calculate the pseudo R2)
null.model <- paste("PC", 1:6, sep = "", collapse = "+") %>%
paste0("Height~Sex+", .) %>%
as.formula %>%
lm(., data = y) %>%
summary
null.r2 <- null.model$r.squared
```
=== "Calculate the null R2 (binary trait)"
```R
library(fmsb)
# Reformat the phenotype file such that y is of the same order as the
# sample ordering in the genotype file
y <- pheno[fam.order, on = c("FID", "IID")]
# Calculate the null R2
# use glm for binary trait
# (will also need the fmsb package to calculate the pseudo R2)
null.model <- paste("PC", 1:6, sep = "", collapse = "+") %>%
paste0("Height~Sex+", .) %>%
as.formula %>%
glm(., data = y, family=binomial) %>%
summary
null.r2 <- fmsb::NagelkerkeR2(null.model)
```
!!! important
Scripts for binary trait analysis only serve as a reference as we have not simulate any binary traits.
In addition, Nagelkerke
=== "infinitesimal model"
```R
beta_inf <- snp_ldpred2_inf(corr, df_beta, h2 = h2_est)
```
=== "grid model"
```R
# Prepare data for grid model
p_seq <- signif(seq_log(1e-4, 1, length.out = 17), 2)
h2_seq <- round(h2_est * c(0.7, 1, 1.4), 4)
grid.param <-
expand.grid(p = p_seq,
h2 = h2_seq,
sparse = c(FALSE, TRUE))
# Get adjusted beta from grid model
beta_grid <-
snp_ldpred2_grid(corr, df_beta, grid.param, ncores = NCORES)
```
=== "auto model"
```R
# Get adjusted beta from the auto model
multi_auto <- snp_ldpred2_auto(
corr,
df_beta,
h2_init = h2_est,
vec_p_init = seq_log(1e-4, 0.9, length.out = NCORES),
ncores = NCORES
)
beta_auto <- sapply(multi_auto, function(auto)
auto$beta_est)
```
=== "infinitesimal model"
```R
if(is.null(obj.bigSNP)){
obj.bigSNP <- snp_attach("EUR.QC.rds")
}
genotype <- obj.bigSNP$genotypes
# calculate PRS for all samples
ind.test <- 1:nrow(genotype)
pred_inf <- big_prodVec( genotype,
beta_inf,
ind.row = ind.test,
ind.col = info_snp$`_NUM_ID_`)
```
=== "grid model"
```R
if(is.null(obj.bigSNP)){
obj.bigSNP <- snp_attach("EUR.QC.rds")
}
genotype <- obj.bigSNP$genotypes
# calculate PRS for all samples
ind.test <- 1:nrow(genotype)
pred_grid <- big_prodMat( genotype,
beta_grid,
ind.col = info_snp$`_NUM_ID_`)
```
=== "auto model"
```R
if(is.null(obj.bigSNP)){
obj.bigSNP <- snp_attach("EUR.QC.rds")
}
genotype <- obj.bigSNP$genotypes
# calculate PRS for all samples
ind.test <- 1:nrow(genotype)
pred_auto <-
big_prodMat(genotype,
beta_auto,
ind.row = ind.test,
ind.col = info_snp$`_NUM_ID_`)
# scale the PRS generated from AUTO
pred_scaled <- apply(pred_auto, 2, sd)
final_beta_auto <-
rowMeans(beta_auto[,
abs(pred_scaled -
median(pred_scaled)) <
3 * mad(pred_scaled)])
pred_auto <-
big_prodVec(genotype,
final_beta_auto,
ind.row = ind.test,
ind.col = info_snp$`_NUM_ID_`)
```
=== "infinitesimal model"
```R
pred_inf <- NULL
for(chr in 1:22){
obj.bigSNP <- snp_attach(paste0("EUR_chr",chr,".rds"))
genotype <- obj.bigSNP$genotypes
# calculate PRS for all samples
ind.test <- 1:nrow(genotype)
# Extract SNPs in this chromosome
chr.idx <- which(info_snp$chr == chr)
ind.chr <- info_snp$`_NUM_ID_`[chr.idx]
tmp <- big_prodVec(genotype,
beta_inf,
ind.row = ind.test,
ind.col = ind.chr)
if(is.null(pred_inf)){
pred_inf <- tmp
}else{
pred_inf <- pred_inf + tmp
}
}
```
=== "grid model"
```R
pred_grid <- NULL
for(chr in 1:22){
obj.bigSNP <- snp_attach(paste0("EUR_chr",chr,"_.rds"))
genotype <- obj.bigSNP$genotypes
# calculate PRS for all samples
ind.test <- 1:nrow(genotype)
# Extract SNPs in this chromosome
chr.idx <- which(info_snp$chr == chr)
ind.chr <- info_snp$`_NUM_ID_`[chr.idx]
tmp <- big_prodMat( genotype,
beta_grid,
ind.col = ind.chr)
if(is.null(pred_grid)){
pred_grid <- tmp
}else{
pred_grid <- pred_grid + tmp
}
}
```
=== "auto model"
```R
pred_auto <- NULL
for(chr in 1:22){
obj.bigSNP <- snp_attach(paste0("EUR_chr",chr,"_.rds"))
genotype <- obj.bigSNP$genotypes
# calculate PRS for all samples
ind.test <- 1:nrow(genotype)
# Extract SNPs in this chromosome
chr.idx <- which(info_snp$chr == chr)
ind.chr <- info_snp$`_NUM_ID_`[chr.idx]
tmp <-
big_prodMat(genotype,
beta_auto,
ind.row = ind.test,
ind.col = ind.chr)
# scale the PRS generated from AUTO
pred_scaled <- apply(tmp, 2, sd)
final_beta_auto <-
rowMeans(tmp[chr.idx,
abs(pred_scaled -
median(pred_scaled)) <
3 * mad(pred_scaled)])
tmp <-
big_prodVec(genotype,
final_beta_auto,
ind.row = ind.test,
ind.col = ind.chr)
if(is.null(pred_auto)){
pred_auto <- tmp
}else{
pred_auto <- pred_auto + tmp
}
}
```
=== "infinitesimal model"
```R
reg.formula <- paste("PC", 1:6, sep = "", collapse = "+") %>%
paste0("Height~PRS+Sex+", .) %>%
as.formula
reg.dat <- y
reg.dat$PRS <- pred_inf
inf.model <- lm(reg.formula, dat=reg.dat) %>%
summary
(result <- data.table(
infinitesimal = inf.model$r.squared - null.r2,
null = null.r2
))
```
=== "grid model"
```R
reg.formula <- paste("PC", 1:6, sep = "", collapse = "+") %>%
paste0("Height~PRS+Sex+", .) %>%
as.formula
reg.dat <- y
max.r2 <- 0
for(i in 1:ncol(pred_grid)){
reg.dat$PRS <- pred_grid[,i]
grid.model <- lm(reg.formula, dat=reg.dat) %>%
summary
if(max.r2 < grid.model$r.squared){
max.r2 <- grid.model$r.squared
}
}
(result <- data.table(
grid = max.r2 - null.r2,
null = null.r2
))
```
=== "auto model"
```R
reg.formula <- paste("PC", 1:6, sep = "", collapse = "+") %>%
paste0("Height~PRS+Sex+", .) %>%
as.formula
reg.dat <- y
reg.dat$PRS <- pred_auto
auto.model <- lm(reg.formula, dat=reg.dat) %>%
summary
(result <- data.table(
auto = auto.model$r.squared - null.r2,
null = null.r2
))
```
??? note "How much phenotypic variation does the PRS from each model explain?" Infinitesimal = 0.0248696
Grid Model = 0.001746926
Auto Model = 0.1751478