 # LDpred-2
 
 Author: Shing Wan Choi

## Required data and files

[data link](https://drive.google.com/file/d/1x_G0Gxk9jFMY-PMqwtg6-vdEyUPp5p5u/view)

\begin{array}{c|c}
\hline \text { File Name } & \text { Description } \\
\hline \text { Height.QC.gz } & \text { The post-QCed summary statistic } \\
\hline \text { EUR.QC.bed } & \text { The genotype file after performing some basic filtering } \\
\hline \text { EUR.QC.bim } & \text { This file contains the SNPs that passed the basic filtering } \\
\hline \text { EUR.QC.fam } & \text { This file contains the samples that passed the basic filtering } \\
\hline \text { EUR.height } & \text { This file contains the phenotype of the samples } \\
\hline \text { EUR.cov } & \text { This file contains the covariates of the samples } \\
\hline \text { EUR.eigenvec } & \text { This file contains the PCs of the samples } \\
\hline
\end{array}

While we do provide a rough guide on how to **perform LDpred on bed files separated into individual chromosomes**, this script is untested and extra caution is required

## Prepare workspace

In [1]:
library(bigsnpr)
# options(bigstatsr.check.parallel.blas = FALSE)
# options(default.nproc.blas = NULL)

Loading required package: bigstatsr



In [11]:
library(data.table)
library(magrittr)
phenotype <- fread("post-qc/EUR.height")
covariate <- fread("post-qc/EUR.cov")
pcs <- fread("post-qc/EUR.eigenvec") # principle components
# rename columns
colnames(pcs) <- c("FID","IID", paste0("PC",1:6))
# generate required table
pheno <- merge(phenotype, covariate) %>%
    merge(., pcs)

In [None]:
head(pheno)

## obtain HapMap3 SNPs

load HapMap3 SNPs

In [12]:
info <- readRDS(url("https://github.com/privefl/bigsnpr/raw/master/data-raw/hm3_variants.rds"))

In [None]:
head(info,10)

## Load and transform the summary statistic file

one must rename the columns according to their actual ordering

In [13]:
# Read in the summary statistic file
sumstats <- bigreadr::fread2("post-qc/Height.QC.gz") 
# LDpred 2 require the header to follow the exact naming
names(sumstats) <-
    c("chr",
    "pos",
    "rsid",
    "a1",
    "a0",
    "n_eff",
    "beta_se",
    "p",
    "OR",
    "INFO",
    "MAF")
# Transform the OR into log(OR)
sumstats$beta <- log(sumstats$OR)


In [14]:
head(sumstats)
nrow(sumstats)
# Filter out hapmap SNPs
sumstats <- sumstats[sumstats$rsid %in% info$rsid,]
head(sumstats)
nrow(sumstats)
write.csv(sumstats, file = "sumstats.csv")



Unnamed: 0_level_0,chr,pos,rsid,a1,a0,n_eff,beta_se,p,OR,INFO,MAF,beta
Unnamed: 0_level_1,<int>,<int>,<chr>,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,1,756604,rs3131962,A,G,388028,0.00301666,0.483171,0.9978869,0.8905579,0.3693896,-0.00211532
2,1,768448,rs12562034,A,G,388028,0.00329472,0.834808,1.0006873,0.8958935,0.3368458,0.00068708
3,1,779322,rs4040617,G,A,388028,0.00303344,0.42897,0.9976036,0.8975083,0.377368,-0.00239932
4,1,801536,rs79373928,G,T,388028,0.00841324,0.808999,1.0020357,0.9089629,0.4832122,0.00203363
5,1,808631,rs11240779,G,A,388028,0.00242821,0.590265,1.0013083,0.8932125,0.4504096,0.00130747
6,1,809876,rs57181708,G,A,388028,0.00336785,0.71475,1.0012317,0.9235576,0.4997439,0.0012309


Unnamed: 0_level_0,chr,pos,rsid,a1,a0,n_eff,beta_se,p,OR,INFO,MAF,beta
Unnamed: 0_level_1,<int>,<int>,<chr>,<chr>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2,1,768448,rs12562034,A,G,388028,0.00329472,0.834808,1.0006873,0.8958935,0.3368458,0.00068708
3,1,779322,rs4040617,G,A,388028,0.00303344,0.42897,0.9976036,0.8975083,0.377368,-0.00239932
8,1,838555,rs4970383,A,C,388028,0.00235773,0.150993,0.9966199,0.9077165,0.327164,-0.00338578
18,1,873558,rs1110052,G,T,388028,0.00225578,0.000285674,0.9918494,0.9004758,0.3680888,-0.00818396
19,1,880238,rs3748592,A,G,388028,0.0045031,0.488653,1.0031231,0.8728364,0.4104163,0.00311819
20,1,880390,rs3748593,A,C,388028,0.00635197,0.272881,0.9930596,0.8992809,0.4930243,-0.00696463


##  Calculate the LD matrix

Genome Wide bed file

In [15]:
# Get maximum amount of cores
NCORES <- nb_cores()
# Open a temporary file
tmp <- tempfile(tmpdir = "tmp-dataset")
on.exit(file.remove(paste0(tmp, ".sbk")), add = TRUE)
# Initialize variables for storing the LD score and LD matrix
corr <- NULL
ld <- NULL
# We want to know the ordering of samples in the bed file 
fam.order <- NULL
# preprocess the bed file (only need to do once for each data set)
#snp_readBed("post-qc/EUR.QC.bed")

# now attach the genotype object
obj.bigSNP <- snp_attach("post-qc/EUR.QC.rds")

# extract the SNP information from the genotype
map <- obj.bigSNP$map[-3]
names(map) <- c("chr", "rsid", "pos", "a1", "a0")
# perform SNP matching
info_snp <- snp_match(sumstats, map)
# Assign the genotype to a variable for easier downstream analysis
genotype <- obj.bigSNP$genotypes
# Rename the data structures
CHR <- map$chr
POS <- map$pos
# get the CM information from 1000 Genome
# will download the 1000G file to the current directory (".")
POS2 <- snp_asGeneticPos(CHR, POS, dir = ".")
# calculate LD
for (chr in 1:22) {
    # Extract SNPs that are included in the chromosome
    ind.chr <- which(info_snp$chr == chr)
    ind.chr2 <- info_snp$`_NUM_ID_`[ind.chr]
    # Calculate the LD
    corr0 <- snp_cor(
            genotype,
            ind.col = ind.chr2,
            ncores = NCORES,
            infos.pos = POS2[ind.chr2],
            size = 3 / 1000
        )
    if (chr == 1) {
        ld <- Matrix::colSums(corr0^2)
        corr <- as_SFBM(corr0, tmp)
    } else {
        ld <- c(ld, Matrix::colSums(corr0^2))
        corr$add_columns(corr0, nrow(corr))
    }
}
# We assume the fam order is the same across different chromosomes
fam.order <- as.data.table(obj.bigSNP$fam)
# Rename fam order
setnames(fam.order,
        c("family.ID", "sample.ID"),
        c("FID", "IID"))


"cannot remove file 'tmp-dataset/file77a64f00af92.sbk', reason 'No such file or directory'"
136,004 variants to be matched.

0 ambiguous SNPs have been removed.

134,522 variants have been matched; 0 were flipped and 91 were reversed.

Creating directory "tmp-dataset" which didn't exist..



# Perform LD score regression

In [None]:
df_beta <- info_snp[,c("beta", "beta_se", "n_eff", "_NUM_ID_")]
ldsc <- snp_ldsc(   ld, 
                    length(ld), 
                    chi2 = (df_beta$beta / df_beta$beta_se)^2,
                    sample_size = df_beta$n_eff, 
                    blocks = NULL)
h2_est <- ldsc[["h2"]]
h2_est

# Estimate Beta

## Infinitesimal Model

In [None]:
beta_inf <- snp_ldpred2_inf(corr, df_beta, h2 = h2_est)
summary(beta_inf)

## grid model

In [None]:
# Prepare data for grid model
p_seq <- signif(seq_log(1e-4, 1, length.out = 17), 2)
h2_seq <- round(h2_est * c(0.7, 1, 1.4), 4)
grid.param <-
    expand.grid(p = p_seq,
            h2 = h2_seq,
            sparse = c(FALSE, TRUE))
# Get adjusted beta from grid model
beta_grid <-
    snp_ldpred2_grid(corr, df_beta, grid.param, ncores = NCORES)
summary(beta_grid)

## auto model

In [None]:
# Get adjusted beta from the auto model
multi_auto <- snp_ldpred2_auto(
    corr,
    df_beta,
    h2_init = h2_est,
    vec_p_init = seq_log(1e-4, 0.9, length.out = NCORES),
    ncores = NCORES
)
beta_auto <- sapply(multi_auto, function(auto)
    auto$beta_est)
summary(beta_auto)

# Obtain model PRS

## Infinitesimal Model

In [None]:
if(is.null(obj.bigSNP)){
    obj.bigSNP <- snp_attach("EUR.QC.rds")
}
genotype <- obj.bigSNP$genotypes
# calculate PRS for all samples
ind.test <- 1:nrow(genotype)
pred_inf <- big_prodVec(    genotype,
                            beta_inf,
                            ind.row = ind.test,
                            ind.col = info_snp$`_NUM_ID_`)
hist(pred_inf)

## Grid model

In [None]:
if(is.null(obj.bigSNP)){
    obj.bigSNP <- snp_attach("EUR.QC.rds")
}
genotype <- obj.bigSNP$genotypes
# calculate PRS for all samples
ind.test <- 1:nrow(genotype)
pred_grid <- big_prodMat(   genotype, 
                            beta_grid, 
                            ind.col = info_snp$`_NUM_ID_`)
hist(pred_grid)

## Auto model

In [None]:
if(is.null(obj.bigSNP)){
    obj.bigSNP <- snp_attach("EUR.QC.rds")
}
genotype <- obj.bigSNP$genotypes
# calculate PRS for all samples
ind.test <- 1:nrow(genotype)
pred_auto <-
    big_prodMat(genotype,
                beta_auto,
                ind.row = ind.test,
                ind.col = info_snp$`_NUM_ID_`)
# scale the PRS generated from AUTO
pred_scaled <- apply(pred_auto, 2, sd)
final_beta_auto <-
    rowMeans(beta_auto[,
                abs(pred_scaled -
                    median(pred_scaled)) <
                    3 * mad(pred_scaled)])
pred_auto <-
    big_prodVec(genotype,
        final_beta_auto,
        ind.row = ind.test,
        ind.col = info_snp$`_NUM_ID_`)
hist(pred_auto)