# EBV DNA Quantification

We hypothesize that latent EBV DNA in the blood of each All of Us (AoU) participant can be quantified from the reads in that participant's WGS data that align to the chrEBV contig (hg38).

In [None]:
# Set workspace directory.
# Relative paths used by later cells are resolved against the working
# directory set here.
setwd("/home/jupyter")

In [None]:
# Import libraries
library(data.table)
library(dplyr)
library(BiocParallel)
library(GenomicAlignments)
library(ggplot2)
# Google project ID used by this workspace
gproj <- "terra-vpc-sc-a92a14ba"

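For each individual in AoU, compute the per-base coverage at every position of the EBV genome (171823 bp), once from reads with mapping quality ≥ 30 (Q30) and once from reads with mapping quality ≥ 60 (Q60).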

In [None]:
# List the EBV BAM files stored in "eb/" (one file name per entry).
# `pattern` is a regular expression, so the dot is escaped and the match is
# anchored at the end of the name. An unescaped ".bam" would also match index
# files such as "*.bam.bai", or any other name that merely contains "bam".
ebv_bams <- list.files("eb", pattern = "\\.bam$")

In [None]:
# For every BAM file, compute per-base coverage of chrEBV at two mapping-quality
# cut-offs and stack the per-file results into one long data frame `cov_df`:
#   person : index of the file in `ebv_bams` (mapped back to a file name later)
#   idx    : 1-based position on chrEBV
#   q30    : depth from alignments with MAPQ >= 30
#   q60    : depth from alignments with MAPQ >= 60
# Positions with no Q30 coverage are dropped per file.
cov_df <- bplapply(seq_along(ebv_bams), BPPARAM = MulticoreParam(96), function(i) {
  # Length of the chrEBV contig; every coverage vector must have this length.
  ebv_len <- 171823L

  bam_path <- file.path("eb", ebv_bams[i])
  aln <- readGAlignments(bam_path, param = ScanBamParam(what = "mapq"))

  # Use the metadata accessor instead of reaching into the S4 slot directly.
  mapq <- mcols(aln)$mapq
  q30 <- as.integer(coverage(aln[mapq >= 30])[["chrEBV"]])
  q60 <- as.integer(coverage(aln[mapq >= 60])[["chrEBV"]])

  # Fail loudly if the BAM header reports a different chrEBV length, rather
  # than relying on data.frame() to error or silently recycle.
  stopifnot(length(q30) == ebv_len, length(q60) == ebv_len)

  data.frame(
    person = i,
    idx = seq_len(ebv_len),
    q30 = q30,
    q60 = q60
  ) %>%
    dplyr::filter(q30 > 0)
}) %>%
  rbindlist() %>%
  data.frame()

For each position, sum the Q30 (or Q60) reads across all people. 

In [None]:
# Collapse across people: total Q30 and Q60 depth at each chrEBV position
# that appears in `cov_df`.
total_bp_df <- cov_df %>%
  group_by(idx) %>%
  summarize(
    total_q30 = sum(q30),
    total_q60 = sum(q60)
  )

In [None]:
# Summed Q30 coverage along the EBV genome, drawn on a log10 y axis.
ggplot(total_bp_df, aes(x = idx, y = total_q30)) +
  geom_point() +
  scale_y_log10()

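Identify the well-covered positions (summed Q30 coverage between 100 and 1,000) and the positions whose summed coverage is orders of magnitude higher (above 1,000); the latter are excluded ("wiped") from the per-person totals.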

In [None]:
# total_bp_df <- readRDS("summed_per_bp_ebvgenome.rds")
# Classify positions by their summed Q30 coverage:
#   wipe_here    : total_q30 > 1e3
#   well_covered : 100 < total_q30 < 1e3
wipe_here <- total_bp_df %>%
  filter(total_q30 > 1e3) %>%
  pull(idx)
well_covered <- total_bp_df %>%
  filter(total_q30 > 100, total_q30 < 1e3) %>%
  pull(idx)
# Coverage scatter coloured by membership in `wipe_here`.
ggplot(total_bp_df, aes(x = idx, y = total_q30, color = idx %in% wipe_here)) +
  geom_point() +
  scale_y_log10()

In [None]:
## Per-person EBV genome equivalents: sum the Q30 (or Q60) depth over every
## position outside `wipe_here`, then divide by the number of well-covered positions.
## NOTE: normalization will be fixed later - the divisor should be the number
## of all positions not in wipe_here, not length(well_covered).
total_per_person_equivalents_df <- cov_df %>%
  filter(!(idx %in% wipe_here)) %>%
  group_by(person) %>%
  summarize(
    ebv_eq_q30 = sum(q30) / length(well_covered),
    ebv_eq_q60 = sum(q60) / length(well_covered)
  )

In [None]:
# Attach an ID to every row: the name of the BAM file the row came from, with
# its ".bam" extension removed.
# The pattern is escaped and anchored ("\\.bam$"). In an unescaped ".bam" the
# dot matches any character, and gsub() would remove every such match, so text
# inside the ID itself could be stripped.
total_per_person_equivalents_df$id <- sub(
  "\\.bam$", "",
  ebv_bams[total_per_person_equivalents_df$person]
)
# NOTE: fwrite() is called with its default separator, so the file is
# comma-separated even though its name ends in ".tsv.gz".
fwrite(total_per_person_equivalents_df, "ebv_equivalents_bias.correction.tsv.gz")

For consistency with UKB, we will use Q30 coverage. 

In [None]:
# Position classes based on summed Q30 coverage:
#   wipe_here_q30    : total_q30 > 1e3
#   well_covered_q30 : 100 < total_q30 < 1e3
wipe_here_q30 <- total_bp_df %>%
  filter(total_q30 > 1e3) %>%
  pull(idx)
well_covered_q30 <- total_bp_df %>%
  filter(total_q30 > 100, total_q30 < 1e3) %>%
  pull(idx)

# Coverage along the EBV genome; positions in wipe_here_q30 are drawn in red.
# NOTE(review): pretty_plot() and L_border() are not defined or attached in
# the visible notebook - presumably from the BuenColors package; confirm it is
# loaded before running this cell.
p1 <- ggplot(
  total_bp_df,
  aes(x = idx, y = total_q30, color = idx %in% wipe_here_q30)
) +
  geom_point(size = 0.5) +
  scale_y_log10() +
  scale_color_manual(values = c("TRUE" = "firebrick", "FALSE" = "black")) +
  labs(x = "EBV genome position", y = "total coverage") +
  pretty_plot(fontsize = 8) +
  L_border() +
  theme(legend.position = "none")
p1

Check the positions of high coverage.

The chrEBV FASTA file can be obtained here: https://www.ncbi.nlm.nih.gov/nuccore/NC_007605.1

In [None]:
library(seqinr)
# Read the chrEBV reference sequence and check that it has exactly as many
# elements as the largest position present in total_bp_df.
EBV_genome_seq <- seqinr::read.fasta("../data/chrEBV_used.fasta")$chrEBV
stopifnot(length(EBV_genome_seq) == max(total_bp_df$idx)) # 171823
# Single-column matrix / data frame (column V1) with the sequence elements in
# genome order.
base_matrix <- matrix(EBV_genome_seq, ncol = 1)
base_df <- as.data.frame(base_matrix, stringsAsFactors = FALSE)

In [None]:
# Expand total_bp_df to one row per position, from 1 up to the largest idx.
all_idx <- data.frame(idx = seq_len(max(total_bp_df$idx)))
total_bp_df <- merge(all_idx, total_bp_df, by = "idx", all.x = TRUE)
# Positions that were absent from the table come back as NA after the merge;
# record them as zero coverage.
total_bp_df[is.na(total_bp_df)] <- 0
total_bp_df <- total_bp_df[order(total_bp_df$idx), ]
# Attach the reference base for each position (base_df rows are in genome order).
total_bp_df$base <- base_df$V1

In [None]:
# Sequences of the over-covered (wipe_here_q30) regions.
# Step 1: keep the over-covered positions and label runs of consecutive
# coordinates; `group` increases by one whenever the step from the previous
# position is not exactly 1.
wipe_here_df <- total_bp_df[total_bp_df$idx %in% wipe_here_q30, ] %>%
  dplyr::arrange(idx) %>%
  dplyr::mutate(group = cumsum(c(TRUE, diff(idx) != 1)))

# Step 2: one row per run, holding its comma-separated coordinates (idx) and
# its concatenated reference bases (seq).
wipe_here_df_seq <- wipe_here_df %>%
  dplyr::group_by(group) %>%
  dplyr::summarise(
    idx = paste(idx, collapse = ","),
    seq = paste(base, collapse = ""),
    .groups = "drop"
  ) %>%
  dplyr::select(-group)

In [None]:
# Display the table: one row per run of consecutive over-covered positions,
# with its coordinates (idx) and reference sequence (seq).
wipe_here_df_seq
# Uncomment to inspect the sequence of an individual run:
# wipe_here_df_seq$seq[1]
# wipe_here_df_seq$seq[2]
# wipe_here_df_seq$seq[3]
# wipe_here_df_seq$seq[4]

Count the number of positions in each coverage category.

In [None]:
# Number of positions in each coverage class
length(wipe_here_q30)            # 214 too highly covered bases (total_q30 > 10^3)
length(well_covered_q30)         # 142931 well covered bases (100 < total_q30 < 10^3)
sum(total_bp_df$total_q30 == 0)  # 23513 bases without any Q30 coverage (total_q30 == 0)

In [None]:
# Positions with some Q30 coverage that are below the 1e3 cut-off.
not_wiped_q30 <- total_bp_df %>%
  filter(total_q30 > 0, total_q30 < 1e3) %>%
  pull(idx)
length(not_wiped_q30)

## Normalize EBV genome coverage

The genomic metrics file can be obtained as described here: https://support.researchallofus.org/hc/en-us/articles/4614687617556-How-the-All-of-Us-Genomic-data-are-organized-Archived-C2022Q4R13-CDRv7 and here:
https://support.researchallofus.org/hc/en-us/articles/29475233432212-Controlled-CDR-Directory .

Specifically, these gs bucket paths:
- Genetic ancestry and PCs: gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv
- Genomic metrics (sex_at_birth): gs://fc-aou-datasets-controlled/v8/wgs/short_read/snpindel/aux/qc/genomic_metrics.tsv 
- Demographics (?)

In [None]:
# Load the genomic metrics table and store research_id as character.
gm <- data.frame(fread("../data/genomic_metrics.tsv"))
gm[["research_id"]] <- as.character(gm[["research_id"]])

Small fix in normalization: `ebv_equivalents_bias.correction.tsv.gz` is the amount of EBV per person when divided by `well_covered` positions (142931 bp). However, the amount of EBV per person was summed over all `not_wiped_q30` positions. Correct the amount by multiplying by `well_covered` positions (142931 bp), then dividing by `not_wiped_q30` positions (148096 bp).

In [None]:
# Ratio of the number of well_covered_q30 positions to the number of
# not_wiped_q30 positions. Multiplying a value that was divided by the former
# count by this ratio gives the value divided by the latter count instead.
correction <- length(well_covered_q30) / length(not_wiped_q30)

In [None]:
# Join the genomic metrics with the per-person EBV equivalents and express the
# EBV signal relative to a 30x human genome.

# `ebv` is not created by any earlier cell shown in this notebook. If it is
# missing, rebuild it from the table saved as
# "ebv_equivalents_bias.correction.tsv.gz": the file-name-derived `id` becomes
# `research_id`, and the integer `person` index is dropped.
# NOTE(review): assumes the saved table is reachable at this relative path and
# that `id` holds the research_id - confirm before relying on this fallback.
if (!exists("ebv")) {
  ebv <- data.frame(fread("ebv_equivalents_bias.correction.tsv.gz"))
  ebv$research_id <- as.character(ebv$id)
  ebv <- ebv[, c("research_id", "ebv_eq_q30", "ebv_eq_q60")]
}

mdf <- merge(gm, ebv, by = "research_id", all.x = TRUE)

# Rows of `gm` without a match in `ebv` get NA in the EBV columns; set those
# to 0. Only columns that came from `ebv` are filled, so a missing genomic
# metric (e.g. mean_coverage) stays NA instead of becoming 0 and turning the
# ratio below into Inf/NaN.
ebv_value_cols <- intersect(setdiff(names(ebv), "research_id"), names(mdf))
mdf[ebv_value_cols] <- lapply(mdf[ebv_value_cols], function(x) {
  x[is.na(x)] <- 0
  x
})

# correcting the normalization factor
# placing in units of 30x human genome coverage
mdf$ebv_compared_wgs_30x <- mdf$ebv_eq_q30 * correction / mdf$mean_coverage * 30

There are 51459 rows in the `ebv` dataframe - everyone who had some EBV reads. 

In [None]:
# Count rows with a non-zero Q30 EBV equivalent (TRUE) versus zero (FALSE).
table(mdf$ebv_eq_q30 > 0)

In [None]:
# Reduce mdf to the ID and the normalized EBV load under their final column
# names, sorted from the highest load to the lowest, then preview the top rows.
mdf <- mdf %>%
  dplyr::rename(
    person = research_id,
    ebv_q30_30x = ebv_compared_wgs_30x
  ) %>%
  dplyr::select(person, ebv_q30_30x) %>%
  dplyr::arrange(dplyr::desc(ebv_q30_30x))
head(mdf)

There are 29249 people in AoU who pass the 0.0018 filter (using the same threshold as in the UKB).

In [None]:
# Count rows whose normalized EBV load is above (TRUE) or not above (FALSE)
# the 0.0018 threshold.
table(mdf$ebv_q30_30x > 0.0018)

Rank coverage plot:

In [None]:
# Rank people from highest to lowest normalized EBV load and plot load against
# rank on log-log axes. A small constant (1e-7) is added to the load so that
# zero values remain finite on the log scale.
mdf %>%
  arrange(desc(ebv_q30_30x)) %>%
  mutate(rank = 1:n()) %>%
  ggplot(aes(x = rank, y = ebv_q30_30x + 1e-7)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10() +
  labs(x = "person rank", y = "normalized Q30 EBV load in 30xWGS") +
  theme_bw()

In [None]:
# Save the final table (columns person and ebv_q30_30x) for downstream use.
fwrite(mdf, "../intermediate/ebv_equivalent_30x.csv")