# EBV DNA PheWAS

We are subsetting to individuals who:
- Have WGS data 
- Have at least 2 instances of any ICD code
- Are of EUR ancestry (to match the general UKB cohort)

In [None]:
# All relative paths used below are resolved against this workspace directory.
setwd(dir = "/home/jupyter/workspaces/ebvphewas")

In [None]:
# Load in libraries
suppressMessages(library(data.table))
suppressMessages(library(dplyr))
library(fastglm)
library(ggplot2)
library(BiocParallel)

Aggregate observation/condition source values for each person:

In [None]:
# Read the two per-person code-count tables written by 01_Query_PheWAS_inputs
con_df <- data.table::fread("condition_source_df.csv")
obs_df <- data.table::fread("observation_df.csv")

In [None]:
# Keep the person id, the source code value and its count, and give both
# tables the same column names (person, value, n) so they can be stacked.
obs_df <- obs_df[, .(person = person_id, value = observation_source_value, n)]
con_df <- con_df[, .(person = person_id, value = condition_source_value, n)]

Get only individuals with WGS data:

In [None]:
# Genomic metrics table, converted to a plain data.frame; its research_id
# column is used below to select participants.
gm <- fread("../data/genomic_metrics.tsv") %>% data.frame()

In [None]:
# Stack observation and condition rows for participants present in the
# genomic metrics table.
twofer_df <- rbind(
    obs_df[person %in% gm$research_id],
    con_df[person %in% gm$research_id]
)
dim(twofer_df)

# Total count of each code per person across both sources.
twofer_df_agg <- twofer_df[, .(count = sum(n)), by = .(person, value)]
head(twofer_df_agg)
dim(twofer_df_agg)

Keep only person–code pairs recorded at least twice (i.e., a code counts for an individual only if they have at least 2 instances of it):

In [None]:
# Drop person-code pairs seen fewer than twice (before: 145607603 x 3)
twofer_df_agg <- dplyr::filter(twofer_df_agg, count >= 2)
dim(twofer_df_agg) # after: 8068442 x 3

# Persist the filtered table
fwrite(x = twofer_df_agg, file = "twofer_df_agg.csv")

Get EBV quantifications for individuals with EUR ancestry from `EBV_DNA_Quantification/02_EBV_DNA_covariates.ipynb`. 

In [None]:
# One row per EUR individual with a 1/0 EBV+ indicator column per threshold
# (columns named ebv_q30_<threshold>).
ebv_30x_df_EUR <- fread("EBV_GWAS_data/EUR/ebv_30x_df_EUR_allthresh.csv")

# Number of individuals that are EBV+ at the 0.0018 threshold (11572)
sum(ebv_30x_df_EUR[["ebv_q30_0.0018"]] == 1, na.rm = TRUE)

In [None]:
# Restrict the person-code table to individuals present in the EUR EBV table.
# 101399 EUR ancestry people with at least two instances of any ICD code.
# (The filter previously referenced `eur_30x_df_EUR`, which is never defined;
# the EUR table loaded above is `ebv_30x_df_EUR`.)
twofer_df_agg_EUR <- twofer_df_agg %>%
    dplyr::filter(person %in% ebv_30x_df_EUR$person)
length(unique(twofer_df_agg_EUR$person))

In [None]:
# EUR individuals that also appear in the filtered person-code table
ebv_30x_df_EUR_hasICD <- ebv_30x_df_EUR[person %in% twofer_df_agg_EUR$person]
nrow(ebv_30x_df_EUR_hasICD) # 101399

# out of these, 8777 are EBV+ at the 0.0018 threshold
sum(ebv_30x_df_EUR_hasICD[["ebv_q30_0.0018"]] == 1, na.rm = TRUE) # 8777

## Run PheWAS on individuals with EUR ancestry

This takes 1-2 hours to run. 

Get represented codes (total count >= 20):

In [None]:
# Number of rows per code in the EUR person-code table; every row already
# passed the count >= 2 filter (51096 codes)
icd_values <- table(twofer_df_agg_EUR$value)
length(icd_values)

# Codes carried into the PheWAS: those appearing in at least 20 rows
# (14384 codes)
icds_represented <- names(icd_values[icd_values >= 20])
length(icds_represented)

In [None]:
# Run PheWAS on 0018 threshold
#
# For every represented code, build a 0/1 indicator of which people in
# ebv_30x_df_EUR carry the code and run Fisher's exact test against the EBV
# indicator at the 0.0018 threshold. Each result row (id_one, n, p.value, OR)
# is appended to `pathout` as soon as it is computed and is also returned, so
# `ebv_phewas` holds one row per code.

pathout <- "../intermediate/ebv_phewas_EUR_0018.csv"

# Write the header once, before any worker starts. Letting the i == 1 worker
# create the file could truncate rows that other workers had already appended.
writeLines("id_one,n,p.value,OR", pathout)

# writing realtime: track progress in terminal with watch -n 1 'wc -l file.csv'
ebv_phewas <- bplapply(seq_along(icds_represented), BPPARAM = MulticoreParam(4), function(i) {
    id_one <- icds_represented[i]
    print(id_one)

    # People carrying this code; a local vector avoids copying the whole
    # EBV table on every iteration just to add a column.
    carriers <- twofer_df_agg_EUR$person[twofer_df_agg_EUR$value %in% id_one]
    outcome <- as.numeric(ebv_30x_df_EUR$person %in% carriers)

    ft <- fisher.test(ebv_30x_df_EUR[["ebv_q30_0.0018"]], outcome)
    df <- data.frame(id_one, n = sum(outcome), p.value = ft$p.value, OR = unname(ft$estimate))

    fwrite(df, pathout, append = TRUE)

    # Return the row itself; the value of the fwrite() call is not the result
    # row, so ending on it left rbindlist() with nothing to bind.
    df
}) %>% rbindlist()

Stricter PheWAS (only running on EUR people with at least 2 instances of any ICD code, 101399 people):

In [None]:
# Stricter PheWAS: same per-code Fisher's exact test, restricted to the people
# in ebv_30x_df_EUR_hasICD. Each result row (id_one, n, p.value, OR) is
# appended to `pathout` as it is computed and also returned, so
# `ebv_phewas_hasICD` holds one row per code.
pathout <- "../intermediate/ebv_phewas_EUR_hasICD_0018.csv"

# Write the header once, before any worker starts. Letting the i == 1 worker
# create the file could truncate rows that other workers had already appended.
writeLines("id_one,n,p.value,OR", pathout)

# writing realtime to track progress in terminal with watch -n 1 'wc -l file.csv'
ebv_phewas_hasICD <- bplapply(seq_along(icds_represented), BPPARAM = MulticoreParam(4), function(i) {
    id_one <- icds_represented[i]

    # People carrying this code; a local vector avoids copying the whole
    # table on every iteration just to add a column.
    carriers <- twofer_df_agg_EUR$person[twofer_df_agg_EUR$value %in% id_one]
    outcome <- as.numeric(ebv_30x_df_EUR_hasICD$person %in% carriers)

    # The threshold column is `ebv_q30_0.0018`, as used elsewhere in this
    # notebook; `ebv_q30_0018` does not follow that naming.
    ft <- fisher.test(ebv_30x_df_EUR_hasICD[["ebv_q30_0.0018"]], outcome)
    df <- data.frame(id_one, n = sum(outcome), p.value = ft$p.value, OR = unname(ft$estimate))

    fwrite(df, pathout, append = TRUE)

    # Return the row itself; the value of the fwrite() call is not the result
    # row, so ending on it left rbindlist() with nothing to bind.
    df
}) %>% rbindlist()

## Add annotations for each code

Using files from `02_Clean_ICD_annotations.ipynb`.

In [None]:
# PheWAS results for the 0.0018 threshold.
# NOTE(review): this reads from ../data/ while the PheWAS cell writes to
# ../intermediate/ - confirm the file was copied or adjust the path.
AOU_PheWAS <- fread("../data/ebv_phewas_EUR_0018.csv")

# AOU annotations - cleaned up from the ICD9/ICD10CM reference mapping
ICD_Phecodes_All <- fread("../data/ICD_Phecodes_All.csv")
AOU_PheWAS <- dplyr::left_join(AOU_PheWAS, ICD_Phecodes_All, by = "id_one")

# Keep ICD10CM codes only, to compare with UKB (6881 rows left)
AOU_PheWAS <- AOU_PheWAS[AOU_PheWAS$Code == "ICD10CM",]

# The same ICD10CM code can carry several descriptions; keep the first
# annotation row for each code.
AOU_PheWAS <- dplyr::distinct(AOU_PheWAS, id_one, .keep_all = TRUE)


## Fisher tests for specific codes

In [None]:
# Code to test individually (alternative: "B27.00")
id_one <- "G35"

# Is the code present in the EUR person-code table at all?
is.element(id_one, twofer_df_agg_EUR$value)

In [None]:
# Fisher's exact test of carrying `id_one` against EBV status at the 0.0018
# threshold, across everyone in ebv_30x_df_EUR.
ebv_30x_df_EUR$outcome <- as.numeric(ebv_30x_df_EUR$person %in% twofer_df_agg_EUR[twofer_df_agg_EUR$value == id_one,]$person)

# The threshold column is `ebv_q30_0.0018`, as used elsewhere in this
# notebook; `ebv_q30_0018` does not follow that naming.
ft <- fisher.test(ebv_30x_df_EUR[["ebv_q30_0.0018"]], ebv_30x_df_EUR$outcome)
ft

### Example code for running a keyword association (searching for substring):

NOTE: be careful when a keyword shows up only in the "Excl_Phenotypes" column - for example, an encounter for a **test** for HIV does not mean the person is HIV+.

In [None]:
# ICD code / phecode annotation table used for the keyword search below.
# NOTE(review): the annotation cell reads "../data/ICD_Phecodes_All.csv" -
# confirm which location is current.
ICD_Phecodes <- data.table::fread("ICD_Phecodes_All.csv")

In [None]:
# Case-insensitive search for `keyword` in the code description, the phecode
# description and the exclusion-phenotype columns. base::grepl is used because
# `%likeic%` is not one of data.table's documented like operators
# (%like%, %ilike%, %flike%, %plike%).
keyword <- "Multiple sclerosis" # or "infectious mononuc"
has_keyword <- ICD_Phecodes[grepl(keyword, ICD_Phecodes$Code_Description, ignore.case = TRUE) |
                            grepl(keyword, ICD_Phecodes$Phecode_Description, ignore.case = TRUE) |
                            grepl(keyword, ICD_Phecodes$Excl_Phenotypes, ignore.case = TRUE), ]
has_keyword

In [None]:
# Everyone in the EUR person-code table carrying one of the selected codes
# ("G35" and "340"). `twofer_df_agg_clean` is never defined in this notebook;
# the EUR person-code table built above is `twofer_df_agg_EUR`.
all_keyword_ppl <- unique(twofer_df_agg_EUR[twofer_df_agg_EUR$value %in% c("G35", "340"),]$person)
length(all_keyword_ppl)

In [None]:
# Preallocate one slot per EBV threshold evaluated below. Eleven thresholds
# are tested, so a length of 10 forced the vectors to grow on the last
# iteration.
n_thresholds <- 11L
ebv_pos <- vector(mode = "integer", length = n_thresholds)
ebv_ms <- vector(mode = "integer", length = n_thresholds)
p.value <- vector(mode = "numeric", length = n_thresholds)
OR <- vector(mode = "numeric", length = n_thresholds)
conf.inf.lower <- vector(mode = "numeric", length = n_thresholds)
conf.inf.higher <- vector(mode = "numeric", length = n_thresholds)

In [None]:
# For each EBV threshold, count EBV+ people, count EBV+ people who also carry
# one of the selected codes, and run Fisher's exact test of code status
# against EBV status across everyone in ebv_30x_df_EUR.
thresholds <- c(0, 0.0015, 0.0018, 0.002, 0.003, 0.004, 0.005, 0.007, 0.012, 0.015, 0.03)

# Whether each person carries one of the selected codes. This does not depend
# on the threshold, so it is computed once. (The loop previously used
# `mdf_clean`, which is never defined in this notebook.)
keyword_outcome <- as.numeric(ebv_30x_df_EUR$person %in% all_keyword_ppl)

for (i in seq_along(thresholds)) {
    # Column holding the 1/0 EBV+ indicator for this threshold
    thresh_col <- paste0("ebv_q30_", thresholds[i])
    if (!thresh_col %in% names(ebv_30x_df_EUR)) {
        stop("Column not found in ebv_30x_df_EUR: ", thresh_col, call. = FALSE)
    }
    ebv_status <- ebv_30x_df_EUR[[thresh_col]]
    ebv_pos_ppl <- ebv_30x_df_EUR$person[ebv_status %in% 1]

    # Get overlap of ppl with EBV and with the selected codes
    ebv_ms[[i]] <- length(intersect(all_keyword_ppl, ebv_pos_ppl))

    # Get number of ppl with EBV
    ebv_pos[[i]] <- length(ebv_pos_ppl)

    # Fisher test on having one of the selected codes and having EBV
    ft <- fisher.test(ebv_status, keyword_outcome, conf.int = TRUE)
    p.value[[i]] <- ft$p.value
    OR[[i]] <- ft$estimate
    conf.inf.lower[[i]] <- ft$conf.int[1]
    conf.inf.higher[[i]] <- ft$conf.int[2]
}

In [None]:
# One row per threshold: number of people that are both EBV+ and carry one of
# the selected codes, that number as a fraction of all people carrying the
# codes, and the Fisher's exact test results.
ms_ebv_df <- data.frame(Q30_threshold = thresholds,
                        EBV_MS_DP = ebv_ms,
                        EBV_MS_proportion = ebv_ms / length(all_keyword_ppl),
                        P_VALUE = p.value,
                        CI_LOWER = conf.inf.lower,
                        CI_HIGHER = conf.inf.higher,
                        OR = OR)

ms_ebv_df

# Bar heights are counts of people, so the y axis is labelled "People"
# (it was previously labelled "Codes").
ggplot(ms_ebv_df, aes(x = Q30_threshold, y = EBV_MS_DP)) +
    geom_bar(stat = "identity") +
    theme_minimal() +
    labs(title = "Number of EBV+ MS+ people", x = "EBV Q30 threshold", y = "People")

# Values are fractions between 0 and 1, not percentages, so the y axis is
# labelled as a proportion (it was previously labelled "% EBV+").
ggplot(ms_ebv_df, aes(x = Q30_threshold, y = EBV_MS_proportion)) +
    geom_bar(stat = "identity") +
    theme_minimal() +
    labs(title = "Proportion of EBV+ MS+ people in MS+ people", x = "EBV Q30 threshold", y = "Proportion EBV+")