In [None]:
library(stringr)
library(data.table)
library(dplyr)
library(boot)
source("eval_functions_refactored.R")

In [None]:
# Input tables.
#   covars    - passed as `covars` to the comparison functions in this notebook.
#   subpopPCs - read as a global by the comparison functions defined in this
#               notebook (joined on person_id == s).
#   sire      - loaded here; not referenced by any other code visible in this
#               notebook.
covars <- fread("AoU_98K_covariates.tsv")
subpopPCs <- fread("2024-05-31_global_subpop_v2_gnomad-AoU-PCs-in-AOU.tsv")
sire <- fread("self_report_demographics.txt")

# Output directory. The trailing "/" is required: downstream code builds file
# paths with paste0(dir, ...). The name `dir` is kept because the other cells
# refer to it.
dir <- "comparisons_continuous_refactored_subpopPCs_boot/"
# showWarnings = FALSE keeps notebook re-runs quiet when the directory already
# exists (dir.create() otherwise warns on every re-run).
dir.create(dir, showWarnings = FALSE)

In [None]:
# Score-file stems and the matching outcome-file stems. The two vectors are
# positionally paired: element k of score_names goes with element k of
# pheno_names when they are iterated together.
score_names <- c(
    "Hemoglobin_Mean_INT", "LDLC_Mean_INT", "mean_Height_INT",
    "mean_BMI_INT", "mean_Systolic_INT"
)
pheno_names <- c(
    "hemoglobin", "ldl", "height",
    "bmi", "sbp"
)

# PRS methods to evaluate.
methods <- c(
    "LDpred",
    "PRScs",
    "PRScsx"
)

In [None]:
# Run the per-score bootstrap comparison for every method and every
# (score, phenotype) pair; results are written to files under `dir`.
for (method in methods) {
    mapply(
        compare_method_PCadj_subpopPCs_boot,
        score_name = score_names,
        phen_name = pheno_names,
        MoreArgs = list(
            dir = dir,
            method = method,
            covars = covars,
            age_sex = TRUE
        )
    )
}


In [None]:
# Run the META bootstrap (mean R2 over a method-specific subset of scores) for
# the two methods it is run on here, across every (score, phenotype) pair.
for (method in c("LDpred", "PRScs")) {
    mapply(
        compare_method_PCadj_subpopPCs_META_boot,
        score_name = score_names,
        phen_name = pheno_names,
        MoreArgs = list(
            dir = dir,
            method = method,
            covars = covars,
            age_sex = TRUE
        )
    )
}

In [None]:
# Per-ancestry evaluation of every score of one PRS method for one trait, with
# optional bootstrap confidence intervals for each score.
#
# For each ancestry group ("eur", "afr", "amr") the phenotype column `mean`
# and every SCORE<i>_SUM column are residualised on <POP>_PC1..20 (plus `age`
# and `is_male` when age_sex is TRUE). The residuals are stored as `mean1` and
# `score<i>` and handed to the helper functions.
#
# Arguments:
#   dir         Output prefix; pasted directly in front of the file names, so
#               it must end with "/".
#   covars      Table left-joined onto the phenotype table (no explicit `by`,
#               so the join uses the columns the two tables share).
#   score_name  Stem of "computed_scores/<score_name>_<method>.txt"; also used
#               as the label (`phecode`) in every output line and file name.
#   phen_name   Stem of "outcomes/<phen_name>.txt".
#   method      PRS method label used in the score file name and output names.
#   bootstrap   Logical; when TRUE, run 1000 bootstrap replicates per score.
#   age_sex     Logical; when TRUE, also adjust for `age` and `is_male`.
#
# Reads the table `subpopPCs` from the enclosing (global) environment.
# inormal(), perform_lm_and_write_beta() and extract_r_squared() are not
# defined in this notebook -- presumably they come from
# eval_functions_refactored.R (TODO confirm).
#
# Output (all appended, so re-running adds duplicate lines):
#   *_R2_*   : phecode;pop;n;<one value per score>
#   *_boot_* : phecode;pop;n;score_col;t0;2.5% quantile;97.5% quantile
#   *_beta_* : written by perform_lm_and_write_beta().
compare_method_PCadj_subpopPCs_boot <- function (dir, covars, score_name, phen_name, method, bootstrap = TRUE, age_sex = FALSE) 
{
    phecode <- score_name
    score_data <- fread(paste0("computed_scores/", score_name, "_", method, ".txt"))
    phen_data <- fread(paste0("outcomes/", phen_name, ".txt"))

    # All but four columns of the score file are treated as score columns.
    score_num <- ncol(score_data) - 4
    if (score_num < 1) {
        stop("No score columns found in score file for ", score_name, " / ",
             method, " (expected more than 4 columns).", call. = FALSE)
    }
    print(score_num)

    # ancestry_pred_other is dropped from subpopPCs before joining, so the
    # grouping column used below has to come from the other joined tables.
    subpop_cols <- subpopPCs %>%
        select(!c(ancestry_pred_other,
                  contains("prob"),
                  contains("prev_global")))
    dat <- phen_data %>%
        left_join(covars) %>%
        left_join(score_data, by = c("person_id" = "IID")) %>%
        left_join(subpop_cols, by = c("person_id" = "s"))

    dat <- dat %>% filter(unrel == 1)

    grouping_name <- "ancestry"
    grouping_var <- "ancestry_pred_other"
    use_age_sex <- isTRUE(age_sex)
    run_bootstrap <- isTRUE(bootstrap)
    adj_level <- if (use_age_sex) "PCagesex-adj" else "PC-adj"

    file_prefix <- paste0(dir, phecode, "_", method, "_", grouping_name)
    file_suffix <- paste0(adj_level, "_unrel.txt")
    R2_file <- paste0(file_prefix, "_R2_", file_suffix)
    beta_file <- paste0(file_prefix, "_beta_", file_suffix)
    boot_file <- paste0(file_prefix, "_boot_", file_suffix)

    for (pop in c("eur", "afr", "amr")) {
        # which() drops rows with a missing ancestry label up front instead of
        # relying on how the table class treats NA in a logical row index.
        pop_rows <- which(dat[[grouping_var]] == pop)
        dat_anc <- dat[pop_rows, ]
        dat_anc$mean <- inormal(dat_anc$mean)

        pcs <- paste0(toupper(pop), "_PC", 1:20)
        covs <- if (use_age_sex) c("age", "is_male", pcs) else pcs
        cov_terms <- paste(covs, collapse = " + ")

        dat_anc <- dat_anc %>%
            select(mean, contains("SCORE"), all_of(covs)) %>%
            na.omit()
        n_anc <- nrow(dat_anc)

        # Residualise each raw score and the phenotype on the covariates.
        for (i in seq_len(score_num)) {
            raw_col <- paste0("SCORE", i, "_SUM")
            dat_anc[[paste0("score", i)]] <- perform_adj_PC_specify_covs(dat_anc, raw_col, cov_terms)
        }
        dat_anc$mean1 <- perform_adj_PC_specify_covs(dat_anc, "mean", cov_terms)

        r_squared_values <- numeric(score_num)
        for (i in seq_len(score_num)) {
            score_col <- paste0("score", i)
            r_squared_values[i] <- perform_lm_and_write_beta(dat_anc,
                score_col, phecode, pop, beta_file)
            if (run_bootstrap) {
                boot_res <- boot(dat_anc, extract_r_squared,
                    R = 1000, score_col = score_col, phecode = phecode)
                # Point estimate followed by the 2.5% / 97.5% percentiles of
                # the bootstrap replicates.
                boot_ci <- round(quantile(boot_res$t, probs = c(0.025, 0.975)), 5)
                boot_dat <- paste(round(boot_res$t0, 5),
                    paste0(boot_ci, collapse = ";"), sep = ";")
                write(paste(phecode, pop, n_anc, score_col,
                    boot_dat, sep = ";"), file = boot_file, append = TRUE)
            }
        }
        write(paste(phecode, pop, n_anc, paste(r_squared_values,
            collapse = ";"), sep = ";"), file = R2_file, append = TRUE)
    }
}

In [None]:

# Single-trait run of the META bootstrap (hemoglobin, PRScs) with age/sex
# adjustment.
score_name <- "Hemoglobin_Mean_INT"
phen_name <- "hemoglobin"
method <- "PRScs"

compare_method_PCadj_subpopPCs_META_boot(
    dir = dir,
    covars = covars,
    score_name = score_name,
    phen_name = phen_name,
    method = method,
    bootstrap = TRUE,
    age_sex = TRUE
)

In [None]:
# Per-ancestry bootstrap of the "META" statistic for one trait and one PRS
# method. The bootstrapped statistic is extract_meta_r_squared(), evaluated on
# covariate-adjusted data.
#
# For each ancestry group ("eur", "afr", "amr") the phenotype column `mean`
# and every SCORE<i>_SUM column are residualised on <POP>_PC1..20 (plus `age`
# and `is_male` when age_sex is TRUE). The residuals are stored as `mean1` and
# `score<i>` before bootstrapping.
#
# Arguments:
#   dir         Output prefix; pasted directly in front of the file name, so
#               it must end with "/".
#   covars      Table left-joined onto the phenotype table (no explicit `by`,
#               so the join uses the columns the two tables share).
#   score_name  Stem of "computed_scores/<score_name>_<method>.txt"; also used
#               as the label (`phecode`) in the output.
#   phen_name   Stem of "outcomes/<phen_name>.txt".
#   method      PRS method label; forwarded to extract_meta_r_squared().
#   bootstrap   Accepted for signature parity with the per-score variant; the
#               bootstrap is always run here and this argument is not read.
#   age_sex     Logical; when TRUE, also adjust for `age` and `is_male`.
#
# Reads the table `subpopPCs` from the enclosing (global) environment.
# inormal() is not defined in this notebook -- presumably it comes from
# eval_functions_refactored.R (TODO confirm).
#
# Output (appended, so re-running adds duplicate lines) in *_META_boot_*:
#   phecode;pop;n;method;t0;2.5% quantile;97.5% quantile
compare_method_PCadj_subpopPCs_META_boot <- function (dir, covars, score_name, phen_name, method, bootstrap = TRUE, age_sex = FALSE) 
{
    phecode <- score_name
    score_data <- fread(paste0("computed_scores/", score_name, "_", method, ".txt"))
    phen_data <- fread(paste0("outcomes/", phen_name, ".txt"))

    # All but four columns of the score file are treated as score columns.
    score_num <- ncol(score_data) - 4
    if (score_num < 1) {
        stop("No score columns found in score file for ", score_name, " / ",
             method, " (expected more than 4 columns).", call. = FALSE)
    }
    print(score_num)

    # ancestry_pred_other is dropped from subpopPCs before joining, so the
    # grouping column used below has to come from the other joined tables.
    subpop_cols <- subpopPCs %>%
        select(!c(ancestry_pred_other,
                  contains("prob"),
                  contains("prev_global")))
    dat <- phen_data %>%
        left_join(covars) %>%
        left_join(score_data, by = c("person_id" = "IID")) %>%
        left_join(subpop_cols, by = c("person_id" = "s"))

    dat <- dat %>% filter(unrel == 1)

    grouping_name <- "ancestry"
    grouping_var <- "ancestry_pred_other"
    use_age_sex <- isTRUE(age_sex)
    adj_level <- if (use_age_sex) "PCagesex-adj" else "PC-adj"

    boot_file <- paste0(dir, phecode, "_", method, "_", grouping_name,
        "_META_boot_", adj_level, "_unrel.txt")

    for (pop in c("eur", "afr", "amr")) {
        # which() drops rows with a missing ancestry label up front instead of
        # relying on how the table class treats NA in a logical row index.
        pop_rows <- which(dat[[grouping_var]] == pop)
        dat_anc <- dat[pop_rows, ]
        dat_anc$mean <- inormal(dat_anc$mean)

        pcs <- paste0(toupper(pop), "_PC", 1:20)
        covs <- if (use_age_sex) c("age", "is_male", pcs) else pcs
        cov_terms <- paste(covs, collapse = " + ")

        dat_anc <- dat_anc %>%
            select(mean, contains("SCORE"), all_of(covs)) %>%
            na.omit()

        # Residualise each raw score and the phenotype on the covariates.
        for (i in seq_len(score_num)) {
            raw_col <- paste0("SCORE", i, "_SUM")
            dat_anc[[paste0("score", i)]] <- perform_adj_PC_specify_covs(dat_anc, raw_col, cov_terms)
        }
        dat_anc$mean1 <- perform_adj_PC_specify_covs(dat_anc, "mean", cov_terms)

        boot_res <- boot(dat_anc, extract_meta_r_squared,
                         R = 1000, method = method, phecode = phecode)
        # Point estimate followed by the 2.5% / 97.5% percentiles of the
        # bootstrap replicates.
        boot_ci <- round(quantile(boot_res$t, probs = c(0.025, 0.975)), 5)
        boot_dat <- paste(round(boot_res$t0, 5),
                          paste0(boot_ci, collapse = ";"), sep = ";")
        write(paste(phecode, pop, nrow(dat_anc), method,
                    boot_dat, sep = ";"), file = boot_file, append = TRUE)
    }
}

In [None]:
# Bootstrap statistic: mean of perform_lm() over a method-specific subset of
# the adjusted score columns, computed on the resampled rows.
#
# Arguments:
#   data     Table holding the adjusted score columns score<i>.
#   indices  Row indices of the bootstrap resample (supplied by boot()).
#   method   "PRScs" (uses score1, score2, score3) or
#            "LDpred" (uses score2, score4, score6).
#   phecode  Passed through unchanged to perform_lm().
#
# Returns the mean of the three perform_lm() results. Any other `method` is an
# error: previously the function fell through and returned NULL, which only
# surfaced later as an obscure failure inside boot().
extract_meta_r_squared <- function (data, indices, method, phecode) 
{
    meta_scores <- list(
        PRScs = c(1, 2, 3),
        LDpred = c(2, 4, 6)
    )
    if (!is.character(method) || length(method) != 1L ||
        !method %in% names(meta_scores)) {
        stop("extract_meta_r_squared: unsupported method; expected one of ",
             paste(names(meta_scores), collapse = ", "), ".", call. = FALSE)
    }

    d <- data[indices, ]
    score_cols <- paste0("score", meta_scores[[method]])
    fits <- lapply(score_cols, function(score_col) perform_lm(d, score_col, phecode))
    mean(unlist(fits))
}

In [None]:
# Residualise one column on a set of covariates with a linear model.
#
# Arguments:
#   dat         Data frame / data.table containing the response and covariates.
#   score_name  Name of the response column (a score or the phenotype).
#   covs        Right-hand side of the model: either a single string such as
#               "age + is_male + EUR_PC1", or a character vector of terms,
#               which is joined with " + ".
#
# Returns the residuals of lm(score_name ~ covs), one per row of `dat`.
perform_adj_PC_specify_covs <- function (dat, score_name, covs) 
{
    rhs <- paste(covs, collapse = " + ")
    model_formula <- as.formula(paste(score_name, "~", rhs))
    # na.exclude pads the residuals with NA for rows that have missing values,
    # so the result always lines up with the rows of `dat` and can be assigned
    # back as a column. With complete data this matches the default.
    fit <- lm(model_formula, data = dat, na.action = na.exclude)
    residuals(fit)
}