<a href="https://colab.research.google.com/github/comparativechrono/100KGP_enrichment/blob/main/case_control_enrichment_with_R.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
library(dplyr)
library(purrr)

#' Read a whitespace-delimited genotype table and count alternate alleles.
#'
#' The file is read without a header; its first 11 columns are named
#' Sample, Chrom, Pos, ID, Ref, Alt, Qual, Filter, GT, GQ and DP.
#'
#' @param file_path Path to the genotype table.
#' @return A data frame with the named columns plus a numeric `AlleleCount`
#'   column holding the number of called non-reference alleles in `GT`.
parse_vcf_info <- function(file_path) {
  vcf_data <- read.table(file_path, header = FALSE, stringsAsFactors = FALSE)

  col_names <- c(
    "Sample", "Chrom", "Pos", "ID", "Ref", "Alt",
    "Qual", "Filter", "GT", "GQ", "DP"
  )
  if (ncol(vcf_data) < length(col_names)) {
    stop(
      "Expected at least ", length(col_names), " columns in '", file_path,
      "' but found ", ncol(vcf_data), ".",
      call. = FALSE
    )
  }
  # Name only the expected leading columns so extra columns keep their
  # default names instead of becoming NA.
  names(vcf_data)[seq_along(col_names)] <- col_names

  # Genotypes may be unphased ("0/1") or phased ("0|1"). as.character()
  # guards against read.table() guessing a numeric column type for GT.
  genotypes <- strsplit(as.character(vcf_data$GT), "[/|]")

  # Count non-reference alleles rather than summing allele indices, so a
  # multi-allelic call such as "1/2" contributes 2 alleles, not 3.
  # Uncalled alleles (".") are skipped and therefore contribute 0.
  vcf_data$AlleleCount <- vapply(
    genotypes,
    function(gt) {
      called <- gt[grepl("^[0-9]+$", gt)]
      as.numeric(sum(as.integer(called) > 0L))
    },
    numeric(1)
  )

  vcf_data
}

# Load the per-sample genotype tables for the case and control cohorts.
cases <- parse_vcf_info("cases.txt")
controls <- parse_vcf_info("control.txt")

# Aggregate the allele counts by variant ID for both cases and controls.
# na.rm = TRUE stops a single uncallable genotype from turning a variant's
# total into NA.
cases_freq <- cases %>%
  group_by(ID) %>%
  summarise(AlleleFreq = sum(AlleleCount, na.rm = TRUE))
controls_freq <- controls %>%
  group_by(ID) %>%
  summarise(AlleleFreq = sum(AlleleCount, na.rm = TRUE))

# Put case and control counts side by side, one row per variant. A full join
# keeps variants seen in only one cohort; their missing count is set to 0.
# Counts are coerced to numeric to avoid type issues further down.
combined_freq <- cases_freq %>%
  full_join(controls_freq, by = "ID", suffix = c("_cases", "_controls")) %>%
  mutate(
    AlleleFreq_cases = coalesce(as.numeric(AlleleFreq_cases), 0),
    AlleleFreq_controls = coalesce(as.numeric(AlleleFreq_controls), 0)
  )

# Total number of alleles (chromosomes) per cohort.
# NOTE(review): assumes diploid genotypes and that every cohort member appears
# at least once in its input file. If samples without any listed variant
# exist, replace these with the true cohort sizes.
total_cases_alleles <- 2 * n_distinct(cases$Sample)
total_controls_alleles <- 2 * n_distinct(controls$Sample)

# A count above the cohort total would give a negative cell, which
# fisher.test() rejects; fail early with a clearer message.
if (any(combined_freq$AlleleFreq_cases > total_cases_alleles) ||
  any(combined_freq$AlleleFreq_controls > total_controls_alleles)) {
  stop(
    "A variant's allele count exceeds the cohort's total allele number; ",
    "check for duplicated sample/variant rows or wrong cohort totals.",
    call. = FALSE
  )
}

# Build one 2x2 contingency table per variant:
#   rows    = cases, controls
#   columns = alleles carrying the variant, remaining alleles
combined_freq <- combined_freq %>%
  mutate(
    matrix = map2(
      AlleleFreq_cases, AlleleFreq_controls,
      ~ matrix(
        c(
          .x, total_cases_alleles - .x,
          .y, total_controls_alleles - .y
        ),
        nrow = 2, byrow = TRUE
      )
    )
  )

# Perform Fisher's Exact Test on every contingency table.
combined_freq <- combined_freq %>%
  mutate(fisher = map(matrix, ~ fisher.test(.x)))

# Extract p-values and odds ratios.
results <- combined_freq %>%
  transmute(
    ID = ID,
    p.value = map_dbl(fisher, ~ .x$p.value),
    odds_ratio = map_dbl(fisher, ~ .x$estimate[["odds ratio"]])
  )

# Write to CSV
write.csv(results, "3A_fishers_results.csv", row.names = FALSE)