<a href="https://colab.research.google.com/github/comparativechrono/100KGP_enrichment/blob/main/case_control_enrichment_with_baseR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Using a wide data format is the tidy R approach; however, if you are working on a clinical computing system without the tidyverse, here is a base R implementation of the same approach.
# Parse the Files

# Read a headerless, whitespace-delimited table of per-sample variant calls
# and add an AlleleCount column derived from the genotype (GT) field.
#
# file_path: path to a file with exactly 11 columns, in the order named below.
# Returns:   the data frame with the 11 named columns plus a numeric
#            AlleleCount column (NA where the genotype is missing).
parse_vcf_info <- function(file_path) {
  vcf_data <- read.table(file_path, header = FALSE, stringsAsFactors = FALSE)
  colnames(vcf_data) <- c(
    "Sample", "Chrom", "Pos", "ID", "Ref", "Alt",
    "Qual", "Filter", "GT", "GQ", "DP"
  )

  # read.table() type-converts columns, so GT is not guaranteed to be
  # character; strsplit() requires character input.
  # Genotypes may be unphased ("0/1") or phased ("0|1"), so split on both.
  genotypes <- strsplit(as.character(vcf_data$GT), "[/|]")

  # Sum the allele indices of each call. With 0/1 coding this is the number
  # of alternate alleles carried by the sample.
  # NOTE(review): for multi-allelic indices (e.g. "1/2") the sum is not an
  # allele count -- confirm inputs are biallelic.
  vcf_data$AlleleCount <- vapply(genotypes, function(gt) {
    gt[gt %in% "."] <- NA # missing call: propagate NA without a coercion warning
    sum(as.numeric(gt))
  }, numeric(1))

  vcf_data
}

# Load the per-sample variant calls for each cohort from the working directory
cases <- parse_vcf_info(file_path = "cases.txt")
controls <- parse_vcf_info(file_path = "control.txt")

# Calculate Allele Frequencies

# Sum AlleleCount per variant ID.
#
# data:    a data frame with columns ID and AlleleCount.
# Returns: a data frame with one row per ID and the summed AlleleCount.
#          Rows with a missing ID or AlleleCount are ignored (this is the
#          default na.action of aggregate()'s formula interface).
aggregate_allele_freq <- function(data) {
  stopifnot(all(c("ID", "AlleleCount") %in% names(data)))

  # aggregate() raises an error when no complete rows remain, so return an
  # empty result with the same columns instead of failing on empty input.
  usable <- !is.na(data$ID) & !is.na(data$AlleleCount)
  if (!any(usable)) {
    return(data.frame(
      ID = data$ID[0],
      AlleleCount = numeric(0),
      stringsAsFactors = FALSE
    ))
  }

  aggregate(AlleleCount ~ ID, data = data, FUN = sum)
}

# Per-variant allele totals for each cohort
cases_freq <- aggregate_allele_freq(data = cases)
controls_freq <- aggregate_allele_freq(data = controls)

# Generate Matrices for Fisher's Test

# Outer-join the per-variant counts so that variants observed in only one
# cohort are kept; a variant absent from a cohort contributes zero alleles
# there (an inner join would silently drop e.g. case-only variants).
combined_freq <- merge(
  cases_freq, controls_freq,
  by = "ID", suffixes = c("_cases", "_controls"), all = TRUE
)
combined_freq$AlleleCount_cases[is.na(combined_freq$AlleleCount_cases)] <- 0
combined_freq$AlleleCount_controls[is.na(combined_freq$AlleleCount_controls)] <- 0

# Total number of alleles analysed per cohort.
# NOTE(review): presumably 80 cases and 230 controls with two alleles each --
# confirm these match the actual cohort sizes.
total_cases_alleles <- 80 * 2
total_controls_alleles <- 230 * 2

# A count above the cohort total would produce a negative cell, which
# fisher.test() rejects; fail here with a clearer message.
if (any(combined_freq$AlleleCount_cases > total_cases_alleles) ||
    any(combined_freq$AlleleCount_controls > total_controls_alleles)) {
  stop(
    "Observed allele count exceeds the cohort's total allele number; ",
    "check total_cases_alleles / total_controls_alleles.",
    call. = FALSE
  )
}

# One 2x2 matrix per variant, filled column-wise:
#   column 1 = cases    (counted alleles, remaining alleles)
#   column 2 = controls (counted alleles, remaining alleles)
# seq_len() keeps this safe when there are zero variants (1:0 would iterate).
combined_freq$matrix <- lapply(seq_len(nrow(combined_freq)), function(i) {
  n_cases <- combined_freq$AlleleCount_cases[i]
  n_controls <- combined_freq$AlleleCount_controls[i]
  matrix(c(
    n_cases, total_cases_alleles - n_cases,
    n_controls, total_controls_alleles - n_controls
  ), nrow = 2)
})

# Perform fishers exact test

# Run Fisher's exact test on every variant's 2x2 matrix; each list element is
# the htest object returned by fisher.test().
combined_freq$fisher <- lapply(X = combined_freq$matrix, FUN = fisher.test)

# Extract Results and Write to CSV

# Pull the p-value and odds-ratio estimate out of each test result.
# vapply() guarantees numeric columns even when there are zero variants
# (sapply() would return an empty list and yield a malformed data frame).
results <- data.frame(
  ID = combined_freq$ID,
  p.value = vapply(
    combined_freq$fisher,
    function(x) x$p.value,
    numeric(1)
  ),
  odds_ratio = vapply(
    combined_freq$fisher,
    function(x) {
      # A result without an estimate is reported as NA
      if (is.null(x$estimate)) {
        NA_real_
      } else {
        unname(x$estimate["odds ratio"])
      }
    },
    numeric(1)
  )
)

# One row per variant, written to the working directory without row names
write.csv(results, "3A_fishers_results.csv", row.names = FALSE)

