<a href="https://colab.research.google.com/github/comparativechrono/100KGP_enrichment/blob/main/PRS/case_control_PRS_tidy_R.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Load the Weights File
#
# The file is read without a header row, so the columns are labelled here:
# ID is the variant identifier and Weight is the value later multiplied by the
# allele count.
variant_weights <- stats::setNames(
  read.table(
    file = "variant_weights.txt",
    header = FALSE,
    stringsAsFactors = FALSE
  ),
  c("ID", "Weight")
)

# Calculate Individual PRS

# Read a per-sample variant table and attach each row's PRS contribution.
#
# Args:
#   file_path: Path to a headerless table readable by read.table(). Its
#     columns are labelled, in order: Sample, Chrom, Pos, ID, Ref, Alt, Qual,
#     Filter, GT, GQ, DP.
#   weights: Data frame with an ID column (the join key) and a numeric Weight
#     column.
#
# Returns:
#   A data frame holding the rows whose ID also occurs in `weights` (merge()
#   performs an inner join), with the extra columns AlleleCount, Weight and
#   PRS (= AlleleCount * Weight). Genotypes containing a missing allele (".")
#   yield NA for AlleleCount and PRS.
parse_vcf_info_with_prs <- function(file_path, weights) {
  vcf_columns <- c(
    "Sample", "Chrom", "Pos", "ID", "Ref", "Alt",
    "Qual", "Filter", "GT", "GQ", "DP"
  )

  vcf_data <- read.table(file_path, header = FALSE, stringsAsFactors = FALSE)
  if (ncol(vcf_data) < length(vcf_columns)) {
    # Naming would fail anyway; fail with a message that points at the input.
    stop(
      "Expected at least ", length(vcf_columns), " columns in '", file_path,
      "' but found ", ncol(vcf_data), ".",
      call. = FALSE
    )
  }
  names(vcf_data) <- vcf_columns

  # Genotypes may be unphased ("0/1") or phased ("0|1"), so split on either
  # separator. as.character() guards against read.table() having inferred a
  # non-character column (e.g. when every call is haploid such as "1").
  allele_calls <- strsplit(as.character(vcf_data$GT), "[/|]")

  # vapply() guarantees one numeric value per row regardless of input shape.
  # NOTE(review): allele indices are summed as-is, so a multi-allelic call
  # such as "1/2" counts as 3 -- confirm the inputs are biallelic.
  vcf_data$AlleleCount <- vapply(
    allele_calls,
    function(alleles) {
      # "." marks a missing allele call; map it to NA explicitly so the
      # genotype becomes NA without emitting a coercion warning.
      alleles[alleles %in% "."] <- NA_character_
      sum(as.numeric(alleles))
    },
    numeric(1),
    USE.NAMES = FALSE
  )

  # Join with weights to calculate the per-variant PRS contribution
  vcf_data <- merge(vcf_data, weights, by = "ID")
  vcf_data$PRS <- vcf_data$AlleleCount * vcf_data$Weight

  vcf_data
}

# Calculate PRS for cases and controls
# Both cohorts are scored against the same weights table.
cases_with_prs <- parse_vcf_info_with_prs(
  file_path = "cases.txt",
  weights = variant_weights
)
controls_with_prs <- parse_vcf_info_with_prs(
  file_path = "control.txt",
  weights = variant_weights
)

# Aggregate PRS and Calculate Mean and Standard Deviation

# Aggregate PRS by sample
#
# Collapses a per-variant table to one row per sample by summing the PRS
# column within each Sample value.
#
# Args:
#   data: Data frame containing at least the columns Sample and PRS.
#
# Returns:
#   A data frame with the columns Sample and PRS (the per-sample total).
aggregate_prs <- function(data) {
  per_sample_total <- stats::aggregate(PRS ~ Sample, data = data, FUN = sum)
  per_sample_total
}

# One total PRS per sample for each cohort
cases_prs_summary <- aggregate_prs(data = cases_with_prs)
controls_prs_summary <- aggregate_prs(data = controls_with_prs)

# Write aggregated PRS to CSV (row names are omitted from the files)
write.csv(
  x = cases_prs_summary,
  file = "cases_prs_summary.csv",
  row.names = FALSE
)
write.csv(
  x = controls_prs_summary,
  file = "controls_prs_summary.csv",
  row.names = FALSE
)


# Calculate mean and standard deviation for cases and controls
#
# Args:
#   prs_data: Data frame (or list) with a PRS element holding the scores.
#
# Returns:
#   A list with two elements: `mean` and `sd` of the PRS values.
summary_statistics <- function(prs_data) {
  scores <- prs_data$PRS
  list(
    mean = mean(scores),
    sd = sd(scores)
  )
}

# Mean and SD of the per-sample PRS for each cohort
cases_stats <- summary_statistics(prs_data = cases_prs_summary)
controls_stats <- summary_statistics(prs_data = controls_prs_summary)

# Print summary statistics
print("Cases PRS Mean and SD:")
print(cases_stats)

print("Controls PRS Mean and SD:")
print(controls_stats)

# Prepare summary statistics for output: one row per group
group_labels <- c("Cases", "Controls")
group_means <- c(cases_stats[["mean"]], controls_stats[["mean"]])
group_sds <- c(cases_stats[["sd"]], controls_stats[["sd"]])
summary_stats_df <- data.frame(
  Group = group_labels,
  Mean_PRS = group_means,
  SD_PRS = group_sds
)

# Write summary statistics to CSV (row names are omitted from the file)
write.csv(
  x = summary_stats_df,
  file = "summary_statistics_prs.csv",
  row.names = FALSE
)


[1] "Cases PRS Mean and SD:"
$mean
[1] 2.9

$sd
[1] 1.473092

[1] "Controls PRS Mean and SD:"
$mean
[1] 1.52

$sd
[1] 0.6058052

