### Analysis of the SR data (raw and smooth)
The SR methylation calls from BSsmooth were taken for EMSeq and TruSeq (TruMethyl is excluded because it has no distinct samples). Our aim is to compare methylation differences (DMRs) using Metilene and to see whether smoothing makes a difference in the called DMRs.

In [6]:
# RUN FIRST TIME
# install.packages("tidyverse")
# install.packages("wesanderson")
# install.packages("cowplot")


In [7]:
# Load packages
library(ggplot2)
library(data.table)
library(tidyverse)
library(wesanderson)
library(RColorBrewer)
library(viridis)
library(cowplot)


In [3]:
# Locate the short-read (SR) Metilene result files. The search path is relative,
# so the working directory is shown first to make the lookup location explicit.
getwd()
metilene_out1 <- list.files(
  path = "outputs/metilene/results",
  # Anchored with `$` so that only names ending in the exact suffix match
  # (e.g. a stray "*.sr.metilene.out.tsv.bak" is no longer picked up).
  pattern = "\\.sr\\.metilene\\.out\\.tsv$",
  full.names = TRUE,
  recursive = FALSE,
  ignore.case = FALSE
)
metilene_out1


In [4]:
# Build one summary row per Metilene result file: the two samples compared,
# the call type and library prep parsed from the filename, and the number of
# DMR calls (total and with V4 <= 0.1).

# Reduce a filename fragment to a bare sample ID: strip everything up to the
# last "_HG" (keeping "HG"), then strip everything from the first remaining "_".
extract_sample_id <- function(x) {
  sub("_.*", "", sub(".*_HG", "HG", x))
}

# Summarise a single result file as a one-row data frame.
# `path` is the file path; the metadata is parsed from its basename, which is
# expected to look like "<group1 part>__<group2 part>.<type>.sr.metilene.out.tsv".
summarise_metilene_file <- function(path) {
  file_name <- basename(path)

  # Empty files (no calls at all) cannot be parsed by read.table(), so they
  # keep counts of zero.
  n_total <- 0
  n_signif <- 0
  if (file.info(path)$size > 0) {
    calls <- read.table(path, header = FALSE)
    # as.numeric() keeps the count columns double regardless of which files
    # happen to be empty (nrow() alone returns an integer).
    n_total <- as.numeric(nrow(calls))
    # NOTE(review): V4 is presumably Metilene's q-value (adjusted p-value)
    # column; confirm against the Metilene output specification.
    n_signif <- as.numeric(nrow(dplyr::filter(calls, V4 <= 0.1)))
  }

  data.frame(
    # Part before the first "__" describes the first group.
    Group1 = extract_sample_id(sub("__.*", "", file_name)),
    # Part after the last "__", up to the first ".", describes the second group.
    Group2 = extract_sample_id(sub("\\..*", "", sub(".*__", "", file_name))),
    # Last dot-separated field once the fixed suffix is removed. The dots are
    # escaped and the pattern anchored so only the literal suffix is stripped.
    Type = sub(".*\\.", "", sub("\\.sr\\.metilene\\.out\\.tsv$", "", file_name)),
    # Token immediately preceding the first "_HG00<digit>" in the filename.
    Seq = sub(".*_", "", sub("_HG00[0-9].*", "", file_name)),
    count_total_calls = n_total,
    count_signf_calls = n_signif,
    stringsAsFactors = FALSE
  )
}

# Zero-row template with the expected schema; used when no result files exist
# so that downstream code still receives a well-formed data frame.
stats_template <- data.frame(
  Group1 = character(), Group2 = character(), Type = character(), Seq = character(),
  count_total_calls = numeric(), count_signf_calls = numeric(), stringsAsFactors = FALSE
)

per_file_stats <- lapply(metilene_out1, summarise_metilene_file)

df_file_stats <- if (length(per_file_stats) > 0) {
  do.call(rbind, per_file_stats)
} else {
  stats_template
}

# Label for the pairwise comparison, used as the x axis when plotting.
df_file_stats$Group <- paste(df_file_stats$Group1, df_file_stats$Group2, sep = "_")

# Show the summary table
df_file_stats


Group1,Group2,Type,Seq,count_total_calls,count_signf_calls,Group
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>
HG005,HG002,raw,TruSeq,2206,18,HG005_HG002
HG005,HG002,smoothed,TruSeq,0,0,HG005_HG002
HG002,HG005,raw,EMSeq,4199,61,HG002_HG005
HG002,HG005,smoothed,EMSeq,0,0,HG002_HG005


In [14]:
# Plot the DMR counts per comparison (total vs. significant calls) side by side
# and save the combined figure as a PDF.

# Dodged bar chart of one count column, faceted by `Seq` and filled by `Type`.
#   stats     data frame with columns Group, Type, Seq and the count column
#   count_col name (string) of the column mapped to the y axis
#   title     plot title
plot_dmr_counts <- function(stats, count_col, title) {
  ggplot(stats, aes(x = Group, y = .data[[count_col]], fill = Type)) +
    geom_col(position = "dodge") +
    facet_wrap(~ Seq) +
    scale_fill_manual(values = brewer.pal(3, "Dark2")) +
    labs(x = "", y = "#DMRs", title = title) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      plot.title = element_text(hjust = 0.5)
    )
}

plot1 <- plot_dmr_counts(df_file_stats, "count_total_calls", "Total Calls")
plot2 <- plot_dmr_counts(df_file_stats, "count_signf_calls", "Signf. Calls")

plot_compare <- plot_grid(plot1, plot2, labels = c("A", "B"), label_size = 8)

# NOTE(review): the output directory is absolute while the input files are
# listed relative to the working directory; confirm both point to the same
# location.
pdf(
  file = file.path("/opt/notebooks/outputs/metilene/results", "compare_sr_dmr_counts.pdf"),
  width = 10, height = 5
)
# Explicit print(): grid-based plots are only drawn on the open device when
# printed, and auto-printing does not happen under source() or inside functions.
print(plot_compare)
dev.off()

