initial commit

dphiggs01 · Jul 9, 2018 · 61b7299 · 61b7299
1 parent f3d953b
commit 61b7299
Show file tree

Hide file tree

Showing 10 changed files with 16,922 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,10 @@
+Package: wormcat
+Title: What the Package Does (one line, title case)
+Version: 0.0.0.9000
+Authors@R: person("First", "Last", email = "first.last@example.com", role = c("aut", "cre"))
+Description: What the package does (one paragraph).
+Depends: R (>= 3.4.3)
+License: What license is it under?
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 6.0.1.9000
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,3 @@
+# Generated by roxygen2: do not edit by hand
+
+export(worm_cat_fun)
diff --git a/R/worm_cat_acceptable_pvalues.R b/R/worm_cat_acceptable_pvalues.R
@@ -0,0 +1,20 @@
+
+
+.worm_cat_acceptable_pvalues <- function(csv_file_name) {
+
+  acceptable_pvalues  <- read.csv(csv_file_name, row.names=1, header=TRUE, sep =",")
+  acceptable_pvalues <- na.omit(acceptable_pvalues)
+
+  acceptable_pvalues[order(acceptable_pvalues$PValue),]
+
+  acceptable_pvalues <- subset(acceptable_pvalues, PValue < 0.05)
+
+  acceptable_pvalues <- rbind(acceptable_pvalues, data.frame(Category = "calibration high", RGS = 250, AC = 0, PValue = 1.00E-50))
+  acceptable_pvalues <- rbind(acceptable_pvalues, data.frame(Category = "calibration low", RGS = 1, AC = 0, PValue = 1))
+
+  csv_out_file_nm <-sprintf("%s_apv.csv",substr(csv_file_name, 0, nchar(csv_file_name)-4))
+  write.csv(acceptable_pvalues, file = csv_out_file_nm,row.names=FALSE)
+}
+
+
+
diff --git a/R/worm_cat_add_categories.R b/R/worm_cat_add_categories.R
@@ -0,0 +1,29 @@
+## apply category 3 to regulated gene set then split categories
+
+.worm_cat_add_categories <- function(csv_fnm, out_dir, worm_cat_annotations){
+
+  RGS  <- read.csv(csv_fnm, header=TRUE, sep =",")
+
+  annotations  <- read.csv(worm_cat_annotations, header=TRUE, sep =",")
+
+  #Remove dupliplicates from RGS
+  RGS_df <- data.frame(unique(RGS))
+
+  #Remove dupliplicates from annotations
+  #annotations_df <- data.frame(unique(annotations$Sequence.ID))
+  annotations_df <- annotations[!duplicated(annotations$Sequence.ID), ]
+
+  # remove other columns
+  annotations_clean <- annotations_df[c(1,2,3,4,5,7)]
+
+  # merge.x
+  RGS_merge <-  merge(RGS_df, annotations_clean, by = "Sequence.ID", all.x = TRUE)
+
+  # create Cat1, Cat2, Cat3 files
+  Cat <- RGS_merge[c(3,4,5)]
+
+  # Save csv
+  write.csv(Cat, file = paste(out_dir,"/rgs_and_categories.csv", sep=""))
+
+}
+
diff --git a/R/worm_cat_bubble_plot.R b/R/worm_cat_bubble_plot.R
@@ -0,0 +1,79 @@
+library(ggplot2)
+library(plotflow)
+library(scales)
+library(ggthemes)
+library(pander)
+
+#RGS-Regulated Gene Sets
+
+#3.	Run bubble chart
+#Will need to select categories (only p values  < 0.05 or only metabolism)
+#Will need to match file name/category name in script
+#Will need to add calibrations for high and low (see arf_cat2UP.csv) so that colors and sizes stay consistent between graphs
+
+.worm_cat_bubble_plot <- function(csv_file_name, plot_title) {
+
+  dev.list()
+
+  bubbles  <- read.csv(csv_file_name, header=TRUE, sep =",")
+
+  bubbles <- na.omit(bubbles)
+
+  bubbles$bubbles_z <- round(0.001 * (bubbles$PValue - mean(bubbles$PValue))/sd(bubbles$PValue), 2)  # compute normalized value as a placeholder
+
+  bubbles$p_value_type <- ifelse(bubbles$PValue < 1e-40, "Col1",
+                                         ifelse(bubbles$PValue < 1e-20, "Col2",
+                                               ifelse(bubbles$PValue < 1e-10, "Col3",
+                                                       ifelse(bubbles$PValue < 1e-5, "Col4",
+                                                              ifelse(bubbles$PValue < 1e-2, "Col5",
+                                                                     ifelse(bubbles$PValue < 5e-2, "Col6","NS"))))))
+
+
+  bubbles$RGS_size <- ifelse(bubbles$RGS > 150, "Size1",
+                                               ifelse(bubbles$RGS > 100, "Size2",
+                                                      ifelse(bubbles$RGS > 75, "Size3",
+                                                             ifelse(bubbles$RGS > 50, "Size4",
+                                                                    ifelse(bubbles$RGS > 25, "Size5",
+                                                                           ifelse(bubbles$RGS > 10, "Size6",
+                                                                                  ifelse(bubbles$RGS > 5, "Size7",
+                                                                                      ifelse(bubbles$RGS > 2, "Size8","Size9"))))))))
+
+
+  bubbles_plot <- bubbles[order(bubbles$PValue), ]  # sort
+
+  # Plot
+  myplot <- ggplot(reorder_by(Category, ~ -PValue, bubbles_plot), aes(x=`Category`, y=bubbles_z, label=bubbles_z, size = bubbles_plot$RGS_size)) +
+
+    geom_point(stat='identity', aes(col=bubbles_plot$p_value_type))  +
+
+    scale_color_manual(name="P value",
+                       labels = c("10-40", "10-20", "10-10", "10-5", "0.001", "0.05", "NS"),
+                       values = c("Col1"="red4", "Col2" = "orangered3", "Col3" = "orangered1",
+                                  "Col4" = "orange", "Col5" = "gold", "Col6" = "yellow", "NS"="grey48")) +
+    scale_size_manual(name="Count",
+                      labels = c( "150", "100", "75", "50", "25", "10","5", "2", "Zero"),
+                      values = c( "Size1" = 10, "Size2" = 9,
+                                 "Size3" = 8, "Size4" = 7, "Size5" = 6,"Size6" = 5, "Size7" = 2.5, "Size8" = 1,"Size9"=0.1)) +
+
+
+    labs(title=plot_title) +
+    ylim(-1.0, 1.0) +
+    coord_flip()
+
+
+  myplot + theme(panel.grid = element_blank(), panel.background = element_blank(), legend.key = element_rect(fill = "white", colour = "white"))
+
+  s_from <- 0
+  s_to <- nchar(csv_file_name)-4
+  file_out_name <- sprintf("%s.png",substr(csv_file_name, s_from, s_to))
+
+  #ggsave(file_out_name, width = 5.5, height = 5, useDingbats=F)
+  ggsave(file_out_name, width = 5.5, height = 5)
+
+  print(myplot)
+
+  dev.off()
+}
+
+
+
diff --git a/R/worm_cat_controller.R b/R/worm_cat_controller.R
@@ -0,0 +1,58 @@
+#rm(list= ls())
+
+#source("worm_cat_add_categories.R")
+#source("worm_cat_fisher_test.R")
+#source("worm_cat_acceptable_pvalues.R")
+#source("worm_cat_bubble_plot.R")
+
+#' Worm Cat Function
+#'
+#' This function take a regulated gene set and the category annotations and runs a fisher test.
+#' @param file_to_process the file to be processes
+#' @param title the title for your bubble plots
+#' @param rm_dir Boolean If FALSE do not remove temp dir. If TRUE remove temp dir
+#' @keywords worm cat
+#' @export
+#' @examples
+#' worm_cat_fun()
+worm_cat_fun <- function(file_to_process, title="rgs", rm_dir=FALSE){
+    mainDir <- getwd()
+    subDir <- paste("worm-cat_",format(Sys.time(), "%b-%d-%Y-%H:%M:%S"), sep="")
+    subDirPath <- paste("./",subDir, sep="")
+    dir.create(file.path(mainDir, subDir))
+
+    #worm_cat_annotations <- "annotations_jul-09-2018.csv"
+    worm_cat_annotations <- system.file("extdata", "annotations_jul-09-2018.csv", package="wormcat")
+
+    .worm_cat_add_categories(file_to_process, subDirPath, worm_cat_annotations)
+
+    .worm_cat_fisher_test(subDirPath, worm_cat_annotations)
+
+    # For each of the three files made above:
+    # 1. Parse the file to only include the entries with "acceptable pvalues"
+    # 2. Create bubble plots for each of the three categories based on the accepteble pvlaues
+    for(i in 1:3) {
+
+      cat_file_to_process <- sprintf("./%s/rgs_fisher_cat%d.csv", subDir, i)
+      print(sprintf("Processed %s",cat_file_to_process))
+      .worm_cat_acceptable_pvalues(cat_file_to_process)
+
+      cat_pvalue_file_to_process <- sprintf("./%s/rgs_fisher_cat%d_apv.csv",subDir, i)
+
+      plot_titles <- c(paste(title, "category1", sep=":"),
+                       paste(title, "category2", sep=":"),
+                       paste(title, "category3", sep=":"))
+      .worm_cat_bubble_plot(cat_pvalue_file_to_process, plot_titles[i])
+    }
+
+    files2zip <- dir(subDirPath, full.names = TRUE)
+    zip(zipfile = subDir, files = files2zip)
+    if(rm_dir == TRUE){
+      print("cleaning up")
+       unlink(subDir, TRUE)
+    }
+}
+
+
+
+
diff --git a/R/worm_cat_fisher_test.R b/R/worm_cat_fisher_test.R
@@ -0,0 +1,121 @@
+# Goal: generate p values for enrichment of annotation terms in RNA seq data
+# Step 1: Count categories in annotation files *AC*, will stay static
+# Step 2: Count categories in regulated gene sets *RGS*, make this easy to switch in alternate data sets, *RGS_a, b, c, etc.*
+# Step 3: Generate data frame: AC and RGS
+# Step 4: Build contengency table for each category in RGS vs AC, use a for loop to build the contengency tables.
+# Step 5: Use Fisher.test to generate P value for enrichment of specific categories in RGS
+
+library("data.table")
+library("plyr")
+
+.worm_cat_fisher_test <- function(output_dir, worm_cat_annotations){
+  # read in  csv files, create data frames
+  AC <- read.csv(worm_cat_annotations,header = TRUE, sep = ",")
+
+  csv_file <- paste(output_dir,"/rgs_and_categories.csv", sep="")
+  RGS <- read.csv(csv_file,header = TRUE, sep = ",")
+
+  # Step 1 Count categories in annotation files *AC*, will stay static
+
+  total_count <- data.frame(nrow(AC))
+
+  total_nrow <- rename(total_count, c("nrow.AC." = "nrow"))
+
+  #print(total_nrow)
+  #print(total_nrow$nrow)
+  #print(total_annotated)
+
+  total_annotated_cat1 <- data.frame(table(AC$Category.1))
+
+  total_annotated_cat2 <- data.frame(table(AC$Category.2))
+
+  total_annotated_cat3 <- data.frame(table(AC$Category.3))
+
+  # Step 2/3: Count categories in regulated gene sets *RGS*
+
+  RGS_count <- data.frame(nrow(RGS))
+
+  #print(RGS_count)
+
+  RGS_nrow <- rename(RGS_count, c("nrow.RGS." = "nrow"))
+
+  #print(RGS_nrow)
+  #print(RGS_nrow$nrow)
+
+  RGS_annotated_cat1 <- data.frame(table(RGS$Category.1))
+
+  RGS_annotated_cat2 <- data.frame(table(RGS$Category.2))
+
+  RGS_annotated_cat3 <- data.frame(table(RGS$Category.3))
+
+  .merger_cats(RGS_annotated_cat1, total_annotated_cat1,total_nrow$nrow, RGS_nrow$nrow, .out_file_nm(output_dir,1))
+  .merger_cats(RGS_annotated_cat2, total_annotated_cat2,total_nrow$nrow, RGS_nrow$nrow, .out_file_nm(output_dir,2))
+  .merger_cats(RGS_annotated_cat3, total_annotated_cat3,total_nrow$nrow, RGS_nrow$nrow, .out_file_nm(output_dir,3))
+}
+
+###########################
+# Step 4: Merge data frames
+.merger_cats <- function(UP_annotated_cat, total_annotated_cat,total_all_cat, total_rgs_cat, file_nm) {
+
+  cat_a <- merge(UP_annotated_cat, total_annotated_cat, by = "Var1", all.x = TRUE)
+
+  #print(head(cat_a))
+
+  cat_b <- rename(cat_a, c("Var1" = "Category", "Freq.x" = "RGS", "Freq.y" = "AC" ))
+
+  # Step 5: Build contengency table for each category in RGS vs AC
+
+  #"a =cat_b$Category",
+
+  #"x = cat_b$RGS",
+
+  #"y = cat_b$AC"
+
+  # con <- file("test.log")
+  #  con2 <- file("/Users/danhiggins/Code/R_Workspace/Fisher-Test/test1.log")
+  #  sink(con, append=TRUE)
+  #  sink(con, append=TRUE, type="message")
+
+  df <- data.frame(Category=character(),
+                   RGS=double(),
+                   AC=double(),
+                   PValue=double(),
+                   stringsAsFactors=FALSE)
+
+  fact_character <- levels(cat_b$Category)[as.numeric(cat_b$Category)]
+
+  for(i in 1:nrow(cat_b)) {
+    #a <- cat_b$Category[i]
+    #print("xxxxxxxxxxx")
+    #print(RGS_nrow$nrow)
+    #print(total_nrow$nrow)
+    #print(cat_b$Category[i])
+    #print(total_rgs_cat)
+    #print(total_all_cat)
+    #print(fact_character[i])
+    #print(cat_b$RGS[i])
+    #print(cat_b$AC[i])
+    if(is.na(cat_b$RGS[i]) | is.na(cat_b$AC[i])){
+      pvalue <- NA
+    }else{
+      stat <- fisher.test(matrix(c(cat_b$RGS[i],total_rgs_cat,
+                                   cat_b$AC[i],total_all_cat),nrow=2,ncol=2),alternative="greater")
+      pvalue <- stat$p.value
+    }
+    #print(pvalue)
+    #print(fact_character[i])
+    #print("xxxxxxxxxxx")
+
+    df[nrow(df) + 1,] = list(Category=fact_character[i],RGS=cat_b$RGS[i], AC=cat_b$AC[i],pvalue)
+  }
+
+  write.csv(df, file = file_nm)
+}
+
+.out_file_nm <- function(output_dir,n){
+  sprintf("%s/rgs_fisher_cat%d.csv",output_dir, n)
+}
+
+
+
+