Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Dan Higgins committed Jul 9, 2018
1 parent f3d953b commit 61b7299
Show file tree
Hide file tree
Showing 10 changed files with 16,922 additions and 0 deletions.
10 changes: 10 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Package: wormcat
Title: What the Package Does (one line, title case)
Version: 0.0.0.9000
Authors@R: person("First", "Last", email = "first.last@example.com", role = c("aut", "cre"))
Description: What the package does (one paragraph).
Depends: R (>= 3.4.3)
License: What license is it under?
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.0.1.9000
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Generated by roxygen2: do not edit by hand

export(worm_cat_fun)
20 changes: 20 additions & 0 deletions R/worm_cat_acceptable_pvalues.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@


.worm_cat_acceptable_pvalues <- function(csv_file_name) {

acceptable_pvalues <- read.csv(csv_file_name, row.names=1, header=TRUE, sep =",")
acceptable_pvalues <- na.omit(acceptable_pvalues)

acceptable_pvalues[order(acceptable_pvalues$PValue),]

acceptable_pvalues <- subset(acceptable_pvalues, PValue < 0.05)

acceptable_pvalues <- rbind(acceptable_pvalues, data.frame(Category = "calibration high", RGS = 250, AC = 0, PValue = 1.00E-50))
acceptable_pvalues <- rbind(acceptable_pvalues, data.frame(Category = "calibration low", RGS = 1, AC = 0, PValue = 1))

csv_out_file_nm <-sprintf("%s_apv.csv",substr(csv_file_name, 0, nchar(csv_file_name)-4))
write.csv(acceptable_pvalues, file = csv_out_file_nm,row.names=FALSE)
}



29 changes: 29 additions & 0 deletions R/worm_cat_add_categories.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
## apply category 3 to regulated gene set then split categories

.worm_cat_add_categories <- function(csv_fnm, out_dir, worm_cat_annotations){

RGS <- read.csv(csv_fnm, header=TRUE, sep =",")

annotations <- read.csv(worm_cat_annotations, header=TRUE, sep =",")

#Remove dupliplicates from RGS
RGS_df <- data.frame(unique(RGS))

#Remove dupliplicates from annotations
#annotations_df <- data.frame(unique(annotations$Sequence.ID))
annotations_df <- annotations[!duplicated(annotations$Sequence.ID), ]

# remove other columns
annotations_clean <- annotations_df[c(1,2,3,4,5,7)]

# merge.x
RGS_merge <- merge(RGS_df, annotations_clean, by = "Sequence.ID", all.x = TRUE)

# create Cat1, Cat2, Cat3 files
Cat <- RGS_merge[c(3,4,5)]

# Save csv
write.csv(Cat, file = paste(out_dir,"/rgs_and_categories.csv", sep=""))

}

79 changes: 79 additions & 0 deletions R/worm_cat_bubble_plot.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
library(ggplot2)
library(plotflow)
library(scales)
library(ggthemes)
library(pander)

#RGS-Regulated Gene Sets

#3. Run bubble chart
#Will need to select categories (only p values < 0.05 or only metabolism)
#Will need to match file name/category name in script
#Will need to add calibrations for high and low (see arf_cat2UP.csv) so that colors and sizes stay consistent between graphs

.worm_cat_bubble_plot <- function(csv_file_name, plot_title) {

dev.list()

bubbles <- read.csv(csv_file_name, header=TRUE, sep =",")

bubbles <- na.omit(bubbles)

bubbles$bubbles_z <- round(0.001 * (bubbles$PValue - mean(bubbles$PValue))/sd(bubbles$PValue), 2) # compute normalized value as a placeholder

bubbles$p_value_type <- ifelse(bubbles$PValue < 1e-40, "Col1",
ifelse(bubbles$PValue < 1e-20, "Col2",
ifelse(bubbles$PValue < 1e-10, "Col3",
ifelse(bubbles$PValue < 1e-5, "Col4",
ifelse(bubbles$PValue < 1e-2, "Col5",
ifelse(bubbles$PValue < 5e-2, "Col6","NS"))))))


bubbles$RGS_size <- ifelse(bubbles$RGS > 150, "Size1",
ifelse(bubbles$RGS > 100, "Size2",
ifelse(bubbles$RGS > 75, "Size3",
ifelse(bubbles$RGS > 50, "Size4",
ifelse(bubbles$RGS > 25, "Size5",
ifelse(bubbles$RGS > 10, "Size6",
ifelse(bubbles$RGS > 5, "Size7",
ifelse(bubbles$RGS > 2, "Size8","Size9"))))))))


bubbles_plot <- bubbles[order(bubbles$PValue), ] # sort

# Plot
myplot <- ggplot(reorder_by(Category, ~ -PValue, bubbles_plot), aes(x=`Category`, y=bubbles_z, label=bubbles_z, size = bubbles_plot$RGS_size)) +

geom_point(stat='identity', aes(col=bubbles_plot$p_value_type)) +

scale_color_manual(name="P value",
labels = c("10-40", "10-20", "10-10", "10-5", "0.001", "0.05", "NS"),
values = c("Col1"="red4", "Col2" = "orangered3", "Col3" = "orangered1",
"Col4" = "orange", "Col5" = "gold", "Col6" = "yellow", "NS"="grey48")) +
scale_size_manual(name="Count",
labels = c( "150", "100", "75", "50", "25", "10","5", "2", "Zero"),
values = c( "Size1" = 10, "Size2" = 9,
"Size3" = 8, "Size4" = 7, "Size5" = 6,"Size6" = 5, "Size7" = 2.5, "Size8" = 1,"Size9"=0.1)) +


labs(title=plot_title) +
ylim(-1.0, 1.0) +
coord_flip()


myplot + theme(panel.grid = element_blank(), panel.background = element_blank(), legend.key = element_rect(fill = "white", colour = "white"))

s_from <- 0
s_to <- nchar(csv_file_name)-4
file_out_name <- sprintf("%s.png",substr(csv_file_name, s_from, s_to))

#ggsave(file_out_name, width = 5.5, height = 5, useDingbats=F)
ggsave(file_out_name, width = 5.5, height = 5)

print(myplot)

dev.off()
}



58 changes: 58 additions & 0 deletions R/worm_cat_controller.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#rm(list= ls())

#source("worm_cat_add_categories.R")
#source("worm_cat_fisher_test.R")
#source("worm_cat_acceptable_pvalues.R")
#source("worm_cat_bubble_plot.R")

#' Worm Cat Function
#'
#' This function take a regulated gene set and the category annotations and runs a fisher test.
#' @param file_to_process the file to be processes
#' @param title the title for your bubble plots
#' @param rm_dir Boolean If FALSE do not remove temp dir. If TRUE remove temp dir
#' @keywords worm cat
#' @export
#' @examples
#' worm_cat_fun()
worm_cat_fun <- function(file_to_process, title="rgs", rm_dir=FALSE){
mainDir <- getwd()
subDir <- paste("worm-cat_",format(Sys.time(), "%b-%d-%Y-%H:%M:%S"), sep="")
subDirPath <- paste("./",subDir, sep="")
dir.create(file.path(mainDir, subDir))

#worm_cat_annotations <- "annotations_jul-09-2018.csv"
worm_cat_annotations <- system.file("extdata", "annotations_jul-09-2018.csv", package="wormcat")

.worm_cat_add_categories(file_to_process, subDirPath, worm_cat_annotations)

.worm_cat_fisher_test(subDirPath, worm_cat_annotations)

# For each of the three files made above:
# 1. Parse the file to only include the entries with "acceptable pvalues"
# 2. Create bubble plots for each of the three categories based on the accepteble pvlaues
for(i in 1:3) {

cat_file_to_process <- sprintf("./%s/rgs_fisher_cat%d.csv", subDir, i)
print(sprintf("Processed %s",cat_file_to_process))
.worm_cat_acceptable_pvalues(cat_file_to_process)

cat_pvalue_file_to_process <- sprintf("./%s/rgs_fisher_cat%d_apv.csv",subDir, i)

plot_titles <- c(paste(title, "category1", sep=":"),
paste(title, "category2", sep=":"),
paste(title, "category3", sep=":"))
.worm_cat_bubble_plot(cat_pvalue_file_to_process, plot_titles[i])
}

files2zip <- dir(subDirPath, full.names = TRUE)
zip(zipfile = subDir, files = files2zip)
if(rm_dir == TRUE){
print("cleaning up")
unlink(subDir, TRUE)
}
}




121 changes: 121 additions & 0 deletions R/worm_cat_fisher_test.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# Goal: generate p values for enrichment of annotation terms in RNA seq data
# Step 1: Count categories in annotation files *AC*, will stay static
# Step 2: Count categories in regulated gene sets *RGS*, make this easy to switch in alternate data sets, *RGS_a, b, c, etc.*
# Step 3: Generate data frame: AC and RGS
# Step 4: Build contengency table for each category in RGS vs AC, use a for loop to build the contengency tables.
# Step 5: Use Fisher.test to generate P value for enrichment of specific categories in RGS

library("data.table")
library("plyr")

.worm_cat_fisher_test <- function(output_dir, worm_cat_annotations){
# read in csv files, create data frames
AC <- read.csv(worm_cat_annotations,header = TRUE, sep = ",")

csv_file <- paste(output_dir,"/rgs_and_categories.csv", sep="")
RGS <- read.csv(csv_file,header = TRUE, sep = ",")

# Step 1 Count categories in annotation files *AC*, will stay static

total_count <- data.frame(nrow(AC))

total_nrow <- rename(total_count, c("nrow.AC." = "nrow"))

#print(total_nrow)
#print(total_nrow$nrow)
#print(total_annotated)

total_annotated_cat1 <- data.frame(table(AC$Category.1))

total_annotated_cat2 <- data.frame(table(AC$Category.2))

total_annotated_cat3 <- data.frame(table(AC$Category.3))

# Step 2/3: Count categories in regulated gene sets *RGS*

RGS_count <- data.frame(nrow(RGS))

#print(RGS_count)

RGS_nrow <- rename(RGS_count, c("nrow.RGS." = "nrow"))

#print(RGS_nrow)
#print(RGS_nrow$nrow)

RGS_annotated_cat1 <- data.frame(table(RGS$Category.1))

RGS_annotated_cat2 <- data.frame(table(RGS$Category.2))

RGS_annotated_cat3 <- data.frame(table(RGS$Category.3))

.merger_cats(RGS_annotated_cat1, total_annotated_cat1,total_nrow$nrow, RGS_nrow$nrow, .out_file_nm(output_dir,1))
.merger_cats(RGS_annotated_cat2, total_annotated_cat2,total_nrow$nrow, RGS_nrow$nrow, .out_file_nm(output_dir,2))
.merger_cats(RGS_annotated_cat3, total_annotated_cat3,total_nrow$nrow, RGS_nrow$nrow, .out_file_nm(output_dir,3))
}

###########################
# Step 4: Merge data frames
.merger_cats <- function(UP_annotated_cat, total_annotated_cat,total_all_cat, total_rgs_cat, file_nm) {

cat_a <- merge(UP_annotated_cat, total_annotated_cat, by = "Var1", all.x = TRUE)

#print(head(cat_a))

cat_b <- rename(cat_a, c("Var1" = "Category", "Freq.x" = "RGS", "Freq.y" = "AC" ))

# Step 5: Build contengency table for each category in RGS vs AC

#"a =cat_b$Category",

#"x = cat_b$RGS",

#"y = cat_b$AC"

# con <- file("test.log")
# con2 <- file("/Users/danhiggins/Code/R_Workspace/Fisher-Test/test1.log")
# sink(con, append=TRUE)
# sink(con, append=TRUE, type="message")

df <- data.frame(Category=character(),
RGS=double(),
AC=double(),
PValue=double(),
stringsAsFactors=FALSE)

fact_character <- levels(cat_b$Category)[as.numeric(cat_b$Category)]

for(i in 1:nrow(cat_b)) {
#a <- cat_b$Category[i]
#print("xxxxxxxxxxx")
#print(RGS_nrow$nrow)
#print(total_nrow$nrow)
#print(cat_b$Category[i])
#print(total_rgs_cat)
#print(total_all_cat)
#print(fact_character[i])
#print(cat_b$RGS[i])
#print(cat_b$AC[i])
if(is.na(cat_b$RGS[i]) | is.na(cat_b$AC[i])){
pvalue <- NA
}else{
stat <- fisher.test(matrix(c(cat_b$RGS[i],total_rgs_cat,
cat_b$AC[i],total_all_cat),nrow=2,ncol=2),alternative="greater")
pvalue <- stat$p.value
}
#print(pvalue)
#print(fact_character[i])
#print("xxxxxxxxxxx")

df[nrow(df) + 1,] = list(Category=fact_character[i],RGS=cat_b$RGS[i], AC=cat_b$AC[i],pvalue)
}

write.csv(df, file = file_nm)
}

.out_file_nm <- function(output_dir,n){
sprintf("%s/rgs_fisher_cat%d.csv",output_dir, n)
}




Loading

0 comments on commit 61b7299

Please sign in to comment.