-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Dan Higgins
committed
Jul 9, 2018
1 parent
f3d953b
commit 61b7299
Showing
10 changed files
with
16,922 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
Package: wormcat | ||
Title: What the Package Does (one line, title case) | ||
Version: 0.0.0.9000 | ||
Authors@R: person("First", "Last", email = "first.last@example.com", role = c("aut", "cre")) | ||
Description: What the package does (one paragraph). | ||
Depends: R (>= 3.4.3) | ||
License: What license is it under? | ||
Encoding: UTF-8 | ||
LazyData: true | ||
RoxygenNote: 6.0.1.9000 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Generated by roxygen2: do not edit by hand | ||
|
||
export(worm_cat_fun) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
|
||
|
||
.worm_cat_acceptable_pvalues <- function(csv_file_name) { | ||
|
||
acceptable_pvalues <- read.csv(csv_file_name, row.names=1, header=TRUE, sep =",") | ||
acceptable_pvalues <- na.omit(acceptable_pvalues) | ||
|
||
acceptable_pvalues[order(acceptable_pvalues$PValue),] | ||
|
||
acceptable_pvalues <- subset(acceptable_pvalues, PValue < 0.05) | ||
|
||
acceptable_pvalues <- rbind(acceptable_pvalues, data.frame(Category = "calibration high", RGS = 250, AC = 0, PValue = 1.00E-50)) | ||
acceptable_pvalues <- rbind(acceptable_pvalues, data.frame(Category = "calibration low", RGS = 1, AC = 0, PValue = 1)) | ||
|
||
csv_out_file_nm <-sprintf("%s_apv.csv",substr(csv_file_name, 0, nchar(csv_file_name)-4)) | ||
write.csv(acceptable_pvalues, file = csv_out_file_nm,row.names=FALSE) | ||
} | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
## apply category 3 to regulated gene set then split categories | ||
|
||
.worm_cat_add_categories <- function(csv_fnm, out_dir, worm_cat_annotations){ | ||
|
||
RGS <- read.csv(csv_fnm, header=TRUE, sep =",") | ||
|
||
annotations <- read.csv(worm_cat_annotations, header=TRUE, sep =",") | ||
|
||
#Remove dupliplicates from RGS | ||
RGS_df <- data.frame(unique(RGS)) | ||
|
||
#Remove dupliplicates from annotations | ||
#annotations_df <- data.frame(unique(annotations$Sequence.ID)) | ||
annotations_df <- annotations[!duplicated(annotations$Sequence.ID), ] | ||
|
||
# remove other columns | ||
annotations_clean <- annotations_df[c(1,2,3,4,5,7)] | ||
|
||
# merge.x | ||
RGS_merge <- merge(RGS_df, annotations_clean, by = "Sequence.ID", all.x = TRUE) | ||
|
||
# create Cat1, Cat2, Cat3 files | ||
Cat <- RGS_merge[c(3,4,5)] | ||
|
||
# Save csv | ||
write.csv(Cat, file = paste(out_dir,"/rgs_and_categories.csv", sep="")) | ||
|
||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
library(ggplot2) | ||
library(plotflow) | ||
library(scales) | ||
library(ggthemes) | ||
library(pander) | ||
|
||
#RGS-Regulated Gene Sets | ||
|
||
#3. Run bubble chart | ||
#Will need to select categories (only p values < 0.05 or only metabolism) | ||
#Will need to match file name/category name in script | ||
#Will need to add calibrations for high and low (see arf_cat2UP.csv) so that colors and sizes stay consistent between graphs | ||
|
||
.worm_cat_bubble_plot <- function(csv_file_name, plot_title) { | ||
|
||
dev.list() | ||
|
||
bubbles <- read.csv(csv_file_name, header=TRUE, sep =",") | ||
|
||
bubbles <- na.omit(bubbles) | ||
|
||
bubbles$bubbles_z <- round(0.001 * (bubbles$PValue - mean(bubbles$PValue))/sd(bubbles$PValue), 2) # compute normalized value as a placeholder | ||
|
||
bubbles$p_value_type <- ifelse(bubbles$PValue < 1e-40, "Col1", | ||
ifelse(bubbles$PValue < 1e-20, "Col2", | ||
ifelse(bubbles$PValue < 1e-10, "Col3", | ||
ifelse(bubbles$PValue < 1e-5, "Col4", | ||
ifelse(bubbles$PValue < 1e-2, "Col5", | ||
ifelse(bubbles$PValue < 5e-2, "Col6","NS")))))) | ||
|
||
|
||
bubbles$RGS_size <- ifelse(bubbles$RGS > 150, "Size1", | ||
ifelse(bubbles$RGS > 100, "Size2", | ||
ifelse(bubbles$RGS > 75, "Size3", | ||
ifelse(bubbles$RGS > 50, "Size4", | ||
ifelse(bubbles$RGS > 25, "Size5", | ||
ifelse(bubbles$RGS > 10, "Size6", | ||
ifelse(bubbles$RGS > 5, "Size7", | ||
ifelse(bubbles$RGS > 2, "Size8","Size9")))))))) | ||
|
||
|
||
bubbles_plot <- bubbles[order(bubbles$PValue), ] # sort | ||
|
||
# Plot | ||
myplot <- ggplot(reorder_by(Category, ~ -PValue, bubbles_plot), aes(x=`Category`, y=bubbles_z, label=bubbles_z, size = bubbles_plot$RGS_size)) + | ||
|
||
geom_point(stat='identity', aes(col=bubbles_plot$p_value_type)) + | ||
|
||
scale_color_manual(name="P value", | ||
labels = c("10-40", "10-20", "10-10", "10-5", "0.001", "0.05", "NS"), | ||
values = c("Col1"="red4", "Col2" = "orangered3", "Col3" = "orangered1", | ||
"Col4" = "orange", "Col5" = "gold", "Col6" = "yellow", "NS"="grey48")) + | ||
scale_size_manual(name="Count", | ||
labels = c( "150", "100", "75", "50", "25", "10","5", "2", "Zero"), | ||
values = c( "Size1" = 10, "Size2" = 9, | ||
"Size3" = 8, "Size4" = 7, "Size5" = 6,"Size6" = 5, "Size7" = 2.5, "Size8" = 1,"Size9"=0.1)) + | ||
|
||
|
||
labs(title=plot_title) + | ||
ylim(-1.0, 1.0) + | ||
coord_flip() | ||
|
||
|
||
myplot + theme(panel.grid = element_blank(), panel.background = element_blank(), legend.key = element_rect(fill = "white", colour = "white")) | ||
|
||
s_from <- 0 | ||
s_to <- nchar(csv_file_name)-4 | ||
file_out_name <- sprintf("%s.png",substr(csv_file_name, s_from, s_to)) | ||
|
||
#ggsave(file_out_name, width = 5.5, height = 5, useDingbats=F) | ||
ggsave(file_out_name, width = 5.5, height = 5) | ||
|
||
print(myplot) | ||
|
||
dev.off() | ||
} | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
#rm(list= ls()) | ||
|
||
#source("worm_cat_add_categories.R") | ||
#source("worm_cat_fisher_test.R") | ||
#source("worm_cat_acceptable_pvalues.R") | ||
#source("worm_cat_bubble_plot.R") | ||
|
||
#' Worm Cat Function | ||
#' | ||
#' This function take a regulated gene set and the category annotations and runs a fisher test. | ||
#' @param file_to_process the file to be processes | ||
#' @param title the title for your bubble plots | ||
#' @param rm_dir Boolean If FALSE do not remove temp dir. If TRUE remove temp dir | ||
#' @keywords worm cat | ||
#' @export | ||
#' @examples | ||
#' worm_cat_fun() | ||
worm_cat_fun <- function(file_to_process, title="rgs", rm_dir=FALSE){ | ||
mainDir <- getwd() | ||
subDir <- paste("worm-cat_",format(Sys.time(), "%b-%d-%Y-%H:%M:%S"), sep="") | ||
subDirPath <- paste("./",subDir, sep="") | ||
dir.create(file.path(mainDir, subDir)) | ||
|
||
#worm_cat_annotations <- "annotations_jul-09-2018.csv" | ||
worm_cat_annotations <- system.file("extdata", "annotations_jul-09-2018.csv", package="wormcat") | ||
|
||
.worm_cat_add_categories(file_to_process, subDirPath, worm_cat_annotations) | ||
|
||
.worm_cat_fisher_test(subDirPath, worm_cat_annotations) | ||
|
||
# For each of the three files made above: | ||
# 1. Parse the file to only include the entries with "acceptable pvalues" | ||
# 2. Create bubble plots for each of the three categories based on the accepteble pvlaues | ||
for(i in 1:3) { | ||
|
||
cat_file_to_process <- sprintf("./%s/rgs_fisher_cat%d.csv", subDir, i) | ||
print(sprintf("Processed %s",cat_file_to_process)) | ||
.worm_cat_acceptable_pvalues(cat_file_to_process) | ||
|
||
cat_pvalue_file_to_process <- sprintf("./%s/rgs_fisher_cat%d_apv.csv",subDir, i) | ||
|
||
plot_titles <- c(paste(title, "category1", sep=":"), | ||
paste(title, "category2", sep=":"), | ||
paste(title, "category3", sep=":")) | ||
.worm_cat_bubble_plot(cat_pvalue_file_to_process, plot_titles[i]) | ||
} | ||
|
||
files2zip <- dir(subDirPath, full.names = TRUE) | ||
zip(zipfile = subDir, files = files2zip) | ||
if(rm_dir == TRUE){ | ||
print("cleaning up") | ||
unlink(subDir, TRUE) | ||
} | ||
} | ||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
# Goal: generate p values for enrichment of annotation terms in RNA seq data | ||
# Step 1: Count categories in annotation files *AC*, will stay static | ||
# Step 2: Count categories in regulated gene sets *RGS*, make this easy to switch in alternate data sets, *RGS_a, b, c, etc.* | ||
# Step 3: Generate data frame: AC and RGS | ||
# Step 4: Build contengency table for each category in RGS vs AC, use a for loop to build the contengency tables. | ||
# Step 5: Use Fisher.test to generate P value for enrichment of specific categories in RGS | ||
|
||
library("data.table") | ||
library("plyr") | ||
|
||
.worm_cat_fisher_test <- function(output_dir, worm_cat_annotations){ | ||
# read in csv files, create data frames | ||
AC <- read.csv(worm_cat_annotations,header = TRUE, sep = ",") | ||
|
||
csv_file <- paste(output_dir,"/rgs_and_categories.csv", sep="") | ||
RGS <- read.csv(csv_file,header = TRUE, sep = ",") | ||
|
||
# Step 1 Count categories in annotation files *AC*, will stay static | ||
|
||
total_count <- data.frame(nrow(AC)) | ||
|
||
total_nrow <- rename(total_count, c("nrow.AC." = "nrow")) | ||
|
||
#print(total_nrow) | ||
#print(total_nrow$nrow) | ||
#print(total_annotated) | ||
|
||
total_annotated_cat1 <- data.frame(table(AC$Category.1)) | ||
|
||
total_annotated_cat2 <- data.frame(table(AC$Category.2)) | ||
|
||
total_annotated_cat3 <- data.frame(table(AC$Category.3)) | ||
|
||
# Step 2/3: Count categories in regulated gene sets *RGS* | ||
|
||
RGS_count <- data.frame(nrow(RGS)) | ||
|
||
#print(RGS_count) | ||
|
||
RGS_nrow <- rename(RGS_count, c("nrow.RGS." = "nrow")) | ||
|
||
#print(RGS_nrow) | ||
#print(RGS_nrow$nrow) | ||
|
||
RGS_annotated_cat1 <- data.frame(table(RGS$Category.1)) | ||
|
||
RGS_annotated_cat2 <- data.frame(table(RGS$Category.2)) | ||
|
||
RGS_annotated_cat3 <- data.frame(table(RGS$Category.3)) | ||
|
||
.merger_cats(RGS_annotated_cat1, total_annotated_cat1,total_nrow$nrow, RGS_nrow$nrow, .out_file_nm(output_dir,1)) | ||
.merger_cats(RGS_annotated_cat2, total_annotated_cat2,total_nrow$nrow, RGS_nrow$nrow, .out_file_nm(output_dir,2)) | ||
.merger_cats(RGS_annotated_cat3, total_annotated_cat3,total_nrow$nrow, RGS_nrow$nrow, .out_file_nm(output_dir,3)) | ||
} | ||
|
||
########################### | ||
# Step 4: Merge data frames | ||
.merger_cats <- function(UP_annotated_cat, total_annotated_cat,total_all_cat, total_rgs_cat, file_nm) { | ||
|
||
cat_a <- merge(UP_annotated_cat, total_annotated_cat, by = "Var1", all.x = TRUE) | ||
|
||
#print(head(cat_a)) | ||
|
||
cat_b <- rename(cat_a, c("Var1" = "Category", "Freq.x" = "RGS", "Freq.y" = "AC" )) | ||
|
||
# Step 5: Build contengency table for each category in RGS vs AC | ||
|
||
#"a =cat_b$Category", | ||
|
||
#"x = cat_b$RGS", | ||
|
||
#"y = cat_b$AC" | ||
|
||
# con <- file("test.log") | ||
# con2 <- file("/Users/danhiggins/Code/R_Workspace/Fisher-Test/test1.log") | ||
# sink(con, append=TRUE) | ||
# sink(con, append=TRUE, type="message") | ||
|
||
df <- data.frame(Category=character(), | ||
RGS=double(), | ||
AC=double(), | ||
PValue=double(), | ||
stringsAsFactors=FALSE) | ||
|
||
fact_character <- levels(cat_b$Category)[as.numeric(cat_b$Category)] | ||
|
||
for(i in 1:nrow(cat_b)) { | ||
#a <- cat_b$Category[i] | ||
#print("xxxxxxxxxxx") | ||
#print(RGS_nrow$nrow) | ||
#print(total_nrow$nrow) | ||
#print(cat_b$Category[i]) | ||
#print(total_rgs_cat) | ||
#print(total_all_cat) | ||
#print(fact_character[i]) | ||
#print(cat_b$RGS[i]) | ||
#print(cat_b$AC[i]) | ||
if(is.na(cat_b$RGS[i]) | is.na(cat_b$AC[i])){ | ||
pvalue <- NA | ||
}else{ | ||
stat <- fisher.test(matrix(c(cat_b$RGS[i],total_rgs_cat, | ||
cat_b$AC[i],total_all_cat),nrow=2,ncol=2),alternative="greater") | ||
pvalue <- stat$p.value | ||
} | ||
#print(pvalue) | ||
#print(fact_character[i]) | ||
#print("xxxxxxxxxxx") | ||
|
||
df[nrow(df) + 1,] = list(Category=fact_character[i],RGS=cat_b$RGS[i], AC=cat_b$AC[i],pvalue) | ||
} | ||
|
||
write.csv(df, file = file_nm) | ||
} | ||
|
||
.out_file_nm <- function(output_dir,n){ | ||
sprintf("%s/rgs_fisher_cat%d.csv",output_dir, n) | ||
} | ||
|
||
|
||
|
||
|
Oops, something went wrong.