Skip to content

Commit

Permalink
version 0.2
Browse files Browse the repository at this point in the history
  • Loading branch information
tambeta authored and gaborcsardi committed Nov 22, 2012
0 parents commit 99bd2c2
Show file tree
Hide file tree
Showing 7 changed files with 410 additions and 0 deletions.
15 changes: 15 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,15 @@
Package: gProfileR
Version: 0.2
License: GPL (>= 2)
Description: A package for functional enrichment analysis via the
g:Profiler toolkit (http://biit.cs.ut.ee/gprofiler/).
Title: g:ProfileR
Author: Juri Reimand <juri.reimand@ut.ee>, Raivo Kolde
<rkolde@gmail.com>, Tambet Arak <tambet.arak@gmail.com>
Maintainer: Tambet Arak <tambet.arak@gmail.com>
Depends: R (>= 2.10)
Imports: RCurl, plyr
Collate: 'gProfileR.R'
Packaged: 2012-11-22 13:38:39 UTC; arak
Repository: CRAN
Date/Publication: 2012-11-22 18:35:04
6 changes: 6 additions & 0 deletions MD5
@@ -0,0 +1,6 @@
33f338c1343a224c3004d2f07bc17fe4 *DESCRIPTION
e9a407c892de029c40b2587fda85fa77 *NAMESPACE
7ff7605b07fad3e2363e93b902ea11af *R/gProfileR.R
db6446338dd3e5427f41a3f1f307005f *man/gconvert.Rd
6731a119d46db7711a1aa2f0f438d744 *man/gorth.Rd
5bdd29aa699e82d4c44f085966dc8775 *man/gprofiler.Rd
3 changes: 3 additions & 0 deletions NAMESPACE
@@ -0,0 +1,3 @@
export(gorth)
export(gprofiler)
export(gconvert)
302 changes: 302 additions & 0 deletions R/gProfileR.R
@@ -0,0 +1,302 @@
#' Find orthologs.
#'
#' Interface to the g:Orth tool. Organism names are constructed by concatenating the first letter of
#' the name and the family name. Example: human - 'hsapiens', mouse - 'mmusculus'.
#'
#' To alleviate the problem of having many orthologs per gene (most of
#' them uninformative) one can set a threshold for the number of results. The program
#' tries to find the most informative by selecting the most popular ones.
#'
#' @param query list of gene IDs to be translated.
#' @param source_organism name of the source organism.
#' @param target_organism name of the target organism.
#' @param region_query interpret query as chromosomal ranges.
#' @param numeric_ns namespace to use for fully numeric IDs.
#' @param mthreshold maximum number of ortholog names per gene to show.
#' @param df logical indicating whether the output will be a data.frame or list.
#' @return The output can be either a list or a data.frame. The list has an entry for every input gene.
#' The data frame is just a two column table with inputs and corrsponding outputs. The input names may be duplicated.
#' @references J. Reimand, M. Kull, H. Peterson, J. Hansen, J. Vilo: g:Profiler -- a web-based toolset for functional profiling of gene lists from large-scale experiments (2007) NAR 35 W193-W200
#' @author Raivo Kolde <rkolde@@gmail.com>, Juri Reimand <juri.reimand@@ut.ee>, Tambet Arak <tambet.arak@@gmail.com>
#' @examples
#' gorth(c("Klf4", "Pax5", "Sox2", "Nanog"), source_organism = "mmusculus", target_organism = "hsapiens")
#' @export
gorth <- function(
query,
source_organism = "hsapiens",
target_organism = "mmusculus",
region_query = F,
numeric_ns = "",
mthreshold = 3,
df = T
){
my_url <- "http://biit.cs.ut.ee/gprofiler/gorth.cgi"

if(length(query) == 0) return(NULL)

rcurl_opts = RCurl::curlOptions(useragent = "gProfileR")
raw_query <- RCurl::postForm(my_url, .opts = rcurl_opts,
output = 'mini',
query = paste(query, collapse = " "),
organism = source_organism,
target = target_organism,
region_query = ifelse(region_query, "1", "0"),
prefix = numeric_ns
)

conn <- textConnection(raw_query)
tab <- read.table(conn, sep = "\t")
close(conn)

colnames(tab)[2] <- "Source"

res = plyr::dlply(tab, "Source", function(x){

x = x[x$V6 != "N/A",]
if(nrow(x) == 0){
return(NULL)
}

if(length(x$V6) < mthreshold + 1){
return(as.character(x$V6))
}
else{
return(names(rev(sort(table(as.character(x$V6)))))[1:mthreshold])
}
})

if(df){
res <- plyr::ldply(res, function(x) data.frame(Target = x))
}

return(res)
}

#' Annotate gene list functionally.
#'
#' Interface to the g:Profiler tool for finding enrichments in gene lists. Organism names are constructed by concatenating the first letter of
#' the name and the family name. Example: human - 'hsapiens', mouse - 'mmusculus'.
#'
#' @param organism organism name.
#' @param query list of gene IDs or a list of such lists.
#' @param ordered_query in case output gene lists are ranked this option may be used
#' to get GSEA style p-values.
#' @param significant whether all or only statistically significant results should be
#' returned.
#' @param exclude_iea exclude electronic annotations (IEA).
#' @param region_query interpret query as chromosomal ranges.
#' @param max_p_value custom p-value threshold, results with a larger p-value are excluded.
#' @param max_set_size maximum size of functional category, larger categories are excluded.
#' @param correction_method the algorithm used for determining the significance threshold, one of
#' "gSCS", "fdr", "bonferroni".
#' @param hier_filtering hierarchical filtering strength, one of "none", "moderate", "strong".
#' @param domain_size statistical domain size, one of "annotated", "known".
#' @param custom_bg vector of gene names to use as a statistical background.
#' @param numeric_ns namespace to use for fully numeric IDs.
#' @return Data frame with the enrichment analysis results. If the input consisted of several lists the corresponding list is indicated with a variable 'query number'.
#' @references J. Reimand, M. Kull, H. Peterson, J. Hansen, J. Vilo: g:Profiler - a web-based toolset for functional profiling of gene lists from large-scale experiments (2007) NAR 35 W193-W200
#' @author Juri Reimand <jyri.reimand@@ut.ee>, Raivo Kolde <rkolde@@gmail.com>, Tambet Arak <tambet.arak@@gmail.com>
#' @examples
#' gprofiler(c("Klf4", "Pax5", "Sox2", "Nanog"), organism = "mmusculus")
#' @export
gprofiler <- function(
query,
organism='hsapiens',
ordered_query=F,
significant=T,
exclude_iea=F,
region_query = F,
max_p_value=1.0,
max_set_size=0,
correction_method="analytical",
hier_filtering="none",
domain_size="annotated",
custom_bg="",
numeric_ns = ""
) {
my_url <- "http://biit.cs.ut.ee/gprofiler/gcocoa.cgi"
query_url = ""

# Query

if (is.list(query)) {
for (i in 1:length(query)) {
query_url <- paste(sep="\n", query_url, paste("> query", i), paste(query[[i]], collapse=" "))
}
} else if (is.vector(query)) {
query_url <- paste(query, collapse=" ")
} else {
print("ERROR missing query")
return()
}

# Significance threshold

if (correction_method == "gSCS")
correction_method = "analytical"
if (!correction_method %in% c("analytical", "fdr", "bonferroni"))
stop("Multiple testing correction method not recognized")

# Hierarchical filtering

if (!hier_filtering %in% c("none", "moderate", "strong"))
stop("hier_filtering must be one of \"none\", \"moderate\" or \"strong\"")
if (hier_filtering == "strong") hier_filtering = "compact_ccomp"
else if (hier_filtering == "moderate") hier_filtering = "compact_rgroups"
else hier_filtering = ""

# Domain size

if (!domain_size %in% c("annotated", "known"))
stop("domain_size must be one of \"annotated\" or \"known\"")

# Custom background

if (is.vector(custom_bg))
custom_bg = paste(custom_bg, collapse=" ")
else
stop("custom_bg must be a vector")

# Max. set size

if (max_set_size < 0) max_set_size = 0

rcurl_opts = RCurl::curlOptions(useragent = "gProfileR")
raw_query <- RCurl::postForm(my_url, .opts = rcurl_opts,
organism=organism,
query=query_url,
output='mini',
analytical="1",
sort_by_structure="1",
ordered_query = ifelse(ordered_query, "1", "0"),
significant = ifelse(significant, "1", "0"),
no_iea = ifelse(exclude_iea, "1", "0"),
region_query = ifelse(region_query, "1", "0"),
user_thr = as.character(max_p_value),
max_set_size = as.character(max_set_size),
threshold_algo = correction_method,
hierfiltering = hier_filtering,
domain_size_type = domain_size,
custbg_file = "",
custbg = custom_bg,
prefix = numeric_ns
)

split_query <- unlist(strsplit(raw_query, split="\n"))

commented_lines <- grep("^#", split_query)
if (length(commented_lines)>0) {
split_query <- split_query[-commented_lines]
}

empty_lines <- which(split_query == "")
if (length(empty_lines)>0) {
split_query <- split_query[-empty_lines]
}

if(length(split_query) > 0){
conn <- textConnection(paste(split_query, collapse = "\n"))
split_query <- read.table(conn, sep = "\t", quote = "")
close(conn)
}
else{
split_query <- as.data.frame(matrix(NA, 0, 14))
}

rownames(split_query) <- NULL
colnames(split_query) <- c(
"query.number", "significant", "p.value",
"term.size", "query.size", "overlap.size",
"precision", "recall", "term.id",
"domain", "subgraph.number", "term.name",
"relative.depth", "intersection"
)
split_query$term.name <- gsub("^\\s+", "", split_query$term.name)
split_query$significant <- ifelse(split_query$significant == "!", T, F)

if(is.list(query) & !is.null(names(query))){
split_query$query.number <- names(query)[split_query$query.number]
}

return(split_query)
}

#' Convert gene IDs.
#'
#' Interface to the g:Convert tool. Organism names are constructed by concatenating the first letter of
#' the name and the family name. Example: human - 'hsapiens', mouse - 'mmusculus'.
#'
#' @param query list of gene IDs.
#' @param organism organism name.
#' @param target target namespace.
#' @param region_query interpret query as chromosomal ranges.
#' @param numeric_ns namespace to use for fully numeric IDs.
#' @param df logical indicating whether the output will be a data.frame or list.
#' @return The output can be either list or data.frame. List has an entry for every input gene. Data frame is just a two column table with inputs and corrsponding outputs. The input names may be duplicated.
#' @references J. Reimand, M. Kull, H. Peterson, J. Hansen, J. Vilo: g:Profiler - a web-based toolset for functional profiling of gene lists from large-scale experiments (2007) NAR 35 W193-W200
#' @author Juri Reimand <jyri.reimand@@ut.ee>, Raivo Kolde <rkolde@@gmail.com>, Tambet Arak <tambet.arak@@gmail.com>
#' @examples
#' gconvert(c("Klf4", "Pax5", "Sox2", "Nanog"), organism = "mmusculus")
#'
#' # Get all mouse Cell cycle genes
#' gconvert(c("GO:0007049"), organism = "mmusculus")
#' @export
gconvert = function(
query,
organism = "hsapiens",
target = "ENSG",
region_query = F,
numeric_ns = "",
df = T
) {

url = "http://biit.cs.ut.ee/gprofiler/gconvert.cgi"
field = 4

if (target == "DESC") {
field = 6
target = "ENSG"
}

rcurl_opts = RCurl::curlOptions(useragent = "gProfileR")
raw_query <- RCurl::postForm(url, .opts = rcurl_opts,
organism=organism,
output='mini',
query=paste(query, collapse="+"),
target=target,
region_query = ifelse(region_query, "1", "0"),
prefix = numeric_ns
)

split_query = unlist(strsplit(raw_query, split="\n"))

commented_lines = grep("^#", split_query)
if (length(commented_lines)>0) {
split_query = split_query[-commented_lines]
}
empty_lines = which(split_query == "")
if (length(empty_lines)>0) {
split_query = split_query[-empty_lines]
}

split_query = t(as.data.frame(strsplit(split="\t", split_query)))
rownames(split_query) = c()
resulting_mapping = list()

if (nrow(split_query)) {
for (id in unique(toupper(query))) {
my_map = split_query[split_query[,2]==id,,drop=F][,field]
my_map = my_map[my_map != "N/A"]
resulting_mapping[[id]] = my_map
}
}

if(df){
resulting_mapping <- plyr::ldply(resulting_mapping, function(x) data.frame(Target = x))
}

return(resulting_mapping)
}



24 changes: 24 additions & 0 deletions man/gconvert.Rd
@@ -0,0 +1,24 @@
\name{gconvert}
\alias{gconvert}
\title{Convert gene IDs.}
\usage{gconvert(query, organism="hsapiens", target="ENSG", region_query=F,
numeric_ns="", df=T)
}
\description{Convert gene IDs.}
\details{Interface to the g:Convert tool. Organism names are constructed by concatenating the first letter of
the name and the family name. Example: human - 'hsapiens', mouse - 'mmusculus'.}
\value{The output can be either list or data.frame. List has an entry for every input gene. Data frame is just a two column table with inputs and corrsponding outputs. The input names may be duplicated.}
\references{J. Reimand, M. Kull, H. Peterson, J. Hansen, J. Vilo: g:Profiler - a web-based toolset for functional profiling of gene lists from large-scale experiments (2007) NAR 35 W193-W200}
\author{Juri Reimand <jyri.reimand@ut.ee>, Raivo Kolde <rkolde@gmail.com>, Tambet Arak <tambet.arak@gmail.com>}
\arguments{\item{query}{list of gene IDs.}
\item{organism}{organism name.}
\item{target}{target namespace.}
\item{region_query}{interpret query as chromosomal ranges.}
\item{numeric_ns}{namespace to use for fully numeric IDs.}
\item{df}{logical indicating whether the output will be a data.frame or list.}
}
\examples{gconvert(c("Klf4", "Pax5", "Sox2", "Nanog"), organism = "mmusculus")

# Get all mouse Cell cycle genes
gconvert(c("GO:0007049"), organism = "mmusculus")}

27 changes: 27 additions & 0 deletions man/gorth.Rd
@@ -0,0 +1,27 @@
\name{gorth}
\alias{gorth}
\title{Find orthologs.}
\usage{gorth(query, source_organism="hsapiens", target_organism="mmusculus",
region_query=F, numeric_ns="", mthreshold=3, df=T)
}
\description{Find orthologs.}
\details{Interface to the g:Orth tool. Organism names are constructed by concatenating the first letter of
the name and the family name. Example: human - 'hsapiens', mouse - 'mmusculus'.

To alleviate the problem of having many orthologs per gene (most of
them uninformative) one can set a threshold for the number of results. The program
tries to find the most informative by selecting the most popular ones.}
\value{The output can be either a list or a data.frame. The list has an entry for every input gene.
The data frame is just a two column table with inputs and corrsponding outputs. The input names may be duplicated.}
\references{J. Reimand, M. Kull, H. Peterson, J. Hansen, J. Vilo: g:Profiler -- a web-based toolset for functional profiling of gene lists from large-scale experiments (2007) NAR 35 W193-W200}
\author{Raivo Kolde <rkolde@gmail.com>, Juri Reimand <juri.reimand@ut.ee>, Tambet Arak <tambet.arak@gmail.com>}
\arguments{\item{query}{list of gene IDs to be translated.}
\item{source_organism}{name of the source organism.}
\item{target_organism}{name of the target organism.}
\item{region_query}{interpret query as chromosomal ranges.}
\item{numeric_ns}{namespace to use for fully numeric IDs.}
\item{mthreshold}{maximum number of ortholog names per gene to show.}
\item{df}{logical indicating whether the output will be a data.frame or list.}
}
\examples{gorth(c("Klf4", "Pax5", "Sox2", "Nanog"), source_organism = "mmusculus", target_organism = "hsapiens")}

0 comments on commit 99bd2c2

Please sign in to comment.