Skip to content
This repository has been archived by the owner on May 22, 2024. It is now read-only.

Commit

Permalink
Use 'by' instead of array_agg(distinct) for factors
Browse files Browse the repository at this point in the history
It turns out that array_agg(distinct) uses a lot of memory and is slow.
  • Loading branch information
Qian, Hai committed Apr 18, 2014
1 parent 6aca9c6 commit c9b5789
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 17 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ Package: PivotalR
Type: Package
Title: R front-end to PostgreSQL and Pivotal (Greenplum) database,
wrapper for MADlib
Version: 0.1.15.41
Version: 0.1.15.42
Date: 2014-03-10
Author: Predictive Analytics Team at Pivotal Inc. <user@madlib.net>,
with contributions from Data Scientist Team at Pivotal Inc.
Expand Down
35 changes: 20 additions & 15 deletions R/method-as.db.data.frame_.R
Original file line number Diff line number Diff line change
Expand Up @@ -261,25 +261,30 @@ setMethod (
factor.ref <- rep(as.character(NA), length(x@.is.factor))
if (pivot && !all(x@.is.factor == FALSE)) {
cats <- x@.expr[x@.is.factor]
sql <- "select "
for (i in seq_len(length(cats))) {
sql <- paste(sql, "array_agg(distinct case when ", cats[i],
" is NULL then 'NULL' else (",
cats[i], ")::text end) as ",
"distinct_", i, sep = "")
if (i != length(cats)) sql <- paste(sql, ",", sep = "")
}
## scan through the table only once
sql <- paste(sql, " from ", tbl, where, sep = "")
#distincts <- .db.getQuery(sql, conn.id)
distincts <- db.q(sql, conn.id = conn.id, verbose = FALSE)
idx <- 0
## sql <- "select "
## for (i in seq_len(length(cats))) {
## sql <- paste(sql, "array_agg(distinct case when ", cats[i],
## " is NULL then 'NULL' else (",
## cats[i], ")::text end) as ",
## "distinct_", i, sep = "")
## if (i != length(cats)) sql <- paste(sql, ",", sep = "")
## }
## ## scan through the table only once
## sql <- paste(sql, " from ", tbl, where, sep = "")
## #distincts <- .db.getQuery(sql, conn.id)
## distincts <- db.q(sql, conn.id = conn.id, verbose = FALSE)
## idx <- 0
for (i in seq_len(length(x@.is.factor))) {
if (x@.is.factor[i]) {
idx <- idx + 1
distinct <- as.vector(arraydb.to.arrayr(distincts[[paste("distinct_",idx,sep="")]], "character"))
## idx <- idx + 1
## distinct <- as.vector(arraydb.to.arrayr(distincts[[paste("distinct_",idx,sep="")]], "character"))
## Produce a fixed order for distinct values
## distinct <- .strip(distinct[order(distinct, decreasing = TRUE)], "\"")

distinct <- lk(by(x[[i]], x[[i]], function(s) as.character(s)), -1)[,1]
distinct[is.na(distinct)] <- "NULL"
distinct <- .strip(distinct[order(distinct, decreasing = TRUE)], "\"")

if (is.na(x@.factor.ref[i]))
avoid <- distinct[length(distinct)]
else
Expand Down
2 changes: 1 addition & 1 deletion R/utility-generic.R
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ arraydb.to.arrayr <- function (str, type = "double", n = 1)
names(fake) <- names(data)
for (i in seq_len(l)) {
if (data@.is.factor[i]) {
fake[,i] <- distinct[[data@.col.name[i]]]
fake[,i] <- array(distinct[[data@.col.name[i]]], dim = c(max.level, 1))
fake[,i] <- as.factor(fake[,i])
fake[,i] <- relevel(fake[,i],
ref = ref[[data@.col.name[i]]])
Expand Down

0 comments on commit c9b5789

Please sign in to comment.