Use 'by' instead of array_agg(distinct) for factors

It turns out that array_agg(distinct ...) uses a lot of memory and is slow, so the distinct values of each factor column are now computed with by() instead.
greenplum-db · Apr 18, 2014 · c9b5789 · c9b5789
1 parent 6aca9c6
commit c9b5789
Show file tree

Hide file tree

Showing 3 changed files with 22 additions and 17 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -2,7 +2,7 @@ Package: PivotalR
 Type: Package
 Title: R front-end to PostgreSQL and Pivotal (Greenplum) database,
         wrapper for MADlib
-Version: 0.1.15.41
+Version: 0.1.15.42
 Date: 2014-03-10
 Author: Predictive Analytics Team at Pivotal Inc. <user@madlib.net>,
         with contributions from Data Scientist Team at Pivotal Inc.

diff --git a/R/method-as.db.data.frame_.R b/R/method-as.db.data.frame_.R
@@ -261,25 +261,30 @@ setMethod (
         factor.ref <- rep(as.character(NA), length(x@.is.factor))
         if (pivot && !all(x@.is.factor == FALSE)) {
             cats <- x@.expr[x@.is.factor]
-            sql <- "select "
-            for (i in seq_len(length(cats))) {
-                sql <- paste(sql, "array_agg(distinct case when ", cats[i],
-                             " is NULL then 'NULL' else (",
-                             cats[i], ")::text end) as ",
-                             "distinct_", i, sep = "")
-                if (i != length(cats)) sql <- paste(sql, ",", sep = "")
-            }
-            ## scan through the table only once
-            sql <- paste(sql, " from ", tbl, where, sep = "")
-            #distincts <- .db.getQuery(sql, conn.id)
-            distincts <- db.q(sql, conn.id = conn.id, verbose = FALSE)
-            idx <- 0
+            ## sql <- "select "
+            ## for (i in seq_len(length(cats))) {
+            ##     sql <- paste(sql, "array_agg(distinct case when ", cats[i],
+            ##                  " is NULL then 'NULL' else (",
+            ##                  cats[i], ")::text end) as ",
+            ##                  "distinct_", i, sep = "")
+            ##     if (i != length(cats)) sql <- paste(sql, ",", sep = "")
+            ## }
+            ## ## scan through the table only once
+            ## sql <- paste(sql, " from ", tbl, where, sep = "")
+            ## #distincts <- .db.getQuery(sql, conn.id)
+            ## distincts <- db.q(sql, conn.id = conn.id, verbose = FALSE)
+            ## idx <- 0
             for (i in seq_len(length(x@.is.factor))) {
                 if (x@.is.factor[i]) {
-                    idx <- idx + 1
-                    distinct <- as.vector(arraydb.to.arrayr(distincts[[paste("distinct_",idx,sep="")]], "character"))
+                    ## idx <- idx + 1
+                    ## distinct <- as.vector(arraydb.to.arrayr(distincts[[paste("distinct_",idx,sep="")]], "character"))
                     ## Produce a fixed order for distinct values
+                    ## distinct <- .strip(distinct[order(distinct, decreasing = TRUE)], "\"")
+
+                    distinct <- lk(by(x[[i]], x[[i]], function(s) as.character(s)), -1)[,1]
+                    distinct[is.na(distinct)] <- "NULL"
                     distinct <- .strip(distinct[order(distinct, decreasing = TRUE)], "\"")
+
                     if (is.na(x@.factor.ref[i]))
                         avoid <- distinct[length(distinct)]
                     else

diff --git a/R/utility-generic.R b/R/utility-generic.R
@@ -276,7 +276,7 @@ arraydb.to.arrayr <- function (str, type = "double", n = 1)
             names(fake) <- names(data)
             for (i in seq_len(l)) {
                 if (data@.is.factor[i]) {
-                    fake[,i] <- distinct[[data@.col.name[i]]]
+                    fake[,i] <- array(distinct[[data@.col.name[i]]], dim = c(max.level, 1))
                     fake[,i] <- as.factor(fake[,i])
                     fake[,i] <- relevel(fake[,i],
                                         ref = ref[[data@.col.name[i]]])