Skip to content

Commit

Permalink
Added ffdfddply
Browse files Browse the repository at this point in the history
  • Loading branch information
jwijffels committed Apr 18, 2012
1 parent 30e366f commit 57ccc53
Show file tree
Hide file tree
Showing 157 changed files with 6,494 additions and 2 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
.Rproj.user
.Rhistory
.RData
Empty file modified build.bash 100644 → 100755
Empty file.
20 changes: 20 additions & 0 deletions examples/ffdfplyr.R
@@ -0,0 +1,20 @@
data(iris)
ffiris <- as.ffdf(iris)

result <- ffdfddply(x=ffiris,
split=x$Species,
FUN=function(x){
lowestbypetalwidth <- x[order(x$Petal.Width, decreasing=TRUE), ]
lowestbypetalwidth <- lowestbypetalwidth[!duplicated(lowestbypetalwidth[, c("Species","Petal.Width")]), ]
lowestbypetalwidth$group <- factor(x= "lowest", levels = c("lowest","highest"))
highestbypetalwidth <- x[order(x$Petal.Width, decreasing=FALSE), ]
highestbypetalwidth <- highestbypetalwidth[!duplicated(highestbypetalwidth[, c("Species","Petal.Width")]), ]
highestbypetalwidth$group <- factor(x= "highest", levels = c("lowest","highest"))
rbind(lowestbypetalwidth, highestbypetalwidth)
},
BATCHBYTES = 5000,
trace=TRUE)
class(result)
dim(result)
dim(iris)
result[1:10,]
20 changes: 20 additions & 0 deletions output/ffbase.Rcheck/00_pkg_src/ffbase/DESCRIPTION
@@ -0,0 +1,20 @@
Package: ffbase
Maintainer: Edwin de Jonge <edwindjonge@gmail.com>
License: GPL-3
Title: Basic statistical functions for package ff
Type: Package
LazyLoad: yes
Author: Edwin de Jonge, Jan Wijffels
Description: Implementation for min,max, mean, tabulate,
table, with, within. for package ff
Version: 0.4
URL: http://code.google.com/p/fffunctions/
Date: 2011-11-1
Depends: ff (>= 2.2-0)
Suggests: testthat, LaF
Collate: 'auxilary.R' 'chunkify.R' 'compact.R' 'cut_ff.R' 'diff_ff.R'
'droplevels.R' 'ffappend.R' 'ffdfsave.R' 'mean.R' 'pkg.R'
'risplit.R' 'subset.R' 'Summary_ff.R' 'table_ff.R'
'tabulate_ff.R' 'transform.R' 'unique.R' 'with.R' 'ffwhich.R'
'ffplyr.R' 'zzz.R'
Packaged: 2012-04-18 21:31:40 UTC; jan
28 changes: 28 additions & 0 deletions output/ffbase.Rcheck/00_pkg_src/ffbase/NAMESPACE
@@ -0,0 +1,28 @@
export(all.ff)
export(chunkify)
export(ffappend)
export(ffdfddply)
export(ffdfsave)
export(ffwhich)
export(max.ff)
export(min.ff)
export(range.ff)
export(subset.ff)
export(subset.ffdf)
export(table.ff)
export(tabulate.ff)
export(transform.ffdf)
S3method(any,ff)
S3method(c,ff)
S3method(compact,ff)
S3method(cut,ff)
S3method(droplevels,ff)
S3method(droplevels,ffdf)
S3method(ffwhich,ffdf)
S3method(ffwhich,ff_vector)
S3method(mean,ff)
S3method(mean,ffdf)
S3method(sum,ff)
S3method(unique,ff)
S3method(with,ffdf)
S3method(within,ffdf)
11 changes: 11 additions & 0 deletions output/ffbase.Rcheck/00_pkg_src/ffbase/NEWS
@@ -0,0 +1,11 @@
version 0.3
- Added ffdfsave function that can be used to save an ffdf data.frame

version 0.2-2
- bug fix, subset for ffdf now works properly (thanks to Martijn Tennekes)

version 0.2-1
- bug fix, sum.ff with na.rm=TRUE is now working correctly (thanks to Andreas Borg)

version 0.2
- initial version
111 changes: 111 additions & 0 deletions output/ffbase.Rcheck/00_pkg_src/ffbase/R/Summary_ff.R
@@ -0,0 +1,111 @@
#' Summary methods for ff objects
#'
#' @export
#' @param x a \code{ff} object
#' @param ... optional other (\code{ff}) objects
#' @param na.rm should \code{NA} be removed?
#' @param range a \code{ri} or an \code{integer} vector of \code{length==2} giving a range restriction for chunked processing
#' @return \code{TRUE}, \code{FALSE} or \code{NA}
all.ff <- function(x,..., na.rm=FALSE, range=NULL){
r <- checkRange(range,x)
all( ...
, sapply(chunk(x, from=min(r), to=max(r))
, function(i){
all(x[i], na.rm=na.rm)
}
)
)
}

#' Summary methods for ff objects
#' @export
#' @method any ff
#' @param x a \code{ff} object
#' @param ... optional other (\code{ff}) objects
#' @param na.rm should \code{NA} be removed?
#' @param range a \code{ri} or an \code{integer} vector of \code{length==2} giving a range restriction for chunked processing
#' @return \code{TRUE}, \code{FALSE} or \code{NA}
any.ff <- function(x, ..., na.rm=FALSE, range=NULL){
r <- checkRange(range,x)
any( ...
, sapply(chunk(x, from=min(r), to=max(r))
, function(i){
any(x[i], na.rm=na.rm)
}
)
)
}

#' \code{sum} returns the sum of all the values present in its arguments.
#'
#' @title Sum of \code{ff} vector Elements
#' @method sum ff
#' @export
#' @param x a \code{ff} object
#' @param ... optional other (\code{ff}) objects
#' @param na.rm should \code{NA} be removed?
#' @param range a \code{ri} or an \code{integer} vector of \code{length==2} giving a range restriction for chunked processing
#' @return sum of elements
sum.ff <- function(x, ..., na.rm=FALSE, range=NULL){
r <- checkRange(range, x)
sum( ...
, sapply( chunk(x, from=min(r), to=max(r))
, function(i){
sum(x[i], na.rm=na.rm)
}
)
)
}

#' Minimum, maximum and range of ff vector
#'
#' default behaviour of \code{\link{min}},\code{\link{max}} and \code{\link{range}}
#' @method min ff
#' @method max ff
#' @method range ff
#' @example ../examples/minmaxrange.R
#' @aliases min max range
#' @export min.ff max.ff range.ff
#' @aliases min.ff max.ff range.ff
#' @param x a \code{ff} object
#' @param ... optional other (\code{ff}) objects
#' @param na.rm should \code{NA} be removed?
#' @param range a \code{ri} or an \code{integer} vector of \code{length==2} giving a range restriction for chunked processing
#' @return minimun, maximum or range values
min.ff <- function(x, ..., na.rm=FALSE, range=NULL){
r <- checkRange(range, x)

min( ... #for all other ff's?
, sapply( chunk(x, from=min(r), to=max(r))
, function(i){
#print(x[i])
min(x[i], na.rm=na.rm)
}
)
)
}

max.ff <- function(x, ..., na.rm=FALSE, range=NULL){
r <- checkRange(range, x)

max( ... #for all other ff's?
, sapply( chunk(x, from=min(r), to=max(r))
, function(i){
max(x[i], na.rm=na.rm)
}
)
)
}

range.ff <- function(x, ..., na.rm=FALSE, range=NULL){
r <- checkRange(range, x)

range( ... #for all other ff's?
, sapply( chunk(x, from=min(r), to=max(r))
, function(i){
#print(x[i])
range(x[i], na.rm=na.rm)
}
)
)
}
31 changes: 31 additions & 0 deletions output/ffbase.Rcheck/00_pkg_src/ffbase/R/auxilary.R
@@ -0,0 +1,31 @@
checkRange <- function(range, x){

if (is.null(range)){
return(ri(1,length(x)))
}

#TODO add checks
range
}

#' Groups the input integer vector into several groups if the running cumulative sum increases a certain maximum number
#'
#' Groups the input integer vector into several groups if the running cumulative sum increases a certain maximum number
#'
#' @param x an integer vector
#' @param max the maximum running cumulative size before an extra grouping is done
#' @return An integer vector of the same length of x, indicating groups
grouprunningcumsum <- function(x, max){
l <- as.integer(length(x))
if(l == 0){
return(x)
}
x <- as.integer(x)
max <- as.integer(max)
result <- .C("grouprunningcumsum",
x = x,
l = l,
max = max,
PACKAGE="ffbase")
result$x
}
46 changes: 46 additions & 0 deletions output/ffbase.Rcheck/00_pkg_src/ffbase/R/chunkify.R
@@ -0,0 +1,46 @@
#' Chunkify an element-wise function
#'
#' Chunkify creates a new function that operates on a ff vector.
#' It creates chunks from the ff vector and calls the orginal function \code{fun} on each chunk.
#' @export chunkify
#' @param fun function to be 'chunkified', the function must accept a vector and
#' return a vector of the same \code{length}
#' @return 'chunkified' function that accepts a \code{ff} vector as its first argument.
chunkify <- function(fun){
cfun <- function( x
, ...
, inplace=FALSE
){

chunks <- chunk(x)

i <- chunks[[1]]
ret <- as.ff(fun(x[i], ...))
length(ret) <- length(x)

for (i in chunks[-1]){
ret[i] <- fun(x[i], ...)
}
ret
}
cfun
}

#' Chunk an expression to be used in a chunk for loop
#'@param x \code{character} with vars
#'@param expr \code{expression} vector
#'@param i name of index
#'@keywords internal
chunkexpr <- function(x, expr, i="i"){
es <- deparse(expr)
xs <- x
for (var in xs){
varre <- paste("\\b(",var,")\\b", sep="")
varsub <- paste("\\1[",i,"]", sep="")
es <- gsub(varre, varsub, es)
}
#print(es)
parse(text=es)
}

#chunkexpr(c("x","y"), expression(x>2 & y==1, z==3, y> 3))
34 changes: 34 additions & 0 deletions output/ffbase.Rcheck/00_pkg_src/ffbase/R/compact.R
@@ -0,0 +1,34 @@
#' Compact a ff vector or ffdf data frame
#'
#' Compact takes a ff vector and tries to use the smallest binary data type for this vector.
#' @aliases compact compact.ff compact.ffdf
#' @method compact ff
#' @method compact ffdf
#' @export
#' @param x \code{ff} or \code{ffdf} object
#' @param use.na \code{logical} if TRUE the resulting ff vector can contain NA, otherwise not
#' @param ... other parameters
#' @return compact ff vector
compact <- function(x, use.na=TRUE, ...){
UseMethod("compact")
}

compact.ff <- function(x, use.na=TRUE,...){
switch( vmode(x)
, integer = {
lev <- levels(x)
levels(x) <- NULL
r <- range(x, na.rm=TRUE)

if (r[1] <= 0)
return(x)
as.ff(x, vmode="byte")
}
, x
)
}

compact.ffdf <- function(x, use.na, ...){
ret <- lapply(physical(x), compact, use.na)
do.call(ffdf, ret)
}
42 changes: 42 additions & 0 deletions output/ffbase.Rcheck/00_pkg_src/ffbase/R/cut_ff.R
@@ -0,0 +1,42 @@
#' cut divides the range of x into intervals and codes the values in x according to which interval they fall. The leftmost interval corresponds to level one, the next leftmost to level two and so on.
#'
#' The \code{cut} method for ff with the behaviour of \code{link{cut}}
#' @title Convert Numeric ff vector to factor ff
#' @export
#' @seealso cut
#' @method cut ff
#'
#' @param x a (numeric) ff object that will be cut into pieces
#' @param breaks specifies the breaks for cutting this
#' @param ... other parameters that can be given to \code{\link{cut.default}}
#'
#' @return ff a new \code{\link{ff}} object with the newly created factor
cut.ff <- function(x, breaks, ...){
f <- NULL

#### borrowed code from cut.default
if (length(breaks) == 1) {
if (is.na(breaks) | breaks < 2)
stop("invalid number of intervals")
nb <- as.integer(breaks + 1)
dx <- diff(rx <- range(x, na.rm = TRUE))
if (dx == 0)
dx <- abs(rx[1])
breaks <- seq.int( rx[1] - dx/1000
, rx[2L] + dx/1000
, length.out = nb
)
}
####

args <- list(...)

args$breaks <- breaks
for (i in chunk(x, ...)){
args$x <- x[i]
f <- ffappend( f
, do.call(cut, args)
)
}
f
}
12 changes: 12 additions & 0 deletions output/ffbase.Rcheck/00_pkg_src/ffbase/R/diff_ff.R
@@ -0,0 +1,12 @@
diff.ff <- function(x, ...){
#TODO generalize for lag and difference
chunks <- chunk(x, ...)

# hmm, next line may go wrong for small datatypes...
d <- ff(vmode=vmode(x), length=length(x)-1)
for (i in chunks){
j <- i
d[i] <- diff(x[i])
}
d
}

0 comments on commit 57ccc53

Please sign in to comment.