From f72d35ef2b2d68d839415141bd715a273e83b9a5 Mon Sep 17 00:00:00 2001 From: Thoralf Mildenberger Date: Sun, 16 Oct 2016 17:39:09 +0000 Subject: [PATCH] version 0.0-24 --- DESCRIPTION | 27 +- MD5 | 12 + NAMESPACE | 2 + NEWS | 19 ++ R/histogram.R | 162 +++++----- R/histogram.irregular.R | 632 ++++++++++++++++++++-------------------- man/histogram.Rd | 4 +- 7 files changed, 456 insertions(+), 402 deletions(-) create mode 100644 MD5 create mode 100644 NEWS diff --git a/DESCRIPTION b/DESCRIPTION index d0c918f..6113fa1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,15 +1,22 @@ Package: histogram Type: Package -Title: Construction of regular and irregular histograms with different - options for automatic choice of bins -Version: 0.0-23 -Date: 2009-12-23 -Author: Thoralf Mildenberger, Yves Rozenholc, David Zasada. -Maintainer: Thoralf Mildenberger -Description: Automatic construction of regular and irregular histograms - as described in Rozenholc/Mildenberger/Gather (2009). +Title: Construction of Regular and Irregular Histograms with Different + Options for Automatic Choice of Bins +Version: 0.0-24 +Date: 2016-10-16 +Authors@R: c(person("Thoralf", "Mildenberger", role = c("aut", "cre"), + email = "mild@zhaw.ch"), + person("Yves", "Rozenholc", role = "aut"), + person("David", "Zasada", role = "aut")) +Author: Thoralf Mildenberger [aut, cre], + Yves Rozenholc [aut], + David Zasada [aut] +Maintainer: Thoralf Mildenberger +Description: Automatic construction of regular and irregular histograms as described in Rozenholc/Mildenberger/Gather (2010). License: GPL (>= 2) LazyLoad: yes -Packaged: 2009-12-23 13:24:37 UTC; mildenberger +ByteCompile: yes +NeedsCompilation: no +Packaged: 2016-10-16 11:53:03 UTC; thoralf Repository: CRAN -Date/Publication: 2009-12-24 10:42:12 +Date/Publication: 2016-10-16 17:39:09 diff --git a/MD5 b/MD5 new file mode 100644 index 0000000..3bd32f5 --- /dev/null +++ b/MD5 @@ -0,0 +1,12 @@ +585b5e846c6e08b01a502d694b755d1b *DESCRIPTION +2eee5e1ba5f04ca53172f7dc99b8cc67 *NAMESPACE +ab1e64d23e14a119e78ddabc58d2bc18 *NEWS +aaa37ba9791531fe0a4f9d035b765cdf *R/DynamicExtreme.R +5a8c58e38abf7ce910a120a86d91f13f *R/DynamicList.R +4410f07bbec87addbc63d54c62b5218e *R/PathList.R +82b767d1bd543f658faf5b9bdd971086 *R/greedy.R +2a020c19585b2feebdc68681ad46b53b *R/histogram.R +66b51a4ae95e20b8176947a7a0e1079c *R/histogram.irregular.R +db6698fc59594df25bd06e4d6456e7a1 *R/histogram.regular.R +cbb0bfb6be33de3e0fa608f9d0ad49d3 *R/utils.R +e1aa6fdb961cac16db5d8c7fdc98fd9e *man/histogram.Rd diff --git a/NAMESPACE b/NAMESPACE index b662713..1ea1585 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,2 +1,4 @@ +importFrom("graphics", "hist") +importFrom("stats", "quantile") export(histogram) diff --git a/NEWS b/NEWS new file mode 100644 index 0000000..1afae31 --- /dev/null +++ b/NEWS @@ -0,0 +1,19 @@ +Version 0.0-24, released 2016/10/17: + +* NAs are now removed (giving a warning), previously histogram() failed when NAs were present +* package is now byte-compiled on install (resulting in a speed-up for lagrer datasets) +* partial matchings in function call to DynamicExtreme() replaced by proper function arguments +* added reference to published paper +* histogram title should now be correct in all cases +* now explicitly imports hist() and quantile() +* some small changes to DESCRIPTION file + +Version 0.0-23, released 2009/12/24 + + +Version 0.0-22, released 2009/12/16 + + +Version 0.0-21, released 2009/11/12 + +* First release on CRAN diff --git a/R/histogram.R b/R/histogram.R index 6324223..b29c7ab 100644 --- a/R/histogram.R +++ b/R/histogram.R @@ -1,75 +1,87 @@ -`histogram` <- function( y, type="combined", grid="data", breaks=NULL, penalty="default", greedy=TRUE, right=TRUE, control=list(), verbose=TRUE, plot=TRUE ) { - - # check data vector - if ( length(unique(y))<2 ) - stop( "data vector must consist of at least two distinct values!" ) - - # handle invalid penalty/type combination - penalty = tolower( penalty ) - if ( any( penalty==c("br","nml","sc","mdl") ) && ( tolower(type)!="regular" && tolower(type)!="r" ) ) { - warning( "Penalty '", penalty, "' not supported for irregular histograms - creating regular histogram." ) - type <- "regular" - } - # handle invalid parameter "breaks" - if ( length(breaks) > 1 ) { - warning( "Breaks is a vector of length ", length(breaks), " - using first value only", call.=FALSE ) - breaks = breaks[1] - } - if ( ! is.null(breaks) ) { - breaks <- floor( breaks ) - if ( breaks < 2 ) { - warning( "Breaks must be an integer <= 2 - using breaks=2", call.=FALSE ) - breaks <- 2 - } - } - - # histogram type: regular - if ( tolower(type)=="regular" || tolower(type)=="r" ) - out<-histogram.regular( y, penalty=penalty, breaks=breaks, control=control, right=right, verbose=verbose, plot=plot, yvarname=deparse( substitute(y)) )$H - - # histogram type: irregular - if ( tolower(type)=="irregular" || tolower(type)=="i" ) - out<-histogram.irregular( y, grid=grid, breaks=breaks, penalty=penalty, greedy=greedy, control=control, right=right, verbose=verbose, plot=plot, yvarname=deparse( substitute(y)) )$H - - # histogram type: combined - if ( tolower(type)=="combined" || tolower(type)=="c" ) { - - # check penalty-parameter - penalty = tolower( penalty ) - if ( ! any( penalty==c("default","pena","penb","penr") ) ) { - warning( "Penalty '", penalty, "' not supported for combined histograms - using default setting for irregular histograms", call.=FALSE ) - penalty = "default" - } - - if ( verbose ) - message( "Choosing between regular and irregular histogram:\n\n1.", appendLF=FALSE ) - out1 <- histogram.regular( y, penalty="br", breaks=NULL, control=control, right=right, verbose=verbose, plot=FALSE ) - if ( verbose ) - message( "2.",appendLF=FALSE ) - out2 <- histogram.irregular( y, grid=grid, breaks=NULL, penalty=penalty, greedy=greedy, control=control, right=right, verbose=verbose, plot=FALSE ) - - #compare maximized likelihood or frgular and irregular histogram - if (out1$lhvalue>=out2$lhvalue) { - out<-out1$H - if (verbose) - message("\nRegular histogram chosen.\n") - } - else { - out<-out2$H - if ( verbose ) - message("\nIrregular histogram chosen.\n") - } - - # Bugfix: Name of y-var gets lost above - reset it. - out$xname = deparse( substitute(y)) - - if ( plot ) - plot(out, freq=FALSE) - } - - - if ( verbose ) - print( out ) - - return( invisible(out) ) -} +`histogram` <- function( y, type="combined", grid="data", breaks=NULL, penalty="default", greedy=TRUE, right=TRUE, control=list(), verbose=TRUE, plot=TRUE ) { + + # save y name for later (before doing anything to it) + + xname <- deparse( substitute(y)) + + # check data vector + + if ( any(is.na(y)) ) { + warning("Removing NAs from data vector") + y <- y[!is.na(y)] + } + + + if ( length(unique(y))<2 ) + stop( "data vector must consist of at least two distinct values!" ) + + # handle invalid penalty/type combination + penalty = tolower( penalty ) + if ( any( penalty==c("br","nml","sc","mdl") ) && ( tolower(type)!="regular" && tolower(type)!="r" ) ) { + warning( "Penalty '", penalty, "' not supported for irregular histograms - creating regular histogram." ) + type <- "regular" + } + # handle invalid parameter "breaks" + if ( length(breaks) > 1 ) { + warning( "Breaks is a vector of length ", length(breaks), " - using first value only", call.=FALSE ) + breaks = breaks[1] + } + if ( ! is.null(breaks) ) { + breaks <- floor( breaks ) + if ( breaks < 2 ) { + warning( "Breaks must be an integer <= 2 - using breaks=2", call.=FALSE ) + breaks <- 2 + } + } + + # histogram type: regular + if ( tolower(type)=="regular" || tolower(type)=="r" ) + out<-histogram.regular( y, penalty=penalty, breaks=breaks, control=control, right=right, verbose=verbose, plot=plot, yvarname=xname )$H + + # histogram type: irregular + if ( tolower(type)=="irregular" || tolower(type)=="i" ) + out<-histogram.irregular( y, grid=grid, breaks=breaks, penalty=penalty, greedy=greedy, control=control, right=right, verbose=verbose, plot=plot, yvarname=xname )$H + + # histogram type: combined + if ( tolower(type)=="combined" || tolower(type)=="c" ) { + + # check penalty-parameter + penalty = tolower( penalty ) + if ( ! any( penalty==c("default","pena","penb","penr") ) ) { + warning( "Penalty '", penalty, "' not supported for combined histograms - using default setting for irregular histograms", call.=FALSE ) + penalty = "default" + } + + if ( verbose ) + message( "Choosing between regular and irregular histogram:\n\n1.", appendLF=FALSE ) + out1 <- histogram.regular( y, penalty="br", breaks=NULL, control=control, right=right, verbose=verbose, plot=FALSE ) + if ( verbose ) + message( "2.",appendLF=FALSE ) + out2 <- histogram.irregular( y, grid=grid, breaks=NULL, penalty=penalty, greedy=greedy, control=control, right=right, verbose=verbose, plot=FALSE ) + + #compare maximized likelihood or frgular and irregular histogram + if (out1$lhvalue>=out2$lhvalue) { + out<-out1$H + if (verbose) + message("\nRegular histogram chosen.\n") + } + else { + out<-out2$H + if ( verbose ) + message("\nIrregular histogram chosen.\n") + } + + # Bugfix: Name of y-var gets lost above - reset it. + + out$xname <- xname + + if ( plot ) + plot(out, freq=FALSE) + } + + + if ( verbose ) + print( out ) + + return( invisible(out) ) +} diff --git a/R/histogram.irregular.R b/R/histogram.irregular.R index 76f8392..14b373a 100644 --- a/R/histogram.irregular.R +++ b/R/histogram.irregular.R @@ -1,316 +1,316 @@ -`histogram.irregular` <- function( y, grid="data", breaks=NULL, penalty="penB", greedy=TRUE, right=TRUE, control=list(), verbose=TRUE, plot=TRUE, yvarname="y" ) { - - epsilon <- 1e-7 - - penalty = tolower( penalty ) - grid = tolower(grid) - - # check penalty parameter - if ( penalty=="default" ) - penalty = "penb" - if ( ! any( penalty==c("penr","pena","penb","aic","bic","cv") ) ) { - warning( "Penalty '", penalty, "' not supported for irregular histograms - using 'penR'", call.=FALSE ) - penalty = "penr" - } - - # check data vector - if ( is.null(y) ) - stop( "no data in vector y" ) - lhvalue<-(-Inf) - n <- length(y) - a <- min(y) - b <- max(y) - y <- sort((y-a)/(b-a)) # normalize range to [0,1] - - if (verbose) - message("Building irregular histogram.") - - # control defaults - if ( penalty=="penr" ) - cont2 <- list( g1=1, g2=1, alpha=0.5, c=1 ) - if (penalty=="penb") - cont2 <- list( g1=1, g2=1, alpha=1, c=1 ) - if (penalty=="pena") - cont2 <- list( g1=1, g2=1, alpha=0.5, c=1, k=2 ) - if (penalty=="aic") - cont2 <- list( g1=1, g2=1, alpha=1 ) - if (penalty=="bic") - cont2 <- list( g1=1, g2=1, alpha=0.5 ) - if (penalty=="cv") - cont2 <- list( g1=1, g2=1, cvformula=1, p=1 ) - - cont2$quanttype=7 - - cont2$maxbin=1000 - if ( greedy == TRUE ) - cont2$maxbin=Inf - - if (grid=="data") - cont2$g3=Inf - if (grid=="regular") - cont2$g3=-1 - if (grid=="quantiles") - cont2$g3=-1 - - cont2$between<-FALSE - - # replace control defaults with values set by the user - cont2[names(control)]<-control - - # check g1/g2/g3 - if ( cont2$g1 <= 0 ) - stop( "g1 must be greater than 0" ) - if ( cont2$g2 < 0 ) - stop( "g2 must be greater than or equal to 0" ) - if ( cont2$g3 == -Inf ) - stop( "g3 must be a real number or Inf, but not -Inf" ) - -### if user supplies breaks argument, use this for G -### breakes argument now checked in the wrapper function - - if ( !is.null(breaks) ) - G <- floor( breaks ) - else - G <- cont2$g1*n^cont2$g2*log(n)^cont2$g3 - - - # check controls - # alpha - if ( !is.null(cont2$alpha) ) - if ( cont2$alpha < 0 ) - stop( "alpha must be greater than or equal to 0" ) - # c - if ( !is.null(cont2$c) ) - if ( cont2$c < 0 ) - stop( "c must be greater than or equal to 0" ) - # k - if ( !is.null(cont2$k) ) - if ( cont2$k < 1 ) - stop( "k must be greater than or equal to 1" ) - # cvformula - if ( !is.null(cont2$cvformula) ) - if ( cont2$cvformula!=1 && cont2$cvformula!=2) - stop( "cvformula must be 1 or 2" ) - # p - if ( !is.null(cont2$p) ) { - if ( (cont2$p%%1!=0) || (cont2$p<1) || (cont2$p>n-1) ) - stop( "p must be an integer between 1 and (n-1)" ) - if ( cont2$p!=1 && cont2$cvformula!=2 ) { - # falls p!=1: cvformula 2 verwenden - warning( "For p!=1 'cvformula=2' is used", call.=FALSE ) - cont2$cvformula = 2 - } - } - # mincount - mincount=0 - if ( (penalty=="cv")&&(cont2$cvformula==3) ) - mincount <- 2 - - # check grid parameter - if ( ! any( grid==c("data","regular","quantiles") ) ) { - warning( "Grid '", grid, "' not supported - using 'data'", call.=FALSE ) - grid = "data" - } - - if ( grid=="regular" ) { - if ( length(breaks)==1 ) - BL <- (0:((breaks+1)-1)) / (((breaks+1)-1)) -# BL<-seq(0,1,length.out=(breaks+1)) - if ( is.null(breaks) ) - BL <- (0:(max(2,floor(G)+1)-1)) / (max(2,floor(G)+1)-1) -# BL<-seq(0,1,length.out=max(2,floor(G)+1)) - if (verbose) - message(paste("- Using a regular grid with ",length(BL)-1," bins as finest grid.",sep="")) - } - - if (grid=="quantiles") { - if (length(breaks)==1) - bk <- (0:((breaks+1)-1)) / (((breaks+1)-1)) -# bk<-seq(0,1,length.out=(breaks+1)) - if (is.null(breaks)) - bk <- (0:(max(2,floor(G)+1)-1)) / (max(2,floor(G)+1)-1) -# bk<-seq(0,1,length.out=max(2,floor(G)+1)) - BL<-c(0,quantile(y,probs=bk[-c(1,length(bk))],type=cont2$quanttype),1) - BL<-unique(sort(BL)) - if (verbose) - message("- Using a regular quantile grid with ",length(BL)-1," bins as finest grid.",sep="") - } - - if (grid=="data") { - if (cont2$between) BL<-c(y[1]-epsilon,y[-n]+diff(y)/2,y[n]+epsilon) - else { - if (!right) - BL <- c(y[1:n-1]-epsilon,(y[n-1]+y[n])/2,y[n]+epsilon) - else - BL <- c(y[1]-epsilon,(y[1]+y[2])/2,y[2:n]+epsilon) - } - BL<-unique(sort(BL)) - if (verbose) - message("- Using finest grid based on observations.") - } - - # specify weightfunctions for dynamical programming part... - - if (penalty=="penr") { - if (verbose) message("- Choosing number of bins via maximum likelihood with ",toupper(penalty)," penalty.",sep="") - mini<-FALSE - weightfunction <- function(i,j) { - dN = NL[j]-NL[i] - dBL = BL[j]-BL[i] - # compute the loglikelihood part of interval [B(i),B(j)[ - if (dN==0) - W<-0 - else - if ((dBL<1/G)&(grid=="data")) - return(-Inf) # remove small bins - else - W <- dN*log(dN/dBL) - W <- W-cont2$alpha*dN/dBL/n ## use of random penalty - if (dN=2)&(D>binmax)){ - if (verbose) - message(paste("- Using greedy procedure to recursively build a finest partition with at most ",binmax," bins.",sep="")) - BL<-histgreedy(BL,NL,n,binmax,verbose=verbose) - NL<-calcNL(y,BL,right) - } - } - - D <- length(BL)-1 - #D <- min(length(NL)-1,D) - #bounds<-BL - - - # optimize & penalize - if (penalty=="penr") { - pen <- cont2$c*logchoose(n-1,D-1) + log(1:D)^2.5 - # compute dynamically the max likelihood for irregular histograms with - # d bins, d varying between 1 and D - tmp <- DynamicExtreme(weightfunction,n=length(BL)-1,D=D,mini=FALSE,msg=verbose) - D0 <- which.max(tmp$extreme-pen) - lhvalue<-max(tmp$extreme-pen)-n*log(b-a)-n*log(n)+cont2$alpha - } - if (penalty=="penb") { - pen <- cont2$c*logchoose(n-1,D-1) + cont2$alpha*(0:(D-1)) + log(1:D)^2.5 - # compute dynamically the max likelihood for irregular histograms with - # d bins, d varying between 1 and D - tmp <- DynamicExtreme(weightfunction,n=length(BL)-1,D=D,mini=FALSE,msg=verbose) - D0 <- which.max(tmp$extreme-pen) - lhvalue<-max(tmp$extreme-pen)-n*log(b-a)-n*log(n) - } - if (penalty=="pena") { - pen <- cont2$c*logchoose(n-1,D-1) + cont2$alpha*(0:(D-1)) + cont2$c*cont2$k*log(1:D)+ - 2*sqrt(cont2$c*cont2$alpha*(0:(D-1))*(logchoose(n-1,D-1)+cont2$k*log(1:D)) ) - # compute dynamically the max likelihood for irregular histograms with - # d bins, d varying between 1 and D - tmp <- DynamicExtreme(weightfunction,n=length(BL)-1,D=D,mini=FALSE,msg=verbose) - D0 <- which.max(tmp$extreme-pen) - lhvalue<-max(tmp$extreme-pen)-n*log(b-a)-n*log(n) - } - if (penalty=="aic") { - pen <- cont2$alpha*(0:(D-1)) - # compute dynamically the max likelihood for irregular histograms with - # d bins, d varying between 1 and D - tmp <- DynamicExtreme(weightfunction,n=length(BL)-1,D=D,mini=FALSE,msg=verbose) - D0 <- which.max(tmp$extreme-pen) - lhvalue<-max(tmp$extreme-pen)-n*log(b-a)-n*log(n) - } - if (penalty=="bic") { - pen <- cont2$alpha*log(n)*(0:(D-1)) - # compute dynamically the max likelihood for irregular histograms with - # d bins, d varying between 1 and D - tmp <- DynamicExtreme(weightfunction,n=length(BL)-1,D=D,mini=FALSE,msg=verbose) - D0 <- which.max(tmp$extreme-pen) - lhvalue<-max(tmp$extreme-pen)-n*log(b-a)-n*log(n) - } - if (penalty=="cv") { - # compute dynamically the cv scores for irregular histograms with - # d bins, d varying between 1 and D - tmp <- DynamicExtreme(weightfunction,n=length(BL)-1,D=D,mini=TRUE,msg=verbose) - D0 <- which.min(tmp$extreme) - } - - if (verbose) message(paste("- Number of bins chosen: ",D0,".\n\n",sep="")) - bounds <- DynamicList(tmp$ancestor,BL,D0) - - # transform back to original range - y<-a+(b-a)*y - - # create histogram - H <- hist(y,breaks=a+(b-a)*bounds,right=right,plot=FALSE) - - # Bugfix: Name of y-var gets lost above - reset it. - H$xname = yvarname - - # Plot - if (plot) - plot( H, freq=FALSE ) - - return(list(H=H,lhvalue=lhvalue)) -} - +`histogram.irregular` <- function( y, grid="data", breaks=NULL, penalty="penB", greedy=TRUE, right=TRUE, control=list(), verbose=TRUE, plot=TRUE, yvarname="y" ) { + + epsilon <- 1e-7 + + penalty = tolower( penalty ) + grid = tolower(grid) + + # check penalty parameter + if ( penalty=="default" ) + penalty = "penb" + if ( ! any( penalty==c("penr","pena","penb","aic","bic","cv") ) ) { + warning( "Penalty '", penalty, "' not supported for irregular histograms - using 'penR'", call.=FALSE ) + penalty = "penr" + } + + # check data vector + if ( is.null(y) ) + stop( "no data in vector y" ) + lhvalue<-(-Inf) + n <- length(y) + a <- min(y) + b <- max(y) + y <- sort((y-a)/(b-a)) # normalize range to [0,1] + + if (verbose) + message("Building irregular histogram.") + + # control defaults + if ( penalty=="penr" ) + cont2 <- list( g1=1, g2=1, alpha=0.5, c=1 ) + if (penalty=="penb") + cont2 <- list( g1=1, g2=1, alpha=1, c=1 ) + if (penalty=="pena") + cont2 <- list( g1=1, g2=1, alpha=0.5, c=1, k=2 ) + if (penalty=="aic") + cont2 <- list( g1=1, g2=1, alpha=1 ) + if (penalty=="bic") + cont2 <- list( g1=1, g2=1, alpha=0.5 ) + if (penalty=="cv") + cont2 <- list( g1=1, g2=1, cvformula=1, p=1 ) + + cont2$quanttype=7 + + cont2$maxbin=1000 + if ( greedy == TRUE ) + cont2$maxbin=Inf + + if (grid=="data") + cont2$g3=Inf + if (grid=="regular") + cont2$g3=-1 + if (grid=="quantiles") + cont2$g3=-1 + + cont2$between<-FALSE + + # replace control defaults with values set by the user + cont2[names(control)]<-control + + # check g1/g2/g3 + if ( cont2$g1 <= 0 ) + stop( "g1 must be greater than 0" ) + if ( cont2$g2 < 0 ) + stop( "g2 must be greater than or equal to 0" ) + if ( cont2$g3 == -Inf ) + stop( "g3 must be a real number or Inf, but not -Inf" ) + +### if user supplies breaks argument, use this for G +### breakes argument now checked in the wrapper function + + if ( !is.null(breaks) ) + G <- floor( breaks ) + else + G <- cont2$g1*n^cont2$g2*log(n)^cont2$g3 + + + # check controls + # alpha + if ( !is.null(cont2$alpha) ) + if ( cont2$alpha < 0 ) + stop( "alpha must be greater than or equal to 0" ) + # c + if ( !is.null(cont2$c) ) + if ( cont2$c < 0 ) + stop( "c must be greater than or equal to 0" ) + # k + if ( !is.null(cont2$k) ) + if ( cont2$k < 1 ) + stop( "k must be greater than or equal to 1" ) + # cvformula + if ( !is.null(cont2$cvformula) ) + if ( cont2$cvformula!=1 && cont2$cvformula!=2) + stop( "cvformula must be 1 or 2" ) + # p + if ( !is.null(cont2$p) ) { + if ( (cont2$p%%1!=0) || (cont2$p<1) || (cont2$p>n-1) ) + stop( "p must be an integer between 1 and (n-1)" ) + if ( cont2$p!=1 && cont2$cvformula!=2 ) { + # falls p!=1: cvformula 2 verwenden + warning( "For p!=1 'cvformula=2' is used", call.=FALSE ) + cont2$cvformula = 2 + } + } + # mincount + mincount=0 + if ( (penalty=="cv")&&(cont2$cvformula==3) ) + mincount <- 2 + + # check grid parameter + if ( ! any( grid==c("data","regular","quantiles") ) ) { + warning( "Grid '", grid, "' not supported - using 'data'", call.=FALSE ) + grid = "data" + } + + if ( grid=="regular" ) { + if ( length(breaks)==1 ) + BL <- (0:((breaks+1)-1)) / (((breaks+1)-1)) +# BL<-seq(0,1,length.out=(breaks+1)) + if ( is.null(breaks) ) + BL <- (0:(max(2,floor(G)+1)-1)) / (max(2,floor(G)+1)-1) +# BL<-seq(0,1,length.out=max(2,floor(G)+1)) + if (verbose) + message(paste("- Using a regular grid with ",length(BL)-1," bins as finest grid.",sep="")) + } + + if (grid=="quantiles") { + if (length(breaks)==1) + bk <- (0:((breaks+1)-1)) / (((breaks+1)-1)) +# bk<-seq(0,1,length.out=(breaks+1)) + if (is.null(breaks)) + bk <- (0:(max(2,floor(G)+1)-1)) / (max(2,floor(G)+1)-1) +# bk<-seq(0,1,length.out=max(2,floor(G)+1)) + BL<-c(0,quantile(y,probs=bk[-c(1,length(bk))],type=cont2$quanttype),1) + BL<-unique(sort(BL)) + if (verbose) + message("- Using a regular quantile grid with ",length(BL)-1," bins as finest grid.",sep="") + } + + if (grid=="data") { + if (cont2$between) BL<-c(y[1]-epsilon,y[-n]+diff(y)/2,y[n]+epsilon) + else { + if (!right) + BL <- c(y[1:n-1]-epsilon,(y[n-1]+y[n])/2,y[n]+epsilon) + else + BL <- c(y[1]-epsilon,(y[1]+y[2])/2,y[2:n]+epsilon) + } + BL<-unique(sort(BL)) + if (verbose) + message("- Using finest grid based on observations.") + } + + # specify weightfunctions for dynamical programming part... + + if (penalty=="penr") { + if (verbose) message("- Choosing number of bins via maximum likelihood with ",toupper(penalty)," penalty.",sep="") + mini<-FALSE + weightfunction <- function(i,j) { + dN = NL[j]-NL[i] + dBL = BL[j]-BL[i] + # compute the loglikelihood part of interval [B(i),B(j)[ + if (dN==0) + W<-0 + else + if ((dBL<1/G)&(grid=="data")) + return(-Inf) # remove small bins + else + W <- dN*log(dN/dBL) + W <- W-cont2$alpha*dN/dBL/n ## use of random penalty + if (dN=2)&(D>binmax)){ + if (verbose) + message(paste("- Using greedy procedure to recursively build a finest partition with at most ",binmax," bins.",sep="")) + BL<-histgreedy(BL,NL,n,binmax,verbose=verbose) + NL<-calcNL(y,BL,right) + } + } + + D <- length(BL)-1 + #D <- min(length(NL)-1,D) + #bounds<-BL + + + # optimize & penalize + if (penalty=="penr") { + pen <- cont2$c*logchoose(n-1,D-1) + log(1:D)^2.5 + # compute dynamically the max likelihood for irregular histograms with + # d bins, d varying between 1 and D + tmp <- DynamicExtreme(weightfunction,n=length(BL)-1,Dmax=D,mini=FALSE,msg=verbose) + D0 <- which.max(tmp$extreme-pen) + lhvalue<-max(tmp$extreme-pen)-n*log(b-a)-n*log(n)+cont2$alpha + } + if (penalty=="penb") { + pen <- cont2$c*logchoose(n-1,D-1) + cont2$alpha*(0:(D-1)) + log(1:D)^2.5 + # compute dynamically the max likelihood for irregular histograms with + # d bins, d varying between 1 and D + tmp <- DynamicExtreme(weightfunction,n=length(BL)-1,Dmax=D,mini=FALSE,msg=verbose) + D0 <- which.max(tmp$extreme-pen) + lhvalue<-max(tmp$extreme-pen)-n*log(b-a)-n*log(n) + } + if (penalty=="pena") { + pen <- cont2$c*logchoose(n-1,D-1) + cont2$alpha*(0:(D-1)) + cont2$c*cont2$k*log(1:D)+ + 2*sqrt(cont2$c*cont2$alpha*(0:(D-1))*(logchoose(n-1,D-1)+cont2$k*log(1:D)) ) + # compute dynamically the max likelihood for irregular histograms with + # d bins, d varying between 1 and D + tmp <- DynamicExtreme(weightfunction,n=length(BL)-1,Dmax=D,mini=FALSE,msg=verbose) + D0 <- which.max(tmp$extreme-pen) + lhvalue<-max(tmp$extreme-pen)-n*log(b-a)-n*log(n) + } + if (penalty=="aic") { + pen <- cont2$alpha*(0:(D-1)) + # compute dynamically the max likelihood for irregular histograms with + # d bins, d varying between 1 and D + tmp <- DynamicExtreme(weightfunction,n=length(BL)-1,Dmax=D,mini=FALSE,msg=verbose) + D0 <- which.max(tmp$extreme-pen) + lhvalue<-max(tmp$extreme-pen)-n*log(b-a)-n*log(n) + } + if (penalty=="bic") { + pen <- cont2$alpha*log(n)*(0:(D-1)) + # compute dynamically the max likelihood for irregular histograms with + # d bins, d varying between 1 and D + tmp <- DynamicExtreme(weightfunction,n=length(BL)-1,Dmax=D,mini=FALSE,msg=verbose) + D0 <- which.max(tmp$extreme-pen) + lhvalue<-max(tmp$extreme-pen)-n*log(b-a)-n*log(n) + } + if (penalty=="cv") { + # compute dynamically the cv scores for irregular histograms with + # d bins, d varying between 1 and D + tmp <- DynamicExtreme(weightfunction,n=length(BL)-1,Dmax=D,mini=TRUE,msg=verbose) + D0 <- which.min(tmp$extreme) + } + + if (verbose) message(paste("- Number of bins chosen: ",D0,".\n\n",sep="")) + bounds <- DynamicList(tmp$ancestor,BL,D0) + + # transform back to original range + y<-a+(b-a)*y + + # create histogram + H <- hist(y,breaks=a+(b-a)*bounds,right=right,plot=FALSE) + + # Bugfix: Name of y-var gets lost above - reset it. + H$xname <- yvarname + + # Plot + if (plot) + plot( H, freq=FALSE ) + + return(list(H=H, lhvalue=lhvalue)) +} + diff --git a/man/histogram.Rd b/man/histogram.Rd index 402de32..eb7dd3c 100644 --- a/man/histogram.Rd +++ b/man/histogram.Rd @@ -36,7 +36,7 @@ verbose = TRUE, plot = TRUE) is constructed by a greedy step to make the search procedure feasible. Has no effect for regular histograms.} \item{right}{logical; if \code{TRUE}, the histograms cells are right-closed (left open) intervals.} \item{control}{list of additional control parameters. Meaning and default values depend on settings of \code{type}, - \code{penalty} and \code{grid}. See below.}. + \code{penalty} and \code{grid}. See below.} \item{verbose}{logical; if \code{TRUE} (default), some information is given during histogram construction and the resulting \code{histogram} object is printed.} \item{plot}{logical. If \code{TRUE} (default), the histogram is plotted.} @@ -191,6 +191,8 @@ Hall, P. and Hannan, E. J. (1988). On stochastic complexity and nonparametric de Rozenholc, Y, Mildenberger, T. and Gather, U. (2009). Combining regular and irregular histograms by penalized likelihood. Discussion Paper 31/2009, SFB 823, TU Dortmund. \url{http://www.statistik.tu-dortmund.de/fileadmin/user_upload/SFB_823/discussion_papers/2009/31_09_rozenholc_mildenberger_gather.pdf} + +Rozenholc, Y., Mildenberger, T., Gather, U. (2010). Combining regular and irregular histograms by penalized likelihood. Computational Statistics and Data Analysis 54, 3313-3323. } \author{Thoralf Mildenberger, Yves Rozenholc, David Zasada.} %\note{ ~~further notes~~