Commit aec1ca4 (0 parents): version 0.1.0
djhunter authored and cran-robot committed Aug 12, 2016
Showing 16 changed files with 771 additions and 0 deletions.
22 changes: 22 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,22 @@
Package: binsmooth
Type: Package
Title: Generate PDFs and CDFs from Binned Data
Version: 0.1.0
Author: David J. Hunter and McKalie Drown
Maintainer: Dave Hunter <dhunter@westmont.edu>
Description: Provides several methods for generating density functions
based on binned data. Data are assumed to be nonnegative, but the bin widths
need not be uniform, and the top bin may be unbounded. All PDF smoothing methods
maintain the areas specified by the binned data. (Equivalently, all CDF
smoothing methods interpolate the points specified by the binned data.) An
estimate for the mean of the distribution may be supplied as an optional
argument, which greatly improves the reliability of statistics computed from
the smoothed density functions. Methods include step function, recursive
subdivision, and optimized spline.
License: MIT + file LICENSE
Imports: stats, pracma, ineq, triangle
LazyData: TRUE
NeedsCompilation: no
Packaged: 2016-08-12 14:09:50 UTC; dhunter
Repository: CRAN
Date/Publication: 2016-08-12 16:46:49
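
The three methods named in the Description (step function, recursive subdivision, and optimized spline) correspond to stepbins(), rsubbins(), and splinebins() below. The following is a minimal usage sketch with made-up data; the bin-edge convention (upper edges in bEdges, with NA for the unbounded top bin) is inferred from the R sources in this commit rather than quoted from the documentation.

# Hypothetical binned data: upper bin edges (NA = unbounded top bin) and counts.
library(binsmooth)
bEdges  <- c(10000, 20000, 30000, 50000, 100000, NA)
bCounts <- c(15, 27, 40, 51, 30, 7)

sb <- splinebins(bEdges, bCounts, m = 42000)   # m is an optional mean estimate
sb$splineCDF(30000)    # smoothed estimate of the proportion at or below 30000
sb$est_mean            # mean implied by the fitted spline

rs <- rsubbins(bEdges, bCounts, m = 42000)
rs$rsubPDF(25000)      # recursive-subdivision density evaluated at 25000
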
2 changes: 2 additions & 0 deletions LICENSE
@@ -0,0 +1,2 @@
YEAR: 2016
COPYRIGHT HOLDER: David J. Hunter and McKalie Drown
15 changes: 15 additions & 0 deletions MD5
@@ -0,0 +1,15 @@
29cbf0aaefa92ed1f2b8339fa9724e0e *DESCRIPTION
11fcc18229d1926590c9d08f219c5132 *LICENSE
1e32880d420021b43570b02ebd8ee747 *NAMESPACE
718b18f72622950c7b2295e16059284c *R/rsubbins.R
0b0f11381b6ae0594fec5136eeecb050 *R/simcounty.R
98fd59c90b830afaf4e0d9c7b17aa00d *R/splinebins.R
623bfeaff9a308954638e90d80276b18 *R/stepbins.R
5237fa31e1d511ca7ead04d6f771f08c *data/county_bins.rda
47e7d5dc78ca0d9cbd9c8d105b2b2090 *data/county_true.rda
d668f119a683e0b3b0ab2a3da9517394 *man/county_bins.Rd
84a042f94329da7f1240919fd887bb33 *man/county_true.Rd
143c3aea13378d084527eb3398a3bce2 *man/rsubbins.Rd
99495f9813d6057a0ac352e8ab92187d *man/simcounty.Rd
bd92a9fcc194aaaf34d52dba32bd68b5 *man/splinebins.Rd
c66af3ca0a5304d0fdea1a54229ca5da *man/stepbins.Rd
4 changes: 4 additions & 0 deletions NAMESPACE
@@ -0,0 +1,4 @@
exportPattern("^[[:alpha:]]+")
importFrom("stats", "approxfun", "stepfun", "splinefun", "rlnorm", "rweibull", "rgamma", "median")
importFrom("ineq", "Gini")
importFrom("triangle", "rtriangle")
158 changes: 158 additions & 0 deletions R/rsubbins.R
@@ -0,0 +1,158 @@
rsubbins <- function(bEdges, bCounts, m=NULL, eps1 = 0.25, eps2 = 0.75, depth = 3,
tailShape = c("onebin", "pareto", "exponential"),
nTail=16, numIterations=20, pIndex=1.160964, tbRatio=0.8) {
if(is.null(m)) { # no mean provided, so estimate one from bin midpoints
L <- length(bCounts)
m <- sum(0.5*(c(bEdges[1:(L-1)],2.0*bEdges[L-1])+c(0, bEdges[1:(L-1)]))*bCounts/sum(bCounts)) # top bin treated as ending at 2*bEdges[L-1]
}
tailShape <- match.arg(tailShape)
if(tailShape == "onebin")
rsubbinsNotail(bEdges, bCounts, m, eps1, eps2, depth)
else
rsubbinsTail(bEdges, bCounts, m, eps1, eps2, depth, tailShape, nTail, numIterations, pIndex, tbRatio)
}

rsubbinsTail <- function(bEdges, bCounts, m, eps1, eps2, depth,
tailShape = c("pareto", "exponential"),
nTail, numIterations, pIndex, tbRatio) {
tailShape <- match.arg(tailShape)
eps3 <- (1 - eps2) / 2
L <- length(bCounts)
tot <- sum(bCounts)
e <- c(0,bEdges[1:(L-1)],numeric(nTail)) # preallocate tail edges
shrinkFactor <- 1
shrinkMultiplier <- 0.995
tailCount <- bCounts[L]
bbtot <- tot-tailCount
bbMean <- sum((e[2:(L)]+e[1:(L-1)])*bCounts[1:(L-1)])/(2*bbtot) # mean of bounded bins as a step function
while(m<bbMean) { # binning must be skewed, because supplied mean is less than mean of bounded bins
e <- e*shrinkMultiplier # shrink the bins
bbMean <- sum((e[2:(L)]+e[1:(L-1)])*bCounts[1:(L-1)])/(2*bbtot)
shrinkFactor <- shrinkFactor*shrinkMultiplier
}
if(tailCount>0) {
# there is remaining area in final bin, so make a tail of bins
L <- L+nTail-1
tailArea <- tailCount/tot
bAreas <- c(bCounts/tot, numeric(nTail-1))
tbWidth <- e[L-nTail+1]-e[L-nTail] # initial tail bin width = width of last bounded bin
h <- bAreas[L-nTail]/tbWidth # height of last bounded bin
if(tailShape=="pareto")
tailUnscaled <- (1:nTail)^(-1-pIndex)
else
tailUnscaled <- tbRatio^(1:nTail)
bAreas[(L-nTail+1):(L)] <- tailUnscaled/sum(tailUnscaled)*tailArea
repeat {
e[(L-nTail+2):(L+1)] <- e[L-nTail+1]+(1:(nTail))*tbWidth
bMean <- sum((e[2:(L+1)]+e[1:L])*bAreas)/2
if(bMean>m) break # now the correct tbWidth is between 0 and tbWidth
tbWidth <- tbWidth*2
}
l <- 0
r <- tbWidth
for(i in 1:numIterations) {
tbWidth <- (l+r)/2
e[(L-nTail+2):(L+1)] <- e[L-nTail+1]+(1:(nTail))*tbWidth
bMean <- sum((e[2:(L+1)]+e[1:L])*bAreas)/2
if (bMean<m) # bin mean is too small; lengthen bins
l <- tbWidth
else
r <- tbWidth
}
}
else { # final bin is empty, so get rid of it
L <- L-1
bAreas <- bCounts[1:L]/tot
e <- e[1:(L+1)]
}
# subdivide bins
binAreas <- bAreas
binHeights <- c(0, binAreas/(e[2:(L+1)]-e[1:L]), 0)
binEdges <- e
for(i in 1:depth){
L <- length(binEdges)
binDiffs <- binHeights[2:(L + 1)] - binHeights[1:L]
newbinHeights <- numeric(3 * (L - 1))
binWidths <- binEdges[2:L] - binEdges[1:(L-1)]
binEdges2 <- binEdges[1:(L - 1)] + binWidths * eps3
binEdges3 <- binEdges[2:L] - binWidths * eps3
binEdges <- sort(c(binEdges, binEdges2, binEdges3), decreasing = FALSE)
for(j in 1:(L-1)) {
new_middlebin_height <- (((binDiffs[j] - binDiffs[(j + 1)]) * eps1 * eps3) / eps2) + binHeights[j + 1]
if(new_middlebin_height>0)
{
newbinHeights[(3 * j - 2)] <- binHeights[j + 1] - binDiffs[j] * eps1
newbinHeights[(3 * j)] <- binHeights[(j + 1)] + binDiffs[(j + 1)] * eps1
newbinHeights[(3 * j - 1)] <- new_middlebin_height
}
else # don't shift when middle bin would go negative
{
newbinHeights[(3*j-2)] <- binHeights[(j+1)]
newbinHeights[(3*j)] <- binHeights[(j+1)]
newbinHeights[(3*j-1)] <- binHeights[(j+1)]
}
}
binHeights <- c(0, newbinHeights, 0)
}
L <- length(binEdges)
binWidths <- binEdges[2:L] - binEdges[1:(L-1)]
binAreas <- binWidths * binHeights[2:L]
cAreas <- cumsum(binAreas)
rsubCDF <- approxfun(binEdges, c(0, cAreas), yleft=0, yright=1, rule=2)
rsubPDF <- stepfun(binEdges, binHeights)
return(list(rsubPDF=rsubPDF, rsubCDF=rsubCDF, E=binEdges[L], shrinkFactor=shrinkFactor))
}

rsubbinsNotail <- function(bEdges, bCounts, m, eps1, eps2, depth) {
eps3 <- (1 - eps2) / 2
L <- length(bCounts)
tot <- sum(bCounts)
p <- c(1, 1 - cumsum(bCounts[1:(L-1)])/tot)
e <- c(0,bEdges[1:(L-1)],0) # preallocate last entry; replace with E later
A <- 0.5*sum((e[2:L]-e[1:(L-1)])*(p[2:L]+p[1:(L-1)]))
shrinkFactor <- 1
shrinkMultiplier <- 0.995
while(m<A) { # binning must be skewed, because supplied mean is too small
e <- e*shrinkMultiplier # shrink the bins
A <- 0.5*sum((e[2:L]-e[1:(L-1)])*(p[2:L]+p[1:(L-1)]))
shrinkFactor <- shrinkFactor*shrinkMultiplier
}
E <- ifelse(p[L]>0, bEdges[L-1]+2*(m-A)/p[L], bEdges[L-1]*1.001) # make width of final bin small if empty
e[L+1] <- E # this is the right border of the last bin so the mean is preserved
# subdivide bins
binAreas <- bCounts/tot
binHeights <- c(0, binAreas/(e[2:(L+1)]-e[1:L]), 0)
binEdges <- e
for(i in 1:depth){
L <- length(binEdges)
binDiffs <- binHeights[2:(L + 1)] - binHeights[1:L]
newbinHeights <- numeric(3 * (L - 1))
binWidths <- binEdges[2:L] - binEdges[1:(L-1)]
binEdges2 <- binEdges[1:(L - 1)] + binWidths * eps3
binEdges3 <- binEdges[2:L] - binWidths * eps3
binEdges <- sort(c(binEdges, binEdges2, binEdges3), decreasing = FALSE)
for(j in 1:(L-1)) {
new_middlebin_height <- (((binDiffs[j] - binDiffs[(j + 1)]) * eps1 * eps3) / eps2) + binHeights[j + 1]
if(new_middlebin_height>0)
{
newbinHeights[(3 * j - 2)] <- binHeights[j + 1] - binDiffs[j] * eps1
newbinHeights[(3 * j)] <- binHeights[(j + 1)] + binDiffs[(j + 1)] * eps1
newbinHeights[(3 * j - 1)] <- new_middlebin_height
}
else # don't shift when middle bin would go negative
{
newbinHeights[(3*j-2)] <- binHeights[(j+1)]
newbinHeights[(3*j)] <- binHeights[(j+1)]
newbinHeights[(3*j-1)] <- binHeights[(j+1)]
}
}
binHeights <- c(0, newbinHeights, 0)
}
L <- length(binEdges)
binWidths <- binEdges[2:L] - binEdges[1:(L-1)]
binAreas <- binWidths * binHeights[2:L]
cAreas <- cumsum(binAreas)
rsubCDF <- approxfun(binEdges, c(0, cAreas), yleft=0, yright=1, rule=2)
rsubPDF <- stepfun(binEdges, binHeights)
return(list(rsubPDF=rsubPDF, rsubCDF=rsubCDF, E=E, shrinkFactor=shrinkFactor))
}
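
As a quick sanity check (my own sketch with assumed data, not part of the package), the CDF returned by rsubbins() should pass through the cumulative proportions at each original finite bin edge, since the subdivision only redistributes mass within a bin; this holds as long as the supplied mean does not trigger the bin-shrinking branch above.

library(binsmooth)
bEdges  <- c(10000, 20000, 30000, 50000, 100000, NA)
bCounts <- c(15, 27, 40, 51, 30, 7)

rs <- rsubbins(bEdges, bCounts, m = 42000)
cumProps <- cumsum(bCounts) / sum(bCounts)
rs$rsubCDF(bEdges[1:5]) - cumProps[1:5]   # expected to be ~0 at every finite edge
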
67 changes: 67 additions & 0 deletions R/simcounty.R
@@ -0,0 +1,67 @@
simcounty <- function(numCounties, minPop=1000, maxPop=100000,
bin_minimums=c(0,10000,15000,20000,25000,30000,35000,40000,45000,
50000,60000,75000,100000,125000,150000,200000)) {
# Preallocate space for the whole data frames
numBins <- length(bin_minimums)
# variables for county_true data frame
fips <- 100+1:numCounties
mean_true <- numeric(numCounties)
median_true <- numeric(numCounties)
gini_true <- numeric(numCounties)
numRows <- numBins*numCounties
#variables for county_bins data frame
# later: fips <- rep(fips, each=numBins)
households <- numeric(numRows)
bin_min <- rep(bin_minimums, numCounties)
bin_max <- rep(c(bin_minimums[2:numBins]-1,NA), numCounties)
county <- character(numRows)
state <- rep(" Simulated", numRows)
dSamps <- list(numeric(maxPop), numeric(maxPop), numeric(maxPop)) # 3 samples for each county
distributions <- c("lognormal", "triangle", "weibull", "gamma")
for(countyI in 1:numCounties) {
distsToUse <- sample(distributions, 3, replace=TRUE)
i <- 1
for(d in distsToUse)
{
sSize <- sample(maxPop,1)+minPop # sample size between minPop+1 and minPop+maxPop
switch(d,
lognormal = {
lnMu <- sample(c(10.5,10.6,10.7,10.8,10.9),1)
lnSigma <- sample(c(0.6,0.7,0.8),1)
dSamps[[i]] <- rlnorm(sSize, lnMu, lnSigma)
},
triangle = {
triB <- sample(c(300000, 400000, 500000),1)
triC <- sample(c(10000, 15000, 20000, 25000),1)
dSamps[[i]] <- triangle::rtriangle(sSize,a=0,b=triB,c=triC)
},
weibull = {
weibA <- sample(c(1.4, 1.5, 1.6, 1.7),1)
weibB <- sample(c(50000, 60000, 70000, 80000),1)
dSamps[[i]] <- rweibull(sSize,weibA, weibB)
},
gamma = {
gammaA <- sample(seq(1,3,by=0.2),1)
dSamps[[i]] <- rgamma(sSize,gammaA,gammaA/50000)
}
)
i <- i+1
}
simPopSamp <- unlist(dSamps)
households[(1+(countyI-1)*numBins):(numBins*countyI)] <- c(vapply(1:(numBins-1),
function(x) {sum(simPopSamp >= bin_minimums[x] & simPopSamp < bin_minimums[x+1])},
numeric(1)),
sum(simPopSamp >= bin_minimums[numBins])) # top bin: households at or above the last bin minimum
county[(1+(countyI-1)*numBins):(numBins*countyI)] <- rep(paste(distsToUse, collapse=" "), numBins)
mean_true[countyI] <- mean(simPopSamp)
median_true[countyI] <- median(simPopSamp)
gini_true[countyI] <- ineq::Gini(simPopSamp, corr=TRUE)
} #end of for countyI loop
county_true <- data.frame(fips, mean_true, median_true, gini_true)
fips <- rep(fips, each=numBins)
county_bins <- data.frame(fips,households,bin_min,bin_max,county,state)
return(list(county_bins=county_bins, county_true=county_true))
}
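
A hypothetical round trip (assumed workflow, not taken from the package documentation): simulate a few counties with simcounty(), rebuild the bin edges for one county from its county_bins rows, and check how closely splinebins() reproduces the simulated mean when it is supplied as m.

library(binsmooth)
set.seed(1)
sim <- simcounty(5)
cb1 <- subset(sim$county_bins, fips == 101)   # first simulated county

bEdges  <- c(cb1$bin_min[-1], NA)   # finite upper edges; NA marks the open top bin
bCounts <- cb1$households
m_true  <- sim$county_true$mean_true[sim$county_true$fips == 101]

sb <- splinebins(bEdges, bCounts, m = m_true)
c(supplied = m_true, recovered = sb$est_mean)
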
53 changes: 53 additions & 0 deletions R/splinebins.R
@@ -0,0 +1,53 @@
splinebins <- function(bEdges, bCounts, m=NULL, numIterations=16, monoMethod=c("hyman", "monoH.FC")) {
monoMethod <- match.arg(monoMethod)
L <- length(bCounts)
tot <- sum(bCounts)
if(is.null(m)) # no mean provided, so estimate one from bin midpoints
m <- sum(0.5*(c(bEdges[1:(L-1)],2.0*bEdges[L-1])+c(0, bEdges[1:(L-1)]))*bCounts/tot) # top bin treated as ending at 2*bEdges[L-1]
sumbAreas <- vapply(1:L, function(i) {sum(bCounts[1:i])/tot}, numeric(1))
tailEnd <- 1.05*bEdges[L-1] # start with a really short tail
x <- c(0, bEdges[1:(L-1)], tailEnd, tailEnd*1.01)
y <- c(0, sumbAreas, 1) # The last x,y pair forces smoothness at x=tailEnd
f <- splinefun(x,y, method=monoMethod)
est_mean <- tailEnd - pracma::integral(f, 0, tailEnd)
shrinkFactor <- 1
shrinkMultiplier <- 0.995
while(est_mean > m) { # binning must be skewed, because supplied mean is too small
x <- x*shrinkMultiplier # shrink the bins
tailEnd <- tailEnd*shrinkMultiplier
f <- splinefun(x,y, method=monoMethod)
est_mean <- tailEnd - pracma::integral(f, 0, tailEnd)
shrinkFactor <- shrinkFactor*shrinkMultiplier
}
if(bCounts[L] > 0) {
# search for tail length
while(est_mean < m) {
tailEnd <- tailEnd * 2
x[L+1] <- tailEnd
x[L+2] <- tailEnd*1.01
f <- splinefun(x,y, method=monoMethod)
est_mean <- tailEnd - pracma::integral(f, 0, tailEnd)
}
# now the correct tail end will be between bEdges[L-1] and tailEnd
l <- bEdges[L-1]
r <- tailEnd
for(i in 1:numIterations) {
tailEnd <- (l+r)/2
x[L+1] <- tailEnd
x[L+2] <- tailEnd*1.01
f <- splinefun(x,y, method=monoMethod)
est_mean <- tailEnd - pracma::integral(f, 0, tailEnd)
if(est_mean > m)
r <- tailEnd
else
l <- tailEnd
}
}
splineCDF <- function(x){
ifelse(x<0, 0, ifelse(x>tailEnd, 1, f(x)))
}
splinePDF <- function(x){
ifelse(x<0 | x>tailEnd, 0, f(x,deriv=1))
}
return(list(splinePDF=splinePDF, splineCDF=splineCDF, E=tailEnd, est_mean=est_mean, shrinkFactor=shrinkFactor))
}
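
One more small sketch (my own, reusing the assumed data from above): because splinebins() returns its CDF as an ordinary R closure, summary statistics such as the median can be read off by numerical inversion with stats::uniroot().

library(binsmooth)
bEdges  <- c(10000, 20000, 30000, 50000, 100000, NA)
bCounts <- c(15, 27, 40, 51, 30, 7)
sb <- splinebins(bEdges, bCounts, m = 42000)

# Invert the smoothed CDF on [0, E] to estimate the median.
uniroot(function(x) sb$splineCDF(x) - 0.5, lower = 0, upper = sb$E)$root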
