version 0.2.0

cran · May 31, 2019 · bd0c870 · bd0c870
1 parent aec1ca4
commit bd0c870
Show file tree

Hide file tree

Showing 19 changed files with 341 additions and 36 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,22 +1,27 @@
 Package: binsmooth
 Type: Package
 Title: Generate PDFs and CDFs from Binned Data
-Version: 0.1.0
+Version: 0.2.0
 Author: David J. Hunter and McKalie Drown
 Maintainer: Dave Hunter <dhunter@westmont.edu>
 Description: Provides several methods for generating density functions
-    based on binned data. Data are assumed to be nonnegative, but the bin widths
-    need not be uniform, and the top bin may be unbounded. All PDF smoothing methods
-    maintain the areas specified by the binned data. (Equivalently, all CDF
-    smoothing methods interpolate the points specified by the binned data.) An
-    estimate for the mean of the distribution may be supplied as an optional
-    argument, which greatly improves the reliability of statistics computed from
-    the smoothed density functions. Methods include step function, recursive
-    subdivision, and optimized spline.
+    based on binned data. Methods include step function, recursive
+    subdivision, and optimized spline. Data are assumed to be nonnegative, 
+    but the bin widths need not be equal, and the top bin need not have an 
+    upper bound. All PDF smoothing methods maintain the areas specified by 
+    the binned data. (Equivalently, all CDF smoothing methods interpolate 
+    the points specified by the binned data.) An estimate for the mean of 
+    the distribution may be supplied as an optional argument, which greatly 
+    improves the reliability of statistics computed from the smoothed density 
+    functions. Includes methods for estimating the Gini coefficient, the 
+    Theil index, percentiles, and random deviates from a smoothed 
+    distribution. Among the three methods, the optimized spline (splinebins) 
+    is recommended for most purposes. The percentile and random-draw 
+    functions only support splinebins. 
 License: MIT + file LICENSE
 Imports: stats, pracma, ineq, triangle
 LazyData: TRUE
 NeedsCompilation: no
-Packaged: 2016-08-12 14:09:50 UTC; dhunter
+Packaged: 2019-05-31 16:25:17 UTC; dhunter
 Repository: CRAN
-Date/Publication: 2016-08-12 16:46:49
+Date/Publication: 2019-05-31 22:11:49 UTC
diff --git a/MD5 b/MD5
@@ -1,15 +1,26 @@
-29cbf0aaefa92ed1f2b8339fa9724e0e *DESCRIPTION
+0555bf8a3c2237d7c21ef858daa8c14d *DESCRIPTION
 11fcc18229d1926590c9d08f219c5132 *LICENSE
 1e32880d420021b43570b02ebd8ee747 *NAMESPACE
+697bb12c10759a0ff85806e074114758 *NEWS
+0b08e66cda3f9eab71fa02616c2ae78a *R/gini.R
 718b18f72622950c7b2295e16059284c *R/rsubbins.R
-0b0f11381b6ae0594fec5136eeecb050 *R/simcounty.R
-98fd59c90b830afaf4e0d9c7b17aa00d *R/splinebins.R
+d83f9955b6554f52406efffe1ff50d98 *R/sb_percentiles.R
+d75b32130119bef303a08018516b08de *R/sb_sample.R
+bc0b94f27b3c257f31e3cd997eb14089 *R/simcounty.R
+f72652d10bcb9011168986090a14c06b *R/splinebins.R
+93e8b451cc3e732d3e003fac2adfe38b *R/stats_from_distribution.R
 623bfeaff9a308954638e90d80276b18 *R/stepbins.R
+6644434bb4789dad83ffb90467f932fd *R/theil.R
 5237fa31e1d511ca7ead04d6f771f08c *data/county_bins.rda
 47e7d5dc78ca0d9cbd9c8d105b2b2090 *data/county_true.rda
 d668f119a683e0b3b0ab2a3da9517394 *man/county_bins.Rd
 84a042f94329da7f1240919fd887bb33 *man/county_true.Rd
-143c3aea13378d084527eb3398a3bce2 *man/rsubbins.Rd
-99495f9813d6057a0ac352e8ab92187d *man/simcounty.Rd
-bd92a9fcc194aaaf34d52dba32bd68b5 *man/splinebins.Rd
-c66af3ca0a5304d0fdea1a54229ca5da *man/stepbins.Rd
+df3c9823236c02b29a842405c923a8e1 *man/gini.Rd
+fe2bd370b9ea7ef94dea637644727966 *man/rsubbins.Rd
+5cca6bc445c879f14bfda203f286f326 *man/sb_percentiles.Rd
+a9f32c0206986b937d7dae2e95012267 *man/sb_sample.Rd
+da111c9bd47b60725cb440b6995188fe *man/simcounty.Rd
+3f8447b7e5c73e431bdf5721ea32eba8 *man/splinebins.Rd
+00d834fa433ec1b0ee440c0a685f50f6 *man/stats_from_distribution.Rd
+b9f8d416f41acaa1d2857b48f6dc510d *man/stepbins.Rd
+de732e89ee60ab0d1f12c24766220b8c *man/theil.Rd
diff --git a/NEWS b/NEWS
@@ -0,0 +1,16 @@
+Changes in Version 0.2.0
+========================
+
+New features:
+
+* Added functions to compute the Gini coefficient and the Theil index from the smoothed distributions, along with other descriptive statistics.
+* Added Theil index to simulated county_true data.
+* Added inverse CDF to the list that splinebins returns.
+* Added functions for computing percentiles and random samples from a splinebins fit.
+* Added NEWS file.
+
+Updates:
+
+* Updated references to the paper in Sociological Science: https://www.sociologicalscience.com/articles-v4-26-641/
+* Updated documentation.
+* Fixed typo in bincounts for Cook County in documentation.
diff --git a/R/gini.R b/R/gini.R
@@ -0,0 +1,6 @@
+gini <- function(binFit) {
+  CDF <- binFit[[2]]
+  E <- binFit[[3]]
+  cdf_mean <- E - pracma::integral(CDF, 0, E)
+  return(1-pracma::integral(function(x){(1-CDF(x))^2}, 0, E)/cdf_mean)
+}
diff --git a/R/sb_percentiles.R b/R/sb_percentiles.R
@@ -0,0 +1,6 @@
+sb_percentiles <- function(splinebinFit, p = seq(0,100,25)) {
+  iCDF <- splinebinFit$splineInvCDF
+  percentiles <- iCDF(p/100)
+  names(percentiles) <- paste0(p, "%")
+  return(percentiles)
+}
diff --git a/R/sb_sample.R b/R/sb_sample.R
@@ -0,0 +1,4 @@
+sb_sample <- function(splinebinFit, n = 1) {
+  iCDF <- splinebinFit$splineInvCDF
+  return(iCDF(stats::runif(n)))
+}
diff --git a/R/simcounty.R b/R/simcounty.R
@@ -8,6 +8,7 @@ simcounty <- function(numCounties, minPop=1000, maxPop=100000,
   mean_true <- numeric(numCounties)
   median_true <- numeric(numCounties)
   gini_true <- numeric(numCounties)
+  theil_true <- numeric(numCounties)
   numRows <- numBins*numCounties
   #variables for county_bins data frame
   # later: fips <- rep(fips, each=numBins)
@@ -58,9 +59,10 @@ simcounty <- function(numCounties, minPop=1000, maxPop=100000,
     mean_true[countyI] <- mean(simPopSamp)
     median_true[countyI] <- median(simPopSamp)
     gini_true[countyI] <- ineq::Gini(simPopSamp, corr=TRUE)
+    theil_true[countyI] <- ineq::Theil(simPopSamp)
     countyI <- countyI + 1
   } #end of for countyI loop
-  county_true <- data.frame(fips, mean_true, median_true, gini_true)
+  county_true <- data.frame(fips, mean_true, median_true, gini_true, theil_true)
   fips <- rep(fips, each=numBins)
   county_bins <- data.frame(fips,households,bin_min,bin_max,county,state)
   return(list(county_bins=county_bins, county_true=county_true))

diff --git a/R/splinebins.R b/R/splinebins.R
@@ -1,4 +1,4 @@
-splinebins <- function(bEdges, bCounts, m=NULL, numIterations=16, monoMethod=c("hyman", "monoH.FC")) {
+splinebins <- function(bEdges, bCounts, m=NULL, numIterations=16, monoMethod=c("hyman", "monoH.FC"), ipn=200) {
   monoMethod <- match.arg(monoMethod)
   L <- length(bCounts)
   tot <- sum(bCounts)
@@ -43,11 +43,17 @@ splinebins <- function(bEdges, bCounts, m=NULL, numIterations=16, monoMethod=c("
         l <- tailEnd
     }
   }
+  xfix <- seq(0, tailEnd, length.out = ipn) # ADDED in v0.2.0: sample CDF to invert
+  yfix <- f(xfix)
+  finv <- splinefun(yfix, xfix, method=monoMethod) # ADDED in v0.2.0: approximate inverse CDF
   splineCDF <- function(x){
     ifelse(x<0, 0, ifelse(x>tailEnd, 1, f(x)))
   }
   splinePDF <- function(x){
     ifelse(x<0 | x>tailEnd, 0, f(x,deriv=1))
   }
-  return(list(splinePDF=splinePDF, splineCDF=splineCDF, E=tailEnd, est_mean=est_mean, shrinkFactor=shrinkFactor))
+  splineInvCDF <- function(x){ # ADDED in v0.2.0: approximate inverse CDF
+    ifelse(x<0, 0, ifelse(x>1, tailEnd, finv(x)))
+  }
+  return(list(splinePDF=splinePDF, splineCDF=splineCDF, E=tailEnd, est_mean=est_mean, shrinkFactor=shrinkFactor, splineInvCDF=splineInvCDF))
 }
diff --git a/R/stats_from_distribution.R b/R/stats_from_distribution.R
@@ -0,0 +1,12 @@
+stats_from_distribution <- function(binFit) {
+  PDF <- binFit[[1]]
+  CDF <- binFit[[2]]
+  E <- binFit[[3]]
+  cdf_mean <- E - pracma::integral(CDF, 0, E)
+  v <- pracma::integral(function(x){2*x-2*x*CDF(x)}, 0 ,E) - cdf_mean^2
+  g <- 1-pracma::integral(function(x){(1-CDF(x))^2}, 0, E)/cdf_mean
+  t <- pracma::integral(function(x){PDF(x)*x/cdf_mean*log(x/cdf_mean)}, 0, E)
+  statistics <- c(cdf_mean, v, sqrt(v), g, t)
+  names(statistics) <- c("mean", "variance", "SD", "Gini", "Theil")
+  return(statistics)
+}
diff --git a/R/theil.R b/R/theil.R
@@ -0,0 +1,7 @@
+theil <- function(binFit) {
+  PDF <- binFit[[1]]
+  CDF <- binFit[[2]]
+  E <- binFit[[3]]
+  cdf_mean <- E - pracma::integral(CDF, 0, E)
+  return(pracma::integral(function(x){PDF(x)*x/cdf_mean*log(x/cdf_mean)}, 0, E))
+}
diff --git a/man/gini.Rd b/man/gini.Rd
@@ -0,0 +1,43 @@
+\name{gini}
+\alias{gini}
+\title{
+Estimate the Gini coefficient
+}
+\description{
+Estimates the Gini coefficient from a smoothed distribution.
+}
+\usage{
+gini(binFit)
+}
+\arguments{
+  \item{binFit}{
+A list as returned by \code{\link{splinebins}}, \code{\link{stepbins}}, or \code{\link{rsubbins}}. (Alternatively, any list whose first three elements are, in this order, a PDF with non-negative support, its CDF, and an upper bound for the support of the PDF.)
+}
+}
+\details{
+For distributions of non-negative support, the Gini coefficient can be computed from a cumulative distribution function \eqn{F(x)} by the integral
+\deqn{G = 1 - \frac{1}{\mu}\int_0^\infty (1-F(x))^2 \, dx}
+where \eqn{\mu} is the mean of the distribution.
+}
+\value{
+Returns the Gini coefficient \eqn{G}.
+}
+\references{
+Paul T. von Hippel, David J. Hunter, McKalie Drown. \emph{Better Estimates from Binned Income Data: Interpolated CDFs and Mean-Matching}, Sociological Science, November 15, 2017. \url{https://www.sociologicalscience.com/articles-v4-26-641/}
+}
+\author{
+David J. Hunter and McKalie Drown
+}
+
+\examples{
+# 2005 ACS data from Cook County, Illinois
+binedges <- c(10000,15000,20000,25000,30000,35000,40000,45000,
+              50000,60000,75000,100000,125000,150000,200000,NA)
+bincounts <- c(157532,97369,102673,100888,90835,94191,87688,90481,
+               79816,153581,195430,240948,155139,94527,92166,103217)
+stepfit <- stepbins(binedges, bincounts, 76091)
+splinefit <- splinebins(binedges, bincounts, 76091)
+gini(stepfit)
+gini(splinefit) # More accurate
+}
+
diff --git a/man/rsubbins.Rd b/man/rsubbins.Rd
@@ -52,6 +52,8 @@ The decay ratio for the tail bins. Ignored unless \code{tailShape} equals \code{
 }
 \details{
 First, a step function PDF is created, as described in \code{\link{stepbins}}. The bins of the resulting PDF are then recursively subdivided and shifted in a manner that preserves the area of the original bins, resulting in a step function with finer bins.
+
+The methods \code{\link{stepbins}} and \code{\link{rsubbins}} are included in this package mainly for the purpose of comparison. For most use cases, \code{\link{splinebins}} will produce more accurate smoothing results.
 }
 \value{
 Returns a list with the following components.
@@ -61,7 +63,7 @@ Returns a list with the following components.
 \item{shrinkFactor}{If the supplied estimate for the mean is too small to be fitted with a step function, the bin edges will be scaled by \code{shrinkFactor}, which will be chosen less than (and close to) 1.}
 }
 \references{
-Hunter, D., Drown, M., and von Hippel, P. (2016) \emph{Optimized smoothing techniques for binned data}, in preparation.
+Paul T. von Hippel, David J. Hunter, McKalie Drown. \emph{Better Estimates from Binned Income Data: Interpolated CDFs and Mean-Matching}, Sociological Science, November 15, 2017. \url{https://www.sociologicalscience.com/articles-v4-26-641/}
 }
 \author{
 David J. Hunter and McKalie Drown
@@ -72,7 +74,7 @@ David J. Hunter and McKalie Drown
 binedges <- c(10000,15000,20000,25000,30000,35000,40000,45000,
               50000,60000,75000,100000,125000,150000,200000,NA)
 bincounts <- c(157532,97369,102673,100888,90835,94191,87688,90481,
-               79816,153581,195430,240948,155139,9452,92166,103217)
+               79816,153581,195430,240948,155139,94527,92166,103217)
 rsb <- rsubbins(binedges, bincounts, 76091, tailShape="pareto")
 
 plot(rsb$rsubPDF, do.points=FALSE)

diff --git a/man/sb_percentiles.Rd b/man/sb_percentiles.Rd
@@ -0,0 +1,43 @@
+\name{sb_percentiles}
+\alias{sb_percentiles}
+\title{
+Estimate percentiles from splinebins
+}
+\description{
+Estimates percentiles of a smoothed distribution obtained using \code{\link{splinebins}}.
+}
+\usage{
+sb_percentiles(splinebinFit, p = seq(0,100,25))
+}
+\arguments{
+  \item{splinebinFit}{
+A list as returned by \code{\link{splinebins}}.
+}
+  \item{p}{
+A vector of percentages in the range \eqn{0 \le p \le 100}.
+}
+}
+\details{
+The approximate inverse of the CDF calculated by \code{\link{splinebins}} is used to approximate percentiles of the smoothed distribution.
+}
+\value{
+A vector of percentiles.
+}
+\references{
+Paul T. von Hippel, David J. Hunter, McKalie Drown. \emph{Better Estimates from Binned Income Data: Interpolated CDFs and Mean-Matching}, Sociological Science, November 15, 2017. \url{https://www.sociologicalscience.com/articles-v4-26-641/}
+}
+\author{
+David J. Hunter and McKalie Drown
+}
+
+\examples{
+# 2005 ACS data from Cook County, Illinois
+binedges <- c(10000,15000,20000,25000,30000,35000,40000,45000,
+              50000,60000,75000,100000,125000,150000,200000,NA)
+bincounts <- c(157532,97369,102673,100888,90835,94191,87688,90481,
+               79816,153581,195430,240948,155139,94527,92166,103217)
+splinefit <- splinebins(binedges, bincounts, 76091)
+sb_percentiles(splinefit)
+sb_percentiles(splinefit, c(27, 32, 93))
+}
+
diff --git a/man/sb_sample.Rd b/man/sb_sample.Rd
@@ -0,0 +1,43 @@
+\name{sb_sample}
+\alias{sb_sample}
+\title{
+Random sample from splinebins distribution
+}
+\description{
+Draw a random sample of points from a smoothed distribution obtained using \code{\link{splinebins}}.
+}
+\usage{
+sb_sample(splinebinFit, n = 1)
+}
+\arguments{
+  \item{splinebinFit}{
+A list as returned by \code{\link{splinebins}}.
+}
+  \item{n}{
+A positive integer giving the sample size.
+}
+}
+\details{
+The approximate inverse of the CDF calculated by \code{\link{splinebins}} is applied to uniform random numbers between 0 and 1 to generate random values from the smoothed distribution.
+}
+\value{
+A vector of random deviates.
+}
+\references{
+Paul T. von Hippel, David J. Hunter, McKalie Drown. \emph{Better Estimates from Binned Income Data: Interpolated CDFs and Mean-Matching}, Sociological Science, November 15, 2017. \url{https://www.sociologicalscience.com/articles-v4-26-641/}
+}
+\author{
+David J. Hunter and McKalie Drown
+}
+
+\examples{
+# 2005 ACS data from Cook County, Illinois
+binedges <- c(10000,15000,20000,25000,30000,35000,40000,45000,
+              50000,60000,75000,100000,125000,150000,200000,NA)
+bincounts <- c(157532,97369,102673,100888,90835,94191,87688,90481,
+               79816,153581,195430,240948,155139,94527,92166,103217)
+splinefit <- splinebins(binedges, bincounts, 76091)
+sb_sample(splinefit, 5)
+hist(sb_sample(splinefit, 3000))
+}
+
diff --git a/man/simcounty.Rd b/man/simcounty.Rd
@@ -36,7 +36,7 @@ Returns a list of two data frames:
 \item{county_true}{Statistics computed from the raw data}
 }
 \references{
-Hunter, D., Drown, M., and von Hippel, P. (2016) \emph{Optimized smoothing techniques for binned data}, in preparation.
+Paul T. von Hippel, David J. Hunter, McKalie Drown. \emph{Better Estimates from Binned Income Data: Interpolated CDFs and Mean-Matching}, Sociological Science, November 15, 2017. \url{https://www.sociologicalscience.com/articles-v4-26-641/}
 }
 \author{
 David J. Hunter and McKalie Drown
@@ -45,13 +45,22 @@ David J. Hunter and McKalie Drown
 \code{\link{county_bins}}, \code{\link{county_true}}
 }
 \examples{
-l <- simcounty(5)
-cb <- l$county_bins
-ct <- l$county_true
+l1 <- simcounty(5)
+cb <- l1$county_bins
+ct <- l1$county_true
 sbl <- splinebins(cb$bin_max[cb$fips==103], cb$households[cb$fips==103],
                   ct$mean_true[ct$fips==103])
 stl <- stepbins(cb$bin_max[cb$fips==105], cb$households[cb$fips==105],
                 ct$mean_true[ct$fips==105])
 plot(sbl$splinePDF, 0, 300000, n=500)
 plot(stl$stepPDF, do.points=FALSE, main=cb$county[cb$fips==105][1])
+
+## Simulate one county, estimate the Gini coefficient and Theil index from the binned data, and compare with the true values
+l2 <- simcounty(1)
+binedges <- l2$county_bins$bin_max + 0.5 # continuity correction
+bincounts <- l2$county_bins$households
+splinefit <- splinebins(binedges, bincounts, l2$county_true$mean_true)
+gini(splinefit)
+theil(splinefit)
+l2$county_true
 }