Commit aec1ca4 (0 parents): version 0.1.0
djhunter authored and cran-robot committed Aug 12, 2016
Showing 16 changed files with 771 additions and 0 deletions.
22 changes: 22 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,22 @@
Package: binsmooth
Type: Package
Title: Generate PDFs and CDFs from Binned Data
Version: 0.1.0
Author: David J. Hunter and McKalie Drown
Maintainer: Dave Hunter <dhunter@westmont.edu>
Description: Provides several methods for generating density functions
based on binned data. Data are assumed to be nonnegative, but the bin widths
need not be uniform, and the top bin may be unbounded. All PDF smoothing methods
maintain the areas specified by the binned data. (Equivalently, all CDF
smoothing methods interpolate the points specified by the binned data.) An
estimate for the mean of the distribution may be supplied as an optional
argument, which greatly improves the reliability of statistics computed from
the smoothed density functions. Methods include step function, recursive
subdivision, and optimized spline.
License: MIT + file LICENSE
Imports: stats, pracma, ineq, triangle
LazyData: TRUE
NeedsCompilation: no
Packaged: 2016-08-12 14:09:50 UTC; dhunter
Repository: CRAN
Date/Publication: 2016-08-12 16:46:49
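
The three methods named in the Description (step function, recursive subdivision, and optimized spline) correspond to stepbins(), rsubbins(), and splinebins() below. The following is a minimal usage sketch with made-up data; the bin-edge convention (upper edges in bEdges, with NA for the unbounded top bin) is inferred from the R sources in this commit rather than quoted from the documentation.

# Hypothetical binned data: upper bin edges (NA = unbounded top bin) and counts.
library(binsmooth)
bEdges  <- c(10000, 20000, 30000, 50000, 100000, NA)
bCounts <- c(15, 27, 40, 51, 30, 7)

sb <- splinebins(bEdges, bCounts, m = 42000)   # m is an optional mean estimate
sb$splineCDF(30000)    # smoothed estimate of the proportion at or below 30000
sb$est_mean            # mean implied by the fitted spline

rs <- rsubbins(bEdges, bCounts, m = 42000)
rs$rsubPDF(25000)      # recursive-subdivision density evaluated at 25000
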
2 changes: 2 additions & 0 deletions LICENSE
@@ -0,0 +1,2 @@
YEAR: 2016
COPYRIGHT HOLDER: David J. Hunter and McKalie Drown
15 changes: 15 additions & 0 deletions MD5
@@ -0,0 +1,15 @@
29cbf0aaefa92ed1f2b8339fa9724e0e *DESCRIPTION
11fcc18229d1926590c9d08f219c5132 *LICENSE
1e32880d420021b43570b02ebd8ee747 *NAMESPACE
718b18f72622950c7b2295e16059284c *R/rsubbins.R
0b0f11381b6ae0594fec5136eeecb050 *R/simcounty.R
98fd59c90b830afaf4e0d9c7b17aa00d *R/splinebins.R
623bfeaff9a308954638e90d80276b18 *R/stepbins.R
5237fa31e1d511ca7ead04d6f771f08c *data/county_bins.rda
47e7d5dc78ca0d9cbd9c8d105b2b2090 *data/county_true.rda
d668f119a683e0b3b0ab2a3da9517394 *man/county_bins.Rd
84a042f94329da7f1240919fd887bb33 *man/county_true.Rd
143c3aea13378d084527eb3398a3bce2 *man/rsubbins.Rd
99495f9813d6057a0ac352e8ab92187d *man/simcounty.Rd
bd92a9fcc194aaaf34d52dba32bd68b5 *man/splinebins.Rd
c66af3ca0a5304d0fdea1a54229ca5da *man/stepbins.Rd
4 changes: 4 additions & 0 deletions NAMESPACE
@@ -0,0 +1,4 @@
exportPattern("^[[:alpha:]]+")
importFrom("stats", "approxfun", "stepfun", "splinefun", "rlnorm", "rweibull", "rgamma", "median")
importFrom("ineq", "Gini")
importFrom("triangle", "rtriangle")
158 changes: 158 additions & 0 deletions R/rsubbins.R
@@ -0,0 +1,158 @@
rsubbins <- function(bEdges, bCounts, m=NULL, eps1 = 0.25, eps2 = 0.75, depth = 3,
tailShape = c("onebin", "pareto", "exponential"),
nTail=16, numIterations=20, pIndex=1.160964, tbRatio=0.8) {
if(is.null(m)) { # no mean provided, so estimate one from bin midpoints
L <- length(bCounts)
m <- sum(0.5*(c(bEdges[1:(L-1)],2.0*bEdges[L-1])+c(0, bEdges[1:(L-1)]))*bCounts/sum(bCounts)) # top bin treated as ending at 2*bEdges[L-1]
}
tailShape <- match.arg(tailShape)
if(tailShape == "onebin")
rsubbinsNotail(bEdges, bCounts, m, eps1, eps2, depth)
else
rsubbinsTail(bEdges, bCounts, m, eps1, eps2, depth, tailShape, nTail, numIterations, pIndex, tbRatio)
}

rsubbinsTail <- function(bEdges, bCounts, m, eps1, eps2, depth,
tailShape = c("pareto", "exponential"),
nTail, numIterations, pIndex, tbRatio) {
tailShape <- match.arg(tailShape)
eps3 <- (1 - eps2) / 2
L <- length(bCounts)
tot <- sum(bCounts)
e <- c(0,bEdges[1:(L-1)],numeric(nTail)) # preallocate tail edges
shrinkFactor <- 1
shrinkMultiplier <- 0.995
tailCount <- bCounts[L]
bbtot <- tot-tailCount
bbMean <- sum((e[2:(L)]+e[1:(L-1)])*bCounts[1:(L-1)])/(2*bbtot) # mean of bounded bins as a step function
while(m<bbMean) { # binning must be skewed, because supplied mean is less than mean of bounded bins
e <- e*shrinkMultiplier # shrink the bins
bbMean <- sum((e[2:(L)]+e[1:(L-1)])*bCounts[1:(L-1)])/(2*bbtot)
shrinkFactor <- shrinkFactor*shrinkMultiplier
}
if(tailCount>0) {
# there is remaining area in final bin, so make a tail of bins
L <- L+nTail-1
tailArea <- tailCount/tot
bAreas <- c(bCounts/tot, numeric(nTail-1))
tbWidth <- e[L-nTail+1]-e[L-nTail] # initial tail bin width = width of last bounded bin
h <- bAreas[L-nTail]/tbWidth # height of last bounded bin
if(tailShape=="pareto")
tailUnscaled <- (1:nTail)^(-1-pIndex)
else
tailUnscaled <- tbRatio^(1:nTail)
bAreas[(L-nTail+1):(L)] <- tailUnscaled/sum(tailUnscaled)*tailArea
repeat {
e[(L-nTail+2):(L+1)] <- e[L-nTail+1]+(1:(nTail))*tbWidth
bMean <- sum((e[2:(L+1)]+e[1:L])*bAreas)/2
if(bMean>m) break # now the correct tbWidth is between 0 and tbWidth
tbWidth <- tbWidth*2
}
l <- 0
r <- tbWidth
for(i in 1:numIterations) {
tbWidth <- (l+r)/2
e[(L-nTail+2):(L+1)] <- e[L-nTail+1]+(1:(nTail))*tbWidth
bMean <- sum((e[2:(L+1)]+e[1:L])*bAreas)/2
if (bMean<m) # bin mean is too small; lengthen bins
l <- tbWidth
else
r <- tbWidth
}
}
else { # final bin is empty, so get rid of it
L <- L-1
bAreas <- bCounts[1:L]/tot
e <- e[1:(L+1)]
}
# subdivide bins
binAreas <- bAreas
binHeights <- c(0, binAreas/(e[2:(L+1)]-e[1:L]), 0)
binEdges <- e
for(i in 1:depth){
L <- length(binEdges)
binDiffs <- binHeights[2:(L + 1)] - binHeights[1:L]
newbinHeights <- numeric(3 * (L - 1))
binWidths <- binEdges[2:L] - binEdges[1:(L-1)]
binEdges2 <- binEdges[1:(L - 1)] + binWidths * eps3
binEdges3 <- binEdges[2:L] - binWidths * eps3
binEdges <- sort(c(binEdges, binEdges2, binEdges3), decreasing = FALSE)
for(j in 1:(L-1)) {
new_middlebin_height <- (((binDiffs[j] - binDiffs[(j + 1)]) * eps1 * eps3) / eps2) + binHeights[j + 1]
if(new_middlebin_height>0)
{
newbinHeights[(3 * j - 2)] <- binHeights[j + 1] - binDiffs[j] * eps1
newbinHeights[(3 * j)] <- binHeights[(j + 1)] + binDiffs[(j + 1)] * eps1
newbinHeights[(3 * j - 1)] <- new_middlebin_height
}
else # don't shift when middle bin would go negative
{
newbinHeights[(3*j-2)] <- binHeights[(j+1)]
newbinHeights[(3*j)] <- binHeights[(j+1)]
newbinHeights[(3*j-1)] <- binHeights[(j+1)]
}
}
binHeights <- c(0, newbinHeights, 0)
}
L <- length(binEdges)
binWidths <- binEdges[2:L] - binEdges[1:(L-1)]
binAreas <- binWidths * binHeights[2:L]
cAreas <- cumsum(binAreas)
rsubCDF <- approxfun(binEdges, c(0, cAreas), yleft=0, yright=1, rule=2)
rsubPDF <- stepfun(binEdges, binHeights)
return(list(rsubPDF=rsubPDF, rsubCDF=rsubCDF, E=binEdges[L], shrinkFactor=shrinkFactor))
}

rsubbinsNotail <- function(bEdges, bCounts, m, eps1, eps2, depth) {
eps3 <- (1 - eps2) / 2
L <- length(bCounts)
tot <- sum(bCounts)
p <- c(1, 1 - cumsum(bCounts[1:(L-1)])/tot)
e <- c(0,bEdges[1:(L-1)],0) # preallocate last entry; replace with E later
A <- 0.5*sum((e[2:L]-e[1:(L-1)])*(p[2:L]+p[1:(L-1)]))
shrinkFactor <- 1
shrinkMultiplier <- 0.995
while(m<A) { # binning must be skewed, because supplied mean is too small
e <- e*shrinkMultiplier # shrink the bins
A <- 0.5*sum((e[2:L]-e[1:(L-1)])*(p[2:L]+p[1:(L-1)]))
shrinkFactor <- shrinkFactor*shrinkMultiplier
}
E <- ifelse(p[L]>0, bEdges[L-1]+2*(m-A)/p[L], bEdges[L-1]*1.001) # make width of final bin small if empty
e[L+1] <- E # this is the right border of the last bin so the mean is preserved
# subdivide bins
binAreas <- bCounts/tot
binHeights <- c(0, binAreas/(e[2:(L+1)]-e[1:L]), 0)
binEdges <- e
for(i in 1:depth){
L <- length(binEdges)
binDiffs <- binHeights[2:(L + 1)] - binHeights[1:L]
newbinHeights <- numeric(3 * (L - 1))
binWidths <- binEdges[2:L] - binEdges[1:(L-1)]
binEdges2 <- binEdges[1:(L - 1)] + binWidths * eps3
binEdges3 <- binEdges[2:L] - binWidths * eps3
binEdges <- sort(c(binEdges, binEdges2, binEdges3), decreasing = FALSE)
for(j in 1:(L-1)) {
new_middlebin_height <- (((binDiffs[j] - binDiffs[(j + 1)]) * eps1 * eps3) / eps2) + binHeights[j + 1]
if(new_middlebin_height>0)
{
newbinHeights[(3 * j - 2)] <- binHeights[j + 1] - binDiffs[j] * eps1
newbinHeights[(3 * j)] <- binHeights[(j + 1)] + binDiffs[(j + 1)] * eps1
newbinHeights[(3 * j - 1)] <- new_middlebin_height
}
else # don't shift when middle bin would go negative
{
newbinHeights[(3*j-2)] <- binHeights[(j+1)]
newbinHeights[(3*j)] <- binHeights[(j+1)]
newbinHeights[(3*j-1)] <- binHeights[(j+1)]
}
}
binHeights <- c(0, newbinHeights, 0)
}
L <- length(binEdges)
binWidths <- binEdges[2:L] - binEdges[1:(L-1)]
binAreas <- binWidths * binHeights[2:L]
cAreas <- cumsum(binAreas)
rsubCDF <- approxfun(binEdges, c(0, cAreas), yleft=0, yright=1, rule=2)
rsubPDF <- stepfun(binEdges, binHeights)
return(list(rsubPDF=rsubPDF, rsubCDF=rsubCDF, E=E, shrinkFactor=shrinkFactor))
}
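
As a quick sanity check (my own sketch with assumed data, not part of the package), the CDF returned by rsubbins() should pass through the cumulative proportions at each original finite bin edge, since the subdivision only redistributes mass within a bin; this holds as long as the supplied mean does not trigger the bin-shrinking branch above.

library(binsmooth)
bEdges  <- c(10000, 20000, 30000, 50000, 100000, NA)
bCounts <- c(15, 27, 40, 51, 30, 7)

rs <- rsubbins(bEdges, bCounts, m = 42000)
cumProps <- cumsum(bCounts) / sum(bCounts)
rs$rsubCDF(bEdges[1:5]) - cumProps[1:5]   # expected to be ~0 at every finite edge
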
67 changes: 67 additions & 0 deletions R/simcounty.R
@@ -0,0 +1,67 @@
simcounty <- function(numCounties, minPop=1000, maxPop=100000,
bin_minimums=c(0,10000,15000,20000,25000,30000,35000,40000,45000,
50000,60000,75000,100000,125000,150000,200000)) {
# Preallocate space for the whole data frames
numBins <- length(bin_minimums)
# variables for county_true data frame
fips <- 100+1:numCounties
mean_true <- numeric(numCounties)
median_true <- numeric(numCounties)
gini_true <- numeric(numCounties)
numRows <- numBins*numCounties
#variables for county_bins data frame
# later: fips <- rep(fips, each=numBins)
households <- numeric(numRows)
bin_min <- rep(bin_minimums, numCounties)
bin_max <- rep(c(bin_minimums[2:numBins]-1,NA), numCounties)
county <- character(numRows)
state <- rep(" Simulated", numRows)
dSamps <- list(numeric(maxPop), numeric(maxPop), numeric(maxPop)) # 3 samples for each county
distributions <- c("lognormal", "triangle", "weibull", "gamma")
for(countyI in 1:numCounties) {
distsToUse <- sample(distributions, 3, replace=TRUE)
i <- 1
for(d in distsToUse)
{
sSize <- sample(maxPop,1)+minPop # sample size between minPop+1 and minPop+maxPop
switch(d,
lognormal = {
lnMu <- sample(c(10.5,10.6,10.7,10.8,10.9),1)
lnSigma <- sample(c(0.6,0.7,0.8),1)
dSamps[[i]] <- rlnorm(sSize, lnMu, lnSigma)
},
triangle = {
triB <- sample(c(300000, 400000, 500000),1)
triC <- sample(c(10000, 15000, 20000, 25000),1)
dSamps[[i]] <- triangle::rtriangle(sSize,a=0,b=triB,c=triC)
},
weibull = {
weibA <- sample(c(1.4, 1.5, 1.6, 1.7),1)
weibB <- sample(c(50000, 60000, 70000, 80000),1)
dSamps[[i]] <- rweibull(sSize,weibA, weibB)
},
gamma = {
gammaA <- sample(seq(1,3,by=0.2),1)
dSamps[[i]] <- rgamma(sSize,gammaA,gammaA/50000)
}
)
i <- i+1
}
simPopSamp <- unlist(dSamps)
households[(1+(countyI-1)*numBins):(numBins*countyI)] <- c(vapply(1:(numBins-1),
function(x) {sum(simPopSamp >= bin_minimums[x] & simPopSamp < bin_minimums[x+1])},
numeric(1)),
sum(simPopSamp >= bin_minimums[numBins])) # top bin: households at or above the last bin minimum
county[(1+(countyI-1)*numBins):(numBins*countyI)] <- rep(paste(distsToUse, collapse=" "), numBins)
mean_true[countyI] <- mean(simPopSamp)
median_true[countyI] <- median(simPopSamp)
gini_true[countyI] <- ineq::Gini(simPopSamp, corr=TRUE)
} #end of for countyI loop
county_true <- data.frame(fips, mean_true, median_true, gini_true)
fips <- rep(fips, each=numBins)
county_bins <- data.frame(fips,households,bin_min,bin_max,county,state)
return(list(county_bins=county_bins, county_true=county_true))
}
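
A hypothetical round trip (assumed workflow, not taken from the package documentation): simulate a few counties with simcounty(), rebuild the bin edges for one county from its county_bins rows, and check how closely splinebins() reproduces the simulated mean when it is supplied as m.

library(binsmooth)
set.seed(1)
sim <- simcounty(5)
cb1 <- subset(sim$county_bins, fips == 101)   # first simulated county

bEdges  <- c(cb1$bin_min[-1], NA)   # finite upper edges; NA marks the open top bin
bCounts <- cb1$households
m_true  <- sim$county_true$mean_true[sim$county_true$fips == 101]

sb <- splinebins(bEdges, bCounts, m = m_true)
c(supplied = m_true, recovered = sb$est_mean)
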
53 changes: 53 additions & 0 deletions R/splinebins.R
@@ -0,0 +1,53 @@
splinebins <- function(bEdges, bCounts, m=NULL, numIterations=16, monoMethod=c("hyman", "monoH.FC")) {
monoMethod <- match.arg(monoMethod)
L <- length(bCounts)
tot <- sum(bCounts)
if(is.null(m)) # no mean provided, so estimate one from bin midpoints
m <- sum(0.5*(c(bEdges[1:(L-1)],2.0*bEdges[L-1])+c(0, bEdges[1:(L-1)]))*bCounts/tot) # top bin treated as ending at 2*bEdges[L-1]
sumbAreas <- vapply(1:L, function(i) {sum(bCounts[1:i])/tot}, numeric(1))
tailEnd <- 1.05*bEdges[L-1] # start with a really short tail
x <- c(0, bEdges[1:(L-1)], tailEnd, tailEnd*1.01)
y <- c(0, sumbAreas, 1) # The last x,y pair forces smoothness at x=tailEnd
f <- splinefun(x,y, method=monoMethod)
est_mean <- tailEnd - pracma::integral(f, 0, tailEnd)
shrinkFactor <- 1
shrinkMultiplier <- 0.995
while(est_mean > m) { # binning must be skewed, because supplied mean is too small
x <- x*shrinkMultiplier # shrink the bins
tailEnd <- tailEnd*shrinkMultiplier
f <- splinefun(x,y, method=monoMethod)
est_mean <- tailEnd - pracma::integral(f, 0, tailEnd)
shrinkFactor <- shrinkFactor*shrinkMultiplier
}
if(bCounts[L] > 0) {
# search for tail length
while(est_mean < m) {
tailEnd <- tailEnd * 2
x[L+1] <- tailEnd
x[L+2] <- tailEnd*1.01
f <- splinefun(x,y, method=monoMethod)
est_mean <- tailEnd - pracma::integral(f, 0, tailEnd)
}
# now the correct tail end will be between bEdges[L-1] and tailEnd
l <- bEdges[L-1]
r <- tailEnd
for(i in 1:numIterations) {
tailEnd <- (l+r)/2
x[L+1] <- tailEnd
x[L+2] <- tailEnd*1.01
f <- splinefun(x,y, method=monoMethod)
est_mean <- tailEnd - pracma::integral(f, 0, tailEnd)
if(est_mean > m)
r <- tailEnd
else
l <- tailEnd
}
}
splineCDF <- function(x){
ifelse(x<0, 0, ifelse(x>tailEnd, 1, f(x)))
}
splinePDF <- function(x){
ifelse(x<0 | x>tailEnd, 0, f(x,deriv=1))
}
return(list(splinePDF=splinePDF, splineCDF=splineCDF, E=tailEnd, est_mean=est_mean, shrinkFactor=shrinkFactor))
}
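
One more small sketch (my own, reusing the assumed data from above): because splinebins() returns its CDF as an ordinary R closure, summary statistics such as the median can be read off by numerical inversion with stats::uniroot().

library(binsmooth)
bEdges  <- c(10000, 20000, 30000, 50000, 100000, NA)
bCounts <- c(15, 27, 40, 51, 30, 7)
sb <- splinebins(bEdges, bCounts, m = 42000)

# Invert the smoothed CDF on [0, E] to estimate the median.
uniroot(function(x) sb$splineCDF(x) - 0.5, lower = 0, upper = sb$E)$root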
