-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit aec1ca4
Showing
16 changed files
with
771 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
Package: binsmooth | ||
Type: Package | ||
Title: Generate PDFs and CDFs from Binned Data | ||
Version: 0.1.0 | ||
Author: David J. Hunter and McKalie Drown | ||
Maintainer: Dave Hunter <dhunter@westmont.edu> | ||
Description: Provides several methods for generating density functions | ||
based on binned data. Data are assumed to be nonnegative, but the bin widths | ||
need not be uniform, and the top bin may be unbounded. All PDF smoothing methods | ||
maintain the areas specified by the binned data. (Equivalently, all CDF | ||
smoothing methods interpolate the points specified by the binned data.) An | ||
estimate for the mean of the distribution may be supplied as an optional | ||
argument, which greatly improves the reliability of statistics computed from | ||
the smoothed density functions. Methods include step function, recursive | ||
subdivision, and optimized spline. | ||
License: MIT + file LICENSE | ||
Imports: stats, pracma, ineq, triangle | ||
LazyData: TRUE | ||
NeedsCompilation: no | ||
Packaged: 2016-08-12 14:09:50 UTC; dhunter | ||
Repository: CRAN | ||
Date/Publication: 2016-08-12 16:46:49 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
YEAR: 2016 | ||
COPYRIGHT HOLDER: David J. Hunter and McKalie Drown |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
29cbf0aaefa92ed1f2b8339fa9724e0e *DESCRIPTION | ||
11fcc18229d1926590c9d08f219c5132 *LICENSE | ||
1e32880d420021b43570b02ebd8ee747 *NAMESPACE | ||
718b18f72622950c7b2295e16059284c *R/rsubbins.R | ||
0b0f11381b6ae0594fec5136eeecb050 *R/simcounty.R | ||
98fd59c90b830afaf4e0d9c7b17aa00d *R/splinebins.R | ||
623bfeaff9a308954638e90d80276b18 *R/stepbins.R | ||
5237fa31e1d511ca7ead04d6f771f08c *data/county_bins.rda | ||
47e7d5dc78ca0d9cbd9c8d105b2b2090 *data/county_true.rda | ||
d668f119a683e0b3b0ab2a3da9517394 *man/county_bins.Rd | ||
84a042f94329da7f1240919fd887bb33 *man/county_true.Rd | ||
143c3aea13378d084527eb3398a3bce2 *man/rsubbins.Rd | ||
99495f9813d6057a0ac352e8ab92187d *man/simcounty.Rd | ||
bd92a9fcc194aaaf34d52dba32bd68b5 *man/splinebins.Rd | ||
c66af3ca0a5304d0fdea1a54229ca5da *man/stepbins.Rd |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
exportPattern("^[[:alpha:]]+") | ||
importFrom("stats", "approxfun", "stepfun", "splinefun", "rlnorm", "rweibull", "rgamma", "median") | ||
importFrom("ineq", "Gini") | ||
importFrom("triangle", "rtriangle") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
rsubbins <- function(bEdges, bCounts, m=NULL, eps1 = 0.25, eps2 = 0.75, depth = 3, | ||
tailShape = c("onebin", "pareto", "exponential"), | ||
nTail=16, numIterations=20, pIndex=1.160964, tbRatio=0.8) { | ||
if(is.null(m)) { # no mean provided, so make one up | ||
L <- length(bCounts) | ||
m <- sum(0.5*(c(bEdges[1:(L-1)],2.0*bEdges[L-1])+c(0, bEdges[1:(L-1)]))*bCounts/sum(bCounts)) | ||
} | ||
tailShape <- match.arg(tailShape) | ||
if(tailShape == "onebin") | ||
rsubbinsNotail(bEdges, bCounts, m, eps1, eps2, depth) | ||
else | ||
rsubbinsTail(bEdges, bCounts, m, eps1, eps2, depth, tailShape, nTail, numIterations, pIndex, tbRatio) | ||
} | ||
|
||
rsubbinsTail <- function(bEdges, bCounts, m, eps1, eps2, depth, | ||
tailShape = c("pareto", "exponential"), | ||
nTail, numIterations, pIndex, tbRatio) { | ||
tailShape <- match.arg(tailShape) | ||
eps3 <- (1 - eps2) / 2 | ||
L <- length(bCounts) | ||
tot <- sum(bCounts) | ||
e <- c(0,bEdges[1:(L-1)],numeric(nTail)) # preallocate tail edges | ||
shrinkFactor <- 1 | ||
shrinkMultiplier <- 0.995 | ||
tailCount <- bCounts[L] | ||
bbtot <- tot-tailCount | ||
bbMean <- sum((e[2:(L)]+e[1:(L-1)])*bCounts[1:(L-1)])/(2*bbtot) # mean of bounded bins as a step function | ||
while(m<bbMean) { # binning must be skewed, because supplied mean is less than mean of bounded bins | ||
e <- e*shrinkMultiplier # shrink the bins | ||
bbMean <- sum((e[2:(L)]+e[1:(L-1)])*bCounts[1:(L-1)])/(2*bbtot) | ||
shrinkFactor <- shrinkFactor*shrinkMultiplier | ||
} | ||
if(tailCount>0) { | ||
# there is remaining area in final bin, so make a tail of bins | ||
L <- L+nTail-1 | ||
tailArea <- tailCount/tot | ||
bAreas <- c(bCounts/tot, numeric(nTail-1)) | ||
tbWidth <- e[L-nTail+1]-e[L-nTail] # initial tail bin width = width of last bounded bin | ||
h <- bAreas[L-nTail]/tbWidth # height of last bounded bin | ||
if(tailShape=="pareto") | ||
tailUnscaled <- (1:nTail)^(-1-pIndex) | ||
else | ||
tailUnscaled <- tbRatio^(1:nTail) | ||
bAreas[(L-nTail+1):(L)] <- tailUnscaled/sum(tailUnscaled)*tailArea | ||
repeat { | ||
e[(L-nTail+2):(L+1)] <- e[L-nTail+1]+(1:(nTail))*tbWidth | ||
bMean <- sum((e[2:(L+1)]+e[1:L])*bAreas)/2 | ||
if(bMean>m) break # now the correct tbWidth is between 0 and tbWidth | ||
tbWidth <- tbWidth*2 | ||
} | ||
l <- 0 | ||
r <- tbWidth | ||
for(i in 1:numIterations) { | ||
tbWidth <- (l+r)/2 | ||
e[(L-nTail+2):(L+1)] <- e[L-nTail+1]+(1:(nTail))*tbWidth | ||
bMean <- sum((e[2:(L+1)]+e[1:L])*bAreas)/2 | ||
if (bMean<m) # bin mean is too small; lengthen bins | ||
l <- tbWidth | ||
else | ||
r <- tbWidth | ||
} | ||
} | ||
else { # final bin is empty, so get rid of it | ||
L <- L-1 | ||
bAreas <- bCounts[1:L]/tot | ||
e <- e[1:(L+1)] | ||
} | ||
# subdivide bins | ||
binAreas <- bAreas | ||
binHeights <- c(0, binAreas/(e[2:(L+1)]-e[1:L]), 0) | ||
binEdges <- e | ||
for(i in 1:depth){ | ||
L <- length(binEdges) | ||
binDiffs <- binHeights[2:(L + 1)] - binHeights[1:L] | ||
newbinHeights <- 1:(3 * (L - 1)) | ||
binWidths <- binEdges[2:L] - binEdges[1:(L-1)] | ||
binEdges2 <- binEdges[1:(L - 1)] + binWidths * eps3 | ||
binEdges3 <- binEdges[2:L] - binWidths * eps3 | ||
binEdges <- sort(c(binEdges, binEdges2, binEdges3), decreasing = FALSE) | ||
for(j in 1:(L-1)) { | ||
new_middlebin_height <- (((binDiffs[j] - binDiffs[(j + 1)]) * eps1 * eps3) / eps2) + binHeights[j + 1] | ||
if(new_middlebin_height>0) | ||
{ | ||
newbinHeights[(3 * j - 2)] <- binHeights[j + 1] - binDiffs[j] * eps1 | ||
newbinHeights[(3 * j)] <- binHeights[(j + 1)] + binDiffs[(j + 1)] * eps1 | ||
newbinHeights[(3 * j - 1)] <- new_middlebin_height | ||
} | ||
else # don't shift when middle bin would go negative | ||
{ | ||
newbinHeights[(3*j-2)] <- binHeights[(j+1)] | ||
newbinHeights[(3*j)] <- binHeights[(j+1)] | ||
newbinHeights[(3*j-1)] <- binHeights[(j+1)] | ||
} | ||
} | ||
binHeights <- c(0, newbinHeights, 0) | ||
} | ||
L <- length(binEdges) | ||
binWidths <- binEdges[2:L] - binEdges[1:(L-1)] | ||
binAreas <- binWidths * binHeights[2:L] | ||
cAreas <- vapply(1:length(binAreas), function(x){sum(binAreas[1:x])}, numeric(1)) | ||
rsubCDF <- approxfun(binEdges, c(0, cAreas), yleft=0, yright=1, rule=2) | ||
rsubPDF <- stepfun(binEdges, binHeights) | ||
return(list(rsubPDF=rsubPDF, rsubCDF=rsubCDF, E=binEdges[L], shrinkFactor=shrinkFactor)) | ||
} | ||
|
||
rsubbinsNotail <- function(bEdges, bCounts, m, eps1, eps2, depth) { | ||
eps3 <- (1 - eps2) / 2 | ||
L <- length(bCounts) | ||
tot <- sum(bCounts) | ||
p <- c(1,1-vapply(1:(L-1), function(x){sum(bCounts[1:x])}, numeric(1))/tot) | ||
e <- c(0,bEdges[1:(L-1)],0) # preallocate last entry; replace with E later | ||
A <- 0.5*sum((e[2:L]-e[1:(L-1)])*(p[2:L]+p[1:(L-1)])) | ||
shrinkFactor <- 1 | ||
shrinkMultiplier <- 0.995 | ||
while(m<A) { # binning must be skewed, because supplied mean is too small | ||
e <- e*shrinkMultiplier # shrink the bins | ||
A <- 0.5*sum((e[2:L]-e[1:(L-1)])*(p[2:L]+p[1:(L-1)])) | ||
shrinkFactor <- shrinkFactor*shrinkMultiplier | ||
} | ||
E <- ifelse(p[L]>0, bEdges[L-1]+2*(m-A)/p[L], bEdges[L-1]*1.001) # make width of final bin small if empty | ||
e[L+1] <- E # this is the right border of the last bin so the mean is preserved | ||
# subdivide bins | ||
binAreas <- bCounts/tot | ||
binHeights <- c(0, binAreas/(e[2:(L+1)]-e[1:L]), 0) | ||
binEdges <- e | ||
for(i in 1:depth){ | ||
L <- length(binEdges) | ||
binDiffs <- binHeights[2:(L + 1)] - binHeights[1:L] | ||
newbinHeights <- 1:(3 * (L - 1)) | ||
binWidths <- binEdges[2:L] - binEdges[1:(L-1)] | ||
binEdges2 <- binEdges[1:(L - 1)] + binWidths * eps3 | ||
binEdges3 <- binEdges[2:L] - binWidths * eps3 | ||
binEdges <- sort(c(binEdges, binEdges2, binEdges3), decreasing = FALSE) | ||
for(j in 1:(L-1)) { | ||
new_middlebin_height <- (((binDiffs[j] - binDiffs[(j + 1)]) * eps1 * eps3) / eps2) + binHeights[j + 1] | ||
if(new_middlebin_height>0) | ||
{ | ||
newbinHeights[(3 * j - 2)] <- binHeights[j + 1] - binDiffs[j] * eps1 | ||
newbinHeights[(3 * j)] <- binHeights[(j + 1)] + binDiffs[(j + 1)] * eps1 | ||
newbinHeights[(3 * j - 1)] <- new_middlebin_height | ||
} | ||
else # don't shift when middle bin would go negative | ||
{ | ||
newbinHeights[(3*j-2)] <- binHeights[(j+1)] | ||
newbinHeights[(3*j)] <- binHeights[(j+1)] | ||
newbinHeights[(3*j-1)] <- binHeights[(j+1)] | ||
} | ||
} | ||
binHeights <- c(0, newbinHeights, 0) | ||
} | ||
L <- length(binEdges) | ||
binWidths <- binEdges[2:L] - binEdges[1:(L-1)] | ||
binAreas <- binWidths * binHeights[2:L] | ||
cAreas <- vapply(1:length(binAreas), function(x){sum(binAreas[1:x])}, numeric(1)) | ||
rsubCDF <- approxfun(binEdges, c(0, cAreas), yleft=0, yright=1, rule=2) | ||
rsubPDF <- stepfun(binEdges, binHeights) | ||
return(list(rsubPDF=rsubPDF, rsubCDF=rsubCDF, E=E, shrinkFactor=shrinkFactor)) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
simcounty <- function(numCounties, minPop=1000, maxPop=100000, | ||
bin_minimums=c(0,10000,15000,20000,25000,30000,35000,40000,45000, | ||
50000,60000,75000,100000,125000,150000,200000)) { | ||
# Preallocate space for the whole data frames | ||
numBins <- length(bin_minimums) | ||
# variables for county_true data frame | ||
fips <- 100+1:numCounties | ||
mean_true <- numeric(numCounties) | ||
median_true <- numeric(numCounties) | ||
gini_true <- numeric(numCounties) | ||
numRows <- numBins*numCounties | ||
#variables for county_bins data frame | ||
# later: fips <- rep(fips, each=numBins) | ||
households <- numeric(numRows) | ||
bin_min <- rep(bin_minimums, numCounties) | ||
bin_max <- rep(c(bin_minimums[2:numBins]-1,NA), numCounties) | ||
county <- character(numRows) | ||
state <- rep(" Simulated", numRows) | ||
minPop <- 1000 | ||
maxPop <- 100000 | ||
dSamps <- list(numeric(maxPop), numeric(maxPop), numeric(maxPop)) # 3 samples for each county | ||
distributions <- c("lognormal", "triangle", "weibull", "gamma") | ||
for(countyI in 1:numCounties) { | ||
distsToUse <- sample(distributions, 3, replace=TRUE) | ||
i <- 1 | ||
for(d in distsToUse) | ||
{ | ||
sSize <- sample(maxPop,1)+minPop | ||
switch(d, | ||
lognormal = { | ||
lnMu <- sample(c(10.5,10.6,10.7,10.8,10.9),1) | ||
lnSigma <- sample(c(0.6,0.7,0.8),1) | ||
dSamps[[i]] <- rlnorm(sSize, lnMu, lnSigma) | ||
}, | ||
triangle = { | ||
triB <- sample(c(300000, 400000, 500000),1) | ||
triC <- sample(c(10000, 15000, 20000, 25000),1) | ||
dSamps[[i]] <- triangle::rtriangle(sSize,a=0,b=triB,c=triC) | ||
}, | ||
weibull = { | ||
weibA <- sample(c(1.4, 1.5, 1.6, 1.7),1) | ||
weibB <- sample(c(50000, 60000, 70000, 80000),1) | ||
dSamps[[i]] <- rweibull(sSize,weibA, weibB) | ||
}, | ||
gamma = { | ||
gammaA <- sample(seq(1,3,by=0.2),1) | ||
dSamps[[i]] <- rgamma(sSize,gammaA,gammaA/50000) | ||
} | ||
) | ||
i <- i+1 | ||
} | ||
simPopSamp <- unlist(dSamps) | ||
households[(1+(countyI-1)*numBins):(numBins*countyI)] <- c(vapply(1:(numBins-1), | ||
function(x) {sum(simPopSamp >= bin_minimums[x] & simPopSamp < bin_minimums[x+1])}, | ||
numeric(1)), | ||
sum(simPopSamp>bin_minimums[numBins])) | ||
county[(1+(countyI-1)*numBins):(numBins*countyI)] <- rep(paste(distsToUse, collapse=" "), numBins) | ||
mean_true[countyI] <- mean(simPopSamp) | ||
median_true[countyI] <- median(simPopSamp) | ||
gini_true[countyI] <- ineq::Gini(simPopSamp, corr=TRUE) | ||
countyI <- countyI + 1 | ||
} #end of for countyI loop | ||
county_true <- data.frame(fips, mean_true, median_true, gini_true) | ||
fips <- rep(fips, each=numBins) | ||
county_bins <- data.frame(fips,households,bin_min,bin_max,county,state) | ||
return(list(county_bins=county_bins, county_true=county_true)) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
splinebins <- function(bEdges, bCounts, m=NULL, numIterations=16, monoMethod=c("hyman", "monoH.FC")) { | ||
monoMethod <- match.arg(monoMethod) | ||
L <- length(bCounts) | ||
tot <- sum(bCounts) | ||
if(is.null(m)) # no mean provided, so make one up | ||
m <- sum(0.5*(c(bEdges[1:(L-1)],2.0*bEdges[L-1])+c(0, bEdges[1:(L-1)]))*bCounts/tot) | ||
sumbAreas <- vapply(1:L, function(i) {sum(bCounts[1:i])/tot}, numeric(1)) | ||
tailEnd <- 1.05*bEdges[L-1] # start with a really short tail | ||
x <- c(0, bEdges[1:(L-1)], tailEnd, tailEnd*1.01) | ||
y <- c(0, sumbAreas, 1) # The last x,y pair forces smoothness at x=tailEnd | ||
f <- splinefun(x,y, method=monoMethod) | ||
est_mean <- tailEnd - pracma::integral(f, 0, tailEnd) | ||
shrinkFactor <- 1 | ||
shrinkMultiplier <- 0.995 | ||
while(est_mean > m) { # binning must be skewed, because supplied mean is too small | ||
x <- x*shrinkMultiplier # shrink the bins | ||
tailEnd <- tailEnd*shrinkMultiplier | ||
f <- splinefun(x,y, method=monoMethod) | ||
est_mean <- tailEnd - pracma::integral(f, 0, tailEnd) | ||
shrinkFactor <- shrinkFactor*shrinkMultiplier | ||
} | ||
if(bCounts[L] > 0) { | ||
# search for tail length | ||
while(est_mean < m) { | ||
tailEnd <- tailEnd * 2 | ||
x[L+1] <- tailEnd | ||
x[L+2] <- tailEnd*1.01 | ||
f <- splinefun(x,y, method=monoMethod) | ||
est_mean <- tailEnd - pracma::integral(f, 0, tailEnd) | ||
} | ||
# now the correct tail end will be between bEdges[L-1] and tailEnd | ||
l <- bEdges[L-1] | ||
r <- tailEnd | ||
for(i in 1:numIterations) { | ||
tailEnd <- (l+r)/2 | ||
x[L+1] <- tailEnd | ||
x[L+2] <- tailEnd*1.01 | ||
f <- splinefun(x,y, method=monoMethod) | ||
est_mean <- tailEnd - pracma::integral(f, 0, tailEnd) | ||
if(est_mean > m) | ||
r <- tailEnd | ||
else | ||
l <- tailEnd | ||
} | ||
} | ||
splineCDF <- function(x){ | ||
ifelse(x<0, 0, ifelse(x>tailEnd, 1, f(x))) | ||
} | ||
splinePDF <- function(x){ | ||
ifelse(x<0 | x>tailEnd, 0, f(x,deriv=1)) | ||
} | ||
return(list(splinePDF=splinePDF, splineCDF=splineCDF, E=tailEnd, est_mean=est_mean, shrinkFactor=shrinkFactor)) | ||
} |
Oops, something went wrong.