Skip to content

Commit

Permalink
Merge pull request #1 from chris31415926535/bugfixing
Browse files Browse the repository at this point in the history
Bugfixing
  • Loading branch information
chris31415926535 committed May 3, 2023
2 parents dc26536 + 82395e0 commit 6cce400
Show file tree
Hide file tree
Showing 7 changed files with 257 additions and 17 deletions.
3 changes: 1 addition & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@ RoxygenNote: 7.2.3
Imports:
dplyr,
furrr,
sf,
tidyr
sf
Depends:
R (>= 2.10)
Suggests:
Expand Down
135 changes: 121 additions & 14 deletions R/pseudohouseholds.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
PHH_TYPE_REGULAR <- 0
PHH_TYPE_UNPOPULATED <- 1
PHH_TYPE_NO_VALID_PHHS_FOUND <- 2

#' Get Pseudo-Households (PHH) for many regions, with optional parallel processing
#'
#' Calculate PHHs for a set of regions using a given road network.
Expand Down Expand Up @@ -37,7 +41,7 @@
#' # Create PHHs for the first 20 dissemination blocks in Ottawa, Ontario, using
#' # parallel processing (consult documentation for the package future for details
#' # about parallel processing).
#' \dontrun{
#' \donttest{
#' library(future)
#' future::plan(future::multisession)
#' phhs <- get_phhs_parallel(region = ottawa_db_shp[1:20,], region_idcol = "DBUID",
Expand Down Expand Up @@ -75,9 +79,7 @@ get_phhs_parallel <- function(regions, region_idcol, roads, region_popcol = NA,
phh_valid <- phh_candidates[vapply(phh_candidates, length, FUN.VALUE = 1) > 0]

# tidy them up
phhs <- dplyr::tibble(x=phh_valid) |>
tidyr::unnest(cols = c("x")) |>
sf::st_as_sf()
phhs <- dplyr::bind_rows(phh_valid)

# remove any temporary options used to suppress repeated warnings
warning_cleanup()
Expand Down Expand Up @@ -170,9 +172,20 @@ get_phhs_single <- function(region, region_idcol, roads, region_popcol = NA, roa
region_pop <- region[, region_popcol, drop=TRUE];

# if no population and we're skipping such regions, return empty result
if (region_pop == 0 & skip_unpopulated_regions) return(dplyr::tibble());
# if no population and we're including such regions, return default response
if (region_pop == 0) {
if (skip_unpopulated_regions){
result <- dplyr::tibble()
} else {
result <- create_default_response(region = region, region_idcol = region_idcol, roads_idcol = roads_idcol, type = PHH_TYPE_UNPOPULATED)
}

return(result);
}
}



# get the roads that intersect the region plus a buffer
# we cast to multilinestring and then back to linestring to deal with disconnected multilinestrings
roads_touching_region <- sf::st_intersection(roads, sf::st_buffer(region, road_buffer_m)) |>
Expand All @@ -181,10 +194,10 @@ get_phhs_single <- function(region, region_idcol, roads, region_popcol = NA, roa

# ggplot() + geom_sf(data=region) + geom_sf(data=roads_touching_region)

# if it doesn't intersect any roads, return no results.
# for future consideration, perhaps options--e.g. could return centroid instead
# if it doesn't intersect any roads, return default response
if (nrow(roads_touching_region) == 0) {
return(dplyr::tibble())
result <- create_default_response(region = region, region_idcol = region_idcol, roads_idcol = roads_idcol, type = PHH_TYPE_NO_VALID_PHHS_FOUND)
return(result)
}

## set candidate PHHs by sampling road segments at the density set by the user
Expand Down Expand Up @@ -225,7 +238,7 @@ get_phhs_single <- function(region, region_idcol, roads, region_popcol = NA, roa
# any (probably one) that are inside the region. This seems to work well with
# weird convexities and strange crescent geometries, giving a good number of
# points. If there are too many too close together we thin them out later.
phh_inregion <- get_phh_points_pushpull (region, phh_onstreet, roads_idcol, delta_distance = delta_distance_m)
phh_inregion <- get_phh_points (region, phh_onstreet, region_idcol, roads_idcol, delta_distance = delta_distance_m)

#ggplot() + geom_sf(data=region) + geom_sf(data=roads_touching_region) + geom_sf(data=phh_inregion)

Expand Down Expand Up @@ -253,8 +266,11 @@ get_phhs_single <- function(region, region_idcol, roads, region_popcol = NA, roa
#result$pop <- as.numeric(region_pop/nrow(result))
result[, region_popcol] <- as.numeric(region_pop/nrow(result))
}
result$geometry <- result$x
result$x <- NULL

if ("x" %in% colnames(result)) {
result$geometry <- result$x
result$x <- NULL
}

if (roads_idcol == "road_id_NA") result$road_id_NA <- NULL

Expand All @@ -274,10 +290,10 @@ get_phhs_single <- function(region, region_idcol, roads, region_popcol = NA, roa
# the region, and so simply pulling shapes from the edges will often move them
# outside the region. In cases where both candidates are valid, unreasonably
# close PHHs will be cleaned up later.
get_phh_points_pushpull <- function(db, phh_onstreet, roads_idcol, delta_distance = 5){
get_phh_points <- function(db, phh_onstreet, region_idcol, roads_idcol, delta_distance = 5){

# for clean R CMD CHECK
PHH_X_pull <- PHH_X_push <- PHH_Y_pull <- PHH_Y_push <- geometry <- NULL
PHH_X_pull <- PHH_X_push <- PHH_Y_pull <- PHH_Y_push <- geometry <- .tempid <- NULL

# get db centroid coordinates
db_centroid <- sf::st_centroid(db)
Expand Down Expand Up @@ -326,10 +342,42 @@ get_phh_points_pushpull <- function(db, phh_onstreet, roads_idcol, delta_distanc
# ggplot() + geom_sf(data=db) + geom_sf(data=roads_touching_region) + geom_sf(data=phh_push, colour="red")+ geom_sf(data=phh_pull, colour="blue") + geom_sf(data=db_centroid)

# remove candidates that aren't inside the region
# if we proceed, these are regular PHHs and we type them as such
phh_indb <- sf::st_filter(phh_pushpull, db)
phh_indb$phh_type <- PHH_TYPE_REGULAR

# strange geometries with only a few border roads can lead to no points inside
# in this case, we sample 16 points distributed radially around the candidate point
# and keep the first one we find that's inside the region
if (nrow(phh_indb) == 0) {
phh_onstreet_foranalysis <- phh_onstreet
phh_onstreet_foranalysis$.tempid <- 1:nrow(phh_onstreet)
sf::st_agr(phh_onstreet_foranalysis) = "constant"
pts_spread <- sf::st_buffer(phh_onstreet_foranalysis, delta_distance, nQuadSegs = 4) |>
sf::st_cast("POINT")

# ggplot(pts_spread) + geom_sf()

pts_inshape <- sf::st_filter(pts_spread, db) |>
dplyr::group_by(.tempid) |>
dplyr::slice_head(n=1)

phh_indb <- pts_inshape |>
dplyr::select(-.tempid) |>
sf::st_as_sf()

phh_indb$phh_type <- PHH_TYPE_REGULAR

# if we STILL don't get at least one valid PHH for some reason, return the
# default point -- either centroid or approx geometric center
if (nrow(phh_indb) == 0) {
phh_indb <- create_default_response(region = db, region_idcol = region_idcol, roads_idcol = roads_idcol, type = PHH_TYPE_NO_VALID_PHHS_FOUND)
}

} # end if no valid points returned by push/pull algorithm

# rename the geometry column for compatibility with the sf package
phh_indb <- dplyr::rename(phh_indb, x = geometry)
if (!"geometry" %in% colnames(phh_indb)) phh_indb <- dplyr::rename(phh_indb, x = geometry)

# ggplot() + geom_sf(data=db) + geom_sf(data=roads_touching_region) + geom_sf(data=phh_indb)

Expand Down Expand Up @@ -543,3 +591,62 @@ validate_phhs <- function(phhs, regions, region_idcol, region_popcol){

return (result)
}



# internal function to return either the centroid (for a convex shape) or else
# the point inside the shape farthest from its boundaries. used for unpopulated
# regions, or regions with no roads, to still provide a single representative pt
get_center <- function(shape) {

# calculate the centroid
centroid <- sf::st_centroid(shape)

# if the centroid is inside the shape, we are done: return the centroid
if (sf::st_contains(shape, centroid, sparse = FALSE)) return (centroid)

#ggplot() + geom_sf(data = centroid) + geom_sf(data = crescent)

# create a hollow version of the shape for measuring distances to the borders
shape_hollow <- sf::st_cast(shape, "MULTILINESTRING")

#ggplot(crescent_hollow) + geom_sf()

# sample points within the shape
pts <- sf::st_sample(shape, 100)
pts <- sf::st_sf(geometry = pts)

#ggplot(pts) + geom_sf()

# get distances from the sampled points to the borders
distances <- sf::st_distance(shape_hollow, pts)[1,]

# choose the shape with the biggest distance to the border
biggest_dist_index <- which(distances == max(distances))
pt <- pts[biggest_dist_index,]

# add back the input shape's information
pt <- dplyr::bind_cols(pt, sf::st_drop_geometry(shape))
#ggplot() + geom_sf(data = crescent) + geom_sf(data = pts) + geom_sf(data = pt, colour="red")

return (pt)
}



create_default_response <- function(region, region_idcol, roads_idcol, type) {

result <- get_center(region)

result$phh_id <- paste0(result[,region_idcol, drop=TRUE], ".1")

if (!is.na(roads_idcol)) {
result[,roads_idcol] <- NA
}

result$phh_type <- type

return (result)
}

#create_default_response(region = region, region_idcol = "DBUID", roads_idcol = "NGD_UID", type = PHH_TYPE_NO_VALID_PHHS_FOUND)
45 changes: 45 additions & 0 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,13 @@ ggplot2::ggplot(db) +

This DB is a few hundred meters across, borders Bank St. on its west, and extends to the east into an irregularly shaped residential area. From the printout above, we see that it had a population of 125 in 2021.

### If the region is unpopulated, optionally return a default point.

By default, if a region is unpopulated it receives one single point with population
0 of `phh_type` type 1. This point will be the centroid if it is inside the region,
otherwise it will be an approximation of the region's "pole of inaccessibility."
Details are provided below.

### Find road segments intersecting or near the region

First, we identify road segments that either intersect with the region or come within a specified buffer. The default buffer is 5 meters, which we adopt here, although larger or smaller buffers may be appropriate in different settings.
Expand Down Expand Up @@ -208,6 +215,22 @@ Next, we keep only those candidate PHHs that are inside the region using a spati
```

### (If necessary) If no valid points are returned, sample radially around our on-street points

The push/pull algorithm may not return valid PHHs in some cases, for example if
the region is concave and has only small intersections with any road. In this
case we use a backup algorithm that samples 16 candidate points distributed
radially with radius `delta_distance` around our on-street point and selects the
first candidate point within the region.

### (If necessary) If *still* no valid points are returned, return a default region point.

If for any reason the algorithm has still not found a valid candidate PHH, it will
now return a single default point. It will return the centroid if the centroid is
inside the region, or else an approximation of the visual centre of the region.
In either case this point will have `phh_type` 2, indicating no valid PHHs were
found using the standard methods.

### (Optionally) Ensure our PHHs have a minimum population

If we are assigning populations to our PHHs, we next check to ensure each PHH
Expand Down Expand Up @@ -253,3 +276,25 @@ Any analysis using PHHs should be clear that they rely on approximations and do
not provide "true" population distributions (whatever that might even mean).
While PHHs can provide advantages over centroid-based analyses, they also have
limitations and should be used with these limitations in mind.

## Details

### The variable `phh_type`

* 1: Normal PHH
* 2: Unpopulated region
* 3: No valid PHHs found

### The default point

If a region is unpopulated or if it is no valid PHHs can be found, the function
will return the region's "default point." If the region's centroid is inside it,
the default point is the centroid. If the centroid is outside the region, we use
a simple algorithm to choose a point that is approximately in the region's
"pole of inaccessibility," or farthest from each border: we randomly sample 100
points inside the region, and choose the one that is farthest from any border.

Such points are flagged as either `phh_type` 1 (unpopulated) or `phh_type` 2
(populated but no valid PHHs found), and care should be taken when using these
variables for analysis since they may be farther from road networks than regular
PHHs and may not represent actual population distributions.
49 changes: 49 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,13 @@ This DB is a few hundred meters across, borders Bank St. on its west,
and extends to the east into an irregularly shaped residential area.
From the printout above, we see that it had a population of 125 in 2021.

### If the region is unpopulated, optionally return a default point.

By default, if a region is unpopulated it receives one single point with
population 0 of `phh_type` type 1. This point will be the centroid if it
is inside the region, otherwise it will be an approximation of the
region’s “pole of inaccessibility.” Details are provided below.

### Find road segments intersecting or near the region

First, we identify road segments that either intersect with the region
Expand Down Expand Up @@ -155,6 +162,24 @@ a spatial filter.

<img src="man/figures/README-plot_db_pointsinregion-1.png" width="50%" />

### (If necessary) If no valid points are returned, sample radially around our on-street points

The push/pull algorithm may not return valid PHHs in some cases, for
example if the region is concave and has only small intersections with
any road. In this case we use a backup algorithm that samples 16
candidate points distributed radially with radius `delta_distance`
around our on-street point and selects the first candidate point within
the region.

### (If necessary) If *still* no valid points are returned, return a default region point.

If for any reason the algorithm has still not found a valid candidate
PHH, it will now return a single default point. It will return the
centroid if the centroid is inside the region, or else an approximation
of the visual centre of the region. In either case this point will have
`phh_type` 2, indicating no valid PHHs were found using the standard
methods.

### (Optionally) Ensure our PHHs have a minimum population

If we are assigning populations to our PHHs, we next check to ensure
Expand Down Expand Up @@ -203,3 +228,27 @@ and do not provide “true” population distributions (whatever that might
even mean). While PHHs can provide advantages over centroid-based
analyses, they also have limitations and should be used with these
limitations in mind.

## Details

### The variable `phh_type`

- 1: Normal PHH
- 2: Unpopulated region
- 3: No valid PHHs found

### The default point

If a region is unpopulated or if it is no valid PHHs can be found, the
function will return the region’s “default point.” If the region’s
centroid is inside it, the default point is the centroid. If the
centroid is outside the region, we use a simple algorithm to choose a
point that is approximately in the region’s “pole of inaccessibility,”
or farthest from each border: we randomly sample 100 points inside the
region, and choose the one that is farthest from any border.

Such points are flagged as either `phh_type` 1 (unpopulated) or
`phh_type` 2 (populated but no valid PHHs found), and care should be
taken when using these variables for analysis since they may be farther
from road networks than regular PHHs and may not represent actual
population distributions.
4 changes: 4 additions & 0 deletions cran-comments.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## Resubmission in response to comments from Benjamin Altmann, 2023-04-27

* Replaced \dontrun{} with \donttest{} in examples, as requested.

## Resumbission in response to comments from Victoria Wimmer, 2023-04-25

* Removed \dontrun{} from most example code and made a few minor tweaks to ensure it
Expand Down
2 changes: 1 addition & 1 deletion man/get_phhs_parallel.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 6cce400

Please sign in to comment.