Permalink
Branch: master
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
391 lines (364 sloc) 16.4 KB
#' @import data.table
mutual_total_compute <- function(data, group, unit, base) {
# calculate M
add_local(data, group, unit, base)
M <- sum(data[, list(p = first(p_unit), ls = first(ls_unit)), by = unit][, p * ls])
# calculate H
p <- data[, list(p = first(p_group)), by = group][["p"]]
entropy_group <- sum(p * logf(1 / p, base))
H <- M / entropy_group
data.table(stat = c("M", "H"), est = c(M, H),
stringsAsFactors = FALSE)
}
#' @import data.table
mutual_total_within_compute <- function(data, group, unit, within, base,
components = FALSE) {
# calculate totals
n_total <- sum(data[, "freq"])
n_within <- data[, list(n_within_group = sum(freq)), by = c(within, group)]
n_within[, `:=`(n_within = sum(n_within_group)), by = within]
# calculate proportions and entropy for each within unit
n_within[, `:=`(p_within = n_within / n_total, p = n_within_group / n_within)]
entropy <- n_within[, list(entropyw = sum(p * logf(1 / p, base)), p_within = first(p_within)),
by = within
]
setkeyv(entropy, within)
# calculate totals
data[, `:=`(n_total, sum(freq)), by = within]
data[, `:=`(n_unit, sum(freq)), by = c(within, unit)]
data[, `:=`(p_unit = n_unit / n_total, p_group_g_unit = freq / n_unit)]
# calculate entropy within groups
by_unit <- data[, list(
p_unit = first(p_unit),
entropy_cond = sum(p_group_g_unit * logf(1 / p_group_g_unit, base))),
by = c(within, unit)]
setkeyv(by_unit, within)
# compute total entropy
p <- data[, list(p = sum(freq)), by = group][["p"]] / n_total
entropy_overall <- sum(p * logf(1 / p, base))
# merge within entropy, and compare to group entropy
by_unit <- merge(by_unit, entropy)
by_within <- by_unit[, list(
M = sum(p_unit * (entropyw - entropy_cond)),
p = first(p_within),
H = sum(p_unit * (entropyw - entropy_cond)) / first(entropyw),
h_weight = first(p_within) * first(entropyw) / entropy_overall
), by = within]
by_within$H <- ifelse(is.finite(by_within$H), by_within$H, 0)
if (components == TRUE) {
melt(by_within,
id.vars = within, measure.vars = c("M", "p", "H", "h_weight"),
variable.name = "stat", value.name = "est",
variable.factor = FALSE)
} else {
# total M is the sum of weighted within-group partial M
M <- sum(by_within$M %*% by_within$p)
H <- sum(by_within$H %*% by_within$h_weight)
data.table(stat = c("M", "H"), est = c(M, H), stringsAsFactors = FALSE)
}
}
#' Calculate total segregation for M and H
#'
#' Returns the total segregation between \code{group} and \code{unit}.
#' If \code{within} is given, calculates segregation within each
#' \code{within} category separately, and takes the weighted average.
#' Also see \code{\link{mutual_within}} for detailed within calculations.
#'
#' @param data A data frame.
#' @param group A categorical variable or a vector of variables
#' contained in \code{data}. Defines the first dimension
#' over which segregation is computed.
#' @param unit A categorical variable or a vector of variables
#' contained in \code{data}. Defines the second dimension
#' over which segregation is computed.
#' @param within A categorical variable or a vector of variables
#' contained in \code{data}. The variable(s) should be a superset of either
#' the \code{unit} or the \code{group} for the calculation to be meaningful.
#' If provided, segregation is
#' computed within the groups defined by the variable, and then averaged.
#' (Default \code{NULL})
#' @param weight Numeric. Only frequency weights are allowed.
#' (Default \code{NULL})
#' @param se If \code{TRUE}, standard errors are estimated via bootstrap.
#' (Default \code{FALSE})
#' @param n_bootstrap Number of bootstrap iterations. (Default \code{10})
#' @param base Base of the logarithm that is used in the calculation.
#' Defaults to the natural logarithm.
#' @return Returns a data frame with two rows. The column \code{est} contains
#' the Mutual Information Index, M, and Theil's Entropy Index, H. The H is the
#' the M divided by the \code{group} entropy. If \code{within} was given,
#' M and H are weighted averages of the within-category segregation scores.
#' If \code{se} is set to \code{TRUE}, an additional column \code{se} contains
#' the associated bootstrapped standard errors, and the column \code{est} contains
#' bootstrapped estimates.
#' @references
#' Henri Theil. 1971. Principles of Econometrics. New York: Wiley.
#'
#' Ricardo Mora and Javier Ruiz-Castillo. 2011.
#' "Entropy-based Segregation Indices". Sociological Methodology 41(1): 159–194.
#' @examples
#' # calculate school racial segregation
#' mutual_total(schools00, "school", "race", weight="n") # M => .425
#'
#' # note that the definition of groups and units is arbitrary
#' mutual_total(schools00, "race", "school", weight="n") # M => .425
#'
#' # if groups or units are defined by a combination of variables,
#' # vectors of variable names can be provided -
#' # here there is no difference, because schools
#' # are nested within districts
#' mutual_total(schools00, "race", c("district", "school"),
#' weight="n") # M => .424
#'
#' # estimate standard errors for M and H
#' mutual_total(schools00, "race", "school", weight="n", se=TRUE)
#'
#' # estimate segregation within school districts
#' mutual_total(schools00, "race", "school",
#' within="district", weight="n") # M => .087
#'
#' # estimate between-district racial segregation
#' mutual_total(schools00, "race", "district", weight="n") # M => .338
#' # note that the sum of within-district and between-district
#' # segregation equals total school-race segregation;
#' # here, most segregation is between school districts
#' @import data.table
#' @export
mutual_total <- function(data, group, unit, within = NULL,
weight = NULL, se = FALSE, n_bootstrap = 10, base = exp(1)) {
d <- prepare_data(data, group, unit, weight, within)
if (se == FALSE) {
if (is.null(within)) {
ret <- mutual_total_compute(d, group, unit, base)
} else {
ret <- mutual_total_within_compute(d, group, unit, within, base)
}
} else {
vars <- attr(d, "vars")
n_total <- sum(d[, "freq"])
if (!all(d[["freq"]] == round(d[["freq"]]))) {
warning("bootstrap with non-integer weights")
}
boot_ret <- lapply(1:n_bootstrap, function(i) {
update_log(bs_n = i, bs_max = n_bootstrap)
# resample and collapse by all variables, except "freq"
resampled <- d[
sample(.N, n_total, replace = TRUE, prob = freq)][,
list(freq = .N), by = vars]
if (is.null(within)) {
mutual_total_compute(resampled, group, unit, base)
} else {
mutual_total_within_compute(resampled, group, unit, within, base)
}
})
close_log()
boot_ret <- rbindlist(boot_ret)
# summarize bootstrapped data frames
ret <- boot_ret[, list(
est = mean(est), se = stats::sd(est)), by = c("stat")]
}
rownames(ret) <- ret[["stat"]]
as_df(ret)
}
#' Calculate detailed within-category segregation scores for M and H
#'
#' Calculates the segregation between \code{group} and \code{unit}
#' within each category defined by \code{within}.
#'
#' @param data A data frame.
#' @param group A categorical variable or a vector of variables
#' contained in \code{data}. Defines the first dimension
#' over which segregation is computed.
#' @param unit A categorical variable or a vector of variables
#' contained in \code{data}. Defines the second dimension
#' over which segregation is computed.
#' @param within A categorical variable or a vector of variables
#' contained in \code{data} that defines the within-segregation categories.
#' @param weight Numeric. Only frequency weights are allowed.
#' (Default \code{NULL})
#' @param se If \code{TRUE}, standard errors are estimated via bootstrap.
#' (Default \code{FALSE})
#' @param n_bootstrap Number of bootstrap iterations. (Default \code{10})
#' @param base Base of the logarithm that is used in the calculation.
#' Defaults to the natural logarithm.
#' @param wide Returns a wide dataframe instead of a long dataframe.
#' (Default \code{FALSE})
#' @return Returns a data frame with four rows for each category defined by \code{within}.
#' The column \code{est} contains four statistics that
#' are provided for each unit:
#' \code{M} is the within-category M, and \code{p} is the proportion of the category.
#' Multiplying \code{M} and \code{p} gives the contribution of each within-category
#' towards the total M.
#' \code{H} is the within-category H, and \code{h_weight} provides the weight.
#' Multiplying \code{H} and \code{h_weight} gives the contribution of each within-category
#' towards the total H. \code{h_weight} is defined as \code{p * EW/E}, where \code{EW} is the
#' within-category entropy, and \code{E} is the overall entropy.
#' If \code{se} is set to \code{TRUE}, an additional column \code{se} contains
#' the associated bootstrapped standard errors, and the column \code{est} contains
#' bootstrapped estimates.
#' If \code{wide} is set to \code{TRUE}, returns instead a wide dataframe, with one
#' row for each \code{within} category, and the associated statistics in separate columns.
#' @references
#' Henri Theil. 1971. Principles of Econometrics. New York: Wiley.
#'
#' Ricardo Mora and Javier Ruiz-Castillo. 2011.
#' "Entropy-based Segregation Indices". Sociological Methodology 41(1): 159–194.
#' @examples
#' (within <- mutual_within(schools00, "race", "school", within = "state",
#' weight = "n", wide = TRUE))
#' # the M for "AL" is .409
#' # manual calculation
#' schools_AL <- schools00[schools00$state=="AL",]
#' mutual_total(schools_AL, "race", "school", weight = "n") # M => .409
#'
#' # to recover the within M and H from the output, multiply
#' # p * M and h_weight * H, respectively
#' sum(within$p * within$M) # => .326
#' sum(within$H * within$h_weight) # => .321
#' # compare with:
#' mutual_total(schools00, "race", "school", within = "state", weight = "n")
#' @import data.table
#' @export
mutual_within <- function(data, group, unit, within,
weight = NULL, se = FALSE, n_bootstrap = 10, base = exp(1),
wide = FALSE) {
d <- prepare_data(data, group, unit, weight, within)
if (se == FALSE) {
ret <- mutual_total_within_compute(d, group, unit, within, base, components = TRUE)
} else {
vars <- attr(d, "vars")
n_total <- sum(d[, "freq"])
if (!all(d[["freq"]] == round(d[["freq"]]))) {
warning("bootstrap with non-integer weights")
}
boot_ret <- lapply(1:n_bootstrap, function(i) {
update_log(bs_n = i, bs_max = n_bootstrap)
# resample and collapse by all variables, except "freq"
resampled <- d[
sample(.N, n_total, replace = TRUE, prob = freq)][,
list(freq = .N), by = vars]
mutual_total_within_compute(resampled, group, unit, within, base, components = TRUE)
})
close_log()
boot_ret <- rbindlist(boot_ret)
# summarize bootstrapped data frames
ret <- boot_ret[, list(
est = mean(est), se = stats::sd(est)),
by = c(within, "stat")]
}
if (wide == TRUE) {
f <- stats::as.formula(paste(paste(within, collapse = "+"),
"~ factor(stat, levels=c('M', 'p', 'H', 'h_weight'))"))
if (se == TRUE) {
ret <- dcast(ret, f, value.var = c("est", "se"))
names(ret) <- c(within,
"M", "p", "H", "h_weight",
"M_se", "p_se", "H_se", "h_weight_se")
setcolorder(ret, c(within,
"M", "M_se", "p", "p_se",
"H", "H_se", "h_weight", "h_weight_se"))
} else {
ret <- dcast(ret, f, value.var = c("est"))
}
}
as_df(ret)
}
#' @import data.table
mutual_local_compute <- function(data, group, unit, base = exp(1)) {
add_local(data, group, unit, base)
local <- data[, list(ls = first(ls_unit), p = first(p_unit)), by = unit]
# melt into long form
melt(local,
id.vars = unit, measure.vars = c("ls", "p"),
variable.name = "stat", value.name = "est",
variable.factor = FALSE)
}
#' Calculates local segregation indices based on M
#'
#' Returns local segregation indices for each category defined
#' by \code{unit}.
#'
#' @param data A data frame.
#' @param group A categorical variable or a vector of variables
#' contained in \code{data}. Defines the dimension
#' over which segregation is computed.
#' @param unit A categorical variable or a vector of variables
#' contained in \code{data}. Defines the group for which local
#' segregation indices are calculated.
#' @param weight Numeric. Only frequency weights are allowed.
#' (Default \code{NULL})
#' @param se If \code{TRUE}, standard errors are estimated via bootstrap.
#' (Default \code{FALSE})
#' @param n_bootstrap Number of bootstrap iterations. (Default \code{10})
#' @param base Base of the logarithm that is used in the calculation.
#' Defaults to the natural logarithm.
#' @param wide Returns a wide dataframe instead of a long dataframe.
#' (Default \code{FALSE})
#' @return Returns a data frame with two rows for each category defined by \code{unit},
#' for a total of \code{2*(number of units)} rows.
#' The column \code{est} contains two statistics that
#' are provided for each unit: \code{ls}, the local segregation score, and
#' \code{p}, the proportion of the unit from the total number of cases.
#' If \code{se} is set to \code{TRUE}, an additional column \code{se} contains
#' the associated bootstrapped standard errors, and the column \code{est} contains
#' bootstrapped estimates.
#' If \code{wide} is set to \code{TRUE}, returns instead a wide dataframe, with one
#' row for each \code{unit}, and the associated statistics in separate columns.
#' @references
#' Henri Theil. 1971. Principles of Econometrics. New York: Wiley.
#'
#' Ricardo Mora and Javier Ruiz-Castillo. 2011.
#' "Entropy-based Segregation Indices". Sociological Methodology 41(1): 159–194.
#' @examples
#' # which racial groups are most segregated?
#' (localseg = mutual_local(schools00, "school", "race",
#' weight="n", wide = TRUE))
#'
#' sum(localseg$p) # => 1
#'
#' # the sum of the weighted local segregation scores equals
#' # total segregation
#' sum(localseg$ls * localseg$p) # => .425
#' mutual_total(schools00, "school", "race", weight="n") # M => .425
#' @import data.table
#' @export
mutual_local <- function(data, group, unit, weight = NULL,
se = FALSE, n_bootstrap = 10, base = exp(1),
wide = FALSE) {
d <- prepare_data(data, group, unit, weight)
if (se == FALSE) {
ret <- mutual_local_compute(d, group, unit, base)
} else {
vars <- attr(d, "vars")
n_total <- sum(d[, "freq"])
if (!all(d[["freq"]] == round(d[["freq"]]))) {
warning("bootstrap with non-integer weights")
}
boot_ret <- lapply(1:n_bootstrap, function(i) {
update_log(bs_n = i, bs_max = n_bootstrap)
# resample and collapse by all variables, except "freq"
resampled <- d[
sample(.N, n_total, replace = TRUE, prob = freq)][,
list(freq = .N), by = vars]
mutual_local_compute(resampled, group, unit, base)
})
close_log()
boot_ret <- rbindlist(boot_ret)
# summarize bootstrapped data frames
ret <- boot_ret[, list(
est = mean(est), se = stats::sd(est)),
by = c(unit, "stat")]
}
if (wide == TRUE) {
f <- stats::as.formula(paste(paste(unit, collapse = "+"),
"~ factor(stat, levels=c('ls', 'p'))"))
if (se == TRUE) {
ret <- dcast(ret, f, value.var = c("est", "se"))
names(ret) <- c(unit, "ls", "p", "ls_se", "p_se")
setcolorder(ret, c(unit, "ls", "ls_se", "p", "p_se"))
} else {
ret <- dcast(ret, f, value.var = c("est"))
}
}
as_df(ret)
}