Permalink
Browse files

Change the estimate of variance for election predictions to be based …

…on observation of past elections compared to GAM forecasts; rather than treating elections as though they have as much variance as individual polls.
  • Loading branch information...
ellisp committed Mar 26, 2017
1 parent 04cb81f commit 5f64a509e10c8ef51ec6538ca9626c0f33c4b1e7
View
@@ -20,7 +20,12 @@ thefont <- "Calibri"
source("setup/set-fonts.R")
source("setup/functions.R")
# house effects, and variance of election results compared to predictions,
# using all available data
source("method-gam/estimate-house-effects.R")
source("method-gam/estimate-election-variance.R")
# Fit model and simulations for this current election year
source("method-gam/fit-gam.R")
source("method-gam/simulations.R")
write.csv(seats, file = "D:/Peter/Documents/blog/ellisp.github.io/elections/simulations.csv",
@@ -9,12 +9,12 @@ n <- 1000
mod_cov <- solve(crossprod(mod$family$data$R))
# estimated standard error of forecast mean:
se <- as.vector(mod_pred_elect[["se.fit"]])
sigma2 <- mod_cov
diag(sigma2) <- diag(sigma2) + sqrt(se)
se3 <- as.vector(sqrt(se ^ 2 + exp(coef(mod_var)[1] + coef(mod_var)[2] * mod_pred_elect[["fit"]])))
sigma3 <- se3 %*% t(se3) * cov2cor(mod_cov)
sims <- inv.logit(MASS::mvrnorm(n = n,
mu = mod_pred_elect[["fit"]],
Sigma = sigma2)) %>%
Sigma = sigma3)) %>%
as_tibble()
names(sims) <- parties
@@ -103,7 +103,7 @@ p <- seats %>%
gather(Coalition, Seats) %>%
ggplot(aes(x = Seats, colour = Coalition, fill = Coalition)) +
geom_density(alpha = 0.5) +
scale_y_continuous(limits = c(0, 0.05)) +
scale_y_continuous() +
ggtitle("Likely seat counts for various combinations of parties",
"Most likely outcome is that New Zealand First are needed to build a majority.") +
labs(caption = "Source: https://ellisp.github.io",
@@ -0,0 +1,68 @@
# Idea is to estimate the mean square error, possibly by party,
# on logit scale of the actual election results compared to the
# percentage that was predicted for them. This then becomes
# the individual randomness element of the simulation model
PollsAll <- polls %>%
as_tibble() %>%
left_join(house_effects, by = c("Pollster", "Party")) %>%
filter(Pollster != "Election result") %>%
mutate(Bias = ifelse(is.na(Bias), 0, Bias),
VotingIntention = ifelse(VotingIntention < 0.0005, 0.0005, VotingIntention),
VotingIntention = logit(VotingIntention) - Bias) %>%
mutate(PollDate = paste(Pollster, MidDate),
ID = 1:n()) %>%
select(Party, VotingIntention, PollDate, MidDate, ElectionYear) %>%
spread(Party, VotingIntention, fill = logit(0.0005)) %>%
mutate(MidDate = as.numeric(MidDate))
names(PollsAll) <- make.names(names(PollsAll))
mod_all <- gam(list(
ACT ~ s(MidDate, k = 3),
Conservative ~ s(MidDate, k = 3),
Green ~ s(MidDate),
Labour ~ s(MidDate),
Mana ~ s(MidDate, k = 3),
Maori ~ s(MidDate, k = 3),
National ~ s(MidDate),
NZ.First ~ s(MidDate),
United.Future ~ MidDate),
data = PollsAll, family = mvn(d = 9))
# election results. Note this snippet is copied from estimate-house-effects,
# should rationalise:
results <- polls %>%
filter(!ElectionYear %in% range(ElectionYear)) %>%
filter(Pollster == "Election result") %>%
as_tibble() %>%
select(MidDate, Party, VotingIntention) %>%
mutate(Party = make.names(Party),
VotingIntention = logit(VotingIntention))
election_dates <- unique(results$MidDate)
election_predictions <- predict(mod_all,
newdata = data.frame(MidDate = as.numeric(election_dates))) %>%
as.data.frame() %>%
cbind(election_dates)
names(election_predictions) <- c("ACT", "Conservative", "Green", "Labour", "Mana",
"Maori", "National", "NZ.First", "United.Future", "MidDate")
comparisons <- election_predictions %>%
gather(Party, Result, -MidDate) %>%
left_join(results, by = c("Party", "MidDate")) %>%
mutate(SquaredError = (Result - VotingIntention) ^ 2) %>%
filter(!is.na(VotingIntention))
# variance is a function of Result, roughly linear from log(var) ~ logit(result)
ggplot(comparisons, aes(x = Result, y = SquaredError, label = paste(Party, substring(MidDate,1, 4)))) +
geom_smooth(method = "lm") +
geom_text(aes(colour = as.ordered(MidDate))) +
labs(x = "Result (logit scale)")+
scale_y_log10("Squared Error (on logit scale) from actual election result", label = comma) +
scale_color_viridis(discrete = TRUE, begin = 0.2, end = 0.8, option = "B")
mod_var <- lm(log(SquaredError) ~ Result, data = comparisons)
summary(mod_var)
coef(mod_var)
@@ -9,12 +9,12 @@ n <- 1000
mod_cov <- solve(crossprod(mod$family$data$R))
# estimated standard error of forecast mean:
se <- as.vector(mod_pred_elect[["se.fit"]])
sigma2 <- mod_cov
diag(sigma2) <- diag(sigma2) + sqrt(se)
se3 <- as.vector(sqrt(se ^ 2 + exp(coef(mod_var)[1] + coef(mod_var)[2] * mod_pred_elect[["fit"]])))
sigma3 <- se3 %*% t(se3) * cov2cor(mod_cov)
sims <- inv.logit(MASS::mvrnorm(n = n,
mu = mod_pred_elect[["fit"]],
Sigma = sigma2)) %>%
Sigma = sigma3)) %>%
as_tibble()
names(sims) <- parties
@@ -114,7 +114,7 @@ p <- seats %>%
gather(Coalition, Seats) %>%
ggplot(aes(x = Seats, colour = Coalition, fill = Coalition)) +
geom_density(alpha = 0.5) +
scale_y_continuous(limits = c(0, 0.038)) +
scale_y_continuous() +
ggtitle("Likely seat counts for various combinations of parties",
"Most likely outcome is that New Zealand First are needed to build a majority.") +
labs(caption = "Source: https://ellisp.github.io",
View
@@ -9,22 +9,35 @@ n <- 1000
# estimated cov matrix from model.
mod_cov <- solve(crossprod(mod$family$data$R))
# estimated standard error for the predicted values
# estimated standard error for the predicted values. This bit is uncontroversial.
se <- as.vector(mod_pred_elect[["se.fit"]])
# In my first iterations I was using sigma1 as the covariance matrix for
# simulations. But now I think this is understating the true randomeness we should expect
#
# simulations. But now I think this is understating the true randomness we should expect
# sigma1 <- se %*% t(se) * cov2cor(mod_cov)
# This method takes the covariance of the residuals (mod_cov) and
# adds to it the uncertainty from the prediction. A fairer method:
sigma2 <- mod_cov
diag(sigma2) <- diag(sigma2) + sqrt(se)
# This second method takes the covariance of the residuals (mod_cov) and
# adds to it the uncertainty from the prediction. It ends up being too *much*
# variance, because the election is treated as just another small sample poll
# sigma2 <- mod_cov
#diag(sigma2) <- diag(sigma2) + sqrt(se)
# The third method takes the observed mean squared error from previous elections compared to
# where a gam fits the lines (see estimate-election-variance.R) and treats that as the
# individual variance of the actual election observation. This is added to the sum of the
# standard errors for the prediction of the latent party vote variable to create an expected
# standard deviation of the distribution for each party mean. The correlation matrix
# from the GAM predicting this year's eleciton is used as a basis, scaled up by this standard
# deviation, to create a new covariance matrix
se3 <- as.vector(sqrt(se ^ 2 + exp(coef(mod_var)[1] + coef(mod_var)[2] * mod_pred_elect[["fit"]])))
sigma3 <- se3 %*% t(se3) * cov2cor(mod_cov)
# round(sigma1, 2) # way too small
# round(sigma2, 2) # too big
# round(sigma3, 3) # generally (not always) in between
sims <- inv.logit(MASS::mvrnorm(n = n,
mu = mod_pred_elect[["fit"]],
Sigma = sigma2)) %>%
Sigma = sigma3)) %>%
as_tibble()
names(sims) <- parties
@@ -131,7 +144,7 @@ p <- seats %>%
gather(Coalition, Seats) %>%
ggplot(aes(x = Seats, colour = Coalition, fill = Coalition)) +
geom_density(alpha = 0.5) +
scale_y_continuous(limits = c(0, 0.041)) +
scale_y_continuous(limits = c(0, 0.2)) +
ggtitle("Likely seat counts for various combinations of parties",
"Most likely outcome is that New Zealand First are needed to build a majority.") +
labs(caption = "Source: https://ellisp.github.io",

0 comments on commit 5f64a50

Please sign in to comment.