Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
203 lines (151 sloc) 7.47 KB
library(reshape)
library(RCurl)
library(XML)
library(stringr)
# PARAMETERS FOR PLOT GENERATION:
# How many nearest data points to use for the local fit
num_data_points = 4
theurl <- getURL("https://en.wikipedia.org/wiki/Ontario_general_election,_2018", ssl.verifyPeer=FALSE)
tables <- readHTMLTable(theurl)
#get the nth tables on the page
df <- tables[[24]]
# Remove empty rows (wikipedia tables sometime use empty rows for spacing)
df <- df[!apply(df == "", 1, all),]
# Remove all non-standard rows, they will have <NA> values in them
# (e.g Rows that are comments about leadership changes)
df <- df[complete.cases(df),]
df #print resulting table to console for debugging
#get columns 1-2 and 4-9. Col 3 is a link to poll source
df <- df[1:nrow(df), c(1:2, 4:10) ]
#add column headings
c.names <- c("Firm", "Date", "Liberal", "PC", "NDP", "Green", "Other", "Error", "Sample_Size")
names(df) <- c.names
#print resulting table to console for debugging
df
# Calculate LOESS smoothing parameter [alpha] from number of polls [nrow(df)]
# so that plot uses the same number of datapoints for the local fit even as new polls are added.
alpha <- num_data_points / nrow(df)
print(alpha)
# DATE FORMATTING
# format Date column
df$Date = as.Date(substr(df[, 2], 9, 18))
#as.numeric gives date since 1970-01-01
# add days from 1900-01-01 to match spreadsheet data, +1 for 1970-01-01 day, and +1 for leapyear bug in 1900
df$Date = as.numeric(df$Date) - as.numeric(as.Date('1900-01-01')) +2
# SAMPLE SIZES
## EXTRACTING ROLLING POLL INFO
# Samples sizes are given as e.g.: 1000 (1/4), need to multiple sample size by fraction.
# Get number in parentheses
Rolling_Poll <- str_extract(df$Sample_Size, "(?<=\\().*(?=\\))")
# convert fraction text "1/4" to decimal number 0.25
Rolling_Poll = sapply(Rolling_Poll, function(x) eval(parse(text=x)))
# change NA to 1
Rolling_Poll[is.na(Rolling_Poll)] <- 1
# Extract sample size
df$Sample_Size = gsub(",", "", unlist(df$Sample_Size)) # remove commas in numbers
df$Sample_Size = sub("^$", "99999999", df$Sample_Size)
# Fancy regex to remove parenthetical notes, eg 1,000 (1/4) needs to remove comma and (1/4) to convert to number
df$Sample_Size = as.numeric(gsub("\\s*\\([^\\)]+\\)", "", unlist(df$Sample_Size)))
df$Sample_Size = df$Sample_Size * Rolling_Poll
# Print ot console when run individually
df
df$Error = 1/sqrt(df$Sample_Size)
# reorganize data
mdata <- melt(df, id=c("Date", "Firm", "Error", "Sample_Size"))
# sort data
mdata <- mdata[with(mdata, order(Date)), ]
# relabel data after reorganization
c.names <- c("Date", "Firm", "Error", "Sample_Size", "Party", "Popular_Support")
names(mdata) <- c.names
mdata
mdata$Popular_Support <- str_trim(mdata$Popular_Support)
mdata$Popular_Support <- as.numeric(mdata$Popular_Support)
#mdata$Sample_Size <- as.numeric(mdata$Sample_Size)
polls <- mdata
# Last election data
# last_election_date_value <- 41802 # 2014/06/12
# Date = rep.int(last_election_date_value, 5)
# Party = c('Liberal','PC','NDP','Green','Other')
# Popular_Support = c(38.65,31.25,23.75,4.84,1.51)
# Error = rep.int(0,5)
# LastElection = data.frame(Date, Party, Popular_Support, Error)
# This election -- Add after election to get final election result points
next_election_date_value <- 43258 # 2018/06/07
#Date = rep.int(43258, 5)
#Party = c('Liberal','PC','NDP','Green','Other')
#Popular_Support = c(0,0,0,0,0)
#Error = rep.int(0,5)
#ThisElection = data.frame(Date, Party, Popular_Support, Error)
# Use this if including previous data as part of the smoothing, but don't want it displayed!
# election_polls <- polls[polls$Date > (last_election_date_value + 10),]
#election_polls <- polls
colors <- c("red", "blue", "orange", "green3", "#333333")
library(ggplot2)
main_aes = aes(x = Date, y = Popular_Support, colour=Party, size=1/Error, weight=1/Error)
# Set plot file settings. This need to be before we generate the plot.
svg(filename="Ontario2018PollsPlot.svg",
width=15, # inches, think 72px/inch: want 1080px / 72 ppi = 15 inches
height=7,
pointsize=12
)
plot <- ggplot(polls)
plot2 <- plot + geom_point(main_aes)
plot2 <- plot2 + scale_colour_manual(values = colors)
# Add smooth trendline
plot_smooth <- plot2 + stat_smooth(data=polls, span = alpha, show_guide= F, main_aes)
# Extract the data so we can work on it
smooth_data <- ggplot_build(plot_smooth)$data[[2]]
# Use this if including previous data as part of the smoothing, but don't want it displayed!
# smooth_data <- smooth_data[smooth_data$x > last_election_date_value + 10, ]
# Format and add trendlines for each party/color
for(color in colors) {
party_trend <- subset(smooth_data, colour == color)
plot <- plot + geom_ribbon(data = party_trend, aes(x=x, ymin=ymin, ymax = ymax), alpha = .25)
plot <- plot + geom_line(data = party_trend, colour=color, aes(x = x, y = y))
}
# Legend (Party)
plot <- plot + scale_colour_manual(values = colors)
# The scatterplot points
plot <- plot + geom_point(main_aes, alpha=0.8)
# Vertical line if seperation in dates between election and polls
#plot <- plot + geom_vline(xintercept = 42296, linetype = 5, color='darkgray',show_guide= F)
# Legend (Sample Size)
plot <- plot + scale_size_area(max_size=3, breaks=seq(20,60,10), labels=seq(20,60,10)^2, name="Sample Size")
plot <- plot + guides(color = guide_legend(order=-1) )
# Last election data points and text
# plot <- plot + geom_point(data=LastElection, size=3, shape=5, show_guide = F, main_aes)
# plot <- plot + geom_point(data=LastElection, size=2, show_guide=F, main_aes)
# plot <- plot + geom_text(data=LastElection, show_guide=F,
# aes(x = Date, y = Popular_Support, label = Popular_Support), size=3, hjust=1.5, vjust=-0.4)
# This election -- Add after election to get final election result points
#plot <- plot + geom_point(data=ThisElection, size=3, shape=5, show_guide=F, main_aes) +
# geom_point(data=ThisElection, size=2, show_guide=F, main_aes) +
# geom_text(data=ThisElection, show_guide=F,
# aes(x = Date, y = Popular_Support, label = Popular_Support), size=3, hjust=-.2, vjust=-0.4)
campaign_start_date_value <- 43229 # 2018/05/09
next_election_date_value <- 43258 # 2018/06/07
start_date_value <- campaign_start_date_value
end_date_value <- next_election_date_value
date_labels <- as.character(seq(as.Date("2018-05-09"), as.Date("2018-06-07"), by=1))
date_labels[1] <- "2018-05-09"
date_labels[length(date_labels)] <- "Election\n2018-06-07"
min_gridlines <- seq(campaign_start_date_value, next_election_date_value, by=1)
maj_gridlines <- seq(campaign_start_date_value, next_election_date_value, by=1)
# X-axis
plot <- plot + scale_x_continuous(name = "Date", limits=c(start_date_value, end_date_value),
minor_breaks = min_gridlines,
breaks = maj_gridlines,
labels = date_labels
)
plot <- plot + theme(axis.text.x = element_text(size = 11, vjust=0.5, hjust=0, angle = 90, colour="#333333"))
plot <- plot + theme(axis.title.x = element_blank())
# Y-axis
plot <- plot + scale_y_continuous(name = "% Popular Support", lim=c(0,50), expand=c(0,0))
plot <- plot + theme(axis.text.y = element_text(size = 11))
plot <- plot + theme(axis.title.y = element_text(size = 11, angle = 90, colour="#333333"))
# Legend location
#theme(legend.justification=c(1,1), legend.position=c(1,1))
# Run this command seperately (after running the entire script) to get the plot to appear in RStudio
print(plot)
#dev.copy(svg,'Ontario2018PollsPlot.svg')
dev.off() # saves plot to R project's directory