-
Notifications
You must be signed in to change notification settings - Fork 0
/
Ontario2018.R
203 lines (151 loc) · 7.47 KB
/
Ontario2018.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
library(reshape)
library(RCurl)
library(XML)
library(stringr)
# PARAMETERS FOR PLOT GENERATION:
# How many nearest data points to use for the local fit
num_data_points = 4
theurl <- getURL("https://en.wikipedia.org/wiki/Ontario_general_election,_2018", ssl.verifyPeer=FALSE)
tables <- readHTMLTable(theurl)
#get the nth tables on the page
df <- tables[[24]]
# Remove empty rows (wikipedia tables sometime use empty rows for spacing)
df <- df[!apply(df == "", 1, all),]
# Remove all non-standard rows, they will have <NA> values in them
# (e.g Rows that are comments about leadership changes)
df <- df[complete.cases(df),]
df #print resulting table to console for debugging
#get columns 1-2 and 4-9. Col 3 is a link to poll source
df <- df[1:nrow(df), c(1:2, 4:10) ]
#add column headings
c.names <- c("Firm", "Date", "Liberal", "PC", "NDP", "Green", "Other", "Error", "Sample_Size")
names(df) <- c.names
#print resulting table to console for debugging
df
# Calculate LOESS smoothing parameter [alpha] from number of polls [nrow(df)]
# so that plot uses the same number of datapoints for the local fit even as new polls are added.
alpha <- num_data_points / nrow(df)
print(alpha)
# DATE FORMATTING
# format Date column
df$Date = as.Date(substr(df[, 2], 9, 18))
#as.numeric gives date since 1970-01-01
# add days from 1900-01-01 to match spreadsheet data, +1 for 1970-01-01 day, and +1 for leapyear bug in 1900
df$Date = as.numeric(df$Date) - as.numeric(as.Date('1900-01-01')) +2
# SAMPLE SIZES
## EXTRACTING ROLLING POLL INFO
# Samples sizes are given as e.g.: 1000 (1/4), need to multiple sample size by fraction.
# Get number in parentheses
Rolling_Poll <- str_extract(df$Sample_Size, "(?<=\\().*(?=\\))")
# convert fraction text "1/4" to decimal number 0.25
Rolling_Poll = sapply(Rolling_Poll, function(x) eval(parse(text=x)))
# change NA to 1
Rolling_Poll[is.na(Rolling_Poll)] <- 1
# Extract sample size
df$Sample_Size = gsub(",", "", unlist(df$Sample_Size)) # remove commas in numbers
df$Sample_Size = sub("^$", "99999999", df$Sample_Size)
# Fancy regex to remove parenthetical notes, eg 1,000 (1/4) needs to remove comma and (1/4) to convert to number
df$Sample_Size = as.numeric(gsub("\\s*\\([^\\)]+\\)", "", unlist(df$Sample_Size)))
df$Sample_Size = df$Sample_Size * Rolling_Poll
# Print ot console when run individually
df
df$Error = 1/sqrt(df$Sample_Size)
# reorganize data
mdata <- melt(df, id=c("Date", "Firm", "Error", "Sample_Size"))
# sort data
mdata <- mdata[with(mdata, order(Date)), ]
# relabel data after reorganization
c.names <- c("Date", "Firm", "Error", "Sample_Size", "Party", "Popular_Support")
names(mdata) <- c.names
mdata
mdata$Popular_Support <- str_trim(mdata$Popular_Support)
mdata$Popular_Support <- as.numeric(mdata$Popular_Support)
#mdata$Sample_Size <- as.numeric(mdata$Sample_Size)
polls <- mdata
# Last election data
# last_election_date_value <- 41802 # 2014/06/12
# Date = rep.int(last_election_date_value, 5)
# Party = c('Liberal','PC','NDP','Green','Other')
# Popular_Support = c(38.65,31.25,23.75,4.84,1.51)
# Error = rep.int(0,5)
# LastElection = data.frame(Date, Party, Popular_Support, Error)
# This election -- Add after election to get final election result points
next_election_date_value <- 43258 # 2018/06/07
#Date = rep.int(43258, 5)
#Party = c('Liberal','PC','NDP','Green','Other')
#Popular_Support = c(0,0,0,0,0)
#Error = rep.int(0,5)
#ThisElection = data.frame(Date, Party, Popular_Support, Error)
# Use this if including previous data as part of the smoothing, but don't want it displayed!
# election_polls <- polls[polls$Date > (last_election_date_value + 10),]
#election_polls <- polls
colors <- c("red", "blue", "orange", "green3", "#333333")
library(ggplot2)
main_aes = aes(x = Date, y = Popular_Support, colour=Party, size=1/Error, weight=1/Error)
# Set plot file settings. This need to be before we generate the plot.
svg(filename="Ontario2018PollsPlot.svg",
width=15, # inches, think 72px/inch: want 1080px / 72 ppi = 15 inches
height=7,
pointsize=12
)
plot <- ggplot(polls)
plot2 <- plot + geom_point(main_aes)
plot2 <- plot2 + scale_colour_manual(values = colors)
# Add smooth trendline
plot_smooth <- plot2 + stat_smooth(data=polls, span = alpha, show_guide= F, main_aes)
# Extract the data so we can work on it
smooth_data <- ggplot_build(plot_smooth)$data[[2]]
# Use this if including previous data as part of the smoothing, but don't want it displayed!
# smooth_data <- smooth_data[smooth_data$x > last_election_date_value + 10, ]
# Format and add trendlines for each party/color
for(color in colors) {
party_trend <- subset(smooth_data, colour == color)
plot <- plot + geom_ribbon(data = party_trend, aes(x=x, ymin=ymin, ymax = ymax), alpha = .25)
plot <- plot + geom_line(data = party_trend, colour=color, aes(x = x, y = y))
}
# Legend (Party)
plot <- plot + scale_colour_manual(values = colors)
# The scatterplot points
plot <- plot + geom_point(main_aes, alpha=0.8)
# Vertical line if seperation in dates between election and polls
#plot <- plot + geom_vline(xintercept = 42296, linetype = 5, color='darkgray',show_guide= F)
# Legend (Sample Size)
plot <- plot + scale_size_area(max_size=3, breaks=seq(20,60,10), labels=seq(20,60,10)^2, name="Sample Size")
plot <- plot + guides(color = guide_legend(order=-1) )
# Last election data points and text
# plot <- plot + geom_point(data=LastElection, size=3, shape=5, show_guide = F, main_aes)
# plot <- plot + geom_point(data=LastElection, size=2, show_guide=F, main_aes)
# plot <- plot + geom_text(data=LastElection, show_guide=F,
# aes(x = Date, y = Popular_Support, label = Popular_Support), size=3, hjust=1.5, vjust=-0.4)
# This election -- Add after election to get final election result points
#plot <- plot + geom_point(data=ThisElection, size=3, shape=5, show_guide=F, main_aes) +
# geom_point(data=ThisElection, size=2, show_guide=F, main_aes) +
# geom_text(data=ThisElection, show_guide=F,
# aes(x = Date, y = Popular_Support, label = Popular_Support), size=3, hjust=-.2, vjust=-0.4)
campaign_start_date_value <- 43229 # 2018/05/09
next_election_date_value <- 43258 # 2018/06/07
start_date_value <- campaign_start_date_value
end_date_value <- next_election_date_value
date_labels <- as.character(seq(as.Date("2018-05-09"), as.Date("2018-06-07"), by=1))
date_labels[1] <- "2018-05-09"
date_labels[length(date_labels)] <- "Election\n2018-06-07"
min_gridlines <- seq(campaign_start_date_value, next_election_date_value, by=1)
maj_gridlines <- seq(campaign_start_date_value, next_election_date_value, by=1)
# X-axis
plot <- plot + scale_x_continuous(name = "Date", limits=c(start_date_value, end_date_value),
minor_breaks = min_gridlines,
breaks = maj_gridlines,
labels = date_labels
)
plot <- plot + theme(axis.text.x = element_text(size = 11, vjust=0.5, hjust=0, angle = 90, colour="#333333"))
plot <- plot + theme(axis.title.x = element_blank())
# Y-axis
plot <- plot + scale_y_continuous(name = "% Popular Support", lim=c(0,50), expand=c(0,0))
plot <- plot + theme(axis.text.y = element_text(size = 11))
plot <- plot + theme(axis.title.y = element_text(size = 11, angle = 90, colour="#333333"))
# Legend location
#theme(legend.justification=c(1,1), legend.position=c(1,1))
# Run this command seperately (after running the entire script) to get the plot to appear in RStudio
print(plot)
#dev.copy(svg,'Ontario2018PollsPlot.svg')
dev.off() # saves plot to R project's directory