In [1]:
using CSV
using DataFrames
using Turing
using Plots
using StatsPlots
using Dates
using Distributions
using StatsFuns: logistic, logit

I'm basing my model on the 2012 paper by Linzer:
* [Dynamic Bayesian Forecasting of Presidential Elections in the States](https://www.ime.usp.br/~abe/lista/pdfpWRrt4xFLt.pdf)

In [2]:
# Read the FiveThirtyEight 2016 general-election poll file and materialise it as a DataFrame.
data = CSV.File(
	"./data/election_2016/polls/fivethirtyeight/president_general_polls_2016.csv"
)
df = DataFrame(data);

In [9]:
# Keep only the columns used downstream: poll metadata plus raw and adjusted shares.
new_df = select(
	df,
	[
		:type, :enddate, :pollster, :grade, :samplesize, :state,
		:rawpoll_clinton, :rawpoll_trump, :adjpoll_clinton, :adjpoll_trump,
	],
);

In [None]:
# Only the "polls-only" rows are used for now, because they are the most consistent
# with the model implemented below. Rows with any missing value are discarded afterwards.

clean_polls = filter(:type => ==("polls-only"), new_df) |> dropmissing;

In [None]:
# Parse every poll end date and write it back out in the same "mm/dd/yyyy" layout,
# so that all date strings share one zero-padded representation.
date_format = DateFormat("mm/dd/yyyy")
poll_dates = [
	Dates.format(DateTime(raw_date, date_format), date_format)
	for raw_date in clean_polls.enddate
];

In [5]:
# Order the polls chronologically. The strings are "mm/dd/yyyy", so comparing them as
# text sorts month-first and misorders dates from different years; sort on the parsed
# dates instead.
sort_idx = sortperm(Date.(poll_dates, date_format))
# NOTE(review): the last 51 entries (in sorted order) are dropped before de-duplicating;
# the reason for the constant 51 is not visible here — confirm it is still intended.
poll_dates_sorted = unique(poll_dates[sort_idx][1:end-51])
# `unique` keeps first occurrences in order, so the list is still chronological and
# reversing it yields latest-date-first.
poll_dates_sorted_rev = reverse(poll_dates_sorted)
states = unique(clean_polls.state)

12472-element Array{Float64,1}:
 40.50216
 40.56037
 40.48936
 35.22936
 35.28772
 35.21672
 34.75656
 34.81506
 34.74408
 35.18376
 35.2424
 35.17143
 36.00981
  ⋮
 41.80086
 52.72585
 52.55064
 36.69257
 26.74907
 52.87342
 24.32952
 16.09479
 28.57472
 34.18689
 40.50982
 45.83417

In [None]:
# This function groups all polls into a dictionary of dictionaries, so that the polls for a given state on a given date can be looked up directly.

"""
    group_by_state_and_date(df)

Group poll rows by state and then by poll end date.

Each row of `df` (iterated with `eachrow`) must provide `state`, `enddate`,
`adjpoll_clinton`, `adjpoll_trump` and `samplesize`. `enddate` is parsed with the
format `"mm/dd/yyyy"` and re-formatted with the same format, so the inner keys are
always zero-padded (for example `"11/7/2016"` is stored under `"11/07/2016"`).

Returns a `Dict{String, Dict{String, Array{Real,2}}}` mapping
`state => (date => M)`, where `M` is a `3 × n_polls` matrix with one column per poll:

- row 1: `adjpoll_clinton * samplesize / 100`, rounded to `Int64`;
- row 2: `adjpoll_trump`, stored as given;
- row 3: `samplesize`, rounded to `Int64`.
"""
function group_by_state_and_date(df)

	date_format = DateFormat("mm/dd/yyyy")
	state_and_date_dict = Dict{String, Dict{String, Array{Real,2}}}()

	for row in eachrow(df)
		state = String(row.state)
		# Normalise the key so lookups with zero-padded "mm/dd/yyyy" strings succeed.
		date_key = Dates.format(Date(row.enddate, date_format), date_format)

		clinton_count = round(Int64, row.adjpoll_clinton * row.samplesize / 100)
		sample_size = round(Int64, row.samplesize)
		# `Real[...]` keeps each element's own type; an untyped literal would promote
		# the integer counts to Float64.
		poll_column = reshape(Real[clinton_count, row.adjpoll_trump, sample_size], 3, 1)

		polls_by_date = get!(state_and_date_dict, state) do
			Dict{String, Array{Real,2}}()
		end

		if haskey(polls_by_date, date_key)
			polls_by_date[date_key] = hcat(polls_by_date[date_key], poll_column)
		else
			polls_by_date[date_key] = poll_column
		end
	end
	return state_and_date_dict
end

In [None]:
# Build the state => date => polls lookup from the cleaned poll table.
states_and_dates_dict = clean_polls |> group_by_state_and_date

In [None]:
# Dynamic state-level forecasting model in the style of Linzer (2012).
#
# Arguments
# - state_polls_dict: state => (date => 3 × n_polls matrix). Row 1 holds the Clinton
#   respondent count of each poll and row 3 its sample size; row 2 is not used here.
# - hist_state_forecast: one prior vote-share forecast per state, passed through `logit`
#   to anchor β at the first entry of `poll_dates`.
# - poll_dates: the date keys to walk over, in the order the random walk should follow.
# - states: state names; states[i] pairs with hist_state_forecast[i].
#
# Latent variables
# - σ_δ, σ_β: step standard deviations of the two random walks.
# - δ[i]: effect shared by all states at date i (δ[1] is fixed to 0).
# - β[i, j]: effect of state j at date i (β[1, :] is fixed to logit.(hist_state_forecast)).
#
# Likelihood
# For every poll found in `state_polls_dict`, the Clinton count is Binomial with the
# poll's sample size and success probability logistic(β[date, state] + δ[date]).
@model function linzer_model(state_polls_dict, hist_state_forecast, poll_dates, states)

	n_states = length(hist_state_forecast)
	n_dates = length(poll_dates)

	# The step sizes are sampled and then used as sampled. Overwriting them with constants
	# after the `~` statements would leave the priors without any effect on the model.
	σ_δ ~ Uniform(0, 0.5)
	σ_β ~ Uniform(0, 0.5)

	# `Real` element type so the containers can hold whatever number type the sampler uses.
	δ = Vector{Real}(undef, n_dates)
	β = Array{Real, 2}(undef, n_dates, n_states)

	# Anchor both walks at the first date.
	if n_dates > 0
		δ[1] = 0
		β[1, :] = logit.(hist_state_forecast)
	end

	# Random walks of the parameters β and δ over the remaining dates.
	for i in 2:n_dates
		δ[i] ~ Normal(δ[i-1], σ_δ)
		for j in 1:n_states
			β[i, j] ~ Normal(β[i-1, j], σ_β)
		end
	end

	# Vote-share probabilities: rows -> dates, columns -> states.
	vote_share = Array{Real, 2}(undef, n_dates, n_states)
	for i in 1:n_dates
		vote_share[i, :] = logistic.(β[i, :] .+ δ[i])
	end

	for i in 1:n_states
		polls_by_date = state_polls_dict[states[i]]

		for j in 1:n_dates
			# No poll for this state on this date: nothing to condition on.
			haskey(polls_by_date, poll_dates[j]) || continue

			day_polls = polls_by_date[poll_dates[j]]

			for k in axes(day_polls, 2)
				# Only the Clinton counts are used for now.
				# `Binomial` needs an Integer number of trials, so both values are rounded.
				n_respondents = round(Int, day_polls[3, k])
				n_clinton = round(Int, day_polls[1, k])

				# The counts come from a local variable, not from a model argument, so
				# `polls[k] ~ Binomial(...)` would declare a new latent variable instead of
				# conditioning on the data. Add the log-likelihood explicitly instead.
				Turing.@addlogprob! logpdf(Binomial(n_respondents, vote_share[j, i]), n_clinton)
			end
		end
	end
end

In [None]:
# One random prior forecast per state, each drawn from Normal(0.5, 0.02).
h = map(_ -> rand(Normal(0.5, 0.02)), 1:length(states))

In [None]:
# Draw 100 samples with HMC (step size 0.05, 10 leapfrog steps) from the model built on
# the grouped polls, the prior forecasts and the reverse-sorted dates.
posterior = let
	model = linzer_model(states_and_dates_dict, h, poll_dates_sorted_rev, states)
	sampler = HMC(0.05, 10)
	sample(model, sampler, 100)
end