# TODOs


In [None]:
library(forecast)

In [None]:
# Load every complaint CSV in `dataFolder` as a monthly time series.
#
# Args:
#   dataFolder: path to a directory holding one CSV file per complaint type.
#     Each file is read with read.csv() and must provide `Year` and
#     `Complaints` columns.
#
# Returns:
#   A named list of `ts` objects (frequency 12), keyed by the file name with
#   its extension removed.
loadData <- function(dataFolder) {
    # only pick up CSV files so that stray files in the folder are not parsed
    files <- list.files(dataFolder, pattern = "\\.csv$", ignore.case = TRUE)
    data <- list()
    for (file in files) {
        df <- read.csv(file.path(dataFolder, file), stringsAsFactors = FALSE)
        minYear <- min(df$Year)
        # strip the extension rather than assuming it is exactly 4 characters
        complaintType <- tools::file_path_sans_ext(file)
        # NOTE(review): assumes the rows are in chronological order and that
        # the first observation is January of the earliest year -- confirm
        tsObject <- ts(df$Complaints, start = c(minYear, 1), frequency = 12)
        data[[complaintType]] <- tsObject
    }
    data
}
data <- loadData("../../data/topNComplaints")

In [None]:
# pick the monthly series for the "Removal of garbage" complaint type
series <- data[["Removal of garbage"]]

In [None]:
# plot the full series (forecast::tsdisplay also draws its ACF and PACF)
tsdisplay(series)

In [None]:
# data before 2012 are too few to consider, so keep only the observations
# from January 2012 through June 2016
series <- window(series, start=c(2012, 1), end=c(2016, 6))
tsdisplay(series)

## Cleaning up data 

This data looks like it has outliers near the end of 2016. Let's take a look at how the series would change if these (possible) outliers were replaced with the estimates produced by `tsclean`.

In [None]:
# original series as a dashed red line, with the tsclean() version of it
# overlaid as a solid black line
plot(series, col="red", lty=2)
lines(tsclean(series), lty=1)
legend("topright", col=c("red", "black"), lty=c(2,1), legend=c("Original", "Cleaned"))

Although they look like outliers, we'll choose to model them as normal values for now, since the data after 2016 also exhibits a similar sharp uptrend

## Decomposition

The series does look like it has a seasonal component - let's take a look at that.

In [None]:
# STL decomposition into seasonal, trend and remainder components;
# s.window="periodic" keeps the seasonal pattern identical across years
plot(stl(series, s.window="periodic"))

In [None]:
# the remainder series has a sharp spike in end of 2016. Can we model this in
# the seasonal component, by changing s.window to something smaller?
# NOTE(review): the series is windowed to end in June 2016, so the spike is
# presumably in late 2015 -- confirm
plot(stl(series, s.window=6)) # change s.window to something that makes sense

In [None]:
# let's take a look at which month this series peaks
# select the seasonal component by name rather than by column position
seasonal <- stl(series, s.window = 6)$time.series[, "seasonal"] # change s.window
plot(seasonal, col = "grey")
month <- 11 # change this to the month you want to mark
# one dashed vertical line per year covered by the series, at the chosen
# month; abline() is vectorised over `v`, so no loop is needed
years <- start(seasonal)[1]:end(seasonal)[1]
abline(v = years + (month - 1) / 12, lty = 2)
# looks like november-december

In [None]:
# this decomposition looks like it fits the data well - since the seasonal
# component does seem to increase as time progresses
# let's set s.window = 6
# NOTE(review): the stl() documentation recommends an odd s.window of at
# least 7; consider whether 7 would be a better choice than 6
stl.fit <- stl(series, s.window=6)
# remove the estimated seasonal component; the rest of the analysis works on
# this seasonally adjusted series
series.adj <- seasadj(stl.fit)
tsdisplay(series.adj)

## Forecasting
### ARIMA models - estimating p, d, q

First, let us estimate $d$. This is done by looking at the ACF of the data.

In [None]:
# ACF of the seasonally adjusted series, used to judge whether differencing
# is needed
Acf(series.adj)

In [None]:
# the above series is a classic example of a series that requires a diff of
# order 1, so let's try that out and take a look at the ACF to see if it is
# over-differenced
tsdisplay(diff(series.adj, lag=1, differences = 1))

In [None]:
# looks like the series has a strong, negative ACF at lag 2 -
# which may mean it is over-differenced. we should try both d=0 and d=1 while
# modeling, and use AR and MA components to compensate for
# under/over-differencing
# let's also look at d=2 (on the seasonally adjusted series, like the d=1
# display and the standard-deviation comparison)
tsdisplay(diff(series.adj, lag = 1, differences = 2))

In [None]:
# take a look at the standard deviation of the seasonally adjusted series at
# each order of differencing (d = 0, 1, 2)
sd.0 <- sd(series.adj)
sd.1 <- sd(diff(series.adj, differences = 1))
sd.2 <- sd(diff(series.adj, differences = 2))
print(paste0("SD with d = 0: ", sd.0, ", SD with d = 1: ", sd.1, ", SD with d = 2: ", sd.2))
# in terms of sd, d=1 is a better fit

In [None]:
# first difference of the seasonally adjusted series (the d=1 candidate)
series.diff <- diff(series.adj, lag=1, differences = 1)

In [None]:
plot(series.diff, col = "grey")
# a 2x4 MA: forecast::ma() with an even order and centre = TRUE already
# applies the extra 2-term average that centres the 4-term one
lines(ma(series.diff, order = 4, centre = TRUE))
# horizontal reference line at the mean of the differenced series
abline(h = mean(series.diff), col = "blue", lty = 2)

In [None]:
# cross-check the choice of d with forecast::ndiffs(), which estimates the
# number of differences needed
ndiffs(series.adj)

Next, we need to estimate $p$ and $q$. To do this, we take a look at the PACF of the data. Note that this analysis is done on the differenced data; if we decide to fit a model with $d=0$, then we need to perform this analysis on the undifferenced data as well.

In [None]:
# let d=0 first
# looks like an AR(3), MA(6)
# NOTE(review): only the PACF is drawn here; presumably the MA order was
# read off the ACF plotted earlier -- confirm
Pacf(series.adj)

In [None]:
# let's try with d=1 (PACF of the first-differenced series)
# looks like an MA(2) process
Pacf(series.diff)

### Building candidate models

In [None]:
# Fit an ARIMA model, plot its forecast and run residual diagnostics.
#
# Args:
#   series:   training series passed to forecast::Arima().
#   order:    c(p, d, q), the non-seasonal ARIMA order.
#   h:        number of periods to forecast.
#   testData: optional hold-out series; when supplied it is overlaid on the
#             forecast plot and the forecast accuracy against it is printed.
#
# Returns:
#   Invisibly, the Ljung-Box test result for the model residuals.
modelArima <- function(series, order, h, testData = NULL) {
    fit <- Arima(series, order = order)
    print(summary(fit))
    predictions <- forecast(fit, h)

    # y-limits span the training data and, when given, the test data.
    # range() simply skips a NULL testData, whereas min(NULL) / max(NULL)
    # raise warnings.
    plot(predictions, ylim = range(series, testData, na.rm = TRUE))
    if (!is.null(testData)) {
        lines(testData, col = "red", lty = 2)
        print(accuracy(predictions, testData))
    }

    # check if the residuals look like white noise
    model.residuals <- residuals(fit)
    Acf(model.residuals, main = "Residuals")

    # portmanteau test: the degrees of freedom to subtract are the number of
    # estimated ARMA coefficients (p + q), not a fixed constant
    fitdf <- order[1] + order[3]
    # keep the usual 24 lags, but make sure lag > fitdf so the test is defined
    lag <- max(24, fitdf + 1)
    print(Box.test(model.residuals, lag = lag, fitdf = fitdf, type = "Ljung-Box"))
}

In [None]:
# split the series into a test and a train set
series.train <- window(series.adj, end=c(2015, 6))
series.test <- window(series.adj, start=c(2015, 7))

In [None]:
# with d=0, p=3, q=6; forecast as many periods as the test set holds and
# compare against it
modelArima(series.train, c(3, 0, 6), length(series.test), series.test)

In [None]:
# with d=1, p=0, q=2 (the MA(2) suggested by the differenced series);
# forecast as many periods as the test set holds and compare against it
modelArima(series.train, c(0, 1, 2), length(series.test), series.test)