# TODOs


In [None]:
library(forecast)

In [None]:
#' Load every complaint file in a folder as a monthly time series.
#'
#' @param dataFolder Path to a folder of CSV files. Each file is read with
#'   `read.csv()` and must provide `Year` and `Complaints` columns.
#' @return A named list of `ts` objects (frequency 12), keyed by the file
#'   name without its extension.
loadData <- function(dataFolder) {
    files <- list.files(dataFolder)
    data <- list()
    for (file in files) {
        df <- read.csv(file.path(dataFolder, file), stringsAsFactors = FALSE)
        minYear <- min(df$Year)
        # Strip the extension by pattern instead of assuming it is 4 characters
        # long; a trailing space in the base name is preserved.
        complaintType <- tools::file_path_sans_ext(file)
        # The first row is taken to be January of the earliest year.
        tsObject <- ts(df$Complaints, start = c(minYear, 1), frequency = 12)
        data[[complaintType]] <- tsObject
    }
    data
}
data <- loadData("../../data/topNComplaints")

In [None]:
series <- data[["Mosquito menace "]]

In [None]:
tsdisplay(series)

In [None]:
# Data before 2012 are too sparse to consider, so keep only Jan 2012 - Jun 2016
series <- window(series, start=c(2012, 1), end=c(2016, 6))
# Re-plot the trimmed series
tsdisplay(series)

## Cleaning up data 

This data looks like it has 3 outliers: one in 2013-2014 and two near 2015. Let's take a look at the 'cleaned' data.

In [None]:
# Overlay the original series (red, dashed) with the tsclean() output (black, solid)
plot(series, col="red", lty=2)
lines(tsclean(series), lty=1)
legend("topright", col=c("red", "black"), lty=c(2,1), legend=c("Original", "Cleaned"))

Let's create the cleaned series. For the initial analysis we will use both series: one cleaned, and the other left as is. For fitting time series models, we will stick to the cleaned series.

In [None]:
series.cleaned <- tsclean(series)

## Decomposition

The series does look like it has a seasonal component - let's take a look at that.

In [None]:
# STL decomposition of the uncleaned series with a periodic seasonal window
plot(stl(series, s.window="periodic"))
# The two spikes in the seasonal component are probably exaggerated by the outliers, so for estimating
# the seasonal component it would be better to look at the cleaned data

In [None]:
# let's fiddle with the s.window parameter
plot(stl(series, s.window=6))

In [None]:
# now take a look at the cleaned series
plot(stl(series.cleaned, s.window=6))
# this is much more regular, especially the seasonal component. 

In [None]:
# Find the month in which the series peaks: draw the seasonal component of the
# cleaned series and mark the chosen month in every year with a dashed line
seasonal <- stl(series.cleaned, s.window = 6)$time.series[, 1] # change s.window
plot(seasonal, col = "grey")
month <- 11 # change this to month you want
# abline() accepts a vector of positions, so all five years are drawn at once
abline(v = (month - 1) / 12 + 2012:2016, lty = 2)

In [None]:
# This decomposition looks like it fits the data well, since the seasonal component does seem to increase as time progresses,
# so let's set s.window = 6
stl.fit <- stl(series.cleaned, s.window=6)
# Remove the seasonal component; the ARIMA analysis below works on this adjusted series
series.adj <- seasadj(stl.fit)
tsdisplay(series.adj)

## Forecasting
### ARIMA models - estimating p, d, q

First, let us estimate $d$. This is done by looking at the ACF of the data.

In [None]:
Acf(series.adj)

In [None]:
# The above series is a classic example of a series that requires a diff of order 1,
# so let's try that out and take a look at the ACF to see if it is over-differenced
tsdisplay(diff(series.adj, lag=1, differences = 1))

In [None]:
# The d=1 series has a strong, positive ACF at lag 12,
# so it is possible that it still has a seasonal component.
# Let's also look at d=2, on the same seasonally adjusted series that was
# used for d=1 (not the raw series) so the two displays are comparable.
tsdisplay(diff(series.adj, lag = 1, differences = 2))

In [None]:
# Compare the standard deviation of the adjusted series at d = 0, 1 and 2
sd.0 <- sd(series.adj)
sd.1 <- sd(diff(series.adj, differences = 1))
sd.2 <- sd(diff(series.adj, differences = 2))
print(paste0("SD with d = 0: ", sd.0, ", SD with d = 1: ", sd.1, ", SD with d = 2: ", sd.2))
# in terms of SD, d=1 is a better fit

In [None]:
ndiffs(series.adj)

In [None]:
series.diff <- diff(series.adj, lag=1, differences = 1)

In [None]:
# Plot the differenced series (grey) with a smoothed version and its mean
plot(series.diff, col="grey")
# a 2x4 MA: an order-2 moving average smoothed again by an order-4 moving average
lines(ma(ma(series.diff, order=2), order=4))
# horizontal reference line (intercept = mean, slope = 0)
abline(mean(series.diff), 0, col="blue", lty=2)

Next, we need to estimate p and q. To do this, we take a look at the PACF of the data. Note that this analysis is done on the differenced data. If we decide to fit a model with d=0, then we need to perform this analysis for the un-differenced data as well

In [None]:
# let d=0 first
# looks like a AR(1), MA(12)
Pacf(series.adj)

In [None]:
# let's try with d=1
# looks like AR(11), MA(4) process
Pacf(series.diff)

### Building candidate models

In [None]:
#' Fit a non-seasonal ARIMA model, plot its forecast and check the residuals.
#'
#' Prints the model summary, plots an `h`-step forecast (with `testData`
#' overlaid in red when supplied, plus the forecast accuracy against it),
#' plots the residual ACF and prints a Ljung-Box test of the residuals.
#'
#' @param series Training series passed to `Arima()`.
#' @param order ARIMA order `c(p, d, q)`.
#' @param h Forecast horizon passed to `forecast()`.
#' @param testData Optional hold-out series used for the overlay and accuracy.
#' @return The Ljung-Box test result (the value of the final `print()`).
modelArima <- function(series, order, h, testData = NULL) {
    fit <- Arima(series, order = order)
    print(summary(fit))
    predictions <- forecast(fit, h = h)
    # y-axis spans the training data and, when given, the test data; c() drops
    # a NULL testData, which avoids the empty-argument warnings of min()/max()
    y.limits <- range(c(series, testData))

    plot(predictions, ylim = y.limits)
    if (!is.null(testData)) {
        lines(testData, col = "red", lty = 2)
        print(accuracy(predictions, testData))
    }
    # check if residuals look like white noise
    fit.residuals <- residuals(fit)
    Acf(fit.residuals, main = "Residuals")
    # portmanteau test: fitdf is the number of fitted AR + MA coefficients
    # (p + q) rather than a fixed constant, and lag must exceed fitdf
    n.coef <- order[1] + order[3]
    print(Box.test(fit.residuals, lag = max(24, n.coef + 1), fitdf = n.coef,
                   type = "Ljung-Box"))
}

In [None]:
# Split the seasonally adjusted series into a train set (up to Jun 2015)
# and a test set (Jul 2015 onwards)
series.train <- window(series.adj, end=c(2015, 6))
series.test <- window(series.adj, start=c(2015, 7))

In [None]:
# with d=0, p=1, q=12 (order is c(p, d, q); forecast horizon = length of the test set)
modelArima(series.train, c(1, 0, 12), length(series.test), series.test)

In [None]:
# with d=1, p=11, q=5 (order is c(p, d, q); forecast horizon = length of the test set)
modelArima(series.train, c(11, 1, 5), length(series.test), series.test)

## Exponential Smoothing

In [None]:
# Mosquito menace: show the series, then restrict it to Apr 2012 - Jun 2016
series
series <- window(series, start = c(2012, 4), end = c(2016, 6))

# STL decomposition (s.window = 8) of the full series
stl.fit <- stl(series, s.window = 8)
series.adj <- seasadj(stl.fit)
seasonal <- stl.fit$time.series[, 1]

# Same decomposition on the training span only (up to Jun 2015); the first
# column of the time.series component is the one stored as `seasonal` above
train_stl <- stl(window(series, end = c(2015, 6)), s.window = 8)
seasonal_train <- train_stl$time.series[, 1]

#tsdisplay(series.adj)
plot(seasonal)
plot(seasonal_train)

In [None]:
# Repeat the decomposition with a periodic seasonal window, for the full
# series and for the training span (up to Jun 2015)
stl.fit <- stl(series, s.window = "periodic")
series.adj <- seasadj(stl.fit)
seasonal <- stl.fit$time.series[, 1]

train_stl <- stl(window(series, end = c(2015, 6)), s.window = "periodic")
seasonal_train <- train_stl$time.series[, 1]

#tsdisplay(series.adj)
plot(seasonal)
plot(seasonal_train)

In [None]:
seasonal
seasonal_train

#### Note: Three peaks are observed in both the training and overall data sets, but they differ in amplitude. The peaks in November and the drops in April-May are almost the same in both. There are variations in some other months, yet the pattern of seasonality looks more or less the same.

In [None]:
#' Aggregate a series by position within its seasonal cycle.
#'
#' The series is aligned so that element 1 corresponds to cycle position 1
#' (leading positions before the series' start are treated as missing), and
#' the observations at each cycle position are summed or averaged across
#' cycles. The resulting one-cycle pattern is repeated `years` times.
#'
#' @param ts_data_in Series to aggregate; its `frequency()` and `start()`
#'   determine the cycle length and the alignment.
#' @param type 1 for the per-position sum, 2 for the per-position mean of the
#'   non-missing observations.
#' @param start_value Start of the returned series, passed to `ts()`.
#' @param years Number of times the one-cycle pattern is repeated.
#' @return A `ts` with the same frequency as the input. With `type = 2`, a
#'   position that has no observations yields NaN.
period_stat <- function(ts_data_in, type = 1, start_value, years) {
    if (type != 1 && type != 2) {
        stop("Invalid type")
    }

    freq <- frequency(ts_data_in)

    # Pad the front so that the first element sits at cycle position 1
    lead_pad <- start(ts_data_in)[2] - 1
    values <- c(rep(NA_real_, lead_pad), as.numeric(ts_data_in))

    # Count cycles on the padded length; counting on the raw length dropped
    # trailing observations whenever the series started mid-cycle
    n_cycles <- ceiling(length(values) / freq)
    values <- c(values, rep(NA_real_, n_cycles * freq - length(values)))

    # One column per cycle, one row per position within the cycle
    by_cycle <- matrix(values, nrow = freq, ncol = n_cycles)
    sum_vec <- rowSums(by_cycle, na.rm = TRUE)
    freq_sum <- rowSums(!is.na(by_cycle))

    final_ts <- if (type == 1) sum_vec else sum_vec / freq_sum

    ts(rep(final_ts, years), frequency = freq, start = start_value)
}

In [None]:
#Adjust the negative values in the ts data by shifting the whole series upwards
es_series <- series.adj
min_ts_value <- min(es_series)

#bias_value moves the minimum of the series to exactly 1
bias_value <- (-1*min_ts_value) + 1
ES_series <- es_series+ bias_value
#plot(ES_series)
ES_series

#Train on data up to Jun 2015; test on Jul 2015 onwards
train_data <- window(ES_series, end=c(2015, 6))
test_data <- window(ES_series, start=c(2015, 7))

In [None]:
#Getting the mean value from the seasonal components for the data set and not for the training set alone.
#Need to adjust based on the input from Suchana.

seasonal_mean <- period_stat(seasonal,2,c(2012,1),years = 7)

In [None]:
# Preprocessing: replace exact zeros in the training data with a small
# positive value
train_data <- replace(train_data, train_data == 0, 0.01)

# Fit ets() without specifying a model, then plot its default forecast with
# the test data overlaid in red
ets1 <- ets(train_data)
summary(ets1)
plot(forecast(ets1))
lines(test_data, col = "red")

In [None]:
# Ljung-Box test on the residuals of ets1: checks whether the residuals are
# autocorrelated. Run once, displayed, and its p-value kept in p_value.
ets1_ljung_box <- Box.test(ets1$residuals, lag = 20, type = "Ljung-Box")
ets1_ljung_box
p_value <- ets1_ljung_box$p.value
Acf(ets1$residuals)

In [None]:
# Candidate ETS specifications as three-letter codes (error, trend,
# seasonality), e.g.
#   AAA -> additive errors, additive trend and additive seasonality
#   ANN -> no trend or seasonality
all_types <- c(
    "ANN", "AAN", "AAA", "ANA",
    "MNN", "MAN", "MNA", "MAA",
    "MMN", "MNM", "MMM", "MAM"
)
# Number of periods to forecast when scoring each candidate
forecast_values <- 12

In [None]:
# Fit every candidate ETS model, with and without damping, and record for
# each one:
#   all_fit[[label]]     -> c(test-set MAPE, 100 * Ljung-Box p-value)
#   test_models[[label]] -> the fitted model
all_fit <- list()
test_models <- list()

print("Fitting various models: ")
for (bool in c(TRUE, FALSE)) {
    for (model_type in all_types) {
        # Damping only makes sense with a trend: skip damped fits of models
        # whose trend letter (2nd character) is "N"
        if (bool && substr(model_type, 2, 2) == "N") {
            next
        }
        model_label <- paste0("ETS Model: ", model_type, ", Damped: ", bool)
        test_model <- ets(train_data, model = model_type, damped = bool)

        # accuracy() takes the forecast first and the actual values second,
        # so that percentage errors are relative to the actuals; element 5 of
        # the result is the MAPE
        test_forecast <- forecast.ets(test_model, h = forecast_values)$mean
        test_mape <- accuracy(test_forecast, test_data)[5]
        ljung_p <- Box.test(test_model$residuals, lag = 20, type = "Ljung-Box")$p.value

        all_fit[[model_label]] <- c(test_mape, 100 * ljung_p)
        test_models[[model_label]] <- test_model

        print(test_model$method)
        print(test_mape)
        print("")

        # NOTE(review): models with autocorrelated residuals (10% level) are
        # not excluded here; the scaled p-value is only recorded in all_fit
    }
}

In [None]:
#Finding the best fit
# NOTE(review): every fitted model is treated as a candidate; no Ljung-Box
# filter is applied, so the fallback below only triggers when nothing was fitted
proper_models <- all_fit
if (length(proper_models) == 0) {
    print("None of the model satisfies - Ljung-Box test; Model with least 3 p values taken")
    p_values <- sapply(all_fit, function(x) x[2])
    proper_models <- all_fit[order(p_values)][1:3]
}

# First element of each entry is the test-set MAPE
model_mapes <- vapply(proper_models, function(x) x[1], numeric(1))
best_mape <- min(model_mapes)
best_model <- names(which.min(model_mapes))

# TS_data is not defined anywhere in this notebook; use it when an earlier
# session provides it, otherwise label the output with the complaint analysed
complaint_name <- if (exists("TS_data")) names(TS_data) else "Mosquito menace"
print(paste0("Complaint: ", complaint_name))
print(paste0("Best Model:", best_model))
print(paste0("Best Mape: ", best_mape))

In [None]:
#Finding top n fits
# Keep at most 3 models, fewer when fewer candidates exist
Top_n <- min(3, length(proper_models))

# Rank candidates by MAPE (first element of each entry), best first;
# seq_len() yields an empty selection when there are no candidates
model_mapes <- vapply(proper_models, function(x) x[1], numeric(1))
top_mape_val <- proper_models[order(model_mapes)][seq_len(Top_n)]
top_models <- names(top_mape_val)

In [None]:
top_mape_val
seasonal_mean

In [None]:
# Shifted series (black) with the test span (blue) and the forecasts of the
# selected models: red = top, green = second, yellow = third
plot(ES_series, col = "black")
lines(test_data, col = "blue")
model_colours <- c("red", "green", "yellow")
# Loop over however many models were selected (there may be fewer than 3)
for (i in seq_along(top_models)) {
    lines(forecast.ets(test_models[[top_models[i]]], h = 12)$mean,
          col = model_colours[i])
}

#Observation: Unusual peak at December'15. To check if it is an anomaly

### Getting back the original data

In [None]:
#Removing the bias value which was added earlier to overcome the negative values
ES_series_bias <- ES_series - bias_value
test_series_bias <- test_data - bias_value
forecast1_bias <- forecast.ets(test_models[[top_models[1]]],h=12)$mean - bias_value
forecast2_bias <- forecast.ets(test_models[[top_models[2]]],h=12)$mean - bias_value
forecast3_bias <- forecast.ets(test_models[[top_models[3]]],h=12)$mean - bias_value

#Adding back the seasonal value from stl decomposition
ES_value <- ES_series_bias + seasonal
test_series <- test_series_bias + seasonal

#The forecasts get the averaged seasonal component (seasonal_mean) added instead
forecast1 <- forecast1_bias + seasonal_mean
forecast2 <- forecast2_bias + seasonal_mean
forecast3 <- forecast3_bias + seasonal_mean


#accuracy() of each selected fitted model, called without test data
accuracy(test_models[[top_models[1]]])
accuracy(test_models[[top_models[2]]])
accuracy(test_models[[top_models[3]]])

In [None]:
#Checking the MAPE values with original data
#accuracy() is called as (forecast, actual test values) for each of the three selected models
#NOTE(review): all three labels print "Top model: "; only the model name tells them apart
print(paste0("Top model: ", top_models[1]))
accuracy(forecast1,test_series)
print(paste0("Top model: ", top_models[2]))
accuracy(forecast2,test_series)
print(paste0("Top model: ", top_models[3]))
accuracy(forecast3,test_series)

#accuracy(test_data, forecast.ets(test_models[[top_models[3]]],h=12)$mean )

In [None]:
# Ljung-Box test on the residuals of the top model: checks whether the
# residuals are autocorrelated
print(top_models[1])
top1_residuals <- test_models[[top_models[1]]]$residuals
top1_ljung_box <- Box.test(top1_residuals, lag = 20, type = "Ljung-Box")
top1_ljung_box
# Keep the p-value itself (not the whole test object), as done for ets1
p_value <- top1_ljung_box$p.value
Acf(top1_residuals)

In [None]:
# Ljung-Box test on the residuals of the second-best model: checks whether
# the residuals are autocorrelated
print(top_models[2])
top2_residuals <- test_models[[top_models[2]]]$residuals
top2_ljung_box <- Box.test(top2_residuals, lag = 20, type = "Ljung-Box")
top2_ljung_box
# Keep the p-value itself (not the whole test object), as done for ets1
p_value <- top2_ljung_box$p.value
Acf(top2_residuals)

In [None]:
# Ljung-Box test on the residuals of the third-best model: checks whether
# the residuals are autocorrelated
print(top_models[3])
top3_residuals <- test_models[[top_models[3]]]$residuals
top3_ljung_box <- Box.test(top3_residuals, lag = 20, type = "Ljung-Box")
top3_ljung_box
# Keep the p-value itself (not the whole test object), as done for ets1
p_value <- top3_ljung_box$p.value
Acf(top3_residuals)

#### Residual plot output: From the plots, the errors seem to be random and there does not appear to be much autocorrelation among them, suggesting that the residuals behave like white noise and the models have captured the structure in the data.

In [None]:
#Final comparison on the original scale (bias removed, seasonal component added back)
plot(ES_value,col = "black") #Original data set
lines(test_series, col = "blue") #Original test data
lines(test_series_bias + seasonal_mean, col = "black") #Deseasonalised test data with the average seasonal component applied
lines(forecast1, col = "red") #Top model
lines(forecast2, col = "green") #Top second model
lines(forecast3, col = "yellow") #Top third model

#### Observation: After the seasonal component is applied, the forecast overpredicts the complaints for one of the months (due to the pattern of the seasonal component). So either the seasonality is absent in the actual data for that particular month, or the derived seasonal component is not accurate.