In [None]:
# we will use climate time series data (daily):https://www.kaggle.com/datasets/sumanthvrao/daily-climate-time-series-data
# install scalecast library
!pip install scalecast --upgrade

In [None]:
import scalecast
import tensorflow
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from scalecast.Forecaster import Forecaster
# Load the training data. parse_dates asks pandas to convert 'date' to
# datetimes. If the column cannot be represented as datetimes (say because of
# an unparsable value or a mixture of timezones), pandas returns it unaltered
# as an object column instead of raising.

df = pd.read_csv('DailyDelhiClimateTrain.csv',parse_dates=['date'])
# Fail loudly here, rather than somewhere downstream, if the dates were not parsed.
assert pd.api.types.is_datetime64_any_dtype(df['date']), "'date' column was not parsed as datetime"
# take a look at the data
df.head()

In [None]:
# Build the scalecast Forecaster: the 'meantemp' column is the series to
# forecast (y) and the 'date' column supplies its dates (current_dates).
f = Forecaster(
    y=df['meantemp'],
    current_dates=df['date'],
)
# Bare expression so the notebook displays the Forecaster object.
f

In [None]:
# Inspect the PACF (Partial Auto Correlation Function) plot, which measures
# how much the y variable (meantemp) is correlated with past values of itself.
# The blue area in a PACF plot depicts the significance threshold: lags that
# fall within it are statistically close to zero, i.e. there is no significant
# autocorrelation between the data points at that lag.
f.plot_pacf(lags=30)# up to 30 lags
plt.show()
# Reading the plot:
# - lag 0 always shows a correlation of 1, since it is the correlation of the
#   time series with itself;
# - lag 1 is significant, meaning adjacent points (one step apart) are highly
#   correlated;
# - there is non-zero autocorrelation at several other lags as well.

In [None]:
# Split the series into its trend, seasonal, and residual components.
from matplotlib import pyplot
from statsmodels.tsa.seasonal import seasonal_decompose

# period=365: the data is daily, so one seasonal cycle is taken to be 365 observations.
result = seasonal_decompose(
    df['meantemp'],
    model='multiplicative',
    period=365,
)
result.plot()
pyplot.show()
# The figure indicates yearly seasonality and an increasing trend over time.

In [None]:
# Test the series for stationarity with the Augmented Dickey-Fuller (ADF) test.
# Null hypothesis: the time series is non-stationary (there is a unit root).
# If the p-value is above the critical size (default 0.05), we cannot reject
# that there is a unit root.
# With full_res=False the method returns a bool saying whether the test
# suggests stationarity; full_res=True is used here to get the full result.
stat = f.adf_test(full_res=True)
print(stat)

# Recorded result: the p-value is 0.28 > 0.05, so we cannot reject the null
# hypothesis and the time series is treated as non-stationary.

In [None]:
# Fit a first LSTM forecast. By default, this model runs with a single input
# layer of 8 units, the Adam optimizer, tanh activation, a learning rate of
# 0.001, and no dropout.

# The number of future dates generated below determines how far out all models
# will be forecast.
# NOTE(review): no random seed is set, so results will likely vary between runs.
f.set_validation_metric('mape')
f.set_test_length(30)       # 30 observations reserved to test the results
f.generate_future_dates(30) # 30 future points to forecast
f.set_estimator('lstm')     # LSTM neural network
# 10 lags, 5 training epochs; call_me is the label this model is stored under.
f.manual_forecast(call_me='lstm_10lags_epochs5',lags=10, epochs=5)
# Plot the test-set predictions against the actuals, with confidence intervals.
f.plot_test_set(ci=True)

In [None]:
# Second LSTM: a deeper network trained with early stopping.
# All data is scaled going into the model with a min-max scaler and un-scaled
# coming out. Anything you can pass to the fit() method in TensorFlow, you can
# also pass to the scalecast manual_forecast() method.
# 5 lags, since we noticed 5 days autocorrelation.
#
# Changes versus the first model: 4 LSTM layers, up to 20 epochs, and early
# stopping -- the validation loss is monitored on a held-out 20% of the
# training samples, and training quits once it has failed to improve for
# 5 consecutive epochs.
from tensorflow.keras.callbacks import EarlyStopping
f.manual_forecast(
    call_me='lstm_5lags_20epochs_4layers',
    lags=5,
    epochs=20,  # upper bound; early stopping may end training sooner
    batch_size=16,
    activation='tanh',
    optimizer='Adam',
    shuffle=True,
    learning_rate=0.01,
    validation_split=0.2,  # needed so that 'val_loss' exists to be monitored
    callbacks=[EarlyStopping(monitor='val_loss', patience=5)],
    lstm_layer_sizes=(72,)*4, # 4 layers, each 72 units (size)
    dropout=(0,)*4, # dropout rate for each layer
    plot_loss=True
)

# Plots all test-set predictions with the actuals; ci=True also displays the
# confidence intervals.
f.plot_test_set(ci=True)

In [None]:
# Draw the test-set predictions against the actuals, with confidence intervals
# (this repeats the call that ends the previous cell).
f.plot_test_set(ci=True)

In [None]:
# Plot the two best models ranked by test-set MAPE (the call below is currently disabled):
#f.plot_test_set(order_by='LevelTestSetMAPE',models='top_2',ci=True)## MAPE metric is used 

In [None]:
# Let's look at the statistics of our models: one row per evaluated model,
# showing its nickname and its test-set MAPE.
res = f.export(dfs=['model_summaries'])
models = res['ModelNickname']
# Select both columns in one step instead of re-filtering the frame once per
# model; the bare expression is rendered by the notebook as a table.
res[['ModelNickname', 'LevelTestSetMAPE']]