# Modeling

This notebook is dedicated to modeling the time series data. We will split the data into training and testing sets, train various models such as ARIMA, SARIMA, and LSTM, and tune hyperparameters.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout


In [None]:
# Load the processed data, parsing the `date` column and using it as the index
data = pd.read_csv(
    '../data/processed/processed_data.csv',
    parse_dates=['date'],
    index_col='date',
)

# Conform the index to a daily frequency, then forward-fill the resulting gaps.
# `DataFrame.ffill()` is used instead of `fillna(method='ffill')`, whose `method`
# argument is deprecated in recent pandas. Reassigning (rather than mutating
# in place) keeps the cell idempotent on re-run.
data = data.asfreq('D').ffill()

# Display the first few rows of the dataset
data.head()

In [None]:
# Chronological 80/20 split: the first 80% of rows become the training set and
# the remaining rows the test set (no shuffling, so time order is preserved)
train_size = int(len(data) * 0.8)
train = data.iloc[:train_size]
test = data.iloc[train_size:]

print(f'Train size: {len(train)}')
print(f'Test size: {len(test)}')

In [None]:
# Fit an ARIMA model with order (p, d, q) = (5, 1, 0) on the training split
arima_model = ARIMA(train, order=(5, 1, 0))
arima_result = arima_model.fit()

# Out-of-sample forecast: one step for every row of the test split
n_test_steps = len(test)
arima_forecast = arima_result.forecast(steps=n_test_steps)

# Overlay the training data, the held-out data and the forecast on a single axes
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(train.index, train, label='Train')
ax.plot(test.index, test, label='Test')
ax.plot(test.index, arima_forecast, label='ARIMA Forecast', color='red')
ax.set_title('ARIMA Model Forecast')
ax.legend()
plt.show()

In [None]:
# Score the ARIMA forecast against the held-out test split
mae = mean_absolute_error(test, arima_forecast)
arima_mse = mean_squared_error(test, arima_forecast)
rmse = np.sqrt(arima_mse)  # RMSE is the square root of the mean squared error

print(f'MAE: {mae}')
print(f'RMSE: {rmse}')

In [None]:
# Prepare data for LSTM
def create_dataset(data, time_step=1):
    """Build supervised-learning windows from the first column of a series.

    Each sample pairs `time_step` consecutive observations with the
    observation that immediately follows them.

    Parameters
    ----------
    data : array-like
        2-D input of shape (n_observations, n_columns); only column 0 is used.
        A 1-D input is treated as a single column.
    time_step : int, default 1
        Number of past observations in each input window.

    Returns
    -------
    X : np.ndarray of shape (n_samples, time_step)
        Input windows, where ``n_samples = max(len(data) - time_step, 0)``.
    Y : np.ndarray of shape (n_samples,)
        Target for each window, i.e. ``data[i + time_step, 0]`` for window ``i``.
    """
    values = np.asarray(data)
    if values.ndim == 1:
        # Accept a plain 1-D series by viewing it as one column.
        values = values.reshape(-1, 1)
    series = values[:, 0]

    # A window starting at i needs a target at i + time_step, so the last valid
    # start is len(series) - time_step - 1, giving len(series) - time_step samples.
    # (An extra "- 1" in the range bound would silently drop the final sample.)
    n_samples = max(len(series) - time_step, 0)

    windows = [series[start:start + time_step] for start in range(n_samples)]
    # The explicit reshape keeps X two-dimensional even when there are no samples.
    X = np.array(windows).reshape(n_samples, time_step)
    Y = series[time_step:time_step + n_samples].copy()
    return X, Y

# Seed the Python, NumPy and TensorFlow random number generators so that weight
# initialisation, dropout masks and batch shuffling are repeatable across
# Restart & Run All (as far as the hardware backend allows).
tf.keras.utils.set_random_seed(42)

# Number of past observations fed to the network for each prediction
time_step = 10
X_train, y_train = create_dataset(train.values, time_step)
X_test, y_test = create_dataset(test.values, time_step)

# Reshape input to be [samples, time steps, features]
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Build LSTM model: two stacked 50-unit LSTM layers, each followed by 20% dropout,
# and a single-unit dense output for the next-step value
lstm_model = Sequential()
# return_sequences=True so the second LSTM layer receives the full sequence
lstm_model.add(LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
lstm_model.add(Dropout(0.2))
lstm_model.add(LSTM(50, return_sequences=False))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(1))
lstm_model.compile(optimizer='adam', loss='mean_squared_error')

# Train LSTM model
lstm_model.fit(X_train, y_train, epochs=50, batch_size=32)


In [None]:
# Evaluate LSTM model
lstm_forecast = lstm_model.predict(X_test)

# Forecast i predicts the observation at test position `time_step + i`, so take
# exactly len(lstm_forecast) timestamps starting at `time_step`. Using
# `test.index[time_step:]` on its own can be longer than the forecast when the
# windowing yields fewer samples, and matplotlib then raises on mismatched
# x/y lengths.
forecast_index = test.index[time_step:time_step + len(lstm_forecast)]

# Plot the results
plt.figure(figsize=(12, 6))
plt.plot(test.index, test, label='Test')
plt.plot(forecast_index, lstm_forecast, label='LSTM Forecast', color='green')
plt.legend()
plt.title('LSTM Model Forecast')
plt.show()

In [None]:
# Final evaluation of LSTM model: compare its predictions with the true next-step targets
lstm_mae = mean_absolute_error(y_test, lstm_forecast)
lstm_mse = mean_squared_error(y_test, lstm_forecast)
lstm_rmse = np.sqrt(lstm_mse)  # RMSE is the square root of the mean squared error

print(f'LSTM MAE: {lstm_mae}')
print(f'LSTM RMSE: {lstm_rmse}')