#### ARIMA Model Forecasting 

In [None]:
import itertools
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [10]:
plt.style.use('seaborn-v0_8-dark-palette')
sns.set_context("talk")


In [11]:
# creating a grid search for ARIMA parameters
def evaluate_arima_model(time_series, p_values, d_values, q_values):
    """
    Grid search to find the best ARIMA model for a time series based on AIC.
    
    Parameters:
        time_series (pd.Series): The time series data.
        p_values (iterable): Candidate AR (p) orders.
        d_values (iterable): Candidate differencing (d) orders.
        q_values (iterable): Candidate MA (q) orders.
        
    Returns:
        best_order (tuple): The (p,d,q) that produced the lowest AIC.
        best_aic (float): The lowest AIC value.
        best_model (ARIMAResultsWrapper): The fitted ARIMA model for best_order.
    """
    best_aic = float("inf")
    best_order = None
    best_model = None
    for p, d, q in itertools.product(p_values, d_values, q_values):
        try:
            model = ARIMA(time_series, order=(p, d, q))
            model_fit = model.fit()
            aic = model_fit.aic
            if aic < best_aic:
                best_aic = aic
                best_order = (p, d, q)
                best_model = model_fit
        except Exception as e:
            continue
    return best_order, best_aic, best_model


In [13]:
file_path = r"C:\Users\GIORDANO\Desktop\financial-time-series-forecasting\data\selected_portfolios.csv"

# Load the CSV file with a DateTime index. 
# The CSV should contain the two selected portfolios (e.g., 'SMALL LoBM' and 'BIG HiBM').
df_arima = pd.read_csv(file_path, parse_dates=True, index_col='date')
print("Dataset loaded. Shape:", df_arima.shape)
print(df_arima.head())

Dataset loaded. Shape: (414, 2)
            SMALL LoBM  BIG HiBM
date                            
1990-07-01      0.0243    0.0139
1990-08-01     -0.1222   -0.1021
1990-09-01     -0.1082   -0.1160
1990-10-01      0.0520    0.0874
1990-11-01     -0.0312   -0.0345


In [14]:
train = df_arima.loc[:'2015-12-31']
test = df_arima.loc['2016-01-01':]

print("\nTraining set period:", train.index.min().strftime('%Y-%m'), "to", train.index.max().strftime('%Y-%m'))
print("Testing set period:", test.index.min().strftime('%Y-%m'), "to", test.index.max().strftime('%Y-%m'))


Training set period: 1990-07 to 2015-12
Testing set period: 2016-01 to 2024-12


In [15]:
# 3. Define Candidate Orders for Grid Search
# The ARIMA model has three parameters: p, d, and q.
# We will use a grid search to find the best combination of these parameters.
p_values = range(0, 4)  # Try p = 0,1,2,3
d_values = range(0, 2)  # Try d = 0,1 (returns are often stationary, so d may be 0)
q_values = range(0, 4)  # Try q = 0,1,2,3


In [18]:
results = {}  # Dictionary to store results
for col in df_arima.columns:
    print(f"\nEvaluating ARIMA model for portfolio: {col}")
    best_order, best_aic, best_model = evaluate_arima_model(train[col], p_values, d_values, q_values)
    
    if best_model is None:
        print("No suitable ARIMA model could be fit for", col)
        continue

    print(f"Best ARIMA order for {col}: {best_order} with AIC: {best_aic:.2f}")


Evaluating ARIMA model for portfolio: SMALL LoBM


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


Best ARIMA order for SMALL LoBM: (0, 0, 1) with AIC: -919.41

Evaluating ARIMA model for portfolio: BIG HiBM


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


Best ARIMA order for BIG HiBM: (3, 0, 0) with AIC: -995.81




In [20]:
  # Forecast for the length of the test set
  n_periods = len(test)
  forecast_result = best_model.get_forecast(steps=n_periods)
  forecast = forecast_result.predicted_mean
  conf_int = forecast_result.conf_int()
  
  # Compute error metrics: MAE and RMSE
  mae = mean_absolute_error(test[col], forecast)
  rmse = math.sqrt(mean_squared_error(test[col], forecast))
  
  # Save results in the dictionary
  results[col] = {
      "best_order": best_order,
      "aic": best_aic,
      "model": best_model,
      "forecast": forecast,
      "conf_int": conf_int,
      "mae": mae,
      "rmse": rmse
  }
  
  print(f"MAE: {mae:.4f}")
  print(f"RMSE: {rmse:.4f}")

MAE: 0.0393
RMSE: 0.0522


In [24]:
# Plot Forecasts vs. Actual Data
plt.figure(figsize=(12, 6))
plt.plot(train[col], label='Training Data', color='blue')
plt.plot(test[col], label='Test Data', color='green')
plt.plot(forecast.index, forecast, label='Forecast', color='red', linestyle='--')
plt.fill_between(forecast.index, conf_int.iloc[:, 0], conf_int.iloc[:, 1],
                 color='pink', alpha=0.3, label='95% Confidence Interval')
plt.title(f'ARIMA Forecast for {col}')
plt.xlabel('Date')
plt.ylabel('Return (decimal)')
plt.legend()
# save the plot in plots folder
plt.savefig(f'plots/{col}_arima_forecast.png')
plt.close()

In [25]:
# Residuals Analysis
residuals = best_model.resid
plt.figure(figsize=(12, 6))
    
# Residuals over time
plt.subplot(2, 1, 1)
plt.plot(residuals)
plt.title(f'Residuals of ARIMA Model for {col}')
plt.xlabel('Date')
plt.ylabel('Residuals')

# save the plot in plots folder
plt.savefig(f'plots/{col}_arima_residuals.png')
plt.close()

In [27]:
# Histogram and KDE of residuals
plt.subplot(2, 1, 2)
sns.histplot(residuals, bins=30, kde=True, color='purple')
plt.title(f'Residual Distribution for {col}')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
    
# save the plot in plots folder
plt.savefig(f'plots/{col}_arima_residuals_hist.png')
plt.close()
    
print(f"Performance Metrics for {col}: MAE = {mae:.4f}, RMSE = {rmse:.4f}")

Performance Metrics for BIG HiBM: MAE = 0.0393, RMSE = 0.0522


In [28]:
print("\nSummary of ARIMA Forecasting Results:")
for col, res in results.items():
    print(f"{col}: Best Order: {res['best_order']}, AIC: {res['aic']:.2f}, MAE: {res['mae']:.4f}, RMSE: {res['rmse']:.4f}")


Summary of ARIMA Forecasting Results:
BIG HiBM: Best Order: (3, 0, 0), AIC: -995.81, MAE: 0.0393, RMSE: 0.0522
