Using Random Forest to Forecast Volatility

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [4]:
file_path = r"C:\Users\GIORDANO\Desktop\financial-time-series-forecasting\data\Developed_6_Portfolios_ME_BE-ME_cleaned_decimals.csv"
df = pd.read_csv(file_path, parse_dates=True, index_col='date')
print("Full dataset shape:", df.shape)

# Extract the SMALL LoBM returns
returns_small = df["SMALL LoBM"].dropna()
print("SMALL LoBM series shape:", returns_small.shape)


Full dataset shape: (414, 6)
SMALL LoBM series shape: (414,)


In [5]:
# 2. Define Target: Realized Volatility
# -------------------------------
# For each date t, the target is the volatility computed from returns at t-3 to t-1 (using past data only)
df_vol = pd.DataFrame({'returns': returns_small})
# Shift by 1 so that at time t, volatility is computed using t-1, t-2, t-3
df_vol['vol_target'] = df_vol['returns'].shift(1).rolling(window=3).std()
df_vol.dropna(inplace=True)
print("Target sample:")
print(df_vol.head())

Target sample:
            returns  vol_target
date                           
1990-10-01   0.0520    0.080844
1990-11-01  -0.0312    0.096786
1990-12-01   0.0124    0.080120
1991-01-01   0.0095    0.041616
1991-02-01   0.1274    0.024378


In [6]:
# 3. Feature Engineering for Volatility Forecasting (Avoiding Leakage)
# -------------------------------
def create_vol_features(series):
    """
    Generates features for volatility forecasting.
    Features include:
    - Lagged squared returns (lags 1, 2, 3)
    - Rolling standard deviation over a 6-month window, shifted by 1.
    """
    df_feat = pd.DataFrame({'returns': series})
    df_feat['lag_sq_1'] = (df_feat['returns'] ** 2).shift(1)
    df_feat['lag_sq_2'] = (df_feat['returns'] ** 2).shift(2)
    df_feat['lag_sq_3'] = (df_feat['returns'] ** 2).shift(3)
    df_feat['roll_std_6'] = df_feat['returns'].rolling(window=6).std().shift(1)
    df_feat.dropna(inplace=True)
    return df_feat

features_df = create_vol_features(returns_small)
print("Feature DataFrame sample:")
print(features_df.head())


Feature DataFrame sample:
            returns  lag_sq_1  lag_sq_2  lag_sq_3  roll_std_6
date                                                         
1991-01-01   0.0095  0.000154  0.000973  0.002704    0.072221
1991-02-01   0.1274  0.000090  0.000154  0.000973    0.070270
1991-03-01   0.0142  0.016231  0.000090  0.000154    0.079020
1991-04-01   0.0064  0.000202  0.016231  0.000090    0.054208
1991-05-01   0.0053  0.000041  0.000202  0.016231    0.053823


In [7]:
# 4. Merge Features and Target (Ensure No Leakage)
# -------------------------------
df_model = pd.merge(df_vol[['vol_target']], features_df, left_index=True, right_index=True, how='inner')
df_model.dropna(inplace=True)
print("Merged model DataFrame shape:", df_model.shape)
print(df_model.head())

# Check correlations between features and target:
corr_matrix = df_model.corr()
print("Correlation with target:")
print(corr_matrix['vol_target'].sort_values(ascending=False))
# A perfect (or near-perfect) correlation would indicate leakage.

Merged model DataFrame shape: (408, 6)
            vol_target  returns  lag_sq_1  lag_sq_2  lag_sq_3  roll_std_6
date                                                                     
1991-01-01    0.041616   0.0095  0.000154  0.000973  0.002704    0.072221
1991-02-01    0.024378   0.1274  0.000090  0.000154  0.000973    0.070270
1991-03-01    0.067248   0.0142  0.016231  0.000090  0.000154    0.079020
1991-04-01    0.066754   0.0064  0.000202  0.016231  0.000090    0.054208
1991-05-01    0.067720   0.0053  0.000041  0.000202  0.016231    0.053823
Correlation with target:
vol_target    1.000000
roll_std_6    0.727630
lag_sq_3      0.586136
lag_sq_2      0.516634
lag_sq_1      0.487525
returns      -0.043839
Name: vol_target, dtype: float64


In [None]:
# 5. Train/Test Split (Time-Series Aware)

train_df = df_model.loc[:'2015-12-31']
test_df = df_model.loc['2016-01-01':]
print("Training period:", train_df.index.min(), "to", train_df.index.max())
print("Testing period:", test_df.index.min(), "to", test_df.index.max())

X_train = train_df.drop(columns=['vol_target'])
y_train = train_df['vol_target']
X_test = test_df.drop(columns=['vol_target'])
y_test = test_df['vol_target']


Training period: 1991-01-01 00:00:00 to 2015-12-01 00:00:00
Testing period: 2016-01-01 00:00:00 to 2024-12-01 00:00:00


In [None]:
# 6. Random Forest Model Training with TimeSeriesSplit CV

tscv = TimeSeriesSplit(n_splits=5)
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10]
}

rf = RandomForestRegressor(random_state=42)
rf_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=20,
                               scoring='neg_mean_squared_error', cv=tscv, n_jobs=-1, random_state=42)
rf_search.fit(X_train, y_train)
print("Best Random Forest params:", rf_search.best_params_)

best_rf = rf_search.best_estimator_
pred_rf = best_rf.predict(X_test)


Best Random Forest params: {'n_estimators': 200, 'min_samples_split': 2, 'max_depth': 7}


In [None]:
# 7. Evaluate Model Performance
mae_rf = mean_absolute_error(y_test, pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, pred_rf))
print(f"Random Forest Volatility Forecast Performance:\nMAE: {mae_rf:.4f}\nRMSE: {rmse_rf:.4f}")

# Benchmark from GARCH (provided earlier):
garch_mae = 0.0323
garch_rmse = 0.0378
print("\nGARCH Benchmark Volatility Forecast Performance:")
print(f"MAE: {garch_mae:.4f}, RMSE: {garch_rmse:.4f}")


Random Forest Volatility Forecast Performance:
MAE: 0.0114
RMSE: 0.0164

GARCH Benchmark Volatility Forecast Performance:
MAE: 0.0323, RMSE: 0.0378


In [12]:
# 8. Plot Forecasted vs. Realized Volatility
plt.figure(figsize=(12,6))
plt.plot(y_test.index, y_test, label='Realized Volatility', color='blue')
plt.plot(y_test.index, pred_rf, label='RF Forecasted Volatility', color='red', linestyle='--')
plt.title("Random Forest Volatility Forecast for SMALL LoBM")
plt.xlabel("Date")
plt.ylabel("Volatility")
plt.legend()
plt.savefig("plots/rf_vol_forecast.png")
plt.close()