In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf

import sklearn as sk
from sklearn.model_selection import TimeSeriesSplit

In [158]:
# Download the data from Yahoo Finance (^SOX and ^VIX)
start_date='2011-12-30'
end_date='2023-01-11'

data = yf.download('^SOX ^VIX', start=start_date, end=end_date, interval='1d')['Adj Close']
data.index = pd.to_datetime(data.index)
data['^VIX^2'] = data['^VIX'].pow(2)

# Calculate the log returns
data['^SOX: Log_Returns'] = np.log(data['^SOX'].pct_change() + 1)
data['^VIX: Log_Returns'] = np.log(data['^VIX'].pct_change() + 1)
data['^VIX^2: Log_Returns'] = np.log(data['^VIX^2'].pct_change() + 1)

data['^SOX: Next_Weekly_RV'] = np.sqrt((data['^SOX: Log_Returns']**2).rolling(5).sum()).shift(-6)
data = data.dropna()
data.describe()

[                       0%%                      ]

[*********************100%%**********************]  2 of 2 completed


Ticker,^SOX,^VIX,^VIX^2,^SOX: Log_Returns,^VIX: Log_Returns,^VIX^2: Log_Returns,^SOX: Next_Weekly_RV
count,2775.0,2775.0,2775.0,2774.0,2774.0,2774.0,2769.0
mean,1371.535977,17.865077,368.87494,0.000725,-4.6e-05,-9.3e-05,0.034441
std,972.985329,7.052083,414.099759,0.018136,0.078583,0.157166,0.021463
min,351.280029,9.14,83.539606,-0.173119,-0.299831,-0.599662,0.003846
25%,616.625,13.23,175.032888,-0.008255,-0.044269,-0.088538,0.020807
50%,1084.849976,15.95,254.402494,0.00147,-0.006686,-0.013373,0.029363
75%,1853.179993,20.76,430.97771,0.010222,0.035804,0.071608,0.04291
max,4039.51001,82.690002,6837.636504,0.105753,0.768245,1.53649,0.271085


In [159]:
data.head(10)

Ticker,^SOX,^VIX,^VIX^2,^SOX: Log_Returns,^VIX: Log_Returns,^VIX^2: Log_Returns,^SOX: Next_Weekly_RV
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-01-03,368.421753,22.969999,527.620868,0.01135,-0.018547,-0.037094,0.026753
2012-01-04,368.48172,22.219999,493.728369,0.000163,-0.033196,-0.066392,0.024366
2012-01-05,373.989044,21.48,461.39038,0.014835,-0.033871,-0.067741,0.032283
2012-01-06,375.038544,20.629999,425.596865,0.002802,-0.040376,-0.080752,0.026164
2012-01-09,382.404968,21.07,443.944887,0.019451,0.021104,0.042208,0.054788
2012-01-10,386.09317,20.690001,428.076122,0.009599,-0.0182,-0.036399,0.057642
2012-01-11,387.702423,21.049999,443.102468,0.004159,0.01725,0.0345,0.057041
2012-01-12,391.560547,20.469999,419.020872,0.009902,-0.02794,-0.05588,0.052976
2012-01-13,383.284546,20.91,437.228094,-0.021363,0.021267,0.042534,0.052897
2012-01-17,385.033691,22.200001,492.840034,0.004553,0.059865,0.11973,0.020246


In [160]:
data.tail(10)

Ticker,^SOX,^VIX,^VIX^2,^SOX: Log_Returns,^VIX: Log_Returns,^VIX^2: Log_Returns,^SOX: Next_Weekly_RV
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-12-16,2636.100098,22.620001,511.664438,-0.009555,-0.009241,-0.018482,0.052457
2022-12-19,2599.860107,22.42,502.656403,-0.013843,-0.008881,-0.017762,0.054155
2022-12-20,2583.639893,21.48,461.39038,-0.006258,-0.042831,-0.085662,0.058801
2022-12-21,2644.5,20.07,402.804888,0.023283,-0.067896,-0.135792,0.040179
2022-12-22,2533.330078,21.969999,482.68087,-0.042947,0.090452,0.180903,0.04203
2022-12-23,2535.48999,20.870001,435.556935,0.000852,-0.051365,-0.10273,0.046591
2022-12-27,2490.169922,21.65,468.722483,-0.018036,0.036693,0.073385,0.048492
2022-12-28,2453.48999,22.139999,490.179573,-0.014839,0.02238,0.044761,0.058023
2022-12-29,2534.949951,21.440001,459.673623,0.032662,-0.032128,-0.064255,0.061054
2022-12-30,2532.110107,21.67,469.588903,-0.001121,0.01067,0.021341,0.061138


In [161]:
# rename the columns (delete special characters)
data = data.rename(columns=lambda x: x.replace('^', '').replace(':', ''))
data.columns

Index(['SOX', 'VIX', 'VIX2', 'SOX Log_Returns', 'VIX Log_Returns',
       'VIX2 Log_Returns', 'SOX Next_Weekly_RV'],
      dtype='object', name='Ticker')

### Data Preprocessing
1. Wisnorization
2. Min Max Scaling

In [162]:
# transform numerical data with wisnorization
from scipy.stats.mstats import winsorize
data = pd.DataFrame(winsorize(np.array(data), limits=[0.05, 0.05]), columns=data.columns, index=data.index)
data.describe()

Ticker,SOX,VIX,VIX2,SOX Log_Returns,VIX Log_Returns,VIX2 Log_Returns,SOX Next_Weekly_RV
count,2768.0,2768.0,2768.0,2768.0,2768.0,2768.0,2768.0
mean,992.52146,17.854458,348.025046,0.000749,0.003759,0.021764,0.034444
std,405.078887,7.057683,263.049386,0.017797,0.072216,0.130178,0.021466
min,351.280029,9.14,83.539606,-0.086366,-0.086366,-0.086366,0.003846
25%,616.449982,13.22,174.768407,-0.008255,-0.044182,-0.086366,0.0208
50%,1083.289978,15.935,253.924248,0.001454,-0.006686,-0.013373,0.029367
75%,1418.02002,20.712499,429.007639,0.010203,0.035784,0.071568,0.042919
max,1418.02002,82.690002,1418.02002,0.105753,0.768245,1.53649,0.271085


In [163]:
# transform numerical data with min-max scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns, index=data.index)
data.describe()

Ticker,SOX,VIX,VIX2,SOX Log_Returns,VIX Log_Returns,VIX2 Log_Returns,SOX Next_Weekly_RV
count,2768.0,2768.0,2768.0,2768.0,2768.0,2768.0,2768.0
mean,0.601123,0.118483,0.198194,0.45344,0.105458,0.066629,0.114497
std,0.379735,0.095958,0.197117,0.092636,0.084501,0.080216,0.080325
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.24858,0.055472,0.068363,0.406577,0.049361,0.0,0.063441
50%,0.686212,0.092386,0.127679,0.457112,0.093235,0.044978,0.0955
75%,1.0,0.157342,0.258878,0.502651,0.14293,0.097319,0.14621
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [164]:
# Create a pipeline to transform the data (winsozation and min-max scaling)
# from sklearn.pipeline import Pipeline
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import FunctionTransformer

# def winsorize_data(data):
#     return winsorize(np.array(data), limits=[0.025, 0.025])

# def min_max_scale_data(data):
#     return scaler.fit_transform(data)

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', FunctionTransformer(winsorize_data), data.columns),
#         ('num2', FunctionTransformer(min_max_scale_data), data.columns)
#     ])


# data_tmp = preprocessor.fit_transform(data)
# data = pd.DataFrame(data_tmp, columns=data.columns, index=data.index)
# data.describe()

### Training and Testing Dataset Preparation
1. Training Data : 2012-2020
2. Testing Data: 2021-2022

In [165]:
# Split the data into training and test sets (training data from 2012 to 2020, test data from 2021 to 2022)
data_train = data.loc['2012-01-01':'2020-12-31']
data_test = data.loc['2021-01-01':'2022-12-31']

### Base Model Attributes(X) and Target(Y)
1. X \
    1.1 Index Value: ^SOX, ^VIX \
    1.2 Square Value: ^VIX^2 \
    1.3 Log Return: ^SOX, ^VIX, ^VIX^2
    
2. y: Weekly Relative Volatility of ^SOX

In [166]:
# Split the data into features and target
X_base_train = data_train.drop(columns=['SOX Next_Weekly_RV'])
y_train = data_train[['SOX Next_Weekly_RV']]
X_base_test = data_test.drop(columns=['SOX Next_Weekly_RV'])
y_test = data_test[['SOX Next_Weekly_RV']]

### Benchmark Model Training and Performance
1. Decesion Tree Regressor
2. Linear Regression

performance matrix
1. Root Mean Square Error
2. R-square

In [122]:
# Create a decision tree regressor model and use parameters from the grid search
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

model = DecisionTreeRegressor(random_state=42)
model.fit(X_base_train, y_train)

param_grid = {
    'max_depth': range(1, 5),
    'min_samples_split': range(2, 5),
    'min_samples_leaf': range(3, 8),
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=2, n_jobs=-1, verbose=-1)
grid_search.fit(X_base_train, y_train)

# Report the best model parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"Best parameters: {best_params}")
print(f"Best score: {best_score}")

Best parameters: {'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 2}
Best score: -0.018878200773558618


In [123]:
# Predict the target variable for the test set and calculate the RMSE and R2 score
from sklearn.metrics import mean_squared_error, r2_score

model = grid_search.best_estimator_
y_pred = model.predict(X_base_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R2 score: {r2}")

RMSE: 0.08272153325344428
R2 score: -0.23359850684181271


In [124]:
# Create a linear regression model
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_base_train, y_train)

In [125]:
# Predict the target variable for the test set and calculate the RMSE and R2 score
y_pred = model.predict(X_base_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R2 score: {r2}")

RMSE: 0.06626893137059825
R2 score: 0.20830733958820336


### LightGBM Model Training and Performance
Hyperparamter tuning: Optuna

Time series split = 5

Objective function: Mean(R-Square)

Performance matrix:
1. Root Mean Square Error
2. R-square

Reference:\
https://forecastegy.com/posts/how-to-use-optuna-to-tune-lightgbm-hyperparameters/

In [126]:
tscv = TimeSeriesSplit(n_splits=5)

In [143]:
import lightgbm as lgb
import optuna

def objective(trial, X, y):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "n_estimators": 300,
        "verbosity": -1,
        "bagging_freq": 1,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 2**4),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 20),
    }

    r2_scores = np.array([])
    model = lgb.LGBMRegressor(**params)

    for train_index, val_index in tscv.split(X):
        # print("TRAIN:", train_index, "TEST:", val_index)
        X_t, X_val = X.iloc[train_index], X.iloc[val_index]
        y_t, y_val = y.iloc[train_index], y.iloc[val_index]
        model.fit(X_t, y_t)
        predictions = model.predict(X_val)
        r2 = r2_score(y_val, predictions)
        r2_scores = np.append(r2_scores, r2)
    return r2_scores.mean()

In [144]:
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, X_base_train, y_train), n_trials=30)

[I 2024-04-26 00:18:43,510] A new study created in memory with name: no-name-d953adba-f868-45f5-93e3-26908d3d279e
[I 2024-04-26 00:18:43,831] Trial 0 finished with value: -0.06605924432557009 and parameters: {'learning_rate': 0.0017718736898429054, 'num_leaves': 13, 'subsample': 0.7404246796449283, 'colsample_bytree': 0.6871103661310634, 'min_data_in_leaf': 11}. Best is trial 0 with value: -0.06605924432557009.
[I 2024-04-26 00:18:44,079] Trial 1 finished with value: 0.016499500241419596 and parameters: {'learning_rate': 0.00545266789333957, 'num_leaves': 9, 'subsample': 0.5017278005136825, 'colsample_bytree': 0.8227636911053671, 'min_data_in_leaf': 10}. Best is trial 1 with value: 0.016499500241419596.
[I 2024-04-26 00:18:44,244] Trial 2 finished with value: 0.006545479975904311 and parameters: {'learning_rate': 0.037636393218236225, 'num_leaves': 4, 'subsample': 0.5600024232683103, 'colsample_bytree': 0.5346317785617414, 'min_data_in_leaf': 15}. Best is trial 1 with value: 0.01649950

In [145]:
# Create a LightGBM model with the best parameters
best_params = study.best_params
model = lgb.LGBMRegressor(**best_params)
model.fit(X_base_train, y_train)

# Predict the target variable for the test set and calculate the RMSE and R2 score
y_pred = model.predict(X_base_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R2 score: {r2}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000140 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 2265, number of used features: 6
[LightGBM] [Info] Start training from score 0.102259
RMSE: 0.08066431152617606
R2 score: -0.17300414522294805


### Feature Engineering
1. log of lag weekly realized volatility
2. square of lag weekly realized volatility
3. log of VIX
4. log of past 5 days VIX


In [167]:
data_lag = pd.DataFrame()
for i in range(1, 9):
    # Add new columns name 'SOX Next_Weekly_RV Lag 1', 'SOX Next_Weekly_RV Lag 2', 'SOX Next_Weekly_RV Lag 3', 'SOX Next_Weekly_RV Lag 4', 'SOX Next_Weekly_RV Lag 5', 'SOX Next_Weekly_RV Lag 6', 'SOX Next_Weekly_RV Lag 7', 'SOX Next_Weekly_RV Lag 8'
    data_lag[f'SOX Next_Weekly_RV Lag {i}'] = data['SOX Next_Weekly_RV'].shift(i)
    # Add new columns name 'SOX Next_Weekly_Log_RV Lag 1', 'SOX Next_Weekly_Log_RV Lag 2', 'SOX Next_Weekly_Log_RV Lag 3', 'SOX Next_Weekly_Log_RV Lag 4', 'SOX Next_Weekly_Log_RV Lag 5', 'SOX Next_Weekly_Log_RV Lag 6', 'SOX Next_Weekly_Log_RV Lag 7', 'SOX Next_Weekly_Log_RV Lag 8'
    data_lag[f'SOX Next_Weekly_RV Lag {i}'] = np.log(data['SOX Next_Weekly_RV']).shift(i)
    # Add new columns name 'SOX Next_Weekly_Squared_RV Lag 1', 'SOX Next_Weekly_Squared_RV Lag 2', 'SOX Next_Weekly_Squared_RV Lag 3', 'SOX Next_Weekly_Squared_RV Lag 4', 'SOX Next_Weekly_Squared_RV Lag 5', 'SOX Next_Weekly_Squared_RV Lag 6', 'SOX Next_Weekly_Squared_RV Lag 7', 'SOX Next_Weekly_Squared_RV Lag 8'
    data_lag[f'SOX Weekly_Squared_RV Lag {i}'] = (data['SOX Next_Weekly_RV']**2).shift(i)

data_lag

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0_level_0,SOX Next_Weekly_RV Lag 1,SOX Weekly_Squared_RV Lag 1,SOX Next_Weekly_RV Lag 2,SOX Weekly_Squared_RV Lag 2,SOX Next_Weekly_RV Lag 3,SOX Weekly_Squared_RV Lag 3,SOX Next_Weekly_RV Lag 4,SOX Weekly_Squared_RV Lag 4,SOX Next_Weekly_RV Lag 5,SOX Weekly_Squared_RV Lag 5,SOX Next_Weekly_RV Lag 6,SOX Weekly_Squared_RV Lag 6,SOX Next_Weekly_RV Lag 7,SOX Weekly_Squared_RV Lag 7,SOX Next_Weekly_RV Lag 8,SOX Weekly_Squared_RV Lag 8
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2012-01-03,,,,,,,,,,,,,,,,
2012-01-04,-2.456686,0.007348,,,,,,,,,,,,,,
2012-01-05,-2.566743,0.005896,-2.456686,0.007348,,,,,,,,,,,,
2012-01-06,-2.240444,0.011323,-2.566743,0.005896,-2.456686,0.007348,,,,,,,,,,
2012-01-09,-2.482727,0.006975,-2.240444,0.011323,-2.566743,0.005896,-2.456686,0.007348,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-23,-1.945723,0.020416,-1.995423,0.018484,-1.581629,0.042288,-1.669951,0.035440,-1.704288,0.033088,-1.731182,0.031356,-1.712806,0.032529,-1.718549,0.032158
2022-12-27,-1.832889,0.025584,-1.945723,0.020416,-1.995423,0.018484,-1.581629,0.042288,-1.669951,0.035440,-1.704288,0.033088,-1.731182,0.031356,-1.712806,0.032529
2022-12-28,-1.789374,0.027911,-1.832889,0.025584,-1.945723,0.020416,-1.995423,0.018484,-1.581629,0.042288,-1.669951,0.035440,-1.704288,0.033088,-1.731182,0.031356
2022-12-29,-1.595885,0.041099,-1.789374,0.027911,-1.832889,0.025584,-1.945723,0.020416,-1.995423,0.018484,-1.581629,0.042288,-1.669951,0.035440,-1.704288,0.033088


In [178]:
# Split the data+lagged data into training and test sets (training data from 2012 to 2020, test data from 2021 to 2022)
data_train = pd.concat([data, data_lag], axis=1).loc['2012-01-01':'2020-12-31']
data_test = pd.concat([data, data_lag], axis=1).loc['2021-01-01':'2022-12-31']

In [179]:
# Create X and y for the new features
X_lags_train = data_train.drop(columns=['SOX Next_Weekly_RV'])
y_lags_train= data_train[['SOX Next_Weekly_RV']]
X_lags_test = data_test.drop(columns=['SOX Next_Weekly_RV'])
y_lags_test = data_test[['SOX Next_Weekly_RV']]

In [180]:
# Train a LightGBM model with the new features
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, X_lags_train, y_lags_train), n_trials=30)

[I 2024-04-26 00:47:17,620] A new study created in memory with name: no-name-9738fe87-3d85-433d-8e2b-ea44cb9aeb3b
[I 2024-04-26 00:47:18,283] Trial 0 finished with value: 0.6551046973585504 and parameters: {'learning_rate': 0.022668538613565752, 'num_leaves': 10, 'subsample': 0.623982198385168, 'colsample_bytree': 0.6206240484663136, 'min_data_in_leaf': 3}. Best is trial 0 with value: 0.6551046973585504.
[I 2024-04-26 00:47:18,712] Trial 1 finished with value: 0.4600079669898693 and parameters: {'learning_rate': 0.003093697934551272, 'num_leaves': 6, 'subsample': 0.8678871095032297, 'colsample_bytree': 0.9858789166913651, 'min_data_in_leaf': 16}. Best is trial 0 with value: 0.6551046973585504.
[I 2024-04-26 00:47:19,355] Trial 2 finished with value: 0.6237246528416278 and parameters: {'learning_rate': 0.027909987888205208, 'num_leaves': 11, 'subsample': 0.6630327075115752, 'colsample_bytree': 0.8018155558203652, 'min_data_in_leaf': 1}. Best is trial 0 with value: 0.6551046973585504.
[I

In [183]:
# Create a LightGBM model with the best parameters
best_params = study.best_params
model = lgb.LGBMRegressor(**best_params)
model.fit(X_lags_train, y_lags_train)

# Predict the target variable for the test set and calculate the RMSE and R2 score
y_pred = model.predict(X_lags_test)
mse = mean_squared_error(y_lags_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_lags_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R2 score: {r2}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000265 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5610
[LightGBM] [Info] Number of data points in the train set: 2265, number of used features: 22
[LightGBM] [Info] Start training from score 0.102259
RMSE: 0.03906117370862857
R2 score: 0.724940112132169


In [184]:
# Create new dataframe to store new features for Lof of VIX and Log of past 5-day VIX
data_VIX = pd.DataFrame()
data_VIX['VIX Log'] = np.log(data['VIX'])
data_VIX['VIX Log 5 Day Mean'] = np.log(data['VIX'].rolling(5).mean())
data_VIX['VIX Log 5 Day Sum'] = np.log(data['VIX'].rolling(5).sum())

data_VIX

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0_level_0,VIX Log,VIX Log 5 Day Mean,VIX Log 5 Day Sum
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-01-03,-1.671125,,
2012-01-04,-1.726881,,
2012-01-05,-1.785120,,
2012-01-06,-1.856488,,
2012-01-09,-1.818909,-1.769521,-0.160083
...,...,...,...
2022-12-23,-1.835816,-1.794728,-0.185290
2022-12-27,-1.771437,-1.807408,-0.197970
2022-12-28,-1.733016,-1.796530,-0.187092
2022-12-29,-1.788366,-1.774319,-0.164881


In [185]:
# Split the data+VIX data into training and test sets (training data from 2012 to 2020, test data from 2021 to 2022)
data_train = pd.concat([data, data_VIX], axis=1).loc['2012-01-01':'2020-12-31']
data_test = pd.concat([data, data_VIX], axis=1).loc['2021-01-01':'2022-12-31']


In [186]:
# Create X and y for the new features
X_VIX_train = data_train.drop(columns=['SOX Next_Weekly_RV'])
y_VIX_train = data_train[['SOX Next_Weekly_RV']]
X_VIX_test = data_test.drop(columns=['SOX Next_Weekly_RV'])
y_VIX_test = data_test[['SOX Next_Weekly_RV']]

In [187]:
# Train a LightGBM model with the new features
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, X_VIX_train, y_VIX_train), n_trials=30)

[I 2024-04-26 00:48:44,040] A new study created in memory with name: no-name-60087f9f-1c52-4700-a687-2c430f9ea9e7
[I 2024-04-26 00:48:44,392] Trial 0 finished with value: 0.007601738357451771 and parameters: {'learning_rate': 0.007012070948974538, 'num_leaves': 10, 'subsample': 0.6551787377757826, 'colsample_bytree': 0.5179581033481819, 'min_data_in_leaf': 3}. Best is trial 0 with value: 0.007601738357451771.
[I 2024-04-26 00:48:44,742] Trial 1 finished with value: -0.047477322175287995 and parameters: {'learning_rate': 0.03866002500358503, 'num_leaves': 11, 'subsample': 0.8684870435832603, 'colsample_bytree': 0.6559077202363865, 'min_data_in_leaf': 8}. Best is trial 0 with value: 0.007601738357451771.
[I 2024-04-26 00:48:44,878] Trial 2 finished with value: 0.03367154351167181 and parameters: {'learning_rate': 0.010984039586005977, 'num_leaves': 2, 'subsample': 0.6812657763223308, 'colsample_bytree': 0.6110608173685024, 'min_data_in_leaf': 2}. Best is trial 2 with value: 0.03367154351

In [188]:
# Create a LightGBM model with the best parameters
best_params = study.best_params
model = lgb.LGBMRegressor(**best_params)
model.fit(X_VIX_train, y_VIX_train)

# Predict the target variable for the test set and calculate the RMSE and R2 score
y_pred = model.predict(X_VIX_test)
mse = mean_squared_error(y_VIX_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_VIX_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R2 score: {r2}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000220 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 2265, number of used features: 9
[LightGBM] [Info] Start training from score 0.102259
RMSE: 0.08828676323276353
R2 score: -0.40516678023698893
