In [4]:
import parse_data
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

2025-02-18 16:32:18.038935: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Creating Feature and Response Vectors

In [9]:
df_dic = parse_data.df_dic

def make_X_and_y(df_dic):
    X_dic = {}
    y_dic = {}
    for k in df_dic:
        df = df_dic[k]
        cols_X = [0,1,2,3,4,5,7,8,9,10]
        cols_y = [6]
        df_X = df.iloc[:, cols_X].copy()
        df_y = df.iloc[:, cols_y].copy()
        X_dic[k] = df_X.values.tolist()
        y_dic[k] = df_y.values.tolist()
    return X_dic, y_dic

X_dic, y_dic = make_X_and_y(df_dic)

X = np.array(sum(X_dic.values(), []))
y = np.array(sum(y_dic.values(), [])).flatten()

## Building and Training a Neural Network Model using 5-Fold Cross Validation:

In [None]:
def build_nn_model():
    nn_model = keras.Sequential([
        keras.layers.Input(shape=(X_train.shape[1],)),
        keras.layers.Dense(64, activation='relu'),  # Input Layer
        keras.layers.Dense(32, activation='relu'),  # Hidden Layer 1
        keras.layers.Dense(16, activation='relu'),  # Hidden Layer 2
        keras.layers.Dense(1)  # Output Layer (Regression, 1 Neuron)
    ])
    nn_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    return nn_model

k = 5 
kf = KFold(n_splits=k, shuffle=True, random_state=42)
mse_scores = []
mae_scores = []
nn_models = []
num_models = 0

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

for train_index, val_index in kf.split(X_train):
    X_train_fold, X_temp_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_temp_fold = y_train[train_index], y_train[val_index]
    n = len(X_temp_fold)
    X_val_fold, X_test_fold = X_temp_fold[:n//2], X_temp_fold[n//2:]
    y_val_fold, y_test_fold = y_temp_fold[:n//2], y_temp_fold[n//2:]

    # Build and train the model
    model = build_nn_model()
    model.fit(X_train_fold, y_train_fold, epochs=50, batch_size=32, validation_data=(X_val_fold, y_val_fold))
    nn_models.append(model)

    # Evaluate the model on validation fold
    loss, mae = model.evaluate(X_test_fold, y_test_fold, verbose=0)

    mse_scores.append(loss) 
    mae_scores.append(mae)


First, I tried getting the average weights of all CV model to then fit an aggregated model but that gave me very general results that were not useful.
Got better results when I averaged over predictions for each model that was fit in the CV.

In [12]:
## 16m 23s runtime,
print(f"Average MSE: {np.mean(mse_scores):.4f} ± {np.std(mse_scores):.4f}")
print(f"Average MAE: {np.mean(mae_scores):.4f} ± {np.std(mae_scores):.4f}")

Average MSE: 161.3601 ± 4.3378
Average MAE: 9.9763 ± 0.1321


Results w/ No k fold CV: MSE = 170

Results w/ k fold CV: MSE = 161.5

## Training and Fitting a XGBoost Model

In [13]:
## XGBOOST - Good for tabular data, Handles linearity & interactions b/w features automatically.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)


xgb_model = XGBRegressor(objective="reg:squarederror", eval_metric="rmse", random_state=12)

param_dist = {
    "n_estimators": [100, 200, 300], # Number of boosting rounds (trees)
    "max_depth": [3, 6, 9], # Depth of trees
    "learning_rate": [0.01, 0.1, 0.2],
    "subsample": [0.7, 0.8, 1.0],  # % of the data per tree
    "colsample_bytree": [0.7, 0.8, 1.0] # % of the features per tree
}

## Finding the best hyperparameter values to fit model on
search = RandomizedSearchCV(xgb_model, param_distributions=param_dist, n_iter=10, scoring="neg_root_mean_squared_error", cv=5, verbose=1, random_state=42)

search.fit(X_train, y_train)
best_rmse = -search.best_score_
print(f"Best RMSE: {best_rmse:.4f}")
print("Best Parameters:", search.best_params_)

best_xg_model = search.best_estimator_
## 1m 48s runtime

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best RMSE: 12.6653
Best Parameters: {'subsample': 1.0, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 0.8}


## Testing out predictions

In [15]:
print(nn_models)

[<Sequential name=sequential, built=True>, <Sequential name=sequential_1, built=True>, <Sequential name=sequential_2, built=True>, <Sequential name=sequential_3, built=True>, <Sequential name=sequential_4, built=True>]


In [39]:
import random

i = random.randint(0, len(y) - 1)

X_i = X[i]
X_i = np.array(X_i).reshape(1, -1) 
y_i = y[i]

## Neural Network Prediction
def nn_predict(models, X_i):
    X_i = np.array(X_i).reshape(1, -1)  # Ensure input shape is (1, 10)
    preds = np.array([model.predict(X_i) for model in models])
    return np.mean(preds, axis=0)

nn_spread = nn_predict(nn_models, X_i)[0][0]

## XGBoost Prediction
xg_spread = best_xg_model.predict(X_i)[0]

print("Neural Net Prediction:", nn_spread)
print("XGBoost Prediction:", xg_spread)
print("Actual Spread:", y_i)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Neural Net Prediction: -2.1129336
XGBoost Prediction: -3.2927372
Actual Spread: -8


Clearly, the XGBoost model is giving us much better results and is more efficient while training.