In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

import mlflow
import mlflow.sklearn
import mlflow.pytorch
from mlflow.models.signature import infer_signature

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import xgboost as xgb
import lightgbm as lgb
import optuna

# Set random seed for reproducibility
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x1183593d0>

In [2]:
# Load the data (adjust file paths as needed)
X_train = np.load('X_train.npy', allow_pickle=True)
y_train = np.load('y_train.npy', allow_pickle=True)
X_test = np.load('X_test.npy', allow_pickle=True)
y_test = np.load('y_test.npy', allow_pickle=True)

print(f'X_train shape: {X_train.shape}')  # Expected: (num_races_train, max_riders, num_features)
print(f'y_train shape: {y_train.shape}')  # Expected: (num_races_train, max_riders)
print(f'X_test shape: {X_test.shape}')    # Expected: (num_races_test, max_riders, num_features)
print(f'y_test shape: {y_test.shape}')    # Expected: (num_races_test, max_riders)

# Flatten the data for scikit-learn models
X_train_flat = X_train.reshape(-1, X_train.shape[2])    # Shape: (num_races_train * max_riders, num_features)
X_test_flat = X_test.reshape(-1, X_test.shape[2])       # Shape: (num_races_test * max_riders, num_features)

# Flatten the targets
y_train_flat = y_train.flatten()  # Shape: (num_races_train * max_riders,)
y_test_flat = y_test.flatten()    # Shape: (num_races_test * max_riders,)

# Filter out invalid targets (if necessary)
valid_indices_train = y_train_flat > 0
valid_indices_test = y_test_flat > 0

X_train_flat = X_train_flat[valid_indices_train]
y_train_flat = y_train_flat[valid_indices_train]

X_test_flat = X_test_flat[valid_indices_test]
y_test_flat = y_test_flat[valid_indices_test]

# # Optionally scale the features
# scaler = StandardScaler()
# X_train_flat = scaler.fit_transform(X_train_flat)
# X_test_flat = scaler.transform(X_test_flat)

X_train shape: (2034, 207, 227)
y_train shape: (2034, 207)
X_test shape: (153, 207, 227)
y_test shape: (153, 207)


In [3]:
# Set MLflow experiment
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("Race_Prediction_Experiment_II")

2024/11/30 14:07:07 INFO mlflow.tracking.fluent: Experiment with name 'Race_Prediction_Experiment_II' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/feliks/Documents/Faks/Diplomska/App/mlruns/586264776644289656', creation_time=1732972027382, experiment_id='586264776644289656', last_update_time=1732972027382, lifecycle_stage='active', name='Race_Prediction_Experiment_II', tags={}>

In [4]:
def train_and_evaluate_model(model_class, param_grid, model_name, X_train, y_train, X_test, y_test):
    from itertools import product
    import pandas as pd

    # Generate all combinations of hyperparameters
    keys = param_grid.keys()
    values = (param_grid[key] for key in keys)
    param_combinations = [dict(zip(keys, combination)) for combination in product(*values)]

    # For each combination, train and log the model
    for idx, params in enumerate(param_combinations):
        # Initialize model with current hyperparameters
        model = model_class(**params)

        # Fit the model
        model.fit(X_train, y_train)

        # Predict on test set
        y_pred = model.predict(X_test)

        # Calculate metrics
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        mape = mean_absolute_percentage_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        smape = symmetric_mean_absolute_percentage_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Start MLflow run
        with mlflow.start_run(run_name=f"{model_name} - Run {idx+1}"):
            # Log parameters
            mlflow.log_param("model_class", model_name)
            mlflow.log_params(params)

            # Log metrics
            mlflow.log_metric("test_mse", mse)
            mlflow.log_metric("test_mae", mae)
            mlflow.log_metric("test_r2", r2)
            mlflow.log_metric("test_mape", mape)
            mlflow.log_metric("test_rmse", rmse)
            mlflow.log_metric("test_smape", smape)

            # Log the model
            input_example = X_train[:5]
            signature = infer_signature(X_train, model.predict(X_train))

            mlflow.sklearn.log_model(
                sk_model=model,
                artifact_path="model",
                input_example=input_example,
                signature=signature
            )

            # Print results
            print(f"\n{model_name} Run {idx+1} parameters: {params}")
            print(f"{model_name} Run {idx+1} Test MSE: {mse:.4f}")
            print(f"{model_name} Run {idx+1} Test MAE: {mae:.4f}")
            print(f"{model_name} Run {idx+1} Test R^2 Score: {r2:.4f}")
            print(f"{model_name} Run {idx+1} Test MAPE: {mape:.4f}")
            print(f"{model_name} Run {idx+1} Test RMSE: {rmse:.4f}")
            print(f"{model_name} Run {idx+1} Test sMAPE: {smape:.4f}")

def mean_absolute_percentage_error(y_true, y_pred):
    epsilon = 1e-8  # Small number to prevent division by zero
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    # Avoid division by zero
    mask = np.abs(y_true) > epsilon
    if np.sum(mask) == 0:
        return np.inf  # Return infinity if no valid entries
    mape = np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100
    return mape

def symmetric_mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    diff = np.abs(y_pred - y_true)
    # Avoid division by zero
    mask = denominator != 0
    smape = np.mean((diff[mask] / denominator[mask])) * 100
    return smape

### Linear Regression

In [5]:
linear_reg = LinearRegression
param_grid_lr = {
    # No hyperparameters to tune
}

train_and_evaluate_model(
    model_class=linear_reg,
    param_grid=param_grid_lr,
    model_name="Linear Regression",
    X_train=X_train_flat,
    y_train=y_train_flat,
    X_test=X_test_flat,
    y_test=y_test_flat
)


Linear Regression Run 1 parameters: {}
Linear Regression Run 1 Test MSE: 0.0560
Linear Regression Run 1 Test MAE: 0.2105
Linear Regression Run 1 Test R^2 Score: 0.0456
Linear Regression Run 1 Test MAPE: 112.0206
Linear Regression Run 1 Test RMSE: 0.2367
Linear Regression Run 1 Test sMAPE: 67.6160


### Ridge Regression

In [6]:
ridge_reg = Ridge
param_grid_ridge = {
    'alpha': [0.1, 0.9, 1.0, 1.5, 2.0, 10.0]
}

train_and_evaluate_model(
    model_class=ridge_reg,
    param_grid=param_grid_ridge,
    model_name="Ridge Regression",
    X_train=X_train_flat,
    y_train=y_train_flat,
    X_test=X_test_flat,
    y_test=y_test_flat
)


Ridge Regression Run 1 parameters: {'alpha': 0.1}
Ridge Regression Run 1 Test MSE: 0.0560
Ridge Regression Run 1 Test MAE: 0.2105
Ridge Regression Run 1 Test R^2 Score: 0.0456
Ridge Regression Run 1 Test MAPE: 112.0234
Ridge Regression Run 1 Test RMSE: 0.2367
Ridge Regression Run 1 Test sMAPE: 67.6194

Ridge Regression Run 2 parameters: {'alpha': 0.9}
Ridge Regression Run 2 Test MSE: 0.0560
Ridge Regression Run 2 Test MAE: 0.2105
Ridge Regression Run 2 Test R^2 Score: 0.0453
Ridge Regression Run 2 Test MAPE: 112.0267
Ridge Regression Run 2 Test RMSE: 0.2367
Ridge Regression Run 2 Test sMAPE: 67.6403

Ridge Regression Run 3 parameters: {'alpha': 1.0}
Ridge Regression Run 3 Test MSE: 0.0560
Ridge Regression Run 3 Test MAE: 0.2105
Ridge Regression Run 3 Test R^2 Score: 0.0453
Ridge Regression Run 3 Test MAPE: 112.0267
Ridge Regression Run 3 Test RMSE: 0.2367
Ridge Regression Run 3 Test sMAPE: 67.6426

Ridge Regression Run 4 parameters: {'alpha': 1.5}
Ridge Regression Run 4 Test MSE: 0.05

### Lasso Regression

In [7]:
lasso_reg = Lasso
param_grid_lasso = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 1.5, 2.0, 10.0],
    'max_iter': [10000]
}

train_and_evaluate_model(
    model_class=lasso_reg,
    param_grid=param_grid_lasso,
    model_name="Lasso Regression",
    X_train=X_train_flat,
    y_train=y_train_flat,
    X_test=X_test_flat,
    y_test=y_test_flat
)


Lasso Regression Run 1 parameters: {'alpha': 0.0001, 'max_iter': 10000}
Lasso Regression Run 1 Test MSE: 0.0563
Lasso Regression Run 1 Test MAE: 0.2112
Lasso Regression Run 1 Test R^2 Score: 0.0410
Lasso Regression Run 1 Test MAPE: 111.9876
Lasso Regression Run 1 Test RMSE: 0.2372
Lasso Regression Run 1 Test sMAPE: 67.8755

Lasso Regression Run 2 parameters: {'alpha': 0.001, 'max_iter': 10000}
Lasso Regression Run 2 Test MSE: 0.0572
Lasso Regression Run 2 Test MAE: 0.2150
Lasso Regression Run 2 Test R^2 Score: 0.0260
Lasso Regression Run 2 Test MAPE: 115.0582
Lasso Regression Run 2 Test RMSE: 0.2391
Lasso Regression Run 2 Test sMAPE: 68.8978

Lasso Regression Run 3 parameters: {'alpha': 0.01, 'max_iter': 10000}
Lasso Regression Run 3 Test MSE: 0.0583
Lasso Regression Run 3 Test MAE: 0.2199
Lasso Regression Run 3 Test R^2 Score: 0.0060
Lasso Regression Run 3 Test MAPE: 118.7917
Lasso Regression Run 3 Test RMSE: 0.2415
Lasso Regression Run 3 Test sMAPE: 70.2226

Lasso Regression Run 4 p

### Decision Tree

In [8]:
decision_tree_reg = DecisionTreeRegressor
param_grid_dtree_reg = {
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10]
}

train_and_evaluate_model(
    model_class=decision_tree_reg,
    param_grid=param_grid_dtree_reg,
    model_name="Decision Tree Regressor",
    X_train=X_train_flat,
    y_train=y_train_flat,
    X_test=X_test_flat,
    y_test=y_test_flat
)


Decision Tree Regressor Run 1 parameters: {'max_depth': None, 'min_samples_split': 2}
Decision Tree Regressor Run 1 Test MSE: 0.1108
Decision Tree Regressor Run 1 Test MAE: 0.2423
Decision Tree Regressor Run 1 Test R^2 Score: -0.8878
Decision Tree Regressor Run 1 Test MAPE: 127.2581
Decision Tree Regressor Run 1 Test RMSE: 0.3328
Decision Tree Regressor Run 1 Test sMAPE: 73.8828

Decision Tree Regressor Run 2 parameters: {'max_depth': None, 'min_samples_split': 5}
Decision Tree Regressor Run 2 Test MSE: 0.1087
Decision Tree Regressor Run 2 Test MAE: 0.2436
Decision Tree Regressor Run 2 Test R^2 Score: -0.8524
Decision Tree Regressor Run 2 Test MAPE: 128.2310
Decision Tree Regressor Run 2 Test RMSE: 0.3297
Decision Tree Regressor Run 2 Test sMAPE: 73.6349

Decision Tree Regressor Run 3 parameters: {'max_depth': None, 'min_samples_split': 10}
Decision Tree Regressor Run 3 Test MSE: 0.0919
Decision Tree Regressor Run 3 Test MAE: 0.2307
Decision Tree Regressor Run 3 Test R^2 Score: -0.565

### SVR

In [None]:
svr_model_class = SVR
param_grid_svr = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'epsilon': [0.01, 0.1, 1]
}

train_and_evaluate_model(
    model_class=svr_model_class,
    param_grid=param_grid_svr,
    model_name="Support Vector Regressor",
    X_train=X_train_flat,
    y_train=y_train_flat,
    X_test=X_test_flat,
    y_test=y_test_flat
)

### Random Forest

In [None]:
random_forest_reg = RandomForestRegressor()
param_grid_rf_reg = {
    'n_estimators': [100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5]
}

train_and_evaluate_model(
    model=random_forest_reg,
    param_grid=param_grid_rf_reg,
    model_name="Random Forest Regressor",
    X_train=X_train_flat,
    y_train=y_train_flat,
    X_test=X_test_flat,
    y_test=y_test_flat
)

### Gradient Boosting

In [None]:
gb_regressor = GradientBoostingRegressor()
param_grid_gb_reg = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5]
}

train_and_evaluate_model(
    model=gb_regressor,
    param_grid=param_grid_gb_reg,
    model_name="Gradient Boosting Regressor",
    X_train=X_train_flat,
    y_train=y_train_flat,
    X_test=X_test_flat,
    y_test=y_test_flat
)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


### XGBoost

In [9]:
xgboost_reg = xgb.XGBRegressor
param_grid_xgb_reg = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'objective': ['reg:squarederror']
}

train_and_evaluate_model(
    model_class=xgboost_reg,
    param_grid=param_grid_xgb_reg,
    model_name="XGBoost Regressor",
    X_train=X_train_flat,
    y_train=y_train_flat,
    X_test=X_test_flat,
    y_test=y_test_flat
)


XGBoost Regressor Run 1 parameters: {'n_estimators': 100, 'learning_rate': 0.01, 'max_depth': 3, 'objective': 'reg:squarederror'}
XGBoost Regressor Run 1 Test MSE: 0.0572
XGBoost Regressor Run 1 Test MAE: 0.2148
XGBoost Regressor Run 1 Test R^2 Score: 0.0259
XGBoost Regressor Run 1 Test MAPE: 113.9762
XGBoost Regressor Run 1 Test RMSE: 0.2391
XGBoost Regressor Run 1 Test sMAPE: 68.9682

XGBoost Regressor Run 2 parameters: {'n_estimators': 100, 'learning_rate': 0.01, 'max_depth': 5, 'objective': 'reg:squarederror'}
XGBoost Regressor Run 2 Test MSE: 0.0571
XGBoost Regressor Run 2 Test MAE: 0.2140
XGBoost Regressor Run 2 Test R^2 Score: 0.0275
XGBoost Regressor Run 2 Test MAPE: 113.7460
XGBoost Regressor Run 2 Test RMSE: 0.2389
XGBoost Regressor Run 2 Test sMAPE: 68.6829

XGBoost Regressor Run 3 parameters: {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 3, 'objective': 'reg:squarederror'}
XGBoost Regressor Run 3 Test MSE: 0.0560
XGBoost Regressor Run 3 Test MAE: 0.2047
XGBoost 

### LightGBM

In [None]:
lgbm_reg = lgb.LGBMRegressor()
param_grid_lgbm_reg = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'num_leaves': [31, 63]
}

train_and_evaluate_model(
    model=lgbm_reg,
    param_grid=param_grid_lgbm_reg,
    model_name="LightGBM Regressor",
    X_train=X_train_flat,
    y_train=y_train_flat,
    X_test=X_test_flat,
    y_test=y_test_flat
)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.251038 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3654
[LightGBM] [Info] Number of data points in the train set: 219147, number of used features: 145
[LightGBM] [Info] Start training from score 0.006051
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.086654 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3654
[LightGBM] [Info] Number of data points in the train set: 219147, number of used features: 145
[LightGBM] [Info] Start training from score 0.006051
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.202939 seconds.
You can set `force_c

### KNN

In [6]:
from sklearn.neighbors import KNeighborsRegressor

knn_model = KNeighborsRegressor()
param_grid_knn = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance']
}

train_and_evaluate_model(
    model=knn_model,
    param_grid=param_grid_knn,
    model_name="K-Nearest Neighbors Regressor",
    X_train=X_train_flat,
    y_train=y_train_flat,
    X_test=X_test_flat,
    y_test=y_test_flat
)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


KeyboardInterrupt: 

### Neural Network

In [10]:
class RaceRegressionModel(nn.Module):
    def __init__(self, input_size, hidden_size=128):
        super(RaceRegressionModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, 1)  # Output a score for each rider

    def forward(self, x):
        # x should have shape (batch_size, num_features)
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out.squeeze()

In [11]:
from torch.utils.data import Dataset, DataLoader

class RaceRegressionDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)  # Shape: (num_samples, num_features)
        self.y = torch.tensor(y, dtype=torch.float32)  # Shape: (num_samples,)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        X = self.X[idx]
        y = self.y[idx]
        return X, y

# Create datasets
train_dataset = RaceRegressionDataset(X_train_flat, y_train_flat)
test_dataset = RaceRegressionDataset(X_test_flat, y_test_flat)

# Create data loaders
batch_size = 256
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [18]:
# Define the objective function for Optuna
def objective(trial):
    # Hyperparameter suggestions
    hidden_size = trial.suggest_categorical('hidden_size', [64, 128, 256])
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2)
    weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-3)
    num_epochs = trial.suggest_int('num_epochs', 10, 30)
    batch_size = trial.suggest_categorical('batch_size', [64, 128, 256])

    # Create data loaders with the suggested batch size
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    input_size = X_train_flat.shape[1]

    # Initialize model, loss function, and optimizer
    model = RaceRegressionModel(input_size, hidden_size).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # Start MLflow run
    with mlflow.start_run(run_name=f"Neural Network R2 Trial {trial.number}"):
        mlflow.log_params({
            'model_class': 'RaceRegressionModel',
            'hidden_size': hidden_size,
            'learning_rate': learning_rate,
            'weight_decay': weight_decay,
            'num_epochs': num_epochs,
            'batch_size': batch_size
        })

        # Training loop
        for epoch in range(num_epochs):
            model.train()
            total_loss = 0
            for X_batch, y_batch in train_loader:
                X_batch = X_batch.to(device)
                y_batch = y_batch.to(device)

                optimizer.zero_grad()
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                loss.backward()
                optimizer.step()
                total_loss += loss.item() * X_batch.size(0)

            average_loss = total_loss / len(train_loader.dataset)
            mlflow.log_metric("train_loss", average_loss, step=epoch)

        # Evaluation on test set
        model.eval()
        y_true_list = []
        y_pred_list = []
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                X_batch = X_batch.to(device)
                y_batch = y_batch.to(device)

                outputs = model(X_batch)
                y_true_list.extend(y_batch.cpu().numpy())
                y_pred_list.extend(outputs.cpu().numpy())

        y_true_array = np.array(y_true_list)
        y_pred_array = np.array(y_pred_list)

        # Compute evaluation metrics
        mse = mean_squared_error(y_true_array, y_pred_array)
        mae = mean_absolute_error(y_true_array, y_pred_array)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_true_array, y_pred_array)
        mape = mean_absolute_percentage_error(y_true_array, y_pred_array)
        smape = symmetric_mean_absolute_percentage_error(y_true_array, y_pred_array)

        # Log metrics
        mlflow.log_metrics({
            'test_mse': mse,
            'test_mae': mae,
            'test_rmse': rmse,
            'test_r2': r2,
            'test_mape': mape,
            'test_smape': smape
        })

        # Log the model
        input_example = X_train_flat[:5].astype(np.float32)
        input_example_tensor = torch.tensor(input_example, dtype=torch.float32).to(device)
        signature = infer_signature(
            input_example,
            model(input_example_tensor).cpu().detach().numpy()
        )
        mlflow.pytorch.log_model(
            pytorch_model=model,
            artifact_path="model",
            input_example=input_example,
            signature=signature
        )

    # Return the metric to optimize
    return r2

# Create an Optuna study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

# Print best hyperparameters
print("Best hyperparameters:", study.best_params)
print("Best sMAPE:", study.best_value)

[I 2024-11-30 14:18:56,872] A new study created in memory with name: no-name-95eb98f6-ec3b-41e5-a798-fe8c3d25536d
[I 2024-11-30 14:19:01,104] Trial 0 finished with value: -0.030196388542649277 and parameters: {'hidden_size': 256, 'learning_rate': 0.009337652427237754, 'weight_decay': 0.00019415087673350546, 'num_epochs': 17, 'batch_size': 128}. Best is trial 0 with value: -0.030196388542649277.
[I 2024-11-30 14:19:05,453] Trial 1 finished with value: -0.04745134185417199 and parameters: {'hidden_size': 256, 'learning_rate': 0.009689776321021842, 'weight_decay': 0.0004671726918041624, 'num_epochs': 18, 'batch_size': 64}. Best is trial 0 with value: -0.030196388542649277.
[I 2024-11-30 14:19:08,824] Trial 2 finished with value: 0.018673010527187617 and parameters: {'hidden_size': 256, 'learning_rate': 0.0032667564866602447, 'weight_decay': 0.00042415680311678106, 'num_epochs': 12, 'batch_size': 128}. Best is trial 2 with value: 0.018673010527187617.
[I 2024-11-30 14:19:13,708] Trial 3 fi

Best hyperparameters: {'hidden_size': 64, 'learning_rate': 0.007854076339347725, 'weight_decay': 6.8048578595835654e-06, 'num_epochs': 26, 'batch_size': 256}
Best sMAPE: 0.05288650204434975


Improving your model's prediction performance involves a multifaceted approach that encompasses data quality, feature engineering, model architecture, training strategies, and evaluation techniques. Below are comprehensive strategies tailored to enhance your current setup:

---

## **1. Enhance Data Quality and Quantity**

### **a. Increase Data Volume**
- **More Data:** More training data can help your model generalize better. If possible, collect additional race and rider data.
- **Data Augmentation:** For structured data, consider techniques like SMOTE (Synthetic Minority Over-sampling Technique) to generate synthetic samples, especially if dealing with class imbalance.

### **b. Data Cleaning**
- **Handle Missing Values:** Ensure all missing values are appropriately imputed or removed. You've used `SimpleImputer`, which is good, but verify the imputation strategy for each feature.
- **Remove Outliers:** Identify and handle outliers that might skew the training process.
- **Consistency Checks:** Ensure data consistency across different features and sources.

### **c. Feature Scaling**
- **Verify Scaling:** Ensure that all numerical features are properly scaled. You’re using `MinMaxScaler`, but sometimes `StandardScaler` can be more effective depending on the feature distribution.

---

## **2. Advanced Feature Engineering**

### **a. Feature Selection**
- **Correlation Analysis:** Perform correlation analysis to identify and remove redundant features.
- **Feature Importance:** Use techniques like feature importance from tree-based models to select the most impactful features.

### **b. Create Interaction Features**
- **Polynomial Features:** Generate polynomial combinations of existing features to capture non-linear relationships.
- **Interaction Terms:** Create interaction terms between race-level and rider-level features (e.g., how rider **age** interacts with race **distance**).

### **c. Categorical Encoding Enhancements**
- **Embeddings:** Instead of one-hot encoding, use embeddings for high-cardinality categorical features like `rider_name`, `team`, and `nationality`. This approach reduces dimensionality and captures semantic relationships.
  
  ```python
  from sklearn.preprocessing import LabelEncoder
  
  # Example for rider_name
  label_encoder = LabelEncoder()
  train_data['rider_name_encoded'] = label_encoder.fit_transform(train_data['rider_name'])
  test_data['rider_name_encoded'] = label_encoder.transform(test_data['rider_name'])
  ```

### **d. Temporal Features**
- **Year-Based Features:** Extract features like the number of years a rider has been active or changes in team affiliation over the years.
- **Recent Performance:** Incorporate recent race performances to capture momentum.

---

## **3. Optimize Model Architecture**

### **a. Experiment with Different Models**

- **Gradient Boosting Machines (GBMs):**
  Models like **XGBoost**, **LightGBM**, or **CatBoost** often outperform neural networks on structured data.
  
  ```python
  import lightgbm as lgb
  from sklearn.model_selection import train_test_split
  from sklearn.metrics import log_loss
  
  X_train_flat = X_train.reshape(X_train.shape[0], -1)
  X_test_flat = X_test.reshape(X_test.shape[0], -1)
  
  lgb_train = lgb.Dataset(X_train_flat, y_train)
  lgb_eval = lgb.Dataset(X_test_flat, y_test, reference=lgb_train)
  
  params = {
      'objective': 'multiclass',  # or 'binary' depending on your task
      'num_class': y_train.shape[1],  # for multiclass
      'metric': 'multi_logloss',
      'boosting_type': 'gbdt',
      'learning_rate': 0.05,
      'num_leaves': 31,
      'verbose': -1
  }
  
  gbm = lgb.train(params,
                  lgb_train,
                  num_boost_round=1000,
                  valid_sets=[lgb_train, lgb_eval],
                  early_stopping_rounds=50)
  
  y_pred = gbm.predict(X_test_flat, num_iteration=gbm.best_iteration)
  print(f'Log Loss: {log_loss(y_test, y_pred)}')
  ```

- **Deep Learning Models:**
  - **Deeper Networks:** Add more hidden layers or units to your current neural network.
  - **Regularization Techniques:** Incorporate dropout layers or batch normalization to prevent overfitting.
  - **Advanced Architectures:** Utilize architectures like **Multi-Layer Perceptrons (MLPs)** with residual connections or **Attention Mechanisms** to better capture complex relationships.

### **b. Model Ensemble**
- **Combine Multiple Models:** Create an ensemble of different models (e.g., neural networks, GBMs, logistic regression) to leverage their strengths.
- **Stacking:** Use predictions from multiple models as input features for a meta-model.

---

## **4. Refine Training Strategies**

### **a. Hyperparameter Tuning**
- **Automated Search:** Use tools like **Grid Search**, **Random Search**, or **Bayesian Optimization** (e.g., **Optuna**, **Hyperopt**) to find optimal hyperparameters.
  
  ```python
  from sklearn.model_selection import GridSearchCV
  from sklearn.ensemble import GradientBoostingClassifier
  
  param_grid = {
      'n_estimators': [100, 200],
      'learning_rate': [0.01, 0.05, 0.1],
      'max_depth': [3, 5, 7]
  }
  
  gbm = GradientBoostingClassifier()
  grid_search = GridSearchCV(gbm, param_grid, cv=3, scoring='neg_log_loss')
  grid_search.fit(X_train_flat, y_train.argmax(axis=1))
  
  print(f'Best parameters: {grid_search.best_params_}')
  print(f'Best log loss: {-grid_search.best_score_}')
  ```

### **b. Learning Rate Scheduling**
- **Dynamic Learning Rates:** Adjust the learning rate during training using schedulers like **StepLR**, **ReduceLROnPlateau**, or **CosineAnnealingLR**.
  
  ```python
  scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10)
  
  # In your training loop
  scheduler.step(average_loss)
  ```

### **c. Early Stopping**
- **Prevent Overfitting:** Use early stopping based on validation loss to stop training when performance no longer improves.
  
  ```python
  # Implement early stopping in your training loop
  best_loss = float('inf')
  patience = 10
  trigger_times = 0
  
  for epoch in range(num_epochs):
      # Training code...
      
      # Validation code
      val_loss = ...  # Compute validation loss
      if val_loss < best_loss:
          best_loss = val_loss
          trigger_times = 0
          # Save the best model
      else:
          trigger_times += 1
          if trigger_times >= patience:
              print('Early stopping!')
              break
  ```

### **d. Data Augmentation for Structured Data**
- **Synthetic Feature Generation:** Create new samples by adding noise or perturbations to existing data points.
  
  ```python
  noise = np.random.normal(0, 0.01, X_train.shape)
  X_train_augmented = X_train + noise
  y_train_augmented = y_train.copy()
  
  X_train = np.concatenate([X_train, X_train_augmented], axis=0)
  y_train = np.concatenate([y_train, y_train_augmented], axis=0)
  ```

---

## **5. Improve Evaluation Metrics and Validation**

### **a. Use Appropriate Metrics**
- **Task-Specific Metrics:** Ensure that the metrics align with your prediction goals. For probabilistic predictions, **Log Loss**, **Brier Score**, or **ROC-AUC** are suitable.
  
  ```python
  from sklearn.metrics import log_loss, roc_auc_score
  
  logloss = log_loss(y_test, y_pred)
  auc_score = roc_auc_score(y_test, y_pred, multi_class='ovr')
  print(f'Log Loss: {logloss}, ROC-AUC: {auc_score}')
  ```

### **b. Cross-Validation**
- **Robust Evaluation:** Use k-fold cross-validation to ensure that your model generalizes well across different subsets of data.
  
  ```python
  from sklearn.model_selection import cross_val_score
  from sklearn.ensemble import RandomForestClassifier
  
  clf = RandomForestClassifier()
  scores = cross_val_score(clf, X_train_flat, y_train.argmax(axis=1), cv=5, scoring='neg_log_loss')
  print(f'Cross-Validation Log Loss: {-scores.mean()}')
  ```

### **c. Validation Set**
- **Hold-Out Set:** Split your training data into training and validation sets to monitor model performance during training.
  
  ```python
  X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
      X_train_flat, y_train, test_size=0.2, random_state=42
  )
  ```

---

## **6. Address Model-Specific Challenges**

### **a. Handle Imbalanced Data**
- **Class Weights:** Assign higher weights to minority classes in the loss function.
  
  ```python
  class_weights = compute_class_weight('balanced', classes=np.unique(y_train.argmax(axis=1)), y=y_train.argmax(axis=1))
  class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)
  
  criterion = nn.CrossEntropyLoss(weight=class_weights)
  ```

- **Oversampling/Undersampling:** Balance your dataset by oversampling minority classes or undersampling majority classes.

### **b. Model Interpretability**
- **Feature Importance:** Use tools like SHAP or LIME to understand which features are contributing most to predictions.
  
  ```python
  import shap
  
  explainer = shap.TreeExplainer(gbm)
  shap_values = explainer.shap_values(X_test_flat)
  shap.summary_plot(shap_values, X_test_flat)
  ```

### **c. Ensemble Methods**
- **Boosting:** As mentioned earlier, models like XGBoost or LightGBM can be powerful.
- **Bagging:** Train multiple instances of your model on different subsets and average their predictions.
- **Stacking:** Combine multiple models by stacking their predictions as inputs to a meta-model.

---

## **7. Optimize Neural Network-Specific Parameters**

### **a. Increase Model Complexity**
- **More Layers/Units:** Add more hidden layers or increase the number of units per layer.
  
  ```python
  class RaceModel(nn.Module):
      def __init__(self, input_size, hidden_size=256):
          super(RaceModel, self).__init__()
          self.fc1 = nn.Linear(input_size, hidden_size)
          self.relu = nn.ReLU()
          self.fc2 = nn.Linear(hidden_size, hidden_size)
          self.relu = nn.ReLU()
          self.fc3 = nn.Linear(hidden_size, 1)
      
      def forward(self, x, mask):
          x = self.fc1(x)
          x = self.relu(x)
          x = self.fc2(x)
          x = self.relu(x)
          x = self.fc3(x)
          x = x.view(x.size(0), -1)
          scores = x.masked_fill(mask == 0, float('-inf'))
          probs = torch.softmax(scores, dim=1)
          return probs
  ```

### **b. Advanced Activation Functions**
- **LeakyReLU or ELU:** These can help mitigate issues like dying neurons in ReLU.
  
  ```python
  self.leaky_relu = nn.LeakyReLU(0.1)
  
  # In forward:
  x = self.leaky_relu(self.fc1(x))
  ```

### **c. Regularization Techniques**
- **Dropout Layers:** Prevent overfitting by randomly dropping units during training.
  
  ```python
  self.dropout = nn.Dropout(p=0.5)
  
  # In forward:
  x = self.dropout(self.relu(self.fc1(x)))
  ```

- **Batch Normalization:** Stabilize and accelerate training by normalizing layer inputs.
  
  ```python
  self.batch_norm = nn.BatchNorm1d(hidden_size)
  
  # In forward:
  x = self.batch_norm(x)
  ```

---

## **8. Advanced Training Techniques**

### **a. Transfer Learning**
- **Pretrained Models:** Although more common in domains like image and text processing, investigate if there's a pretrained model relevant to your data that you can fine-tune.

### **b. Curriculum Learning**
- **Simpler First:** Train your model on easier examples first, then gradually increase the difficulty.

### **c. Multi-Task Learning**
- **Related Tasks:** If there are related tasks (e.g., predicting different aspects of the race), train the model to handle multiple tasks simultaneously. This can help the model learn more robust features.

---

## **9. Post-Training Enhancements**

### **a. Model Calibration**
- **Calibrate Probabilities:** Ensure that predicted probabilities reflect true likelihoods using techniques like Platt Scaling or Isotonic Regression.

  ```python
  from sklearn.calibration import CalibratedClassifierCV
  
  calibrated_clf = CalibratedClassifierCV(base_estimator=gbm, method='sigmoid', cv='prefit')
  calibrated_clf.fit(X_val_split, y_val_split.argmax(axis=1))
  y_pred_prob = calibrated_clf.predict_proba(X_test_flat)
  ```

### **b. Threshold Adjustment**
- **Optimal Thresholds:** If your task involves classification, adjust decision thresholds to balance precision and recall based on your specific needs.

---

## **10. Implement Robust Evaluation and Monitoring**

### **a. Detailed Evaluation Reports**
- **Confusion Matrix:** Understand where your model is making errors.
  
  ```python
  from sklearn.metrics import confusion_matrix
  import seaborn as sns
  import matplotlib.pyplot as plt
  
  y_pred_classes = y_pred.argmax(axis=1)
  y_true = y_test.argmax(axis=1)
  cm = confusion_matrix(y_true, y_pred_classes)
  sns.heatmap(cm, annot=True, fmt='d')
  plt.show()
  ```

- **Precision-Recall Curves:** Especially useful for imbalanced datasets.
  
  ```python
  from sklearn.metrics import precision_recall_curve
  
  precision, recall, thresholds = precision_recall_curve(y_true, y_pred_prob[:,1])
  plt.plot(recall, precision)
  plt.xlabel('Recall')
  plt.ylabel('Precision')
  plt.show()
  ```

### **b. Cross-Validation Insights**
- **Variance Across Folds:** Analyze if the model's performance is consistent across different data splits.

### **c. Error Analysis**
- **Misclassified Instances:** Inspect the cases where your model performs poorly to identify patterns or missing features.

---

## **11. Utilize Advanced Libraries and Tools**

### **a. Automated Machine Learning (AutoML)**
- **AutoML Tools:** Leverage tools like **AutoSklearn**, **TPOT**, or **H2O.ai** to automate the model selection and hyperparameter tuning process.

### **b. Visualization Tools**
- **TensorBoard or Weights & Biases:** Monitor training metrics, visualize model architecture, and track experiments.

  ```python
  from torch.utils.tensorboard import SummaryWriter
  
  writer = SummaryWriter()
  
  # In your training loop:
  writer.add_scalar('Loss/train', loss.item(), epoch * len(train_loader) + batch_idx)
  writer.close()
  ```

---

## **12. Example: Applying Enhancements to Your Current Setup**

Below is an example of how you can implement some of the above strategies within your existing code structure.

### **a. Update Feature Engineering with Embeddings**

Instead of one-hot encoding `rider_name`, use embeddings:



In [None]:
import torch.nn as nn

class RaceModel(nn.Module):
    def __init__(self, input_size, hidden_size=256, num_rider_names=1000, rider_embedding_dim=50):
        super(RaceModel, self).__init__()
        self.rider_embedding = nn.Embedding(num_rider_names, rider_embedding_dim)
        self.fc1 = nn.Linear(input_size + rider_embedding_dim, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(0.5)
        self.fc3 = nn.Linear(hidden_size, 1)
    
    def forward(self, x, rider_ids, mask):
        # x: (batch_size, max_riders, num_features)
        # rider_ids: (batch_size, max_riders)
        embedded = self.rider_embedding(rider_ids)  # (batch_size, max_riders, embedding_dim)
        x = torch.cat([x, embedded], dim=2)  # (batch_size, max_riders, num_features + embedding_dim)
        batch_size, max_riders, _ = x.size()
        x = x.view(-1, x.size(-1))
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.dropout(self.relu(self.fc2(x)))
        scores = self.fc3(x)
        scores = scores.view(batch_size, max_riders)
        scores = scores.masked_fill(mask == 0, float('-inf'))
        probs = torch.softmax(scores, dim=1)
        return probs



### **b. Implement Hyperparameter Tuning with Optuna**



In [None]:
import optuna
from sklearn.model_selection import train_test_split

# Define the objective function
def objective(trial):
    hidden_size = trial.suggest_int('hidden_size', 128, 512)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-2)
    dropout_rate = trial.suggest_uniform('dropout_rate', 0.2, 0.5)
    
    model = RaceModel(input_size, hidden_size=hidden_size, dropout_rate=dropout_rate).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.KLDivLoss(reduction='batchmean')
    
    # Training loop with a smaller number of epochs for tuning
    for epoch in range(5):
        model.train()
        total_loss = 0
        for X_batch, y_batch, mask in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch, mask)
            y_batch_norm = y_batch / y_batch.sum(dim=1, keepdim=True)
            y_batch_norm = y_batch_norm * mask
            loss = criterion(torch.log(outputs + 1e-8), y_batch_norm + 1e-8)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        average_loss = total_loss / len(train_loader)
        trial.report(average_loss, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    
    return average_loss

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

print(f'Best hyperparameters: {study.best_params}')
print(f'Best loss: {study.best_value}')



### **c. Leveraging Cross-Validation**



In [None]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)
all_losses = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_flat)):
    X_tr, X_val = X_train_flat[train_idx], X_train_flat[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]
    
    # Create datasets and loaders
    train_ds = RaceDataset(X_tr, y_tr)
    val_ds = RaceDataset(X_val, y_val)
    train_dl = DataLoader(train_ds, batch_size=16, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=16, shuffle=False)
    
    # Initialize model, optimizer, and loss
    model = RaceModel(input_size).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.KLDivLoss(reduction='batchmean')
    
    # Training loop
    for epoch in range(10):
        model.train()
        total_loss = 0
        for X_batch, y_batch, mask in train_dl:
            optimizer.zero_grad()
            outputs = model(X_batch, mask)
            y_batch_norm = y_batch / y_batch.sum(dim=1, keepdim=True)
            y_batch_norm = y_batch_norm * mask
            loss = criterion(torch.log(outputs + 1e-8), y_batch_norm + 1e-8)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        average_loss = total_loss / len(train_dl)
        
        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch, mask in val_dl:
                outputs = model(X_batch, mask)
                y_batch_norm = y_batch / y_batch.sum(dim=1, keepdim=True)
                y_batch_norm = y_batch_norm * mask
                loss = criterion(torch.log(outputs + 1e-8), y_batch_norm + 1e-8)
                val_loss += loss.item()
        avg_val_loss = val_loss / len(val_dl)
        all_losses.append(avg_val_loss)
        print(f'Fold {fold+1}, Epoch {epoch+1}, Val Loss: {avg_val_loss:.4f}')

print(f'Average Validation Loss across folds: {np.mean(all_losses):.4f}')



---

## **13. Additional Resources and Best Practices**

### **a. Documentation and Tutorials**
- **PyTorch Tutorials:** [https://pytorch.org/tutorials/](https://pytorch.org/tutorials/)
- **Scikit-learn Documentation:** [https://scikit-learn.org/stable/documentation.html](https://scikit-learn.org/stable/documentation.html)

### **b. Communities and Forums**
- **PyTorch Forums:** [https://discuss.pytorch.org/](https://discuss.pytorch.org/)
- **Kaggle:** Participate in competitions and discussions to learn best practices.
- **Stack Overflow:** For specific coding issues and questions.

### **c. Continuous Learning**
- **Courses:** Consider taking advanced machine learning or deep learning courses to deepen your understanding.
- **Books:** "Deep Learning" by Ian Goodfellow, "Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow" by Aurélien Géron.

---

## **14. Final Considerations**

### **a. Iterative Improvement**
- **Incremental Changes:** Implement one improvement at a time and monitor its impact. This helps in understanding what works best for your specific problem.
- **Maintain a Baseline:** Always compare new models against a simple baseline to ensure that changes are beneficial.

### **b. Model Interpretability**
- **Understand Predictions:** Tools like **SHAP** or **LIME** can help you interpret model predictions and ensure that the model makes sense from a domain perspective.

### **c. Deployment and Real-World Testing**
- **Real-World Validation:** If possible, test your model in real-world scenarios to ensure it performs well outside of the training and testing datasets.
- **Feedback Loop:** Incorporate feedback from real-world usage to continuously improve the model.

---

By systematically applying these strategies, you can significantly enhance your model's predictive performance. Start by identifying which areas (data quality, feature engineering, model complexity, etc.) are most likely to yield improvements in your specific context and prioritize efforts accordingly.

Feel free to reach out with specific questions or if you need further guidance on any of the steps outlined above!