In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

import mlflow
import mlflow.sklearn
import mlflow.pytorch
from mlflow.models.signature import infer_signature

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.utils import shuffle

import xgboost as xgb
import lightgbm as lgb
import optuna

# Set random seed for reproducibility
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x1160b5470>

In [None]:
# Load the data (adjust file paths as needed)
X_train = np.load('X_train.npy', allow_pickle=True)
y_train = np.load('y_train.npy', allow_pickle=True)
X_test = np.load('X_test.npy', allow_pickle=True)
y_test = np.load('y_test.npy', allow_pickle=True)

print(f'X_train shape: {X_train.shape}')  # Expected: (num_races_train, max_riders, num_features)
print(f'y_train shape: {y_train.shape}')  # Expected: (num_races_train, max_riders)
print(f'X_test shape: {X_test.shape}')    # Expected: (num_races_test, max_riders, num_features)
print(f'y_test shape: {y_test.shape}')    # Expected: (num_races_test, max_riders)

# Flatten the data for scikit-learn models
X_train_flat = X_train.reshape(-1, X_train.shape[2])    # Shape: (num_races_train * max_riders, num_features)
X_test_flat = X_test.reshape(-1, X_test.shape[2])       # Shape: (num_races_test * max_riders, num_features)

# Flatten the targets
y_train_flat = y_train.flatten()  # Shape: (num_races_train * max_riders,)
y_test_flat = y_test.flatten()    # Shape: (num_races_test * max_riders,)

# Filter out invalid targets (if necessary)
valid_indices_train = y_train_flat > 0
valid_indices_test = y_test_flat > 0

X_train_flat = X_train_flat[valid_indices_train]
y_train_flat = y_train_flat[valid_indices_train]

X_test_flat = X_test_flat[valid_indices_test]
y_test_flat = y_test_flat[valid_indices_test]

# # Optionally scale the features
# scaler = StandardScaler()
# X_train_flat = scaler.fit_transform(X_train_flat)
# X_test_flat = scaler.transform(X_test_flat)

X_train shape: (2034, 207, 227)
y_train shape: (2034, 207)
X_test shape: (153, 207, 227)
y_test shape: (153, 207)


In [7]:
# Set MLflow experiment
mlflow.set_tracking_uri("http://seito.lavbic.net:5000")
mlflow.set_experiment("Race_Prediction_Experiment_I")

<Experiment: artifact_location='mlflow-artifacts:/775777151092792639', creation_time=1736268824287, experiment_id='775777151092792639', last_update_time=1736268824287, lifecycle_stage='active', name='Race_Prediction_Experiment_I', tags={'mlflow.note.content': 'Only first 3 riders have probability, so have in the '
                        'test set'}>

In [4]:
def train_and_evaluate_model(model_class, param_grid, model_name, X_train, y_train, X_test, y_test):
    from itertools import product
    import pandas as pd

    # Generate all combinations of hyperparameters
    keys = param_grid.keys()
    values = (param_grid[key] for key in keys)
    param_combinations = [dict(zip(keys, combination)) for combination in product(*values)]

    # For each combination, train and log the model
    for idx, params in enumerate(param_combinations):
        # Initialize model with current hyperparameters
        model = model_class(**params)

        # Fit the model
        model.fit(X_train, y_train)

        # Predict on test set
        y_pred = model.predict(X_test)

        # Calculate metrics
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        mape = mean_absolute_percentage_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        smape = symmetric_mean_absolute_percentage_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Start MLflow run
        with mlflow.start_run(run_name=f"{model_name} - Run {idx+1}"):
            # Log parameters
            mlflow.log_param("model_class", model_name)
            mlflow.log_params(params)

            # Log metrics
            mlflow.log_metric("test_mse", mse)
            mlflow.log_metric("test_mae", mae)
            mlflow.log_metric("test_r2", r2)
            mlflow.log_metric("test_mape", mape)
            mlflow.log_metric("test_rmse", rmse)
            mlflow.log_metric("test_smape", smape)

            # Log the model
            input_example = X_train[:5]
            signature = infer_signature(X_train, model.predict(X_train))

            mlflow.sklearn.log_model(
                sk_model=model,
                artifact_path="model",
                input_example=input_example,
                signature=signature
            )

            # Print results
            print(f"\n{model_name} Run {idx+1} parameters: {params}")
            print(f"{model_name} Run {idx+1} Test MSE: {mse:.4f}")
            print(f"{model_name} Run {idx+1} Test MAE: {mae:.4f}")
            print(f"{model_name} Run {idx+1} Test R^2 Score: {r2:.4f}")
            print(f"{model_name} Run {idx+1} Test MAPE: {mape:.4f}")
            print(f"{model_name} Run {idx+1} Test RMSE: {rmse:.4f}")
            print(f"{model_name} Run {idx+1} Test sMAPE: {smape:.4f}")

def mean_absolute_percentage_error(y_true, y_pred):
    epsilon = 1e-8  # Small number to prevent division by zero
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    # Avoid division by zero
    mask = np.abs(y_true) > epsilon
    if np.sum(mask) == 0:
        return np.inf  # Return infinity if no valid entries
    mape = np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100
    return mape

def symmetric_mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    diff = np.abs(y_pred - y_true)
    # Avoid division by zero
    mask = denominator != 0
    smape = np.mean((diff[mask] / denominator[mask])) * 100
    return smape

### Linear Regression

In [11]:
linear_reg = LinearRegression
param_grid_lr = {
    # No hyperparameters to tune
}

train_and_evaluate_model(
    model_class=linear_reg,
    param_grid=param_grid_lr,
    model_name="Linear Regression",
    X_train=X_train_flat,
    y_train=y_train_flat,
    X_test=X_test_flat,
    y_test=y_test_flat
)


Linear Regression Run 1 parameters: {}
Linear Regression Run 1 Test MSE: 0.0000
Linear Regression Run 1 Test MAE: 0.0000
Linear Regression Run 1 Test R^2 Score: 1.0000
Linear Regression Run 1 Test MAPE: 0.0000
Linear Regression Run 1 Test RMSE: 0.0000
Linear Regression Run 1 Test sMAPE: 0.0000
🏃 View run Linear Regression - Run 1 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/e81933db6cea4ffaa7d8ea0c6a04c699
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


### Ridge Regression

In [6]:
ridge_reg = Ridge
param_grid_ridge = {
    'alpha': [0.1, 0.9, 1.0, 1.5, 2.0, 10.0]
}

train_and_evaluate_model(
    model_class=ridge_reg,
    param_grid=param_grid_ridge,
    model_name="Ridge Regression",
    X_train=X_train_flat,
    y_train=y_train_flat,
    X_test=X_test_flat,
    y_test=y_test_flat
)


Ridge Regression Run 1 parameters: {'alpha': 0.1}
Ridge Regression Run 1 Test MSE: 0.0000
Ridge Regression Run 1 Test MAE: 0.0000
Ridge Regression Run 1 Test R^2 Score: 1.0000
Ridge Regression Run 1 Test MAPE: 0.0000
Ridge Regression Run 1 Test RMSE: 0.0000
Ridge Regression Run 1 Test sMAPE: 0.0000
🏃 View run Ridge Regression - Run 1 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/cc90f27ea5b54aa9879d00391bf8e608
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723

Ridge Regression Run 2 parameters: {'alpha': 0.9}
Ridge Regression Run 2 Test MSE: 0.0000
Ridge Regression Run 2 Test MAE: 0.0000
Ridge Regression Run 2 Test R^2 Score: 1.0000
Ridge Regression Run 2 Test MAPE: 0.0000
Ridge Regression Run 2 Test RMSE: 0.0000
Ridge Regression Run 2 Test sMAPE: 0.0000
🏃 View run Ridge Regression - Run 2 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/5ad942ac29744bc9aa3ce2b6b703ef98
🧪 View experiment at: http://seito

### Lasso Regression

In [7]:
lasso_reg = Lasso
param_grid_lasso = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 1.5, 2.0, 10.0],
    'max_iter': [10000]
}

train_and_evaluate_model(
    model_class=lasso_reg,
    param_grid=param_grid_lasso,
    model_name="Lasso Regression",
    X_train=X_train_flat,
    y_train=y_train_flat,
    X_test=X_test_flat,
    y_test=y_test_flat
)

  model = cd_fast.enet_coordinate_descent(



Lasso Regression Run 1 parameters: {'alpha': 0.0001, 'max_iter': 10000}
Lasso Regression Run 1 Test MSE: 0.0000
Lasso Regression Run 1 Test MAE: 0.0000
Lasso Regression Run 1 Test R^2 Score: 1.0000
Lasso Regression Run 1 Test MAPE: 0.0000
Lasso Regression Run 1 Test RMSE: 0.0000
Lasso Regression Run 1 Test sMAPE: 0.0000
🏃 View run Lasso Regression - Run 1 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/1121a8fe6aa2495f9b9a2baabc7ade5c
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


  model = cd_fast.enet_coordinate_descent(



Lasso Regression Run 2 parameters: {'alpha': 0.001, 'max_iter': 10000}
Lasso Regression Run 2 Test MSE: 0.0000
Lasso Regression Run 2 Test MAE: 0.0000
Lasso Regression Run 2 Test R^2 Score: 1.0000
Lasso Regression Run 2 Test MAPE: 0.0000
Lasso Regression Run 2 Test RMSE: 0.0000
Lasso Regression Run 2 Test sMAPE: 0.0000
🏃 View run Lasso Regression - Run 2 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/3094534ced234af08f341e7b48e9c974
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


  model = cd_fast.enet_coordinate_descent(



Lasso Regression Run 3 parameters: {'alpha': 0.01, 'max_iter': 10000}
Lasso Regression Run 3 Test MSE: 0.0000
Lasso Regression Run 3 Test MAE: 0.0000
Lasso Regression Run 3 Test R^2 Score: 1.0000
Lasso Regression Run 3 Test MAPE: 0.0000
Lasso Regression Run 3 Test RMSE: 0.0000
Lasso Regression Run 3 Test sMAPE: 0.0000
🏃 View run Lasso Regression - Run 3 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/39ce72a7117747538600ae565f04b536
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


  model = cd_fast.enet_coordinate_descent(



Lasso Regression Run 4 parameters: {'alpha': 0.1, 'max_iter': 10000}
Lasso Regression Run 4 Test MSE: 0.0000
Lasso Regression Run 4 Test MAE: 0.0000
Lasso Regression Run 4 Test R^2 Score: 1.0000
Lasso Regression Run 4 Test MAPE: 0.0000
Lasso Regression Run 4 Test RMSE: 0.0000
Lasso Regression Run 4 Test sMAPE: 0.0000
🏃 View run Lasso Regression - Run 4 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/35989fcbb88e4ad08ee48f64efaf68df
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


  model = cd_fast.enet_coordinate_descent(



Lasso Regression Run 5 parameters: {'alpha': 1.0, 'max_iter': 10000}
Lasso Regression Run 5 Test MSE: 0.0000
Lasso Regression Run 5 Test MAE: 0.0000
Lasso Regression Run 5 Test R^2 Score: 1.0000
Lasso Regression Run 5 Test MAPE: 0.0000
Lasso Regression Run 5 Test RMSE: 0.0000
Lasso Regression Run 5 Test sMAPE: 0.0000
🏃 View run Lasso Regression - Run 5 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/09bf18ca520f4896830537057c362d95
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


  model = cd_fast.enet_coordinate_descent(



Lasso Regression Run 6 parameters: {'alpha': 1.5, 'max_iter': 10000}
Lasso Regression Run 6 Test MSE: 0.0000
Lasso Regression Run 6 Test MAE: 0.0000
Lasso Regression Run 6 Test R^2 Score: 1.0000
Lasso Regression Run 6 Test MAPE: 0.0000
Lasso Regression Run 6 Test RMSE: 0.0000
Lasso Regression Run 6 Test sMAPE: 0.0000
🏃 View run Lasso Regression - Run 6 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/d24528fdf4c348a7b8456772aba112b9
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


  model = cd_fast.enet_coordinate_descent(



Lasso Regression Run 7 parameters: {'alpha': 2.0, 'max_iter': 10000}
Lasso Regression Run 7 Test MSE: 0.0000
Lasso Regression Run 7 Test MAE: 0.0000
Lasso Regression Run 7 Test R^2 Score: 1.0000
Lasso Regression Run 7 Test MAPE: 0.0000
Lasso Regression Run 7 Test RMSE: 0.0000
Lasso Regression Run 7 Test sMAPE: 0.0000
🏃 View run Lasso Regression - Run 7 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/a61696312fb44fd09b068143a3266b97
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


  model = cd_fast.enet_coordinate_descent(



Lasso Regression Run 8 parameters: {'alpha': 10.0, 'max_iter': 10000}
Lasso Regression Run 8 Test MSE: 0.0000
Lasso Regression Run 8 Test MAE: 0.0000
Lasso Regression Run 8 Test R^2 Score: 1.0000
Lasso Regression Run 8 Test MAPE: 0.0000
Lasso Regression Run 8 Test RMSE: 0.0000
Lasso Regression Run 8 Test sMAPE: 0.0000
🏃 View run Lasso Regression - Run 8 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/f844756c5f394eee8e13c5651718f92c
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


### Decision Tree

In [8]:
decision_tree_reg = DecisionTreeRegressor
param_grid_dtree_reg = {
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10]
}

train_and_evaluate_model(
    model_class=decision_tree_reg,
    param_grid=param_grid_dtree_reg,
    model_name="Decision Tree Regressor",
    X_train=X_train_flat,
    y_train=y_train_flat,
    X_test=X_test_flat,
    y_test=y_test_flat
)


Decision Tree Regressor Run 1 parameters: {'max_depth': None, 'min_samples_split': 2}
Decision Tree Regressor Run 1 Test MSE: 0.0000
Decision Tree Regressor Run 1 Test MAE: 0.0000
Decision Tree Regressor Run 1 Test R^2 Score: 1.0000
Decision Tree Regressor Run 1 Test MAPE: 0.0000
Decision Tree Regressor Run 1 Test RMSE: 0.0000
Decision Tree Regressor Run 1 Test sMAPE: 0.0000
🏃 View run Decision Tree Regressor - Run 1 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/c52625310b2e4a409b8c75a58a558d7a
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723

Decision Tree Regressor Run 2 parameters: {'max_depth': None, 'min_samples_split': 5}
Decision Tree Regressor Run 2 Test MSE: 0.0000
Decision Tree Regressor Run 2 Test MAE: 0.0000
Decision Tree Regressor Run 2 Test R^2 Score: 1.0000
Decision Tree Regressor Run 2 Test MAPE: 0.0000
Decision Tree Regressor Run 2 Test RMSE: 0.0000
Decision Tree Regressor Run 2 Test sMAPE: 0.0000
🏃 View run 

### SVR

In [None]:
svr_model_class = SVR
param_grid_svr = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'epsilon': [0.01, 0.1, 1]
}

train_and_evaluate_model(
    model_class=svr_model_class,
    param_grid=param_grid_svr,
    model_name="Support Vector Regressor",
    X_train=X_train_flat,
    y_train=y_train_flat,
    X_test=X_test_flat,
    y_test=y_test_flat
)

### Random Forest

In [None]:
random_forest_reg = RandomForestRegressor()
param_grid_rf_reg = {
    'n_estimators': [100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5]
}

train_and_evaluate_model(
    model=random_forest_reg,
    param_grid=param_grid_rf_reg,
    model_name="Random Forest Regressor",
    X_train=X_train_flat,
    y_train=y_train_flat,
    X_test=X_test_flat,
    y_test=y_test_flat
)

### Gradient Boosting

In [None]:
gb_regressor = GradientBoostingRegressor()
param_grid_gb_reg = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5]
}

train_and_evaluate_model(
    model=gb_regressor,
    param_grid=param_grid_gb_reg,
    model_name="Gradient Boosting Regressor",
    X_train=X_train_flat,
    y_train=y_train_flat,
    X_test=X_test_flat,
    y_test=y_test_flat
)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


### XGBoost

In [9]:
xgboost_reg = xgb.XGBRegressor
param_grid_xgb_reg = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'objective': ['reg:squarederror']
}

train_and_evaluate_model(
    model_class=xgboost_reg,
    param_grid=param_grid_xgb_reg,
    model_name="XGBoost Regressor",
    X_train=X_train_flat,
    y_train=y_train_flat,
    X_test=X_test_flat,
    y_test=y_test_flat
)


XGBoost Regressor Run 1 parameters: {'n_estimators': 100, 'learning_rate': 0.01, 'max_depth': 3, 'objective': 'reg:squarederror'}
XGBoost Regressor Run 1 Test MSE: 0.0000
XGBoost Regressor Run 1 Test MAE: 0.0000
XGBoost Regressor Run 1 Test R^2 Score: 1.0000
XGBoost Regressor Run 1 Test MAPE: 0.0000
XGBoost Regressor Run 1 Test RMSE: 0.0000
XGBoost Regressor Run 1 Test sMAPE: 0.0000
🏃 View run XGBoost Regressor - Run 1 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/bbd0c70049b34962bcd180e5cfad896e
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723

XGBoost Regressor Run 2 parameters: {'n_estimators': 100, 'learning_rate': 0.01, 'max_depth': 5, 'objective': 'reg:squarederror'}
XGBoost Regressor Run 2 Test MSE: 0.0000
XGBoost Regressor Run 2 Test MAE: 0.0000
XGBoost Regressor Run 2 Test R^2 Score: 1.0000
XGBoost Regressor Run 2 Test MAPE: 0.0000
XGBoost Regressor Run 2 Test RMSE: 0.0000
XGBoost Regressor Run 2 Test sMAPE: 0.0000
🏃

### LightGBM

In [92]:
lgbm_reg = lgb.LGBMRegressor()
param_grid_lgbm_reg = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'num_leaves': [31, 63]
}

train_and_evaluate_model(
    model=lgbm_reg,
    param_grid=param_grid_lgbm_reg,
    model_name="LightGBM Regressor",
    X_train=X_train_flat,
    y_train=y_train_flat,
    X_test=X_test_flat,
    y_test=y_test_flat
)

TypeError: train_and_evaluate_model() got an unexpected keyword argument 'model'

### KNN

In [10]:
from sklearn.neighbors import KNeighborsRegressor

knn_model = KNeighborsRegressor
param_grid_knn = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance']
}

train_and_evaluate_model(
    model_class=knn_model,
    param_grid=param_grid_knn,
    model_name="K-Nearest Neighbors Regressor",
    X_train=X_train_flat,
    y_train=y_train_flat,
    X_test=X_test_flat,
    y_test=y_test_flat
)

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md




K-Nearest Neighbors Regressor Run 1 parameters: {'n_neighbors': 3, 'weights': 'uniform'}
K-Nearest Neighbors Regressor Run 1 Test MSE: 0.0000
K-Nearest Neighbors Regressor Run 1 Test MAE: 0.0000
K-Nearest Neighbors Regressor Run 1 Test R^2 Score: 1.0000
K-Nearest Neighbors Regressor Run 1 Test MAPE: 0.0000
K-Nearest Neighbors Regressor Run 1 Test RMSE: 0.0000
K-Nearest Neighbors Regressor Run 1 Test sMAPE: 0.0000
🏃 View run K-Nearest Neighbors Regressor - Run 1 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/3a66a4020b6c44adb55a3c6dc9d4101a
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723

K-Nearest Neighbors Regressor Run 2 parameters: {'n_neighbors': 3, 'weights': 'distance'}
K-Nearest Neighbors Regressor Run 2 Test MSE: 0.0000
K-Nearest Neighbors Regressor Run 2 Test MAE: 0.0000
K-Nearest Neighbors Regressor Run 2 Test R^2 Score: 1.0000
K-Nearest Neighbors Regressor Run 2 Test MAPE: 0.0000
K-Nearest Neighbors Regressor Run 2

### Neural Network

In [11]:
class RaceRegressionModel(nn.Module):
    def __init__(self, input_size, hidden_size=128):
        super(RaceRegressionModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, 1)  # Output a score for each rider

    def forward(self, x):
        # x should have shape (batch_size, num_features)
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out.squeeze()

In [12]:
from torch.utils.data import Dataset, DataLoader

class RaceRegressionDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)  # Shape: (num_samples, num_features)
        self.y = torch.tensor(y, dtype=torch.float32)  # Shape: (num_samples,)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        X = self.X[idx]
        y = self.y[idx]
        return X, y

# Create datasets
train_dataset = RaceRegressionDataset(X_train_flat, y_train_flat)
test_dataset = RaceRegressionDataset(X_test_flat, y_test_flat)

# Create data loaders
batch_size = 256
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Define the objective function for Optuna
def objective(trial):
    # Hyperparameter suggestions
    hidden_size = trial.suggest_categorical('hidden_size', [64, 128, 256])
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2)
    weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-3)
    num_epochs = trial.suggest_int('num_epochs', 10, 30)
    batch_size = trial.suggest_categorical('batch_size', [64, 128, 256])

    # Create data loaders with the suggested batch size
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    input_size = X_train_flat.shape[1]

    # Initialize model, loss function, and optimizer
    model = RaceRegressionModel(input_size, hidden_size).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # Start MLflow run
    with mlflow.start_run(run_name=f"Neural Network MAE Run {trial.number}"):
        mlflow.log_params({
            'model_class': 'RaceRegressionModel',
            'hidden_size': hidden_size,
            'learning_rate': learning_rate,
            'weight_decay': weight_decay,
            'num_epochs': num_epochs,
            'batch_size': batch_size
        })

        # Training loop
        for epoch in range(num_epochs):
            model.train()
            total_loss = 0
            for X_batch, y_batch in train_loader:
                X_batch = X_batch.to(device)
                y_batch = y_batch.to(device)

                optimizer.zero_grad()
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                loss.backward()
                optimizer.step()
                total_loss += loss.item() * X_batch.size(0)

            average_loss = total_loss / len(train_loader.dataset)
            mlflow.log_metric("train_loss", average_loss, step=epoch)

        # Evaluation on test set
        model.eval()
        y_true_list = []
        y_pred_list = []
        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                X_batch = X_batch.to(device)
                y_batch = y_batch.to(device)

                outputs = model(X_batch)
                y_true_list.extend(y_batch.cpu().numpy())
                y_pred_list.extend(outputs.cpu().numpy())

        y_true_array = np.array(y_true_list)
        y_pred_array = np.array(y_pred_list)

        # Compute evaluation metrics
        mse = mean_squared_error(y_true_array, y_pred_array)
        mae = mean_absolute_error(y_true_array, y_pred_array)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_true_array, y_pred_array)
        mape = mean_absolute_percentage_error(y_true_array, y_pred_array)
        smape = symmetric_mean_absolute_percentage_error(y_true_array, y_pred_array)

        # Log metrics
        mlflow.log_metrics({
            'test_mse': mse,
            'test_mae': mae,
            'test_rmse': rmse,
            'test_r2': r2,
            'test_mape': mape,
            'test_smape': smape
        })

        # Log the model
        input_example = X_train_flat[:5].astype(np.float32)
        input_example_tensor = torch.tensor(input_example, dtype=torch.float32).to(device)
        signature = infer_signature(
            input_example,
            model(input_example_tensor).cpu().detach().numpy()
        )
        mlflow.pytorch.log_model(
            pytorch_model=model,
            artifact_path="model",
            input_example=input_example,
            signature=signature
        )

    # Return the metric to optimize
    return mae

# Create an Optuna study and optimize
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)

# Print best hyperparameters
print("Best hyperparameters:", study.best_params)
print("Best MAE:", study.best_value)

[I 2025-01-09 10:18:16,484] A new study created in memory with name: no-name-40f59f35-04dc-41bf-9039-afdf372b801a
[I 2025-01-09 10:18:21,774] Trial 0 finished with value: 0.0966482013463974 and parameters: {'hidden_size': 128, 'learning_rate': 0.002157631739829833, 'weight_decay': 0.00024634599017599103, 'num_epochs': 17, 'batch_size': 64}. Best is trial 0 with value: 0.0966482013463974.


🏃 View run Neural Network MAE Run 0 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/ffc94545a25d4c73830d478981ff7d14
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


[I 2025-01-09 10:18:25,749] Trial 1 finished with value: 0.20904843509197235 and parameters: {'hidden_size': 64, 'learning_rate': 0.007551577199034414, 'weight_decay': 0.00015008894709472555, 'num_epochs': 10, 'batch_size': 128}. Best is trial 0 with value: 0.0966482013463974.


🏃 View run Neural Network MAE Run 1 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/389673f5c39b43bfb8fa960c579599ea
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


[I 2025-01-09 10:18:31,305] Trial 2 finished with value: 0.07524678856134415 and parameters: {'hidden_size': 256, 'learning_rate': 0.0028225810408850417, 'weight_decay': 0.0007046069577602527, 'num_epochs': 26, 'batch_size': 64}. Best is trial 2 with value: 0.07524678856134415.


🏃 View run Neural Network MAE Run 2 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/129959e61a59446384d309ec0ce64b83
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


[I 2025-01-09 10:18:35,912] Trial 3 finished with value: 0.24459625780582428 and parameters: {'hidden_size': 256, 'learning_rate': 0.006513732904498579, 'weight_decay': 0.0004036646160107114, 'num_epochs': 14, 'batch_size': 64}. Best is trial 2 with value: 0.07524678856134415.


🏃 View run Neural Network MAE Run 3 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/b6edb679952f4bffa07137e1e14682fb
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


[I 2025-01-09 10:18:40,331] Trial 4 finished with value: 0.19303226470947266 and parameters: {'hidden_size': 64, 'learning_rate': 0.0001360822393902129, 'weight_decay': 0.0007161250755017757, 'num_epochs': 21, 'batch_size': 64}. Best is trial 2 with value: 0.07524678856134415.


🏃 View run Neural Network MAE Run 4 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/c08fca7bf4db4df2835295611b98d415
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723
🏃 View run Neural Network MAE Run 5 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/bec9a49eaa8448d1b66d3ce9bdaf7bbd
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


[I 2025-01-09 10:18:45,573] Trial 5 finished with value: 0.20796599984169006 and parameters: {'hidden_size': 128, 'learning_rate': 0.004091217486473826, 'weight_decay': 0.0004201280728359196, 'num_epochs': 28, 'batch_size': 256}. Best is trial 2 with value: 0.07524678856134415.
[I 2025-01-09 10:18:50,805] Trial 6 finished with value: 0.0777682512998581 and parameters: {'hidden_size': 256, 'learning_rate': 0.0034257330949070517, 'weight_decay': 0.0005166612169329545, 'num_epochs': 19, 'batch_size': 64}. Best is trial 2 with value: 0.07524678856134415.


🏃 View run Neural Network MAE Run 6 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/428b5264381f4d3890aa63a2c9777732
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


[I 2025-01-09 10:18:55,439] Trial 7 finished with value: 0.2074243575334549 and parameters: {'hidden_size': 128, 'learning_rate': 0.008295015881908058, 'weight_decay': 0.0005792075771926575, 'num_epochs': 19, 'batch_size': 128}. Best is trial 2 with value: 0.07524678856134415.


🏃 View run Neural Network MAE Run 7 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/e27da3a69624448c9f5398820d064bc2
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


[I 2025-01-09 10:18:59,805] Trial 8 finished with value: 0.2250986397266388 and parameters: {'hidden_size': 64, 'learning_rate': 0.009930459500792407, 'weight_decay': 0.0006556194790052821, 'num_epochs': 11, 'batch_size': 64}. Best is trial 2 with value: 0.07524678856134415.


🏃 View run Neural Network MAE Run 8 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/711fe49bb25c42edbfced7db35525b7c
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


[I 2025-01-09 10:19:04,254] Trial 9 finished with value: 0.40690141916275024 and parameters: {'hidden_size': 256, 'learning_rate': 0.007851997747925, 'weight_decay': 0.00012327231127598126, 'num_epochs': 11, 'batch_size': 256}. Best is trial 2 with value: 0.07524678856134415.


🏃 View run Neural Network MAE Run 9 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/bf91e9d6a8344a0a84abfd85eb3b8403
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


[I 2025-01-09 10:19:09,387] Trial 10 finished with value: 0.13775208592414856 and parameters: {'hidden_size': 256, 'learning_rate': 0.0007370315283262859, 'weight_decay': 0.0009972834799366853, 'num_epochs': 30, 'batch_size': 128}. Best is trial 2 with value: 0.07524678856134415.


🏃 View run Neural Network MAE Run 10 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/1d749059c98b48eeb804bc521ba9b059
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


[I 2025-01-09 10:19:14,769] Trial 11 finished with value: 0.10511182993650436 and parameters: {'hidden_size': 256, 'learning_rate': 0.003918906112156891, 'weight_decay': 0.0008412334294209073, 'num_epochs': 24, 'batch_size': 64}. Best is trial 2 with value: 0.07524678856134415.


🏃 View run Neural Network MAE Run 11 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/86ee14bcd2ef47ac8472f08dda313175
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


[I 2025-01-09 10:19:19,837] Trial 12 finished with value: 0.10956001281738281 and parameters: {'hidden_size': 256, 'learning_rate': 0.0024515443302800975, 'weight_decay': 0.0005109050137390424, 'num_epochs': 23, 'batch_size': 64}. Best is trial 2 with value: 0.07524678856134415.


🏃 View run Neural Network MAE Run 12 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/b6a11fdedc87406d80af8d81e63575b2
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


[I 2025-01-09 10:19:25,003] Trial 13 finished with value: 0.12516681849956512 and parameters: {'hidden_size': 256, 'learning_rate': 0.0052809300924140734, 'weight_decay': 0.0007892759724480732, 'num_epochs': 27, 'batch_size': 64}. Best is trial 2 with value: 0.07524678856134415.


🏃 View run Neural Network MAE Run 13 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/94d3f68eff1c4cdd920b0e5f416ab390
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


[I 2025-01-09 10:19:29,726] Trial 14 finished with value: 0.1450427770614624 and parameters: {'hidden_size': 256, 'learning_rate': 0.0026708442383857655, 'weight_decay': 0.00033199314468094387, 'num_epochs': 17, 'batch_size': 64}. Best is trial 2 with value: 0.07524678856134415.


🏃 View run Neural Network MAE Run 14 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/c47ae74bb2bb4f97b3e892303688456c
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


[I 2025-01-09 10:19:35,716] Trial 15 finished with value: 0.2979873716831207 and parameters: {'hidden_size': 256, 'learning_rate': 0.004995442198011131, 'weight_decay': 0.000988403653825521, 'num_epochs': 25, 'batch_size': 256}. Best is trial 2 with value: 0.07524678856134415.


🏃 View run Neural Network MAE Run 15 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/c0255f5cb7db4e458b0b8526ee6b0410
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


[I 2025-01-09 10:19:40,809] Trial 16 finished with value: 0.11716160178184509 and parameters: {'hidden_size': 256, 'learning_rate': 0.0014487102491458279, 'weight_decay': 0.0005887178849470995, 'num_epochs': 21, 'batch_size': 64}. Best is trial 2 with value: 0.07524678856134415.


🏃 View run Neural Network MAE Run 16 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/eda2831502da47a3967f0d3dd5543c2c
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


[I 2025-01-09 10:19:45,659] Trial 17 finished with value: 0.1615927368402481 and parameters: {'hidden_size': 256, 'learning_rate': 0.0034638849391990766, 'weight_decay': 0.0008683105277465863, 'num_epochs': 15, 'batch_size': 64}. Best is trial 2 with value: 0.07524678856134415.


🏃 View run Neural Network MAE Run 17 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/b426871596f1416f9107fa2ed9a93c84
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


[I 2025-01-09 10:19:50,317] Trial 18 finished with value: 0.12053829431533813 and parameters: {'hidden_size': 64, 'learning_rate': 0.005605759042661504, 'weight_decay': 0.0007027356360157736, 'num_epochs': 26, 'batch_size': 128}. Best is trial 2 with value: 0.07524678856134415.


🏃 View run Neural Network MAE Run 18 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/46b3d5292ddf4a1a9255061ecd76ea10
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723


[I 2025-01-09 10:19:55,387] Trial 19 finished with value: 0.18014822900295258 and parameters: {'hidden_size': 128, 'learning_rate': 0.0032282057699583437, 'weight_decay': 1.8283620788416455e-05, 'num_epochs': 30, 'batch_size': 256}. Best is trial 2 with value: 0.07524678856134415.


🏃 View run Neural Network MAE Run 19 at: http://seito.lavbic.net:5000/#/experiments/414728986215934723/runs/176b1ba5e1244c028124f10493ae91fa
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/414728986215934723
Best hyperparameters: {'hidden_size': 256, 'learning_rate': 0.0028225810408850417, 'weight_decay': 0.0007046069577602527, 'num_epochs': 26, 'batch_size': 64}
Best sMAPE: 0.07524678856134415


Retrieving best performing model artifacts

In [8]:
import mlflow
from mlflow.tracking import MlflowClient
import os
import pandas as pd
from torch.utils.data import Dataset, DataLoader

# set the tracking uri
mlflow.set_tracking_uri("http://seito.lavbic.net:5000")

client = MlflowClient()

# get the best model id
experiment_name = "Race_Prediction_Experiment_I"
experiment_id = client.get_experiment_by_name(experiment_name).experiment_id
runs = client.search_runs(experiment_id, order_by=["metrics.test_mae ASC"], max_results=1)
best_run = runs[0]

best_params = best_run.data.params
print("Best Run Parameters:")
print(best_params)

Best Run Parameters:
{'batch_size': '256', 'num_epochs': '20', 'model_class': 'RaceRegressionModel', 'hidden_size': '64', 'learning_rate': '0.007408841693587583', 'weight_decay': '0.0007490024204516086'}


In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import mlflow
import mlflow.pytorch
from mlflow.models.signature import infer_signature

# Define the model class
class RaceRegressionModel(nn.Module):
    def __init__(self, input_size, hidden_size=128):
        super(RaceRegressionModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out.squeeze()

# Define the dataset class
class RaceRegressionDataset(torch.utils.data.Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Create datasets
train_dataset = RaceRegressionDataset(X_train_flat, y_train_flat)
test_dataset = RaceRegressionDataset(X_test_flat, y_test_flat)

# Create data loaders with the best batch size
train_loader = DataLoader(train_dataset, batch_size=int(best_params['batch_size']), shuffle=True, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=int(best_params['batch_size']), shuffle=False, num_workers=0)

# Initialize the model, optimizer, and loss function
device = 'cuda' if torch.cuda.is_available() else 'cpu'
input_size = X_train_flat.shape[1]

model = RaceRegressionModel(input_size, int(best_params['hidden_size'])).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=float(best_params['learning_rate']), weight_decay=float(best_params['weight_decay']))

# Start MLflow run
with mlflow.start_run(run_name="Retrained Best Model"):
    # Log parameters
    mlflow.log_params(best_params)

    # Training loop
    num_epochs = best_params['num_epochs']
    for epoch in range(int(num_epochs)):
        model.train()
        total_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * X_batch.size(0)

        average_loss = total_loss / len(train_loader.dataset)
        mlflow.log_metric("train_loss", average_loss, step=epoch)
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss:.4f}")

    # Evaluation on test set
    model.eval()
    y_true_list = []
    y_pred_list = []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            outputs = model(X_batch)
            y_true_list.extend(y_batch.cpu().numpy())
            y_pred_list.extend(outputs.cpu().numpy())

    y_true_array = np.array(y_true_list)
    y_pred_array = np.array(y_pred_list)

    # Compute evaluation metrics
    mse = mean_squared_error(y_true_array, y_pred_array)
    mae = mean_absolute_error(y_true_array, y_pred_array)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true_array, y_pred_array)

    def mean_absolute_percentage_error(y_true, y_pred):
        epsilon = 1e-8  # Avoid division by zero
        mask = np.abs(y_true) > epsilon
        return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

    def symmetric_mean_absolute_percentage_error(y_true, y_pred):
        denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
        diff = np.abs(y_pred - y_true)
        mask = denominator != 0
        return np.mean((diff[mask] / denominator[mask])) * 100

    mape = mean_absolute_percentage_error(y_true_array, y_pred_array)
    smape = symmetric_mean_absolute_percentage_error(y_true_array, y_pred_array)

    # Log metrics
    mlflow.log_metrics({
        'test_mse': mse,
        'test_mae': mae,
        'test_rmse': rmse,
        'test_r2': r2,
        'test_mape': mape,
        'test_smape': smape
    })

    # Log the model
    input_example = X_train_flat[:5].astype(np.float32)
    input_example_tensor = torch.tensor(input_example, dtype=torch.float32).to(device)
    signature = infer_signature(
        input_example,
        model(input_example_tensor).cpu().detach().numpy()
    )
    mlflow.pytorch.log_model(
        pytorch_model=model,
        artifact_path="model",
        input_example=input_example,
        signature=signature
    )

print("Training complete. Model and metrics logged to MLflow.")

Epoch 1/20, Loss: 106.6261
Epoch 2/20, Loss: 3.3555
Epoch 3/20, Loss: 0.1431
Epoch 4/20, Loss: 0.0721
Epoch 5/20, Loss: 0.0687
Epoch 6/20, Loss: 0.0677
Epoch 7/20, Loss: 0.0669
Epoch 8/20, Loss: 0.0660
Epoch 9/20, Loss: 0.0655
Epoch 10/20, Loss: 0.0652
Epoch 11/20, Loss: 0.0655
Epoch 12/20, Loss: 0.2737
Epoch 13/20, Loss: 0.1603
Epoch 14/20, Loss: 0.0675
Epoch 15/20, Loss: 0.0642
Epoch 16/20, Loss: 0.0622
Epoch 17/20, Loss: 0.0619
Epoch 18/20, Loss: 0.0626
Epoch 19/20, Loss: 0.0629
Epoch 20/20, Loss: 0.0624
🏃 View run Retrained Best Model at: http://seito.lavbic.net:5000/#/experiments/775777151092792639/runs/faddefe3cb5748b58e38563ea616268a
🧪 View experiment at: http://seito.lavbic.net:5000/#/experiments/775777151092792639
Training complete. Model and metrics logged to MLflow.


Promote model to production

In [16]:
import numpy as np

print("PyTorch version:", np.__version__)

PyTorch version: 1.24.3


In [2]:
import requests
import json
import numpy as np

url = "http://seito.lavbic.net:5005/invocations"

# Load the test data
X_test = np.load('X_test.npy', allow_pickle=True)

# Select a single race (207 riders, 227 features)
race_data = X_test[0]  # Shape should be (207, 227)
print(f"Input shape: {race_data.shape}")  # Ensure it prints (207, 227)

# Prepare the payload
payload = {
    "instances": race_data.tolist()  # Convert to list format
}

# Send the request
try:
    response = requests.post(url, headers={"Content-Type": "application/json"}, data=json.dumps(payload), timeout=600)
    if response.status_code == 200:
        prediction = response.json()['predictions']
        print("Prediction:", prediction)
    else:
        print("Error:", response.text)
except requests.exceptions.RequestException as e:
    print("Request failed:", e)


Input shape: (207, 227)
Prediction: [0.27269816398620605, 0.2527596950531006, 0.26030707359313965, 0.20777678489685059, 0.19027400016784668, 0.2926747798919678, 0.40406009554862976, 0.2845776081085205, 0.22556281089782715, 0.20351195335388184, 0.2538764476776123, 0.27646660804748535, 0.22603297233581543, 0.19678139686584473, 0.14006435871124268, 0.1255316138267517, 0.29378199577331543, 0.20826363563537598, 0.17711138725280762, 0.276688814163208, 0.3010740280151367, 0.2258756160736084, 0.21817564964294434, 0.17943501472473145, 0.22046589851379395, 0.035171061754226685, 0.13812056183815002, 0.249375581741333, 0.29229092597961426, 0.322174072265625, 0.04750180244445801, 0.17222821712493896, 0.20850610733032227, 0.21213603019714355, 0.27309250831604004, 0.1721576452255249, 0.19855666160583496, 0.26486897468566895, 0.3014333248138428, 0.1888504922389984, 0.1979362964630127, 0.1669788360595703, 0.29357361793518066, 0.20566010475158691, 0.2508995532989502, 0.27065229415893555, 0.2933619022369

In [1]:
import pandas as pd

# Load the test data
data = pd.read_csv('final_data.csv')

# Retrieve race names of year 2024
data_2024 = data[data['year'] == 2024]

# Select unique names
race_names_2024 = data_2024['name'].unique()

# Split the data into format name, stage and index in array
split_data = []
for i, race_name in enumerate(race_names_2024):
    names = race_name.split(' ')
    name = names[0] 
    stage = names[1] 
    index = i
    split_data.append([name, stage, index])

# Convert to DataFrame
split_df = pd.DataFrame(split_data, columns=['name', 'stage', 'index'])

print(split_df)

# Save as CSV
split_df.to_csv('race_names.csv', index=False)

                name    stage  index
0    tour-down-under  stage-1      0
1    tour-down-under  stage-2      1
2    tour-down-under  stage-3      2
3    tour-down-under  stage-4      3
4    tour-down-under  stage-5      4
..               ...      ...    ...
148  tour-of-guangxi  stage-2    148
149  tour-of-guangxi  stage-3    149
150  tour-of-guangxi  stage-4    150
151  tour-of-guangxi  stage-5    151
152  tour-of-guangxi  stage-6    152

[153 rows x 3 columns]


In [11]:
data = pd.read_csv('race_names.csv')

# replace - in name with ' ' and capitalize the first letter of each word
data['name'] = data['name'].str.replace('-', ' ').str.title()

# replace - in stage with ' ' and capitalize the first letter of each word
data['stage'] = data['stage'].str.replace('-', ' ').str.title()

print(data.head())

              name    stage  index
0  Tour Down Under  Stage 1      0
1  Tour Down Under  Stage 2      1
2  Tour Down Under  Stage 3      2
3  Tour Down Under  Stage 4      3
4  Tour Down Under  Stage 5      4
