In [1]:
# train_lstm_optuna.ipynb

import logging
import sys
import numpy as np
import torch
import optuna
from torch.utils.data import DataLoader

# Ensure you can import from parent directories
sys.path.append('../../')

from common.data_preparation import load_and_preprocess_data, prepare_sequence_data, split_data_by_race, save_data_splits
from common.features import RaceFeatures
from common.evaluation import evaluate_model, plot_predictions
from models.lstm.lstm_model import F1PredictionModel, F1Dataset, F1DataPreprocessor, train_model, save_model_with_preprocessor

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def filter_driver_races(df, min_laps=3):
    """
    Keep only the rows of driver/race pairs that have at least min_laps rows.

    Args:
        df: DataFrame with 'raceId' and 'driverId' columns, one row per lap.
        min_laps: Minimum number of rows a (raceId, driverId) pair needs
            in order to be kept.

    Returns:
        A new DataFrame holding the rows of the qualifying pairs, in their
        original relative order and with a fresh index (result of an inner merge).
    """
    key_cols = ['raceId', 'driverId']

    # One row per (raceId, driverId) pair together with its number of laps.
    laps_per_pair = df.groupby(key_cols).size().reset_index(name='lap_count')

    # Select the key columns of the pairs that reach the threshold.
    has_enough_laps = laps_per_pair['lap_count'] >= min_laps
    kept_pairs = laps_per_pair.loc[has_enough_laps, key_cols]

    # Inner merge drops every row whose pair is not in kept_pairs.
    return df.merge(kept_pairs, on=key_cols, how='inner')

def main():
    """
    Tune, retrain, evaluate and save the LSTM model.

    Steps:
        1. Load the data, split it by race into train/test, then split the
           training races again into train/validation.
        2. Build sequence data, fit the scalers on the training part only and
           transform all three parts.
        3. Search hyperparameters with Optuna, minimising the best validation
           loss reached by each trial.
        4. Retrain a model with the best hyperparameters on train + validation.
        5. Evaluate on the test set in the original (inverse-scaled) target
           units, plot the predictions and save model + preprocessor.

    Raises:
        ValueError: If the validation split produced no sequences, since no
            trial could be scored in that case.
    """
    # One seed for every random component so a re-run reproduces the same
    # splits, weight initialisation and hyperparameter suggestions.
    seed = 42
    torch.manual_seed(seed)

    # 1. Load and preprocess data
    df = load_and_preprocess_data()

    # Split data by race
    train_df, test_df = split_data_by_race(df, test_size=0.2, random_state=seed)
    # Optionally, save splits
    save_data_splits(train_df, test_df)

    # Further split train_df into train and validation sets for hyperparameter tuning.
    # The split is done on race ids so that no race is shared between the two parts.
    val_ratio = 0.2
    rng = np.random.default_rng(seed)
    train_races = rng.permutation(train_df['raceId'].unique())
    val_size = int(len(train_races) * val_ratio)
    val_races = train_races[:val_size]
    remain_races = train_races[val_size:]

    val_df = train_df[train_df['raceId'].isin(val_races)]
    train_df_final = train_df[train_df['raceId'].isin(remain_races)]

    # Initialize preprocessor and features
    preprocessor = F1DataPreprocessor()
    race_features = RaceFeatures()

    # Prepare sequence data
    sequences_train, static_train, targets_train = prepare_sequence_data(train_df_final, race_features, window_size=3)
    sequences_val, static_val, targets_val = prepare_sequence_data(val_df, race_features, window_size=3)
    sequences_test, static_test, targets_test = prepare_sequence_data(test_df, race_features, window_size=3)

    # Log the number of sequences
    logging.info(f"Number of training sequences: {len(sequences_train)}")
    logging.info(f"Number of validation sequences: {len(sequences_val)}")
    logging.info(f"Number of test sequences: {len(sequences_test)}")

    # Fit scalers on training data only
    preprocessor.fit_scalers(sequences_train, static_train, targets_train)

    # Transform all datasets
    sequences_train_scaled, static_train_scaled, targets_train_scaled = preprocessor.transform_data(
        sequences_train, static_train, targets_train)
    sequences_val_scaled, static_val_scaled, targets_val_scaled = preprocessor.transform_data(
        sequences_val, static_val, targets_val)
    sequences_test_scaled, static_test_scaled, targets_test_scaled = preprocessor.transform_data(
        sequences_test, static_test, targets_test)

    # Create Datasets
    train_dataset = F1Dataset(sequences_train_scaled, static_train_scaled, targets_train_scaled)
    val_dataset = F1Dataset(sequences_val_scaled, static_val_scaled, targets_val_scaled)
    test_dataset = F1Dataset(sequences_test_scaled, static_test_scaled, targets_test_scaled)

    # Fail fast: with an empty validation set every trial would be pruned and
    # the run would only break later, at study.best_trial, with an unrelated error.
    if len(val_dataset) == 0:
        raise ValueError("Validation set is empty; cannot tune hyperparameters.")

    # 2. Define the objective function for Optuna
    def objective(trial):
        """Train one model with trial-suggested hyperparameters; return its best validation loss."""
        # Suggest hyperparameters
        hidden_dim = trial.suggest_int('hidden_dim', 32, 256, step=32)
        num_layers = trial.suggest_int('num_layers', 2, 10)
        dropout_prob = trial.suggest_float('dropout_prob', 0.1, 0.5, step=0.1)
        learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)
        batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])

        # Create DataLoaders with suggested batch_size
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # Initialize model with suggested hyperparameters
        model = F1PredictionModel(
            sequence_dim=sequences_train_scaled.shape[2],
            static_dim=static_train_scaled.shape[1],
            hidden_dim=hidden_dim,
            num_layers=num_layers,
            dropout_prob=dropout_prob
        )

        # Train model and get validation loss
        history = train_model(
            model,
            train_loader,
            val_loader,
            epochs=10,            # You can adjust number of epochs
            learning_rate=learning_rate,
            patience=3            # Early stopping patience
        )

        if not history['val_loss']:
            logging.warning("No validation loss recorded. Skipping this trial.")
            raise optuna.exceptions.TrialPruned()

        # Score the trial by its best epoch. With early stopping the last
        # recorded loss comes `patience` epochs after the best one, so using it
        # would rank trials by the epochs that triggered the stop.
        return min(history['val_loss'])

    # 3. Run Optuna optimization
    # TPE is Optuna's default sampler; it is created explicitly only to fix its seed.
    study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(seed=seed)
    )
    study.optimize(objective, n_trials=20)  # Adjust n_trials as needed

    logging.info(f"Best trial: {study.best_trial.params}")

    # 4. Retrain final model with best hyperparameters on train+val sets
    best_params = study.best_trial.params
    batch_size = best_params['batch_size']

    # Combine train and val sets for final training
    all_sequences = np.concatenate([sequences_train_scaled, sequences_val_scaled], axis=0)
    all_static = np.concatenate([static_train_scaled, static_val_scaled], axis=0)
    all_targets = np.concatenate([targets_train_scaled, targets_val_scaled], axis=0)

    all_dataset = F1Dataset(all_sequences, all_static, all_targets)
    final_train_loader = DataLoader(all_dataset, batch_size=batch_size, shuffle=True)
    # NOTE(review): the test set doubles as the early-stopping validation set
    # here, so the stopping point is chosen on test data and the metrics
    # reported below are optimistic. Consider a separate hold-out for early stopping.
    final_val_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    final_model = F1PredictionModel(
        sequence_dim=sequences_train_scaled.shape[2],
        static_dim=static_train_scaled.shape[1],
        hidden_dim=best_params['hidden_dim'],
        num_layers=best_params['num_layers'],
        dropout_prob=best_params['dropout_prob']
    )

    train_model(
        final_model,
        final_train_loader,
        final_val_loader,
        epochs=10,                       # Might retrain longer now
        learning_rate=best_params['learning_rate'],
        patience=3
    )

    # 5. Evaluate on the test set
    final_model.eval()
    predictions = []
    true_values = []

    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Run inference on whatever device training left the model on, so this
    # loop also works when the model is not on the CPU.
    device = next(final_model.parameters()).device

    with torch.no_grad():
        for batch in test_loader:
            sequences = batch['sequence'].to(device)
            static = batch['static'].to(device)
            targets = batch['target']

            outputs = final_model(sequences, static)
            # Tensors must be on the CPU before conversion to numpy.
            predictions.extend(outputs.cpu().numpy())
            true_values.extend(targets.cpu().numpy())

    # Inverse transform predictions and true values
    predictions = preprocessor.lap_time_scaler.inverse_transform(
        np.array(predictions).reshape(-1, 1)
    ).flatten()
    true_values = preprocessor.lap_time_scaler.inverse_transform(
        np.array(true_values).reshape(-1, 1)
    ).flatten()

    # Calculate and display evaluation metrics
    metrics = evaluate_model(true_values, predictions)
    logging.info(f"Final test set metrics after optuna optimization: {metrics}")

    # Plot results
    plot_predictions(true_values, predictions, model_name='LSTM Model (Optuna Tuned)')

    # Save the model and preprocessor
    save_model_with_preprocessor(
        final_model,
        preprocessor,
        '../../models/lstm/lstm_model_optuna_tuned.pth'
    )

# Run the full tuning/training pipeline when this cell (or script) is executed directly.
if __name__ == "__main__":
    main()


INFO:root:Using device: cpu


/Users/I551659/Documents/GitHub/IE650-RAMP/ie500-data-mining-group7/race_simulation/notebooks/lstm
Index(['resultId', 'raceId', 'driverId', 'constructorId', 'number', 'grid',
       'position', 'positionText', 'positionOrder', 'points', 'laps', 'time',
       'racetime_milliseconds', 'fastestLap', 'rank', 'fastestLapTime',
       'fastestLapSpeed', 'statusId'],
      dtype='object')





Initial data sizes:
Lap times: (586171, 6)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pa


After initial merge: (586171, 46) - All lap data merged
After year filtering (>=2018): (159538, 46)
After adding constructor info: (159538, 52)
After adding circuit info: (159538, 55)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFr

After adding weather info: (159538, 60)

Fetching weather for 92 races: [989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1069, 1070, 1071, 1072, 1073, 1096, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129]
No weather data available for race 992
No weather data available for race 1005
No weather data available for race 1013
No weather data available for race 1023
No weather data available for race 1026
No weather data available for race 1038
No weather data available for race 1057


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try usi

After adding tire info: (159538, 65)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try usi

After adding practice info: (159538, 70)
After cleaning time intervals: (159538, 70)
Calculating enhanced driver metrics...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try usi

After enhancing driver attributes: (159538, 77)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




After adding dynamic features: (159538, 81)

Calculating race intervals...
After calculating race intervals: (159538, 81)

Before outlier removal:

Before outlier removal:
Unique races: 146
Unique drivers per race: count    146.000000
mean      19.500000
std        0.888625
min       16.000000
25%       19.000000
50%       20.000000
75%       20.000000
max       20.000000
Name: driverId, dtype: float64

After removing outliers: (139755, 81)
Special laps removed: (2648, 81)
After dropping unnecessary columns: (139755, 50)

Removed Columns:
[]
After removing duplicate columns: (139755, 50)

Before handling missing values:
Unique races: 145
Unique drivers per race: count    145.000000
mean      19.234483
std        1.462560
min        8.000000
25%       19.000000
50%       20.000000
75%       20.000000
max       20.000000
Name: driverId, dtype: float64

Initial shape: (139755, 50)
Initial unique races: 145
Initial unique drivers: 39

Columns with missing values:

Column: driver_aggression

[I 2024-12-07 13:18:10,953] A new study created in memory with name: no-name-285a0639-6c9e-4e77-b1f9-f8123d8a8240
INFO:root:Epoch 1/10: Train Loss: 0.087298, Val Loss: 0.302103
INFO:root:New best model at epoch 1
INFO:root:Epoch 2/10: Train Loss: 0.042160, Val Loss: 0.258369
INFO:root:New best model at epoch 2
INFO:root:Epoch 3/10: Train Loss: 0.033719, Val Loss: 0.245149
INFO:root:New best model at epoch 3
INFO:root:Epoch 4/10: Train Loss: 0.028893, Val Loss: 0.264692
INFO:root:Epoch 5/10: Train Loss: 0.026398, Val Loss: 0.265436
INFO:root:Epoch 6/10: Train Loss: 0.024112, Val Loss: 0.324463
INFO:root:Early stopping at epoch 6
ERROR:root:Trial failed with error: 
[I 2024-12-07 13:20:44,144] Trial 0 pruned. 
INFO:root:Epoch 1/10: Train Loss: 0.188535, Val Loss: 0.213860
INFO:root:New best model at epoch 1
INFO:root:Epoch 2/10: Train Loss: 0.159574, Val Loss: 0.212710
INFO:root:New best model at epoch 2
INFO:root:Epoch 3/10: Train Loss: 0.166351, Val Loss: 0.195421
INFO:root:New best mo

KeyboardInterrupt: 

In [None]:
from ydata_profiling import ProfileReport
import pandas as pd

# Load the LAPS table from the data directory two levels up.
df = pd.read_csv('../../data/LAPS.csv')

# Build a profiling report for the loaded table and write it as HTML
# (relative path, so it lands in the current working directory).
profile = ProfileReport(df, title="Pandas Profiling Report")
profile.to_file("profile_report.html")

In [None]:
# Display the dtype pandas assigned to each column of `df`.
df.dtypes