In [2]:
# Import standard libraries
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader

# Import custom modules
from data_preparation import load_and_preprocess_data, prepare_sequence_data, split_data_by_race, save_data_splits
from features import RaceFeatures
from lstm import F1PredictionModel, F1Dataset, F1DataPreprocessor, train_model, save_model_with_preprocessor
from evaluation import evaluate_model, plot_predictions

def _collect_predictions(model, loader):
    """Run ``model`` over ``loader`` without gradients and gather outputs and targets.

    Every batch is read as a mapping with 'sequence', 'static' and 'target'
    entries. Inputs are moved to the device of the model's parameters before the
    forward pass, and outputs are brought back to the CPU before the NumPy
    conversion, so the loop works wherever the model currently lives.

    Args:
        model: Module called as ``model(sequences, static)``.
            Assumed to be a ``torch.nn.Module`` with at least one parameter
            -- TODO confirm against ``lstm.F1PredictionModel``.
        loader: Iterable of batches as described above.

    Returns:
        tuple: ``(predictions, true_values)`` as flat 1-D NumPy arrays in
        loader order. Both are empty arrays when ``loader`` yields nothing.
    """
    # Follow the model's device instead of assuming CPU.
    device = next(model.parameters()).device

    prediction_chunks = []
    target_chunks = []

    model.eval()
    with torch.no_grad():
        for batch in loader:
            sequences = batch['sequence'].to(device)
            static = batch['static'].to(device)

            outputs = model(sequences, static)

            # Flatten per batch: extending a list with a 0-d array raises
            # TypeError, while reshape(-1) accepts 0-d, 1-D and 2-D outputs alike.
            prediction_chunks.append(outputs.detach().cpu().numpy().reshape(-1))
            target_chunks.append(batch['target'].detach().cpu().numpy().reshape(-1))

    if not prediction_chunks:
        return np.empty(0), np.empty(0)
    return np.concatenate(prediction_chunks), np.concatenate(target_chunks)


def main():
    """Train, evaluate and save the LSTM lap-time model end to end.

    Steps, in order: load the preprocessed lap data, split it by race (20% test)
    and save the splits, build windowed sequence/static/target arrays
    (``window_size=3``), fit the scalers on the training arrays only and apply
    them to both splits, train for 10 epochs, evaluate on the test split in the
    original target units, plot the predictions and save the model together
    with its preprocessor to 'models/lstm_model.pth'.
    """
    # Seed the global RNGs so weight initialisation, dropout and the shuffled
    # batch order are repeatable; the same value drives the race split below.
    seed = 42
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Load and preprocess data
    print("Loading and preprocessing data...")
    df = load_and_preprocess_data()

    print(df.columns)

    # Split data by race to prevent data leakage
    print("Splitting data...")
    train_df, test_df = split_data_by_race(df, test_size=0.2, random_state=seed)
    save_data_splits(train_df, test_df)

    # Initialize preprocessor and features
    preprocessor = F1DataPreprocessor()
    race_features = RaceFeatures()

    # Prepare sequence data
    print("Preparing sequence data...")
    sequences_train, static_train, targets_train = prepare_sequence_data(train_df, race_features, window_size=3)
    sequences_test, static_test, targets_test = prepare_sequence_data(test_df, race_features, window_size=3)

    # Fit scalers on training data only, then transform both splits with them
    print("Scaling data...")
    preprocessor.fit_scalers(sequences_train, static_train, targets_train)

    sequences_train_scaled, static_train_scaled, targets_train_scaled = preprocessor.transform_data(
        sequences_train, static_train, targets_train)
    sequences_test_scaled, static_test_scaled, targets_test_scaled = preprocessor.transform_data(
        sequences_test, static_test, targets_test)

    # Create datasets
    train_dataset = F1Dataset(sequences_train_scaled, static_train_scaled, targets_train_scaled)
    test_dataset = F1Dataset(sequences_test_scaled, static_test_scaled, targets_test_scaled)

    # Build the model. Input widths are read from the scaled arrays (last axis
    # of the 3-D sequences, second axis of the 2-D static features); the other
    # hyperparameters are set explicitly here rather than left to defaults.
    model = F1PredictionModel(
        sequence_dim=sequences_train_scaled.shape[2],
        static_dim=static_train_scaled.shape[1],
        hidden_dim=64,
        num_layers=10,
        dropout_prob=0.5
    )

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

    # Train model
    # NOTE(review): the test loader doubles as the validation loader. If
    # train_model uses it for early stopping or checkpoint selection, the test
    # metrics below are optimistic -- carve a validation split out of the
    # training races before trusting them.
    print("Training model...")
    train_model(
        model,
        train_loader,
        test_loader,  # Using test_loader as validation for now
        epochs=10,
        learning_rate=0.001
    )

    # Evaluate on test set
    print("Evaluating model...")
    predictions, true_values = _collect_predictions(model, test_loader)

    # Inverse transform predictions and true values back to the original units
    predictions = preprocessor.lap_time_scaler.inverse_transform(
        predictions.reshape(-1, 1)).flatten()
    true_values = preprocessor.lap_time_scaler.inverse_transform(
        true_values.reshape(-1, 1)).flatten()

    # Calculate and display evaluation metrics
    # NOTE(review): the recorded run of this cell ended with
    # "NameError: name 'r2_score' is not defined". Nothing in this cell
    # references r2_score, so it is presumably raised inside an imported helper
    # (likely evaluation.evaluate_model) -- confirm that module imports it.
    metrics = evaluate_model(true_values, predictions)
    print("Test set metrics:", metrics)

    # Plot results
    plot_predictions(true_values, predictions, model_name='LSTM Model')

    # NOTE(review): assumes the 'models' directory exists or is created by
    # save_model_with_preprocessor -- verify, otherwise the trained model is lost.
    save_model_with_preprocessor(
        model,
        preprocessor,
        'models/lstm_model.pth'
    )



# Run the full pipeline only when this cell/script is executed directly.
if __name__ == "__main__":
    main()

Loading and preprocessing data...


  practice_sessions = pd.read_csv('../data/raw_data/ff1_laps.csv', na_values=na_values)
  tire_data = pd.read_csv('../data/raw_data/ff1_laps.csv', na_values=na_values)


(586171, 15)
(586171, 32)
(586171, 40)
(586171, 45)
(586171, 46)
(586171, 47)
(586171, 47)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  laps['pitstop_milliseconds'].fillna(0, inplace=True)  # Assuming 0 if no pit stop
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  laps['constructor_points'].fillna(laps['constructor_points'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will ne

(159538, 56)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  laps['Compound'].fillna('UNKNOWN', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  laps['fp1_median_time'].fillna(global_medians.get('FP1', 0), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object 

Shape of laps before merging weather data: (159538, 65)

Processing race 989 (2018 AUSTRALIAN GRAND PRIX)
No weather data found for race - Assigning default weather values

Processing race 990 (2018 BAHRAIN GRAND PRIX)
No weather data found for race - Assigning default weather values

Processing race 991 (2018 CHINESE GRAND PRIX)
No weather data found for race - Assigning default weather values

Processing race 992 (2018 AZERBAIJAN GRAND PRIX)
No weather data found for race - Assigning default weather values

Processing race 993 (2018 SPANISH GRAND PRIX)
No weather data found for race - Assigning default weather values

Processing race 994 (2018 MONACO GRAND PRIX)
No weather data found for race - Assigning default weather values

Processing race 995 (2018 CANADIAN GRAND PRIX)
No weather data found for race - Assigning default weather values

Processing race 996 (2018 FRENCH GRAND PRIX)
No weather data found for race - Assigning default weather values

Processing race 997 (2018 AUSTRIAN

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  laps_with_weather['TrackTemp'].fillna(25.0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  laps_with_weather['AirTemp'].fillna(20.0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which w

Columns in laps DataFrame: ['raceId', 'driverId', 'lap', 'position', 'time', 'milliseconds', 'date', 'driverRef', 'number', 'code', 'forename', 'surname', 'dob', 'nationality', 'url_x', 'year', 'round', 'circuitId', 'name_x', 'date_race', 'time_race', 'url_race', 'fp1_date', 'fp1_time', 'fp2_date', 'fp2_time', 'fp3_date', 'fp3_time', 'quali_date', 'quali_date_time', 'sprint_date', 'sprint_time', 'circuitRef', 'name_y', 'location', 'country', 'lat', 'lng', 'alt', 'url_y', 'positionOrder', 'grid', 'racetime_milliseconds', 'fastestLap', 'statusId', 'status', 'pitstop_milliseconds', 'constructorId', 'constructor_points', 'constructor_position', 'constructor_performance', 'circuit_length', 'circuit_type', 'circuit_type_encoded', 'cumulative_milliseconds', 'seconds_from_start', 'TrackStatus', 'tire_compound', 'fp1_median_time', 'fp2_median_time', 'fp3_median_time', 'quali_time', 'R', 'S', 'is_pit_lap', 'TrackTemp', 'AirTemp', 'Humidity']
Sample data after merging weather data:
   raceId  lap

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  laps['TrackStatus'].fillna(1, inplace=True)  # 1 = regular racing status


Shape before filtering and outlier removal: (159538, 73)
Normal racing laps: (141117, 73)
Special laps (pit stops, safety car, etc.): (18421, 73)
Final shape after outlier removal: (148148, 73)
Index(['raceId', 'driverId', 'lap', 'position', 'time', 'milliseconds', 'date',
       'driverRef', 'number', 'code', 'forename', 'surname', 'dob',
       'nationality', 'year', 'round', 'circuitId', 'date_race', 'time_race',
       'url_race', 'circuitRef', 'location', 'country', 'lat', 'lng', 'alt',
       'positionOrder', 'grid', 'statusId', 'status', 'pitstop_milliseconds',
       'constructorId', 'constructor_points', 'constructor_position',
       'constructor_performance', 'circuit_length', 'circuit_type',
       'circuit_type_encoded', 'cumulative_milliseconds', 'seconds_from_start',
       'TrackStatus', 'tire_compound', 'fp1_median_time', 'fp2_median_time',
       'fp3_median_time', 'quali_time', 'is_pit_lap', 'TrackTemp', 'AirTemp',
       'Humidity', 'driver_aggression', 'driver_over

NameError: name 'r2_score' is not defined

In [None]:
from ydata_profiling import ProfileReport

# Load the lap data from a CSV path relative to the current working directory.
# NOTE(review): `pd` is not imported in this cell; it relies on the pandas
# import from an earlier cell having been executed in the same kernel.
df = pd.read_csv('data/LAPS.csv')

# Build a profiling report over the whole frame and write it to
# profile_report.html.
profile = ProfileReport(df, title="Pandas Profiling Report")
profile.to_file("profile_report.html")