In [1]:
import sys, os
sys.path.append('../../')
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, LSTM, Input, Concatenate, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import StandardScaler
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

class F1LapPredictor:
    """Hybrid LSTM + dense model that predicts the next lap time of a driver.

    The last ``sequence_length`` laps of per-lap ("dynamic") features feed an
    LSTM branch, the per-lap "static" features of the lap being predicted feed
    a dense branch, and both are merged to regress the lap time taken from the
    ``milliseconds`` column.
    """

    def __init__(self, sequence_length=5, model_dir='models'):
        """
        Args:
            sequence_length: Number of preceding laps in each input window.
            model_dir: Directory for the checkpoint and plots; created if
                it does not exist.
        """
        self.sequence_length = sequence_length
        self.static_scaler = StandardScaler()
        self.dynamic_scaler = StandardScaler()
        self.target_scaler = StandardScaler()
        self.model_dir = model_dir

        # Create model directory if it doesn't exist
        os.makedirs(self.model_dir, exist_ok=True)

    def prepare_sequences(self, df, static_features, dynamic_features):
        """
        Build sliding windows for each driver in each race, in lap order.

        For every (raceId, driverId) group sorted by ``lap``, each run of
        ``sequence_length`` consecutive rows becomes one input window; the
        row right after the window supplies the target (``milliseconds``)
        and the static features.

        Args:
            df: DataFrame with ``raceId``, ``driverId``, ``lap``,
                ``milliseconds`` and every listed feature column.
            static_features: Column names for the static branch.
            dynamic_features: Column names for the sequence branch.

        Returns:
            Tuple ``(sequences, static_inputs, targets, race_ids)`` of arrays
            shaped ``(n, sequence_length, len(dynamic_features))``,
            ``(n, len(static_features))``, ``(n,)`` and ``(n,)``. When no
            group has more than ``sequence_length`` laps, ``n`` is 0 and the
            arrays keep those (empty) shapes.
        """
        window = self.sequence_length
        sequences = []
        targets = []
        static_inputs = []
        race_ids = []

        for race_id, race_group in df.groupby('raceId'):
            for _, driver_group in race_group.groupby('driverId'):
                group = driver_group.sort_values('lap')
                n_windows = len(group) - window
                if n_windows <= 0:
                    # Not enough laps to form a window plus a target lap.
                    continue

                # Pull each column block out once per group instead of
                # indexing the DataFrame again for every window.
                dynamic_data = group[dynamic_features].to_numpy()
                static_data = group[static_features].to_numpy()
                lap_times = group['milliseconds'].to_numpy()

                sequences.extend(
                    dynamic_data[start:start + window]
                    for start in range(n_windows)
                )
                # Row ``start + window`` is the lap being predicted.
                targets.extend(lap_times[window:])
                static_inputs.extend(static_data[window:])
                race_ids.extend([race_id] * n_windows)

        if not sequences:
            # Keep the documented ranks so callers can still index/reshape.
            return (np.empty((0, window, len(dynamic_features))),
                    np.empty((0, len(static_features))),
                    np.empty((0,)),
                    np.empty((0,), dtype=int))

        return (np.array(sequences), np.array(static_inputs),
                np.array(targets), np.array(race_ids))

    def temporal_split(self, groups, split_ratio=0.2):
        """
        Split samples into train/validation by race ID.

        The highest ``split_ratio`` share of distinct race IDs is held out
        for validation. This is only a chronological split if race IDs
        increase with time - an assumption about the dataset that should be
        confirmed.

        Args:
            groups: Array with the race ID of every sample.
            split_ratio: Share of distinct races used for validation.

        Returns:
            Tuple ``(train_mask, val_mask)`` of boolean arrays aligned with
            ``groups``.
        """
        # np.unique already returns the IDs in ascending order.
        unique_races = np.unique(groups)
        split_idx = int(len(unique_races) * (1 - split_ratio))
        train_races = unique_races[:split_idx]
        val_races = unique_races[split_idx:]

        train_mask = np.isin(groups, train_races)
        val_mask = np.isin(groups, val_races)

        return train_mask, val_mask

    def build_model(self, n_static_features, n_dynamic_features):
        """
        Build and compile the two-branch network.

        Args:
            n_static_features: Width of the static input vector.
            n_dynamic_features: Number of features per time step.

        Returns:
            A compiled Keras ``Model`` taking ``[sequence, static]`` inputs
            and producing one value per sample (MSE loss, MAE metric).
        """
        # Sequential input branch
        seq_input = Input(shape=(self.sequence_length, n_dynamic_features))
        x1 = LSTM(64, return_sequences=True)(seq_input)
        x1 = LSTM(32)(x1)
        x1 = Dropout(0.2)(x1)

        # Static input branch
        static_input = Input(shape=(n_static_features,))
        x2 = Dense(32, activation='relu')(static_input)
        x2 = BatchNormalization()(x2)
        x2 = Dropout(0.2)(x2)

        # Combine branches
        combined = Concatenate()([x1, x2])
        x = Dense(64, activation='relu')(combined)
        x = BatchNormalization()(x)
        x = Dropout(0.2)(x)
        x = Dense(32, activation='relu')(x)
        x = BatchNormalization()(x)
        output = Dense(1)(x)

        model = Model(inputs=[seq_input, static_input], outputs=output)
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
        return model

    def _scale_sequences(self, X_seq):
        """Apply the fitted dynamic scaler to every time step of ``X_seq``."""
        X_seq = np.asarray(X_seq)
        # The scaler works on 2-D data: flatten (samples, steps) into rows,
        # transform, then restore the original window layout.
        flat = X_seq.reshape(-1, X_seq.shape[-1])
        return self.dynamic_scaler.transform(flat).reshape(X_seq.shape)

    def train(self, df, static_features, dynamic_features, validation_split=0.2):
        """
        Train the model with the provided data and create validation plots.

        The data is split by race first and the three scalers are fitted on
        the training portion only, so validation races do not influence the
        normalisation statistics.

        Args:
            df: Lap-level DataFrame accepted by ``prepare_sequences``.
            static_features: Column names for the static branch.
            dynamic_features: Column names for the sequence branch.
            validation_split: Share of distinct races held out for validation.

        Returns:
            The object returned by ``model.fit``.

        Raises:
            ValueError: If no training windows can be built, or if either
                side of the split ends up empty.
        """
        X_seq, X_static, y, race_ids = self.prepare_sequences(
            df, static_features, dynamic_features
        )
        if len(y) == 0:
            raise ValueError(
                "No training sequences could be built: every driver/race "
                "group has at most sequence_length laps."
            )

        # Get temporal split masks
        train_mask, val_mask = self.temporal_split(race_ids, validation_split)
        if not train_mask.any() or not val_mask.any():
            raise ValueError(
                "Temporal split produced an empty training or validation set; "
                "adjust validation_split or provide more races."
            )

        # Fit the scalers on the training races only. The dynamic scaler is
        # fitted once over all training time steps; fitting it per window
        # would normalise each window against itself and leave the scaler
        # holding only the last window's statistics for later predictions.
        X_static_train = self.static_scaler.fit_transform(X_static[train_mask])
        self.dynamic_scaler.fit(
            X_seq[train_mask].reshape(-1, X_seq.shape[-1])
        )
        X_seq_train = self._scale_sequences(X_seq[train_mask])
        y_train = self.target_scaler.fit_transform(y[train_mask].reshape(-1, 1))

        X_static_val = self.static_scaler.transform(X_static[val_mask])
        X_seq_val = self._scale_sequences(X_seq[val_mask])
        y_val = self.target_scaler.transform(y[val_mask].reshape(-1, 1))

        # Build and train model
        self.model = self.build_model(len(static_features), len(dynamic_features))

        callbacks = [
            EarlyStopping(
                monitor='val_loss',
                patience=5,
                restore_best_weights=True,
                verbose=1
            ),
            ModelCheckpoint(
                filepath=os.path.join(self.model_dir, 'best_model.keras'),
                monitor='val_loss',
                save_best_only=True,
                verbose=1
            )
        ]

        # Train model
        history = self.model.fit(
            [X_seq_train, X_static_train],
            y_train,
            validation_data=([X_seq_val, X_static_val], y_val),
            epochs=50,
            batch_size=32,
            callbacks=callbacks,
            verbose=1
        )

        # Create validation plots
        self.plot_prediction_accuracy([X_seq_val, X_static_val], y_val, "Validation Set")

        return history

    def plot_prediction_accuracy(self, X, y_true, title):
        """
        Save an actual-vs-predicted scatter plot and print error metrics.

        Both the model output and ``y_true`` are mapped back through the
        target scaler before metrics are computed. The figure is written to
        ``model_dir`` as ``prediction_accuracy_<title>.png``.

        Args:
            X: Already-scaled model inputs ``[sequence, static]``.
            y_true: Scaled target values for the same samples.
            title: Plot title; also used to derive the file name.
        """
        # Get predictions
        y_pred_scaled = self.model.predict(X)

        # Inverse transform predictions and actual values
        y_pred = self.target_scaler.inverse_transform(y_pred_scaled)
        y_true = self.target_scaler.inverse_transform(y_true.reshape(-1, 1))

        # Calculate metrics
        residuals = y_pred - y_true
        mae = np.mean(np.abs(residuals))
        mse = np.mean(residuals ** 2)
        ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
        # R² is undefined when every actual value is identical.
        r2 = 1 - np.sum(residuals ** 2) / ss_tot if ss_tot > 0 else float('nan')

        # Create the plot
        plt.figure(figsize=(10, 8))

        # Create scatter plot
        plt.scatter(y_true, y_pred, alpha=0.5, color='blue', label='Predictions')

        # Add perfect prediction line
        min_val = min(np.min(y_true), np.min(y_pred))
        max_val = max(np.max(y_true), np.max(y_pred))
        plt.plot([min_val, max_val], [min_val, max_val], 'r--', label='Perfect Prediction')

        # Add +/- 1 second boundaries (drawn before the legend is created so
        # their labels are included in it)
        plt.plot([min_val, max_val], [min_val + 1000, max_val + 1000], 'g--', alpha=0.5, label='+1 second')
        plt.plot([min_val, max_val], [min_val - 1000, max_val - 1000], 'g--', alpha=0.5, label='-1 second')

        # Add labels and title
        plt.xlabel('Actual Lap Time (ms)')
        plt.ylabel('Predicted Lap Time (ms)')
        plt.title(f'Actual vs Predicted Lap Times - {title}')

        # Add metrics text box
        metrics_text = f'MAE: {mae:.0f} ms\nMSE: {mse:.0f} ms²\nR²: {r2:.3f}'
        plt.text(0.05, 0.95, metrics_text,
                transform=plt.gca().transAxes,
                bbox=dict(facecolor='white', alpha=0.8),
                verticalalignment='top')

        plt.legend()
        plt.grid(True, alpha=0.3)

        plt.tight_layout()

        # Save plot
        plt.savefig(os.path.join(self.model_dir, f'prediction_accuracy_{title.lower().replace(" ", "_")}.png'))
        plt.close()

        # Print metrics summary
        print(f"\nPrediction Metrics for {title}:")
        print(f"Mean Absolute Error: {mae:.0f} ms")
        print(f"Mean Squared Error: {mse:.0f} ms²")
        print(f"R² Score: {r2:.3f}")

        # Calculate percentage of predictions within different time windows
        time_windows = [100, 250, 500, 1000]  # ms
        for window in time_windows:
            within_window = np.mean(np.abs(residuals) < window) * 100
            print(f"Predictions within {window}ms: {within_window:.1f}%")

    def predict(self, X_seq, X_static):
        """
        Make predictions using the trained model.

        Args:
            X_seq: Unscaled windows shaped
                ``(n, sequence_length, n_dynamic_features)``.
            X_static: Unscaled static features shaped
                ``(n, n_static_features)``.

        Returns:
            Predictions mapped back through the target scaler, i.e. in the
            units of the training target.
        """
        X_static_scaled = self.static_scaler.transform(X_static)
        X_seq_scaled = self._scale_sequences(X_seq)
        predictions_scaled = self.model.predict([X_seq_scaled, X_static_scaled])
        return self.target_scaler.inverse_transform(predictions_scaled)
    
def train_model(data_path='LAPS.csv'):
    """
    Load the preprocessed lap data and train an ``F1LapPredictor`` on it.

    Args:
        data_path: Accepted for API compatibility. NOTE(review): it is not
            forwarded to ``load_and_preprocess_data`` here - confirm whether
            the loader is supposed to receive it.

    Returns:
        Tuple ``(predictor, history)``: the trained predictor and the value
        returned by its ``train`` method.
    """
    # Imported lazily; resolving `common` presumably relies on the
    # sys.path entry added at the top of the file.
    from common.data_preparation import load_and_preprocess_data

    # Columns fed to the dense (static) branch
    static_cols = [
        'driver_overall_skill', 'driver_circuit_skill', 'driver_consistency',
        'driver_reliability', 'driver_aggression', 'driver_risk_taking',
        'constructor_performance', 'circuit_length', 'circuit_type_encoded',
    ]

    # Columns fed to the LSTM (sequence) branch
    dynamic_cols = [
        'position', 'tire_age', 'fuel_load', 'track_position', 'is_pit_lap',
        'TrackTemp', 'AirTemp', 'Humidity', 'GapToLeader_ms',
        'IntervalToPositionAhead_ms',
    ]

    lap_data = load_and_preprocess_data()

    lap_predictor = F1LapPredictor(sequence_length=5)
    fit_history = lap_predictor.train(lap_data, static_cols, dynamic_cols)

    return lap_predictor, fit_history

# Script / notebook-cell entry point: run the full training pipeline and keep
# the trained predictor and its training history available at module level.
if __name__ == "__main__":
    predictor, history = train_model()

/Users/I551659/Documents/GitHub/IE650-RAMP/ie500-data-mining-group7/race_simulation/notebooks/correlation
Index(['resultId', 'raceId', 'driverId', 'constructorId', 'number', 'grid',
       'position', 'positionText', 'positionOrder', 'points', 'laps', 'time',
       'racetime_milliseconds', 'fastestLap', 'rank', 'fastestLapTime',
       'fastestLapSpeed', 'statusId'],
      dtype='object')





Initial data sizes:
Lap times: (586171, 6)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pa


After initial merge: (586171, 46) - All lap data merged
After year filtering (>=2018): (159538, 46)
After adding constructor info: (159538, 52)
After adding circuit info: (159538, 55)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFr

After adding weather info: (159538, 60)

Fetching weather for 92 races: [989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1069, 1070, 1071, 1072, 1073, 1096, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129]
No weather data available for race 992
No weather data available for race 1005
No weather data available for race 1013
No weather data available for race 1023
No weather data available for race 1026
No weather data available for race 1038
No weather data available for race 1057


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try usi

After adding tire info: (159538, 65)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try usi

After adding practice info: (159538, 70)
After cleaning time intervals: (159538, 70)
Calculating enhanced driver metrics...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try usi

After enhancing driver attributes: (159538, 77)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




After adding dynamic features: (159538, 81)

Calculating race intervals...
