In [1]:
# simulation_xgboost.py

import pandas as pd
import numpy as np
from pathlib import Path
import logging
import sys

from common.race import Race
from common.driver import Driver
from common.features import RaceFeatures
from common.data_preparation import load_data_splits
from common.evaluation import evaluate_race_simulation
from common.race_utils import extract_pit_strategies, extract_safety_car_periods, get_race_length
from models.xgboost.race_simulator_xgboost import XGBoostRaceSimulator
from models.xgboost.xgboost_model import load_model_with_preprocessor
from common.utils import plot_race_positions, plot_lap_times, plot_actual_vs_predicted

def initialize_drivers(
    drivers_df: pd.DataFrame, 
    preprocessor, 
    race_features: RaceFeatures, 
    race: Race, 
    lap_times_df: pd.DataFrame, 
    special_laps: pd.DataFrame,
    circuit_attributes_df: pd.DataFrame
) -> list:
    """Initialize driver objects for the simulation."""
    pit_strategies = extract_pit_strategies(special_laps, race.race_id)
    drivers_race_df = drivers_df[drivers_df['raceId'] == race.race_id]

    if drivers_race_df.empty:
        raise ValueError(f"No drivers found for raceId {race.race_id}")

    grid_positions = lap_times_df[
        (lap_times_df['raceId'] == race.race_id) & (lap_times_df['lap'] == 1)
    ][['driverId', 'grid']].drop_duplicates()
    grid_mapping = grid_positions.set_index('driverId')['grid'].to_dict()

    circuit_matches = circuit_attributes_df[circuit_attributes_df['circuitId'] == race.circuit_id]
    if circuit_matches.empty:
        logging.warning(f"No circuit attributes found for circuitId {race.circuit_id}. Using defaults.")
        circuit_length_val = 5.0
        circuit_type_encoded_val = 0
        alt_val = 0.0
    else:
        circuit_data = circuit_matches.iloc[0]
        circuit_length_val = circuit_data.get('circuit_length', 5.0)
        circuit_type_encoded_val = circuit_data.get('circuit_type_encoded', 0)
        alt_val = circuit_data.get('alt', 0.0)

    drivers = []
    for _, row in drivers_race_df.iterrows():
        driver_id = row['driverId']
        grid_position = grid_mapping.get(driver_id, len(drivers_race_df))
        pit_strategy_info = pit_strategies.get(driver_id, {})
        starting_compound = pit_strategy_info.get('starting_compound', 2)
        pit_strategy = pit_strategy_info.get('pit_strategy', [])

        static_features_dict = {
            'driver_overall_skill': row['driver_overall_skill'],
            'driver_circuit_skill': row['driver_circuit_skill'],
            'driver_consistency': row['driver_consistency'],
            'driver_reliability': row['driver_reliability'],
            'driver_aggression': row['driver_aggression'],
            'driver_risk_taking': row['driver_risk_taking'],
            'constructor_performance': row['constructor_performance'],
            'fp1_median_time': row['fp1_median_time'],
            'fp2_median_time': row['fp2_median_time'],
            'fp3_median_time': row['fp3_median_time'],
            'quali_time': row['quali_time'],
            'circuit_length': circuit_length_val,
            'circuit_type_encoded': circuit_type_encoded_val,
            'alt': alt_val
        }

        static_features = np.array([static_features_dict[f] for f in race_features.static_features])

        initial_dynamic_features = {
            'tire_age': 0,
            'fuel_load': 100.0,
            'track_position': grid_position,
            'TrackTemp': 35.0,
            'AirTemp': 25.0,
            'Humidity': 50.0,
            'TrackStatus': 1,
            'is_pit_lap': 0,
            'tire_compound': starting_compound,
            'cumulative_race_time': 0.0,
            'GapToLeader_ms': 0.0,
            'IntervalToPositionAhead_ms': 0.0
        }

        driver = Driver(
            driver_id=driver_id,
            name=row.get('driverName

INFO:root:Pipeline loaded from ../../models/xgboost/xgboost_pipeline.pkl
INFO:root:Evaluating race 1123
INFO:root:Initialized 18 drivers for raceId 1123
ERROR:root:Error processing race 1123: columns are missing: {'country', 'grid', 'circuitId', 'round', 'lng', 'positionOrder', 'raceId', 'pitstop_milliseconds', 'position', 'status', 'CumRaceTime_ms', 'lat', 'constructorId', 'constructor_position', 'code', 'driverId', 'lap', 'cumulative_milliseconds', 'driver_adaptability', 'circuit_type', 'constructor_nationality', 'seconds_from_start', 'year', 'nationality'}
INFO:root:Evaluating race 1105
INFO:root:Initialized 20 drivers for raceId 1105
ERROR:root:Error processing race 1105: columns are missing: {'country', 'grid', 'circuitId', 'round', 'lng', 'positionOrder', 'raceId', 'pitstop_milliseconds', 'position', 'status', 'CumRaceTime_ms', 'lat', 'constructorId', 'constructor_position', 'code', 'driverId', 'lap', 'cumulative_milliseconds', 'driver_adaptability', 'circuit_type', 'constructor_

In [2]:
# simulation.py

import pandas as pd
import numpy as np
import torch
from pathlib import Path
import logging

# Import from common modules
from common.race import Race
from common.driver import Driver
from common.config import TEAM_COLORS
from common.utils import (
    get_constructor_color,
    extract_pit_strategies,
    extract_safety_car_periods,
    get_race_length,
    plot_race_positions,
    plot_lap_times
)

# Import LSTM-specific simulator
from models.race_simulator_lstm import LSTMRaceSimulator

# Import other necessary modules
from lstm import load_model_with_preprocessor
from features import RaceFeatures

def initialize_drivers(drivers_df: pd.DataFrame, preprocessor,
                       race_features: RaceFeatures, race: Race,
                       lap_times_df: pd.DataFrame, special_laps: pd.DataFrame) -> list:
    """
    Build the Driver objects for one race, using pit strategies and grid
    positions taken from historical data.

    Args:
        drivers_df: Driver attributes. Rows are filtered on
            ``raceId == race.race_id`` and must provide ``driverId`` plus the
            static feature columns read in the loop below.
        preprocessor: Object providing ``static_feature_names``,
            ``scale_static_features()``, ``window_size`` and ``sequence_dim``.
        race_features: Not used by this function; kept in the signature so
            existing callers keep working.
        race: Race whose ``race_id`` and ``circuit_id`` are read.
        lap_times_df: Lap data with ``raceId``, ``lap``, ``driverId`` and
            ``grid`` columns; the lap-1 rows supply the grid positions.
        special_laps: Forwarded to ``extract_pit_strategies`` together with
            ``race.race_id``.

    Returns:
        list: Driver objects sorted by ascending ``start_position``.

    Raises:
        ValueError: If ``drivers_df`` has no rows for ``race.race_id``.
    """
    # NOTE: the return annotation is the builtin ``list`` on purpose. The
    # previous ``List[Driver]`` referenced ``typing.List``, which this cell never
    # imports, so defining the function raised NameError.
    drivers = []

    # Pit plans for this race; looked up per driverId in the loop below.
    pit_strategies = extract_pit_strategies(special_laps, race.race_id)

    # Restrict the driver attributes to the race being simulated.
    drivers_race_df = drivers_df[drivers_df['raceId'] == race.race_id]
    if drivers_race_df.empty:
        raise ValueError(f"No drivers found for raceId {race.race_id}")

    # Grid positions are read from the lap-1 rows of this race.
    first_lap_mask = (
        (lap_times_df['raceId'] == race.race_id) &
        (lap_times_df['lap'] == 1)
    )
    grid_positions = lap_times_df[first_lap_mask][['driverId', 'grid']].drop_duplicates()
    grid_mapping = grid_positions.set_index('driverId')['grid'].to_dict()

    for _, row in drivers_race_df.iterrows():
        driver_id = row['driverId']
        # Drivers without a lap-1 row fall back to the number of entrants,
        # i.e. the last grid slot.
        grid_position = grid_mapping.get(driver_id, len(drivers_race_df))

        static_features_dict = {
            'driver_overall_skill': row['driver_overall_skill'],
            'driver_circuit_skill': row['driver_circuit_skill'],
            'driver_consistency': row['driver_consistency'],
            'driver_reliability': row['driver_reliability'],
            'driver_aggression': row['driver_aggression'],
            'driver_risk_taking': row['driver_risk_taking'],
            'constructor_performance': row['constructor_performance'],
            'fp1_median_time': row['fp1_median_time'],
            'fp2_median_time': row['fp2_median_time'],
            'fp3_median_time': row['fp3_median_time'],
            'quali_time': row['quali_time'],
            'circuitId': race.circuit_id
        }

        # Drivers with no recorded strategy get compound 2 and no planned stops.
        pit_strategy_info = pit_strategies.get(driver_id, {})
        starting_compound = pit_strategy_info.get('starting_compound', 2)
        pit_strategy = pit_strategy_info.get('pit_strategy', [])

        # Order the static features exactly as the preprocessor lists them.
        static_features = np.array(
            [static_features_dict[feature] for feature in preprocessor.static_feature_names]
        )

        driver = Driver(
            driver_id=driver_id,
            name=row.get('driverName', f"Driver {driver_id}"),
            static_features=static_features,
            initial_dynamic_features={
                'tire_age': 0,
                'fuel_load': 100.0,
                'track_position': grid_position,
                'TrackTemp': 35.0,
                'AirTemp': 25.0,
                'Humidity': 50.0,
                'TrackStatus': 1,
                'is_pit_lap': 0,
                'tire_compound': starting_compound
            },
            start_position=grid_position,
            pit_strategy=pit_strategy,
            starting_compound=starting_compound
        )

        # Replace the raw static features with their scaled, flattened version.
        driver.static_features = preprocessor.scale_static_features(static_features).flatten()

        # Start from an all-zero history window.
        driver.sequence = np.zeros((preprocessor.window_size, preprocessor.sequence_dim))

        drivers.append(driver)

    # Sort drivers by grid position
    drivers.sort(key=lambda x: x.start_position)

    logging.info(f"Initialized {len(drivers)} drivers for raceId {race.race_id}")
    return drivers

def evaluate_lstm_model(simulator: LSTMRaceSimulator, test_df: pd.DataFrame,
                        lap_times=None, special_laps=None, drivers_df=None,
                        constructor_mapping=None, driver_code_mapping=None):
    """
    Evaluate LSTM race simulation model on test races.

    For every distinct ``raceId`` in ``test_df`` a Race is built, simulated with
    ``simulator`` and scored with ``evaluate_race_simulation``. Per-race detail
    CSVs are written to ``results/lstm`` and position / lap-time plots to
    ``plots/lstm``. A race that raises is logged (with traceback) and skipped.

    Args:
        simulator: Simulator exposing ``preprocessor`` and ``simulate_race(race)``.
        test_df: Test laps with at least ``raceId`` and ``circuitId`` columns.
        lap_times: Lap data passed to ``get_race_length`` and
            ``initialize_drivers``.
        special_laps: Data passed to ``extract_safety_car_periods`` and
            ``initialize_drivers``.
        drivers_df: Driver attributes passed to ``initialize_drivers`` and the
            plotting helpers.
        constructor_mapping: Passed through to the plotting helpers.
        driver_code_mapping: Passed through to the plotting helpers.

        Each of the five optional arguments falls back to the module-level
        variable of the same name when left as ``None``, which is how the
        function obtained them before these parameters existed.

    Returns:
        tuple: ``(results_df, overall_metrics)``, or ``(None, None)`` when a
        required input is unavailable or no race could be simulated.
    """
    # Store results for all races
    all_results = []

    # Get unique test races
    test_races = test_df['raceId'].unique()

    # Create results directories
    Path('results/lstm').mkdir(parents=True, exist_ok=True)
    Path('plots/lstm').mkdir(parents=True, exist_ok=True)

    # Resolve the data dependencies: explicit arguments win, otherwise use the
    # same-named module-level variable so pre-existing two-argument calls work.
    module_scope = globals()
    dependencies = {
        'lap_times': lap_times,
        'special_laps': special_laps,
        'drivers_df': drivers_df,
        'constructor_mapping': constructor_mapping,
        'driver_code_mapping': driver_code_mapping,
    }
    for dep_name, dep_value in dependencies.items():
        if dep_value is None:
            dependencies[dep_name] = module_scope.get(dep_name)

    missing = [dep_name for dep_name, dep_value in dependencies.items() if dep_value is None]
    if missing:
        # Without these every race would fail in the same way; report it once.
        logging.error(
            f"Missing required inputs for evaluation: {', '.join(missing)}"
        )
        return None, None

    lap_times = dependencies['lap_times']
    special_laps = dependencies['special_laps']
    drivers_df = dependencies['drivers_df']
    constructor_mapping = dependencies['constructor_mapping']
    driver_code_mapping = dependencies['driver_code_mapping']

    for race_id in test_races:
        try:
            logging.info(f"Evaluating race {race_id}")

            # Get race data
            race_data = test_df[test_df['raceId'] == race_id]

            if race_data.empty:
                logging.warning(f"No data found for race {race_id}, skipping...")
                continue

            # Get race parameters
            race_length = get_race_length(race_id, lap_times)
            safety_car_periods = extract_safety_car_periods(special_laps, race_id)

            # Create race instance
            race = Race(
                race_id=race_id,
                circuit_id=race_data['circuitId'].iloc[0],
                total_laps=race_length,
                weather_conditions={},
                safety_car_periods=safety_car_periods
            )

            # Initialize drivers
            race_features = RaceFeatures()
            drivers = initialize_drivers(
                drivers_df,
                simulator.preprocessor,
                race_features,
                race,
                lap_times,
                special_laps
            )
            race.drivers.extend(drivers)

            # Run simulation; its return value is not needed because the
            # evaluation and plotting steps below are given `race` itself.
            simulator.simulate_race(race)

            # NOTE(review): evaluate_race_simulation is not imported in this
            # cell; the first cell imports it from common.evaluation - confirm
            # it is in scope before running this cell on its own.
            sim_results = evaluate_race_simulation(race, race_data)

            # Add race ID to metrics
            sim_results['metrics']['race_id'] = race_id
            all_results.append(sim_results['metrics'])

            # Save detailed results
            detailed_df = sim_results['detailed_results']
            detailed_df.to_csv(f'results/lstm/race_{race_id}_detailed.csv', index=False)

            # Plot results
            plot_race_positions(
                race=race,
                drivers_df=drivers_df,
                constructor_mapping=constructor_mapping,
                driver_code_mapping=driver_code_mapping,
                TEAM_COLORS=TEAM_COLORS,
                save_path=f'plots/lstm/race_{race_id}_positions.png'
            )

            plot_lap_times(
                race=race,
                drivers_df=drivers_df,
                constructor_mapping=constructor_mapping,
                driver_code_mapping=driver_code_mapping,
                TEAM_COLORS=TEAM_COLORS,
                save_path=f'plots/lstm/race_{race_id}_lap_times.png'
            )

            logging.info(f"Results for Race {race_id}:")
            logging.info(f"RMSE: {sim_results['metrics']['rmse']:.2f}")
            logging.info(f"MAE: {sim_results['metrics']['mae']:.2f}")
            logging.info(f"Positions Correct: {sim_results['metrics']['positions_correct']*100:.1f}%")

        except Exception as e:
            # Best effort: one failing race must not abort the whole evaluation.
            # logging.exception keeps the traceback so the failure can be traced.
            logging.exception(f"Error processing race {race_id}: {e}")

    if not all_results:
        logging.error("No successful race simulations completed")
        return None, None

    # Create final results DataFrame
    results_df = pd.DataFrame(all_results)

    # Calculate aggregate metrics
    overall_metrics = results_df.agg({
        'rmse': ['mean', 'std'],
        'mae': ['mean', 'std'],
        'positions_correct': ['mean', 'std'],
        'position_changes_accuracy': ['mean', 'std'],
        'num_drivers': 'mean'
    }).round(3)

    # Save results
    results_df.to_csv('results/lstm/simulation_evaluation.csv', index=False)
    overall_metrics.to_csv('results/lstm/simulation_overall_metrics.csv')

    return results_df, overall_metrics

if __name__ == "__main__":
    from data_preparation import load_data_splits

    # Load train/test split; only the test portion is evaluated below.
    train_df, test_df = load_data_splits()

    # Load the trained model and preprocessor
    model_path = 'models/lstm_model.pth'
    try:
        model, preprocessor = load_model_with_preprocessor(model_path)
        logging.info(f"Model loaded from {model_path}")
    except Exception as e:
        logging.error(f"Error loading model: {str(e)}")
        raise

    # Load the data files. These names are read as module-level variables by
    # evaluate_lstm_model, so they must keep exactly these names.
    try:
        lap_times = pd.read_csv('data/LAPS.csv', na_values=['\\N', 'NaN', ''])
        special_laps = pd.read_csv('data/SPECIAL_LAPS.csv', na_values=['\\N', 'NaN', ''])
        constructors = pd.read_csv('../data/raw_data/constructors.csv', na_values=['\\N', 'NaN', ''])
        drivers_df = pd.read_csv('data/util/drivers_attributes.csv')
        logging.info("All data files loaded successfully.")
    except FileNotFoundError as e:
        logging.error(f"Data file not found: {e}")
        raise
    except Exception as e:
        logging.error(f"An error occurred while loading data files: {e}")
        raise

    # Create Constructor Mapping (constructorId -> lower-cased name).
    # The vectorised .str.lower() leaves a missing name as NaN instead of
    # failing with AttributeError the way calling .lower() on a float NaN does.
    constructor_mapping = (
        constructors.set_index('constructorId')['name'].str.lower().to_dict()
    )

    # Create Driver Code Mapping. When the attributes file has no 'code'
    # column, derive one from the first letter of the forename plus the first
    # two letters of the surname, upper-cased.
    if 'code' not in drivers_df.columns:
        drivers_df['code'] = drivers_df['forename'].str[0].str.upper() + drivers_df['surname'].str[:2].str.upper()
    driver_code_mapping = drivers_df.set_index('driverId')['code'].to_dict()

    # Initialize simulator
    simulator = LSTMRaceSimulator(model, preprocessor)

    # Run evaluation
    results_df, overall_metrics = evaluate_lstm_model(simulator, test_df)


ImportError: cannot import name 'get_constructor_color' from 'common.utils' (/Users/I551659/Documents/GitHub/IE650-RAMP/ie500-data-mining-group7/race_simulation/notebooks/xgboost/../../common/utils.py)

In [None]:
# Debug cell: render `pit_strategies` with IPython's rich display.
# NOTE(review): in the visible cells `pit_strategies` is only ever a local variable
# inside initialize_drivers, never a notebook-level name, so this presumably raises
# NameError on a fresh kernel - confirm where it should come from, or remove the cell.
display(pit_strategies)

In [None]:
# Debug cell: render `safety_car_periods` with IPython's rich display.
# NOTE(review): in the visible cells `safety_car_periods` is only assigned as a local
# inside evaluate_lstm_model, never at notebook level, so this presumably raises
# NameError on a fresh kernel - confirm where it should come from, or remove the cell.
display(safety_car_periods)