In [None]:
# train_and_save_xgboost.py

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from xgboost import XGBRegressor
from data_preparation import load_and_preprocess_data
import logging
from xgboost_utils import save_pipeline
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

# Configure logging: emit INFO-and-above records on the root logger, each
# prefixed with a timestamp and the record's severity level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def main():
    """
    Train the XGBoost lap-time pipeline, save it, and log test-set metrics.

    Steps, in order:
      1. Load the prepared DataFrame via ``load_and_preprocess_data``.
      2. Build ``X`` by dropping the listed non-feature columns *and the
         target column*, and take ``y`` from the ``milliseconds`` column.
      3. Hold out 20% of the rows as a test set (fixed seed 42).
      4. Fit a pipeline of StandardScaler / OneHotEncoder -> PCA ->
         XGBRegressor on the training rows.
      5. Hand the fitted pipeline to ``save_pipeline``.
      6. Log RMSE, MAE and R² computed on the held-out rows.

    Raises:
        KeyError: if the target column or any column named in
            ``features_to_drop`` is missing from the loaded DataFrame.
    """
    target_column = "milliseconds"

    # Load and preprocess data
    logging.info("Loading and preprocessing data...")
    df = load_and_preprocess_data()
    logging.info(f"Data loaded with shape: {df.shape}")

    # Prepare features and target.
    # The target itself must be removed from X: leaving it in would let the
    # model read the answer directly (target leakage), making the test
    # metrics meaningless and the saved model useless for prediction.
    features_to_drop = [
        "cumulative_milliseconds", "positionOrder", "date", "driverRef", "number",
        "date_race", "time_race", "time", "forename", "surname", "dob", "url_race",
        "location", "circuitRef",
        target_column,
    ]

    X = df.drop(columns=features_to_drop)
    y = df[target_column]
    logging.info(f"Features and target prepared. X shape: {X.shape}, y shape: {y.shape}")

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    logging.info("Data split into train and test sets.")
    logging.info(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")

    # Define categorical and numerical features: every remaining column that
    # is not listed as categorical is treated as numerical.
    categorical_features = ["code", "nationality", "status", "circuit_type", "country"]
    numerical_features = [col for col in X.columns if col not in categorical_features]
    logging.info(f"Categorical features: {categorical_features}")
    logging.info(f"Numerical features: {numerical_features}")

    # Create preprocessing pipeline including ColumnTransformer and PCA.
    # The step names ('transformer', 'num', 'cat') are part of the saved
    # artifact; code that inspects the loaded pipeline looks them up by name.
    # NOTE(review): PCA(n_components=100) needs at least 100 encoded columns
    # and 100 training rows — confirm this holds for the prepared data.
    preprocessor = Pipeline(steps=[
        ('transformer', ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numerical_features),
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
            ]
        )),
        ('pca', PCA(n_components=100, random_state=42))
    ])

    # Create the full pipeline with preprocessor and XGBoost regressor
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', XGBRegressor(n_estimators=100, random_state=42))
    ])

    # Train the pipeline
    logging.info("Training XGBoost pipeline...")
    pipeline.fit(X_train, y_train)
    logging.info("Training completed.")

    # Save the entire pipeline (preprocessing + model) as one artifact
    model_path = "models/xgboost_pipeline.pkl"
    save_pipeline(pipeline, model_path)
    logging.info(f"XGBoost pipeline saved to {model_path}")

    # Evaluate model performance on the held-out rows
    logging.info("Evaluating model performance on test set...")
    y_pred = pipeline.predict(X_test)
    rmse = root_mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    logging.info(f"Test RMSE: {rmse:.2f}")
    logging.info(f"Test MAE: {mae:.2f}")
    logging.info(f"Test R²: {r2:.4f}")

# Run the training job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


In [None]:
# simulation_with_pit_strategy.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logging
from race_simulator_xgboost import (
    Race, Driver, RaceSimulator, plot_race_positions, plot_lap_times, create_lap_times_with_inputs_dataframe
)
from sim_utils import extract_pit_strategies # Ensure this is accessible
from xgboost_utils import load_pipeline
import pickle
from typing import List

# Configure logging: emit INFO-and-above records on the root logger, each
# prefixed with a timestamp and the record's severity level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load the trained pipeline.
# NOTE(review): the .pkl suffix suggests load_pipeline unpickles this file —
# only load model files from a trusted source; confirm against xgboost_utils.
model_path = 'models/xgboost_pipeline.pkl'
pipeline = load_pipeline(model_path)

# Function to get race length (you can adjust this based on your data)
def get_race_length(race_id: int, lap_times_df: pd.DataFrame, default_laps: int = 50) -> int:
    """
    Get the actual race length for a given race ID from historical data.

    The length is the largest value in the ``lap`` column among the rows
    whose ``raceId`` equals ``race_id``.

    Args:
        race_id: Identifier of the race to look up.
        lap_times_df: Lap data with at least ``raceId`` and ``lap`` columns.
        default_laps: Length returned when the race has no usable lap rows
            (no matching rows, or all matching ``lap`` values are missing).
            Defaults to 50.

    Returns:
        The number of laps as an ``int``.
    """
    laps_for_race = lap_times_df.loc[lap_times_df['raceId'] == race_id, 'lap']
    race_laps = laps_for_race.max()
    # max() of an empty (or all-missing) selection is NaN rather than an error.
    if pd.isna(race_laps):
        return default_laps
    return int(race_laps)

# Load necessary data.
# The literal text \N, the string 'NaN' and empty fields are all read as
# missing values.
lap_times = pd.read_csv('data/SPECIAL_LAPS.csv', na_values=['\\N', 'NaN', ''])
# The race ID here must stay in sync with race_id passed to Race below.
race_length = get_race_length(1000, lap_times)  # Use your actual race ID

# Create an instance of the Race with actual length
race = Race(
    race_id=1000,
    circuit_id=1,
    total_laps=race_length,
    weather_conditions={},  # Add actual weather data if available
    safety_car_periods=[(10, 12)]  # Example safety car periods
)

# Load driver attributes (filtered by 'raceId' when drivers are initialized)
drivers_df = pd.read_csv('data/util/drivers_attributes.csv')

# Recover the feature lists from the fitted pipeline so they match training.
# The fitted ColumnTransformer is the 'transformer' step of the
# 'preprocessor' sub-pipeline.
preprocessor = pipeline.named_steps['preprocessor']
transformers = preprocessor.named_steps['transformer'].transformers_

# Route each fitted transformer's columns into the list matching its name;
# entries with any other name are skipped.
numerical_features = []
categorical_features = []
_features_by_role = {'num': numerical_features, 'cat': categorical_features}
for name, transformer, features in transformers:
    if name in _features_by_role:
        _features_by_role[name].extend(features)

logging.info(f"Numerical features: {numerical_features}")
logging.info(f"Categorical features: {categorical_features}")

def initialize_drivers(drivers_df: pd.DataFrame, lap_times_df: pd.DataFrame, race: Race) -> List[Driver]:
    """
    Initializes drivers with their respective pit strategies extracted from historical data.

    One Driver is created per row of ``drivers_df`` whose ``raceId`` matches
    ``race.race_id``. Drivers are numbered 1..N in row order, and that number
    is used as both the initial ``track_position`` and the ``start_position``.

    The module-level ``numerical_features`` and ``categorical_features`` lists
    are read to make sure every column the pipeline was trained on has a value.

    Args:
        drivers_df (pd.DataFrame): DataFrame containing driver attributes
            (needs ``raceId`` and ``driverId`` columns).
        lap_times_df (pd.DataFrame): DataFrame containing lap data
            (needs a ``driverId`` column).
        race (Race): The race instance.

    Returns:
        List[Driver]: A list of initialized Driver instances.

    Raises:
        ValueError: If no driver rows match the race, or a driver has no rows
            in ``lap_times_df``.
    """
    # Extract pit strategies
    pit_strategies = extract_pit_strategies(lap_times_df, race.race_id)

    # Filter driver attributes for the specific raceId
    drivers_race_df = drivers_df[drivers_df['raceId'] == race.race_id]

    if drivers_race_df.empty:
        raise ValueError(f"No drivers found for raceId {race.race_id}")

    required_columns = numerical_features + categorical_features + ['raceId', 'driverId', 'lap']

    drivers = []

    # Number the drivers by their position in the filtered frame. The index
    # label from iterrows() must not be used for this: after filtering it
    # still carries the row labels of the unfiltered frame, so it is neither
    # zero-based nor contiguous.
    for position, (_, row) in enumerate(drivers_race_df.iterrows(), start=1):
        driver_id = row['driverId']
        driver_name = row.get('driverName', f"Driver {driver_id}")  # Adjust based on actual column name

        # Load driver features from the lap data
        driver_features_df = lap_times_df[lap_times_df['driverId'] == driver_id]

        if driver_features_df.empty:
            raise ValueError(f"No feature data found for driverId {driver_id}")

        # For simplicity, take the last row available for this driver
        driver_features = driver_features_df.iloc[-1].to_dict()

        # Extract pit strategy for the driver
        pit_strategy_info = pit_strategies.get(driver_id, {})
        logging.debug(f"Pit strategy for driverId {driver_id}: {pit_strategy_info}")
        starting_compound = pit_strategy_info.get('starting_compound', 2)  # Default to medium compound if not found
        pit_strategy = pit_strategy_info.get('pit_strategy', [])

        # Update dynamic features based on simulation requirements
        dynamic_features = {
            'tire_age': 0,
            'fuel_load': 100.0,
            'track_position': position,
            'TrackTemp': 35.0,
            'AirTemp': 25.0,
            'Humidity': 50.0,
            'TrackStatus': 1,
            'is_pit_lap': 0,
            'tire_compound': starting_compound  # Set starting compound
        }

        # Combine static and dynamic features (dynamic values win on clashes)
        full_features = driver_features.copy()
        full_features.update(dynamic_features)

        # Ensure all necessary columns are present, assigning defaults to any
        # that the lap data did not provide.
        identity_defaults = {'raceId': race.race_id, 'driverId': driver_id, 'lap': 1}
        for col in required_columns:
            if col in full_features:
                continue
            if col in numerical_features:
                full_features[col] = 0
            elif col in categorical_features:
                full_features[col] = 'unknown'
            else:
                # Identifier columns get their real values, not a placeholder.
                full_features[col] = identity_defaults[col]

        # Create the Driver instance
        driver = Driver(
            driver_id=driver_id,
            name=driver_name,
            static_features=full_features,  # Original features before transformation
            initial_dynamic_features=dynamic_features,
            start_position=position,
            pit_strategy=pit_strategy,  # Actual pit strategy from historical data
            starting_compound=starting_compound
        )

        drivers.append(driver)

    logging.info(f"Initialized {len(drivers)} drivers for raceId {race.race_id}")
    return drivers

# Initialize drivers for the configured race from the driver attributes
# and the historical lap data
drivers = initialize_drivers(drivers_df, lap_times, race)

# Add drivers to the race
race.drivers.extend(drivers)

# Initialize the RaceSimulator with the loaded pipeline
simulator = RaceSimulator(pipeline, model_type='xgboost')

# Simulate the race.
# NOTE(review): race_lap_data is not read again in this section; the export
# and plots below work from `race`, so simulate_race presumably records its
# results on the race object — confirm against race_simulator_xgboost.
race_lap_data = simulator.simulate_race(race)

# Create the DataFrame with inputs and predicted lap times.
# This is a separate frame from the historical `lap_times` loaded earlier.
lap_times_df = create_lap_times_with_inputs_dataframe(race)

# Display the first few rows
print(lap_times_df.head())

# Optionally, save to a CSV for further analysis (written to the current
# working directory, without the DataFrame index)
lap_times_df.to_csv('race_simulation_results.csv', index=False)

# Proceed with analysis and plotting
plot_race_positions(race)
plot_lap_times(race)
