In [None]:
# Existing imports and data loading code...
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler
from dataclasses import dataclass, field
from typing import List, Tuple, Dict, Optional
import torch
from torch.utils.data import DataLoader, Dataset
import os

# Load the constructor-related tables from the raw data directory.
# The strings below are the tokens pandas should parse as missing values.
na_values = ['\\N', 'NaN', '']

constructors = pd.read_csv(
    '../../data/raw_data/constructors.csv',
    na_values=na_values,
)
constructor_results = pd.read_csv(
    '../../data/raw_data/constructor_results.csv',
    na_values=na_values,
)
constructor_standings = pd.read_csv(
    '../../data/raw_data/constructor_standings.csv',
    na_values=na_values,
)


In [None]:
@dataclass
class RaceFeatures:
    """Column names used as model features and as the prediction target.

    Both list fields are produced by a default factory, so every instance
    receives its own fresh list rather than sharing one mutable default.
    """

    # Static feature columns: the pre-existing set followed by the new ones.
    static_features: List[str] = field(
        default_factory=lambda: [
            'driver_overall_skill', 'driver_circuit_skill', 'driver_consistency',
            'driver_reliability', 'driver_aggression', 'driver_risk_taking',
            'fp1_median_time', 'fp2_median_time', 'fp3_median_time', 'quali_time',
        ] + [
            'constructor_performance', 'circuit_length', 'circuit_type',
        ]
    )

    # Dynamic feature columns: the pre-existing set followed by the new ones.
    dynamic_features: List[str] = field(
        default_factory=lambda: [
            'tire_age', 'fuel_load', 'track_position', 'track_temp',
            'air_temp', 'humidity', 'tire_compound', 'TrackStatus', 'is_pit_lap',
        ] + [
            'weather_forecast', 'safety_car_active',
        ]
    )

    # Name of the column holding the value to predict.
    target: str = 'milliseconds'


In [None]:
def load_and_preprocess_data() -> pd.DataFrame:
    """
    Load data from CSV files and preprocess it to create the enhanced_laps DataFrame.

    On top of the base loading/preprocessing (elided in this excerpt), the
    visible code adds the following columns to ``laps``:

    * ``constructorId``, looked up from ``results`` by ``(raceId, driverId)``;
    * ``constructor_points`` / ``constructor_position``, taken from each
      constructor's standings row with the highest ``raceId``, and
      ``constructor_performance`` (a copy of ``constructor_points``);
    * placeholder ``circuit_length`` / ``circuit_type`` values and the integer
      ``circuit_type_encoded``;
    * ``safety_car_active``, a 0/1 flag derived from ``safety_car_periods``.

    NOTE(review): ``results``, ``laps`` and ``circuits`` are read here but are
    not assigned anywhere in the visible code; the elided "existing data
    loading code" must define them before the first merge. ``race_features``
    is likewise expected to exist in the enclosing scope (presumably a
    ``RaceFeatures`` instance -- confirm).

    Returns:
        The ``laps`` DataFrame with the added columns, restricted to rows that
        have no missing value in any required feature column.

    Raises:
        ValueError: If a column listed in ``race_features.static_features`` or
            ``race_features.dynamic_features`` is absent from ``laps``.
    """
    # Load data (existing code)...
    na_values = ['\\N', 'NaN', '']
    # ... existing data loading code ...

    # Load additional data
    constructors = pd.read_csv('../../data/raw_data/constructors.csv', na_values=na_values)
    # NOTE(review): not used by the visible code; kept in case the elided code needs it.
    constructor_results = pd.read_csv('../../data/raw_data/constructor_results.csv', na_values=na_values)
    constructor_standings = pd.read_csv('../../data/raw_data/constructor_standings.csv', na_values=na_values)

    # Merge constructors with drivers
    results = results.merge(
        constructors[['constructorId', 'name', 'nationality']],
        on='constructorId',
        how='left',
    ).rename(columns={'name': 'constructor_name', 'nationality': 'constructor_nationality'})

    # Map driverId to constructorId.
    # De-duplicate on the join key itself: if a driver ever appears with two
    # constructors in one race, the merge below must not multiply lap rows.
    driver_constructor = results[['raceId', 'driverId', 'constructorId']].drop_duplicates(
        subset=['raceId', 'driverId']
    )

    # Merge driver_constructor into laps
    laps = laps.merge(driver_constructor, on=['raceId', 'driverId'], how='left')

    # Add constructor performance metrics.
    # For simplicity, each constructor's standings row with the highest raceId
    # is used, and its points serve as the performance metric.
    # NOTE(review): this assumes raceId increases chronologically, and the same
    # value is attached to every lap of that constructor regardless of the
    # race it belongs to -- confirm this is intended.
    constructor_standings_latest = (
        constructor_standings
        .sort_values('raceId', ascending=False)
        .drop_duplicates('constructorId')
        .loc[:, ['constructorId', 'points', 'position']]
        # Non-inplace rename: avoids mutating a slice of another frame.
        .rename(columns={'points': 'constructor_points', 'position': 'constructor_position'})
    )

    laps = laps.merge(constructor_standings_latest, on='constructorId', how='left')

    # Fill missing constructor performance data.
    # Assign the result back instead of `laps[col].fillna(..., inplace=True)`:
    # the chained inplace form is deprecated and does not update `laps` when
    # pandas Copy-on-Write is enabled.
    laps['constructor_points'] = laps['constructor_points'].fillna(
        laps['constructor_points'].mean()
    )
    laps['constructor_position'] = laps['constructor_position'].fillna(
        laps['constructor_position'].max()
    )

    # Add constructor performance as a static feature
    laps['constructor_performance'] = laps['constructor_points']

    # Add circuit characteristics
    # For simplicity, let's assume circuit length and type are available in circuits.csv
    circuits['circuit_length'] = 5.0  # Placeholder value, replace with actual data if available
    circuits['circuit_type'] = 'Permanent'  # Options could be 'Permanent', 'Street', 'Hybrid'

    # Merge circuit data into laps
    laps = laps.merge(circuits[['circuitId', 'circuit_length', 'circuit_type']], on='circuitId', how='left')

    # Encode circuit_type as a categorical variable
    # NOTE(review): RaceFeatures.static_features lists 'circuit_type' (the string
    # column), not 'circuit_type_encoded' -- confirm which one the model consumes.
    circuit_type_mapping = {'Permanent': 0, 'Street': 1, 'Hybrid': 2}
    laps['circuit_type_encoded'] = laps['circuit_type'].map(circuit_type_mapping)

    # Update weather data to include forecasts
    # For simplicity, we'll use the actual weather data as the forecast
    # In practice, you might add noise or adjust the data to simulate forecast inaccuracies
    # NOTE(review): no visible code creates a 'weather_forecast' column; unless the
    # elided code below does, the required-column check will raise ValueError
    # (assuming race_features uses the RaceFeatures defaults).

    # Existing weather data merging code...

    # Introduce global race events (safety cars)
    # For now, we'll predefine safety car events
    # For example, let's say there's a safety car from lap 20 to 22
    safety_car_periods = {
        'raceId_example': [(20, 22)]
        # Add more raceIds and periods as needed
    }

    # Add safety car active flag to laps.
    # Build one boolean mask per (race, period) instead of calling a Python
    # function for every lap row; both lap bounds are inclusive.
    safety_car_mask = pd.Series(False, index=laps.index)
    for race_id, periods in safety_car_periods.items():
        in_race = laps['raceId'] == race_id
        for start_lap, end_lap in periods:
            safety_car_mask |= in_race & laps['lap'].between(start_lap, end_lap)
    laps['safety_car_active'] = safety_car_mask.astype('int64')

    # Handle missing values and encode categorical variables
    # ... existing code ...

    # Update required columns
    required_columns = race_features.static_features + race_features.dynamic_features
    # Ensure all required columns are present in laps
    missing_columns = set(required_columns) - set(laps.columns)
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")

    # Drop rows with missing values in required columns
    laps = laps.dropna(subset=required_columns)

    return laps
