In [12]:
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, random_split, Dataset
from typing import List, Tuple, Dict, Optional
from dataclasses import dataclass, field
from sklearn.preprocessing import StandardScaler
import os

# Define the RaceFeatures dataclass
@dataclass
class RaceFeatures:
    """Data structure for race features"""
    static_features: List[str] = field(default_factory=lambda: [
        'driver_overall_skill', 'driver_circuit_skill', 'driver_consistency',
        'driver_reliability', 'driver_aggression', 'driver_risk_taking',
        'fp1_median_time', 'fp2_median_time', 'fp3_median_time', 'quali_time'
    ])
    
    dynamic_features: List[str] = field(default_factory=lambda: [
        'tire_age', 'fuel_load', 'track_position', 'track_temp',
        'air_temp', 'humidity', 'tire_compound', 'track_status', 'is_pit_lap'
    ])
    
    target: str = 'milliseconds'

# Define the F1Dataset class
class F1Dataset(Dataset):
    def __init__(self, sequences, static_features, targets):
        self.sequences = torch.FloatTensor(sequences)
        self.static_features = torch.FloatTensor(static_features)
        self.targets = torch.FloatTensor(targets)
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return {
            'sequence': self.sequences[idx],
            'static': self.static_features[idx],
            'target': self.targets[idx]
        }

# Define the F1DataPreprocessor class
class F1DataPreprocessor:
    def __init__(self):
        self.static_scaler = StandardScaler()
        self.dynamic_scaler = StandardScaler()
        self.lap_time_scaler = StandardScaler()
        
    def prepare_sequence_data(self, df: pd.DataFrame, window_size: int = 3) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Prepare sequential data with sliding window and apply scaling
        """
        sequences = []
        static_features = []
        targets = []
        
        # Instantiate RaceFeatures
        race_features = RaceFeatures()
        
        # Sort the dataframe to ensure consistent ordering
        df = df.sort_values(['raceId', 'driverId', 'lap'])
        
        # Group by race and driver
        for (race_id, driver_id), group in df.groupby(['raceId', 'driverId']):
            group = group.sort_values('lap')
            
            # Extract static features (assumed to be constant per driver per race)
            static = group[race_features.static_features].iloc[0].values
            static_features.append(static)
            
            # Extract dynamic features and target
            lap_times = group[race_features.target].values.reshape(-1, 1)  # Shape: (num_laps, 1)
            dynamic = group[race_features.dynamic_features].values  # Shape: (num_laps, num_dynamic_features)
            
            # Apply scaling
            # Note: Scalers should be fitted on the training data to prevent data leakage.
            # Here, for simplicity, we're fitting on the entire dataset. For a real-world scenario,
            # consider splitting the data first before fitting the scalers.
            dynamic_features_to_scale = [col for col in race_features.dynamic_features if col != 'tire_compound']
            tire_compounds = dynamic[:, race_features.dynamic_features.index('tire_compound')].reshape(-1, 1)
            other_dynamic = dynamic[:, [race_features.dynamic_features.index(col) for col in dynamic_features_to_scale]]
            
            lap_times_scaled = self.lap_time_scaler.fit_transform(lap_times).flatten()
            other_dynamic_scaled = self.dynamic_scaler.fit_transform(other_dynamic)
            static_scaled = self.static_scaler.fit_transform(static.reshape(1, -1)).flatten()
            
            dynamic_scaled = np.hstack((tire_compounds, other_dynamic_scaled))
            
            # Create sequences
            # Create sequences
        for i in range(len(lap_times_scaled) - window_size):
            sequence_lap_times = lap_times_scaled[i:i+window_size].reshape(-1, 1)  # Shape: (window_size, 1)
            sequence_dynamic = dynamic_scaled[i:i+window_size]  # Shape: (window_size, num_dynamic_features)
            sequence = np.hstack((sequence_lap_times, sequence_dynamic))  # Shape: (window_size, 1 + num_dynamic_features)
            sequences.append(sequence)
            static_features.append(static_scaled)
            targets.append(lap_times_scaled[i + window_size])
        
        return (np.array(sequences), 
                np.array(static_features), 
                np.array(targets))

    
    def create_train_val_loaders(
        self, 
        sequences: np.ndarray, 
        static_features: np.ndarray, 
        targets: np.ndarray,
        batch_size: int = 32,
        val_split: float = 0.2
    ) -> Tuple[DataLoader, DataLoader]:
        """
        Create train and validation dataloaders with given split ratio
        """
        dataset = F1Dataset(sequences, static_features, targets)
        
        # Calculate lengths for split
        val_size = int(len(dataset) * val_split)
        train_size = len(dataset) - val_size
        
        # Split dataset
        train_dataset, val_dataset = random_split(
            dataset, 
            [train_size, val_size],
            generator=torch.Generator().manual_seed(42)
        )
        
        # Create dataloaders
        train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True
        )
        
        val_loader = DataLoader(
            val_dataset,
            batch_size=batch_size,
            shuffle=False
        )
        
        return train_loader, val_loader

# Define the F1PredictionModel class
class F1PredictionModel(nn.Module):
    def __init__(self, 
                 sequence_dim: int,
                 static_dim: int,
                 hidden_dim: int = 64,
                 num_layers: int = 2):
        super().__init__()
        
        # LSTM for sequential features
        self.lstm = nn.LSTM(
            input_size=sequence_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True
        )
        
        # Static features processing
        self.static_network = nn.Sequential(
            nn.Linear(static_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        )
        
        # Combine everything
        self.final_network = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )
    
    def forward(self, sequence, static):
        # Process sequence through LSTM
        lstm_out, _ = self.lstm(sequence)
        lstm_out = lstm_out[:, -1, :]  # Take the output of the last time step
        
        # Process static features
        static_out = self.static_network(static)
        
        # Combine LSTM output and static features
        combined = torch.cat([lstm_out, static_out], dim=1)
        
        # Final prediction
        prediction = self.final_network(combined)
        
        return prediction.squeeze()

# Define the training function
def train_model(model: nn.Module, 
                train_loader: DataLoader,
                val_loader: DataLoader,
                epochs: int = 10,
                learning_rate: float = 0.001,
                lap_time_scaler: StandardScaler = None,  # Pass the lap time scaler
                device: Optional[str] = None) -> Dict[str, List[float]]:
    """
    Train the model and return training history including MAE in milliseconds
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()
    history = {'train_loss': [], 'val_loss': [], 'train_mae': [], 'val_mae': []}
    
    for epoch in range(epochs):
        # Training
        model.train()
        train_losses = []
        train_maes = []
        for batch in train_loader:
            sequences = batch['sequence'].to(device)
            static = batch['static'].to(device)
            targets = batch['target'].to(device)
            
            optimizer.zero_grad()
            predictions = model(sequences, static)
            loss = criterion(predictions, targets)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
            
            # Calculate MAE in normalized scale
            mae = torch.mean(torch.abs(predictions - targets)).item()
            train_maes.append(mae)
        
        # Validation
        model.eval()
        val_losses = []
        val_maes = []
        with torch.no_grad():
            for batch in val_loader:
                sequences = batch['sequence'].to(device)
                static = batch['static'].to(device)
                targets = batch['target'].to(device)
                
                predictions = model(sequences, static)
                loss = criterion(predictions, targets)
                val_losses.append(loss.item())
                
                # Calculate MAE in normalized scale
                mae_normalized = torch.mean(torch.abs(predictions - targets)).item()
                val_maes.append(mae_normalized)
        
        # Record metrics
        train_loss = np.mean(train_losses)
        val_loss = np.mean(val_losses)
        train_mae_normalized = np.mean(train_maes)
        val_mae_normalized = np.mean(val_maes)
        
        # Convert MAE back to milliseconds using the inverse scaler
        if lap_time_scaler:
            train_mae_ms = lap_time_scaler.inverse_transform([[train_mae_normalized]])[0][0]
            val_mae_ms = lap_time_scaler.inverse_transform([[val_mae_normalized]])[0][0]
        else:
            train_mae_ms, val_mae_ms = train_mae_normalized, val_mae_normalized
        
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['train_mae'].append(train_mae_ms)
        history['val_mae'].append(val_mae_ms)
        
        print(f'Epoch {epoch+1}/{epochs}:')
        print(f'Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Train MAE: {train_mae_ms:.2f} ms, Val MAE: {val_mae_ms:.2f} ms')
    
    return history



# Define a function to save the model
def save_model(model: nn.Module, path: str):
    torch.save(model.state_dict(), path)
    print(f"Model saved to {path}")

# Now, integrate your code snippets into data preprocessing
def load_and_preprocess_data() -> pd.DataFrame:
    """
    Load data from CSV files and preprocess it to create the enhanced_laps DataFrame.
    """
    # Load data
    na_values = ['\\N', 'NaN', '']
    lap_times = pd.read_csv('../../data/raw_data/lap_times.csv', na_values=na_values)
    drivers = pd.read_csv('../../data/raw_data/drivers.csv', na_values=na_values)
    races = pd.read_csv('../../data/raw_data/races.csv', na_values=na_values)
    circuits = pd.read_csv('../../data/raw_data/circuits.csv', na_values=na_values)
    pit_stops = pd.read_csv('../../data/raw_data/pit_stops.csv', na_values=na_values)
    pit_stops.rename(columns={'milliseconds' : 'pitstop_milliseconds'}, inplace=True)
    results = pd.read_csv('../../data/raw_data/results.csv', na_values=na_values)
    results.rename(columns={'milliseconds' : 'racetime_milliseconds'}, inplace=True)

    qualifying = pd.read_csv('../../data/raw_data/qualifying.csv', na_values=na_values)
    status = pd.read_csv('../../data/raw_data/status.csv', na_values=na_values)
    weather_data = pd.read_csv('../../data/raw_data/ff1_weather.csv', na_values=na_values)
    practice_sessions = pd.read_csv('../../data/raw_data/ff1_laps.csv', na_values=na_values)
    # Load the tire data
    tire_data = pd.read_csv('../../data/raw_data/ff1_laps.csv', na_values=na_values)

    
    # Convert date columns to datetime
    races['date'] = pd.to_datetime(races['date'])
    results['date'] = results['raceId'].map(races.set_index('raceId')['date'])
    lap_times['date'] = lap_times['raceId'].map(races.set_index('raceId')['date'])
    
    # Merge dataframes
    laps = lap_times.merge(drivers, on='driverId', how='left')
    print(laps.shape)
    laps = laps.merge(races, on='raceId', how='left', suffixes=('', '_race'))
    laps.rename(columns={'quali_time' : 'quali_date_time'}, inplace=True)
    print(laps.shape)
    laps = laps.merge(circuits, on='circuitId', how='left')
    print(laps.shape)
    laps = laps.merge(results[['raceId', 'driverId', 'positionOrder', 'grid', 'racetime_milliseconds', 'fastestLap', 'statusId']], on=['raceId', 'driverId'], how='left')
    print(laps.shape)
    laps = laps.merge(status, on='statusId', how='left')
    print(laps.shape)
    laps = laps.merge(pit_stops[['raceId', 'driverId', 'lap', 'pitstop_milliseconds']], on=['raceId', 'driverId', 'lap'], how='left')
    print(laps.shape)
    laps['pitstop_milliseconds'].fillna(0, inplace=True)  # Assuming 0 if no pit stop
    print(laps.shape)
    
    # Add weather information
    # Filter weather data to include only the Race session
    weather_data = weather_data[weather_data['SessionName'] == 'R']
    
    # Merge weather data with races to get raceId
    weather_data = weather_data.merge(
        races[['raceId', 'year', 'name']], 
        left_on=['EventName', 'Year'],
        right_on=['name', 'year'],
        how='left'
    )
    
    # Compute cumulative time from the start of the race for each driver
    laps.sort_values(['raceId', 'driverId', 'lap'], inplace=True)
    laps['cumulative_milliseconds'] = laps.groupby(['raceId', 'driverId'])['milliseconds'].cumsum()
    laps['seconds_from_start'] = laps['cumulative_milliseconds'] / 1000
    print(laps.shape)
    
    # Use 'Time' in weather_data as 'seconds_from_start'
    weather_data['seconds_from_start'] = weather_data['Time']
    
    # Standardize text data
    tire_data['Compound'] = tire_data['Compound'].str.upper()
    tire_data['EventName'] = tire_data['EventName'].str.strip().str.upper()
    races['name'] = races['name'].str.strip().str.upper()
    
    # Filter for race sessions only
    tire_data = tire_data[tire_data['SessionName'] == 'R']
    
    # Merge with races to get raceId
    tire_data = tire_data.merge(
        races[['raceId', 'year', 'name']],
        left_on=['Year', 'EventName'],
        right_on=['year', 'name'],
        how='left'
    )
    
    # Map driver codes to driverId
    tire_data['Driver'] = tire_data['Driver'].str.strip().str.upper()
    drivers['code'] = drivers['code'].str.strip().str.upper()
    driver_code_to_id = drivers.set_index('code')['driverId'].to_dict()
    tire_data['driverId'] = tire_data['Driver'].map(driver_code_to_id)
    
    # Rename 'LapNumber' to 'lap' and ensure integer type
    tire_data.rename(columns={'LapNumber': 'lap'}, inplace=True)
    tire_data['lap'] = tire_data['lap'].astype(int)
    laps['lap'] = laps['lap'].astype(int)
    
    # Create compound mapping (ordered from hardest to softest)
    compound_mapping = {
        'UNKNOWN': 0,
        'HARD': 1,
        'MEDIUM': 2,
        'SOFT': 3,
        'INTERMEDIATE': 4,
        'WET': 5
    }
    
    # Merge tire_data with laps
    laps = laps.merge(
        tire_data[['raceId', 'driverId', 'lap', 'Compound', 'TrackStatus']],
        on=['raceId', 'driverId', 'lap'],
        how='left'
    )
    
    laps.rename(columns={'TrackStatus' : 'track_status'} )
    
    # Handle missing compounds and apply numeric encoding
    laps['Compound'].fillna('UNKNOWN', inplace=True)
    laps['tire_compound'] = laps['Compound'].map(compound_mapping)
    
    # Drop the original Compound column if desired
    laps.drop('Compound', axis=1, inplace=True)
    
    # Standardize names
    practice_sessions['EventName'] = practice_sessions['EventName'].str.strip().str.upper()
    races['name'] = races['name'].str.strip().str.upper()
    
    # Merge practice_sessions with races to get raceId
    practice_sessions = practice_sessions.merge(
        races[['raceId', 'year', 'name']],
        left_on=['Year', 'EventName'],
        right_on=['year', 'name'],
        how='left'
    )
    
    # Map driver codes to driverId
    practice_sessions['Driver'] = practice_sessions['Driver'].str.strip().str.upper()
    drivers['code'] = drivers['code'].str.strip().str.upper()
    driver_code_to_id = drivers.set_index('code')['driverId'].to_dict()
    practice_sessions['driverId'] = practice_sessions['Driver'].map(driver_code_to_id)
    
    # Convert LapTime to milliseconds
    practice_sessions['LapTime_ms'] = practice_sessions['LapTime'].apply(lambda x: pd.to_timedelta(x).total_seconds() * 1000)
    
    # Calculate median lap times for each driver in each session
    session_medians = practice_sessions.groupby(['raceId', 'driverId', 'SessionName'])['LapTime_ms'].median().reset_index()
    
    # Pivot the data to have sessions as columns
    session_medians_pivot = session_medians.pivot_table(
        index=['raceId', 'driverId'],
        columns='SessionName',
        values='LapTime_ms'
    ).reset_index()
    
    # Rename columns for clarity
    session_medians_pivot.rename(columns={
        'FP1': 'fp1_median_time',
        'FP2': 'fp2_median_time',
        'FP3': 'fp3_median_time',
        'Q': 'quali_time'
    }, inplace=True)
    
    laps = laps.merge(
    session_medians_pivot,
    on=['raceId', 'driverId'],
    how='left'
    )
    
    # Fill missing practice times with global median or a placeholder value
    global_median_fp1 = laps['fp1_median_time'].median()
    laps['fp1_median_time'].fillna(global_median_fp1, inplace=True)
    
    # Repeat for other sessions
    global_median_fp2 = laps['fp2_median_time'].median()
    laps['fp2_median_time'].fillna(global_median_fp2, inplace=True)
    
    global_median_fp3 = laps['fp3_median_time'].median()
    laps['fp3_median_time'].fillna(global_median_fp3, inplace=True)
    
    global_median_quali = laps['quali_time'].median()
    laps['quali_time'].fillna(global_median_quali, inplace=True)

    
    # Create a binary indicator for pit stops
    laps['is_pit_lap'] = laps['pitstop_milliseconds'].apply(lambda x: 1 if x > 0 else 0)

    
    # Define a function to match weather data to laps
    def match_weather_to_lap(race_laps, race_weather):
        """
        For each lap, find the closest weather measurement in time
        """
        race_laps = race_laps.sort_values('seconds_from_start')
        race_weather = race_weather.sort_values('seconds_from_start')
        merged = pd.merge_asof(
            race_laps,
            race_weather,
            on='seconds_from_start',
            direction='nearest'
        )
        return merged

    # Apply matching per race
    matched_laps_list = []
    for race_id in laps['raceId'].unique():
        print(f'Matching for {race_id}')
        race_laps = laps[laps['raceId'] == race_id]
        race_weather = weather_data[weather_data['raceId'] == race_id]
        
        if not race_weather.empty:
            matched = match_weather_to_lap(race_laps, race_weather)
            print(f"Matched DataFrame shape: {matched.shape}")
            matched_laps_list.append(matched)
        else:
            matched_laps_list.append(race_laps)  # No weather data for this race

    # Concatenate all matched laps
    laps = pd.concat(matched_laps_list, ignore_index=True)
    print(laps.shape)
    
    # Fill missing weather data with default values
    laps['track_temp'] = laps['TrackTemp'].fillna(25.0)
    laps['air_temp'] = laps['AirTemp'].fillna(20.0)
    laps['humidity'] = laps['Humidity'].fillna(50.0)
    
    # Calculate driver aggression and skill
    # Create driver names
    drivers['driver_name'] = drivers['forename'] + ' ' + drivers['surname']
    driver_mapping = drivers[['driverId', 'driver_name']].copy()
    driver_mapping.set_index('driverId', inplace=True)
    driver_names = driver_mapping['driver_name'].to_dict()
    
    # Map statusId to status descriptions
    status_dict = status.set_index('statusId')['status'].to_dict()
    results['status'] = results['statusId'].map(status_dict)
    
    # Calculate driver aggression and skill
    def calculate_aggression(driver_results):
        if len(driver_results) == 0:
            return 0.5  # Default aggression for new drivers
        
        # Only consider recent races for more current behavior
        recent_results = driver_results.sort_values('date', ascending=False).head(20)
        
        # Calculate overtaking metrics
        positions_gained = recent_results['grid'] - recent_results['positionOrder']
        
        # Calculate risk metrics
        dnf_rate = (recent_results['status'] != 'Finished').mean()
        incidents = (recent_results['statusId'].isin([
            4,  # Collision
            5,  # Spun off
            6,  # Accident
            20, # Collision damage
            82, # Collision with another driver
        ])).mean()
        
        # Calculate overtaking success rate (normalized between 0-1)
        positive_overtakes = (positions_gained > 0).sum()
        negative_overtakes = (positions_gained < 0).sum()
        total_overtake_attempts = positive_overtakes + negative_overtakes
        overtake_success_rate = positive_overtakes / total_overtake_attempts if total_overtake_attempts > 0 else 0.5
        
        # Normalize average positions gained (0-1)
        avg_positions_gained = positions_gained[positions_gained > 0].mean() if len(positions_gained[positions_gained > 0]) > 0 else 0
        max_possible_gain = 20  # Maximum grid positions that could be gained
        normalized_gains = np.clip(avg_positions_gained / max_possible_gain, 0, 1)
        
        # Normalize risk factors (0-1)
        normalized_dnf = np.clip(dnf_rate, 0, 1)
        normalized_incidents = np.clip(incidents, 0, 1)
        
        # Calculate component scores (each between 0-1)
        overtaking_component = (normalized_gains * 0.6 + overtake_success_rate * 0.4)
        risk_component = (normalized_dnf * 0.5 + normalized_incidents * 0.5)
        
        # Combine components with weights (ensuring sum of weights = 1)
        weights = {
            'overtaking': 0.4,  # Aggressive overtaking
            'risk': 0.5,       # Risk-taking behavior
            'baseline': 0.1    # Baseline aggression
        }
        
        aggression = (
            overtaking_component * weights['overtaking'] +
            risk_component * weights['risk'] +
            0.5 * weights['baseline']  # Baseline aggression factor
        )
        
        # Add small random variation while maintaining 0-1 bounds
        variation = np.random.normal(0, 0.02)
        aggression = np.clip(aggression + variation, 0, 1)
        
        return aggression
    
    def calculate_skill(driver_data, results_data, circuit_id):
        driver_results = results_data[
            (results_data['driverId'] == driver_data['driverId']) & 
            (results_data['circuitId'] == circuit_id)
        ].sort_values('date', ascending=False).head(10)  # Use last 10 races at circuit
        
        if len(driver_results) == 0:
            return 0.5  # Default skill
        
        # Calculate performance metrics
        avg_finish_pos = driver_results['positionOrder'].mean()
        avg_quali_pos = driver_results['grid'].mean()
        points_per_race = driver_results['points'].mean()
        fastest_laps = (driver_results['rank'] == 1).mean()  # Add fastest lap consideration
        
        # Improved normalization (exponential decay for positions)
        normalized_finish_pos = np.exp(-avg_finish_pos/5) # Better spread of values
        normalized_quali_pos = np.exp(-avg_quali_pos/5)
        
        # Points normalization with improved scaling
        max_points_per_race = 26  # Maximum possible points (25 + 1 fastest lap)
        normalized_points = points_per_race / max_points_per_race
        
        # Weighted combination with more factors
        weights = {
            'finish': 0.35,
            'quali': 0.25,
            'points': 0.25,
            'fastest_laps': 0.15
        }
        
        skill = (
            weights['finish'] * normalized_finish_pos +
            weights['quali'] * normalized_quali_pos +
            weights['points'] * normalized_points +
            weights['fastest_laps'] * fastest_laps
        )
        
        # Add random variation to prevent identical skills
        skill = np.clip(skill + np.random.normal(0, 0.05), 0.1, 1.0)
        
        return skill
    
    # First merge results with races to get circuitId
    results = results.merge(
        races[['raceId', 'circuitId']], 
        on='raceId',
        how='left'
    )

    # Now calculate driver aggression and skill
    driver_aggression = {}
    driver_skill = {}
    for driver_id in drivers['driverId'].unique():
        driver_results = results[results['driverId'] == driver_id]
        aggression = calculate_aggression(driver_results)
        driver_aggression[driver_id] = aggression
        
        # Now we have circuit_id from the merge
        recent_race = driver_results.sort_values('date', ascending=False).head(1)
        if not recent_race.empty:
            circuit_id = recent_race['circuitId'].iloc[0]
            skill = calculate_skill({'driverId': driver_id}, results, circuit_id)
            driver_skill[driver_id] = skill
        else:
            driver_skill[driver_id] = 0.5  # Default skill for new drivers
    
    # Map calculated aggression and skill back to laps DataFrame
    laps['driver_aggression'] = laps['driverId'].map(driver_aggression)
    laps['driver_overall_skill'] = laps['driverId'].map(driver_skill)
    laps['driver_circuit_skill'] = laps['driver_overall_skill']  # For simplicity, using overall skill
    laps['driver_consistency'] = 0.5  # Placeholder
    laps['driver_reliability'] = 0.5  # Placeholder
    laps['driver_risk_taking'] = laps['driver_aggression']  # Assuming similar to aggression
    
    # Dynamic features
    laps['tire_age'] = laps.groupby(['raceId', 'driverId'])['lap'].cumcount()
    laps['fuel_load'] = laps.groupby(['raceId', 'driverId'])['lap'].transform(lambda x: x.max() - x + 1)
    laps['track_position'] = laps['position']  # Assuming 'position' is available in laps data
    
    # Ensure that all required columns are present
    # Create an instance of RaceFeatures
    race_features = RaceFeatures()
    
    # Ensure that all required columns are present
    required_columns = race_features.static_features + race_features.dynamic_features
    missing_columns = set(required_columns) - set(laps.columns)
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")
    
    # Drop rows with missing values in required columns
    laps = laps.dropna(subset=required_columns)
    
    print(laps.shape)
    
    return laps

# Update the main function
def main():
    # Load and preprocess data
    enhanced_laps = load_and_preprocess_data()
    
    # Save the preprocessed laps DataFrame for inspection
    enhanced_laps.to_csv('enhanced_laps_before_training.csv', index=False)
    print(enhanced_laps.shape)
    enhanced_laps.drop(columns=['position', 'time', 'driverRef', 'number', 'code', 'forename', 'surname', 'url_x', 'url_race', 'name_x', 'circuitRef', 'name_y', 'location', 'country', 'url_y', 'positionOrder', 'fastestLap', 'cumulative_milliseconds', 'seconds_from_start', 'raceId_x', 'year_x', 'Time', 'TrackTemp', 'AirTemp', 'Humidity', 'name', 'year_y', 'raceId_y'], inplace=True)
    
    print("Enhanced laps DataFrame saved to 'enhanced_laps_before_training.csv'")
    
    preprocessor = F1DataPreprocessor()
    sequences, static, targets = preprocessor.prepare_sequence_data(enhanced_laps, window_size=3)
    
    # Create train and validation loaders
    train_loader, val_loader = preprocessor.create_train_val_loaders(
        sequences, 
        static, 
        targets,
        batch_size=32,
        val_split=0.2
    )
    
    # Initialize model
    model = F1PredictionModel(
        sequence_dim=sequences.shape[2],
        static_dim=static.shape[1]
    )
    
    # Train the model
    history = train_model(model, train_loader, val_loader, epochs=20, learning_rate=0.001)
    
    # Save the trained model
    save_model(model, 'f1_prediction_model.pth')

if __name__ == "__main__":
    main()

  practice_sessions = pd.read_csv('../../data/raw_data/ff1_laps.csv', na_values=na_values)
  tire_data = pd.read_csv('../../data/raw_data/ff1_laps.csv', na_values=na_values)


(586171, 15)
(586171, 32)
(586171, 40)
(586171, 45)
(586171, 46)
(586171, 47)
(586171, 47)
(586171, 49)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  laps['pitstop_milliseconds'].fillna(0, inplace=True)  # Assuming 0 if no pit stop
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  laps['Compound'].fillna('UNKNOWN', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate 

Matching for 1
Matching for 2
Matching for 3
Matching for 4
Matching for 5
Matching for 6
Matching for 7
Matching for 8
Matching for 9
Matching for 10
Matching for 11
Matching for 12
Matching for 13
Matching for 14
Matching for 15
Matching for 16
Matching for 17
Matching for 18
Matching for 19
Matching for 20
Matching for 21
Matching for 22
Matching for 23
Matching for 24
Matching for 25
Matching for 26
Matching for 27
Matching for 28
Matching for 29
Matching for 30
Matching for 31
Matching for 32
Matching for 33
Matching for 34
Matching for 35
Matching for 36
Matching for 37
Matching for 38
Matching for 39
Matching for 40
Matching for 41
Matching for 42
Matching for 43
Matching for 44
Matching for 45
Matching for 46
Matching for 47
Matching for 48
Matching for 49
Matching for 50
Matching for 51
Matching for 52
Matching for 53
Matching for 54
Matching for 55
Matching for 56
Matching for 57
Matching for 58
Matching for 59
Matching for 60
Matching for 61
Matching for 62
Matching for 63
M

ValueError: Missing required columns: {'track_status'}

# Phase 1

# Phase 2

In [None]:
class RaceEffects:
    """Class to handle various race effects"""

    @staticmethod
    def calculate_tire_degradation(lap_number: int, stint_lap: int, compound: str = 'medium') -> float:
        """
        Calculate tire degradation effect
        stint_lap: Lap number since last pit stop
        """
        # Tire degradation factors for different compounds
        deg_factors = {
            'soft': 0.015,
            'medium': 0.010,
            'hard': 0.007
        }

        base_deg = deg_factors.get(compound, 0.010)
        return 1.0 + (base_deg * stint_lap)

    @staticmethod
    def calculate_fuel_effect(fuel_load: float) -> float:
        """Calculate time penalty due to fuel load"""
        # Approximate 0.3s per 10kg of fuel
        return 1.0 + (fuel_load * 0.03)

    @staticmethod
    def calculate_traffic_effect(position: int, gap_to_front: float) -> float:
        """Calculate effect of traffic"""
        if gap_to_front < 1.5:  # Within DRS range
            return 0.97  # 3% faster
        elif gap_to_front < 0.5:  # In dirty air
            return 1.03  # 3% slower
        return 1.0

    @staticmethod
    def calculate_weather_effect(temp_change: float, rain_intensity: float = 0.0) -> float:
        """Calculate weather effects on lap time"""
        # Temperature effect (optimal temp assumed to be 35°C)
        temp_effect = 1.0 + abs(temp_change - 35.0) * 0.002

        # Rain effect (0.0 to 1.0)
        rain_effect = 1.0 + (rain_intensity * 0.2)

        return temp_effect * rain_effect


def visualize_race_simulation(simulation_results: Dict[str, List[float]], pit_stops: List[int] = None):
    """Visualize race simulation results"""
    import matplotlib.pyplot as plt

    lap_numbers = list(range(1, len(simulation_results['predictions']) + 1))
    lap_times = [t / 1000 for t in simulation_results['predictions']]  # Convert to seconds
    uncertainties = simulation_results['uncertainties']

    plt.figure(figsize=(15, 8))

    # Plot lap times
    plt.plot(lap_numbers, lap_times, 'b-', label='Lap Times')

    # Plot uncertainty bands
    upper_bound = [t * (1 + u) for t, u in zip(lap_times, uncertainties)]
    lower_bound = [t * (1 - u) for t, u in zip(lap_times, uncertainties)]
    plt.fill_between(lap_numbers, lower_bound, upper_bound, alpha=0.2, color='blue')

    # Mark pit stops
    if pit_stops:
        for pit_lap in pit_stops:
            plt.axvline(x=pit_lap, color='r', linestyle='--', alpha=0.5)
            plt.text(pit_lap, min(lap_times), 'Pit Stop', rotation=90)

    plt.title('Race Simulation Results')
    plt.xlabel('Lap Number')
    plt.ylabel('Lap Time (seconds)')
    plt.grid(True)
    plt.legend()

    plt.tight_layout()
    plt.show()


# Update RacePredictionManager's simulation method
def simulate_race(self,
                  practice_data: pd.DataFrame,
                  static_features: np.ndarray,
                  n_laps: int,
                  pit_stop_laps: List[int] = None,
                  weather_forecast: Dict = None,
                  traffic_scenario: Dict = None) -> Dict[str, List[float]]:
    """
    Enhanced race simulation with weather and traffic effects
    """
    predictions = []
    uncertainties = []
    current_stint_lap = 0
    fuel_load = 100.0  # Initial fuel load in kg

    # Initialize sequence with practice data
    current_sequence = self._prepare_initial_sequence(practice_data)

    for lap in range(n_laps):
        # Update race conditions
        current_stint_lap = 0 if lap in (pit_stop_laps or []) else current_stint_lap + 1
        fuel_load = max(0, fuel_load - 2.0)  # Fuel consumption per lap

        # Calculate combined effects
        tire_effect = RaceEffects.calculate_tire_degradation(lap, current_stint_lap)
        fuel_effect = RaceEffects.calculate_fuel_effect(fuel_load)

        # Get weather effect if forecast provided
        weather_effect = 1.0
        if weather_forecast and lap in weather_forecast:
            weather_effect = RaceEffects.calculate_weather_effect(
                weather_forecast[lap]['temp'],
                weather_forecast[lap].get('rain', 0.0)
            )

        # Get traffic effect if scenario provided
        traffic_effect = 1.0
        if traffic_scenario and lap in traffic_scenario:
            traffic_effect = RaceEffects.calculate_traffic_effect(
                traffic_scenario[lap]['position'],
                traffic_scenario[lap]['gap_to_front']
            )

        # Make base prediction
        with torch.no_grad():
            base_prediction = self.model(
                torch.FloatTensor(current_sequence).unsqueeze(0),
                torch.FloatTensor(static_features).unsqueeze(0)
            )

        # Apply all effects
        predicted_time = self._denormalize_prediction(base_prediction.item())
        predicted_time *= (tire_effect * fuel_effect * weather_effect * traffic_effect)

        predictions.append(predicted_time)
        uncertainties.append(self._calculate_uncertainty(lap))

        # Update sequence for next prediction
        current_sequence = self._update_simulation_sequence(
            current_sequence,
            self.lap_time_scaler.transform([[predicted_time]])[0][0]
        )

    return {
        'predictions': predictions,
        'uncertainties': uncertainties
    }


class RacePredictionManager:
    def __init__(self, model: F1PredictionModel, window_size: int = 3):
        self.model = model
        self.window_size = window_size
        self.lap_buffer = []  # For storing recent laps in real-time mode
        self.predictions = []  # Store all predictions
        self.race_features = RaceFeatures()  # Initialize RaceFeatures

        # Scalers from training
        self.lap_time_scaler = None
        self.dynamic_scaler = None
        self.static_scaler = None

    def set_scalers(self, lap_time_scaler, dynamic_scaler, static_scaler):
        """Set the scalers used during training"""
        self.lap_time_scaler = lap_time_scaler
        self.dynamic_scaler = dynamic_scaler
        self.static_scaler = static_scaler

    def real_time_predict(self,
                          current_lap_data: dict,
                          static_features: np.ndarray) -> float:
        """
        Make real-time predictions during a race
        
        Args:
            current_lap_data: Dict containing lap time and dynamic features
            static_features: Array of static driver/circuit features
        """
        # Update lap buffer
        if len(self.lap_buffer) >= self.window_size:
            self.lap_buffer.pop(0)
        self.lap_buffer.append(current_lap_data)

        # If we don't have enough laps yet, use a simple baseline
        if len(self.lap_buffer) < self.window_size:
            return self._baseline_prediction(current_lap_data)

        # Prepare features
        sequence = self._prepare_sequence(self.lap_buffer)
        static = self._prepare_static(static_features)

        # Make prediction
        with torch.no_grad():
            prediction = self.model(
                torch.FloatTensor(sequence).unsqueeze(0),
                torch.FloatTensor(static).unsqueeze(0)
            )

        # Convert prediction back to milliseconds
        predicted_time = self._denormalize_prediction(prediction.item())
        self.predictions.append(predicted_time)

        return predicted_time

    def _prepare_sequence(self, lap_buffer: List[dict]) -> np.ndarray:
        """Prepare sequence data for model input"""
        sequence = []
        for lap in lap_buffer:
            lap_features = [
                lap['milliseconds'],  # Normalized lap time
                *[lap[feat] for feat in self.race_features.dynamic_features]
            ]
            sequence.append(lap_features)
        return np.array(sequence)

    def _prepare_static(self, static_features: np.ndarray) -> np.ndarray:
        """Prepare static features for model input"""
        return self.static_scaler.transform(static_features.reshape(1, -1)).flatten()

    def _baseline_prediction(self, current_lap_data: dict) -> float:
        """Simple baseline prediction for cold start"""
        return current_lap_data['milliseconds']

    def _denormalize_prediction(self, prediction: float) -> float:
        """Convert normalized prediction back to milliseconds"""
        return self.lap_time_scaler.inverse_transform([[prediction]])[0][0]

    def simulate_race(self,
                      practice_data: pd.DataFrame,
                      static_features: np.ndarray,
                      n_laps: int,
                      pit_stop_laps: List[int] = None) -> Dict[str, List[float]]:
        """
        Simulate an entire race
        
        Args:
            practice_data: DataFrame containing practice session data
            static_features: Array of static driver/circuit features
            n_laps: Number of laps to simulate
            pit_stop_laps: List of planned pit stop laps
        """
        predictions = []
        uncertainties = []

        # Initialize sequence with practice data
        initial_sequence = self._prepare_initial_sequence(practice_data)
        current_sequence = initial_sequence

        for lap in range(n_laps):
            # Adjust features for pit stops
            if pit_stop_laps and lap in pit_stop_laps:
                current_sequence = self._adjust_for_pit_stop(current_sequence)

            # Make prediction
            with torch.no_grad():
                prediction = self.model(
                    torch.FloatTensor(current_sequence).unsqueeze(0),
                    torch.FloatTensor(static_features).unsqueeze(0)
                )

            # Add uncertainty based on prediction horizon
            uncertainty = self._calculate_uncertainty(lap)

            # Convert prediction to milliseconds
            predicted_time = self._denormalize_prediction(prediction.item())

            predictions.append(predicted_time)
            uncertainties.append(uncertainty)

            # Update sequence for next prediction
            current_sequence = self._update_simulation_sequence(
                current_sequence, prediction.item()
            )

        return {
            'predictions': predictions,
            'uncertainties': uncertainties
        }

    def _prepare_initial_sequence(self, practice_data: pd.DataFrame) -> np.ndarray:
        """Prepare initial sequence from practice data"""
        if len(practice_data) < self.window_size:
            raise ValueError(f"Practice data must contain at least {self.window_size} laps")

        sequence = []
        for _, lap in practice_data.head(self.window_size).iterrows():
            lap_features = [
                lap['milliseconds'],  # Normalized lap time
                *[lap[feat] for feat in self.race_features.dynamic_features]
            ]
            sequence.append(lap_features)
        return np.array(sequence)

    def _adjust_for_pit_stop(self, sequence: np.ndarray) -> np.ndarray:
        """Adjust sequence features for pit stop"""
        sequence_copy = sequence.copy()
        sequence_copy[-1, 1] = 0  # Reset tire age
        sequence_copy[-1, 0] += 20000  # Add 20 seconds in milliseconds
        return sequence_copy

    def _update_simulation_sequence(self,
                                    sequence: np.ndarray,
                                    new_prediction: float) -> np.ndarray:
        """Update sequence with new prediction for simulation"""
        new_sequence = sequence.copy()
        new_sequence = np.roll(new_sequence, -1, axis=0)
        new_sequence[-1, 0] = new_prediction

        # Update dynamic features
        new_sequence[-1, 1:] = self._update_dynamic_features(new_sequence[-2, 1:])
        return new_sequence

    def _update_dynamic_features(self, previous_features: np.ndarray) -> np.ndarray:
        """Update dynamic features for next lap"""
        updated_features = previous_features.copy()
        updated_features[0] += 1  # Increment tire age
        updated_features[1] -= 1  # Decrease fuel load
        return updated_features

    def _calculate_uncertainty(self, lap_number: int) -> float:
        """Calculate prediction uncertainty based on lap number"""
        base_uncertainty = 0.01  # 1% base uncertainty
        horizon_factor = 1 + (lap_number * 0.001)  # Increase by 0.1% per lap
        return base_uncertainty * horizon_factor

In [None]:
import torch
import pandas as pd
import numpy as np
from typing import Dict, List


# Update RacePredictionManager's simulation method
def simulate_race(self,
                  practice_data: pd.DataFrame,
                  static_features: np.ndarray,
                  n_laps: int,
                  pit_stop_laps: List[int] = None,
                  weather_forecast: Dict = None,
                  traffic_scenario: Dict = None) -> Dict[str, List[float]]:
    """
    Enhanced race simulation with weather and traffic effects
    """
    predictions = []
    uncertainties = []
    current_stint_lap = 0
    fuel_load = 100.0  # Initial fuel load in kg

    # Initialize sequence with practice data
    current_sequence = self._prepare_initial_sequence(practice_data)

    for lap in range(n_laps):
        # Update race conditions
        current_stint_lap = 0 if lap in (pit_stop_laps or []) else current_stint_lap + 1
        fuel_load = max(0, fuel_load - 2.0)  # Fuel consumption per lap

        # Calculate combined effects
        tire_effect = RaceEffects.calculate_tire_degradation(lap, current_stint_lap)
        fuel_effect = RaceEffects.calculate_fuel_effect(fuel_load)

        # Get weather effect if forecast provided
        weather_effect = 1.0
        if weather_forecast and lap in weather_forecast:
            weather_effect = RaceEffects.calculate_weather_effect(
                weather_forecast[lap]['temp'],
                weather_forecast[lap].get('rain', 0.0)
            )

        # Get traffic effect if scenario provided
        traffic_effect = 1.0
        if traffic_scenario and lap in traffic_scenario:
            traffic_effect = RaceEffects.calculate_traffic_effect(
                traffic_scenario[lap]['position'],
                traffic_scenario[lap]['gap_to_front']
            )

        # Make base prediction
        with torch.no_grad():
            base_prediction = self.model(
                torch.FloatTensor(current_sequence).unsqueeze(0),
                torch.FloatTensor(static_features).unsqueeze(0)
            )

        # Apply all effects
        predicted_time = self._denormalize_prediction(base_prediction.item())
        predicted_time *= (tire_effect * fuel_effect * weather_effect * traffic_effect)

        predictions.append(predicted_time)
        uncertainties.append(self._calculate_uncertainty(lap))

        # Update sequence for next prediction
        current_sequence = self._update_simulation_sequence(
            current_sequence,
            self.lap_time_scaler.transform([[predicted_time]])[0][0]
        )

    return {
        'predictions': predictions,
        'uncertainties': uncertainties
    }


def predict_race():
    # Load the trained model and scalers
    model, scalers = load_model_with_scalers('f1_prediction_model_with_scalers.pth')
    model.eval()  # Set model to evaluation mode

    # Initialize race manager
    race_manager = RacePredictionManager(model, window_size=3)
    race_manager.set_scalers(
        scalers['lap_time_scaler'],
        scalers['dynamic_scaler'],
        scalers['static_scaler']
    )

    # Example static features for a driver
    driver_static_features = np.array([
        0.8,  # driver_overall_skill
        0.75,  # driver_circuit_skill
        0.85,  # driver_consistency
        0.9,  # driver_reliability
        0.7,  # driver_aggression
        0.65  # driver_risk_taking
    ])

    # 1. Real-time prediction example
    print("Real-time prediction example:")
    for lap in range(1, 6):  # Simulate first 5 laps
        # Simulate current lap data
        current_lap_data = {
            'milliseconds': 80000 + np.random.normal(0, 500),  # ~80 seconds with some variation
            'tire_age': lap,
            'fuel_load': 100 - (lap * 2),  # Decreasing fuel load
            'track_position': 5,  # Example position
            'track_temp': 35.0,
            'air_temp': 25.0,
            'humidity': 60.0
        }

        prediction = race_manager.real_time_predict(
            current_lap_data=current_lap_data,
            static_features=driver_static_features
        )

        print(f"Lap {lap} - Predicted time: {prediction / 1000:.3f} seconds")

    # 2. Race simulation example
    print("\nRace simulation example:")

    # Create example practice session data
    practice_data = pd.DataFrame({
        'milliseconds': [80500, 80300, 80100],  # Example lap times
        'tire_age': [1, 2, 3],
        'fuel_load': [98, 96, 94],
        'track_position': [5, 5, 5],
        'track_temp': [35.0, 35.0, 35.0],
        'air_temp': [25.0, 25.0, 25.0],
        'humidity': [60.0, 60.0, 60.0]
    })

    # Add weather and traffic scenarios
    weather_forecast = {
        10: {'temp': 40.0, 'rain': 0.0},  # Hot weather at lap 10
        20: {'temp': 38.0, 'rain': 0.3},  # Light rain at lap 20
    }

    traffic_scenario = {
        5: {'position': 2, 'gap_to_front': 1.2},  # DRS range
        15: {'position': 4, 'gap_to_front': 0.3},  # Dirty air
    }

    # Update race simulation call
    race_simulation = race_manager.simulate_race(
        practice_data=practice_data,
        static_features=driver_static_features,
        n_laps=50,
        pit_stop_laps=[15, 35],
        weather_forecast=weather_forecast,
        traffic_scenario=traffic_scenario
    )

    # Print simulation results
    print("\nRace simulation results:")
    print(f"Total laps simulated: {len(race_simulation['predictions'])}")
    print(f"Average lap time: {np.mean(race_simulation['predictions']) / 1000:.3f} seconds")
    print(f"Fastest lap: {min(race_simulation['predictions']) / 1000:.3f} seconds")
    print(f"Slowest lap: {max(race_simulation['predictions']) / 1000:.3f} seconds")

    # Print lap times around pit stops
    pit_stops = [15, 35]
    for pit_lap in pit_stops:
        print(f"\nLap times around pit stop at lap {pit_lap}:")
        start_lap = 0
        end_lap = len(race_simulation['predictions'])
        for lap in range(start_lap, end_lap):
            print(f"Lap {lap + 1}: {race_simulation['predictions'][lap] / 1000:.3f} seconds "
                  f"(Uncertainty: ±{race_simulation['uncertainties'][lap] * 100:.1f}%)")

    visualize_race_simulation(race_simulation, pit_stop_laps=[15, 35])


if __name__ == "__main__":
    predict_race()