# F1 Lap Times Data Processing and Feature Engineering

This notebook loads raw Formula 1 data, preprocesses it, and engineers new features related to driver performance attributes.

In [26]:
import pandas as pd
import numpy as np
from typing import Dict, Tuple

## Define NA Values and Load Data

In [27]:


print('Data Loaded Successfully')

Data Loaded Successfully


  practice_sessions = pd.read_csv('../../data/raw_data/ff1_free_practice.csv', na_values=na_values)
  preprocessed = pd.read_csv('../../data/processed/export_v1.csv', na_values=na_values)


## Inspect Lap Times Data

In [28]:
lap_times.head()

Unnamed: 0,raceId,driverId,lap,position,time,milliseconds
0,841,20,1,1,1:38.109,98109
1,841,20,2,1,1:33.006,93006
2,841,20,3,1,1:32.713,92713
3,841,20,4,1,1:32.803,92803
4,841,20,5,1,1:32.342,92342


## Merge Lap Times with Preprocessed Data

In [29]:
# Merge lap times with preprocessed data on 'raceId' and 'driverId'
data = lap_times.merge(preprocessed, on=['raceId', 'driverId'], how='left')

# Drop the 'time' column and rename 'milliseconds' to 'lap_time'
if 'time' in data.columns:
    data.drop(columns=['time'], inplace=True)
if 'milliseconds' in data.columns:
    data.rename(columns={'milliseconds': 'lap_time'}, inplace=True)
else:
    print('Column "milliseconds" not found in merged data.')

## Define Functions and Classes for Feature Engineering

In [30]:
def prepare_lap_data(lap_times_df: pd.DataFrame, races_df: pd.DataFrame, results_df: pd.DataFrame) -> pd.DataFrame:
    """
    Prepare lap times data by merging with races and results data.
    
    Parameters:
    -----------
    lap_times_df: DataFrame containing lap times
    races_df: DataFrame containing race information
    results_df: DataFrame containing race results
    
    Returns:
    --------
    DataFrame with merged lap times, race dates, and circuit information
    """
    # Get required columns from races
    race_info = races_df[['raceId', 'date', 'year', 'circuitId']].copy()
    
    # Merge lap times with race information
    enhanced_laps = lap_times_df.merge(
        race_info,
        on='raceId',
        how='left'
    )
    
    # Convert date to datetime
    enhanced_laps['date'] = pd.to_datetime(enhanced_laps['date'])
    
    return enhanced_laps

class LapAttributeCalculator:
    def __init__(self, lap_times_df: pd.DataFrame, races_df: pd.DataFrame, results_df: pd.DataFrame):
        """
        Initialize with required dataframes.
        """
        # Merge results with races to include 'date'
        if 'date' not in results_df.columns:
            print('Merging results with races to include "date" column.')
            results_df = results_df.merge(races_df[['raceId', 'date']], on='raceId', how='left')
            if 'date' not in results_df.columns:
                raise KeyError('After merging, "date" column is still missing in results_df.')
        else:
            results_df['date'] = pd.to_datetime(results_df['date'])
        
        self.results = results_df.copy()
        self.base_data = prepare_lap_data(lap_times_df, races_df, results_df)
        
    def add_driver_attributes(self, n_previous_races: int = 20) -> pd.DataFrame:
        """
        Add driver performance attributes to each lap.
        """
        enhanced_laps = self.base_data.copy()
        print(f"Starting with {len(enhanced_laps)} laps")
        
        # Ensure 'driverId' is present in base_data
        if 'driverId' not in enhanced_laps.columns:
            raise KeyError('driverId column is missing from the base data.')
            
        # Process each unique driver-race combination
        unique_combinations = enhanced_laps[['driverId', 'raceId', 'date', 'circuitId']].drop_duplicates()
        print(f"Processing {len(unique_combinations)} unique driver-race combinations")
        
        attributes_list = []
        for _, combo in unique_combinations.iterrows():
            # Get previous results for this driver up to this race
            previous_results = self.results[
                (self.results['driverId'] == combo['driverId']) &
                (self.results['date'] < combo['date'])
            ].sort_values('date', ascending=False).head(n_previous_races)
            
            # Calculate attributes
            attributes = {
                'raceId': combo['raceId'],
                'driverId': combo['driverId'],
                **self._calculate_driver_attributes(
                    previous_results,
                    combo['circuitId'],
                    combo['date']
                )
            }
            attributes_list.append(attributes)
        
        # Convert to DataFrame and merge with laps
        print("Creating attributes DataFrame...")
        attributes_df = pd.DataFrame(attributes_list)
        
        # Merge attributes with original lap data
        print("Merging attributes with lap data...")
        enhanced_laps = enhanced_laps.merge(
            attributes_df,
            on=['raceId', 'driverId'],
            how='left'
        )
        
        print(f"Final dataframe has {len(enhanced_laps)} rows and {len(enhanced_laps.columns)} columns")
        return enhanced_laps
    
    def _calculate_driver_attributes(self, previous_results: pd.DataFrame, 
                                   circuit_id: int, race_date: pd.Timestamp) -> Dict[str, float]:
        """Calculate all driver attributes based on previous races."""
        if previous_results.empty:
            return self._get_default_attributes()
        
        # Calculate attributes
        circuit_results = previous_results[previous_results['circuitId'] == circuit_id]
        
        attributes = {
            # Skill metrics
            'driver_overall_skill': self._calculate_skill(previous_results),
            'driver_circuit_skill': self._calculate_skill(circuit_results) if not circuit_results.empty else 0.5,
            
            # Race performance metrics
            'driver_consistency': self._calculate_consistency(previous_results),
            'driver_reliability': self._calculate_reliability(previous_results),
            'driver_aggression': self._calculate_aggression(previous_results),
            'driver_risk_taking': self._calculate_risk_taking(previous_results),
            
            # Position and points metrics
            'avg_finish_position': previous_results['positionOrder'].mean(),
            'avg_grid_position': previous_results['grid'].mean(),
            'points_per_race': previous_results['points'].mean(),
            
            # Overtaking metrics
            **self._calculate_overtaking_metrics(previous_results),
            
            # DNF and completion metrics
            **self._calculate_completion_metrics(previous_results),
            
            # Circuit specific performance
            **self._calculate_circuit_metrics(circuit_results)
        }
        
        return attributes
    
    def _calculate_overtaking_metrics(self, results: pd.DataFrame) -> Dict[str, float]:
        """Calculate overtaking-related metrics."""
        if results.empty:
            return {'overtakes_per_race': 0.5, 'overtake_success_rate': 0.5}
            
        positions_gained = results['grid'] - results['positionOrder']
        positive_overtakes = (positions_gained > 0).sum()
        total_overtakes = len(positions_gained[positions_gained != 0])
        
        return {
            'overtakes_per_race': positions_gained.mean(),
            'overtake_success_rate': positive_overtakes / total_overtakes if total_overtakes > 0 else 0.5
        }
    
    def _calculate_reliability(self, driver_results: pd.DataFrame) -> float:
        """Calculate driver reliability score."""
        if driver_results.empty:
            return 0.5
            
        # Calculate finish rate
        finish_rate = (driver_results['statusId'] == 1).mean()  # Status 1 = Finished
        
        # Calculate mechanical failure rate
        mechanical_status_ids = [2, 3, 4, 5, 6]  # Mechanical failures
        mechanical_failure_rate = (driver_results['statusId'].isin(mechanical_status_ids)).mean()
        
        # Combine metrics
        reliability = (finish_rate * 0.7 + (1 - mechanical_failure_rate) * 0.3)
        
        return np.clip(reliability, 0.5, 1.0)
    
    def _calculate_completion_metrics(self, results: pd.DataFrame) -> Dict[str, float]:
        """Calculate race completion metrics."""
        if results.empty:
            return {'race_completion_rate': 0.5, 'dnf_rate': 0.5}
            
        completion_rate = (results['statusId'] == 1).mean()
        dnf_rate = (results['statusId'] != 1).mean()
        
        return {
            'race_completion_rate': completion_rate,
            'dnf_rate': dnf_rate
        }
    
    def _calculate_circuit_metrics(self, circuit_results: pd.DataFrame) -> Dict[str, float]:
        """Calculate circuit-specific performance metrics."""
        if circuit_results.empty:
            return {
                'circuit_avg_position': 10.0,
                'circuit_points_average': 0.0,
                'circuit_completion_rate': 0.5
            }
            
        return {
            'circuit_avg_position': circuit_results['positionOrder'].mean(),
            'circuit_points_average': circuit_results['points'].mean(),
            'circuit_completion_rate': (circuit_results['statusId'] == 1).mean()
        }
    
    def _calculate_risk_taking(self, driver_results: pd.DataFrame) -> float:
        """Calculate driver risk-taking score."""
        if driver_results.empty:
            return 0.5
            
        # Calculate various risk metrics
        positions_gained = driver_results['grid'] - driver_results['positionOrder']
        big_gains = (positions_gained > 5).mean()  # Significant position improvements
        incident_rate = (driver_results['statusId'].isin([4, 5, 6, 20, 82])).mean()
        
        # Combine metrics
        risk_score = (big_gains * 0.6 + incident_rate * 0.4)
        
        return np.clip(risk_score, 0, 1)
    
    def _calculate_consistency(self, driver_results: pd.DataFrame) -> float:
        """Calculate driver consistency score."""
        if driver_results.empty:
            return 0.5
            
        # Calculate position variance
        pos_std = driver_results['positionOrder'].std()
        normalized_std = np.exp(-pos_std/5)  # Lower std = higher consistency
        
        # Calculate finish rate in points
        points_finish_rate = (driver_results['points'] > 0).mean()
        
        # Combine metrics
        consistency = (normalized_std * 0.6 + points_finish_rate * 0.4)
        
        return np.clip(consistency, 0, 1)
    
    def _calculate_aggression(self, driver_results: pd.DataFrame) -> float:
        """Calculate driver aggression score based on overtaking and incidents."""
        if driver_results.empty:
            return 0.5
            
        # Calculate overtaking metrics
        positions_gained = driver_results['grid'] - driver_results['positionOrder']
        positive_overtakes = (positions_gained > 0).sum()
        negative_overtakes = (positions_gained < 0).sum()
        total_overtake_attempts = positive_overtakes + negative_overtakes
        
        # Calculate incident rates
        incident_status_ids = [4, 5, 6, 20, 82]  # Collisions, accidents, etc.
        incident_rate = (driver_results['statusId'].isin(incident_status_ids)).mean()
        
        # Calculate components
        overtake_success = positive_overtakes / total_overtake_attempts if total_overtake_attempts > 0 else 0.5
        avg_positions_gained = positions_gained[positions_gained > 0].mean() if (positions_gained > 0).any() else 0
        
        # Normalize and combine
        normalized_gains = np.clip(avg_positions_gained / 20, 0, 1)  # 20 as max possible positions gained
        
        # Weight the components
        aggression = (
            normalized_gains * 0.4 +
            overtake_success * 0.3 +
            incident_rate * 0.3
        )
        
        return np.clip(aggression, 0, 1)
    
    def _get_default_attributes(self) -> Dict[str, float]:
        """Return default attributes for drivers with no previous races."""
        return {
            'driver_overall_skill': 0.5,
            'driver_circuit_skill': 0.5,
            'driver_consistency': 0.5,
            'driver_reliability': 0.5,
            'driver_aggression': 0.5,
            'driver_risk_taking': 0.5,
            'avg_finish_position': 10.0,
            'avg_grid_position': 10.0,
            'points_per_race': 0.0,
            'overtakes_per_race': 0.0,
            'overtake_success_rate': 0.5,
            'race_completion_rate': 0.5,
            'dnf_rate': 0.5,
            'circuit_avg_position': 10.0,
            'circuit_points_average': 0.0,
            'circuit_completion_rate': 0.5
        }
    
    def _calculate_skill(self, results: pd.DataFrame) -> float:
        """Calculate driver skill score."""
        if results.empty:
            return 0.5
            
        avg_finish_pos = results['positionOrder'].mean()
        avg_quali_pos = results['grid'].mean()
        points_per_race = results['points'].mean()
        
        # Normalize metrics
        norm_finish = np.exp(-avg_finish_pos/5)
        norm_quali = np.exp(-avg_quali_pos/5)
        norm_points = points_per_race / 26  # Assuming 26 is the maximum points per race
        
        skill = (norm_finish * 0.4 + norm_quali * 0.3 + norm_points * 0.3)
        return np.clip(skill, 0.1, 1.0)

## Initialize Calculator and Add Attributes

In [31]:
# Merge results with races to include 'date'
if 'date' not in results.columns:
    print('Merging results with races to include "date" column.')
    results = results.merge(races[['raceId', 'date', 'circuitId']], on='raceId', how='left')
    if 'date' not in results.columns:
        raise KeyError('After merging, "date" column is still missing in results.')
    if 'circuitId' not in results.columns:
         raise KeyError('After merging, "circuitId" column is still missing in results.')
else:
    results['date'] = pd.to_datetime(results['date'])

# Verify that 'date' column is present
print('Columns in results after merge:', results.columns.tolist())

# Initialize calculator
calculator = LapAttributeCalculator(
    lap_times_df=lap_times,
    races_df=races,
    results_df=results
)

# Add attributes to lap times
enhanced_laps = calculator.add_driver_attributes(n_previous_races=20)

Merging results with races to include "date" column.
Columns in results after merge: ['resultId', 'raceId', 'driverId', 'constructorId', 'number', 'grid', 'position', 'positionText', 'positionOrder', 'points', 'laps', 'time', 'milliseconds', 'fastestLap', 'rank', 'fastestLapTime', 'fastestLapSpeed', 'statusId', 'date', 'circuitId']
Starting with 586171 laps
Processing 10984 unique driver-race combinations
Creating attributes DataFrame...
Merging attributes with lap data...
Final dataframe has 586171 rows and 25 columns


## View New Features

In [32]:
# Identify new feature columns
new_features = [col for col in enhanced_laps.columns if col not in lap_times.columns]
print("New features added:")
print(new_features)

# Basic statistics of new features
print("\nNew features statistics:")
print(enhanced_laps[new_features].describe())

New features added:
['date', 'year', 'circuitId', 'driver_overall_skill', 'driver_circuit_skill', 'driver_consistency', 'driver_reliability', 'driver_aggression', 'driver_risk_taking', 'avg_finish_position', 'avg_grid_position', 'points_per_race', 'overtakes_per_race', 'overtake_success_rate', 'race_completion_rate', 'dnf_rate', 'circuit_avg_position', 'circuit_points_average', 'circuit_completion_rate']

New features statistics:
                                date           year      circuitId  \
count                         586171  586171.000000  586171.000000   
mean   2011-07-25 08:47:43.732255744    2011.021168      19.414688   
min              1996-03-10 00:00:00    1996.000000       1.000000   
25%              2004-08-29 00:00:00    2004.000000       7.000000   
50%              2012-03-18 00:00:00    2012.000000      13.000000   
75%              2018-07-29 00:00:00    2018.000000      21.000000   
max              2024-11-03 00:00:00    2024.000000      80.000000   
std   

In [33]:
enhanced_laps.drop(columns=['position', 'time'], axis=1, inplace=True)
enhanced_laps

Unnamed: 0,raceId,driverId,lap,milliseconds,date,year,circuitId,driver_overall_skill,driver_circuit_skill,driver_consistency,...,avg_finish_position,avg_grid_position,points_per_race,overtakes_per_race,overtake_success_rate,race_completion_rate,dnf_rate,circuit_avg_position,circuit_points_average,circuit_completion_rate
0,841,20,1,98109,2011-03-27,2011,1,0.483233,0.258969,0.483917,...,5.75,1.95,13.30,-3.8,0.312500,0.8,0.2,17.0,0.0,0.0
1,841,20,2,93006,2011-03-27,2011,1,0.483233,0.258969,0.483917,...,5.75,1.95,13.30,-3.8,0.312500,0.8,0.2,17.0,0.0,0.0
2,841,20,3,92713,2011-03-27,2011,1,0.483233,0.258969,0.483917,...,5.75,1.95,13.30,-3.8,0.312500,0.8,0.2,17.0,0.0,0.0
3,841,20,4,92803,2011-03-27,2011,1,0.483233,0.258969,0.483917,...,5.75,1.95,13.30,-3.8,0.312500,0.8,0.2,17.0,0.0,0.0
4,841,20,5,92342,2011-03-27,2011,1,0.483233,0.258969,0.483917,...,5.75,1.95,13.30,-3.8,0.312500,0.8,0.2,17.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586166,1141,815,65,82220,2024-11-03,2024,18,0.211130,0.500000,0.494178,...,8.65,7.95,6.85,-0.7,0.588235,0.7,0.3,10.0,0.0,0.5
586167,1141,815,66,82978,2024-11-03,2024,18,0.211130,0.500000,0.494178,...,8.65,7.95,6.85,-0.7,0.588235,0.7,0.3,10.0,0.0,0.5
586168,1141,815,67,82143,2024-11-03,2024,18,0.211130,0.500000,0.494178,...,8.65,7.95,6.85,-0.7,0.588235,0.7,0.3,10.0,0.0,0.5
586169,1141,815,68,82263,2024-11-03,2024,18,0.211130,0.500000,0.494178,...,8.65,7.95,6.85,-0.7,0.588235,0.7,0.3,10.0,0.0,0.5


In [34]:
enhanced_laps.dtypes

raceId                              int64
driverId                            int64
lap                                 int64
milliseconds                        int64
date                       datetime64[ns]
year                                int64
circuitId                           int64
driver_overall_skill              float64
driver_circuit_skill              float64
driver_consistency                float64
driver_reliability                float64
driver_aggression                 float64
driver_risk_taking                float64
avg_finish_position               float64
avg_grid_position                 float64
points_per_race                   float64
overtakes_per_race                float64
overtake_success_rate             float64
race_completion_rate              float64
dnf_rate                          float64
circuit_avg_position              float64
circuit_points_average            float64
circuit_completion_rate           float64
dtype: object

Phase 1


In [35]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, random_split, Dataset
from typing import List, Tuple, Dict, Optional
from dataclasses import dataclass, field
from sklearn.preprocessing import StandardScaler
import os

# Define the RaceFeatures dataclass
@dataclass
class RaceFeatures:
    """Data structure for race features"""
    static_features: List[str] = field(default_factory=lambda: [
        'driver_overall_skill', 'driver_circuit_skill', 'driver_consistency',
        'driver_reliability', 'driver_aggression', 'driver_risk_taking'
    ])
    
    dynamic_features: List[str] = field(default_factory=lambda: [
        'tire_age', 'fuel_load', 'track_position', 'track_temp',
        'air_temp', 'humidity'
    ])
    
    target: str = 'milliseconds'

# Define the F1Dataset class
class F1Dataset(Dataset):
    def __init__(self, sequences, static_features, targets):
        self.sequences = torch.FloatTensor(sequences)
        self.static_features = torch.FloatTensor(static_features)
        self.targets = torch.FloatTensor(targets)
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return {
            'sequence': self.sequences[idx],
            'static': self.static_features[idx],
            'target': self.targets[idx]
        }

# Define the F1DataPreprocessor class
class F1DataPreprocessor:
    def __init__(self):
        self.static_scaler = StandardScaler()
        self.dynamic_scaler = StandardScaler()
        self.lap_time_scaler = StandardScaler()
        
    def prepare_sequence_data(self, df: pd.DataFrame, window_size: int = 3) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Prepare sequential data with sliding window and apply scaling
        """
        sequences = []
        static_features = []
        targets = []
        
        # Sort the dataframe to ensure consistent ordering
        df = df.sort_values(['raceId', 'driverId', 'lap'])
        
        # Group by race and driver
        for (race_id, driver_id), group in df.groupby(['raceId', 'driverId']):
            group = group.sort_values('lap')
            
            # Extract static features (assumed to be constant per driver per race)
            static = group[RaceFeatures.static_features].iloc[0].values
            static_features.append(static)
            
            # Extract dynamic features and target
            lap_times = group[RaceFeatures.target].values.reshape(-1, 1)  # Shape: (num_laps, 1)
            dynamic = group[RaceFeatures.dynamic_features].values  # Shape: (num_laps, num_dynamic_features)
            
            # Apply scaling
            # Note: Scalers should be fitted on the training data to prevent data leakage.
            # Here, for simplicity, we're fitting on the entire dataset. For a real-world scenario,
            # consider splitting the data first before fitting the scalers.
            lap_times_scaled = self.lap_time_scaler.fit_transform(lap_times).flatten()
            dynamic_scaled = self.dynamic_scaler.fit_transform(dynamic)
            static_scaled = self.static_scaler.fit_transform(static.reshape(1, -1)).flatten()
            
            # Create sequences
            for i in range(len(lap_times_scaled) - window_size):
                sequence_lap_times = lap_times_scaled[i:i+window_size].reshape(-1, 1)  # Shape: (window_size, 1)
                sequence_dynamic = dynamic_scaled[i:i+window_size]  # Shape: (window_size, num_dynamic_features)
                sequence = np.hstack((sequence_lap_times, sequence_dynamic))  # Shape: (window_size, 1 + num_dynamic_features)
                sequences.append(sequence)
                static_features.append(static_scaled)
                targets.append(lap_times_scaled[i + window_size])
        
        return (np.array(sequences), 
                np.array(static_features), 
                np.array(targets))
    
    def create_train_val_loaders(
        self, 
        sequences: np.ndarray, 
        static_features: np.ndarray, 
        targets: np.ndarray,
        batch_size: int = 32,
        val_split: float = 0.2
    ) -> Tuple[DataLoader, DataLoader]:
        """
        Create train and validation dataloaders with given split ratio
        """
        dataset = F1Dataset(sequences, static_features, targets)
        
        # Calculate lengths for split
        val_size = int(len(dataset) * val_split)
        train_size = len(dataset) - val_size
        
        # Split dataset
        train_dataset, val_dataset = random_split(
            dataset, 
            [train_size, val_size],
            generator=torch.Generator().manual_seed(42)
        )
        
        # Create dataloaders
        train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True
        )
        
        val_loader = DataLoader(
            val_dataset,
            batch_size=batch_size,
            shuffle=False
        )
        
        return train_loader, val_loader

# Define the F1PredictionModel class
class F1PredictionModel(nn.Module):
    def __init__(self, 
                 sequence_dim: int,
                 static_dim: int,
                 hidden_dim: int = 64,
                 num_layers: int = 2):
        super().__init__()
        
        # LSTM for sequential features
        self.lstm = nn.LSTM(
            input_size=sequence_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True
        )
        
        # Static features processing
        self.static_network = nn.Sequential(
            nn.Linear(static_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        )
        
        # Combine everything
        self.final_network = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )
    
    def forward(self, sequence, static):
        # Process sequence through LSTM
        lstm_out, _ = self.lstm(sequence)
        lstm_out = lstm_out[:, -1, :]  # Take the output of the last time step
        
        # Process static features
        static_out = self.static_network(static)
        
        # Combine LSTM output and static features
        combined = torch.cat([lstm_out, static_out], dim=1)
        
        # Final prediction
        prediction = self.final_network(combined)
        
        return prediction.squeeze()

# Define the training function
def train_model(model: nn.Module, 
                train_loader: DataLoader,
                val_loader: DataLoader,
                epochs: int = 10,
                learning_rate: float = 0.001,
                device: Optional[str] = None) -> Dict[str, List[float]]:
    """
    Train the model and return training history
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()
    history = {'train_loss': [], 'val_loss': []}
    
    for epoch in range(epochs):
        # Training
        model.train()
        train_losses = []
        for batch in train_loader:
            sequences = batch['sequence'].to(device)
            static = batch['static'].to(device)
            targets = batch['target'].to(device)
            
            optimizer.zero_grad()
            predictions = model(sequences, static)
            loss = criterion(predictions, targets)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
        
        # Validation
        model.eval()
        val_losses = []
        with torch.no_grad():
            for batch in val_loader:
                sequences = batch['sequence'].to(device)
                static = batch['static'].to(device)
                targets = batch['target'].to(device)
                
                predictions = model(sequences, static)
                loss = criterion(predictions, targets)
                val_losses.append(loss.item())
        
        # Record losses
        train_loss = np.mean(train_losses)
        val_loss = np.mean(val_losses)
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        
        print(f'Epoch {epoch+1}/{epochs}:')
        print(f'Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
    
    return history

# Define a function to save the model
def save_model(model: nn.Module, path: str):
    torch.save(model.state_dict(), path)
    print(f"Model saved to {path}")

# New function to load and preprocess the data
def load_and_preprocess_data() -> pd.DataFrame:
    """
    Load data from CSV files and preprocess it to create the enhanced_laps DataFrame.
    """
    # Load data
    # Define NA values
    na_values = ['\\N']
    
    # Load Data
    circuits = pd.read_csv('../../data/raw_data/circuits.csv', na_values=na_values)
    constructors = pd.read_csv('../../data/raw_data/constructors.csv', na_values=na_values)
    drivers = pd.read_csv('../../data/raw_data/drivers.csv', na_values=na_values)
    races = pd.read_csv('../../data/raw_data/races.csv', na_values=na_values)
    results = pd.read_csv('../../data/raw_data/results.csv', na_values=na_values)
    lap_times = pd.read_csv('../../data/raw_data/lap_times.csv', na_values=na_values)
    pit_stops = pd.read_csv('../../data/raw_data/pit_stops.csv', na_values=na_values)
    qualifying = pd.read_csv('../../data/raw_data/qualifying.csv', na_values=na_values)
    status = pd.read_csv('../../data/raw_data/status.csv', na_values=na_values)
    weather_data = pd.read_csv('../../data/raw_data/ff1_weather.csv', na_values=na_values)
    practice_sessions = pd.read_csv('../../data/raw_data/ff1_free_practice.csv', na_values=na_values)
    
    preprocessed = pd.read_csv('../../data/processed/export_v1.csv', na_values=na_values)

    # Merge dataframes
    laps = lap_times.merge(drivers, on='driverId', how='left')
    laps = laps.merge(races, on='raceId', how='left')
    laps = laps.merge(circuits, on='circuitId', how='left')

    # Add pit stop information
    laps = laps.merge(pit_stops[['raceId', 'driverId', 'lap', 'duration']], on=['raceId', 'driverId', 'lap'], how='left')
    laps['duration'].fillna(0, inplace=True)  # Assuming 0 if no pit stop

    # Add weather information
    # This is a placeholder; you'll need to match your actual weather data
    laps = laps.merge(weather_data, on=['raceId', 'lap'], how='left')

    # Feature Engineering
    laps['tire_age'] = laps.groupby(['raceId', 'driverId'])['lap'].cumcount()
    laps['fuel_load'] = laps.groupby(['raceId', 'driverId'])['lap'].apply(lambda x: x.max() - x + 1)

    # For simplicity, we can assign dummy values to static features
    # In a real scenario, you should compute these based on historical data
    laps['driver_overall_skill'] = 1.0  # Placeholder
    laps['driver_circuit_skill'] = 1.0  # Placeholder
    laps['driver_consistency'] = 1.0    # Placeholder
    laps['driver_reliability'] = 1.0    # Placeholder
    laps['driver_aggression'] = 1.0     # Placeholder
    laps['driver_risk_taking'] = 1.0    # Placeholder

    # Dynamic features (assuming you have these in your weather_data)
    # If not, assign dummy values or extract from available data
    laps['track_temp'] = laps['track_temp'].fillna(25.0)  # Placeholder
    laps['air_temp'] = laps['air_temp'].fillna(20.0)      # Placeholder
    laps['humidity'] = laps['humidity'].fillna(50.0)      # Placeholder

    # Ensure that all required columns are present
    required_columns = RaceFeatures.static_features + RaceFeatures.dynamic_features + [RaceFeatures.target]
    missing_columns = set(required_columns) - set(laps.columns)
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")

    # Drop rows with missing values in required columns
    laps = laps.dropna(subset=required_columns)

    return laps

# Update the main function
def main():
    # Load and preprocess data
    enhanced_laps = load_and_preprocess_data()
    
    preprocessor = F1DataPreprocessor()
    sequences, static, targets = preprocessor.prepare_sequence_data(enhanced_laps, window_size=3)
    
    # Create train and validation loaders
    train_loader, val_loader = preprocessor.create_train_val_loaders(
        sequences, 
        static, 
        targets,
        batch_size=32,
        val_split=0.2
    )
    
    # Initialize model
    model = F1PredictionModel(
        sequence_dim=sequences.shape[2],
        static_dim=static.shape[1]
    )
    
    # Train the model
    history = train_model(model, train_loader, val_loader, epochs=20, learning_rate=0.001)
    
    # Save the trained model
    save_model(model, 'f1_prediction_model.pth')

if __name__ == "__main__":
    main()


AttributeError: type object 'RaceFeatures' has no attribute 'static_features'