In [1]:
import sys, os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
sys.path.append('../')
import datetime
from models.xgboost.xgboost import F1XGBoostPredictor

In [2]:
# Load the preprocessed race data that the predictor is built from.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load 'processed_race_data.pkl' if it comes from a trusted pipeline.
with open('processed_race_data.pkl', 'rb') as f:
    processed_data = pickle.load(f)

# Build the predictor and run the hyper-parameter search.
# NOTE(review): the trial log below shows roughly 1.5-2 s per trial, so 250
# trials make this the slow cell on a full "Restart & Run All".
predictor = F1XGBoostPredictor(processed_data)
study = predictor.optimize(n_trials=250)

# Fit the model, passing the finished study so the predictor can use its result
# (presumably the best trial's parameters - confirm in F1XGBoostPredictor.train).
predictor.train(study)

# Report test-set error. The values are printed with an "ms" suffix;
# presumably lap times are in milliseconds - verify against evaluate().
metrics = predictor.evaluate()
print(f"Test RMSE: {metrics['rmse']:.2f} ms")
print(f"Test MAE: {metrics['mae']:.2f} ms")

# Persist the trained model next to the notebook (relative path).
predictor.save_model('f1_xgboost_model.json')

[I 2024-12-08 11:43:23,324] A new study created in memory with name: no-name-9c18c3ea-7bdc-4234-b1a5-759a3ac24bc1
[I 2024-12-08 11:43:25,181] Trial 0 finished with value: 6909.208211597503 and parameters: {'max_depth': 4, 'learning_rate': 0.0015485890196237778, 'n_estimators': 578, 'min_child_weight': 1, 'subsample': 0.689507634685872, 'colsample_bytree': 0.8775055431411992, 'reg_alpha': 0.005009544984592522, 'reg_lambda': 4.800552887262388e-06}. Best is trial 0 with value: 6909.208211597503.
[I 2024-12-08 11:43:27,059] Trial 1 finished with value: 4671.194747361591 and parameters: {'max_depth': 3, 'learning_rate': 0.025981542680047296, 'n_estimators': 777, 'min_child_weight': 2, 'subsample': 0.8754294109962378, 'colsample_bytree': 0.9081774592003278, 'reg_alpha': 6.422989460626791e-05, 'reg_lambda': 1.7001814628149523e-07}. Best is trial 1 with value: 4671.194747361591.
[I 2024-12-08 11:43:28,580] Trial 2 finished with value: 6649.309586137278 and parameters: {'max_depth': 4, 'learnin

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr
from typing import Dict, List, Tuple

def _abs_spearman(x, y) -> float:
    """
    Absolute Spearman rank correlation between two equally long samples.

    Returns NaN without calling SciPy when the coefficient is undefined
    (fewer than two observations, or either sample is constant), so callers
    decide how to treat "no signal" and no SciPy warning is triggered.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if len(x) < 2 or np.all(x == x[0]) or np.all(y == y[0]):
        return float('nan')
    rho, _ = spearmanr(x, y)
    return float(abs(rho))


def analyze_feature_importance(predictor, processed_data: Dict, n_splits: int = 5) -> pd.DataFrame:
    """
    Comprehensive feature importance analysis using multiple methods.

    Three per-feature metrics are computed on the training split and blended
    into one score:

    * ``xgb_importance`` - ``predictor.model.feature_importances_``
    * ``correlation``    - absolute Spearman correlation with the target
    * ``stability``      - ``1 - std`` of the absolute Spearman correlation
      measured on ``n_splits`` contiguous chunks of the training rows

    Args:
        predictor: Trained predictor exposing ``static_features``,
            ``dynamic_features`` and ``model.feature_importances_``.
        processed_data: Dictionary whose ``['train']['features']`` is a 2-D
            array (rows x features) and ``['train']['targets']`` the matching
            targets (assumed 1-D - TODO confirm).
        n_splits: Number of contiguous row chunks used for the stability
            metric. Defaults to 5.

    Returns:
        DataFrame with columns ``feature``, ``xgb_importance``, ``correlation``,
        ``stability`` and ``combined_score``, sorted by ``combined_score``
        in descending order.

    Raises:
        ValueError: If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError("n_splits must be at least 1")

    # Assumes the feature matrix columns are ordered static-then-dynamic,
    # matching this concatenation - TODO confirm against the data pipeline.
    feature_names = predictor.static_features + predictor.dynamic_features

    # Get feature importance from XGBoost
    xgb_importance = predictor.model.feature_importances_

    features = np.asarray(processed_data['train']['features'])
    targets = np.asarray(processed_data['train']['targets'])
    n_features = features.shape[1]

    # Whole-sample correlation. An undefined coefficient (e.g. a constant
    # column) is scored 0.0: the feature carries no rank signal, and a NaN
    # here would propagate into combined_score and break ranking/thresholding.
    correlations = []
    for col in range(n_features):
        rho = _abs_spearman(features[:, col], targets)
        correlations.append(0.0 if np.isnan(rho) else rho)

    # Stability: spread of the correlation across contiguous chunks.
    # array_split covers every row (no dropped tail) and tolerates having
    # fewer rows than chunks by producing empty chunks.
    feature_chunks = np.array_split(features, n_splits)
    target_chunks = np.array_split(targets, n_splits)

    stability_scores = []
    for col in range(n_features):
        chunk_rhos = np.array([
            _abs_spearman(chunk[:, col], chunk_targets)
            for chunk, chunk_targets in zip(feature_chunks, target_chunks)
        ])
        defined = chunk_rhos[~np.isnan(chunk_rhos)]
        # With no chunk yielding a defined correlation there is no evidence
        # of a stable relationship, so the feature gets the lowest score.
        stability_scores.append(float(1 - np.std(defined)) if defined.size else 0.0)

    # Blend weights: model importance 40%, correlation 40%, stability 20%.
    w_xgb, w_corr, w_stab = 0.4, 0.4, 0.2

    columns = ['feature', 'xgb_importance', 'correlation', 'stability', 'combined_score']
    feature_metrics = []
    for i, feature_name in enumerate(feature_names):
        feature_metrics.append({
            'feature': feature_name,
            'xgb_importance': xgb_importance[i],
            'correlation': correlations[i],
            'stability': stability_scores[i],
            'combined_score': (
                w_xgb * xgb_importance[i] +
                w_corr * correlations[i] +
                w_stab * stability_scores[i]
            )
        })

    # Explicit columns keep the frame sortable even when there are no features.
    return pd.DataFrame(feature_metrics, columns=columns).sort_values(
        'combined_score', ascending=False
    )

def plot_feature_importance(importance_df: pd.DataFrame, top_n: int = 20):
    """
    Create visualizations for feature importance analysis.

    Draws a two-panel figure: a horizontal bar chart of the highest-ranked
    features by ``combined_score`` and a heatmap of the pairwise correlation
    between the individual importance metrics (computed over all rows, not
    just the displayed ones).

    Args:
        importance_df: DataFrame containing feature importance metrics with
            columns ``feature``, ``combined_score``, ``xgb_importance``,
            ``correlation`` and ``stability``; expected to be pre-sorted by
            ``combined_score`` because the first ``top_n`` rows are plotted.
        top_n: Maximum number of top features to display
    """
    # Explicit figure/axes instead of the pyplot state machine, so each
    # plotting call names the panel it draws on.
    fig, (ax_bars, ax_heat) = plt.subplots(2, 1, figsize=(15, 10))

    # Plot top features by combined score
    top_features = importance_df.head(top_n)
    sns.barplot(data=top_features, x='combined_score', y='feature', ax=ax_bars)
    # Report how many bars are actually shown; the frame may hold fewer than top_n rows.
    ax_bars.set_title(f'Top {len(top_features)} Features by Combined Importance Score')
    ax_bars.set_xlabel('Combined Importance Score')

    # Plot correlation between different importance metrics
    importance_metrics = ['xgb_importance', 'correlation', 'stability']
    correlation_matrix = importance_df[importance_metrics].corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax_heat)
    ax_heat.set_title('Correlation between Importance Metrics')

    fig.tight_layout()
    plt.show()

def get_feature_recommendations(importance_df: pd.DataFrame, 
                              threshold: float = 0.01) -> Tuple[List[str], List[str]]:
    """
    Provide recommendations for feature selection.

    A feature is kept when its ``combined_score`` exceeds ``threshold`` or
    its ``correlation`` exceeds ``2 * threshold``. Every other feature is
    returned as a removal candidate, including features whose scores are NaN,
    so each input row lands in exactly one of the two lists.

    Args:
        importance_df: DataFrame containing feature importance metrics
            (columns ``feature``, ``combined_score`` and ``correlation``)
        threshold: Minimum importance threshold for keeping features

    Returns:
        Tuple containing (features to keep, features to consider removing),
        each in the row order of ``importance_df``
    """
    # Comparisons against NaN are False, so NaN-scored rows fall out of the
    # keep mask; taking the complement (instead of re-testing with <=) makes
    # sure they are reported for removal rather than silently dropped.
    keep_mask = (
        (importance_df['combined_score'] > threshold) |
        (importance_df['correlation'] > threshold * 2)
    )

    keep_features = importance_df.loc[keep_mask, 'feature'].tolist()
    remove_features = importance_df.loc[~keep_mask, 'feature'].tolist()

    return keep_features, remove_features

# Example usage - needs the trained `predictor` and `processed_data` from the cells above.

# Rank the features, then visualise the ranking
importance_df = analyze_feature_importance(predictor, processed_data)
plot_feature_importance(importance_df)

# Split the ranked features into keep / drop candidates
keep_features, remove_features = get_feature_recommendations(importance_df)

# Each report is a heading line followed by one "- name" bullet per feature
print(
    "\nRecommended features to keep:",
    "\n".join([f"- {feature}" for feature in keep_features]),
    sep="\n",
)
print(
    "\nFeatures to consider removing:",
    "\n".join([f"- {feature}" for feature in remove_features]),
    sep="\n",
)


In [None]:
def validate_physical_patterns(predictions, actual, metadata):
    """
    Validate if predictions follow expected F1 physics patterns.

    For every (raceId, driverId) group in ``metadata`` the mean prediction of
    the group's last 10 rows is compared with the mean of its first 10 rows.
    Groups with fewer than 20 rows therefore have overlapping windows.

    Args:
        predictions: Predicted lap times, indexable by the index labels of
            ``metadata`` (assumes ``metadata`` has a positional 0..n-1 index
            aligned with ``predictions`` - TODO confirm).
        actual: Actual lap times. Currently unused; kept for interface
            compatibility.
        metadata: DataFrame with ``raceId`` and ``driverId`` columns, one row
            per prediction.

    Returns:
        ``DataFrame.describe()`` summary with columns ``fuel_effect``,
        ``tire_effect`` and ``combined_effect``. Only ``combined_effect`` is
        computed; the other two are all-NaN placeholders (count 0).
    """
    combined_effect = []
    for _, group in metadata.groupby(['raceId', 'driverId']):
        # Calculate lap time trends
        early_laps = predictions[group.index[:10]].mean()
        late_laps = predictions[group.index[-10:]].mean()

        # Net effect should be faster laps (negative delta)
        combined_effect.append(late_laps - early_laps)

    # The fuel/tire components are not derived yet. Padding them with NaN
    # keeps all columns the same length; leaving them as empty lists made the
    # DataFrame constructor fail as soon as one group had been processed.
    not_computed = [np.nan] * len(combined_effect)
    patterns = pd.DataFrame(
        {
            'fuel_effect': not_computed,
            'tire_effect': not_computed,
            'combined_effect': combined_effect,
        },
        dtype=float,
    )

    return patterns.describe()

In [None]:
def analyze_lap_sequences(predictions, actual, metadata):
    """
    Analyze if predicted lap time sequences match F1 patterns.

    Fits a straight line to the actual and to the predicted lap times of each
    stint (rows sharing ``raceId``, ``driverId`` and ``stint``) and reports
    the two slopes side by side.

    Args:
        predictions: Predicted lap times, indexable by the index labels of
            ``metadata`` (assumes a positional 0..n-1 index - TODO confirm).
        actual: Actual lap times, indexed the same way as ``predictions``.
        metadata: DataFrame with ``raceId``, ``driverId`` and ``stint``
            columns, one row per lap.

    Returns:
        DataFrame with one row per stint and columns ``actual_trend``,
        ``predicted_trend`` (slope per lap within the stint) and
        ``stint_length``. Slopes are NaN for stints shorter than two laps.
        The columns are present even when ``metadata`` has no rows.
    """
    columns = ['actual_trend', 'predicted_trend', 'stint_length']

    # Group by stint (periods between pit stops)
    stints = metadata.groupby(['raceId', 'driverId', 'stint'])

    stint_patterns = []
    for _, stint in stints:
        n_laps = len(stint)
        if n_laps < 2:
            # A slope needs at least two points; fitting a line through the
            # single x value 0 is degenerate, so report "undefined" instead.
            actual_trend = pred_trend = float('nan')
        else:
            # Calculate trend lines (index 0 of the polyfit result is the slope)
            lap_numbers = np.arange(n_laps)
            actual_trend = np.polyfit(lap_numbers, actual[stint.index], 1)[0]
            pred_trend = np.polyfit(lap_numbers, predictions[stint.index], 1)[0]

        stint_patterns.append({
            'actual_trend': actual_trend,
            'predicted_trend': pred_trend,
            'stint_length': n_laps
        })

    return pd.DataFrame(stint_patterns, columns=columns)