In [2]:

import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.base import clone
from xgboost import XGBRegressor, plot_importance
from sklearn.model_selection import TimeSeriesSplit
import joblib
import matplotlib.pyplot as plt
import os

# Set environment variable to prevent XGBoost device mismatch warning
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# /kaggle/input/burnout-datathon-ieeecsmuj/test.csv
# Load data
train = pd.read_csv('/kaggle/input/burnout-datathon-ieeecsmuj/train.csv')
val = pd.read_csv('/kaggle/input/burnout-datathon-ieeecsmuj/val.csv')
test = pd.read_csv('/kaggle/input/burnout-datathon-ieeecsmuj/test.csv')

# Feature engineering with improved safety
def create_features(df):
    # Temporal features
    df['season_progress'] = df['year_x'] - df['min_year']
    df['career_phase'] = (df['year_x'] - df['min_year']) / (df['years_active'].replace(0, np.nan) + 1e-6)
    
    # Team performance metrics
    df['team_success_rate'] = df['podiums'] / (df['starts'].replace(0, np.nan) + 1e-6)
    df['team_reliability'] = df['finishes'] / (df['starts'].replace(0, np.nan) + 1e-6)
    
    # Tire dynamics
    df['tire_wear_ratio'] = df['Tire_Degradation_Factor_per_Lap'] * df['Laps']
    df['compound_aggressiveness'] = np.where(
        df['Tire_Compound_Front'] == 'Soft', 1.2,
        np.where(df['Tire_Compound_Front'] == 'Medium', 1.0, 0.8)
    )
    
    # Track difficulty
    df['speed_load'] = df['Avg_Speed_kmh'] / (df['Circuit_Length_km'].replace(0, np.nan) + 1e-6)
    df['corner_intensity'] = df['Corners_per_Lap'] / (df['Circuit_Length_km'].replace(0, np.nan) + 1e-6)
    
    # Session progression
    session_order = {'FP1':1, 'FP2':2, 'FP3':3, 'FP4':4, 
                    'Qualifying':5, 'Sprint':6, 'Race':7}
    df['session_importance'] = df['Session'].map(session_order).fillna(0)
    
    # List of all potential numerical columns
    numerical_cols = ['Circuit_Length_km', 'Laps', 'Avg_Speed_kmh', 
                     'Tire_Degradation_Factor_per_Lap', 'Ambient_Temperature_Celsius',
                     'Humidity_%', 'points', 'Championship_Points', 'Corners_per_Lap',
                     'Pit_Stop_Duration_Seconds', 'Track_Temperature_Celsius', 'air',
                     'ground', 'starts', 'finishes', 'with_points', 'podiums', 'wins',
                     'min_year', 'max_year', 'years_active', 'year_x',
                     'season_progress', 'career_phase', 'team_success_rate', 
                     'team_reliability', 'tire_wear_ratio', 'compound_aggressiveness',
                     'speed_load', 'corner_intensity', 'session_importance']
    
    # Convert to numeric and fill NaN with 0
    for col in numerical_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
            
    return df

# Apply feature engineering
train = create_features(train)
val = create_features(val)
test = create_features(test)

# Define feature groups
high_cardinality = ['rider_name', 'circuit_name', 'team_name', 'shortname', 'bike_name']
ordinal_features = ['Tire_Compound_Front', 'Tire_Compound_Rear', 
                   'Grid_Position', 'Championship_Position']
nominal_features = ['category_x', 'Track_Condition', 'Penalty', 
                   'weather', 'track', 'Session']

numerical_features = [
    'Circuit_Length_km', 'Laps', 'Avg_Speed_kmh', 'Tire_Degradation_Factor_per_Lap',
    'season_progress', 'career_phase', 'team_success_rate', 'team_reliability',
    'tire_wear_ratio', 'compound_aggressiveness', 'speed_load', 'corner_intensity',
    'session_importance', 'Ambient_Temperature_Celsius', 'Humidity_%', 'points',
    'Championship_Points', 'Corners_per_Lap', 'Pit_Stop_Duration_Seconds',
    'Track_Temperature_Celsius', 'air', 'ground', 'starts', 'finishes', 'with_points',
    'podiums', 'wins', 'min_year', 'max_year', 'years_active', 'year_x'
]

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('target_enc', OrdinalEncoder(
        handle_unknown='use_encoded_value', 
        unknown_value=-1
    ), high_cardinality),
    ('ordinal', OrdinalEncoder(
        categories=[
            ['Hard','Medium','Soft'], 
            ['Hard','Medium','Soft'],
            list(range(1, 26)),  # Grid positions
            list(range(1, 31))   # Championship positions
        ],
        handle_unknown='use_encoded_value',
        unknown_value=-1
    ), ordinal_features),
    ('onehot', OneHotEncoder(handle_unknown='ignore'), nominal_features),
    ('num', 'passthrough', numerical_features)
])

# Prepare data
features_used = high_cardinality + ordinal_features + nominal_features + numerical_features
X_train = train[features_used]
y_train = train['Lap_Time_Seconds']
X_val = val[features_used]
y_val = val['Lap_Time_Seconds']
X_test = test[features_used]

# Combine train and val for final training
X_train_val = pd.concat([X_train, X_val])
y_train_val = pd.concat([y_train, y_val])

# Preprocess all data at once
preprocessor.fit(X_train_val)
X_train_val_trans = preprocessor.transform(X_train_val)
X_test_trans = preprocessor.transform(X_test)

# Create validation split for plotting
split_index = int(len(X_train_val_trans) * 0.9)  # 90% train, 10% validation
X_final_train = X_train_val_trans[:split_index]
y_final_train = y_train_val.iloc[:split_index]
X_final_val = X_train_val_trans[split_index:]
y_final_val = y_train_val.iloc[split_index:]

# Configure final model
final_model = XGBRegressor(
    tree_method='hist',
    device='cuda',
    n_estimators=20000,
    learning_rate=0.03770760297401682,
    max_depth=20,
    subsample=0.6936774881064425,
    colsample_bytree=0.851962030829581,
    gamma=0.146963904565,
    reg_alpha=1.53108214125,
    reg_lambda=1.9924755681,
    eval_metric='rmse',
    early_stopping_rounds=50
)

# Train with validation set for learning curve
history = final_model.fit(
    X_final_train, y_final_train,
    eval_set=[(X_final_val, y_final_val)],
    verbose=100  # Print progress every 10 iterations
)

# Plot learning curve
results = final_model.evals_result()
epochs = len(results['validation_0']['rmse'])
x_axis = range(0, epochs)

plt.figure(figsize=(12, 6))
plt.plot(x_axis, results['validation_0']['rmse'], label='Validation')
plt.legend()
plt.ylabel('RMSE')
plt.xlabel('Epochs')
plt.title('XGBoost Learning Curve')
plt.savefig('learning_curve.png')
plt.close()

# Plot feature importance
plt.figure(figsize=(16, 12))
plot_importance(final_model, max_num_features=30)
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

# Retrain on full dataset
final_model = XGBRegressor(
    tree_method='hist',
    device='cuda',
    n_estimators=final_model.best_iteration,
    learning_rate=0.025,
    max_depth=9,
    subsample=0.75,
    colsample_bytree=0.85,
    gamma=0.1,
    reg_alpha=0.5,
    reg_lambda=0.5,
    eval_metric='rmse'
)

final_model.fit(X_train_val_trans, y_train_val)

# Generate predictions
# Set device to CPU for prediction to avoid device mismatch
final_model.set_params(device='cpu')
test_preds = final_model.predict(X_test_trans)

# Create submission
submission = pd.DataFrame({
    'Unique ID': test['Unique ID'],
    'Lap_Time_Seconds': np.round(test_preds, 3)
})
submission.to_csv('solution.csv', index=False)

# Save model artifacts
joblib.dump(preprocessor, 'preprocessor.pkl')
joblib.dump(final_model, 'final_model.pkl')

print("Training completed successfully!")
print(f"Learning curve saved as 'learning_curve.png'")
print(f"Feature importance plot saved as 'feature_importance.png'")
print(f"Submission file saved as 'solution.csv'")

[0]	validation_0-rmse:11.33351
[100]	validation_0-rmse:1.76940
[200]	validation_0-rmse:0.37775
[300]	validation_0-rmse:0.12755
[400]	validation_0-rmse:0.11570
[483]	validation_0-rmse:0.11464
Training completed successfully!
Learning curve saved as 'learning_curve.png'
Feature importance plot saved as 'feature_importance.png'
Submission file saved as 'solution.csv'


<Figure size 1600x1200 with 0 Axes>

In [6]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from xgboost import XGBRegressor, plot_importance
from sklearn.model_selection import TimeSeriesSplit
import joblib
import matplotlib.pyplot as plt
import optuna
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

# Load data/kaggle/input/burnout-datathon-ieeecsmuj/test.csv
train = pd.read_csv('/kaggle/input/burnout-datathon-ieeecsmuj/train.csv')
val = pd.read_csv('/kaggle/input/burnout-datathon-ieeecsmuj/val.csv')
test = pd.read_csv('/kaggle/input/burnout-datathon-ieeecsmuj/test.csv')

# Advanced feature engineering
def create_features(df):
    # Temporal features
    df['years_since_debut'] = df['year_x'] - df['min_year']
    df['career_phase'] = df['years_since_debut'] / (df['years_active'] + 1e-6)
    
    # Performance features
    df['podium_rate'] = df['podiums'] / (df['starts'] + 1e-6)
    df['finish_rate'] = df['finishes'] / (df['starts'] + 1e-6)
    df['win_rate'] = df['wins'] / (df['starts'] + 1e-6)
    
    # Tire and track dynamics
    df['tire_wear_effect'] = df['Tire_Degradation_Factor_per_Lap'] * df['Laps'] * df['Circuit_Length_km']
    df['compound_aggressiveness'] = np.select(
        [
            df['Tire_Compound_Front'] == 'Soft',
            df['Tire_Compound_Front'] == 'Medium',
            df['Tire_Compound_Front'] == 'Hard'
        ],
        [1.2, 1.0, 0.8],
        default=1.0
    )
    
    # Speed and corner dynamics
    df['speed_intensity'] = df['Avg_Speed_kmh'] / df['Circuit_Length_km']
    df['corner_intensity'] = df['Corners_per_Lap'] / df['Circuit_Length_km']
    df['speed_corner_ratio'] = df['Avg_Speed_kmh'] / (df['Corners_per_Lap'] + 1e-6)
    
    # Session importance
    session_order = {'FP1':1, 'FP2':2, 'FP3':3, 'FP4':4, 
                    'Qualifying':5, 'Sprint':6, 'Race':7}
    df['session_importance'] = df['Session'].map(session_order).fillna(0)
    
    # Weather effects
    df['temp_humidity_effect'] = df['Ambient_Temperature_Celsius'] * df['Humidity_%'] / 100
    df['track_temp_effect'] = df['Track_Temperature_Celsius'] - df['Ambient_Temperature_Celsius']
    
    # Rider-team synergy
    df['rider_team_experience'] = df['year_x'] - df.groupby(['rider_name', 'team_name'])['year_x'].transform('min')
    # df = df.sort_values('year_x')
    # group = df.groupby(['rider_name', 'team_name'])
    # df['rider_team_experience'] = group['year_x'].expanding().apply(lambda x: x.iloc[-1] - x.iloc[0], raw=False)

    # Recent performance
    df['recent_points'] = df.groupby('rider_name')['points'].transform(lambda x: x.rolling(3, min_periods=1).mean())
    #=================================================
    # df['tire_temp_diff'] = df['Track_Temperature_Celsius'] - {
    #     'Soft': 30, 'Medium': 40, 'Hard': 50
    # }[df['Tire_Compound_Front']]
    df['tire_temp_diff'] = df['Track_Temperature_Celsius'] - df['Tire_Compound_Front'].map({
        'Soft': 30, 
        'Medium': 40, 
        'Hard': 50
    })
        
    df['rain_effect'] = np.where(df['weather'] == 'Raining', 
                                 df['Humidity_%'] * df['Tire_Degradation_Factor_per_Lap'], 
                                 0)
    #=================================================

    # Numerical columns to process
    num_cols = [
        'Circuit_Length_km', 'Laps', 'Avg_Speed_kmh', 'Tire_Degradation_Factor_per_Lap',
        'Ambient_Temperature_Celsius', 'Humidity_%', 'points', 'Championship_Points',
        'Corners_per_Lap', 'Pit_Stop_Duration_Seconds', 'Track_Temperature_Celsius',
        'years_since_debut', 'career_phase', 'podium_rate', 'finish_rate', 'win_rate',
        'tire_wear_effect', 'compound_aggressiveness', 'speed_intensity', 
        'corner_intensity', 'speed_corner_ratio', 'session_importance',
        'temp_humidity_effect', 'track_temp_effect', 'rider_team_experience', 'recent_points','tire_temp_diff', 'rain_effect', 'recent_points'
    ]
    
    # Process numerical columns
    for col in num_cols:
        if col in df.columns:
            # Handle infinite values and NaNs
            df[col] = df[col].replace([np.inf, -np.inf], np.nan)
            mean_val = df[col].mean()
            df[col] = df[col].fillna(mean_val if not np.isnan(mean_val) else 0)
            
    return df

# Apply feature engineering
train = create_features(train)
val = create_features(val)
test = create_features(test)

# Define feature groups
high_cardinality = ['rider_name', 'circuit_name', 'team_name', 'shortname', 'bike_name']
ordinal_features = ['Tire_Compound_Front', 'Tire_Compound_Rear', 
                   'Grid_Position', 'Championship_Position']
nominal_features = ['category_x', 'Track_Condition', 'Penalty', 
                   'weather', 'track', 'Session']

numerical_features = [
    'Circuit_Length_km', 'Laps', 'Avg_Speed_kmh', 'Tire_Degradation_Factor_per_Lap',
    'Ambient_Temperature_Celsius', 'Humidity_%', 'points', 'Championship_Points',
    'Corners_per_Lap', 'Pit_Stop_Duration_Seconds', 'Track_Temperature_Celsius',
    'years_since_debut', 'career_phase', 'podium_rate', 'finish_rate', 'win_rate',
    'tire_wear_effect', 'compound_aggressiveness', 'speed_intensity', 
    'corner_intensity', 'speed_corner_ratio', 'session_importance',
    'temp_humidity_effect', 'track_temp_effect', 'rider_team_experience', 'recent_points'
]

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('target_enc', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), 
     high_cardinality),
    ('ordinal', OrdinalEncoder(
        categories=[
            ['Hard','Medium','Soft'], 
            ['Hard','Medium','Soft'],
            list(range(1, 26)),  # Grid positions   
            list(range(1, 31))   # Championship positions
        ], 
        handle_unknown='use_encoded_value',
        unknown_value=-1
    ), ordinal_features),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), nominal_features),
    ('num', 'passthrough', numerical_features)
])

# Prepare data
features_used = high_cardinality + ordinal_features + nominal_features + numerical_features
X_train = train[features_used]
y_train = train['Lap_Time_Seconds']
X_val = val[features_used]
y_val = val['Lap_Time_Seconds']
X_test = test[features_used]

# Combine train and val
X_full = pd.concat([X_train, X_val])
y_full = pd.concat([y_train, y_val])

# Preprocess data
preprocessor.fit(X_full)
X_train_pre = preprocessor.transform(X_train)
X_val_pre = preprocessor.transform(X_val)
X_full_pre = preprocessor.transform(X_full)
X_test_pre = preprocessor.transform(X_test)

# Hyperparameter optimization with Optuna
def objective(trial):
    params = {
        'tree_method': 'hist',
        'device': 'cuda',
        'n_estimators': 5000,
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth',10, 20),
        'subsample': trial.suggest_float('subsample', 0.4, 0.8),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'eval_metric': 'rmse',
        'early_stopping_rounds': 100
    }
    
    model = XGBRegressor(**params)
    
    model.fit(
        X_train_pre, y_train,
        eval_set=[(X_val_pre, y_val)],
        verbose=False
    )
    
    # Get best score
    best_rmse = min(model.evals_result()['validation_0']['rmse'])
    return best_rmse

# Run optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50, timeout=3600)  # 1 hour timeout

# Get best parameters
best_params = study.best_params
print(f"Best parameters: {best_params}")
print(f"Best validation RMSE: {study.best_value:.6f}")

# Train final model with best parameters
final_model = XGBRegressor(
    **best_params,
    tree_method='hist',
    device='cuda'
)

# Train with early stopping
final_model.fit(
    X_train_pre, y_train,
    eval_set=[(X_val_pre, y_val)],
    verbose=100
)

# Plot learning curve
results = final_model.evals_result()
epochs = len(results['validation_0']['rmse'])
x_axis = range(0, epochs)

plt.figure(figsize=(12, 6))
plt.plot(x_axis, results['validation_0']['rmse'], label='Validation')
plt.axhline(y=study.best_value, color='r', linestyle='--', label=f'Best RMSE: {study.best_value:.6f}')
plt.legend()
plt.ylabel('RMSE')
plt.xlabel('Epochs')
plt.title('XGBoost Learning Curve')
plt.savefig('learning_curve.png')
plt.close()

# Plot feature importance
plt.figure(figsize=(16, 12))
plot_importance(final_model, max_num_features=30)
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

# Generate predictions
test_preds = final_model.predict(X_test_pre)

# Create submission
submission = pd.DataFrame({
    'Unique ID': test['Unique ID'],
    'Lap_Time_Seconds': np.round(test_preds, 6)  # More precision
})
submission.to_csv('solution.csv', index=False)

# # Save model artifacts
# joblib.dump(preprocessor, 'preprocessor.pkl')
# joblib.dump(final_model, 'final_model.pkl')

# print(f"Final model trained with validation RMSE: {study.best_value:.6f}")
# print("Submission file created with high-precision predictions")

[I 2025-06-14 15:31:39,610] A new study created in memory with name: no-name-c5110ed3-c6e3-45de-92e7-ce48a7ffd688
[I 2025-06-14 15:36:04,657] Trial 0 finished with value: 0.2950977346183734 and parameters: {'learning_rate': 0.03404520225692639, 'max_depth': 12, 'subsample': 0.5378664558053667, 'colsample_bytree': 0.7312894225838537, 'gamma': 0.1592056224129913, 'reg_alpha': 0.18087492390629234, 'reg_lambda': 3.766542462825125}. Best is trial 0 with value: 0.2950977346183734.
[I 2025-06-14 15:43:20,095] Trial 1 finished with value: 0.1319071050942873 and parameters: {'learning_rate': 0.011402872059158305, 'max_depth': 18, 'subsample': 0.6627781711580402, 'colsample_bytree': 0.876682342023657, 'gamma': 0.10717525341150558, 'reg_alpha': 0.6025606773970926, 'reg_lambda': 2.1170292796479124}. Best is trial 1 with value: 0.1319071050942873.
[I 2025-06-14 15:48:31,438] Trial 2 finished with value: 0.32998954254633905 and parameters: {'learning_rate': 0.012401106600149989, 'max_depth': 15, 'su

KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from xgboost import XGBRegressor, plot_importance
import joblib
import matplotlib.pyplot as plt
import os

# Set environment variables
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Load data
train = pd.read_csv('/kaggle/input/burnout-datathon-ieeecsmuj/train.csv')
val = pd.read_csv('/kaggle/input/burnout-datathon-ieeecsmuj/val.csv')
test = pd.read_csv('/kaggle/input/burnout-datathon-ieeecsmuj/test.csv')

# Advanced feature engineering
def create_features(df):
    # Temporal features
    df['years_since_debut'] = df['year_x'] - df['min_year']
    df['career_phase'] = df['years_since_debut'] / (df['years_active'].replace(0, np.nan) + 1e-6)
    
    # Performance features
    df['podium_rate'] = df['podiums'] / (df['starts'].replace(0, np.nan) + 1e-6)
    df['finish_rate'] = df['finishes'] / (df['starts'].replace(0, np.nan) + 1e-6)
    df['win_rate'] = df['wins'] / (df['starts'].replace(0, np.nan) + 1e-6)
    
    # Tire and track dynamics
    df['tire_wear_effect'] = df['Tire_Degradation_Factor_per_Lap'] * df['Laps'] * df['Circuit_Length_km']
    df['compound_aggressiveness'] = np.select(
        [
            df['Tire_Compound_Front'] == 'Soft',
            df['Tire_Compound_Front'] == 'Medium',
            df['Tire_Compound_Front'] == 'Hard'
        ],
        [1.2, 1.0, 0.8],
        default=1.0
    )
    
    # Speed and corner dynamics
    df['speed_intensity'] = df['Avg_Speed_kmh'] / (df['Circuit_Length_km'] + 1e-6)
    df['corner_intensity'] = df['Corners_per_Lap'] / (df['Circuit_Length_km'] + 1e-6)
    df['speed_corner_ratio'] = df['Avg_Speed_kmh'] / (df['Corners_per_Lap'] + 1e-6)
    
    # Session importance
    session_order = {'FP1':1, 'FP2':2, 'FP3':3, 'FP4':4, 
                    'Qualifying':5, 'Sprint':6, 'Race':7}
    df['session_importance'] = df['Session'].map(session_order).fillna(0)
    
    # Weather effects
    df['temp_humidity_effect'] = df['Ambient_Temperature_Celsius'] * df['Humidity_%'] / 100
    df['track_temp_effect'] = df['Track_Temperature_Celsius'] - df['Ambient_Temperature_Celsius']
    
    # Rider-team synergy
    df['rider_team_experience'] = df.groupby(['rider_name', 'team_name'])['year_x'].transform(lambda x: x - x.min())
    
    # Recent performance
    df['recent_points'] = df.groupby('rider_name')['points'].transform(lambda x: x.rolling(3, min_periods=1).mean())
    
    # Tire temperature
    compound_map = {'Soft': 30, 'Medium': 40, 'Hard': 50}
    df['tire_temp_diff'] = df['Track_Temperature_Celsius'] - df['Tire_Compound_Front'].map(compound_map)
    
    # Rain effect
    df['rain_effect'] = np.where(df['weather'] == 'Raining', 
                                 df['Humidity_%'] * df['Tire_Degradation_Factor_per_Lap'], 
                                 0)
    
    # Numerical columns to process
    num_cols = [
        'Circuit_Length_km', 'Laps', 'Avg_Speed_kmh', 'Tire_Degradation_Factor_per_Lap',
        'Ambient_Temperature_Celsius', 'Humidity_%', 'points', 'Championship_Points',
        'Corners_per_Lap', 'Pit_Stop_Duration_Seconds', 'Track_Temperature_Celsius',
        'years_since_debut', 'career_phase', 'podium_rate', 'finish_rate', 'win_rate',
        'tire_wear_effect', 'compound_aggressiveness', 'speed_intensity', 
        'corner_intensity', 'speed_corner_ratio', 'session_importance',
        'temp_humidity_effect', 'track_temp_effect', 'rider_team_experience', 'recent_points',
        'tire_temp_diff', 'rain_effect'
    ]
    
    # Process numerical columns
    for col in num_cols:
        if col in df.columns:
            # Handle infinite values and NaNs
            df[col] = df[col].replace([np.inf, -np.inf], np.nan)
            mean_val = df[col].mean()
            df[col] = df[col].fillna(mean_val if not np.isnan(mean_val) else 0)
            
    return df

# Apply feature engineering
train = create_features(train)
val = create_features(val)
test = create_features(test)

# Define feature groups
high_cardinality = ['rider_name', 'circuit_name', 'team_name', 'shortname', 'bike_name']
ordinal_features = ['Tire_Compound_Front', 'Tire_Compound_Rear', 
                   'Grid_Position', 'Championship_Position']
nominal_features = ['category_x', 'Track_Condition', 'Penalty', 
                   'weather', 'track', 'Session']

numerical_features = [
    'Circuit_Length_km', 'Laps', 'Avg_Speed_kmh', 'Tire_Degradation_Factor_per_Lap',
    'Ambient_Temperature_Celsius', 'Humidity_%', 'points', 'Championship_Points',
    'Corners_per_Lap', 'Pit_Stop_Duration_Seconds', 'Track_Temperature_Celsius',
    'years_since_debut', 'career_phase', 'podium_rate', 'finish_rate', 'win_rate',
    'tire_wear_effect', 'compound_aggressiveness', 'speed_intensity', 
    'corner_intensity', 'speed_corner_ratio', 'session_importance',
    'temp_humidity_effect', 'track_temp_effect', 'rider_team_experience', 'recent_points',
    'tire_temp_diff', 'rain_effect'
]

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('target_enc', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), 
     high_cardinality),
    ('ordinal', OrdinalEncoder(
        categories=[
            ['Hard','Medium','Soft'], 
            ['Hard','Medium','Soft'],
            list(range(1, 26)),  # Grid positions
            list(range(1, 31))   # Championship positions
        ], 
        handle_unknown='use_encoded_value',
        unknown_value=-1
    ), ordinal_features),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), nominal_features),
    ('num', 'passthrough', numerical_features)
])

# Prepare data
features_used = high_cardinality + ordinal_features + nominal_features + numerical_features
X_train = train[features_used]
y_train = train['Lap_Time_Seconds']
X_val = val[features_used]
y_val = val['Lap_Time_Seconds']
X_test = test[features_used]

# Combine train and val
X_full = pd.concat([X_train, X_val])
y_full = pd.concat([y_train, y_val])

# Preprocess data
preprocessor.fit(X_full)
X_train_pre = preprocessor.transform(X_train)
X_val_pre = preprocessor.transform(X_val)
X_full_pre = preprocessor.transform(X_full)
X_test_pre = preprocessor.transform(X_test)

# Best hyperparameters from Optuna trial
best_params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "tree_method": "hist",
    "device": "cuda",
    "enable_categorical": False,
    "seed": 42,
    "n_estimators": 10000,
    "learning_rate": 0.011402872059158305,
    "max_depth": 18,
    "subsample": 0.6627781711580402,
    "colsample_bytree": 0.876682342023657,
    "gamma": 0.10717525341150558,
    "reg_alpha": 0.6025606773970926,
    "reg_lambda": 2.1170292796479124,
    "early_stopping_rounds": 100
}

# Train model with early stopping
model = XGBRegressor(**best_params)
model.fit(
    X_train_pre, y_train,
    eval_set=[(X_val_pre, y_val)],
    verbose=100
)

# Get best iteration and RMSE
best_iteration = model.best_iteration
best_rmse = model.best_score
print(f"Best iteration: {best_iteration}")
print(f"Best validation RMSE: {best_rmse}")

# Retrain on full dataset with optimal iterations
final_model = XGBRegressor(
    **{k: v for k, v in best_params.items() if k not in ['early_stopping_rounds']},
    n_estimators=best_iteration
)
final_model.fit(X_full_pre, y_full)

# Plot learning curve
results = model.evals_result()
epochs = len(results['validation_0']['rmse'])
x_axis = range(0, epochs)

plt.figure(figsize=(12, 6))
plt.plot(x_axis, results['validation_0']['rmse'], label='Validation')
plt.axhline(y=best_rmse, color='r', linestyle='--', label=f'Best RMSE: {best_rmse:.6f}')
plt.legend()
plt.ylabel('RMSE')
plt.xlabel('Epochs')
plt.title('XGBoost Learning Curve')
plt.savefig('learning_curve.png')
plt.close()

# Plot feature importance
plt.figure(figsize=(16, 12))
plot_importance(model, max_num_features=30)
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

# Generate predictions
test_preds = final_model.predict(X_test_pre)

# Create submission
submission = pd.DataFrame({
    'Unique ID': test['Unique ID'],
    'Lap_Time_Seconds': np.round(test_preds, 6)
})
submission.to_csv('solution.csv', index=False)

print("Training completed successfully!")
print(f"Best validation RMSE: {best_rmse:.6f}")
print(f"Used {best_iteration} iterations for final model")
print(f"Learning curve saved as 'learning_curve.png'")
print(f"Feature importance plot saved as 'feature_importance.png'")
print(f"Submission file saved as 'solution.csv'")

In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from xgboost import XGBRegressor, plot_importance
import joblib
import matplotlib.pyplot as plt
import os

# Set environment variables
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Load data
train = pd.read_csv('/kaggle/input/burnout-datathon-ieeecsmuj/train.csv')
val = pd.read_csv('/kaggle/input/burnout-datathon-ieeecsmuj/val.csv')
test = pd.read_csv('/kaggle/input/burnout-datathon-ieeecsmuj/test.csv')

# Advanced feature engineering
def create_features(df):
    # Temporal features
    df['years_since_debut'] = df['year_x'] - df['min_year']
    df['career_phase'] = df['years_since_debut'] / (df['years_active'].replace(0, np.nan) + 1e-6)
    
    # Performance features
    df['podium_rate'] = df['podiums'] / (df['starts'].replace(0, np.nan) + 1e-6)
    df['finish_rate'] = df['finishes'] / (df['starts'].replace(0, np.nan) + 1e-6)
    df['win_rate'] = df['wins'] / (df['starts'].replace(0, np.nan) + 1e-6)
    
    # Tire and track dynamics
    df['tire_wear_effect'] = df['Tire_Degradation_Factor_per_Lap'] * df['Laps'] * df['Circuit_Length_km']
    df['compound_aggressiveness'] = np.select(
        [
            df['Tire_Compound_Front'] == 'Soft',
            df['Tire_Compound_Front'] == 'Medium',
            df['Tire_Compound_Front'] == 'Hard'
        ],
        [1.2, 1.0, 0.8],
        default=1.0
    )
    
    # Speed and corner dynamics
    df['speed_intensity'] = df['Avg_Speed_kmh'] / (df['Circuit_Length_km'] + 1e-6)
    df['corner_intensity'] = df['Corners_per_Lap'] / (df['Circuit_Length_km'] + 1e-6)
    df['speed_corner_ratio'] = df['Avg_Speed_kmh'] / (df['Corners_per_Lap'] + 1e-6)
    
    # Session importance
    session_order = {'FP1':1, 'FP2':2, 'FP3':3, 'FP4':4, 
                    'Qualifying':5, 'Sprint':6, 'Race':7}
    df['session_importance'] = df['Session'].map(session_order).fillna(0)
    
    # Weather effects
    df['temp_humidity_effect'] = df['Ambient_Temperature_Celsius'] * df['Humidity_%'] / 100
    df['track_temp_effect'] = df['Track_Temperature_Celsius'] - df['Ambient_Temperature_Celsius']
    
    # Rider-team synergy
    df['rider_team_experience'] = df.groupby(['rider_name', 'team_name'])['year_x'].transform(lambda x: x - x.min())
    
    # Recent performance
    df['recent_points'] = df.groupby('rider_name')['points'].transform(lambda x: x.rolling(3, min_periods=1).mean())
    
    # Tire temperature
    compound_map = {'Soft': 30, 'Medium': 40, 'Hard': 50}
    df['tire_temp_diff'] = df['Track_Temperature_Celsius'] - df['Tire_Compound_Front'].map(compound_map)
    
    # Rain effect
    df['rain_effect'] = np.where(df['weather'] == 'Raining', 
                                 df['Humidity_%'] * df['Tire_Degradation_Factor_per_Lap'], 
                                 0)
    
    # Numerical columns to process
    num_cols = [
        'Circuit_Length_km', 'Laps', 'Avg_Speed_kmh', 'Tire_Degradation_Factor_per_Lap',
        'Ambient_Temperature_Celsius', 'Humidity_%', 'points', 'Championship_Points',
        'Corners_per_Lap', 'Pit_Stop_Duration_Seconds', 'Track_Temperature_Celsius',
        'years_since_debut', 'career_phase', 'podium_rate', 'finish_rate', 'win_rate',
        'tire_wear_effect', 'compound_aggressiveness', 'speed_intensity', 
        'corner_intensity', 'speed_corner_ratio', 'session_importance',
        'temp_humidity_effect', 'track_temp_effect', 'rider_team_experience', 'recent_points',
        'tire_temp_diff', 'rain_effect'
    ]
    
    # Process numerical columns
    for col in num_cols:
        if col in df.columns:
            # Handle infinite values and NaNs
            df[col] = df[col].replace([np.inf, -np.inf], np.nan)
            mean_val = df[col].mean()
            df[col] = df[col].fillna(mean_val if not np.isnan(mean_val) else 0)
            
    return df

# Apply feature engineering
train = create_features(train)
val = create_features(val)
test = create_features(test)

# Define feature groups
high_cardinality = ['rider_name', 'circuit_name', 'team_name', 'shortname', 'bike_name']
ordinal_features = ['Tire_Compound_Front', 'Tire_Compound_Rear', 
                   'Grid_Position', 'Championship_Position']
nominal_features = ['category_x', 'Track_Condition', 'Penalty', 
                   'weather', 'track', 'Session']

numerical_features = [
    'Circuit_Length_km', 'Laps', 'Avg_Speed_kmh', 'Tire_Degradation_Factor_per_Lap',
    'Ambient_Temperature_Celsius', 'Humidity_%', 'points', 'Championship_Points',
    'Corners_per_Lap', 'Pit_Stop_Duration_Seconds', 'Track_Temperature_Celsius',
    'years_since_debut', 'career_phase', 'podium_rate', 'finish_rate', 'win_rate',
    'tire_wear_effect', 'compound_aggressiveness', 'speed_intensity', 
    'corner_intensity', 'speed_corner_ratio', 'session_importance',
    'temp_humidity_effect', 'track_temp_effect', 'rider_team_experience', 'recent_points',
    'tire_temp_diff', 'rain_effect'
]

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('target_enc', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), 
     high_cardinality),
    ('ordinal', OrdinalEncoder(
        categories=[
            ['Hard','Medium','Soft'], 
            ['Hard','Medium','Soft'],
            list(range(1, 26)),  # Grid positions
            list(range(1, 31))   # Championship positions
        ], 
        handle_unknown='use_encoded_value',
        unknown_value=-1
    ), ordinal_features),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), nominal_features),
    ('num', 'passthrough', numerical_features)
])

# Prepare data
features_used = high_cardinality + ordinal_features + nominal_features + numerical_features
X_train = train[features_used]
y_train = train['Lap_Time_Seconds']
X_val = val[features_used]
y_val = val['Lap_Time_Seconds']
X_test = test[features_used]

# Combine train and val
X_full = pd.concat([X_train, X_val])
y_full = pd.concat([y_train, y_val])

# Preprocess data
preprocessor.fit(X_full)
X_train_pre = preprocessor.transform(X_train)
X_val_pre = preprocessor.transform(X_val)
X_full_pre = preprocessor.transform(X_full)
X_test_pre = preprocessor.transform(X_test)

# Best hyperparameters from Optuna trial
best_params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "tree_method": "hist",
    "device": "cuda",
    "enable_categorical": False,
    "seed": 42,
    "n_estimators": 10000,
    "learning_rate": 0.011402872059158305,
    "max_depth": 18,
    "subsample": 0.6627781711580402,
    "colsample_bytree": 0.876682342023657,
    "gamma": 0.10717525341150558,
    "reg_alpha": 0.6025606773970926,
    "reg_lambda": 2.1170292796479124,
    "early_stopping_rounds": 100
}

# Train model with early stopping
model = XGBRegressor(**best_params)
model.fit(
    X_train_pre, y_train,
    eval_set=[(X_val_pre, y_val)],
    verbose=100
)

# Get best iteration and RMSE
best_iteration = model.best_iteration
best_rmse = model.best_score
print(f"Best iteration: {best_iteration}")
print(f"Best validation RMSE: {best_rmse}")

# Retrain on full dataset with optimal iterations
final_model = XGBRegressor(
    **{k: v for k, v in best_params.items() if k not in ['early_stopping_rounds']},
    n_estimators=best_iteration
)
final_model.fit(X_full_pre, y_full)

# Plot learning curve
results = model.evals_result()
epochs = len(results['validation_0']['rmse'])
x_axis = range(0, epochs)

plt.figure(figsize=(12, 6))
plt.plot(x_axis, results['validation_0']['rmse'], label='Validation')
plt.axhline(y=best_rmse, color='r', linestyle='--', label=f'Best RMSE: {best_rmse:.6f}')
plt.legend()
plt.ylabel('RMSE')
plt.xlabel('Epochs')
plt.title('XGBoost Learning Curve')
plt.savefig('learning_curve.png')
plt.close()

# Plot feature importance
plt.figure(figsize=(16, 12))
plot_importance(model, max_num_features=30)
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

# Generate predictions
test_preds = final_model.predict(X_test_pre)

# Create submission
submission = pd.DataFrame({
    'Unique ID': test['Unique ID'],
    'Lap_Time_Seconds': np.round(test_preds, 6)
})
submission.to_csv('solution.csv', index=False)

print("Training completed successfully!")
print(f"Best validation RMSE: {best_rmse:.6f}")
print(f"Used {best_iteration} iterations for final model")
print(f"Learning curve saved as 'learning_curve.png'")
print(f"Feature importance plot saved as 'feature_importance.png'")
print(f"Submission file saved as 'solution.csv'")

[0]	validation_0-rmse:11.46130
[100]	validation_0-rmse:7.53693
[200]	validation_0-rmse:5.04374
[300]	validation_0-rmse:3.54819
[400]	validation_0-rmse:2.48362
[500]	validation_0-rmse:1.74803
[600]	validation_0-rmse:1.22379
[700]	validation_0-rmse:0.87520
[800]	validation_0-rmse:0.61089
[900]	validation_0-rmse:0.43354
[1000]	validation_0-rmse:0.32635
[1100]	validation_0-rmse:0.24995
[1200]	validation_0-rmse:0.20066
[1300]	validation_0-rmse:0.16915
[1400]	validation_0-rmse:0.15539
[1500]	validation_0-rmse:0.15099
[1600]	validation_0-rmse:0.14798
[1700]	validation_0-rmse:0.14728
[1800]	validation_0-rmse:0.14600
[1900]	validation_0-rmse:0.14572
[2000]	validation_0-rmse:0.14530
[2100]	validation_0-rmse:0.14505
[2200]	validation_0-rmse:0.14463
[2300]	validation_0-rmse:0.14443
[2400]	validation_0-rmse:0.14433
[2500]	validation_0-rmse:0.14412
[2600]	validation_0-rmse:0.14394
[2700]	validation_0-rmse:0.14375
[2800]	validation_0-rmse:0.14374
[2900]	validation_0-rmse:0.14367
[3000]	validation_0-r

TypeError: xgboost.sklearn.XGBRegressor() got multiple values for keyword argument 'n_estimators'

In [3]:
# Plot learning curve
results = model.evals_result()
epochs = len(results['validation_0']['rmse'])
x_axis = range(0, epochs)

plt.figure(figsize=(12, 6))
plt.plot(x_axis, results['validation_0']['rmse'], label='Validation')
plt.axhline(y=best_rmse, color='r', linestyle='--', label=f'Best RMSE: {best_rmse:.6f}')
plt.legend()
plt.ylabel('RMSE')
plt.xlabel('Epochs')
plt.title('XGBoost Learning Curve')
plt.savefig('learning_curve.png')
plt.close()

# Plot feature importance
plt.figure(figsize=(16, 12))
plot_importance(model, max_num_features=30)
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

# Generate predictions
test_preds = model.predict(X_test_pre)

# Create submission
submission = pd.DataFrame({
    'Unique ID': test['Unique ID'],
    'Lap_Time_Seconds': np.round(test_preds, 6)
})
submission.to_csv('solution.csv', index=False)

print("Training completed successfully!")
print(f"Best validation RMSE: {best_rmse:.6f}")
print(f"Used {best_iteration} iterations for final model")
print(f"Learning curve saved as 'learning_curve.png'")
print(f"Feature importance plot saved as 'feature_importance.png'")
print(f"Submission file saved as 'solution.csv'")

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Training completed successfully!
Best validation RMSE: 0.142009
Used 3839 iterations for final model
Learning curve saved as 'learning_curve.png'
Feature importance plot saved as 'feature_importance.png'
Submission file saved as 'solution.csv'


<Figure size 1600x1200 with 0 Axes>