## Code, Step by Step

In [3]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

# Reload the cleaned dataset, relabel 'driver_name' as 'team_name', and
# discard the 'code' column.
data = (
    pd.read_csv('../../reference/f1_cleaned.csv')
    .rename(columns={'driver_name': 'team_name'})
    .drop(columns=['code'])
)

# Split by year: rows before 2024 are used for fitting, 2024 rows are held out.
before_2024 = data['year'] < 2024
during_2024 = data['year'] == 2024
train = data.loc[before_2024].copy()
test = data.loc[during_2024].copy()

# pop() removes the target from each frame in place, so the frames that
# remain contain predictors only.
y_train = train.pop('finishing_pos')
y_test = test.pop('finishing_pos')
x_train = train
x_test = test

# Selected columns: categorical ones get one-hot encoded, numeric ones scaled.
cat_feat = ['team_name', 'driver_nat', 'circuitRef']
x_num_feat = [
    'starting_pos',
    'laps',
    'driver_dnf',
    'car_dnf',
    'round',
    'quali_mean',
    'driver_age',
]

# --- Encoding with frequency threshold ---
def encode_with_threshold(df, cols, threshold=0.02):
    """One-hot encode ``cols`` after grouping infrequent levels as 'Other'.

    For each column in ``cols`` the relative frequency of every level is
    computed on ``df`` itself; levels whose share is not strictly greater
    than ``threshold`` are replaced by the literal 'Other'. The listed
    columns are then expanded with ``pd.get_dummies(..., drop_first=True)``.

    Args:
        df: Input frame. It is copied, never modified.
        cols: Names of the categorical columns to encode.
        threshold: Minimum share (exclusive) a level needs to keep its own
            indicator column.

    Returns:
        A new DataFrame with ``cols`` replaced by indicator columns.
    """
    collapsed = df.copy()
    for name in cols:
        shares = collapsed[name].value_counts(normalize=True)
        frequent_levels = shares.loc[shares > threshold].index
        is_frequent = collapsed[name].isin(frequent_levels)
        collapsed[name] = np.where(is_frequent, collapsed[name], 'Other')
    return pd.get_dummies(collapsed, columns=cols, drop_first=True)

# Apply
# The training split defines the encoding: which levels are frequent enough
# to keep, which level is the dropped reference, and the final column set.
x_train_encoded = encode_with_threshold(x_train, cat_feat)

# Encode the test split against the TRAIN levels. Running
# encode_with_threshold on test alone would re-derive the rare levels and the
# drop_first reference level from the test distribution, so after alignment
# some test rows would silently be encoded as the wrong level.
x_test_grouped = x_test.copy()
for col in cat_feat:
    train_freq = x_train[col].value_counts(normalize=True)
    # 0.02 mirrors the default threshold used for the training encoding above.
    train_levels = train_freq[train_freq > 0.02].index
    x_test_grouped[col] = np.where(
        x_test_grouped[col].isin(train_levels), x_test_grouped[col], 'Other'
    )
# Full one-hot here (no drop_first): the left-align below discards the column
# of train's reference level and of any level train has no column for, which
# leaves those rows all-zero, i.e. on the reference level.
x_test_encoded = pd.get_dummies(x_test_grouped, columns=cat_feat)

# --- Scaling numerical features ---
# Fit the scaler on train only and reuse its statistics for test.
scaler = StandardScaler()
x_train_encoded[x_num_feat] = scaler.fit_transform(x_train_encoded[x_num_feat])
x_test_encoded[x_num_feat] = scaler.transform(x_test_encoded[x_num_feat])

# Align columns (test might miss some categories)
x_train_encoded, x_test_encoded = x_train_encoded.align(x_test_encoded, join='left', axis=1, fill_value=0)

# --- Add constant for OLS ---
# has_constant='add' forces the intercept column in both frames. The default
# ('skip') omits it when some column is already constant and non-zero, which
# can happen on a small split and would leave train and test with different
# columns.
x_train_encoded = sm.add_constant(x_train_encoded, has_constant='add')
x_test_encoded = sm.add_constant(x_test_encoded, has_constant='add')

# Cast both design matrices and both targets to float so the indicator
# columns are numeric for fitting and for later prediction.
x_train_encoded = x_train_encoded.astype(float)
x_test_encoded = x_test_encoded.astype(float)
y_train = y_train.astype(float)
y_test = y_test.astype(float)

# --- Backward Elimination ---
def backward_elimination(x, y, sl=0.05, protected=('const',)):
    """Iteratively drop the least significant regressor from an OLS fit.

    The model is refitted after every removal. Each round removes the
    column with the largest p-value, as long as that p-value exceeds
    ``sl``. The returned model is always the one fitted on the returned
    columns.

    Args:
        x: DataFrame of regressors (one column per regressor).
        y: Target values aligned with the rows of ``x``.
        sl: Significance level; a column is removed while its p-value is
            strictly greater than this.
        protected: Column names that are never removed. Defaults to the
            intercept column 'const' so the model is not forced through
            the origin; names absent from ``x`` are ignored.

    Returns:
        Tuple ``(x_reduced, fitted_model)``.

    Raises:
        ValueError: If ``x`` has no columns.
    """
    if x.shape[1] == 0:
        raise ValueError("backward_elimination needs at least one column in x")

    while True:
        model = sm.OLS(y, x).fit()
        candidates = model.pvalues.drop(labels=list(protected), errors='ignore')
        # Keep at least one regressor so the model can always be refitted.
        if candidates.empty or x.shape[1] == 1:
            break
        # pandas max() skips NaN, matching idxmax() below; the builtin max()
        # is order-dependent when NaN is present.
        worst_p = candidates.max()
        if not worst_p > sl:
            break
        excluded_feature = candidates.idxmax()
        print(f"Dropping '{excluded_feature}' with p-value {worst_p:.4f}")
        x = x.drop(columns=[excluded_feature])
    return x, model

# Run backward elimination on the encoded training matrix; keep both the
# reduced design matrix and the model fitted on it.
x_train_be, model_be = backward_elimination(x=x_train_encoded, y=y_train)

# Print the regression summary table of the final model.
print(model_be.summary())


Dropping 'circuitRef_yas_marina' with p-value 0.9561
Dropping 'driver_nat_German' with p-value 0.8520
Dropping 'circuitRef_bahrain' with p-value 0.7135
Dropping 'driver_age' with p-value 0.6892
Dropping 'circuitRef_suzuka' with p-value 0.6534
Dropping 'circuitRef_sepang' with p-value 0.6196
Dropping 'circuitRef_sochi' with p-value 0.6574
Dropping 'driver_nat_Swedish' with p-value 0.5812
Dropping 'driver_nat_British' with p-value 0.6319
Dropping 'circuitRef_marina_bay' with p-value 0.5528
Dropping 'circuitRef_hockenheimring' with p-value 0.5363
Dropping 'driver_nat_Danish' with p-value 0.3732
Dropping 'driver_nat_Mexican' with p-value 0.3724
Dropping 'circuitRef_interlagos' with p-value 0.3323
Dropping 'circuitRef_baku' with p-value 0.2641
Dropping 'circuitRef_monza' with p-value 0.2279
Dropping 'circuitRef_villeneuve' with p-value 0.1760
Dropping 'circuitRef_red_bull_ring' with p-value 0.1717
Dropping 'circuitRef_americas' with p-value 0.1406
Dropping 'circuitRef_spa' with p-value 0.14