In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime, timedelta
from xgboost import XGBRegressor
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error

In [None]:
# --- Notebook configuration --------------------------------------------------

# CSV file read by the data-loading cell.
FEATURES_PREPPED_FILE = "features_prepped.csv"

# Fraction of the date-sorted rows assigned to the training split.
TRAIN_TEST_SPLIT = 0.8
# Fraction of the remaining (non-training) rows assigned to validation;
# whatever is left after that becomes the test split.
VAL_TEST_SPLIT = 0.5

# Rows dated before this cutoff are dropped after loading
# (2025-09-08 plus four weeks).
START_DATE = datetime(2025, 9, 8) + timedelta(weeks=4)

# Columns that are never offered to the model as features.
# NOTE(review): looks like this list was pasted from the low-importance
# feature list printed by the notebook's last cell -- confirm before editing.
BAD_FEATURES = [
    'avg_half_grade_2w_ago',
    'avg_price_2w_ago',
    'prev_2_half_grade',
    'avg_grade_co_BGS_1w_ago',
    'prev_1_half_grade',
    'index_change_1w',
    'avg_grade_co_PSA_1w_ago',
    'avg_half_grade_1w_ago',
    'prev_3_top_seller',
    'avg_top_seller_3w_ago',
    'prev_2_grade_co_BGS',
    'avg_grade_co_PSA_2w_ago',
    'prev_3_grade_co_BGS',
    'prev_2_top_seller',
    'prev_3_half_grade',
]

In [None]:
# Load the prepared feature table, parse 'date' (unparseable values become
# NaT), order the rows chronologically and keep only rows on/after START_DATE.
# NaT never compares >= START_DATE, so rows with an unparseable date are
# dropped by the final filter as well.
df = (
    pd.read_csv(FEATURES_PREPPED_FILE)
    .assign(date=lambda frame: pd.to_datetime(frame['date'], errors='coerce'))
    .sort_values(by='date')
    .loc[lambda frame: frame['date'] >= START_DATE]
)
df

In [None]:
# Every column is a feature except the identifier/target columns
# ('date', 'spec_id', 'price') and anything listed in BAD_FEATURES.
# The boolean mask keeps the columns in their original order.
feature_cols = (
    df.columns[~df.columns.isin(['date', 'spec_id', 'price', *BAD_FEATURES])]
    .tolist()
)
feature_cols

In [None]:
# Positional three-way split of the date-sorted frame:
#   [0, train_end)        -> training
#   [train_end, val_end)  -> validation (VAL_TEST_SPLIT of the remainder)
#   [val_end, len(df))    -> test
# Rows are taken in order (no shuffling), so later splits hold later dates.
train_end = int(len(df) * TRAIN_TEST_SPLIT)
val_end = train_end + int((len(df) - train_end) * VAL_TEST_SPLIT)

train_df = df.iloc[:train_end]
val_df = df.iloc[train_end:val_end]
test_df = df.iloc[val_end:]

In [None]:
# Separate each split into its feature matrix (X) and its 'price' target (y).
# .copy() gives independent objects rather than slices of the parent frames.
X_train = train_df[feature_cols].copy()
y_train = train_df['price'].copy()

X_val = val_df[feature_cols].copy()
y_val = val_df['price'].copy()

X_test = test_df[feature_cols].copy()
y_test = test_df['price'].copy()

# Report the size of every split. The test split is included because the
# final metrics are computed on it, so its size matters as much as the others.
print(f"Training set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Number of features: {len(feature_cols)}")


In [None]:
# Exhaustive hyper-parameter search: fit on the training split, score each
# candidate by MAPE on the validation split, keep the lowest-MAPE candidate.
# The same estimator object is re-configured and re-fitted for every candidate.
model = XGBRegressor(device='cuda')

# Define parameter grid
# The Cartesian product below is 4*5*4*4*3*1*1*4*4*3*3 = 138,240 candidates,
# each of which triggers a full fit -- shrink the lists for a quick run.
param_grid = {
    'max_depth': [10, 12, 15, 20],
    'learning_rate': [0.05, 0.075, 0.1, 0.15, 0.2],
    'n_estimators': [10, 20, 50, 100],
    'min_child_weight': [5, 7, 10, 20],
    'subsample': [0.75, 0.8, 0.9],
    'colsample_bytree': [1.0],
    'gamma': [0],
    'reg_alpha': [0, 0.01, 0.1, 1],
    'reg_lambda': [0.5, 1, 2, 5],
    'colsample_bylevel': [0.7, 0.8, 1.0],
    'max_delta_step': [0, 1, 5],
}

search_space = ParameterGrid(param_grid)
n_candidates = len(search_space)

# Start from +inf rather than a finite sentinel: scikit-learn's MAPE can be
# arbitrarily large when the target contains zeros, and a finite sentinel
# would then leave best_grid empty without any warning.
best_score = float('inf')
best_grid = {}
for candidate_no, g in enumerate(search_space, start=1):
    model.set_params(**g)
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    mape = mean_absolute_percentage_error(y_val, y_val_pred)
    # Only report improvements; printing every candidate would emit hundreds
    # of thousands of lines and bury the result.
    if mape < best_score:
        best_score = mape
        best_grid = g
        print(f"[{candidate_no}/{n_candidates}] New best MAPE: {best_score:.2%}")

print(f"Best MAPE: {best_score:.5f}")
print("Best Grid:", best_grid)

In [None]:
# Build a fresh estimator configured with the winning hyper-parameters from
# the grid search and fit it on the training split. The fit call is the last
# expression so the fitted estimator is shown as the cell output.
best_model = XGBRegressor(device='cuda', **best_grid)
best_model.fit(X_train, y_train)

In [None]:
# Final evaluation of the tuned model on the held-out TEST split.
# (These metrics were previously stored in val_* variables and printed as
# "Validation Metrics", although they are computed from X_test / y_test.)
y_test_pred = best_model.predict(X_test)

test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_mae = mean_absolute_error(y_test, y_test_pred)
test_mape = mean_absolute_percentage_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

# Baseline for comparison: absolute percent error obtained by using the
# 'prev_1_price' column itself as the prediction (presumably the previous
# observed price -- confirm against the feature-prep step).
# Read from test_df rather than X_test so this keeps working even if
# 'prev_1_price' is later added to BAD_FEATURES and dropped from the features.
simple_percent_error = (np.abs(test_df["prev_1_price"].values - y_test.values) / y_test.values) * 100
simple_percent_error_series = pd.Series(simple_percent_error, name='simple_percent_error')
# Absolute (unsigned) percent error of the model's predictions
percent_error = (np.abs(y_test_pred - y_test.values) / y_test.values) * 100
percent_error_series = pd.Series(percent_error, name='percent_error')

# Percentiles reported for both error distributions.
deciles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

print("Test Metrics:")
print(f"  RMSE: ${test_rmse:,.2f}")
print(f"  MAE:  ${test_mae:,.2f}")
print(f"  MAPE: {test_mape:.2%}")
# \u00b2 is the superscript-two character; the escape keeps the label intact
# regardless of the encoding the notebook file is saved or exported with.
print(f"  R\u00b2:   {test_r2:.4f}")

print("\nSimple Percent Error Percentiles:")
print(simple_percent_error_series.describe(percentiles=deciles))

print("\nPercent Error Percentiles:")
print(percent_error_series.describe(percentiles=deciles))


In [None]:
# Rank the features by the fitted model's importance scores (highest first)
# and add a running total, which the next cell uses to pick out the
# low-importance tail.
importance_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False)
importance_df["importance_cumsum"] = importance_df["importance"].cumsum()

top_features = importance_df.head(20)

plt.figure(figsize=(10, 12))
plt.barh(top_features['feature'], top_features['importance'])
plt.xlabel('Importance')
plt.title('Top 20 Feature Importances')
# barh draws the first row at the bottom; flip so the top feature is on top.
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

# Must be the last expression in the cell: only the final expression is
# rendered as output, so a mid-cell .head(20) would display nothing.
importance_df.head(20)

In [None]:
# Features in the tail of the importance ranking: every feature whose running
# importance total (in descending-importance order) is already above the
# cutoff. The resulting list is the cell output.
IMPORTANCE_CUMSUM_CUTOFF = 0.95
bad_features = importance_df[importance_df["importance_cumsum"] > IMPORTANCE_CUMSUM_CUTOFF]["feature"]
list(bad_features)