In [1]:
# Import
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import uniform, randint
import warnings
warnings.filterwarnings("ignore")

# Load and preprocess
df = pd.read_csv("vehicles.csv")
df = df[(df["price"] >= 100) & (df["price"] <= 100000)]
df["car_age"] = 2025 - df["year"]
df = df.drop(columns=["year"], errors="ignore")
top_models = df["model"].value_counts().nlargest(20).index
df["model"] = df["model"].where(df["model"].isin(top_models), other="other")
cat_cols = ["manufacturer", "condition", "fuel", "title_status", "transmission", "drive", "type", "state", "model"]
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)
df = df.drop(columns=df.select_dtypes(include="object").columns, errors="ignore")
df = df.fillna(0)
df["mileage_per_year"] = df["odometer"] / df["car_age"].replace(0, 1)
df["is_old_car"] = (df["car_age"] > 10).astype(int)
df["is_high_mileage"] = (df["odometer"] > 150000).astype(int)

# Target and split
y = df["price"]
X = df.drop(columns=["price"])
y_log = np.log1p(y)
X_train, X_test, y_train_log, y_test_log = train_test_split(X, y_log, test_size=0.2, random_state=42)

# Search space
param_dist = {
    "n_estimators": randint(100, 500),
    "learning_rate": uniform(0.01, 0.3),
    "max_depth": randint(3, 10),
    "subsample": uniform(0.6, 0.4),
    "colsample_bytree": uniform(0.6, 0.4),
    "reg_alpha": uniform(0, 1),
    "reg_lambda": uniform(0.5, 2)
}

# Randomized search
xgb_model = XGBRegressor(random_state=42, n_jobs=-1)
random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_dist,
    n_iter=50,
    scoring="neg_mean_squared_error",
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)
random_search.fit(X_train, y_train_log)
best_model = random_search.best_estimator_

# Predict
y_pred_log = best_model.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_test = np.expm1(y_test_log)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Best Params:", random_search.best_params_)
print(f"MAE: ${mae:,.2f}")
print(f"RMSE: ${rmse:,.2f}")
print(f"R² Score: {r2:.4f}")


Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Params: {'colsample_bytree': 0.9043140194467589, 'learning_rate': 0.17838315927084888, 'max_depth': 9, 'n_estimators': 497, 'reg_alpha': 0.49379559636439074, 'reg_lambda': 1.5454656587639881, 'subsample': 0.7710164073434198}
MAE: $3,504.86
RMSE: $6,111.42
R² Score: 0.8235
