In [1]:
# Import
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load and preprocess
df = pd.read_csv("vehicles.csv")
df = df[(df["price"] > 100) & (df["price"] < 100000)]
df["car_age"] = 2025 - df["year"]
df = df.drop(columns=["year"], errors="ignore")
top_models = df["model"].value_counts().nlargest(20).index
df["model"] = df["model"].where(df["model"].isin(top_models), other="other")
cat_cols = ["manufacturer", "condition", "fuel", "title_status", "transmission", "drive", "type", "state", "model"]
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)
df = df.drop(columns=df.select_dtypes(include="object").columns, errors="ignore")
df = df.fillna(0)
df["mileage_per_year"] = df["odometer"] / df["car_age"].replace(0, 1)
df["is_old_car"] = (df["car_age"] > 10).astype(int)
df["is_high_mileage"] = (df["odometer"] > 150000).astype(int)

# Target and split
y = df["price"]
X = df.drop(columns=["price"])
y_log = np.log1p(y)
X_train, X_test, y_train_log, y_test_log = train_test_split(X, y_log, test_size=0.2, random_state=42)

# Train and evaluate
model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42, n_jobs=-1)
model.fit(X_train, y_train_log)
y_pred_log = model.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_test = np.expm1(y_test_log)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"MAE: ${mae:,.2f}")
print(f"RMSE: ${rmse:,.2f}")
print(f"R² Score: {r2:.4f}")


MAE: $5,142.25
RMSE: $8,424.17
R² Score: 0.6594
