In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# 📥 1–2. Load the enhanced dataset and discard every row that has a missing value
df = pd.read_csv("exchange_dataset_enhanced.csv").dropna()

# Non-null count per column of the cleaned frame (shown as the cell output)
df.count()

vehicle_age                  200000
vehicle_condition_score      200000
trade_in_history             200000
incentive_received           200000
mileage                      200000
customer_engagement_score    200000
exchange                     200000
vehicle_age_squared          200000
is_high_mileage              200000
engagement_to_age_ratio      200000
dtype: int64

In [None]:

# 🎯 3. Define features and target
# Every column of the cleaned frame except the label is used as a model input.
features = [
    "vehicle_age",
    "vehicle_condition_score",
    "trade_in_history",
    "incentive_received",
    "mileage",
    "customer_engagement_score",
    "vehicle_age_squared",
    "is_high_mileage",
    "engagement_to_age_ratio",
]

# Feature matrix (columns in the order listed above) and the label column.
X = df.loc[:, features]
y = df.loc[:, "exchange"]

# 🧪 4. Train-Test Split
# stratify=y keeps the class ratio of the target the same in the train and test
# partitions, so the held-out metrics are computed on a representative share of
# each class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 📏 5. Scale features
# The scaler is fitted on the real training rows only (no test rows, no synthetic
# rows) and then reused unchanged for the test partition. RandomForest itself does
# not need scaling; it is done here because the oversampling step below is
# distance-based.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ⚖️ 6. Handle class imbalance using SMOTE (training partition only)
# SMOTE interpolates between nearest neighbours, so it is run on the standardized
# features; on raw values the columns with the widest numeric range (presumably
# mileage — confirm against the data) would dominate the neighbour search.
# NOTE: X_train_bal therefore holds oversampled rows in standardized units, and
# X_train_scaled is the scaled but NOT yet oversampled training matrix.
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train_scaled, y_train)

# 🤖 7. Train Model
# n_jobs=-1 builds the trees in parallel on all available cores.
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train_bal, y_train_bal)

# 🔮 8. Predict on the held-out (scaled) test partition
y_pred = model.predict(X_test_scaled)

# 🧾 9. Evaluation
# Each entry pairs a heading with the metric function whose result is printed
# directly beneath it; both metrics compare the true labels with the predictions.
for heading, metric in (
    ("\n📊 Classification Report:", classification_report),
    ("\n🧮 Confusion Matrix:", confusion_matrix),
):
    print(heading)
    print(metric(y_test, y_pred))
