In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_selection import SelectFromModel

# Seed shared by the train/test split and every seeded estimator below.
RANDOM_STATE = 42

# 📥 Load dataset
df = pd.read_csv("exchange_dataset_enhanced.csv")

# 🧹 Preprocessing: drop rows with NaN in the target.
# Only the target column is checked; NaNs in feature columns are not handled here.
df = df.dropna(subset=['exchange'])

# 🎯 Define features and target
X = df.drop(columns=["exchange"])
y = df["exchange"]

# 🧪 Split dataset (80% train / 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# 📊 Normalize data (optional but recommended for Logistic Regression).
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 🌲 Step 1: Feature Selection using Random Forest
selector_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
selector_model.fit(X_train_scaled, y_train)

# ✅ Use SelectFromModel to get top features.
# prefit=True reuses the forest fitted above; threshold="median" keeps the
# features whose importance is at least the median importance.
selector = SelectFromModel(selector_model, prefit=True, threshold="median")
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

# The scaled matrices carry no column labels, so the selected positions are
# mapped back to names through the original feature frame.
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = X.columns[selected_feature_indices]
print("\n🔍 Top Selected Features:")
print(selected_feature_names)

# 📚 Initialize Models
# `use_label_encoder` is intentionally not passed to XGBClassifier: current
# XGBoost releases ignore it and emit
# 'Parameters: { "use_label_encoder" } are not used.' on every fit.
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=RANDOM_STATE),
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight="balanced")
}

# 🔁 Train and evaluate each model on the selected features
for name, model in models.items():
    print(f"\n🚀 Training Model: {name}")
    model.fit(X_train_selected, y_train)
    y_pred = model.predict(X_test_selected)

    print(f"📊 Classification Report for {name}:")
    print(classification_report(y_test, y_pred))

    # Rows are true classes, columns are predicted classes.
    print(f"🧮 Confusion Matrix for {name}:")
    print(confusion_matrix(y_test, y_pred))



🔍 Top Selected Features:
Index(['vehicle_condition_score', 'trade_in_history', 'incentive_received',
       'mileage', 'customer_engagement_score'],
      dtype='object')

🚀 Training Model: Random Forest
📊 Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94     28789
           1       0.86      0.81      0.83     11211

    accuracy                           0.91     40000
   macro avg       0.89      0.88      0.89     40000
weighted avg       0.91      0.91      0.91     40000

🧮 Confusion Matrix for Random Forest:
[[27345  1444]
 [ 2166  9045]]

🚀 Training Model: XGBoost


Parameters: { "use_label_encoder" } are not used.



📊 Classification Report for XGBoost:
              precision    recall  f1-score   support

           0       0.93      0.96      0.94     28789
           1       0.88      0.82      0.85     11211

    accuracy                           0.92     40000
   macro avg       0.91      0.89      0.90     40000
weighted avg       0.92      0.92      0.92     40000

🧮 Confusion Matrix for XGBoost:
[[27587  1202]
 [ 2056  9155]]

🚀 Training Model: Logistic Regression
📊 Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.93      0.83      0.88     28789
           1       0.66      0.84      0.74     11211

    accuracy                           0.83     40000
   macro avg       0.79      0.83      0.81     40000
weighted avg       0.85      0.83      0.84     40000

🧮 Confusion Matrix for Logistic Regression:
[[23863  4926]
 [ 1822  9389]]
