In [34]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report

# Load data
data = pd.read_csv("processed_rfms_data.csv")

# ==================================================================
# STEP 1: Define the Target Variable
# ==================================================================
# Option 1: If 'DefaultLabel' already exists in the CSV (Good/Bad labels):
# data['DefaultLabel'] = data['DefaultLabel'].map({'Good': 1, 'Bad': 0})

# Option 2: Create target based on Amount threshold (example):
data['DefaultLabel'] = (data['Amount'] > 1000).astype(int)

# ==================================================================
# STEP 2: Split Data FIRST to Avoid Data Leakage
# ==================================================================
X = data.drop(columns=['DefaultLabel'])
y = data['DefaultLabel']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# ==================================================================
# STEP 3: Preprocessing Pipeline
# ==================================================================
# Identify categorical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

# Custom function to bucket rare categories using TRAINING DATA
def bucket_rare_categories(X, top_n=20):
    X_processed = X.copy()
    for col in X_processed.select_dtypes(include=['object']).columns:
        top_cats = X_processed[col].value_counts().nlargest(top_n).index
        X_processed[col] = X_processed[col].where(X_processed[col].isin(top_cats), 'Other')
    return X_processed

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', Pipeline(steps=[
            ('bucket', FunctionTransformer(bucket_rare_categories, kw_args={'top_n': 20})),
            ('frequency_encoder', FunctionTransformer(
                lambda X: X.apply(lambda col: col.map(col.value_counts(normalize=True))),
                validate=False
            ))
        ]), categorical_cols)
    ],
    remainder='passthrough'
)

# ==================================================================
# STEP 4: Model Training & Evaluation
# ==================================================================
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

results = {}

for name, model in models.items():
    # Create a pipeline with preprocessing and model
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    print(f"\nTraining {name}...")
    pipeline.fit(X_train, y_train)

    # Predictions
    y_pred = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)[:, 1]

    # Metrics
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, zero_division=0),
        "Recall": recall_score(y_test, y_pred, zero_division=0),
        "F1": f1_score(y_test, y_pred, zero_division=0),
        "ROC-AUC": roc_auc_score(y_test, y_proba)
    }

    results[name] = metrics
    print(f"{name} Classification Report:\n{classification_report(y_test, y_pred)}")

# ==================================================================
# STEP 5: Hyperparameter Tuning (Random Forest Example)
# ==================================================================
# Full pipeline with hyperparameter tuning
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 15, 30],
    'classifier__min_samples_split': [2, 5]
}

grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

# Best model
best_rf = grid_search.best_estimator_
print(f"\nBest Parameters: {grid_search.best_params_}")
print(f"Best ROC-AUC: {grid_search.best_score_:.3f}")

# ==================================================================
# STEP 6: Save Results
# ==================================================================
results_df = pd.DataFrame(results).T
results_df.to_csv("model_performance_metrics.csv", index=True)
print("\nResults saved to 'model_performance_metrics.csv'")


Training Logistic Regression...
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18597
           1       1.00      1.00      1.00     10102

    accuracy                           1.00     28699
   macro avg       1.00      1.00      1.00     28699
weighted avg       1.00      1.00      1.00     28699


Training Decision Tree...
Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18597
           1       1.00      1.00      1.00     10102

    accuracy                           1.00     28699
   macro avg       1.00      1.00      1.00     28699
weighted avg       1.00      1.00      1.00     28699


Training Random Forest...
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18597
           1       1.00      1.00