In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import (OneHotEncoder, StandardScaler, LabelBinarizer)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
    roc_auc_score, roc_curve
)
import matplotlib.pyplot as plt

In [15]:
df = pd.read_csv("credit.csv")

In [16]:
# 1. Define Target and Features
# ============================================
y = df["default"]
X = df.drop("default", axis=1)

numeric_features = ["months_loan_duration", "amount", "percent_of_income", 
                    "years_at_residence", "age", "existing_loans_count", "dependents"]

categorical_features = ["checking_balance", "credit_history", "purpose", 
                        "savings_balance", "employment_duration", 
                        "other_credit", "housing", "job", "phone"]



In [17]:
# 2. Preprocessing
# ============================================
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(drop="first"), categorical_features)
    ]
)

In [18]:
# 3. Model Pipelines
# ============================================
log_reg_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000, random_state=42))
])

tree_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", DecisionTreeClassifier(random_state=42))
])

forest_pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

In [19]:
# 4. Parameter Grids
# ============================================
param_grid_log_reg = {
    "classifier__C": [0.01, 0.1, 1, 10],
    "classifier__solver": ["lbfgs", "liblinear"]
}

param_grid_tree = {
    "classifier__max_depth": [3, 5, 7, 10, None],
    "classifier__min_samples_split": [2, 5, 10],
    "classifier__criterion": ["gini", "entropy"]
}

param_grid_forest = {
    "classifier__n_estimators": [50, 100, 200],
    "classifier__max_depth": [5, 8, 10, None],
    "classifier__min_samples_split": [2, 5, 10],
    "classifier__criterion": ["gini", "entropy"]
}

In [20]:
# 5. Train-Test Split
# ============================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [21]:
# 6. GridSearchCV (5-fold CV)
# ============================================
models = {
    "Logistic Regression": (log_reg_pipe, param_grid_log_reg),
    "Decision Tree": (tree_pipe, param_grid_tree),
    "Random Forest": (forest_pipe, param_grid_forest)
}

best_models = {}
results_summary = []

for name, (pipe, params) in models.items():
    print(f"\n🔍 Tuning {name} ...")
    grid = GridSearchCV(pipe, params, cv=5, n_jobs=-1, scoring="accuracy")
    grid.fit(X_train, y_train)
    
    best_models[name] = grid.best_estimator_
    best_score = grid.best_score_
    best_params = grid.best_params_
    
    print(f"Best CV Score: {best_score:.4f}")
    print(f"Best Parameters: {best_params}")



🔍 Tuning Logistic Regression ...
Best CV Score: 0.7387
Best Parameters: {'classifier__C': 10, 'classifier__solver': 'liblinear'}

🔍 Tuning Decision Tree ...
Best CV Score: 0.7025
Best Parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 3, 'classifier__min_samples_split': 2}

🔍 Tuning Random Forest ...
Best CV Score: 0.7525
Best Parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': None, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}


In [28]:
# 7. Evaluation on Test Set (Accuracy + AUC)
# ============================================
# plt.figure(figsize=(8, 6))
for name, model in best_models.items():
    # Predictions
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    
    # Metrics
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob) if y_prob is not None else np.nan
    
    # Save results
    results_summary.append({
        "Model": name,
        "Accuracy": acc,
        "AUC": auc
    })
    
    # Print detailed report
    print(f"\n=== {name} Evaluation ===")
    print("Accuracy:", round(acc, 4))
    print("AUC:", round(auc, 4))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))


    lb = LabelBinarizer()
    y_test_bin = lb.fit_transform(y_test).ravel()

    
    # Plot ROC curve
    if y_prob is not None:
        fpr, tpr, _ = roc_curve(y_test_bin, y_prob)
        plt.plot(fpr, tpr, label=f"{name} (AUC = {auc:.2f})")


=== Logistic Regression Evaluation ===
Accuracy: 0.75
AUC: 0.7693
Confusion Matrix:
 [[121  19]
 [ 31  29]]
Classification Report:
               precision    recall  f1-score   support

          no       0.80      0.86      0.83       140
         yes       0.60      0.48      0.54        60

    accuracy                           0.75       200
   macro avg       0.70      0.67      0.68       200
weighted avg       0.74      0.75      0.74       200



NameError: name 'LabelBinarizer' is not defined

In [26]:
print("Unique classes in y_test:", np.unique(y_test))

Unique classes in y_test: ['no' 'yes']


In [25]:
# 9. Model Comparison Table
# ============================================
results_df = pd.DataFrame(results_summary).sort_values(by="Accuracy", ascending=False)
print("\n=== Model Comparison Summary ===")
print(results_df.reset_index(drop=True))


=== Model Comparison Summary ===
                 Model  Accuracy       AUC
0        Random Forest     0.765  0.797262
1        Random Forest     0.765  0.797262
2  Logistic Regression     0.750  0.769286
3  Logistic Regression     0.750  0.769286
4        Decision Tree     0.685  0.715357
5        Decision Tree     0.685  0.715357
