# Credit Default Prediction Application

This notebook trains machine learning models to predict credit default and provides an interface for predictions.

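Upload the dataset `UCI_Credit_Card.csv` to the Colab session before running the cells below; the data-loading cell reads it from `/content/UCI_Credit_Card.csv`, so adjust that path if you store the file elsewhere.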

In [None]:
# Install dependencies
!pip install streamlit scikit-learn xgboost pandas numpy matplotlib seaborn joblib plotly

# For Colab, uncomment if needed
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Import libraries
import argparse
import os
from datetime import datetime
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_recall_fscore_support,
    matthews_corrcoef,
    confusion_matrix,
    classification_report,
)
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display

# Apply seaborn's default theme to every matplotlib figure drawn afterwards.
# set_theme() is the documented entry point; sns.set() is only an alias for it.
sns.set_theme()
# Name of the label column that is split off from the features before training.
DEFAULT_TARGET = "default.payment.next.month"

In [None]:
# Helper functions
def load_csv(csv_path: str) -> pd.DataFrame:
    """Read the CSV at ``csv_path`` into a DataFrame using pandas' defaults."""
    frame = pd.read_csv(csv_path)
    return frame

def split_features_target(df: pd.DataFrame, target: str):
    """Separate ``df`` into a feature frame and the ``target`` column.

    Returns:
        A ``(features, labels)`` tuple: ``df`` without the ``target`` column,
        and the ``target`` column itself. ``df`` is not modified.
    """
    features = df.drop(columns=[target])
    labels = df[target]
    return features, labels

def drop_id_if_present(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without its ``"ID"`` column.

    When there is no ``"ID"`` column the input frame is returned unchanged
    (same object); otherwise a new frame lacking that column is returned.
    """
    if "ID" not in df.columns:
        return df
    return df.drop(columns=["ID"])

def infer_column_types(X: pd.DataFrame):
    """Partition the columns of ``X`` into numeric and categorical names.

    Columns with ``object`` or ``category`` dtype count as categorical; every
    remaining column is reported as numeric.

    Returns:
        A ``(numeric_names, categorical_names)`` tuple of lists, each in the
        column order of ``X``.
    """
    categorical_names = X.select_dtypes(include=["object", "category"]).columns.tolist()
    numeric_names = []
    for column in X.columns:
        if column not in categorical_names:
            numeric_names.append(column)
    return numeric_names, categorical_names

def build_preprocessor(X: pd.DataFrame, numeric_cols=None, categorical_cols=None) -> ColumnTransformer:
    """Build an unfitted ColumnTransformer for numeric and categorical columns.

    Numeric columns are median-imputed and standardized; categorical columns
    are imputed with the most frequent value and one-hot encoded (dense
    output, categories unseen during fit are ignored at transform time).

    Args:
        X: Frame used only to infer column names/dtypes for any column list
            that is not supplied.
        numeric_cols: Column names to treat as numeric. When ``None`` they
            are inferred as every column of ``X`` that is not categorical.
        categorical_cols: Column names to treat as categorical. When ``None``
            they are inferred as the ``object``/``category`` dtype columns of
            ``X`` (minus any names explicitly listed in ``numeric_cols``).

    Returns:
        The configured, not yet fitted, ``ColumnTransformer``.
    """
    # Resolve each list independently so that an explicitly supplied list is
    # honoured even when the other one is left to inference.
    if categorical_cols is None:
        cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
        if numeric_cols is not None:
            # A column the caller declared numeric must not also be encoded.
            cat_cols = [c for c in cat_cols if c not in numeric_cols]
    else:
        cat_cols = categorical_cols

    if numeric_cols is None:
        num_cols = [c for c in X.columns if c not in cat_cols]
    else:
        num_cols = numeric_cols

    numeric_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]
    )

    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, num_cols),
            ("cat", categorical_transformer, cat_cols),
        ]
    )

    return preprocessor

def get_models(random_state=42):
    """Create the unfitted classifiers that the notebook compares.

    Args:
        random_state: Seed passed to every estimator that accepts one
            (kNN and Naive Bayes are constructed without it).

    Returns:
        dict mapping a display name to a fresh estimator instance, in the
        order the models are listed here.
    """
    seed = random_state
    candidates = [
        ("Logistic Regression", LogisticRegression(random_state=seed)),
        ("Decision Tree", DecisionTreeClassifier(random_state=seed)),
        ("kNN", KNeighborsClassifier()),
        ("Naive Bayes", GaussianNB()),
        ("Random Forest", RandomForestClassifier(random_state=seed)),
        ("XGBoost", xgb.XGBClassifier(random_state=seed)),
    ]
    return dict(candidates)

def compute_metrics(y_true, y_pred, y_proba=None):
    """Compute classification metrics for one set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels aligned with ``y_true``.
        y_proba: Optional scores for AUC. Any array-like is accepted: either
            a 1-D sequence of scores, or a 2-D array whose column 1 is used
            in the binary case and which is passed whole to the one-vs-rest
            AUC otherwise.

    Returns:
        dict with keys ``"Accuracy"``, ``"Precision"``, ``"Recall"``,
        ``"F1"``, ``"MCC"`` and ``"AUC"``. Precision/recall/F1 use binary
        averaging when ``y_true`` holds exactly two distinct labels and
        weighted averaging otherwise. ``"AUC"`` is ``None`` when no scores
        were given or the AUC could not be computed from them.
    """
    metrics = {}
    metrics["Accuracy"] = accuracy_score(y_true, y_pred)

    labels = np.unique(y_true)
    is_binary = len(labels) == 2
    average = "binary" if is_binary else "weighted"

    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average=average, zero_division=0
    )
    metrics["Precision"] = precision
    metrics["Recall"] = recall
    metrics["F1"] = f1
    metrics["MCC"] = matthews_corrcoef(y_true, y_pred)

    auc_val = None
    if y_proba is not None:
        try:
            # Coerce first so lists and DataFrames support .ndim and [:, 1]
            # instead of failing and being reported as a missing AUC.
            scores = np.asarray(y_proba)
            if is_binary:
                positive_scores = scores if scores.ndim == 1 else scores[:, 1]
                auc_val = roc_auc_score(y_true, positive_scores)
            else:
                auc_val = roc_auc_score(
                    y_true, scores, multi_class="ovr", average="weighted"
                )
        except (ValueError, IndexError, TypeError):
            # AUC is best-effort: unusable scores (e.g. a single class in
            # y_true, or a score array of the wrong shape) yield None rather
            # than aborting the other metrics.
            auc_val = None
    metrics["AUC"] = auc_val
    return metrics

def plot_confusion_matrix(y_true, y_pred):
    """Show the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Cells are annotated with integer counts; the x axis is labelled
    "Predicted" and the y axis "Actual". The figure is displayed immediately
    and nothing is returned.
    """
    _, axes = plt.subplots(figsize=(5, 4))
    counts = confusion_matrix(y_true, y_pred)
    sns.heatmap(counts, annot=True, fmt="d", cmap="Blues", ax=axes)
    axes.set(xlabel="Predicted", ylabel="Actual", title="Confusion Matrix")
    plt.show()

def show_classification_report(y_true, y_pred):
    """Print scikit-learn's text classification report for the predictions.

    Metrics that would divide by zero are reported as 0.
    """
    print(classification_report(y_true, y_pred, zero_division=0))

In [None]:
# Load data
# Upload UCI_Credit_Card.csv to Colab and set the path
csv_path = '/content/UCI_Credit_Card.csv'  # Adjust path if needed

# Read the raw table and discard the ID column (if any) so that it is not
# used as a feature.
df = drop_id_if_present(load_csv(csv_path))

# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both splits.
X, y = split_features_target(df, DEFAULT_TARGET)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

In [None]:
# Train models
num_cols, cat_cols = infer_column_types(X_train)
preproc = build_preprocessor(X_train, numeric_cols=num_cols, categorical_cols=cat_cols)

# Fit the preprocessor once on the training split and cache both transformed
# matrices. Every model sees the same features, so re-fitting the
# preprocessor and re-transforming X_test per model is wasted work.
X_train_prepared = preproc.fit_transform(X_train)
X_test_prepared = preproc.transform(X_test)

models = get_models(random_state=42)

rows = []            # one metrics dict per model, in training order
trained_models = {}  # display name -> fitted estimator (expects preproc-transformed input)
for name, model in models.items():
    model.fit(X_train_prepared, y_train)

    y_pred = model.predict(X_test_prepared)
    # Prefer class probabilities for AUC; fall back to decision scores for
    # estimators that expose only decision_function.
    y_proba = None
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test_prepared)
    elif hasattr(model, "decision_function"):
        y_proba = model.decision_function(X_test_prepared)

    metrics = compute_metrics(y_test, y_pred, y_proba)
    rows.append({"Model": name, **metrics})

    trained_models[name] = model

summary_df = pd.DataFrame(rows)
print(summary_df)

In [None]:
# Evaluate a model (example)
model_name = "XGBoost"  # Change as needed
model = trained_models[model_name]

# The stored estimators expect preprocessed features; transform the test
# split once and reuse it for both labels and probabilities.
X_test_transformed = preproc.transform(X_test)
y_pred = model.predict(X_test_transformed)
y_proba = model.predict_proba(X_test_transformed)

metrics = compute_metrics(y_test, y_pred, y_proba)
print(metrics)

plot_confusion_matrix(y_test, y_pred)
show_classification_report(y_test, y_pred)

In [None]:
# Prediction interface
# For simplicity, use a sample from test data
sample = X_test.iloc[0:1]  # slice (not iloc[0]) keeps a one-row DataFrame
print("Sample input:")
print(sample)

# The preprocessing is identical for every model, so transform the sample
# once instead of twice per model inside the loop.
sample_transformed = preproc.transform(sample)

for name, model in trained_models.items():
    pred = model.predict(sample_transformed)
    proba = model.predict_proba(sample_transformed)
    # proba[0][1] is the second probability column for the single sample row.
    print(f"{name}: Prediction = {pred[0]}, Probability = {proba[0][1]:.3f}")