# Import Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import RobustScaler

# Data Ingestion

In [None]:
# Path to the raw dataset, relative to the notebook's working directory
filepath = "../data/data.csv"

# Load the CSV into the DataFrame that the following cells operate on
df = pd.read_csv(filepath_or_buffer=filepath)

# Preview the first five rows
df.head(n=5)

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts)

df.info()

In [None]:
# Summary statistics for the numerical columns

df.describe()

# Data Cleansing

## Drop Columns 

In [None]:
# Remove the row identifier in place; it is not used as a model feature

df.drop(labels=["ID"], axis="columns", inplace=True)
df.head(n=5)

## Rename Columns

In [None]:
# Give the target column a short name

target_renames = {"default.payment.next.month": "Default"}
df = df.rename(columns=target_renames)

In [None]:
# Normalize column names: first letter upper-case, remaining letters lower-case

df.columns = df.columns.map(str.capitalize)

df.head(n=5)

# Exploratory Data Analysis 

## Distribution

In [None]:
# Distribution

# Loop through each numerical feature (non-numeric columns, if any, are skipped
# because neither a histogram nor a boxplot is meaningful for them)
for numerical_feature in df.select_dtypes(include="number").columns:
    values = df[numerical_feature]
    # Computed once per feature instead of once per use inside the formula
    mean, std = values.mean(), values.std()

    # Two side-by-side plots per feature: distribution (left) and boxplot (right)
    fig, ax = plt.subplots(1, 2, figsize=(10, 3))

    # Histogram to get an overview of the distribution of each numerical_feature.
    # density=True normalizes the bars to a probability density so the curve
    # overlaid below shares the same y-scale; with raw counts the curve would be
    # squashed flat against the x-axis.
    ax[0].set_title(f"Distribution of: {numerical_feature}")
    ax[0].hist(values, bins=30, density=True, color="blue", alpha=0.7, edgecolor="black")

    # Overlay the pdf of a normal distribution with the sample mean and standard
    # deviation as a reference shape (this is a fitted Gaussian, not a kernel
    # density estimate). Skipped for constant columns, where std is 0 (or NaN)
    # and the formula would divide by zero.
    if std > 0:
        normal_x = np.linspace(values.min(), values.max(), 100)
        normal_y = np.exp(-0.5 * ((normal_x - mean) / std) ** 2) / (std * np.sqrt(2 * np.pi))
        ax[0].plot(normal_x, normal_y, color="orange")

    # Boxplot to detect outliers
    ax[1].set_title(f"Boxplot of: {numerical_feature}")
    ax[1].boxplot(values, patch_artist=True, boxprops=dict(facecolor="green", color="black"))

    plt.tight_layout()
    plt.show()

## Correct Unknown Values Education/Marriage/Pay

In [None]:
df["Education"].value_counts()

In [None]:
# Fold the Education codes 0, 5 and 6 into code 4 (treated as one category for simplification)

df["Education"] = df["Education"].replace(to_replace=[0, 5, 6], value=4)

# Show the category counts after recoding
print(df["Education"].value_counts())

In [None]:
df["Marriage"].value_counts()

In [None]:
# Fold the Marriage code 0 into code 3 (treated as one category for simplification)

df["Marriage"] = df["Marriage"].replace(to_replace=0, value=3)

# Show the category counts after recoding
print(df["Marriage"].value_counts())

In [None]:
# Collapse the Pay status codes -1 and -2 into 0 (treated as the same status for simplification)

columns_to_replace = [f"Pay_{suffix}" for suffix in (0, 2, 3, 4, 5, 6)]

df[columns_to_replace] = df[columns_to_replace].replace(to_replace=[-1, -2], value=0)

# Spot-check one of the recoded columns
print(df["Pay_6"].value_counts())

## Check Balance

In [None]:
# Count the occurrences of each class in the 'Default' column.
# sort_index() pins the wedge order to ascending class value instead of
# value_counts()' default "most frequent first" ordering.
exit_counts = df["Default"].value_counts().sort_index()

# Labels are looked up by class value, so a wedge cannot be mislabeled even if
# the class frequencies change.
# Assumes 0 = no default and 1 = default, matching the original label order — TODO confirm.
class_names = {0: "Non-Default", 1: "Default"}
labels = [class_names.get(value, str(value)) for value in exit_counts.index]

# Plot the class balance of the 'Default' column as a pie chart
plt.figure(figsize=(6, 6))
plt.pie(exit_counts, labels=labels, autopct="%1.1f%%", startangle=90, colors=["skyblue", "lightgreen"])
plt.title("Overall Default Rate Distribution")
plt.axis("equal")  # Equal aspect ratio ensures that pie is drawn as a circle

plt.show()

# Preprocessing

In [None]:
# Show the current column names (after renaming) for reference when selecting features
df.columns

In [None]:
# Preprocessor

# Columns routed through RobustScaler: the credit limit plus the six
# Bill_amt* and six Pay_amt* columns
features_robust = [
    "Limit_bal",
    *(f"Bill_amt{month}" for month in range(1, 7)),
    *(f"Pay_amt{month}" for month in range(1, 7)),
]

# Target vector and feature matrix
y = df["Default"]
X = df.drop(columns="Default")

# Scale only the selected columns; every other column is passed through unchanged
robust_step = ("robust_scaler", RobustScaler(), features_robust)
preprocessor = ColumnTransformer(transformers=[robust_step], remainder="passthrough")

preprocessor

# Modeling

In [None]:
# Split Data: 70% train, then the remaining 30% is halved into validation and test.
# Both splits are stratified on the target and share the same seed.

split_options = {"random_state": 42}
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30, stratify=y, **split_options)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, **split_options)

In [None]:
tuple(split.shape for split in (X_train, X_val, X_test))

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# fitted preprocessor to the training, validation and test splits
preprocessor.fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    preprocessor.transform(split) for split in (X_train, X_val, X_test)
)

# Oversample with SMOTE on the training split only; validation and test
# splits are left untouched
smote = SMOTE(random_state=42)

X_train_resampled, y_train_resampled = smote.fit_resample(X=X_train_scaled, y=y_train)

In [None]:
# Class counts of the training target after SMOTE resampling
y_train_resampled.value_counts()

In [None]:
# Baseline LightGBM classifier trained on the SMOTE-resampled training data
model = LGBMClassifier(learning_rate=0.05, objective="binary")

# The (non-resampled) validation split is passed as the evaluation set, scored with log-loss
validation_pair = (X_val_scaled, y_val)
model.fit(X_train_resampled, y_train_resampled, eval_set=[validation_pair], eval_metric="logloss")

# Validation predictions: positive-class probabilities and hard class labels
y_pred_proba_val = model.predict_proba(X_val_scaled)[:, 1]
y_pred_val = model.predict(X_val_scaled)

# AUC is computed from the probabilities
auc_val = roc_auc_score(y_val, y_pred_proba_val)
print("Validation AUC:", auc_val)

# Label-based metrics are computed from the hard predictions
class_report = classification_report(y_val, y_pred_val)
conf_matrix = confusion_matrix(y_val, y_pred_val)

print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

In [None]:
# Evaluate the baseline model on the held-out test split
y_pred_proba_test = model.predict_proba(X_test_scaled)[:, 1]  # positive-class probabilities
y_pred_test = model.predict(X_test_scaled)

# AUC is computed from the probabilities
auc_test = roc_auc_score(y_test, y_pred_proba_test)
print("Test AUC:", auc_test)

# Label-based metrics are computed from the hard predictions
class_report_test = classification_report(y_test, y_pred_test)
conf_matrix_test = confusion_matrix(y_test, y_pred_test)

print("\nTest Confusion Matrix:\n", conf_matrix_test)
print("\nTest Classification Report:\n", class_report_test)

In [None]:
# Define the LightGBM model.
# NOTE: GridSearchCV fits clones, so `model` itself stays unfitted; use
# `best_model` (defined below) for predictions.
model = LGBMClassifier(objective="binary", force_row_wise=True)

# Run SMOTE inside the pipeline so that, during cross-validation, oversampling
# is applied to each training fold only. Cross-validating on data that was
# resampled up front puts synthetic points (interpolated from training rows)
# into the validation folds and inflates the CV score.
# The sampler is skipped at predict time, so the fitted pipeline can score
# the untouched validation/test splits directly.
pipeline = Pipeline(
    steps=[
        ("smote", SMOTE(random_state=42)),
        ("model", model),
    ]
)

# Define the parameter grid (the "model__" prefix routes each parameter to the LightGBM step)
param_grid = {
    "model__boosting_type": ["gbdt", "dart"],
    "model__learning_rate": [0.01, 0.05, 0.1],
}

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring="roc_auc", cv=2, verbose=1, n_jobs=-1)

# Fit on the scaled but NOT resampled training data; resampling happens per fold inside the pipeline
grid_search.fit(X_train_scaled, y_train)

# Best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Cross-Validation AUC Score:", best_score)

# Validate on the validation set using the best estimator from GridSearchCV
# (the pipeline refit on the full training split with the best parameters)
best_model = grid_search.best_estimator_
y_pred_val = best_model.predict(X_val_scaled)
y_pred_proba_val = best_model.predict_proba(X_val_scaled)[:, 1]  # Get probabilities for the positive class

# Calculate AUC score
auc_val = roc_auc_score(y_val, y_pred_proba_val)
print("Validation AUC:", auc_val)

# Confusion Matrix and Classification Report
conf_matrix = confusion_matrix(y_val, y_pred_val)
class_report = classification_report(y_val, y_pred_val)

print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

In [None]:
# Evaluate the tuned model on the held-out test set.
# `best_model` (grid_search.best_estimator_) is the fitted estimator; `model`
# is only the unfitted template that was handed to GridSearchCV, which fits
# clones, so predicting with `model` here would fail with a not-fitted error.
y_pred_test = best_model.predict(X_test_scaled)
y_pred_proba_test = best_model.predict_proba(X_test_scaled)[:, 1]  # Get probabilities for the positive class

# Calculate AUC score on test set
auc_test = roc_auc_score(y_test, y_pred_proba_test)
print("Test AUC:", auc_test)

# Confusion Matrix and Classification Report for test set
conf_matrix_test = confusion_matrix(y_test, y_pred_test)
class_report_test = classification_report(y_test, y_pred_test)

print("\nTest Confusion Matrix:\n", conf_matrix_test)
print("\nTest Classification Report:\n", class_report_test)