In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# from databricks.connect import DatabricksSession

# # Create a Spark session connected to your Databricks cluster
# spark = DatabricksSession.builder.profile("dbc-df5087bc-8b50").getOrCreate()

# # Read the table from Databricks
# df = spark.read.csv("dbfs:/Volumes/maven/default/data/data.csv", header=True, inferSchema=True)

# # Show the first 5 rows
# df.show(5)

In [None]:
# Location of the raw CSV, relative to the notebook's working directory
filepath = "../data/data.csv"

# Read the file into a DataFrame and preview its first rows
df = pd.read_csv(filepath_or_buffer=filepath)
df.head(n=5)

In [None]:
# Print a concise summary of the DataFrame (output of DataFrame.info())

df.info()

In [None]:
# Give the long target column a short name; it is capitalized in a later cell
column_renames = {"default.payment.next.month": "default"}
df = df.rename(columns=column_renames)

In [None]:
# Normalise header casing: first character upper-case, the rest lower-case
df.columns = df.columns.map(str.capitalize)

# Preview the frame with its new headers
df.head()

In [None]:
# Descriptive statistics for the numerical columns

df.describe()

In [None]:
# Distribution of every non-object column: a histogram with a fitted normal
# curve on the left and a boxplot on the right.

# Selecting numerical features from the DataFrame
df_num = df.select_dtypes(exclude=["object"])

# Loop through each numerical feature
for numerical_feature in df_num.columns:
    # Look the column up once and reuse its summary statistics below
    values = df_num[numerical_feature]
    mean, std = values.mean(), values.std()

    # Creating two subplots per numerical_feature
    fig, ax = plt.subplots(1, 2, figsize=(10, 3))

    # Histogram, normalised to a density so it shares its y-scale with the
    # probability-density curve drawn on top of it
    ax[0].set_title(f"Distribution of: {numerical_feature}")
    ax[0].hist(values, bins=30, density=True, color="blue", alpha=0.7, edgecolor="black")

    # Normal pdf parameterised by the sample mean/std. This is a reference
    # curve, not a kernel density estimate. It is skipped when std is zero or
    # NaN, because the formula would divide by zero.
    if std > 0:
        pdf_x = np.linspace(values.min(), values.max(), 100)
        pdf_y = np.exp(-0.5 * ((pdf_x - mean) / std) ** 2) / (std * np.sqrt(2 * np.pi))
        ax[0].plot(pdf_x, pdf_y, color="orange")

    # Boxplot to detect outliers
    ax[1].set_title(f"Boxplot of: {numerical_feature}")
    ax[1].boxplot(values, patch_artist=True, boxprops=dict(facecolor="green", color="black"))

    plt.tight_layout()
    plt.show()
    # Release the figure so figures do not accumulate across loop iterations
    plt.close(fig)

In [None]:
# Frequency of each Education code before recoding
df["Education"].value_counts()

In [None]:
# Collapse the Education codes 0, 5 and 6 into code 4
df["Education"] = df["Education"].replace([0, 5, 6], 4)

# Show the category counts after recoding
print(df["Education"].value_counts())

In [None]:
# Frequency of each Marriage code before recoding
df["Marriage"].value_counts()

In [None]:
# Recode Marriage value 0 as 3
df["Marriage"] = df["Marriage"].replace(0, 3)

# Show the category counts after recoding
print(df["Marriage"].value_counts())

In [None]:
# Recode Marriage value 0 as 3. This repeats the preceding cell; once that
# cell has run no zeros remain, so the replacement here changes nothing.
df["Marriage"] = df["Marriage"].replace(to_replace=0, value=3)

# Show the category counts after recoding
print(df["Marriage"].value_counts())

In [None]:
# Repayment-status columns (there is no Pay_1 in this list)
columns_to_replace = ["Pay_0"] + [f"Pay_{month}" for month in range(2, 7)]

# Fold the codes -1 and -2 into 0 across all of those columns
df[columns_to_replace] = df[columns_to_replace].replace([-1, -2], 0)

# Show the value counts of one recoded column as a spot check
print(df["Pay_6"].value_counts())

In [None]:
# Mean of Default per Sex value, ordered from lowest to highest
df.groupby("Sex")[["Default"]].mean().sort_values(by="Default")

In [None]:
# Display the current column labels
df.columns

In [None]:
# Name of the label column
target = "Default"

# Feature column names: five profile fields, then the repayment-status,
# bill-amount and payment-amount series (the status series has no Pay_1).
predictors = (
    ["Limit_bal", "Sex", "Education", "Marriage", "Age"]
    + [f"Pay_{month}" for month in (0, 2, 3, 4, 5, 6)]
    + [f"Bill_amt{month}" for month in range(1, 7)]
    + [f"Pay_amt{month}" for month in range(1, 7)]
)

In [None]:
# Feature matrix: every column except the label and the Id column
X = df.drop(["Default", "Id"], axis="columns")

In [None]:
# Label vector
y = df.loc[:, "Default"]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the split is seeded and stratified on y
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [None]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score  # Import roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split

# Baseline: LightGBM classifier selected by cross-validated AUC, then scored
# on the held-out validation set.

# Features are every column except the label and the Id column
X = df.drop(columns=["Default", "Id"])
y = df["Default"]

# Split the dataset. stratify=y matches the split made in the earlier cell;
# without it this cell would replace that split with a non-stratified one and
# every later cell would train/evaluate on it.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Create LGBMClassifier
model = LGBMClassifier(force_row_wise=True)

# Define the parameter grid (a single combination)
param_grid = {
    "boosting_type": ["gbdt"],
    "objective": ["binary"],
    "learning_rate": [0.05],
    # NOTE(review): 50 is a fixed value, not derived from the class ratio in
    # y_train. Confirm it is the intended weight.
    "scale_pos_weight": [50],  # Adjust for class imbalance
}

# Create the GridSearchCV object
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="roc_auc",  # Use AUC as the scoring metric
    cv=2,
    verbose=1,
    n_jobs=-1,
)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Cross-Validation AUC Score:", best_score)

# Validate the model on the validation set
best_model = grid_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_valid)[:, 1]  # Get probabilities for the positive class
auc = roc_auc_score(y_valid, y_pred_proba)

print("Validation AUC:", auc)

In [None]:
from lightgbm import LGBMClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV

# Grid-search a LightGBM classifier by AUC, then sigmoid-calibrate the selected
# configuration and report validation metrics for the calibrated model.

# Create LGBMClassifier
model = LGBMClassifier(force_row_wise=True)

# Define the parameter grid
param_grid = {
    "boosting_type": ["gbdt"],
    "objective": ["binary"],
    "learning_rate": [0.05],
    "scale_pos_weight": [50],  # Adjust this based on class imbalance
}

# Create the GridSearchCV object
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="roc_auc",  # Use AUC as the scoring metric
    cv=2,
    verbose=1,
    n_jobs=-1,
)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Cross-Validation Score (AUC):", best_score)

# Estimator carrying the hyperparameters selected by the grid search
best_model = grid_search.best_estimator_

# Calibrate the tuned estimator. Wrapping the bare `model` instead would
# calibrate a classifier with default hyperparameters, so the metrics below
# would not describe the configuration printed above.
calibrated_model = CalibratedClassifierCV(estimator=best_model, method="sigmoid")

# Fit the calibrated model on the training data
calibrated_model.fit(X_train, y_train)

# Get predicted classes and probabilities
y_pred = calibrated_model.predict(X_valid)
y_pred_proba = calibrated_model.predict_proba(X_valid)[:, 1]  # Get probabilities for the positive class

# Calculate AUC
auc = roc_auc_score(y_valid, y_pred_proba)

# Generate confusion matrix
conf_matrix = confusion_matrix(y_valid, y_pred)

# Generate classification report
class_report = classification_report(y_valid, y_pred, output_dict=True)

# Extract the positive-class metrics (the report dict is keyed by the label as a string)
f1 = class_report["1"]["f1-score"]
recall = class_report["1"]["recall"]
precision = class_report["1"]["precision"]

# Print results
print("Validation AUC:", auc)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", classification_report(y_valid, y_pred))
print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")

In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV

# Variant of the grid search that sets is_unbalance=True on the classifier
# instead of searching a fixed scale_pos_weight, and tries two learning rates.
model = LGBMClassifier(is_unbalance=True, force_row_wise=True)

# Hyperparameter candidates (scale_pos_weight is deliberately not searched here)
param_grid = dict(
    boosting_type=["gbdt"],
    objective=["binary"],
    learning_rate=[0.05, 0.1],
)

# 2-fold cross-validated search scored by ROC AUC, using all available cores
grid_search = GridSearchCV(model, param_grid, scoring="roc_auc", n_jobs=-1, cv=2, verbose=1)
grid_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated score
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Cross-Validation AUC Score:", best_score)

# Score the refit best estimator on the held-out validation set,
# using the predicted probability of the positive class (column 1)
best_model = grid_search.best_estimator_
y_pred_proba = best_model.predict_proba(X_valid)[:, 1]
auc = roc_auc_score(y_valid, y_pred_proba)
print("Validation AUC:", auc)

In [None]:
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV

# Grid-search a LightGBM classifier by AUC, then sigmoid-calibrate the selected
# configuration and report validation metrics for the calibrated model.
# This cell runs the same search/calibration workflow as the earlier
# calibration cell in this notebook.

# Create LGBMClassifier
model = LGBMClassifier(force_row_wise=True)

# Define the parameter grid
param_grid = {
    "boosting_type": ["gbdt"],
    "objective": ["binary"],
    "learning_rate": [0.05],
    "scale_pos_weight": [50],  # Adjust this based on class imbalance
}

# Create the GridSearchCV object
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="roc_auc",  # Use AUC as the scoring metric
    cv=2,
    verbose=1,
    n_jobs=-1,
)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Cross-Validation Score (AUC):", best_score)

# Estimator carrying the hyperparameters selected by the grid search
best_model = grid_search.best_estimator_

# Calibrate the tuned estimator. Wrapping the bare `model` instead would
# calibrate a classifier with default hyperparameters, so the metrics below
# would not describe the configuration printed above.
calibrated_model = CalibratedClassifierCV(estimator=best_model, method="sigmoid")

# Fit the calibrated model on the training data
calibrated_model.fit(X_train, y_train)

# Get predicted classes and probabilities
y_pred = calibrated_model.predict(X_valid)
y_pred_proba = calibrated_model.predict_proba(X_valid)[:, 1]  # Get probabilities for the positive class

# Calculate AUC
auc = roc_auc_score(y_valid, y_pred_proba)

# Generate confusion matrix
conf_matrix = confusion_matrix(y_valid, y_pred)

# Generate classification report
class_report = classification_report(y_valid, y_pred, output_dict=True)

# Extract the positive-class metrics (the report dict is keyed by the label as a string)
f1 = class_report["1"]["f1-score"]
recall = class_report["1"]["recall"]
precision = class_report["1"]["precision"]

# Print results
print("Validation AUC:", auc)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", classification_report(y_valid, y_pred))
print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")