## Data Load and Setup

In [91]:
import pandas as pd
from pathlib import Path

DATA_PATH = Path("../data/german_credit_cleaned.csv")
df = pd.read_csv(DATA_PATH)
df.shape

(1000, 21)

## Exploratory Data Analysis (EDA)

In [92]:
# Check the balance between "good" and "bad" credit
df["target"].value_counts(normalize=True)

target
good    0.7
bad     0.3
Name: proportion, dtype: float64

In [93]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,1000.0,20.903,12.058814,4.0,12.0,18.0,24.0,72.0
loan_amt,1000.0,3271.258,2822.736876,250.0,1365.5,2319.5,3972.25,18424.0
installment_rate,1000.0,2.973,1.118715,1.0,2.0,3.0,4.0,4.0
present_residence_since,1000.0,2.845,1.103718,1.0,2.0,3.0,4.0,4.0
age,1000.0,35.546,11.375469,19.0,27.0,33.0,42.0,75.0
num_curr_loans,1000.0,1.407,0.577654,1.0,1.0,1.0,2.0,4.0
num_people_provide_maint,1000.0,1.155,0.362086,1.0,1.0,1.0,1.0,2.0


In [94]:
# Check for missing values across all columns
df.isnull().sum().sort_values(ascending=False)

checking_acc_status         0
property                    0
is_foreign_worker           0
telephone                   0
num_people_provide_maint    0
job                         0
num_curr_loans              0
housing                     0
other_installment_plans     0
age                         0
present_residence_since     0
duration                    0
other_debtors_guarantors    0
personal_stat_gender        0
installment_rate            0
present_employment_since    0
saving_acc_bonds            0
loan_amt                    0
purpose                     0
cred_hist                   0
target                      0
dtype: int64

In [95]:
# Identify numeric and categorical features
num_cols = df.select_dtypes(include=["number"]).columns.tolist()
cat_cols = df.select_dtypes(exclude=["number"]).columns.tolist()

len(num_cols), len(cat_cols), num_cols[:5], cat_cols[:5]

(7,
 14,
 ['duration',
  'loan_amt',
  'installment_rate',
  'present_residence_since',
  'age'],
 ['checking_acc_status',
  'cred_hist',
  'purpose',
  'saving_acc_bonds',
  'present_employment_since'])

In [96]:
# Count unique values in each categorical column
df[cat_cols].nunique().sort_values()

telephone                    2
is_foreign_worker            2
target                       2
other_debtors_guarantors     3
other_installment_plans      3
housing                      3
checking_acc_status          4
personal_stat_gender         4
property                     4
job                          4
cred_hist                    5
saving_acc_bonds             5
present_employment_since     5
purpose                     10
dtype: int64

In [97]:
## Feature Engineering & Data Preparation

In [98]:
# Separate numeric categorical vs continuous numeric features
u = df.nunique()
numeric_categorical = [c for c in num_cols if u[c] <= 10]
numeric_continuous = [c for c in num_cols if u[c] > 10]

numeric_categorical, numeric_continuous

(['installment_rate',
  'present_residence_since',
  'num_curr_loans',
  'num_people_provide_maint'],
 ['duration', 'loan_amt', 'age'])

In [99]:
# Inspect values of a numeric categorical feature
df["installment_rate"].unique()

array([4, 2, 3, 1])

In [100]:
# Convert numeric categorical features to category dtype (for clarity)
for c in numeric_categorical:
    df[c] = df[c].astype("category")

df[numeric_categorical].dtypes

installment_rate            category
present_residence_since     category
num_curr_loans              category
num_people_provide_maint    category
dtype: object

In [101]:
cat_cols

['checking_acc_status',
 'cred_hist',
 'purpose',
 'saving_acc_bonds',
 'present_employment_since',
 'personal_stat_gender',
 'other_debtors_guarantors',
 'property',
 'other_installment_plans',
 'housing',
 'job',
 'telephone',
 'is_foreign_worker',
 'target']

In [102]:
# Remove target from categorical features and check unique category counts
cat_cols = [c for c in cat_cols if c != "target"]

df[cat_cols].nunique().sort_values()

telephone                    2
is_foreign_worker            2
other_debtors_guarantors     3
other_installment_plans      3
housing                      3
checking_acc_status          4
personal_stat_gender         4
property                     4
job                          4
cred_hist                    5
saving_acc_bonds             5
present_employment_since     5
purpose                     10
dtype: int64

In [103]:
# Inspect all categorical feature values to verify categories and spot formatting issues
for c in cat_cols:
    print(c, "→", df[c].unique())

checking_acc_status → ['below_0' 'below_200' 'no_cheking_acc' 'above:200']
cred_hist → ['risky_acc_or_curr_loan_other' 'curr_loans_paid_duly' 'delay_in_past'
 'no_loan_or_paid_duly_other' 'paid_duly_this_bank']
purpose → ['radio_tv' 'education' 'furniture_equipment' 'car_new' 'car_used'
 'business' 'domestic_applience' 'repairs' 'others' 'retraining']
saving_acc_bonds → ['unknown_no_saving_acc' 'below_100' 'below_1000' 'above_1000' 'below_500']
present_employment_since → ['above_7y' 'below_4y' 'below_7y' 'unemployed' 'below_1y']
personal_stat_gender → ['male:single' 'female:divorced_or_married' 'male:divorced'
 'male:married_or_widowed']
other_debtors_guarantors → ['none' 'guarantor' 'co_applicant']
property → ['real_estate' 'life_insurance_or_aggreements' 'unknown_or_no_property'
 'car_or_other']
other_installment_plans → ['none' 'bank' 'store']
housing → ['own' 'for_free' 'rent']
job → ['skilled_official' 'unskilled_resident' 'management_or_self_emp'
 'unemployed_non_resident']
telep

In [104]:
# Encode 'checking_acc_status' as ordered categories (higher = better financial standing)
checking_map = {"no_cheking_acc": 0, "below_0": 1, "below_200": 2, "above:200": 3}

df["checking_acc_status"] = df["checking_acc_status"].map(checking_map)

In [105]:
df["saving_acc_bonds"].unique()

array(['unknown_no_saving_acc', 'below_100', 'below_1000', 'above_1000',
       'below_500'], dtype=object)

In [106]:
# Encode 'saving_acc_bonds' (higher = more savings)
saving_map = {
    "unknown_no_saving_acc": 0,
    "below_100": 1,
    "below_500": 2,
    "below_1000": 3,
    "above_1000": 4,
}
df["saving_acc_bonds"] = df["saving_acc_bonds"].map(saving_map)
df["saving_acc_bonds"].unique()

array([0, 1, 3, 4, 2])

In [107]:
df["present_employment_since"].unique()

array(['above_7y', 'below_4y', 'below_7y', 'unemployed', 'below_1y'],
      dtype=object)

In [108]:
# Encode 'present_employment_since' (higher = longer employment stability)
employment_map = {
    "unemployed": 0,
    "below_1y": 1,
    "below_4y": 2,
    "below_7y": 3,
    "above_7y": 4,
}
df["present_employment_since"] = df["present_employment_since"].map(employment_map)
df["present_employment_since"].unique()

array([4, 2, 3, 0, 1])

In [109]:
df["personal_stat_gender"].unique()

array(['male:single', 'female:divorced_or_married', 'male:divorced',
       'male:married_or_widowed'], dtype=object)

In [110]:
# Split 'personal_stat_gender' into 'gender' and 'personal_status'
df[["gender", "personal_status"]] = df["personal_stat_gender"].str.split(
    ":", expand=True
)

df[["gender", "personal_status"]].head()

Unnamed: 0,gender,personal_status
0,male,single
1,female,divorced_or_married
2,male,single
3,male,single
4,male,single


In [111]:
# Split 'personal_stat_gender' into two separate categorical features
df.drop(columns=["personal_stat_gender"], inplace=True)

df[["gender", "personal_status"]].nunique()

gender             2
personal_status    4
dtype: int64

In [112]:
# Update categorical feature list after splitting
cat_cols.remove("personal_stat_gender")
cat_cols.extend(["gender", "personal_status"])

In [113]:
# One-hot encode remaining categorical features
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

df_encoded.shape

(1000, 50)

In [114]:
# Make label numeric and verify all features are numeric
df_encoded["target"] = df["target"].map({"bad": 0, "good": 1})

df_encoded.select_dtypes(exclude=["number"]).columns.tolist()

['installment_rate',
 'present_residence_since',
 'num_curr_loans',
 'num_people_provide_maint',
 'checking_acc_status_1',
 'checking_acc_status_2',
 'checking_acc_status_3',
 'cred_hist_delay_in_past',
 'cred_hist_no_loan_or_paid_duly_other',
 'cred_hist_paid_duly_this_bank',
 'cred_hist_risky_acc_or_curr_loan_other',
 'purpose_car_new',
 'purpose_car_used',
 'purpose_domestic_applience',
 'purpose_education',
 'purpose_furniture_equipment',
 'purpose_others',
 'purpose_radio_tv',
 'purpose_repairs',
 'purpose_retraining',
 'saving_acc_bonds_1',
 'saving_acc_bonds_2',
 'saving_acc_bonds_3',
 'saving_acc_bonds_4',
 'present_employment_since_1',
 'present_employment_since_2',
 'present_employment_since_3',
 'present_employment_since_4',
 'other_debtors_guarantors_guarantor',
 'other_debtors_guarantors_none',
 'property_life_insurance_or_aggreements',
 'property_real_estate',
 'property_unknown_or_no_property',
 'other_installment_plans_none',
 'other_installment_plans_store',
 'housing_