## Data Load and Setup

In [7]:
import pandas as pd
from pathlib import Path

# CSV file to load. The path is relative, so it resolves against the
# kernel's current working directory rather than the notebook's location.
DATA_PATH = Path("../data/german_credit_cleaned.csv")

# Read the whole file into a DataFrame using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Cell output: (number of rows, number of columns).
df.shape

(1000, 21)

## Exploratory Data Analysis (EDA)

In [8]:
# Class balance of the label column: the fraction of rows (not the raw
# count) for each distinct value of "target", largest share first.
(
    df.loc[:, "target"]
    .value_counts(normalize=True)
)

target
good    0.7
bad     0.3
Name: proportion, dtype: float64

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) from describe()
# with its default arguments, transposed so that each summarised column
# becomes one row of the displayed table.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,1000.0,20.903,12.058814,4.0,12.0,18.0,24.0,72.0
loan_amt,1000.0,3271.258,2822.736876,250.0,1365.5,2319.5,3972.25,18424.0
installment_rate,1000.0,2.973,1.118715,1.0,2.0,3.0,4.0,4.0
present_residence_since,1000.0,2.845,1.103718,1.0,2.0,3.0,4.0,4.0
age,1000.0,35.546,11.375469,19.0,27.0,33.0,42.0,75.0
num_curr_loans,1000.0,1.407,0.577654,1.0,1.0,1.0,2.0,4.0
num_people_provide_maint,1000.0,1.155,0.362086,1.0,1.0,1.0,1.0,2.0


In [10]:
# Number of missing entries in every column, highest count first.
# isna() flags each missing cell as True; sum() then adds the flags per column.
(
    df.isna()
    .sum()
    .sort_values(ascending=False)
)

checking_acc_status         0
property                    0
is_foreign_worker           0
telephone                   0
num_people_provide_maint    0
job                         0
num_curr_loans              0
housing                     0
other_installment_plans     0
age                         0
present_residence_since     0
duration                    0
other_debtors_guarantors    0
personal_stat_gender        0
installment_rate            0
present_employment_since    0
saving_acc_bonds            0
loan_amt                    0
purpose                     0
cred_hist                   0
target                      0
dtype: int64

In [11]:
# Split the column names by dtype: numeric columns vs. everything else.
# NOTE(review): cat_cols is built from *all* non-numeric columns, so it also
# contains the "target" label, not only predictor columns. Exclude it before
# treating cat_cols as a list of model inputs.
num_cols = list(df.select_dtypes(include="number").columns)
cat_cols = list(df.select_dtypes(exclude="number").columns)

# Cell output: the size of each list plus a preview of its first five names.
(len(num_cols), len(cat_cols), num_cols[:5], cat_cols[:5])

(7,
 14,
 ['duration',
  'loan_amt',
  'installment_rate',
  'present_residence_since',
  'age'],
 ['checking_acc_status',
  'cred_hist',
  'purpose',
  'saving_acc_bonds',
  'present_employment_since'])

In [12]:
# Cardinality check: number of distinct values in each column listed in
# cat_cols, ordered from the fewest distinct values to the most.
(
    df.loc[:, cat_cols]
    .nunique()
    .sort_values(ascending=True)
)

telephone                    2
is_foreign_worker            2
target                       2
other_debtors_guarantors     3
other_installment_plans      3
housing                      3
checking_acc_status          4
personal_stat_gender         4
property                     4
job                          4
cred_hist                    5
saving_acc_bonds             5
present_employment_since     5
purpose                     10
dtype: int64