In [1]:
from pathlib import Path
import pandas as pd
from loguru import logger
import typer

from ml_classification.config import SILVER_DATA_DIR, GOLD_DATA_DIR

[32m2025-08-17 08:07:48.463[0m | [1mINFO    [0m | [36mml_classification.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Users\flavi\git\ml-default-payment-project[0m


In [5]:
# Load the silver-layer credit-card default dataset.
# The location comes from the project config (SILVER_DATA_DIR, imported above)
# instead of a string relative to the kernel's working directory, so the cell
# no longer depends on where the notebook was launched from.
# NOTE(review): assumes SILVER_DATA_DIR is the same `data/silver` folder that
# '../data/silver' used to resolve to — confirm against ml_classification.config.
df = pd.read_parquet(Path(SILVER_DATA_DIR) / "credit_card_default.parquet")

In [9]:
# Preview the first five rows of the freshly loaded frame as a rich table.
display(df.head(n=5))

Unnamed: 0,limit_bal,sex,education,marriage,age,pay_0,pay_2,pay_3,pay_4,pay_5,...,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,default_payment_next_month
0,20000.0,2,2,1,24,2,2,-1,-1,-2,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,120000.0,2,2,2,26,-1,2,0,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,90000.0,2,2,2,34,0,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,50000.0,2,2,1,37,0,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,50000.0,1,2,1,57,-1,0,-1,0,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [10]:
# Bucket `age` into four labelled bands.
# pd.cut builds right-closed intervals here: (17, 25], (25, 35], (35, 50], (50, 120].
# Consequently age 50 is labelled "36-50" and "50+" effectively starts at 51;
# ages outside (17, 120] come out as NaN.
labels = ["18-25", "26-35", "36-50", "50+"]
bins = [17, 25, 35, 50, 120]
df["age_bin"] = pd.cut(x=df["age"], bins=bins, labels=labels, right=True)

In [16]:
# Bill trend: bill_amt6 minus bill_amt1, computed row by row.
# NOTE(review): the sign of the "trend" depends on which of bill_amt1 / bill_amt6
# is the more recent statement — confirm against the dataset documentation.
df["bill_trend"] = df["bill_amt6"].sub(df["bill_amt1"])

print("Data after feature engineering:")
display(df["bill_trend"])

Data after feature engineering:


0         -3913.0
1           579.0
2        -13690.0
3        -17443.0
4         10514.0
           ...   
29995   -172968.0
29996     -1683.0
29997     15792.0
29998     50589.0
29999    -32616.0
Name: bill_trend, Length: 30000, dtype: float64

In [17]:
# Pay ratio: sum of the six payment amounts divided by the sum of the six bill amounts.
pay_cols = [f"pay_amt{month}" for month in range(1, 7)]
bill_cols = [f"bill_amt{month}" for month in range(1, 7)]

total_pay = df[pay_cols].sum(axis=1)
total_bill = df[bill_cols].sum(axis=1)

# A total bill of exactly 0 is swapped for 1 before dividing, so those rows get
# pay_ratio == total_pay instead of inf/NaN. Non-zero totals are used as-is
# (a negative total, if any occur, yields a negative ratio).
df["pay_ratio"] = total_pay.div(total_bill.replace(0, 1))

display(df["pay_ratio"])

0        0.089434
1        0.292791
2        0.108388
3        0.036259
4        0.540054
           ...   
29995    0.058661
29996    0.684071
29997    0.443997
29998    0.552044
29999    0.035985
Name: pay_ratio, Length: 30000, dtype: float64

In [18]:
# Reload the persisted gold-layer feature table and rebind `df` to it: from this
# point on `df` is the frame read from disk, not the one built in the cells above
# (the stored table also carries a `utilization` column, as its preview shows).
# The location comes from the project config (GOLD_DATA_DIR, imported above)
# instead of a string relative to the kernel's working directory.
# NOTE(review): assumes GOLD_DATA_DIR is the same `data/gold` folder that
# '../data/gold' used to resolve to — confirm against ml_classification.config.
df = pd.read_parquet(Path(GOLD_DATA_DIR) / "credit_card_default_features.parquet")

In [19]:
# Preview the first five rows of the gold feature table as a rich table.
display(df.head(n=5))

Unnamed: 0,limit_bal,sex,education,marriage,age,pay_0,pay_2,pay_3,pay_4,pay_5,...,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,default_payment_next_month,age_bin,bill_trend,pay_ratio,utilization
0,20000.0,2,2,1,24,2,2,-1,-1,-2,...,689.0,0.0,0.0,0.0,0.0,1,18-25,-3913.0,0.089434,0.0
1,120000.0,2,2,2,26,-1,2,0,0,0,...,1000.0,1000.0,1000.0,0.0,2000.0,1,26-35,579.0,0.292791,0.027175
2,90000.0,2,2,2,34,0,0,0,0,0,...,1500.0,1000.0,1000.0,1000.0,5000.0,0,26-35,-13690.0,0.108388,0.172767
3,50000.0,2,2,1,37,0,0,0,0,0,...,2019.0,1200.0,1100.0,1069.0,1000.0,0,36-50,-17443.0,0.036259,0.59094
4,50000.0,1,2,1,57,-1,0,-1,0,0,...,36681.0,10000.0,9000.0,689.0,679.0,0,50+,10514.0,0.540054,0.38262
