In [7]:
import os, sys
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import lightgbm as lgb


In [8]:
# Read raw data
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.data.read_data import load_and_label_raw_data, replace_categorical_labels
raw_path = "../data/raw/german/german.data"
df = load_and_label_raw_data(raw_path)
df_label = replace_categorical_labels(df)

# Train test split
X = df_label.drop(columns=['target'])
y = df_label['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=123
)

# Feature Groups
ordinal_features = {
    'account_status': ["no checking account", "< 0 DM", "0 <= ... < 200 DM", ">= 200 DM / salary assignment"],
    'savings': ["unknown", "< 100 DM", "100-500 DM", "500-1000 DM", ">= 1000 DM"],
    'employment_yr': ["unemployed", "< 1 year", "1-4 years", "4-7 years", ">= 7 years"],
    'job': ["unskilled-nonresident", "unskilled-resident", "skilled", "management"]
}
ordinal_cols = list(ordinal_features.keys())

onehot_cols = [
    'credit_history', 'purpose', 'personal_status_sex', 'other_debtors', 
    'property', 'other_installment_plans', 'telephone', 'foreign_worker',
    'housing'
]

numeric_cols = ['duration_mon', 'credit_amount', 'age']

# From EDA: Treat these as categorical (though they are int)
discrete_as_categorical = ['installment_rate', 'residence_since', 'existing_credits', 'num_liable_people']
onehot_cols += [col for col in discrete_as_categorical if col not in onehot_cols]

# Setup preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('ord', OrdinalEncoder(categories=[ordinal_features[col] for col in ordinal_cols]), ordinal_cols),
        ('ohe', OneHotEncoder(drop='first', sparse_output=False), onehot_cols),
        ('num', StandardScaler(), numeric_cols)
    ],
    remainder='drop'
)

In [9]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(max_depth=5, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "LightGBM": lgb.LGBMClassifier(random_state=42)
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

results = []

for name, model in models.items():
    print(f"Running CV for {name}...")

    pipeline = Pipeline([
        ('preprocess', preprocessor),
        ('select', SelectKBest(score_func=f_classif, k=20)),
        ('model', model)
    ])

    scores = cross_validate(pipeline, X, y, cv=cv,
                             scoring=['recall', 'accuracy', 'f1'],
                             return_train_score=False)

    results.append({
        'model': name,
        'recall': f"{scores['test_recall'].mean():.4f} ± {scores['test_recall'].std():.4f}",
        'accuracy': f"{scores['test_accuracy'].mean():.4f} ± {scores['test_accuracy'].std():.4f}",
        'f1': f"{scores['test_f1'].mean():.4f} ± {scores['test_f1'].std():.4f}"
    })

# ------------------- Show Results -------------------
df_results = pd.DataFrame(results)
df_results = df_results.sort_values(by="recall", ascending=False)
df_results

Running CV for Logistic Regression...
Running CV for Decision Tree...
Running CV for Random Forest...
Running CV for XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Running CV for LightGBM...
[LightGBM] [Info] Number of positive: 560, number of negative: 240
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000471 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 366
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.700000 -> initscore=0.847298
[LightGBM] [Info] Start training from score 0.847298
[LightGBM] [Info] Number of positive: 560, number of negative: 240
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000182 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 369
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 20
[LightG



[LightGBM] [Info] Number of positive: 560, number of negative: 240
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000104 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 369
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.700000 -> initscore=0.847298
[LightGBM] [Info] Start training from score 0.847298
[LightGBM] [Info] Number of positive: 560, number of negative: 240
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000089 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 369
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 20
[LightGBM] [Info] [binary:BoostFro



Unnamed: 0,model,recall,accuracy,f1
0,Logistic Regression,0.8971 ± 0.0107,0.7250 ± 0.0152,0.8204 ± 0.0092
2,Random Forest,0.8929 ± 0.0186,0.7520 ± 0.0144,0.8344 ± 0.0107
4,LightGBM,0.8614 ± 0.0205,0.7380 ± 0.0204,0.8215 ± 0.0144
1,Decision Tree,0.8457 ± 0.0291,0.7120 ± 0.0169,0.8043 ± 0.0096
3,XGBoost,0.8457 ± 0.0205,0.7250 ± 0.0155,0.8114 ± 0.0118
