In [1]:
import pandas as pd
import numpy as np
import os
import sys
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [2]:
# Read raw data
# Put the parent of the current working directory on sys.path (once) so that
# the `src` package imported just below can be resolved.
parent_of_cwd = os.path.join(os.getcwd(), "..")
project_root = os.path.abspath(parent_of_cwd)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# This import has to follow the sys.path tweak above, which is why it is not
# in the import cell at the top of the notebook.
from src.data.read_data import load_and_label_raw_data, replace_categorical_labels

# Load the raw file, then pass it through the label-replacement helper.
# NOTE(review): both helpers are project code not shown here; judging by their
# names they name the columns and map category codes to readable labels.
raw_path = "../data/raw/german/german.data"
df = load_and_label_raw_data(raw_path)
df_label = replace_categorical_labels(df)

# Train test split
# Separate the label column from the predictor columns.
y = df_label['target']
X = df_label.drop(columns=['target'])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
    stratify=y,
)

# Feature Groups
# Ordered categoricals: for every feature the list spells out its levels in
# the order handed to the ordinal encoder (first entry -> lowest code).
ordinal_features = {
    'account_status': [
        "no checking account",
        "< 0 DM",
        "0 <= ... < 200 DM",
        ">= 200 DM / salary assignment",
    ],
    'savings': [
        "unknown",
        "< 100 DM",
        "100-500 DM",
        "500-1000 DM",
        ">= 1000 DM",
    ],
    'employment_yr': [
        "unemployed",
        "< 1 year",
        "1-4 years",
        "4-7 years",
        ">= 7 years",
    ],
    'job': [
        "unskilled-nonresident",
        "unskilled-resident",
        "skilled",
        "management",
    ],
}
ordinal_cols = [*ordinal_features]

# Continuous features that get standardised.
numeric_cols = ['duration_mon', 'credit_amount', 'age']

# From EDA: Treat these as categorical (though they are int)
discrete_as_categorical = ['installment_rate', 'residence_since', 'existing_credits', 'num_liable_people']

# Unordered categoricals, followed by the integer-coded features above (any
# name already present is not added a second time).
nominal_cols = [
    'credit_history',
    'purpose',
    'personal_status_sex',
    'other_debtors',
    'property',
    'other_installment_plans',
    'telephone',
    'foreign_worker',
    'housing',
]
onehot_cols = nominal_cols + [
    name for name in discrete_as_categorical if name not in nominal_cols
]

# Setup preprocessor
# Three column groups, each routed to its own encoder. Any column that is not
# listed in one of the groups is discarded (remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        # Ordered categoricals -> integer codes that follow the explicit level
        # order given in `ordinal_features`. A value outside those lists still
        # raises, since the levels are enumerated by hand.
        ('ord', OrdinalEncoder(categories=[ordinal_features[col] for col in ordinal_cols]), ordinal_cols),
        # Unordered / discrete features -> dummy columns, dropping the first
        # level of each feature. The levels are learned from the training rows
        # only, so a held-out row may carry a level that was never seen during
        # fit; handle_unknown='ignore' keeps transform() from raising in that
        # case. Such a value is encoded as all zeros (the same encoding as the
        # dropped first level) and scikit-learn emits a warning when it happens.
        ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), onehot_cols),
        # Continuous features -> zero mean / unit variance.
        ('num', StandardScaler(), numeric_cols)
    ],
    remainder='drop'
)

# Fit and transform
# The encoders are fitted on the training rows only and then applied,
# unchanged, to the test rows.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Rebuild the output column names by hand. The order below has to mirror the
# order of the transformers inside `preprocessor` (ordinal, one-hot, numeric).
ord_cols = ordinal_cols
ohe_encoder = preprocessor.named_transformers_['ohe']
ohe_cols = ohe_encoder.get_feature_names_out(onehot_cols).tolist()
num_cols = numeric_cols
final_cols = [*ord_cols, *ohe_cols, *num_cols]


def _as_labelled_frame(features, labels):
    """Wrap an encoded feature matrix in a DataFrame and append the target.

    features: 2-D array returned by the preprocessor, one column per entry
        of `final_cols`.
    labels: pandas Series of targets, row-aligned with `features`; only its
        values are used, so the resulting frame keeps a fresh default index.
    """
    frame = pd.DataFrame(features, columns=final_cols)
    frame['target'] = labels.values
    return frame


df_train = _as_labelled_frame(X_train_processed, y_train)
df_test = _as_labelled_frame(X_test_processed, y_test)

In [3]:
# Display the column names of the processed training frame (encoded features
# followed by 'target') as a quick check of the preprocessing output.
df_train.columns

Index(['account_status', 'savings', 'employment_yr', 'job',
       'credit_history_critical account', 'credit_history_delayed payments',
       'credit_history_existing credits paid',
       'credit_history_no credits / all paid back', 'purpose_business',
       'purpose_car (new)', 'purpose_car (used)', 'purpose_education',
       'purpose_furniture', 'purpose_other', 'purpose_radio/TV',
       'purpose_repairs', 'purpose_retraining',
       'personal_status_sex_male-div/sep', 'personal_status_sex_male-married',
       'personal_status_sex_male-single', 'other_debtors_guarantor',
       'other_debtors_none', 'property_life insurance', 'property_real estate',
       'property_unknown', 'other_installment_plans_none',
       'other_installment_plans_stores', 'telephone_yes', 'foreign_worker_yes',
       'housing_own', 'housing_rent', 'installment_rate_2',
       'installment_rate_3', 'installment_rate_4', 'residence_since_2',
       'residence_since_3', 'residence_since_4', 'existing_cr

In [4]:
# Save processed data
processed_dir = "../data/processed"

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists (e.g. on a fresh checkout) before writing.
os.makedirs(processed_dir, exist_ok=True)

# index=False: the frames carry a default integer index that is not worth
# persisting.
df_train.to_csv(f"{processed_dir}/train_processed.csv", index=False)
df_test.to_csv(f"{processed_dir}/test_processed.csv", index=False)