In [14]:
import pandas as pd
import numpy as np

X = pd.read_csv("X_train_pca.csv")
credit = pd.read_csv("credit_record.csv")

# 產生分類標籤
credit["STATUS"] = credit["STATUS"].astype(str)
latest_status = credit.sort_values("MONTHS_BALANCE", ascending=False).groupby("ID")["STATUS"].first().reset_index()

X = X.drop_duplicates("ID", keep="first")
df = X.merge(latest_status, on="ID", how="left").dropna(subset=["STATUS"])
label_map = {'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, 'C': 6, 'X': 7}
X_data = df.drop(columns=["ID", "STATUS"]).values
y_data = df["STATUS"].map(label_map).values


In [15]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

kf = KFold(n_splits=5, shuffle=True, random_state=42)
meta_features = []
meta_labels = []

for train_idx, val_idx in kf.split(X_data):
    X_train, X_val = X_data[train_idx], X_data[val_idx]
    y_train, y_val = y_data[train_idx], y_data[val_idx]

    fold_preds = []
    for model in [
        RandomForestClassifier(n_estimators=100),
        XGBClassifier(n_estimators=100, eval_metric='mlogloss'),
        LGBMClassifier(n_estimators=100)
    ]:
        model.fit(X_train, y_train)
        pred = model.predict_proba(X_val)
        fold_preds.append(pred)

    meta_input = np.hstack(fold_preds)
    meta_features.append(meta_input)
    meta_labels.append(y_val)

X_meta = np.vstack(meta_features)
y_meta = np.hstack(meta_labels)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000803 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 23404, number of used features: 15
[LightGBM] [Info] Start training from score -1.461152
[LightGBM] [Info] Start training from score -4.895876
[LightGBM] [Info] Start training from score -7.495713
[LightGBM] [Info] Start training from score -8.674368
[LightGBM] [Info] Start training from score -8.451224
[LightGBM] [Info] Start training from score -6.210515
[LightGBM] [Info] Start training from score -0.711343
[LightGBM] [Info] Start training from score -1.321927
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000741 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 23404, number of used fea

In [16]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

y_meta_cat = to_categorical(y_meta, num_classes=8)
X_train, X_val, y_train, y_val = train_test_split(X_meta, y_meta_cat, test_size=0.2, random_state=42)

model = Sequential([
    Dense(128, activation='relu', input_shape=(X_meta.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(8, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, validation_data=(X_val, y_val),
          epochs=50, batch_size=64, callbacks=[EarlyStopping(monitor="val_loss", patience=5)])


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5231 - loss: 1.2499 - val_accuracy: 0.5768 - val_loss: 0.9934
Epoch 2/50
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5704 - loss: 0.9985 - val_accuracy: 0.5768 - val_loss: 0.9956
Epoch 3/50
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5694 - loss: 1.0017 - val_accuracy: 0.5753 - val_loss: 0.9935
Epoch 4/50
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5794 - loss: 0.9933 - val_accuracy: 0.5751 - val_loss: 0.9932
Epoch 5/50
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5808 - loss: 0.9854 - val_accuracy: 0.5806 - val_loss: 0.9929
Epoch 6/50
[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5729 - loss: 0.9953 - val_accuracy: 0.5811 - val_loss: 0.9936
Epoch 7/50
[1m366/366[0m [32m━━━━━━━

<keras.src.callbacks.history.History at 0x1999fc10b90>

In [None]:
from sklearn.metrics import classification_report, f1_score, roc_auc_score

y_pred = model.predict(X_val)
y_pred_label = np.argmax(y_pred, axis=1)
y_true_label = np.argmax(y_val, axis=1)

print(classification_report(y_true_label, y_pred_label))
print("F1 Score (macro):", f1_score(y_true_label, y_pred_label, average='macro'))
print("AUC (multi-class):", roc_auc_score(y_val, y_pred, multi_class='ovr'))


[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 803us/step
              precision    recall  f1-score   support

           0       0.47      0.18      0.27      1349
           1       0.00      0.00      0.00        44
           2       0.00      0.00      0.00         5
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         2
           5       0.17      0.06      0.09        16
           6       0.59      0.85      0.70      2891
           7       0.58      0.44      0.50      1543

    accuracy                           0.58      5851
   macro avg       0.23      0.19      0.19      5851
weighted avg       0.56      0.58      0.54      5851

F1 Score (macro): 0.19410324738240797
AUC (multi-class): 0.7203299197772801


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


: 