In [71]:
import numpy as np
from tqdm import tqdm

import torch

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import pandas as pd
import matplotlib.pyplot as plt

from torch.utils.data import Dataset, DataLoader
from torchkeras.tabular import TabularPreprocessor, TabularDataset
from torchkeras.tabular.models import FTTransformerConfig, FTTransformerModel

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [72]:
np.random.seed(42)
train_df = pd.read_csv("data/train_df.csv")

In [73]:
train_df.head()

Unnamed: 0,age,gender,primary_diagnosis,num_procedures,days_in_hospital,comorbidity_score,discharge_to,readmitted
0,69,Male,Heart Disease,1,2,1,Home Health Care,0
1,32,Female,COPD,2,13,2,Rehabilitation Facility,0
2,89,Male,Diabetes,1,7,1,Home,0
3,78,Male,COPD,9,2,2,Skilled Nursing Facility,0
4,38,Male,Diabetes,6,4,4,Rehabilitation Facility,0


In [74]:
# ----- 數值型與類別型特徵 -----
numerical_features = ['age']
categorical_features = ['gender', 'primary_diagnosis', 'discharge_to', 'num_procedures', 'days_in_hospital', 'comorbidity_score']
target = 'readmitted'

In [75]:
df = train_df.copy()

# 前處理

In [76]:
# 數值標準化
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# 將類別特徵轉為 category 類型
for col in categorical_features:
    df[col] = df[col].astype('category')

df[target] = df[target].astype('category')

# 轉成 numpy
X_num = df[numerical_features].values.astype(np.float32)
X_cat = df[categorical_features].values
y = df[target].values.astype(np.int64)

# 分割訓練測試資料
X_num_train, X_num_test, X_cat_train, X_cat_test, y_train, y_test = train_test_split(
    X_num, X_cat, y, test_size=0.2, random_state=42
)

In [77]:
# 1. 將數據轉為 DataFrame 格式，因為 TabularPreprocessor 通常需要 DataFrame
numerical_columns = [f'num_{i}' for i in range(len(numerical_features))]
categorical_columns = [f'cat_{i}' for i in range(len(categorical_features))]

train_df_train_csv = pd.DataFrame({
    **{f'num_{i}': X_num_train[:, i] for i in range(X_num_train.shape[1])},
    **{f'cat_{i}': X_cat_train[:, i] for i in range(X_cat_train.shape[1])},
    'target': y_train
})

test_df_train_csv = pd.DataFrame({
    **{f'num_{i}': X_num_test[:, i] for i in range(X_num_test.shape[1])},
    **{f'cat_{i}': X_cat_test[:, i] for i in range(X_cat_test.shape[1])},
    'target': y_test
})

In [78]:
train_df_train_csv.head()

Unnamed: 0,num_0,cat_0,cat_1,cat_2,cat_3,cat_4,cat_5,target
0,0.954285,Male,Hypertension,Home Health Care,7,10,1,0
1,-0.014483,Female,COPD,Home Health Care,4,12,2,1
2,-0.062921,Female,Heart Disease,Home,2,1,2,1
3,-0.208237,Male,Heart Disease,Home,2,6,1,0
4,-0.644182,Male,COPD,Home Health Care,9,6,3,0


In [79]:
train_target_df = train_df_train_csv['target']
train_df_train_csv = train_df_train_csv.drop(columns=['target'])
test_target_df = test_df_train_csv['target']
test_df_train_csv = test_df_train_csv.drop(columns=['target'])

# 使用 TabularPreprocessor 進行預處理
preprocessor = TabularPreprocessor(
    cat_features=categorical_columns,  # 類別特徵
    numeric_features=numerical_columns,    # 數值特徵
    normalization='standard',           # 數值特徵標準化（可選：'minmax' 或 None）
    onehot_max_cat_num=1,
)

preprocessor.fit(train_df_train_csv)

100%|██████████| 6/6 [00:00<?, ?it/s]


In [80]:
train_df_train_csv

Unnamed: 0,num_0,cat_0,cat_1,cat_2,cat_3,cat_4,cat_5
0,0.954285,Male,Hypertension,Home Health Care,7,10,1
1,-0.014483,Female,COPD,Home Health Care,4,12,2
2,-0.062921,Female,Heart Disease,Home,2,1,2
3,-0.208237,Male,Heart Disease,Home,2,6,1
4,-0.644182,Male,COPD,Home Health Care,9,6,3
...,...,...,...,...,...,...,...
3995,0.324586,Male,Diabetes,Skilled Nursing Facility,8,13,3
3996,-0.983251,Female,COPD,Rehabilitation Facility,3,7,0
3997,0.518339,Female,Hypertension,Home Health Care,3,4,0
3998,-1.564511,Female,COPD,Home Health Care,4,5,4


In [81]:
train_df_train_csv_processed = preprocessor.transform(train_df_train_csv)
test_df_train_csv_processed = preprocessor.transform(test_df_train_csv)



In [82]:
train_df_train_csv_processed = train_df_train_csv_processed.loc[:, ~train_df_train_csv_processed.columns.duplicated(keep='last')]
test_df_train_csv_processed = test_df_train_csv_processed.loc[:, ~test_df_train_csv_processed.columns.duplicated(keep='last')]

numerical_features_processed = preprocessor.get_numeric_features()
embedding_features_processed = preprocessor.get_embedding_features()


In [83]:
train_df_train_csv_processed['target'] = train_target_df
test_df_train_csv_processed['target'] = test_target_df

# === 3. 建立 TabularDataset ===
ds_train = TabularDataset(
    data=train_df_train_csv_processed,
    task='classification',
    target=['target'],
    continuous_cols=numerical_features_processed,
    categorical_cols=embedding_features_processed
)

ds_test = TabularDataset(
    data=test_df_train_csv_processed,
    task='classification',
    target=['target'],
    continuous_cols=numerical_features_processed,
    categorical_cols=embedding_features_processed
)

dl_train = DataLoader(ds_train, batch_size=128, shuffle=False, num_workers=0, pin_memory=False)

dl_test = DataLoader(ds_test, batch_size=128, shuffle=False, num_workers=0, pin_memory=False)

In [84]:
print("categorical features (cols):", embedding_features_processed)
print("categorical features (array):", train_df_train_csv_processed[embedding_features_processed].shape[1])

print("numerical features (cols):", numerical_features_processed)
print("numerical features (array):", train_df_train_csv_processed[numerical_features_processed].shape[1])

categorical features (cols): ['cat_0', 'cat_1', 'cat_2', 'cat_3', 'cat_4', 'cat_5']
categorical features (array): 6
numerical features (cols): ['num_0']
numerical features (array): 1


# 建立模型

In [85]:
model_config = FTTransformerConfig(
    # ModelConfig 參數
    task="classification",  # 二元分類
    num_attn_blocks=3,
)

config = model_config.merge_dataset_config(ds_train)

In [None]:
# 初始化模型
model = FTTransformerModel(config)

# 訓練模型

In [None]:
# 設置優化器和損失函數
device = 'cuda' if torch.cuda.is_available() else 'cpu'
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()

In [None]:
# 訓練模型
def train_model(model: FTTransformerModel, train_loader: DataLoader, criterion: torch.nn.CrossEntropyLoss, optimizer: torch.optim.Adam, num_epochs:int=20, device:str=device):
    model.to(device)
    model.train()

    progress_bar = tqdm(range(num_epochs), leave=False)
    for epoch in progress_bar:
        total_loss = 0
        for batch in train_loader:
            numerical = batch['continuous'].to(device)
            categorical = batch['categorical'].to(device)
            target = batch['target'].to(device).squeeze()
            optimizer.zero_grad()
            outputs = model({'continuous': numerical, 'categorical': categorical})
            logits = outputs['logits']
            loss = criterion(logits, target)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        progress_bar.set_description(f"Epoch {epoch+1}/{num_epochs} | Avg. Loss: {total_loss/len(train_loader):.4f}")
    
    # 儲存模型（建議用 .pt 或 .pth）
    torch.save(model.state_dict(), 'ft_transformer_model.pth')


# 評估模型
def evaluate_model(model, test_loader, device=device):
    model.eval()
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in test_loader:
            numerical = batch['continuous'].to(device)
            categorical = batch['categorical'].to(device)
            target = batch['target'].to(device).squeeze()
            outputs = model({'continuous': numerical, 'categorical': categorical})
            _, preds = torch.max(outputs['logits'], dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(target.cpu().numpy())
    accuracy = accuracy_score(true_labels, predictions)
    print(f'Test Accuracy: {accuracy:.4f}')
    print('\nClassification Report:')
    print(classification_report(true_labels, predictions))

# 執行訓練和評估
train_model(model, dl_train, criterion, optimizer, num_epochs=100)
evaluate_model(model, dl_test)

                                                                                    

Test Accuracy: 0.7020

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.81      0.82       826
           1       0.16      0.17      0.17       174

    accuracy                           0.70      1000
   macro avg       0.49      0.49      0.49      1000
weighted avg       0.71      0.70      0.71      1000





# 測試集

In [89]:
test_df_test_csv = pd.read_csv("data/test_df.csv")

# 將類別特徵轉為 category 類型
for col in categorical_features:
    test_df_test_csv[col] = test_df_test_csv[col].astype('category')

In [90]:
X_num_test_csv = test_df_test_csv[numerical_features].values.astype(np.float32)
X_cat_test_csv = test_df_test_csv[categorical_features].values

In [91]:
test_df_test_csv = pd.DataFrame({
    **{f'num_{i}': X_num_test_csv[:, i] for i in range(X_num_test_csv.shape[1])},
    **{f'cat_{i}': X_cat_test_csv[:, i] for i in range(X_cat_test_csv.shape[1])},
})

In [92]:
len(test_df_test_csv)

2000

In [94]:
test_df_test_csv.head()

Unnamed: 0,num_0,cat_0,cat_1,cat_2,cat_3,cat_4,cat_5
0,52.0,Male,Heart Disease,Home,3,9,3
1,47.0,Female,Diabetes,Skilled Nursing Facility,2,4,0
2,72.0,Female,Heart Disease,Home,7,12,4
3,18.0,Female,COPD,Home,5,14,3
4,32.0,Male,Heart Disease,Rehabilitation Facility,9,2,4


In [95]:
test_df_test_csv_processed = preprocessor.transform(test_df_test_csv)
test_df_test_csv_processed = test_df_test_csv_processed.loc[:, ~test_df_test_csv_processed.columns.duplicated(keep='last')]

ds_test_test_csv = TabularDataset(
    data=test_df_test_csv_processed,
    task='classification',
    continuous_cols=numerical_features_processed,
    categorical_cols=embedding_features_processed
)

dl_test_test_csv = DataLoader(ds_test_test_csv, batch_size=128, shuffle=False, num_workers=0, pin_memory=False)



In [97]:
print("categorical features (cols):", embedding_features_processed)
print("categorical features (array):", test_df_test_csv_processed[embedding_features_processed].shape[1])

print("numerical features (cols):", numerical_features_processed)
print("numerical features (array):", test_df_test_csv_processed[numerical_features_processed].shape[1])

categorical features (cols): ['cat_0', 'cat_1', 'cat_2', 'cat_3', 'cat_4', 'cat_5']
categorical features (array): 6
numerical features (cols): ['num_0']
numerical features (array): 1


In [98]:
# 12. 設置優化器和損失函數
device = 'cuda' if torch.cuda.is_available() else 'cpu'
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()

# 初始化模型
model = FTTransformerModel(config=config)
model.load_state_dict(torch.load("ft_transformer_model.pth"))
model.to(device)

predictions = []
true_labels = []
# evaluation mode
model.eval()
with torch.no_grad():
    for batch in dl_test_test_csv:
        numerical = batch['continuous'].to(device)
        categorical = batch['categorical'].to(device)
        outputs = model({'continuous': numerical, 'categorical': categorical})
        _, preds = torch.max(outputs['logits'], dim=1)
        predictions.extend(preds.cpu().numpy())

submission_df = pd.DataFrame({｀
    "Patient_ID": range(1, len(predictions)+1),
    "readmitted": predictions
})

submission_df.to_csv(path_or_buf='data/submission_df.csv', index=False)


  model.load_state_dict(torch.load("ft_transformer_model.pth"))
