In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import optuna
import torch.nn as nn
import torch.nn.functional as F
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from pytorch_tabnet.tab_model import TabNetClassifier
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Data Loading

In [88]:
df_train = pd.read_csv('..\\Dataset\\raw\\train.csv')
df_test = pd.read_csv('..\\Dataset\\raw\\test.csv')

# EDA

In [89]:
df_train.head(10)

Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
0,0,yes,adult,530001,38.1,132.0,24.0,cool,reduced,dark_cyanotic,...,57.0,8.5,serosanguious,3.4,yes,2209,0,0,no,died
1,1,yes,adult,533836,37.5,88.0,12.0,cool,normal,pale_cyanotic,...,33.0,64.0,serosanguious,2.0,yes,2208,0,0,no,euthanized
2,2,yes,adult,529812,38.3,120.0,28.0,cool,reduced,pale_pink,...,37.0,6.4,serosanguious,3.4,yes,5124,0,0,no,lived
3,3,yes,adult,5262541,37.1,72.0,30.0,cold,reduced,pale_pink,...,53.0,7.0,cloudy,3.9,yes,2208,0,0,yes,lived
4,4,no,adult,5299629,38.0,52.0,48.0,normal,normal,normal_pink,...,47.0,7.3,cloudy,2.6,no,0,0,0,yes,lived
5,5,no,adult,529642,38.1,56.0,32.0,normal,normal,bright_pink,...,49.0,8.0,cloudy,2.8,no,0,0,0,yes,lived
6,6,yes,adult,534787,38.3,36.0,16.0,cool,reduced,normal_pink,...,43.0,75.0,cloudy,1.0,no,3111,0,0,yes,euthanized
7,7,no,adult,529461,39.2,114.0,24.0,cool,reduced,pale_cyanotic,...,57.0,7.6,serosanguious,4.5,yes,2207,0,0,yes,died
8,8,no,adult,528742,37.4,48.0,12.0,cool,reduced,normal_pink,...,40.0,7.8,cloudy,2.6,no,0,0,0,yes,lived
9,9,yes,adult,529640,38.3,129.0,48.0,cool,reduced,pale_pink,...,57.0,4.9,cloudy,2.9,yes,3209,0,0,yes,died


In [90]:
df_train.shape

(1235, 29)

In [91]:
df_train.isna().sum()

id                         0
surgery                    0
age                        0
hospital_number            0
rectal_temp                0
pulse                      0
respiratory_rate           0
temp_of_extremities       39
peripheral_pulse          60
mucous_membrane           21
capillary_refill_time      6
pain                      44
peristalsis               20
abdominal_distention      23
nasogastric_tube          80
nasogastric_reflux        21
nasogastric_reflux_ph      0
rectal_exam_feces        190
abdomen                  213
packed_cell_volume         0
total_protein              0
abdomo_appearance         48
abdomo_protein             0
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
outcome                    0
dtype: int64

In [92]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1235 entries, 0 to 1234
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     1235 non-null   int64  
 1   surgery                1235 non-null   object 
 2   age                    1235 non-null   object 
 3   hospital_number        1235 non-null   int64  
 4   rectal_temp            1235 non-null   float64
 5   pulse                  1235 non-null   float64
 6   respiratory_rate       1235 non-null   float64
 7   temp_of_extremities    1196 non-null   object 
 8   peripheral_pulse       1175 non-null   object 
 9   mucous_membrane        1214 non-null   object 
 10  capillary_refill_time  1229 non-null   object 
 11  pain                   1191 non-null   object 
 12  peristalsis            1215 non-null   object 
 13  abdominal_distention   1212 non-null   object 
 14  nasogastric_tube       1155 non-null   object 
 15  naso

In [93]:
df_train['outcome'].unique()

array(['died', 'euthanized', 'lived'], dtype=object)

# Preprocessing

In [94]:
# Determine the Categorical Columns
cat_cols_dropped = ['age',
 'mucous_membrane',
 'capillary_refill_time',
 'peristalsis',
 'nasogastric_tube',
 'nasogastric_reflux',
 'rectal_exam_feces',
 'abdomen',
 'abdomo_appearance',
 'surgical_lesion']

In [95]:
# Determine the Numerical Columns
num_cols_dropped = [
 'hospital_number',
 'rectal_temp',
 'pulse',
 'respiratory_rate',
 'nasogastric_reflux_ph',
 'packed_cell_volume',
 'total_protein',
 'abdomo_protein',
 'lesion_1',
 'lesion_2']

In [96]:
def prepo(df):
    df_processed = df.copy()

    # Define Scaler
    numeric_transformer = StandardScaler()

    # Define pipeline
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(sparse_output=False))
    ])

    # Combine both encoder & scaler with ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_cols_dropped),
            ('cat', categorical_transformer, cat_cols_dropped)
        ])
    
    # Map the label
    label_map = {
        'died' : 0, 
        'euthanized' : 1, 
        'lived' : 2
    }
    df_processed['outcome'] = df_processed['outcome'].map(label_map)
    
    # Split data into features & label
    X = df_processed.drop(columns='outcome')
    y = df_processed['outcome']

    # Transform the feature data
    X = preprocessor.fit_transform(X)

    # split data into train & test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # change to array
    X_train = X_train.to_numpy() if isinstance(X_train, pd.DataFrame) else X_train
    X_test = X_test.to_numpy() if isinstance(X_test, pd.DataFrame) else X_test
    y_train = y_train.to_numpy() if isinstance(y_train, pd.Series) else y_train
    y_test = y_test.to_numpy() if isinstance(y_test, pd.Series) else y_test

    return X_train, X_test, y_train, y_test


In [97]:
X_train, X_test, y_train, y_test = prepo(df_train)

In [98]:
X_train.shape

(988, 48)

In [99]:
# Determine the input and output dimensions
X_dim = X_train.shape[1]
y_dim = df_train['outcome'].nunique()

# Modelling

In [100]:
# Dataframe for evaluation result each model
eval_columns = ['Model', 'Accuracy', 'F1']
eval_results = pd.DataFrame(columns=eval_columns)
eval_results

Unnamed: 0,Model,Accuracy,F1


## Deep Learning

### NODE : Neural Oblivious Decision Ensembles

In [101]:
class DeepObliviousDecisionTreeLayer(nn.Module):
    def __init__(self, input_dim, num_trees, tree_depth, hidden_dim):
        super(DeepObliviousDecisionTreeLayer, self).__init__()
        self.num_trees = num_trees
        self.tree_depth = tree_depth
        self.hidden_dim = hidden_dim
        
        # Define a deeper architecture with hidden layers
        self.hidden_layers = nn.ModuleList()
        for _ in range(tree_depth):
            self.hidden_layers.append(nn.Linear(input_dim, hidden_dim))
            input_dim = hidden_dim 
        
        self.output_layer = nn.Linear(hidden_dim, num_trees)

    def forward(self, x):
        for layer in self.hidden_layers:
            x = F.relu(layer(x)) 
        
        # Apply the output layer
        out = torch.sigmoid(self.output_layer(x))
        return out

class NODE(nn.Module):
    def __init__(self, input_dim, output_dim, num_trees, tree_depth, hidden_dim):
        super(NODE, self).__init__()
        self.tree_layer = DeepObliviousDecisionTreeLayer(input_dim, num_trees=num_trees, tree_depth=tree_depth, hidden_dim=hidden_dim)
        
        # Additional fully connected layers to make it deeper
        self.fc1 = nn.Linear(num_trees, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.tree_layer(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [102]:
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, num_epochs=100, learning_rate=0.001):
    # Convert data to tensors
    X_train, y_train = torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long)
    X_test, y_test = torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long)
    
    # Define optimizer and loss function
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    def eval_model(model, X, y):
        model.eval()
        with torch.no_grad():
            outputs = model(X)
            y_prob = F.softmax(outputs, dim=1)
        
        y_true = y.numpy()
        y_pred = torch.argmax(y_prob, dim=1).numpy()
        y_prob = y_prob.numpy()

        # Compute accuracy and F1 score
        accuracy = accuracy_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred, average='weighted')

        # Compute AUC-ROC score for test data (binary classification)
        if y_prob.shape[1] == 2:
            y_prob_positive = y_prob[:, 1]
            roc_auc = roc_auc_score(y_true, y_prob_positive)
        else:
            roc_auc = 'N/A'

        return accuracy, f1, roc_auc

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Print information every 10 epochs
        if (epoch + 1) % 10 == 0 or (epoch + 1) == num_epochs:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    # Final evaluation at the end of training
    print('Final Evaluation:')
    final_train_accuracy, final_train_f1, final_train_roc_auc = eval_model(model, X_train, y_train)
    final_test_accuracy, final_test_f1, final_test_roc_auc = eval_model(model, X_test, y_test)

    print(f'Final Train Accuracy: {final_train_accuracy:.4f}')
    print(f'Final Train F1 Score: {final_train_f1:.4f}')
    print(f'Final Test Accuracy: {final_test_accuracy:.4f}')
    print(f'Final Test F1 Score: {final_test_f1:.4f}')
    
    return final_test_accuracy, final_test_f1, final_test_roc_auc

In [103]:
input_dim = X_dim
output_dim = y_dim
NODE_model = NODE(input_dim, output_dim, num_trees=10, tree_depth=6, hidden_dim=128)

In [104]:
X_train_t = torch.tensor(X_train, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.long)
X_test_t = torch.tensor(X_test, dtype=torch.float32)
y_test_t = torch.tensor(y_test, dtype=torch.long)

In [105]:
accuracy, f1, roc_auc = train_and_evaluate_model(NODE_model, X_train_t, X_test_t, y_train_t, y_test_t)

  X_train, y_train = torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long)
  X_test, y_test = torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long)


Epoch [10/100], Loss: 1.0499
Epoch [20/100], Loss: 1.0372
Epoch [30/100], Loss: 0.9632
Epoch [40/100], Loss: 0.8932
Epoch [50/100], Loss: 0.8790
Epoch [60/100], Loss: 0.8494
Epoch [70/100], Loss: 0.8370
Epoch [80/100], Loss: 0.8206
Epoch [90/100], Loss: 0.8096
Epoch [100/100], Loss: 0.7967
Final Evaluation:
Final Train Accuracy: 0.6417
Final Train F1 Score: 0.5825
Final Test Accuracy: 0.5466
Final Test F1 Score: 0.4930


In [106]:
result = pd.DataFrame([{'Model':'NODE Classifier', 'Accuracy':accuracy, 'F1':f1}])
eval_results = pd.concat([result, eval_results], ignore_index=True)

  eval_results = pd.concat([result, eval_results], ignore_index=True)


### Tab Transformer

In [107]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim=embed_size, num_heads=heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion),
            nn.ReLU(),
            nn.Linear(forward_expansion, embed_size)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x is expected to have shape [batch_size, sequence_length, embed_size]
        attention = self.attention(x, x, x)[0]
        x = self.dropout(self.norm1(attention + x))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out

class TabularTransformer(nn.Module):
    def __init__(self, input_dim, output_dim, embed_size, num_heads, forward_expansion, dropout):
        super(TabularTransformer, self).__init__()
        self.embedding = nn.Linear(input_dim, embed_size)
        self.transformer_block = TransformerBlock(
            embed_size=embed_size,
            heads=num_heads,
            dropout=dropout,
            forward_expansion=forward_expansion
        )
        self.fc = nn.Linear(embed_size, output_dim)

    def forward(self, x):
        # Add sequence dimension
        x = x.unsqueeze(1)
        x = self.embedding(x)
        x = self.transformer_block(x)
        x = x.squeeze(1)
        out = self.fc(x)
        return out

In [108]:
# Define Tab-Transfomer Model
TabTR_model = TabularTransformer(
    input_dim=input_dim,
    output_dim=output_dim,
    embed_size=32,
    num_heads=4,
    forward_expansion=128,
    dropout=0.1
)

In [109]:
accuracy, f1, roc_auc = train_and_evaluate_model(TabTR_model, X_train_t, X_test_t, y_train_t, y_test_t)

  X_train, y_train = torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long)
  X_test, y_test = torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.long)


Epoch [10/100], Loss: 0.9446
Epoch [20/100], Loss: 0.8427
Epoch [30/100], Loss: 0.8006
Epoch [40/100], Loss: 0.7740
Epoch [50/100], Loss: 0.7462
Epoch [60/100], Loss: 0.7383
Epoch [70/100], Loss: 0.7304
Epoch [80/100], Loss: 0.7154
Epoch [90/100], Loss: 0.6946
Epoch [100/100], Loss: 0.6703
Final Evaluation:
Final Train Accuracy: 0.7298
Final Train F1 Score: 0.7283
Final Test Accuracy: 0.6478
Final Test F1 Score: 0.6448


In [110]:
result = pd.DataFrame([{'Model':'Tab-Transformer', 'Accuracy':accuracy, 'F1':f1}])
eval_results = pd.concat([result, eval_results], ignore_index=True)

### TabNet

In [111]:
# Define TabNet Model
TabNet_model = TabNetClassifier(optimizer_fn=torch.optim.Adam,
                         optimizer_params=dict(lr=2e-2),
                         scheduler_fn=torch.optim.lr_scheduler.StepLR,
                         scheduler_params=dict(step_size=10, gamma=0.9),
                         mask_type='sparsemax')

# Train TabNet_model
TabNet_model.fit(X_train=X_train, 
          y_train=y_train, 
          eval_set=[(X_test, y_test)],
          eval_metric=['accuracy'],
          max_epochs=100, 
          patience=10,
          batch_size=256,
          virtual_batch_size=128,
          num_workers=0,
          drop_last=False)

# Predict
y_pred = TabNet_model.predict(X_test)

# Eval
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1: {f1:.4f}")
y_proba = TabNet_model.predict_proba(X_test)[:,1]



epoch 0  | loss: 1.65833 | val_0_accuracy: 0.46559 |  0:00:00s
epoch 1  | loss: 1.23552 | val_0_accuracy: 0.42105 |  0:00:00s
epoch 2  | loss: 1.08877 | val_0_accuracy: 0.40891 |  0:00:00s
epoch 3  | loss: 0.99626 | val_0_accuracy: 0.40891 |  0:00:00s
epoch 4  | loss: 0.93903 | val_0_accuracy: 0.4251  |  0:00:00s
epoch 5  | loss: 0.92592 | val_0_accuracy: 0.46559 |  0:00:00s
epoch 6  | loss: 0.90256 | val_0_accuracy: 0.46964 |  0:00:01s
epoch 7  | loss: 0.87455 | val_0_accuracy: 0.4413  |  0:00:01s
epoch 8  | loss: 0.85767 | val_0_accuracy: 0.57895 |  0:00:01s
epoch 9  | loss: 0.84106 | val_0_accuracy: 0.5749  |  0:00:01s
epoch 10 | loss: 0.86472 | val_0_accuracy: 0.59109 |  0:00:01s
epoch 11 | loss: 0.84272 | val_0_accuracy: 0.58704 |  0:00:01s
epoch 12 | loss: 0.83224 | val_0_accuracy: 0.62753 |  0:00:01s
epoch 13 | loss: 0.82462 | val_0_accuracy: 0.61538 |  0:00:02s
epoch 14 | loss: 0.82177 | val_0_accuracy: 0.583   |  0:00:02s
epoch 15 | loss: 0.82056 | val_0_accuracy: 0.59514 |  0



In [112]:
result = pd.DataFrame([{'Model':'TabNet', 'Accuracy':accuracy, 'F1':f1}])
eval_results = pd.concat([result, eval_results], ignore_index=True)

## Machine Learning

### Random Forest

In [113]:
# Define Random Forest Model
rf_model = RandomForestClassifier(random_state=42)
# Train
rf_model.fit(X_train, y_train)
# Predict
y_pred = rf_model.predict(X_test)
# Eval
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1: {f1:.4f}")

Accuracy: 0.6923
F1: 0.6918


In [114]:
result = pd.DataFrame([{'Model':'Random Forest', 'Accuracy':accuracy, 'F1':f1}])
eval_results = pd.concat([result, eval_results], ignore_index=True)

### XGBoost

In [115]:
# Define XGBoost Model
xgb_model = XGBClassifier(random_state=42)
# Train
xgb_model.fit(X_train, y_train)
# Predict
y_pred = xgb_model.predict(X_test)
# Eval
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1: {f1:.4f}")

Accuracy: 0.6761
F1: 0.6751


In [116]:
result = pd.DataFrame([{'Model':'XGBoost', 'Accuracy':accuracy, 'F1':f1}])
eval_results = pd.concat([result, eval_results], ignore_index=True)

### LightGBM

In [117]:
# Define LightGBM Model
lgbm_model = LGBMClassifier(random_state=42)
# Train
lgbm_model.fit(X_train, y_train)
# Predict
y_pred = lgbm_model.predict(X_test)
# Eval
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1: {f1:.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000134 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 547
[LightGBM] [Info] Number of data points in the train set: 988, number of used features: 43
[LightGBM] [Info] Start training from score -1.072637
[LightGBM] [Info] Start training from score -1.612479
[LightGBM] [Info] Start training from score -0.779791
Accuracy: 0.6964
F1: 0.6971


In [118]:
result = pd.DataFrame([{'Model':'LightGBM', 'Accuracy':accuracy, 'F1':f1}])
eval_results = pd.concat([result, eval_results], ignore_index=True)

### CatBoost

In [119]:
# Define CatBoost Model
cb_model = CatBoostClassifier(random_state=42)
# Train
cb_model.fit(X_train, y_train)
# Predict
y_pred = cb_model.predict(X_test)
# Eval
y_proba = cb_model.predict_proba(X_test)[:, 1]
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1: {f1:.4f}")

Learning rate set to 0.079078
0:	learn: 1.0614430	total: 3.16ms	remaining: 3.15s
1:	learn: 1.0330034	total: 5.83ms	remaining: 2.91s
2:	learn: 1.0023045	total: 7.78ms	remaining: 2.59s
3:	learn: 0.9769114	total: 10ms	remaining: 2.5s
4:	learn: 0.9517173	total: 12.1ms	remaining: 2.4s
5:	learn: 0.9325015	total: 14.3ms	remaining: 2.38s
6:	learn: 0.9133757	total: 17ms	remaining: 2.4s
7:	learn: 0.8957767	total: 19.6ms	remaining: 2.43s
8:	learn: 0.8790288	total: 22.3ms	remaining: 2.45s
9:	learn: 0.8645038	total: 25ms	remaining: 2.47s
10:	learn: 0.8506626	total: 27.3ms	remaining: 2.46s
11:	learn: 0.8375293	total: 29.5ms	remaining: 2.43s
12:	learn: 0.8246262	total: 32.1ms	remaining: 2.43s
13:	learn: 0.8127419	total: 34.5ms	remaining: 2.43s
14:	learn: 0.8019380	total: 37.2ms	remaining: 2.44s
15:	learn: 0.7923261	total: 40.2ms	remaining: 2.47s
16:	learn: 0.7835030	total: 42.7ms	remaining: 2.47s
17:	learn: 0.7754123	total: 45.2ms	remaining: 2.46s
18:	learn: 0.7677639	total: 47.2ms	remaining: 2.44s
1

In [120]:
result = pd.DataFrame([{'Model':'CatBoost', 'Accuracy':accuracy, 'F1':f1}])
eval_results = pd.concat([result, eval_results], ignore_index=True)

# Evaluation

In [121]:
eval_results

Unnamed: 0,Model,Accuracy,F1
0,CatBoost,0.708502,0.708502
1,LightGBM,0.696356,0.697107
2,XGBoost,0.676113,0.67506
3,Random Forest,0.692308,0.691831
4,TabNet,0.62753,0.629638
5,Tab-Transformer,0.647773,0.644751
6,NODE Classifier,0.546559,0.493007
