In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Statements and Functions

In [None]:
import os 
import matplotlib.pyplot as plt

from collections.abc import Sequence

import tensorflow as tf

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

import pandas as pd
import numpy as np

import keras
from keras.models import Sequential
from keras.optimizers import Adam, SGD
from keras.layers import Dense, Activation, Dropout, Conv2D,MaxPooling2D, Flatten
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras import layers
from keras import ops

import seaborn as sns

import csv

import math

from functools import partial

import cv2
import glob
from IPython.display import Image
from sklearn.metrics import confusion_matrix, classification_report

%matplotlib inline
# Display the GPU device name TensorFlow reports for this session
# (useful to confirm that an accelerator is attached before training).
tf.test.gpu_device_name()

In [None]:
def encode_text_dummy(df, name):
    """One-hot encode column `name` of `df` in place.

    One indicator column named "<name>-<value>" is appended for every distinct
    value of the column, and the original column is then removed.
    Returns None; `df` itself is modified.
    """
    indicator_frame = pd.get_dummies(df[name])
    for category, indicator in indicator_frame.items():
        df["{}-{}".format(name, category)] = indicator
    df.drop(columns=[name], inplace=True)
    
def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Replace column `name` of `df` with its z-score, in place.

    `mean` and `sd` default to the column's own mean and standard deviation
    (pandas `std`); pass them explicitly to reuse statistics computed elsewhere.
    Returns None.
    """
    column = df[name]
    center = column.mean() if mean is None else mean
    spread = column.std() if sd is None else sd
    df[name] = (column - center) / spread

def missing_median(df, name):
    """Fill the missing values of column `name` with that column's median, in place."""
    df[name] = df[name].fillna(df[name].median())
    
def to_xy(df, target):
    """Split `df` into a feature matrix and a target array, both float32.

    Args:
        df: DataFrame holding the features and the target column.
        target: name of the target column; every other column is a feature.

    Returns:
        (x, y). `x` is the feature matrix. When the target column has an
        integer dtype, `y` is the one-hot encoding of the target
        (classification); otherwise `y` is the target values themselves
        (regression).
    """
    feature_columns = [column for column in df.columns if column != target]

    # Find out the type of the target column.
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type

    features = df[feature_columns].values.astype(np.float32)

    # Any integer dtype means classification. Checking only int64/int32 sent
    # int8/int16/unsigned/nullable-integer targets down the regression path.
    if pd.api.types.is_integer_dtype(target_type):
        dummies = pd.get_dummies(df[target])
        return features, dummies.values.astype(np.float32)

    # Regression
    return features, df[target].values.astype(np.float32)
    
def remove_outliers(df, name, sd):
    """Drop, in place, the rows whose `name` value lies at least `sd` standard
    deviations away from the column mean."""
    values = df[name]
    deviation = (values - values.mean()).abs()
    outlier_mask = deviation >= sd * values.std()
    df.drop(df.index[outlier_mask], axis=0, inplace=True)

def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    """Linearly rescale column `name` of `df` in place.

    Values are mapped so that `data_low` lands on `normalized_low` and
    `data_high` lands on `normalized_high`.

    Args:
        df: DataFrame to modify.
        name: column to rescale.
        normalized_low / normalized_high: bounds of the output range.
        data_low / data_high: bounds of the input range. Each one defaults,
            independently of the other, to the column's minimum / maximum.
    """
    # Resolve the two bounds separately. Previously a caller-supplied
    # data_high was overwritten whenever data_low was omitted, and supplying
    # only data_low left data_high as None, which raised a TypeError below.
    if data_low is None:
        data_low = df[name].min()
    if data_high is None:
        data_high = df[name].max()

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low

def encode_text_index(df, name):
    """Replace column `name` of `df` with integer labels, in place.

    Returns the fitted encoder's `classes_`, i.e. the original values in the
    order of the integer codes assigned to them.
    """
    encoder = preprocessing.LabelEncoder()
    encoder.fit(df[name])
    df[name] = encoder.transform(df[name])
    return encoder.classes_

# Data Import and Analysis

In [None]:
# Load the raw dataset from the Kaggle input directory.
path = '/kaggle/input/diabetic-data/diabetic_data.csv'
data = pd.read_csv(path)

# (rows, columns) of the raw frame.
data.shape

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# Column dtypes as inferred by pd.read_csv.
data.dtypes

In [None]:
# Number of distinct values per column (used to spot high-cardinality columns).
data.nunique()

In [None]:
# True if at least one cell of the raw frame is NaN/None.
# NOTE(review): the CSV reader used later for the transformer treats "?" as
# the NA marker (na_value="?"); if the raw file encodes missing values that
# way, isnull() will not count them -- confirm.
print(data.isnull().values.any())

In [None]:
# Names of the columns that contain at least one NaN/None value.
null_columns = data.columns[data.isnull().any()]
print(null_columns)

In [None]:
# Shape of the raw frame; the inspection cells above do not modify `data`.
data.shape

# Data Preprocessing

In [None]:
# `df` is a second name for the same DataFrame object as `data`; no copy is made here.
df = data

In [None]:
# Drop the categorical columns that have a large number of unique values and
# the columns that contain null values. `drop` returns a new frame, so the
# raw `data` frame is left untouched.
columns_to_drop = [
    'encounter_id',
    'diag_1',
    'diag_2',
    'diag_3',
    'patient_nbr',
    'admission_source_id',
    'max_glu_serum',
    'A1Cresult',
]
df = df.drop(columns_to_drop, axis=1)

In [None]:
# Re-check for NaN/None values after the column drop.
print(df.isnull().values.any())

In [None]:
# Shape after the column drop.
df.shape

## Oversampling

In [None]:
import pandas as pd
from sklearn.utils import resample

# Split the frame into one subset per value of the 'readmitted' label.
readmitted_label = df['readmitted']
majority_class = df[readmitted_label == 'NO']
minority_class_30 = df[readmitted_label == '<30']
minority_class_30_plus = df[readmitted_label == '>30']

# Draw, with replacement, as many rows from each of the two other classes as
# there are rows in the 'NO' class.
# NOTE(review): this happens before the train/test split further down, so
# copies of the same source row can land on both sides of that split --
# confirm this is intended.
rows_per_class = len(majority_class)
minority_30_upsampled, minority_30_plus_upsampled = (
    resample(class_rows, replace=True, n_samples=rows_per_class, random_state=42)
    for class_rows in (minority_class_30, minority_class_30_plus)
)

# Recombine the three classes, then shuffle the rows and renumber the index.
df_upsampled = pd.concat(
    [majority_class, minority_30_upsampled, minority_30_plus_upsampled]
)
df_upsampled = df_upsampled.sample(frac=1, random_state=42)
df_upsampled = df_upsampled.reset_index(drop=True)

# Class distribution after oversampling.
print(df_upsampled['readmitted'].value_counts())

In [None]:
# Column labels of the oversampled frame.
df_upsampled.columns

In [None]:
# Keep an independent snapshot of the oversampled, still un-encoded data for
# the transformer section. The encoding loop below assigns into `df_upsampled`
# in place (z-scoring, label encoding); with a plain alias those writes would
# also change this frame whenever they run before the loop's first rebinding
# of `df_upsampled`, i.e. depending on column order.
transformer_data = df_upsampled.copy()

In [None]:
# Shape of the frame reserved for the transformer section.
transformer_data.shape

## Normalize and One-Hot Encode Data

In [None]:
zscore_cols = ['num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'time_in_hospital']

# Walk over the column labels as they are before any encoding:
#   * the target column is label-encoded to integers,
#   * the numeric columns listed above are z-scored in place,
#   * every other column is one-hot encoded, which rebinds `df_upsampled`
#     to the new frame returned by pd.get_dummies.
for col in tuple(df_upsampled.columns):
    if col == 'readmitted':
        label_encoder = LabelEncoder()
        df_upsampled['readmitted'] = label_encoder.fit_transform(df_upsampled['readmitted'])
        continue

    if col in zscore_cols:
        encode_numeric_zscore(df_upsampled, col)
        continue

    df_upsampled = pd.get_dummies(df_upsampled, columns=[col], drop_first=True)

In [None]:
# Shape after normalization and one-hot encoding.
df_upsampled.shape

In [None]:
# Feature matrix and target array as float32; to_xy one-hot encodes the
# target when its column has an integer dtype.
x, y = to_xy(df_upsampled, 'readmitted')

In [None]:
# (samples, features)
x.shape

In [None]:
# Shape of the target array returned by to_xy.
y.shape

## Train/Test Split

In [None]:
# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
# NOTE(review): the rows were resampled with replacement in the oversampling
# cell before this split, so duplicates of one source row can appear in both
# the training and the test set -- confirm this is intended.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# Feature and target shapes of each split.
print("Training Features Shape:", x_train.shape, y_train.shape)
print("Test Features Shape:", x_test.shape, y_test.shape)

# Model Training

## Fully Connected Neural Networks

In [None]:
# Where the best weights of the fully connected network are stored.
filepath = './best_weights.keras'

# Save the model only when the monitored quantity improves.
checkpointer = ModelCheckpoint(filepath=filepath, verbose=0, save_best_only=True)

# The closing parenthesis of this call was missing, which made the whole
# cell a SyntaxError.
adam_optimizer = Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-07)

# Build network
model = Sequential()
model.add(Dense(4096, input_dim=x_train.shape[1], activation='relu'))
model.add(Dense(2048, activation='relu'))
model.add(Dense(1024, activation='relu'))
model.add(Dense(512, activation='relu'))
model.add(Dense(150, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])

# Stop once val_loss has not improved by at least 1e-3 for 5 epochs.
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')

# NOTE(review): the test split is also used as validation data, so early
# stopping and checkpoint selection are driven by the same rows the final
# accuracy is reported on -- confirm this is acceptable.
model.fit(x_train, y_train, validation_data=(x_test, y_test), callbacks=[monitor, checkpointer], verbose=2, epochs=100, batch_size=100)

print('Training finished...Loading the best model')
print()
model.load_weights(filepath)  # load weights from best model

# Measure accuracy: compare the arg-max class of the predictions with the
# arg-max of the one-hot targets.
pred = model.predict(x_test)
pred = np.argmax(pred, axis=1)

y_true = np.argmax(y_test, axis=1)

score = metrics.accuracy_score(y_true, pred)

In [None]:
# Overall accuracy and per-class precision/recall/F1 of the fully connected network.
print("Final accuracy: {}".format(score))
print(classification_report(y_true, pred))

## Convolutional Neural Network

In [None]:
# Conv2D expects 4-D input, so insert a singleton axis after the sample axis
# and another after the feature axis: (samples, features) becomes
# (samples, 1, features, 1).
x_train_cnn = x_train[:, np.newaxis, :, np.newaxis]
x_test_cnn = x_test[:, np.newaxis, :, np.newaxis]

# The one-hot encoded targets are reused unchanged.
y_train_cnn, y_test_cnn = y_train, y_test

In [None]:
# Shapes of the CNN inputs and targets.
print(f"x_train_cnn shape: {x_train_cnn.shape}")
print(f"x_test_cnn shape: {x_test_cnn.shape}")
print(f"y_train_cnn shape: {y_train_cnn.shape}")
print(f"y_test_cnn shape: {y_test_cnn.shape}")

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Give this CNN its own checkpoint file and a fresh callback. Reusing the
# fully connected network's `checkpointer` would write into
# './best_weights.keras' and start from the best value that callback had
# already recorded for the other model.
filepath = './cnn_best_weights1.keras'
checkpointer = ModelCheckpoint(filepath=filepath, verbose=0, save_best_only=True)

adam_optimizer = Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-07)

# Initialize the model
cnn = Sequential()

# Layer 1: First Convolutional Layer
cnn.add(Conv2D(filters=32, kernel_size=(1, 5), strides=(1, 1), activation='relu',
               input_shape=(1, x_train_cnn.shape[2], 1)))  # Input shape matches the reshaped data
cnn.add(MaxPooling2D(pool_size=(1, 2)))  # Pooling reduces feature width by half

# Layer 2: Second Convolutional Layer
cnn.add(Conv2D(filters=64, kernel_size=(1, 7), strides=(1, 1), activation='relu'))
cnn.add(MaxPooling2D(pool_size=(1, 2)))  # Further reduce width by half

# Layer 3: Third Convolutional Layer
cnn.add(Conv2D(filters=128, kernel_size=(1, 15), strides=(1, 1), activation='relu'))
cnn.add(MaxPooling2D(pool_size=(1, 2)))  # Reduce further for higher-level features

# Flatten: Convert 2D features into 1D for dense layers
cnn.add(Flatten())

# Dense Layers: Fully connected layers
cnn.add(Dense(1024, activation='relu'))
cnn.add(Dense(512, activation='relu'))
cnn.add(Dense(256, activation='relu'))
cnn.add(Dense(128, activation='relu'))
cnn.add(Dense(3, activation='softmax'))

cnn.compile(loss='categorical_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
cnn.fit(x_train_cnn, y_train_cnn, validation_data=(x_test_cnn, y_test_cnn), callbacks=[monitor, checkpointer], verbose=2, epochs=100, batch_size=64)

print('CNN - Training finished...Loading the best model')
print()
# Restore the best checkpoint so the evaluation matches the message above;
# previously the weights of the last epoch were evaluated instead.
cnn.load_weights(filepath)

# Measure accuracy
pred_cnn = cnn.predict(x_test_cnn)
pred_cnn = np.argmax(pred_cnn, axis=1)

y_true_cnn = np.argmax(y_test_cnn, axis=1)

score_cnn = metrics.accuracy_score(y_true_cnn, pred_cnn)

In [None]:
# Overall accuracy and per-class precision/recall/F1 of the first CNN.
print("Final accuracy: {}".format(score_cnn))
print(classification_report(y_true_cnn, pred_cnn))

In [None]:
# Second CNN variant: four convolution/pooling stages followed by a dense
# head with dropout after its first layer. The best weights go to their own file.
filepath = './cnn_best_weights2.keras'

# Save the model only when the monitored quantity improves.
checkpointer = ModelCheckpoint(filepath=filepath, verbose=0, save_best_only=True)

adam_optimizer = Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-07)

cnn = Sequential([
    # Convolution / pooling stages.
    Conv2D(filters=32, kernel_size=(1, 5), strides=(1, 1), activation='relu',
           input_shape=(1, x_train_cnn.shape[2], 1)),
    MaxPooling2D(pool_size=(1, 2)),
    Conv2D(filters=64, kernel_size=(1, 7), strides=(1, 1), activation='relu'),
    MaxPooling2D(pool_size=(1, 2)),
    Conv2D(filters=128, kernel_size=(1, 15), strides=(1, 1), activation='relu'),
    MaxPooling2D(pool_size=(1, 2)),
    Conv2D(filters=256, kernel_size=(1, 15), strides=(1, 1), activation='relu'),
    MaxPooling2D(pool_size=(1, 2)),
    # Dense classification head.
    Flatten(),
    Dense(4096, activation='relu'),
    Dropout(.2),
    Dense(1024, activation='relu'),
    Dense(512, activation='relu'),
    Dense(128, activation='relu'),
    Dense(3, activation='softmax'),
])

cnn.compile(loss='categorical_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])

# Stop once val_loss has not improved by at least 1e-3 for 5 epochs.
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
cnn.fit(
    x_train_cnn,
    y_train_cnn,
    validation_data=(x_test_cnn, y_test_cnn),
    callbacks=[monitor, checkpointer],
    verbose=2,
    epochs=100,
    batch_size=256,
)

print('CNN - Training finished...Loading the best model')
print()
cnn.load_weights('./cnn_best_weights2.keras')  # restore the best checkpoint

# Accuracy: arg-max class of the predictions against arg-max of the one-hot targets.
pred_cnn2 = cnn.predict(x_test_cnn).argmax(axis=1)
y_true_cnn2 = y_test_cnn.argmax(axis=1)

score_cnn2 = metrics.accuracy_score(y_true_cnn2, pred_cnn2)

In [None]:
# Overall accuracy and per-class precision/recall/F1 of the second CNN.
print("Final accuracy: {}".format(score_cnn2))
print(classification_report(y_true_cnn2, pred_cnn2))

## Transformer Model

In [None]:
# `t_data` is another name for the `transformer_data` object (no copy), so
# columns assigned to `t_data` below are visible through both names.
t_data = transformer_data
t_data.shape

In [None]:
# Split the transformer data 80/20 with a fixed seed.
train_data, test_data = train_test_split(t_data, random_state=42, test_size=0.2)

# Resolve each output path once and use the same value for writing and for
# the later reads. Previously the files were written relative to the current
# directory and the variables were then rebound to hard-coded
# '/kaggle/working/...' paths, which only matched when the notebook ran from
# that directory (where abspath yields the same paths).
train_data_file = os.path.abspath("train_data.csv")
test_data_file = os.path.abspath("test_data.csv")

# The files carry no header row; CSV_HEADER supplies the column names when
# they are read back.
train_data.to_csv(train_data_file, index=False, header=False)
test_data.to_csv(test_data_file, index=False, header=False)

# Columns added to `t_data` after this cell are in neither the exported
# files nor CSV_HEADER.
CSV_HEADER = t_data.columns.tolist()

In [None]:
zscore_cols = ['num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'time_in_hospital']

# Per-row 'weight': the sum of the numeric columns above, accumulated from 0
# in list order. The assignment replaces any existing column named 'weight'.
# NOTE(review): the train/test CSV files and CSV_HEADER are produced in the
# preceding cell, before this column is assigned, so this computed column is
# not part of those files -- confirm the intended ordering.
t_data['weight'] = sum((t_data[col] for col in zscore_cols), 0)

In [None]:
# Preview the computed weight column.
t_data['weight'].head()

In [None]:
# Name of the per-example weight column.
WEIGHT_COLUMN_NAME = "weight"

zscore_cols = ['num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses', 'time_in_hospital']

# Numerical feature names (the same list object as zscore_cols).
NUMERIC_FEATURE_NAMES = zscore_cols

# Every other column is treated as categorical, except the columns whose
# name starts with 'readmitted' (target) or 'weight'.
CATEGORICAL_FEATURE_NAMES = [
    col
    for col in transformer_data.columns
    if not (
        col in NUMERIC_FEATURE_NAMES
        or col.startswith(('readmitted', 'weight'))
    )
]

# Vocabulary of each categorical feature: its distinct values as strings,
# in sorted order.
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    feature: sorted(transformer_data[feature].astype(str).unique().tolist())
    for feature in CATEGORICAL_FEATURE_NAMES
}

# Categorical feature names, taken from the vocabulary mapping.
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY)
# All model input features: numeric first, then categorical.
FEATURE_NAMES = [*NUMERIC_FEATURE_NAMES, *CATEGORICAL_FEATURE_NAMES]
# Default value per CSV column: 0.0 for the numeric and weight columns,
# "NA" for everything else.
COLUMN_DEFAULTS = [
    ["NA"] if feature_name not in NUMERIC_FEATURE_NAMES + [WEIGHT_COLUMN_NAME] else [0.0]
    for feature_name in CSV_HEADER
]

# Name of the target column and its class labels.
TARGET_FEATURE_NAME = "readmitted"
TARGET_LABELS = ["<30", ">30", "NO"]

In [None]:
# Number of categorical, numeric and total input features.
len(CATEGORICAL_FEATURE_NAMES), len(NUMERIC_FEATURE_NAMES), len(FEATURE_NAMES)

In [None]:
# Training hyperparameters for the transformer experiment.
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.0001
DROPOUT_RATE = 0.2
# NOTE(review): 265 may be a typo for 256 -- confirm.
BATCH_SIZE = 265
NUM_EPOCHS = 15

# Architecture hyperparameters.
NUM_TRANSFORMER_BLOCKS = 3  # Number of transformer blocks.
NUM_HEADS = 4  # Number of attention heads.
EMBEDDING_DIMS = 16  # Embedding dimensions of the categorical features.
MLP_HIDDEN_UNITS_FACTORS = [
    2,
    1,
]  # MLP hidden layer units, as factors of the number of inputs.
# Not referenced by any code visible in this notebook section.
NUM_MLP_BLOCKS = 2  # Number of MLP blocks in the baseline model.

In [None]:
# Map the target strings to class indices 0..len(TARGET_LABELS)-1.
# No out-of-vocabulary index is reserved: with num_oov_indices=1 the labels
# were shifted to 1..3, which falls outside the range the 3-unit softmax
# output and the sparse categorical loss accept (0..2).
target_label_lookup = layers.StringLookup(
    vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0
)


def prepare_example(features, target):
    """Turn a (features, target) batch into (features, target_index, weights).

    The weight column is removed from `features` and returned as the third
    element; the target strings are converted to indices by
    `target_label_lookup`.
    """
    sample_weights = features.pop(WEIGHT_COLUMN_NAME)
    return features, target_label_lookup(target), sample_weights


# One StringLookup layer per categorical feature, converting its string
# values to integer indices. No mask token is used and a single
# out-of-vocabulary index is reserved (num_oov_indices=1) for values that
# are not in the feature's vocabulary.
lookup_dict = {
    feature_name: layers.StringLookup(
        vocabulary=CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name],
        mask_token=None,
        num_oov_indices=1,
    )
    for feature_name in CATEGORICAL_FEATURE_NAMES
}


def encode_categorical(batch_x, batch_y, weights):
    """Replace each categorical feature in `batch_x` with its integer indices.

    `batch_x` is updated in place using the per-feature layers in
    `lookup_dict`; `batch_y` and `weights` are passed through unchanged.
    """
    encoded_features = {
        name: lookup_dict[name](batch_x[name]) for name in CATEGORICAL_FEATURE_NAMES
    }
    batch_x.update(encoded_features)
    return batch_x, batch_y, weights


def get_dataset_from_csv(csv_file_path, batch_size=128, shuffle=False):
    """Build a cached, batched dataset of (features, target_index, weights).

    Args:
        csv_file_path: path of a header-less CSV whose columns follow CSV_HEADER.
        batch_size: number of rows per batch.
        shuffle: whether make_csv_dataset shuffles the rows.

    Returns:
        The dataset after `prepare_example` and `encode_categorical` have
        been mapped over it, with `.cache()` applied.
    """
    # `tf_data` was never imported in this notebook (NameError); the same API
    # is reached through the already imported `tf` module.
    dataset = (
        tf.data.experimental.make_csv_dataset(
            csv_file_path,
            batch_size=batch_size,
            column_names=CSV_HEADER,
            column_defaults=COLUMN_DEFAULTS,
            label_name=TARGET_FEATURE_NAME,
            num_epochs=1,
            header=False,
            na_value="?",  # "?" fields are read as NA and get the column default
            shuffle=shuffle,
        )
        .map(prepare_example, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
        .map(encode_categorical)
    )
    return dataset.cache()

In [None]:
def encode_categorical(batch_x, batch_y, weights):
    """Replace each categorical feature in `batch_x` with its integer indices.

    This cell defines the same function as the previous cell; the definition
    executed last is the one in effect. `batch_x` is updated in place using
    the per-feature layers in `lookup_dict`; `batch_y` and `weights` are
    passed through unchanged.
    """
    encoded_features = {
        name: lookup_dict[name](batch_x[name]) for name in CATEGORICAL_FEATURE_NAMES
    }
    batch_x.update(encoded_features)
    return batch_x, batch_y, weights


def get_dataset_from_csv(csv_file_path, batch_size=128, shuffle=False):
    """Build a cached, batched dataset of (features, target_index, weights).

    This cell defines the same function as the previous cell; the definition
    executed last is the one in effect.

    Args:
        csv_file_path: path of a header-less CSV whose columns follow CSV_HEADER.
        batch_size: number of rows per batch.
        shuffle: whether make_csv_dataset shuffles the rows.

    Returns:
        The dataset after `prepare_example` and `encode_categorical` have
        been mapped over it, with `.cache()` applied.
    """
    # `tf_data` was never imported in this notebook (NameError); the same API
    # is reached through the already imported `tf` module.
    dataset = (
        tf.data.experimental.make_csv_dataset(
            csv_file_path,
            batch_size=batch_size,
            column_names=CSV_HEADER,
            column_defaults=COLUMN_DEFAULTS,
            label_name=TARGET_FEATURE_NAME,
            num_epochs=1,
            header=False,
            na_value="?",  # "?" fields are read as NA and get the column default
            shuffle=shuffle,
        )
        .map(prepare_example, num_parallel_calls=tf.data.AUTOTUNE, deterministic=False)
        .map(encode_categorical)
    )
    return dataset.cache()

In [None]:
def run_experiment(
    model,
    train_data_file,
    test_data_file,
    num_epochs,
    learning_rate,
    weight_decay,
    batch_size,
):
    """Compile, train and evaluate `model` on the CSV splits.

    The model is compiled with AdamW and sparse categorical cross-entropy on
    probabilities, trained for `num_epochs` on the dataset read from
    `train_data_file` (shuffled), validated on the dataset read from
    `test_data_file`, and the validation accuracy is printed.

    Returns:
        The history object returned by `model.fit`.
    """
    model.compile(
        optimizer=keras.optimizers.AdamW(
            learning_rate=learning_rate, weight_decay=weight_decay
        ),
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
        metrics=[keras.metrics.SparseCategoricalAccuracy(name="accuracy")],
    )

    training_batches = get_dataset_from_csv(train_data_file, batch_size, shuffle=True)
    validation_batches = get_dataset_from_csv(test_data_file, batch_size)

    print("Start training the model...")
    history = model.fit(
        training_batches,
        epochs=num_epochs,
        validation_data=validation_batches,
    )
    print("Model training finished")

    # evaluate returns the loss followed by the compiled metric.
    _loss, accuracy = model.evaluate(validation_batches, verbose=0)
    print(f"Validation accuracy: {round(accuracy * 100, 2)}%")

    return history

def create_model_inputs():
    """Create one scalar Keras Input per entry of FEATURE_NAMES.

    Numeric features are float32 inputs; all other features are int32 inputs.
    Returns a dict mapping feature name to its Input.
    """
    return {
        feature_name: layers.Input(
            name=feature_name,
            shape=(),
            dtype="float32" if feature_name in NUMERIC_FEATURE_NAMES else "int32",
        )
        for feature_name in FEATURE_NAMES
    }

In [None]:
def encode_inputs(inputs, embedding_dims):
    """Embed the categorical inputs and expand the numerical ones.

    Args:
        inputs: dict mapping feature name to its Keras input tensor.
        embedding_dims: output dimension of every categorical embedding.

    Returns:
        (encoded_categorical_feature_list, numerical_feature_list): the
        embedded categorical features and the numerical features with a
        trailing axis added, each in the iteration order of `inputs`.
    """
    encoded_categorical_feature_list = []
    numerical_feature_list = []

    for feature_name in inputs:
        if feature_name in CATEGORICAL_FEATURE_NAMES:
            vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]

            # The StringLookup layers in `lookup_dict` reserve one
            # out-of-vocabulary index (num_oov_indices=1), so the indices run
            # from 0 to len(vocabulary) inclusive. The embedding table needs
            # one row more than the vocabulary size; with len(vocabulary) rows
            # the largest index was out of range.
            embedding = layers.Embedding(
                input_dim=len(vocabulary) + 1, output_dim=embedding_dims
            )

            # Convert the index values to embedding representations.
            encoded_categorical_feature = embedding(inputs[feature_name])
            encoded_categorical_feature_list.append(encoded_categorical_feature)

        else:
            # Use the numerical features as-is, with a trailing axis added.
            numerical_feature = ops.expand_dims(inputs[feature_name], -1)
            numerical_feature_list.append(numerical_feature)

    return encoded_categorical_feature_list, numerical_feature_list

def create_mlp(hidden_units, dropout_rate, activation, normalization_layer, name=None):
    """Build a Sequential MLP.

    For every entry of `hidden_units` the model gets, in order, a fresh
    normalization layer (obtained by calling `normalization_layer()`), a
    Dense layer with that many units and the given activation, and a
    Dropout layer with `dropout_rate`.
    """
    mlp_layers = []
    for units in hidden_units:
        mlp_layers.extend(
            [
                normalization_layer(),
                layers.Dense(units, activation=activation),
                layers.Dropout(dropout_rate),
            ]
        )
    return keras.Sequential(mlp_layers, name=name)


In [None]:
def create_tabtransformer_classifier(
    num_transformer_blocks,
    num_heads,
    embedding_dims,
    mlp_hidden_units_factors,
    dropout_rate,
    use_column_embedding=False,
):
    """Build the TabTransformer classifier as a Keras functional model.

    Categorical feature embeddings are passed through
    `num_transformer_blocks` self-attention blocks, flattened, concatenated
    with the layer-normalized numerical features and fed to a final MLP that
    ends in a 3-unit softmax.

    Args:
        num_transformer_blocks: number of attention + feedforward blocks.
        num_heads: number of attention heads per block.
        embedding_dims: embedding size of each categorical feature (also used
            as the attention key_dim and the feedforward width).
        mlp_hidden_units_factors: hidden layer sizes of the final MLP, given
            as multiples of the width of the concatenated feature vector.
        dropout_rate: dropout rate used in attention and in the MLPs.
        use_column_embedding: when True, a learned per-column embedding is
            added to the categorical feature embeddings.

    Returns:
        A keras.Model mapping the input dict to class probabilities.
    """
    # Create model inputs.
    inputs = create_model_inputs()
    # Encode features.
    encoded_categorical_feature_list, numerical_feature_list = encode_inputs(
        inputs, embedding_dims
    )
    # Stack categorical feature embeddings for the Transformer.
    encoded_categorical_features = ops.stack(encoded_categorical_feature_list, axis=1)
    # Concatenate numerical features.
    numerical_features = layers.concatenate(numerical_feature_list)

    # Add column embedding to categorical feature embeddings.
    if use_column_embedding:
        num_columns = encoded_categorical_features.shape[1]
        column_embedding = layers.Embedding(
            input_dim=num_columns, output_dim=embedding_dims
        )
        column_indices = ops.arange(start=0, stop=num_columns, step=1)
        encoded_categorical_features = encoded_categorical_features + column_embedding(
            column_indices
        )
    # Create multiple layers of the Transformer block.
    for block_idx in range(num_transformer_blocks):
        # Create a multi-head attention layer (self-attention: query and value
        # are both the categorical embeddings).
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embedding_dims,
            dropout=dropout_rate,
            name=f"multihead_attention_{block_idx}",
        )(encoded_categorical_features, encoded_categorical_features)
        # Skip connection 1.
        x = layers.Add(name=f"skip_connection1_{block_idx}")(
            [attention_output, encoded_categorical_features]
        )
        # Layer normalization 1.
        x = layers.LayerNormalization(name=f"layer_norm1_{block_idx}", epsilon=1e-6)(x)
        # Feedforward.
        feedforward_output = create_mlp(
            hidden_units=[embedding_dims],
            dropout_rate=dropout_rate,
            activation=keras.activations.gelu,
            normalization_layer=partial(
                layers.LayerNormalization, epsilon=1e-6
            ),  # using partial to provide keyword arguments before initialization
            name=f"feedforward_{block_idx}",
        )(x)
        # Skip connection 2.
        x = layers.Add(name=f"skip_connection2_{block_idx}")([feedforward_output, x])
        # Layer normalization 2; the result is the input of the next block.
        encoded_categorical_features = layers.LayerNormalization(
            name=f"layer_norm2_{block_idx}", epsilon=1e-6
        )(x)

    # Flatten the "contextualized" embeddings of the categorical features.
    categorical_features = layers.Flatten()(encoded_categorical_features)
    # Apply layer normalization to the numerical features.
    numerical_features = layers.LayerNormalization(epsilon=1e-6)(numerical_features)
    # Prepare the input for the final MLP block.
    features = layers.concatenate([categorical_features, numerical_features])

    # Compute MLP hidden_units.
    mlp_hidden_units = [
        factor * features.shape[-1] for factor in mlp_hidden_units_factors
    ]

    # Create final MLP.
    features = create_mlp(
        hidden_units=mlp_hidden_units,
        dropout_rate=dropout_rate,
        activation=keras.activations.selu,
        normalization_layer=layers.BatchNormalization,
        name="MLP",
    )(features)

    # Add a 3-unit softmax output layer (one probability per class).
    outputs = layers.Dense(3, activation="softmax", name="softmax")(features)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model


# Instantiate the TabTransformer with the hyperparameters defined above;
# use_column_embedding is left at its default (False).
tabtransformer_model = create_tabtransformer_classifier(
    num_transformer_blocks=NUM_TRANSFORMER_BLOCKS,
    num_heads=NUM_HEADS,
    embedding_dims=EMBEDDING_DIMS,
    mlp_hidden_units_factors=MLP_HIDDEN_UNITS_FACTORS,
    dropout_rate=DROPOUT_RATE,
)

# Report the total number of parameters of the model.
print("Total model weights:", tabtransformer_model.count_params())
# Optional architecture diagram (disabled):
# keras.utils.plot_model(tabtransformer_model, show_shapes=True, rankdir="LR")

In [None]:
# Train the TabTransformer on the exported CSV splits and keep the history
# object returned by run_experiment.
history = run_experiment(
    model=tabtransformer_model,
    train_data_file=train_data_file,
    test_data_file=test_data_file,
    num_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    batch_size=BATCH_SIZE,
)