<h1><font color="#113D68" size=6>PermGrad: Interpretable Hybrid Neural Networks with Synthetic Images for Tabular Data</font></h1>

---

# <font color="#004D7F" size=6> 1. Libraries</font>

---
# <font color="#004D7F" size=5> 1.1. System setup</font>

```
    sudo pip3 install tensorflow==2.17.1 torchmetrics pytorch_lightning TINTOlib==0.0.26 imblearn keras_preprocessing mpi4py bitstring optuna
```

---
# <font color="#004D7F" size=5> 1.2. Invoke the libraries</font>

In [None]:
import os

# Keras picks its backend when it is first imported, so the environment
# variable has to be set before tensorflow/keras are loaded below.
os.environ["KERAS_BACKEND"] = "tensorflow"

import numpy as np
import optuna
import pandas as pd
import tensorflow as tf
from keras.layers import (
    BatchNormalization, Dense, Dropout,
    Flatten, Input, MaxPooling2D
)
from keras_preprocessing.image import ImageDataGenerator
from tensorflow.keras.initializers import HeNormal
from tensorflow.keras.layers import (
    Conv2D
)
from tensorflow.keras.models import Model, Sequential as TF_Sequential, load_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from TINTOlib.tinto import TINTO

In [None]:
# Seed the NumPy and TensorFlow global random generators with the same value.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

---
# <font color="#004D7F" size=6> 2. Data processing</font>

---
# <font color="#004D7F" size=5> 2.1. TINTO method</font>

In [None]:
# Experiment configuration: dataset to process, image side length in pixels,
# and the problem type handed to TINTO.
dataset = "Covertype"
pixels = 20
problem_type = "supervised"

# Folder where the synthetic images of the selected dataset are stored.
images_folder = f"../images/{dataset}"

# TINTO converter built with blur disabled and the configured pixel size.
image_model = TINTO(
    problem=problem_type,
    blur=False,
    pixels=pixels,
)

---
# <font color="#004D7F" size=5> 2.2. Read the dataset</font>

In [None]:
# Supported datasets: CSV location and name of the target column in that CSV.
DATASET_SOURCES = {
    "HELOC": ("../datasets/HELOC/heloc.csv", "RiskPerformance"),
    "Dengue": ("../datasets/Dengue/dengue_chikunguya_bin.csv", "CLASSI_FIN"),
    "Covertype": ("../datasets/Covertype/covtype.csv", "54"),
    "Gas": ("../datasets/Gas/gas.csv", "Class"),
}

# Fail early with a clear message instead of a NameError further down.
if dataset not in DATASET_SOURCES:
    raise ValueError(
        f"Unknown dataset '{dataset}'. Expected one of: {sorted(DATASET_SOURCES)}"
    )

dataset_path, target_col = DATASET_SOURCES[dataset]
df = pd.read_csv(dataset_path, delimiter=',')

# Detach the target from the feature columns; it is re-attached below as the
# last column, integer-encoded, under the name 'class'.
class_col = df.pop(target_col)

label_encoder = LabelEncoder()
class_col_encoded = label_encoder.fit_transform(class_col)

df['class'] = class_col_encoded
df

In [None]:
# Print which integer each original class label was encoded to
# (the position of the label inside label_encoder.classes_).
labels = label_encoder.classes_

for integer_value, label in enumerate(labels):
    print(f"Label: {label} -> Integer Value: {integer_value}")

In [None]:
# Number of rows per encoded class.
class_counts = df["class"].value_counts()
print(class_counts)

---
# <font color="#004D7F" size=5> 2.3. Generate images</font>

In [None]:
# When True the images are regenerated even if the output folder already exists.
force_recreate_images = True

images_available = os.path.exists(images_folder)
if force_recreate_images or not images_available:
    image_model.generateImages(df, images_folder)
else:
    print("The images are already generated")

# CSV named after the problem type inside the images folder
# (presumably written by generateImages -- confirm against TINTOlib).
img_paths = os.path.join(images_folder, f"{problem_type}.csv")

---
# <font color="#004D7F" size=5> 2.4. Read images</font>

In [None]:
# Load the image index, cast the class to string and prefix every image
# entry with the images folder.
df = pd.read_csv(img_paths).assign(
    **{
        "class": lambda frame: frame["class"].astype(str),
        "images": lambda frame: images_folder + "/" + frame["images"],
    }
)

# Quick sanity check: first image path and the class distribution.
print(df.loc[0, "images"])
print(df["class"].value_counts())

---
# <font color="#004D7F" size=6> 3. Pre-modelling phase</font>

---
# <font color="#004D7F" size=5> 3.1. Data curation</font>

In [None]:
# Separate the features (everything except 'class') from the target.
df_x = df.drop(columns='class')
df_y = df['class']

# Stratified split: 60% train, then the remaining 40% is halved into
# validation and test (20% each of the full data).
X_train, X_holdout, y_train, y_holdout = train_test_split(
    df_x, df_y,
    test_size=0.40,
    random_state=42,
    stratify=df_y,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout,
    test_size=0.50,
    random_state=42,
    stratify=y_holdout,
)

# Number of distinct classes present in the data.
n_class = df['class'].value_counts().count()

In [None]:
# Re-attach each target series to its feature frame, one DataFrame per split.
df_train, df_val, df_test = (
    pd.concat([features, target], axis=1)
    for features, target in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

---
# <font color="#004D7F" size=5> 3.2. Rescale images</font>

In [None]:
# One generator per split; each is configured only with a 1/255 rescale factor.
train_datagen, valid_datagen, test_datagen = (
    ImageDataGenerator(rescale=1./255) for _ in range(3)
)

---
# <font color="#004D7F" size=5> 3.3. Iterators</font>

In [None]:
# Training iterator: shuffled batches of 32 RGB images with categorical labels.
train_iter = train_datagen.flow_from_dataframe(
    dataframe=df_train,
    x_col='images',
    y_col='class',
    target_size=(pixels, pixels),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=32,
    shuffle=True,
)

In [None]:
# Validation iterator: same settings as training but without shuffling.
valid_iter = valid_datagen.flow_from_dataframe(
    dataframe=df_val,
    x_col='images',
    y_col='class',
    target_size=(pixels, pixels),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=32,
    shuffle=False,
)

In [None]:
# Test iterator: unshuffled batches of 32 RGB images with categorical labels.
test_iter = test_datagen.flow_from_dataframe(
    dataframe=df_test,
    x_col='images',
    y_col='class',
    target_size=(pixels, pixels),
    color_mode='rgb',
    class_mode='categorical',
    batch_size=32,
    shuffle=False,
)

---
# <font color="#004D7F" size=6> 4. Modelling with CNN</font>

---
# <font color="#004D7F" size=5> 4.1. CNN</font>

In [None]:
# Per-dataset folder holding the final model and the Optuna study database.
model_dir = f'../models/{dataset}'
model_path = f'{model_dir}/model_{dataset}_CNN.keras'

# Optuna study persisted in a SQLite file next to the model.
study_db_path = f'{model_dir}/study_{dataset}_CNN.db'
storage_url = "sqlite:///" + study_db_path
study_name = f"{dataset}_cnn_study"

# Folder receiving the per-trial checkpoints written during the search.
base_checkpoint_dir = f'../datasets/{dataset}/optuna_CNN_checkpoints'

In [None]:
def create_multimodal_classifier(trial, input_shape, n_class):
    """Build and compile a CNN classifier with hyperparameters sampled from `trial`.

    Despite the name, the network built here has a single (image) input: a
    stack of Conv2D -> BatchNormalization -> MaxPooling2D -> Dropout blocks,
    flattened and followed by Dense -> BatchNormalization -> Dropout blocks
    and a softmax output layer.

    Args:
        trial: Optuna trial used to sample dropout, learning rate, number of
            conv/dense layers, base filter count, optimizer, activation,
            initializer, the units of each dense layer and (for AdamW) the
            weight decay.
        input_shape: Shape passed to the Keras `Input` layer.
        n_class: Number of units of the softmax output layer.

    Returns:
        A compiled Keras `Model` using categorical cross-entropy and the
        accuracy, precision, recall and AUC metrics.
    """
    # Sample the global hyperparameters first (same order on every trial).
    dropout = trial.suggest_float("dropout", 0.1, 0.4)
    lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)
    n_dense_layers = trial.suggest_int("n_dense_layers", 1, 4)
    n_conv_layers = trial.suggest_int("n_conv_layers", 1, 4)
    base_filters = trial.suggest_int("base_filters", 8, 256)
    optimizer_name = trial.suggest_categorical("optimizer", ["adam", "adamw"])
    activation_fn = trial.suggest_categorical("activation", ["relu"])

    init_name = trial.suggest_categorical("initializer", ["he_normal"])
    if init_name == "he_normal":
        initializer = HeNormal()

    cnn_inputs = Input(shape=input_shape)

    # Convolutional stack: the filter count doubles with every block.
    features = cnn_inputs
    for depth in range(n_conv_layers):
        n_filters = int(base_filters * (2 ** depth))
        conv = Conv2D(
            n_filters,
            (3, 3),
            activation=activation_fn,
            padding='same',
            kernel_initializer=initializer,
        )
        features = conv(features)
        features = BatchNormalization()(features)
        features = MaxPooling2D(2, 2)(features)
        features = Dropout(dropout)(features)
    features = Flatten()(features)

    # Dense head: the width of every layer is its own hyperparameter.
    hidden = features
    for layer_idx in range(n_dense_layers):
        units = trial.suggest_int(f"combined_dense_units_{layer_idx}", 8, 256)
        dense = Dense(units, activation=activation_fn, kernel_initializer=initializer)
        hidden = dense(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = Dropout(dropout)(hidden)

    output = Dense(n_class, activation='softmax', kernel_initializer=initializer)(hidden)

    # Weight decay is only sampled when AdamW is selected.
    if optimizer_name != "adam":
        wd = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)
        opt = tf.keras.optimizers.AdamW(learning_rate=lr, weight_decay=wd)
    else:
        opt = tf.keras.optimizers.Adam(learning_rate=lr)

    model = Model(inputs=cnn_inputs, outputs=output)
    tracked_metrics = [
        tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.AUC(name='auc'),
    ]
    model.compile(
        optimizer=opt,
        loss='categorical_crossentropy',
        metrics=tracked_metrics,
    )
    return model

def one_cycle_schedule(epoch, lr, total_epochs, max_lr, min_lr=1e-5):
    """Return the learning rate for `epoch` under a one-cycle policy.

    The first 25% of `total_epochs` ramps linearly from `min_lr` up to
    `max_lr`; the remaining 75% follows a cosine curve from `max_lr` back
    down to `min_lr`.

    Args:
        epoch: Zero-based epoch index.
        lr: Current learning rate. Unused; kept so the signature matches the
            `(epoch, lr)` call made by the scheduler callback.
        total_epochs: Planned number of epochs; must be positive.
        max_lr: Peak learning rate reached at the end of the warm-up.
        min_lr: Learning rate at the start of the warm-up and at the end of
            the annealing phase.

    Returns:
        The learning rate for `epoch` as a Python float.

    Raises:
        ValueError: If `total_epochs` is not positive.
    """
    if total_epochs <= 0:
        raise ValueError("total_epochs must be a positive number")

    warmup_epochs = total_epochs * 0.25
    if epoch < warmup_epochs:
        return float(min_lr + (max_lr - min_lr) * (epoch / warmup_epochs))

    # Clamp so epochs past the planned horizon stay at min_lr instead of
    # climbing back up the cosine curve.
    progress = min((epoch - warmup_epochs) / (total_epochs * 0.75), 1.0)
    cosine_factor = 0.5 * (1 + np.cos(np.pi * progress))
    # Anneal towards min_lr (not 0) so both phases share the same floor.
    return float(min_lr + (max_lr - min_lr) * cosine_factor)

def objective(trial):
    """Optuna objective: train one CNN configuration and return its best validation loss.

    Builds a model from the hyperparameters sampled by `trial`, fits it on
    `train_iter` with `valid_iter` as validation data, and checkpoints the
    best epoch (lowest `val_loss`) to a per-trial file whose path is stored
    in the trial's `best_model_path` user attribute.

    Args:
        trial: Optuna trial supplying the hyperparameters.

    Returns:
        The minimum `val_loss` observed during training.
    """
    # Drop Keras global state left by earlier trials; this helps keep memory
    # from growing over the course of the study.
    tf.keras.backend.clear_session()

    os.makedirs(base_checkpoint_dir, exist_ok=True)
    checkpoint_path = os.path.join(base_checkpoint_dir, f"trial_{trial.number}_best_model.keras")

    # Follow the image size used by the iterators instead of hard-coding it.
    input_shape = (pixels, pixels, 3)
    n_class = df['class'].value_counts().count()

    model = create_multimodal_classifier(trial, input_shape, n_class)

    epochs = 70

    # "lr" is sampled while the model is built; it is the peak of the schedule.
    max_lr = trial.params.get("lr", 1e-2)
    lr_scheduler = tf.keras.callbacks.LearningRateScheduler(
        lambda epoch, lr: one_cycle_schedule(epoch, lr, total_epochs=epochs, max_lr=max_lr),
        verbose=0
    )

    # Keep on disk only the full model of the best epoch of this trial.
    checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
        checkpoint_path,
        monitor='val_loss',
        save_best_only=True,
        save_weights_only=False,
        mode='min',
        verbose=0
    )
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=6, restore_best_weights=True
    )

    history = model.fit(
        train_iter,
        validation_data=valid_iter,
        epochs=epochs,
        verbose=1,
        callbacks=[early_stopping, checkpoint_cb, lr_scheduler]
    )

    val_loss = min(history.history['val_loss'])
    trial.set_user_attr("best_model_path", checkpoint_path)
    return val_loss

In [None]:
# Set to True to run the hyperparameter search even if a saved model exists.
force_retrain = False

if not os.path.exists(model_path) or force_retrain:

    # Neither SQLite nor model.save() creates missing parent folders, so make
    # sure they exist before the study database and the model are written.
    for target_path in (model_path, study_db_path):
        target_dir = os.path.dirname(target_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

    print(f"Creating or loading study: {study_name} from {study_db_path}")
    study = optuna.create_study(
        study_name=study_name,
        storage=storage_url,
        direction="minimize",
        load_if_exists=True
    )

    n_total_trials = 50
    print(f"Current trials: {len(study.trials)}. Optimizing up to {n_total_trials} total trials.")

    # Only finished trials count towards the budget, so failed or interrupted
    # trials of a resumed study do not shrink the search; never go negative.
    completed_trials = study.get_trials(
        deepcopy=False, states=(optuna.trial.TrialState.COMPLETE,)
    )
    remaining_trials = max(0, n_total_trials - len(completed_trials))
    study.optimize(objective, n_trials=remaining_trials)

    print("\nOptimization complete. Best trial:")
    best_trial = study.best_trial
    print(f"  Value (val_loss): {best_trial.value}")
    for key, value in best_trial.params.items():
        print(f"    {key}: {value}")

    # Each trial records the path of its best checkpoint as a user attribute.
    best_model_path = best_trial.user_attrs["best_model_path"]
    print(f"Loading best model from: {best_model_path}")
    best_model = load_model(best_model_path)

    best_model.save(model_path)
    model = best_model
    print(f"Best model saved to: {model_path}")

else:
    print(f"Model already exists at {model_path}. Loading it.")
    model = load_model(model_path)

print("Process finished.")

---
# <font color="#004D7F" size=6> 5. Results</font>

---
# <font color="#004D7F" size=5> 5.1. Validation/Test evaluation</font>

In [None]:
# Evaluate on every validation batch: len(valid_iter) includes the final
# partial batch that `n_samples // batch_size` would drop. The result gets its
# own name so it is not confused with (or overwritten by) the test score.
score_val = model.evaluate(valid_iter, steps=len(valid_iter))

In [None]:
# Evaluate on every test batch: len(test_iter) includes the final partial
# batch that `n_samples // batch_size` would drop.
score_test = model.evaluate(test_iter, steps=len(test_iter))

In [None]:
# Show the values returned by the evaluation on the test iterator
# (presumably the loss followed by the compiled metrics -- confirm the order).
print(score_test)