<a href="https://colab.research.google.com/github/damola936/AI-ML-LLM/blob/main/AbstractOptimizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Skim Literature 🤖📜

The purpose of this Notebook is to build a model that makes reading medical papers easier.

## Checking for GPU

In [None]:
!nvidia-smi -L

## Getting Helper functions

In [None]:
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/refs/heads/main/extras/helper_functions.py

In [None]:
from helper_functions import plot_loss_curves, unzip_data

In [None]:
import datetime
import tensorflow as tf

# Create a tensorboard callback
def create_tensorboard_callback(dir_name, experiment_name):
    """
        Creates a tensorboard callback to use when training

        Args:
            dir_name: root directory that holds all TensorBoard logs
            experiment_name: sub-directory used for this experiment's runs

        Returns:
            A `tf.keras.callbacks.TensorBoard` instance that logs to
            `dir_name/experiment_name/<timestamp>`
    """
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d <-> %H-%M-%S")
    log_dir = dir_name + "/" + experiment_name + "/" + timestamp
    # The callback manages its own summary writers for log_dir, so no separate
    # tf.summary file writer is opened here (the old one was never used or closed)
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)
    print(f"Saving TensorBoard log files to: {log_dir}")
    return tensorboard_callback

## Getting a Dataset (PubMed 200K RCT Dataset)

https://github.com/Franck-Dernoncourt/pubmed-rct

In [None]:
!git clone https://github.com/Franck-Dernoncourt/pubmed-rct
!ls pubmed-rct

## Check What files are in the `PubMed20K` dataset

In [None]:
!ls pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/

## Exploring Dataset

In [None]:
# Creating function to read line of our document
def read_line(document):
    """
        Prints the first 10 lines of a document as a list of strings

        Args:
            document: path of the text file to preview

        Returns:
            None, the preview is only printed
    """
    from itertools import islice

    # Only pull the lines that get printed instead of loading the whole file
    with open(document, mode="r", encoding="utf-8") as file:
        data = list(islice(file, 10))
    print(data)

document = "/content/pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/train.txt"
read_line(document)

How do we want our data to look...
```
[{
    "line_number": 0,

    "target": "OBJECTIVE",

    "text" : "To investigate the efficacy of @ weeks of daily low-dose oral prednisolone in improving pain , ...",
    
    "total_lines" : 11
}] ...

```

In [None]:
import re

def preprocess_document(document):
    """
        Preprocesses document into a structured dictionary
        for model training

        The file is split into abstracts on the lines that start with "###";
        every remaining line is expected to look like "TARGET<tab>sentence".

        Args:
            document: path of the text file to parse

        Returns:
            A list with one dictionary per sentence holding "line_number"
            (position of the sentence inside its abstract, starting at 0),
            "target" (the part before the first tab), "text" (the part after
            the first tab) and "total_lines" (number of sentences in the
            abstract minus one, i.e. the highest "line_number" of that abstract)

        Raises:
            IndexError: if a line inside an abstract contains no tab
    """
    final_list = []
    pattern = r'^###.*$' # We want to filter based on lines with text that begin with "###"

    with open(document, mode="r", encoding="utf-8") as file:
        raw_text = file.read()

    # Split text based on the pattern and drop the empty pieces
    abstracts = [
        chunk.strip()
        for chunk in re.split(pattern, raw_text, flags=re.MULTILINE)
        if chunk.strip()
    ]

    # Iterate over our abstracts and create our target dictionary format
    for abstract in abstracts:
        abstract_lines = abstract.split("\n")
        last_line_number = len(abstract_lines) - 1
        for index, line in enumerate(abstract_lines):
            fields = line.split("\t")  # split once and reuse both parts
            final_list.append({
                "line_number": index,
                "target": fields[0],
                "text": fields[1],
                "total_lines": last_line_number
            })

    return final_list


In [None]:
data = preprocess_document(document)[12:24]
for d in data:
    print(d)
    print("\n")

In [None]:
import time

train_document = "/content/pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/train.txt"
test_document = "/content/pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/test.txt"
val_document = "/content/pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/dev.txt"

start_time = time.time()

train_data = preprocess_document(train_document)
test_data = preprocess_document(test_document)
val_data = preprocess_document(val_document)

end_time = time.time()

print(len(train_data), len(test_data), len(val_data))
print(f"Execution time: {end_time - start_time:.2f} seconds")


Turn our data into a dataframe to better visualize it

In [None]:
import pandas as pd

train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)
val_df = pd.DataFrame(val_data)

In [None]:
train_df.head(13)

In [None]:
test_df.head()

In [None]:
val_df.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def check_distribution(column, is_rotate=True):
    """
        Plots a histogram of a `train_df` column with a label on every bar

        Args:
            column: name of the `train_df` column to plot
            is_rotate: rotate the x tick labels by 70 degrees when True
    """
    axes = sns.histplot(data=train_df, x=column, bins=10)
    plt.title(f"Distribution of {column} variables")

    # Hide the top and right borders of the plot
    plt.gca().spines[["top", "right"]].set_visible(False)
    if is_rotate:
        plt.xticks(rotation=70)

    # Label every bar of the histogram
    for bars in axes.containers:
        axes.bar_label(bars, fontsize=10)

    plt.show()

In [None]:
check_distribution("target", is_rotate=True)

In [None]:
check_distribution("total_lines", is_rotate=False)

## Get list of sentences

In [None]:
train_sentences = train_df["text"].tolist()
val_sentences = val_df["text"].tolist()
test_sentences = test_df["text"].tolist()
print(len(train_sentences), len(val_sentences), len(test_sentences))

In [None]:
# View first 10 lines of training sentences
train_sentences[:10]

## Make Numeric Labels

In [None]:
# One Hot encode Labels
from sklearn.preprocessing import OneHotEncoder

one_hot_encoder = OneHotEncoder(sparse_output=False)
train_labels_one_hot = one_hot_encoder.fit_transform(
    train_df["target"].to_numpy().reshape(-1, 1))

val_labels_one_hot = one_hot_encoder.transform(
    val_df["target"].to_numpy().reshape(-1, 1)
)

test_labels_one_hot = one_hot_encoder.transform(
    test_df["target"].to_numpy().reshape(-1, 1)
)
# Checking labels...
train_labels_one_hot, val_labels_one_hot, test_labels_one_hot

## Label Encode Labels

In [None]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
train_labels_le = label_encoder.fit_transform(
    train_df["target"].to_numpy()
)
val_labels_le = label_encoder.transform(
    val_df["target"].to_numpy()
)
test_labels_le = label_encoder.transform(
    test_df["target"].to_numpy()
)

train_labels_le, val_labels_le, test_labels_le

In [None]:
# Get class names and number of classes from the LabelEncoder instance
num_classes = len(label_encoder.classes_)
class_names = label_encoder.classes_
num_classes, class_names

# Creating our Baseline Model (Model 0: A NaiveBayes Classifier)

In [None]:
models_metrics = []

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def add_model_evaluation_to_list(name, val_preds):
    """
        Scores validation predictions and stores the result in `models_metrics`

        Accuracy plus weighted F1, precision and recall are computed against
        `val_labels_le`. An entry that already exists for `name` is replaced,
        so re-running a model's evaluation cell does not add a duplicate row.

        Args:
            name: label identifying the model inside `models_metrics`
            val_preds: predicted labels, compared element-wise with `val_labels_le`
    """
    model_metrics = {
        "name": name,
        "accuracy": accuracy_score(val_labels_le, val_preds),
        "f1": f1_score(val_labels_le, val_preds, average="weighted"),
        "precision": precision_score(val_labels_le, val_preds, average="weighted"),
        "recall": recall_score(val_labels_le, val_preds, average="weighted"),
    }

    # Drop any earlier entry for this model in place (the list object is shared
    # with the rest of the notebook) before appending the fresh one
    models_metrics[:] = [entry for entry in models_metrics if entry["name"] != name]
    models_metrics.append(model_metrics)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

model_0 = Pipeline([
    ("tf-idf", TfidfVectorizer()),
    ("clf", MultinomialNB())
])

model_0.fit(train_sentences, train_labels_le)
model_0_val_preds = model_0.predict(val_sentences)

add_model_evaluation_to_list("model_0", model_0_val_preds)

In [None]:
models_metrics

We need to vectorize our text and then turn it into embeddings to feed to our CNN

## Preparing Dataset

Finding the average number of words per sentence in our data

In [None]:
# Find the average number of tokens (words) per training sentence
avg_length = round(sum([len(i.split()) for i in train_sentences]) / len(train_sentences))
avg_length

In [None]:
# How long of a sentence covers 95% of samples
import numpy as np

# Number of whitespace-separated tokens in every training sentence
sentence_lens = [len(i.split()) for i in train_sentences]
# NOTE: despite its name, avg_length now holds the 95th percentile of the
# sentence lengths; it overwrites the mean computed in the previous cell
avg_length = int(np.percentile(sentence_lens, 95))
avg_length

In [None]:
# Creating our Vectorized Text layer and Embedding Layer
from tensorflow.keras.layers import TextVectorization, Embedding

max_vocab_length = 68000
# avg_length was last assigned the 95th-percentile sentence length, not the mean
max_length = avg_length

# Maps every sentence to a fixed-length sequence of integer token ids
text_vectorizer = TextVectorization(
    max_tokens = max_vocab_length,
    output_sequence_length = max_length,
    output_mode = "int",
    name = "vectorizing_layer"
)

text_vectorizer.adapt(train_sentences) # Adapting text vectorizer
text_vocab = text_vectorizer.get_vocabulary()

# One 128-dimensional vector per entry of the adapted vocabulary
embedding = Embedding(
    input_dim = len(text_vocab),
    output_dim = 128,
    mask_zero=False, # Masking is turned off: padding index 0 is embedded like any other token
    name = "embedding_layer"
)

Creating Datasets, Making sure our data loads in as fast as possible with tensorflow tf data API

In [None]:
# Turn our data in tensorflow datasets
train_dataset = tf.data.Dataset.from_tensor_slices(
    (train_sentences, train_labels_one_hot)
)

val_dataset = tf.data.Dataset.from_tensor_slices(
    (val_sentences, val_labels_one_hot)
)

test_dataset = tf.data.Dataset.from_tensor_slices(
    (test_sentences, test_labels_one_hot)
)

train_dataset

In [None]:
# Turn our datasets into Prefetch Datasets for faster loading
train_dataset = train_dataset.batch(32).prefetch(tf.data.AUTOTUNE) # No need for shuffling as we need our data in sequences
val_dataset = val_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

train_dataset

# Creating Model 1 (A deep Sequence Model : A Convolutional Model)

In [None]:
# Creating a saving Directory for TensorBoard
SAVE_DIR = "model_logs"

In [None]:
# Building the Model
from tensorflow.keras import layers

# Raw strings go in; vectorization and embedding happen inside the model
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.Conv1D(filters=64, kernel_size=5,
                  activation="relu", padding="same",
                  name="convolution_layer")(x)
# Collapse the sequence dimension by keeping the maximum of every filter
x = layers.GlobalMaxPool1D(name="global_max_pooling_layer")(x)
outputs = layers.Dense(num_classes, activation="softmax", name="output_layer")(x)
model_1 = tf.keras.Model(inputs, outputs)

# Compiling the Model
# CategoricalCrossentropy is used because the datasets carry one-hot labels
model_1.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Fit the Model
# steps_per_epoch limits every epoch to 10% of the training batches
history_1 = model_1.fit(train_dataset,
                        steps_per_epoch=int(0.1 * len(train_dataset)),
                        validation_data=val_dataset,
                        validation_steps=int(0.1 * len(val_dataset)), # Only validate on 10% of batches to save time for experimentation
                        epochs=5,
                        callbacks=[
                            create_tensorboard_callback(SAVE_DIR, "model_1")
                        ])

In [None]:
model_1.evaluate(val_dataset)

In [None]:
plot_loss_curves(history_1)

In [None]:
model_1_preds = tf.argmax(model_1.predict(val_dataset), axis=1)
model_1_preds

In [None]:
add_model_evaluation_to_list("model_1", model_1_preds)
models_metrics

# Model 2 (Using a Pretrained Feature Extractor: Pretrained Token Embeddings)

In [None]:
import tf_keras

def create_tensorboard_callback_tf_keras(dir_name, experiment_name):
    """
        Creates a tensorboard callback to use when training a `tf_keras` model

        Args:
            dir_name: root directory that holds all TensorBoard logs
            experiment_name: sub-directory used for this experiment's runs

        Returns:
            A `tf_keras.callbacks.TensorBoard` instance that logs to
            `dir_name/experiment_name/<timestamp>`
    """
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d <-> %H-%M-%S")
    log_dir = dir_name + "/" + experiment_name + "/" + timestamp
    # The callback manages its own summary writers for log_dir, so no separate
    # tf.summary file writer is opened here (the old one was never used or closed)
    tensorboard_callback = tf_keras.callbacks.TensorBoard(log_dir=log_dir)
    print(f"Saving TensorBoard log files to: {log_dir}")
    return tensorboard_callback

## Loading Pretrained Feature Extractor

In [None]:
# import tensorflow_hub as hub

# # Loading the Pretrained Feature extractor
# feature_extractor = hub.load(
#     "https://www.kaggle.com/models/google/universal-sentence-encoder/TensorFlow2/universal-sentence-encoder/4"
# )

In [None]:
import tf_keras
import tensorflow_hub as hub

# Creating the feature extractor layer
feature_extractor_layer = hub.KerasLayer(
    "https://tfhub.dev/google/universal-sentence-encoder/4",
    input_shape=[],  # every input is a single (scalar) string
    dtype=tf.string,
    trainable=False,  # keep the pretrained weights frozen
    name="U.S.E"
)

In [None]:
# Building the Model
# NOTE(review): Dense(64) has no activation, so it is a purely linear layer in
# front of the softmax layer - confirm this is intended
model_2 = tf_keras.Sequential([
    feature_extractor_layer,
    tf_keras.layers.Dense(64),
    tf_keras.layers.Dense(num_classes, activation="softmax")
])

# Compiling the Model
model_2.compile(
    loss=tf_keras.losses.CategoricalCrossentropy(),
    optimizer=tf_keras.optimizers.Adam(),
    metrics=["accuracy"]
)

# Fitting the Model on 10% of the training batches per epoch
# (fit on the training split, never on the test split, which is held out for evaluation)
history_2 = model_2.fit(
    train_dataset,
    steps_per_epoch=int(0.1 * len(train_dataset)),
    validation_data=val_dataset,
    validation_steps=int(0.1 * len(val_dataset)),
    epochs=5,
    callbacks=[create_tensorboard_callback_tf_keras(SAVE_DIR, "model_2")]
)

In [None]:
model_2.evaluate(val_dataset)

In [None]:
model_2_preds = tf.argmax(model_2.predict(val_dataset), axis=1)
model_2_preds

In [None]:
add_model_evaluation_to_list("model_2", model_2_preds)
models_metrics

# Model 3: Conv 1D with Character Embeddings

## To create a Character level embedding, we need a character level tokenizer

In [None]:
# Creating a character level tokenizer
train_sentences[:5]

In [None]:
# Make function to split sentences into characters
def split_characters(text):
    """
        Returns the characters of `text` joined by single spaces
    """
    characters = [character for character in text]
    return " ".join(characters)

# testing the function
random_senetence = "How are you doing"
sentence_split = split_characters(random_senetence)
sentence_split

In [None]:
# Split sequence level data splits to character level data splits
train_chars = [split_characters(s) for s in train_sentences]
val_chars = [split_characters(s) for s in val_sentences]
test_chars = [split_characters(s) for s in test_sentences]
train_chars[:3]

In [None]:
# What is the average character length
char_lens = [len(sentence) for sentence in train_sentences]
mean_char_len = np.mean(char_lens)
mean_char_len

In [None]:
# Check the distribution of our sequences at character level
plt.hist(char_lens)
plt.show()

In [None]:
# Find what character length covers 95% of sequences
output_char_len = int(np.percentile(char_lens, 95))
output_char_len

In [None]:
# Get all keyboard characters
import string

alphabet = string.ascii_lowercase + string.digits + string.punctuation
alphabet

In [None]:
# Create Char-Level token vectorizer instance
NUM_CHAR_TOKENS = len(alphabet) + 2  # add 2 for space and OOV(out of vocab, '[UNK]') token
char_vectorizer = TextVectorization(
    max_tokens = NUM_CHAR_TOKENS,
    output_sequence_length = output_char_len,
    output_mode = "int",
    name = "char_vectorizer"
)

In [None]:
# Adapt character vectorizer to training characters
char_vectorizer.adapt(train_chars)

In [None]:
# Check character vocab stats
char_vocab = char_vectorizer.get_vocabulary()
print(f"Length of characters in vocabulary {len(char_vocab)}")
print(f"Top 5 most common characters {char_vocab[:5]}")
print(f"Least 5 common words {char_vocab[-5:]}")

In [None]:
import random

test_rand_char = random.choice(train_chars)
print(f"Character split test: {test_rand_char}\n")
print(f"Length of Character split text: {len(test_rand_char.split())}\n")
vectorized_test_chars = char_vectorizer([test_rand_char])
print(f"Vectorized Character split text: {vectorized_test_chars}")
print(f"Length of vectorized character split text: {len(vectorized_test_chars[0])}")

## Creating a Character level embedding layer

In [None]:
# Creating the character embedding layer
char_embedding = Embedding(
    input_dim = len(char_vocab),
    output_dim = 25, # this is the size of the char embedding in the paper
    mask_zero=False, # Masking is turned off: padding index 0 is embedded like any other character token
    name = "character_embedding_layer"
)

In [None]:
# Test out character embedding layer
print(f"Character split text:\n {test_rand_char}")
character_embed_sample = char_embedding(char_vectorizer([test_rand_char]))
print(f"\nCharacter embedding:\n {character_embed_sample}\n")
print(f"Character embedding shape: {character_embed_sample.shape}")

## Building the Conv1D Model with character embedding

Turning our dataset into tensor slices for fast loading

In [None]:
# Turn our data in tensorflow datasets
train_dataset_chars = tf.data.Dataset.from_tensor_slices(
    (train_chars, train_labels_one_hot)
)

val_dataset_chars = tf.data.Dataset.from_tensor_slices(
    (val_chars, val_labels_one_hot)
)

test_dataset_chars = tf.data.Dataset.from_tensor_slices(
    (test_chars, test_labels_one_hot)
)

train_dataset_chars

Turning our datasets into prefetch datasets for better performance and faster training

In [None]:
train_dataset_chars = train_dataset_chars.batch(32).prefetch(tf.data.AUTOTUNE)
val_dataset_chars = val_dataset_chars.batch(32).prefetch(tf.data.AUTOTUNE)
test_dataset_chars = test_dataset_chars.batch(32).prefetch(tf.data.AUTOTUNE)

In [None]:
from tensorflow.keras import layers

# Building the Model
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = char_vectorizer(inputs)
x = char_embedding(x)
x = layers.Conv1D(filters=128, kernel_size=5,
                  activation="relu", padding="same")(x)
x = layers.GlobalMaxPool1D(name="global_max_pooling_layer")(x)
outputs = layers.Dense(num_classes, activation="softmax", name="output_layer")(x)
model_3 = tf.keras.Model(inputs, outputs, name="model_3")

# Compiling the Model
model_3.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# Fit the Model
history_3 = model_3.fit(
    train_dataset_chars,
    epochs=5,
    steps_per_epoch=int(0.1 * len(train_dataset_chars)),
    validation_data=val_dataset_chars,
    validation_steps=int(0.1 * len(val_dataset_chars)),
    callbacks=[create_tensorboard_callback(SAVE_DIR, "model_3")]
)

In [None]:
model_3.evaluate(val_dataset_chars)

In [None]:
model_3_preds = tf.argmax(model_3.predict(val_dataset_chars), axis=1)
model_3_preds

In [None]:
add_model_evaluation_to_list("model_3", model_3_preds)
models_metrics

# Model 4: Combining Pre-trained `token embeddings` with `character embeddings`



*   Create a Token Embedding layer
*   Create a Character Embedding Layer
*   Combine Token and Character Embedding layer with a Concatenate Layer
*   Build a Series of output layers on top of the concatenated layers
*   Construct a Model which takes token and character level sequences as input and produces sequence label probabilities as outputs.



In [None]:
# Creating the feature extractor layer
model_4_feature_extractor_layer = hub.KerasLayer(
    "https://www.kaggle.com/models/google/universal-sentence-encoder/TensorFlow2/universal-sentence-encoder/2",
    trainable=False,
    name="U.S.E"
)

# Set up token embedding layer
# The hub layer is called through a Lambda layer; output_shape declares the
# 512-wide embedding it is expected to return per sentence
token_inputs = layers.Input(shape=(), dtype=tf.string, name="token_inputs")
token_embeddings = layers.Lambda(lambda x : model_4_feature_extractor_layer(x), output_shape=(512,))(token_inputs)
token_outputs = layers.Dense(128, activation="relu")(token_embeddings)
token_model = tf.keras.Model(token_inputs, token_outputs)

# Creating a character embedding layer
# Reuses the char_vectorizer and char_embedding layers created earlier in the notebook
char_inputs = layers.Input(shape=(1,), dtype=tf.string, name="char_inputs")
vectorized_inputs = char_vectorizer(char_inputs)
char_embeddings = char_embedding(vectorized_inputs)
char_bi_lstm = layers.Bidirectional(layers.LSTM(24))(char_embeddings)
char_model = tf.keras.Model(char_inputs, char_bi_lstm)

# Combine with concatenate (token + character), creates a hybrid token embedding
token_char_concat = layers.Concatenate(name="token_char_hybrid")([token_model.output, char_model.output])

# Series of output layers, adding in dropout layers
combined_dropout = layers.Dropout(0.5)(token_char_concat)
combined_dense = layers.Dense(128, activation="relu")(combined_dropout)
final_dropout = layers.Dropout(0.5)(combined_dense)
output_layer = layers.Dense(num_classes, activation="softmax")(final_dropout)

# Construct Model with token and char inputs
# Input order is (token, char); the datasets fed to this model must use the same order
model_4 = tf.keras.Model(inputs=[token_model.input, char_model.input],
                         outputs=output_layer, name="model_4")

In [None]:
# Get Model summary
model_4.summary()

## Plotting Hybrid Model

In [None]:
from tensorflow.keras.utils import plot_model

plot_model(model_4, show_shapes=True, show_layer_names=True)

In [None]:
# Compile Model 4
model_4.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [None]:
# Fast Loading our data using tf.data API
# Every element has the structure ((sentence, characters), label). The datasets are
# handed to zip as a single tuple, the same form the tribrid pipeline uses, which
# also works with a zip signature that takes one `datasets` argument.
train_dataset_text = tf.data.Dataset.from_tensor_slices((train_sentences, train_chars))
train_dataset_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot)
train_dataset_hybrid = tf.data.Dataset.zip((train_dataset_text, train_dataset_labels)).batch(32).prefetch(tf.data.AUTOTUNE)

val_dataset_text = tf.data.Dataset.from_tensor_slices((val_sentences, val_chars))
val_dataset_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
val_dataset_hybrid = tf.data.Dataset.zip((val_dataset_text, val_dataset_labels)).batch(32).prefetch(tf.data.AUTOTUNE)

test_dataset_text = tf.data.Dataset.from_tensor_slices((test_sentences, test_chars))
test_dataset_labels = tf.data.Dataset.from_tensor_slices(test_labels_one_hot)
test_dataset_hybrid = tf.data.Dataset.zip((test_dataset_text, test_dataset_labels)).batch(32).prefetch(tf.data.AUTOTUNE)

In [None]:
# Checking the Dataset
train_dataset_hybrid, val_dataset_hybrid

## Fitting the Model & Making Predictions

In [None]:
history_4 = model_4.fit(train_dataset_hybrid,
                        epochs=5,
                        steps_per_epoch=int(0.1 * len(train_dataset_hybrid)),
                        validation_data=val_dataset_hybrid,
                        validation_steps=int(0.1 * len(val_dataset_hybrid)),
                        callbacks=[create_tensorboard_callback(SAVE_DIR, "model_4")])

In [None]:
model_4.evaluate(val_dataset_hybrid)

In [None]:
model_4_preds = tf.argmax(model_4.predict(val_dataset_hybrid), axis=1)
model_4_preds

In [None]:
add_model_evaluation_to_list("model_4", model_4_preds)
models_metrics

🗝️ Note: Any engineered features used to train our model need to be available at test time. In our case, line numbers and total lines are available.

# Creating Positional Embeddings

For Line Numbers

In [None]:
# How many different Line Numbers are there?
train_df["line_number"].value_counts()

In [None]:
# Check the distribution of "line_number" column
train_df["line_number"].plot(kind="hist")

In [None]:
# Using pure tensorflow to create one hot encoded tensors.
# From the distribution it seems like 15 is the cut off for distributions that matter.
# NOTE(review): with depth=15, line numbers of 15 and above presumably come out as
# all-zero rows instead of raising - confirm against the tf.one_hot documentation

train_line_numbers_one_hot = tf.one_hot(train_df["line_number"].to_numpy(), depth=15)
val_line_numbers_one_hot = tf.one_hot(val_df["line_number"].to_numpy(), depth=15)
test_line_numbers_one_hot = tf.one_hot(test_df["line_number"].to_numpy(), depth=15)

train_line_numbers_one_hot[:10], train_line_numbers_one_hot.shape

For Total Lines

In [None]:
# How many different number of total lines?
train_df["total_lines"].value_counts()

In [None]:
# Check the distribution of total lines
train_df["total_lines"].plot(kind="hist")

In [None]:
# Check the coverage of "total_lines" value if 20
np.percentile(train_df.total_lines, 98)

In [None]:
# Using tensorflow to create one-hot encoded tensors of our "total_lines" feature
# NOTE(review): with depth=20, total_lines values of 20 and above presumably come out
# as all-zero rows instead of raising - confirm against the tf.one_hot documentation

train_total_lines_one_hot = tf.one_hot(train_df["total_lines"].to_numpy(), depth=20)
val_total_lines_one_hot = tf.one_hot(val_df["total_lines"].to_numpy(), depth=20)
test_total_lines_one_hot = tf.one_hot(test_df["total_lines"].to_numpy(), depth=20)

train_total_lines_one_hot[:10], train_total_lines_one_hot.shape

# Model 5: `Pretrained Token Embeddings` + `Character Embeddings` + `Positional  Embeddings`



1.   Create a Token Level Model
2.   Create a Character Level Model
3.   Create a model for the "line_number" feature
4.   Create a model for the "total_lines" feature
5.   Combine the output for `1` and `2` using `tf.keras.layers.Concatenate( )`
6.   Combine the outputs of `3` , `4` , `5`, using `tf.keras.layers.Concatenate( )`

7.   Create an output layer to accept the tribrid embedding and output label probabilities
8.   Combine the inputs of `1`, `2`, `3`, `4` into a `tf.keras.Model` model

In [None]:
# ✅ Custom Layer for Feature Extraction with Serialization Support
@tf.keras.utils.register_keras_serializable()
class FeatureExtractorLayer(tf.keras.layers.Layer):
    """Wraps an existing layer so it can be called inside a model and stored in its config

    Args:
        extractor_layer: the layer applied to the inputs in `call`
        **kwargs: forwarded to `tf.keras.layers.Layer`
    """

    def __init__(self, extractor_layer, **kwargs):
        super().__init__(**kwargs)
        self.extractor_layer = extractor_layer  # Store reference

    def call(self, inputs):
        """Return the wrapped layer's output for `inputs`"""
        return self.extractor_layer(inputs)  # Call the stored layer

    def get_config(self):
        """Make the layer serializable"""
        config = super().get_config()
        # The wrapped layer is stored in serialized form next to the base config
        config.update({
            "extractor_layer": tf.keras.utils.serialize_keras_object(self.extractor_layer)
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Recreate the layer from config"""
        # Work on a copy so the caller's config dictionary is left untouched
        config = dict(config)
        config["extractor_layer"] = tf.keras.utils.deserialize_keras_object(config["extractor_layer"])
        return cls(**config)

In [None]:
# 1. Create a Token Model
token_inputs = layers.Input(shape=(), dtype="string", name="token_inputs")
# token_embeddings = layers.Lambda(lambda x : model_4_feature_extractor_layer(x), output_shape=(512,))(token_inputs)
token_embeddings = FeatureExtractorLayer(model_4_feature_extractor_layer)(token_inputs)
token_outputs = layers.Dense(128, activation="relu")(token_embeddings)
token_model = tf.keras.Model(token_inputs, token_outputs)

In [None]:
# 2. Create the Character level Model
char_inputs = layers.Input(shape=(1,), dtype="string", name="char_inputs")
char_vectorized = char_vectorizer(char_inputs)
char_embeddings = char_embedding(char_vectorized)
char_bi_lstm = layers.Bidirectional(layers.LSTM(24))(char_embeddings)
char_model = tf.keras.Model(char_inputs, char_bi_lstm)

In [None]:
# 3. Create a Model for the Line Number feature
line_number_inputs = layers.Input(shape=(15,),
                                  dtype=tf.float32, name="line_number_inputs")
x = layers.Dense(32, activation="relu")(line_number_inputs)
line_number_model = tf.keras.Model(line_number_inputs, x)

In [None]:
# 4. Create a Model for total_lines feature
total_lines_inputs = layers.Input(shape=(20,),
                                  dtype=tf.float32, name="total_lines_inputs")
y = layers.Dense(32, activation="relu")(total_lines_inputs)
total_lines_model = tf.keras.Model(total_lines_inputs, y)

In [None]:
# Combine 1 & 2
token_char_concat = layers.Concatenate(name="token_char_concat")(
    [token_model.output, char_model.output])
z = layers.Dropout(0.5)(token_char_concat)
z = layers.Dense(256, activation="relu")(z)
z = layers.Dropout(0.5)(z)

# Combine 3, 4 & 5
full_concat = layers.Concatenate(name="full_concat")(
    [line_number_model.output, total_lines_model.output, z])

In [None]:
# Create output layer
output_layer = layers.Dense(num_classes, activation="softmax",
                            name="output_layer")(full_concat)

In [None]:
# Combine Inputs
model_5 = tf.keras.Model(inputs=[
    line_number_model.input,
    total_lines_model.input,
    token_model.input,
    char_model.input
], outputs=output_layer, name="model_5_tribirid_model")

# Get model summary of Tribrid Model
model_5.summary()

## Plotting Tri-Brid Model

In [None]:
from tensorflow.keras.utils import plot_model

plot_model(model_5, show_shapes=True, show_layer_names=True)

## Compiling, Creating Fast loading Pipelines  and Fitting Model

 **What is label Smoothing?**


>For example, if our model gets too confident in a single class (e.g. its prediction probability is really high), it may get stuck on that class and not consider other classes.




---




Really confident: `[0.0, 0.0, 1.0, 0.0]`

>What label smoothing does is take some of the value away from the highest-probability class and spread it over the other classes, which hopefully improves generalization:

 With Label Smoothing: `[0.01, 0.01, 0.97, 0.01]`

In [None]:
# Compiling the Model
model_5.compile(loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2), # Helps to prevent overfitting
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

Creating Tri-brid datasets using tf.data

In [None]:
# Every element has the structure ((line_number, total_lines, sentence, characters), label).
# The feature order matches the order of the inputs list passed when model_5 is built.

# For train
train_dataset_tribrid_text = tf.data.Dataset.from_tensor_slices(
    (train_line_numbers_one_hot, train_total_lines_one_hot, train_sentences, train_chars))
train_dataset_tribrid_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot)
train_dataset_tribrid = tf.data.Dataset.zip((train_dataset_tribrid_text,train_dataset_tribrid_labels)
).batch(32).prefetch(tf.data.AUTOTUNE)

# For val
val_dataset_tribrid_text = tf.data.Dataset.from_tensor_slices(
    (val_line_numbers_one_hot, val_total_lines_one_hot, val_sentences, val_chars))
val_dataset_tribrid_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
val_dataset_tribrid = tf.data.Dataset.zip((val_dataset_tribrid_text,val_dataset_tribrid_labels)
).batch(32).prefetch(tf.data.AUTOTUNE)

# For test
test_dataset_tribrid_text = tf.data.Dataset.from_tensor_slices(
    (test_line_numbers_one_hot, test_total_lines_one_hot, test_sentences, test_chars))
test_dataset_tribrid_labels = tf.data.Dataset.from_tensor_slices(test_labels_one_hot)
test_dataset_tribrid = tf.data.Dataset.zip((test_dataset_tribrid_text,test_dataset_tribrid_labels)
).batch(32).prefetch(tf.data.AUTOTUNE)

In [None]:
# Checking the batched train and validation pipelines that are passed to fit
train_dataset_tribrid, val_dataset_tribrid

Fit the Tribrid Model to the Dataset Pipelines

In [None]:
# Fitting the Model
history_5 = model_5.fit(
    train_dataset_tribrid,
    epochs=5,
    steps_per_epoch=int(0.1 * len(train_dataset_tribrid)),
    validation_data=val_dataset_tribrid,
    validation_steps=int(0.1 * len(val_dataset_tribrid)),
    callbacks=[create_tensorboard_callback(SAVE_DIR, "model_5")]
)

In [None]:
model_5.evaluate(val_dataset_tribrid)

In [None]:
model_5_preds = tf.argmax(model_5.predict(val_dataset_tribrid), axis=1)
model_5_preds

In [None]:
add_model_evaluation_to_list("model_5", model_5_preds)
models_metrics

# Compare Results for all Models

In [None]:
import pandas as pd

results_df = pd.DataFrame(models_metrics)
results_df

## Visualizing Performances

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import random

def plot_model_metric(column):
    """
        Draws a horizontal bar chart of one metric for every model in `results_df`

        Args:
            column: name of the `results_df` column to plot on the x axis
    """
    palette_names = ["husl", "Set1", "Set2", "Set3", "coolwarm", "viridis"]
    chosen_palette = random.choice(palette_names)  # the palette can differ between calls

    plt.figure(figsize=(10, 7))
    axes = sns.barplot(data=results_df, x=column, y="name", palette=chosen_palette, hue="name")
    plt.title(f"Model {column} scores")
    plt.xlabel(column)
    plt.ylabel("Models")

    # Label every bar of the chart
    for bars in axes.containers:
        axes.bar_label(bars, fontsize=10)

    # Hide the top and right borders of the plot
    plt.gca().spines[["top", "right"]].set_visible(False)
    plt.show()

In [None]:
plot_model_metric("accuracy")

In [None]:
plot_model_metric("f1")

In [None]:
plot_model_metric("precision")

In [None]:
plot_model_metric("recall")

# Saving and Loading the Model

In [None]:
# NOTE(review): this path assumes Google Drive is already mounted at /content/drive;
# no mount step is visible in this notebook - confirm before running
model_5.save("/content/drive/MyDrive/AbstractOptimizer20K.keras")

In [None]:
loaded_model = tf.keras.models.load_model(
    "/content/drive/MyDrive/AbstractOptimizer20K.keras",
    custom_objects={
        'FeatureExtractorLayer': FeatureExtractorLayer,  # ✅ Add the registered custom layer
        'KerasLayer': hub.KerasLayer
    }
)

In [None]:
# Make predictions of loaded model
loaded_model_preds = tf.argmax(loaded_model.predict(val_dataset_tribrid), axis=1)
loaded_model_preds

In [None]:
model_5_preds[:10], loaded_model_preds[:10]

In [None]:
results_df.to_csv("/content/drive/MyDrive/model_results.csv", index=False)