In [1]:
# Our intention here is to reimplement the BoW Ham v Spam
# solution using the TextVectorization layer, which will
# allow us to easily deploy the model to a tfserving container.

import numpy as np         # Needed because.. NumPy
import os                  # We will iterate over directories
import re                  # needed to clean the email
import tensorflow as tf
from tensorflow.keras import models
from tensorflow.keras import layers


2024-04-05 19:59:01.998660: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-05 19:59:01.998930: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-05 19:59:02.000731: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-05 19:59:02.024102: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Using our simple directory traversal file-finder function
# to find all instances of both classes and translate them
# to a data directory that we can feed to a TensorFlow dataset.

def get_file_list(starting_directory):
    """Recursively collect the paths of all files under a directory.

    Entries are visited in sorted name order so the returned list is the
    same on every run; os.listdir() on its own returns entries in
    arbitrary order, which would make any index derived from the list
    position unstable.

    Args:
        starting_directory: Path of the directory to search.

    Returns:
        A list of file paths, each prefixed with starting_directory.
        Directories themselves are not included; an empty directory
        yields an empty list.
    """
    final_list = []
    for entry in sorted(os.listdir(starting_directory)):
        entry_path = os.path.join(starting_directory, entry)
        if os.path.isdir(entry_path):
            # extend() appends in place instead of re-copying the
            # accumulated list for every subdirectory.
            final_list.extend(get_file_list(entry_path))
        else:
            final_list.append(entry_path)
    return final_list


In [6]:
# Reusing our existing get_words function. This will strip out
# the headers, keeping only the message data.
#
# This has been adjusted to leave the text as-is except for the
# headers since we will allow the TextVectorization layer to
# do the rest of the data normalization.

def get_words(file_name):
    """Return the body lines of an email file, dropping the header block.

    The header is everything up to and including the first blank line.
    Every line after that is returned unchanged (trailing newline
    included); no further cleaning is done here.

    Args:
        file_name: Path of the email file to read.

    Returns:
        A list of the body lines as strings. The list is empty if the
        file contains no blank line.
    """
    body_lines = []
    # Tracks whether the blank line that ends the header has been seen.
    finished_header = False
    # Bytes that are not valid UTF-8 are silently dropped (errors='ignore').
    with open(file_name, encoding='utf8', errors='ignore') as f:
        for line in f:
            if finished_header:
                body_lines.append(line)
            elif line == "\n":
                # First blank line: everything after this is message body.
                finished_header = True
    return body_lines


In [7]:
# We want to leverage tf.keras.preprocessing.text_dataset_from_directory()
#
# To do this, we must preprocess each email into a separate file. The files
# must have the '.txt' extension or they will not be recognized. The files must
# be in directories named 'ham' and 'spam' respectively.

output_dir = './training'
for data_class in ['ham', 'spam']:
    path = os.path.join(output_dir, data_class)
    # Create the class directory up front so the cell works on a fresh
    # checkout; exist_ok keeps re-runs from failing.
    os.makedirs(path, exist_ok=True)
    for index, file in enumerate(get_file_list(os.path.join("./Enron",data_class))):
        words = get_words(file)
        # Write with the same encoding the source emails are read with,
        # rather than the platform default.
        with open(os.path.join(path, f'{index:>06}.txt'), 'w', encoding='utf8') as f:
            f.write(' '.join(words))
            

In [8]:
# We can create the dataset from this directory. Labels are inferred
# from the subdirectory names. A fixed shuffle seed keeps the batch
# order the same from run to run.
ds = tf.keras.preprocessing.text_dataset_from_directory(
    output_dir,
    labels='inferred',
    label_mode='binary',
    shuffle=True,
    seed=42,
    verbose=True
)

Found 31716 files belonging to 2 classes.


In [11]:
# We can now create and adapt the TextVectorization layer.
# This layer can perform many different tasks. We will use it
# to learn the vocabulary and re-encode each email as a multi-hot
# vector for the bag-of-words model.

text_layer = tf.keras.layers.TextVectorization(
    # Vocabulary size and output encoding: one multi-hot vector per text.
    max_tokens=10000,
    output_mode='multi_hot',
    pad_to_max_tokens=True,
    output_sequence_length=None,
    sparse=False,
    ragged=False,
    # Text normalization and tokenization.
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    ngrams=None,
    encoding='utf-8',
    # No precomputed vocabulary is supplied; adapt() below learns it.
    vocabulary=None,
    idf_weights=None,
)

# adapt() wants the raw strings only, but the dataset yields
# (text, label) pairs, so project each pair down to its text.
train_text = ds.map(lambda text, _label: text)

text_layer.adapt(train_text)

2024-04-05 20:24:08.261631: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [14]:
# Seed TensorFlow's global RNG before any layer is created.
tf.random.set_seed(42)

# Raw strings go in; the adapted TextVectorization layer turns them into
# multi-hot vectors, followed by two hidden layers and a sigmoid output
# for the binary ham/spam decision.
model = models.Sequential([
    text_layer,
    layers.Dense(64, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, name="Output_Layer", activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# ds already yields batches, so batch_size is not passed to fit();
# Keras documents that argument as unsupported for dataset inputs.
training_history = model.fit(ds, epochs=10)

Epoch 1/10
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9768 - loss: 0.0913
Epoch 2/10
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9957 - loss: 0.0105
Epoch 3/10
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9984 - loss: 0.0044
Epoch 4/10
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9966 - loss: 0.0114
Epoch 5/10
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9983 - loss: 0.0043
Epoch 6/10
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9986 - loss: 0.0031
Epoch 7/10
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9989 - loss: 0.0024
Epoch 8/10
[1m992/992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9988 - loss: 0.0024
Epoch 9/10
[1m992/992[0m [32m━━━━━━━━