In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import datetime

In [2]:
# Configure how TensorFlow allocates GPU memory (must run before the GPUs are initialised).
gpus = tf.config.experimental.list_physical_devices('GPU')
# Allocate GPU memory on demand instead of reserving it all up front.
# The setting is applied to every visible GPU, not only the first one: TensorFlow
# requires memory growth to be configured identically on all GPUs.
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, enable=True)

gpus

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
# Load the raw SMS dataset from spam.csv, decoding the file as Latin-1
df = pd.read_csv('spam.csv', encoding='Latin-1')

In [4]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

There are no NaN values in the columns we want to keep (v1 and v2).

# Preprocessing

In [6]:
# Keep only the label (v1) and message (v2) columns and give them explicit names
column_names = {'v1': 'target', 'v2': 'text'}
df = df[list(column_names)].rename(columns=column_names)
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Encode the target as an integer: ham -> 0, spam -> 1
label_to_id = {'ham':0,'spam':1}
df['target_encoded'] = df['target'].map(label_to_id)

In [8]:
# I tried perfectly balancing ham and spam by keeping as many ham messages as spam messages, but the model performed worse. The code is kept below, commented out, for reference:
# df = df.sample(frac=1).reset_index(drop=True)
# df_ham = df.query('target_encoded == 0').iloc[:747,:]
# df_spam = df.query('target_encoded == 1')
# df = pd.concat([df_ham, df_spam], axis=0)

In [9]:
# Load the spaCy English pipeline 'en_core_web_md' (it must already be downloaded locally)
import spacy
nlp = spacy.load('en_core_web_md')

In [10]:
# We import the english stop words
from spacy.lang.en.stop_words import STOP_WORDS

In [11]:
# Remove all non alphanumeric characters except spaces
df["text_clean"] = df["text"].apply(lambda x: ''.join(ch for ch in x if ch.isalnum() or ch == " "))
# Collapse runs of spaces into a single space, lowercase, and trim both ends.
# str.replace(" +", " ") looks for the literal two characters " +" (it is not a regex),
# so it never collapsed anything; split()/join() does the job without extra imports.
df["text_clean"] = df["text_clean"].apply(lambda x: " ".join(x.split()).lower())
# Remove stop words (checked on both the raw token and its lemma) and replace every remaining word with its lemma
df["text_clean"] = df["text_clean"].apply(lambda x: " ".join(token.lemma_ for token in nlp(x) if token.lemma_ not in STOP_WORDS and token.text not in STOP_WORDS))

In [12]:
df.head()

Unnamed: 0,target,text,target_encoded,text_clean
0,ham,"Go until jurong point, crazy.. Available only ...",0,jurong point crazy available bugis n great wor...
1,ham,Ok lar... Joking wif u oni...,0,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,0,u dun early hor u c
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,nah think usf live


In [None]:
# Defensive clean-up: replace any non-string value in text_clean by an empty string.
# isinstance() is used instead of type(x)==str so that str subclasses are kept as they are.
df['text_clean'] = df['text_clean'].apply(lambda x: x if isinstance(x, str) else "")

After this step, every value of text_clean is a str (any non-string value is replaced by an empty string).

In [14]:
# Tokenize the cleaned text: every word is mapped to an integer index that the model can consume.
# The tokenizer is configured with num_words=10000 and "out_of_vocab" as its out-of-vocabulary token.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000, oov_token="out_of_vocab")
# Build the vocabulary from the cleaned messages, then encode each message as a list of indices
tokenizer.fit_on_texts(df["text_clean"])
df["text_encoded"] = tokenizer.texts_to_sequences(df["text_clean"])

In [15]:
df.head()

Unnamed: 0,target,text,target_encoded,text_clean,text_encoded
0,ham,"Go until jurong point, crazy.. Available only ...",0,jurong point crazy available bugis n great wor...,"[3625, 230, 444, 460, 942, 36, 51, 204, 943, 7..."
1,ham,Ok lar... Joking wif u oni...,0,ok lar joke wif u oni,"[10, 194, 461, 289, 2, 1455]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry 2 wkly comp win fa cup final tkts 2...,"[13, 300, 4, 533, 662, 34, 1456, 846, 420, 145..."
3,ham,U dun say so early hor... U c already then say...,0,u dun early hor u c,"[2, 125, 150, 2369, 2, 84]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,nah think usf live,"[705, 23, 663, 131]"


In [16]:
print("Number of different words:", len(tokenizer.word_counts))

Number of different words: 8215


In [17]:
# Pad the encoded texts with trailing zeros (padding="post") so that the shorter ones reach the length of the longest one
text_pad = tf.keras.preprocessing.sequence.pad_sequences(df["text_encoded"], padding="post")

In [18]:
print("Length each embedded row after padding is:", len(text_pad[0]))

Length each embedded row after padding is: 72


# Creation of train and val batch

In [19]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 train/validation split. random_state pins the shuffle so that the split,
# and therefore the reported validation scores, are the same after a kernel restart.
xtrain, xval, ytrain, yval = train_test_split(text_pad, df["target_encoded"], test_size=0.2, stratify=df["target_encoded"], random_state=42)

In [20]:
# Wrap the (padded text, label) pairs of each split into a tf.data.Dataset
train = tf.data.Dataset.from_tensor_slices((xtrain, ytrain))
val = tf.data.Dataset.from_tensor_slices((xval, yval))

Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB



2023-01-19 16:01:25.186859: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-01-19 16:01:25.186979: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [21]:
# Shuffle each dataset with a buffer as large as the dataset itself, then group it in batches of 128
train_batch = train.shuffle(buffer_size=len(train)).batch(batch_size=128)
val_batch = val.shuffle(buffer_size=len(val)).batch(batch_size=128)

In [22]:
# Sanity check: print one validation batch to confirm the expected (128, 72) shape.
# Note: `text` and `target` remain defined in the notebook namespace after the loop.
for text, target in val_batch.take(1):
  print(text, target)

tf.Tensor(
[[  10  100   19 ...    0    0    0]
 [ 316 1268  240 ...    0    0    0]
 [ 544   25   69 ...    0    0    0]
 ...
 [ 147   29    2 ...    0    0    0]
 [ 270  815  624 ...    0    0    0]
 [2799  281  539 ...    0    0    0]], shape=(128, 72), dtype=int32) tf.Tensor(
[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0
 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1], shape=(128,), dtype=int64)


# Model setup

### We will use tensorboard to monitor the training

In [23]:
%load_ext tensorboard

### Let's try a simple model with a GlobalMaxPooling1D

In [24]:
# tokenizer.word_index holds every fitted word plus the "out_of_vocab" token, with indices
# starting at 1 (0 is the padding value). len(tokenizer.word_counts) does not count the OOV
# entry, so sizing the embedding with it left the highest token index out of range.
vocab_size = len(tokenizer.word_index)
model = tf.keras.Sequential([
        # Input Word Embedding; the sequence length is read from the padded matrix itself
        # rather than from a loop variable left over by an earlier cell
        tf.keras.layers.Embedding(vocab_size+1, 128, input_shape=[text_pad.shape[1],], name="embedding"), # +1 because of padding value

        # Keep, for each embedding dimension, the maximum value over the whole sequence
        tf.keras.layers.GlobalMaxPooling1D(),

        # Classic dense layers
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.2),

        tf.keras.layers.Dense(1, activation="sigmoid") # sigmoid because it's a binary classification
])

In [25]:
import keras.backend as K
# we define a f1_score calculation as a metrics which should be better for the spam classification than a simple accuracy (precision and/or recall could be bad)
def f1(y_true, y_pred):
    """Batch-wise F1 score for binary classification, usable as a Keras metric.

    Args:
        y_true: ground-truth labels (0 or 1); any numeric dtype, with as many
            elements as `y_pred`.
        y_pred: predicted probabilities; they are rounded to 0/1 before scoring.

    Returns:
        A scalar tensor: the F1 score computed column-wise over the batch axis
        and averaged over the columns.

    Note:
        The score is computed on the batch it receives; when used as a Keras metric
        the value reported for an epoch is an average of per-batch scores, which is
        not the same as the F1 score computed on the whole dataset at once.
    """
    y_pred = K.round(K.cast(y_pred, K.floatx()))
    # Cast the labels and give them the shape of the predictions, so that integer labels
    # of shape (batch,) are multiplied element-wise with float predictions of shape
    # (batch, 1) instead of failing on the dtype or broadcasting to (batch, batch).
    y_true = K.reshape(K.cast(y_true, K.floatx()), K.shape(y_pred))

    tp = K.sum(y_true * y_pred, axis=0)
    fp = K.sum((1 - y_true) * y_pred, axis=0)
    fn = K.sum(y_true * (1 - y_pred), axis=0)

    # epsilon keeps the divisions defined when a batch has no positive label/prediction
    precision = tp / (tp + fp + K.epsilon())
    recall = tp / (tp + fn + K.epsilon())

    score = 2 * precision * recall / (precision + recall + K.epsilon())
    # Replace any remaining NaN by 0 so that it cannot propagate into the averaged metric
    score = tf.where(tf.math.is_nan(score), tf.zeros_like(score), score)
    return K.mean(score)

In [26]:
optimizer= tf.keras.optimizers.Adam(learning_rate=0.0001) # deliberately low learning rate: this model converges very quickly, so we slow the training down

# Binary cross-entropy loss; the custom f1 metric and the binary accuracy are tracked during training
model.compile(optimizer=optimizer,
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[f1,tf.keras.metrics.BinaryAccuracy()])

In [27]:
# Inverse-frequency class weights for the unbalanced target:
# each class gets 1 / (its share of the samples) / 2, so the rarer class weighs more.
targets_count = df['target_encoded'].value_counts()
total = targets_count.sum()
dic_weights = {}
for label in range(2):
    dic_weights[label] = 1 / (targets_count[label]/total) / 2
dic_weights

{0: 0.5774093264248704, 1: 3.72958500669344}

In [28]:
# This TensorBoard callback logs the training in a new directory under logs/fit/, named after the current date and time
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S"), histogram_freq=1)

# Train for 30 epochs, using the class weights to compensate for the ham/spam imbalance
history = model.fit(train_batch,
                    epochs=30,
                    validation_data=val_batch,
                    class_weight=dic_weights,
                    callbacks=[tensorboard_callback]
                    )

Epoch 1/30


2023-01-19 16:01:25.596693: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-01-19 16:01:26.087946: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-01-19 16:01:29.382568: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [29]:
from plotly import graph_objects as go

def show_evolution(history, score_name="loss"):
    """Plot a metric's training curve, and its validation curve when available, across epochs.

    Args:
        history: object whose `.history` attribute is a dict mapping metric names
            to per-epoch lists of values (as returned by `model.fit`).
        score_name: key of the training metric in `history.history`, e.g. "loss" or "f1".
            The validation series is looked up under "val_" + score_name.

    Raises:
        KeyError: if `score_name` is not a key of `history.history`.
    """
    color_chart = ["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]
    history_graph = history.history
    if score_name not in history_graph:
        raise KeyError(
            f"'{score_name}' not found in history; available keys: {sorted(history_graph)}"
        )

    # (legend prefix, history key) for each curve, in plotting order. The validation
    # curve is skipped when the model was fit without validation data instead of
    # failing with a KeyError.
    curves = [("Training ", score_name), ("Validation ", "val_"+score_name)]
    traces = [
        go.Scatter(
            y=history_graph[key],
            name=prefix+score_name,
            mode="lines",
            # the traces are lines only, so the colour is set on the line itself
            line=dict(color=color),
        )
        for (prefix, key), color in zip(curves, color_chart)
        if key in history_graph
    ]

    fig = go.Figure(data=traces)
    fig.update_layout(
        title=f"Training and val {score_name} across epochs",
        xaxis_title='epochs',
        yaxis_title=score_name
    )
    fig.show()

In [30]:
show_evolution(history, "f1")

In [31]:
show_evolution(history, "loss")

The best F1 value on the validation set appears to stabilise at around 0.932, and it is close to the training score, so the model is not really overfitting.

### Let's try with LSTM layers

The idea is to capture more of the meaning of each sentence by taking word order into account, thanks to the short- and long-term memory of the LSTM.

In [32]:
# tokenizer.word_index holds every fitted word plus the "out_of_vocab" token, with indices
# starting at 1 (0 is the padding value). len(tokenizer.word_counts) does not count the OOV
# entry, so sizing the embedding with it left the highest token index out of range.
vocab_size = len(tokenizer.word_index)
model_lstm = tf.keras.Sequential([
        # Input Word Embedding; the sequence length is read from the padded matrix itself
        # rather than from a loop variable left over by an earlier cell.
        # NOTE(review): the sequences are zero-padded at the end and the padding is not masked,
        # so the LSTM layers also process the padding steps - consider mask_zero=True.
        tf.keras.layers.Embedding(vocab_size+1, 128, input_shape=[text_pad.shape[1],], name="embedding"), # +1 because of padding value

        # Two stacked LSTM layers: the first returns the full sequence to feed the second,
        # the second only returns its last output
        tf.keras.layers.LSTM(units=128, return_sequences=True),
        tf.keras.layers.LSTM(units=64, return_sequences=False),

        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.2),

        tf.keras.layers.Dense(1, activation="sigmoid") # sigmoid because it's a binary classification
])

In [33]:
optimizer_lstm = tf.keras.optimizers.Adam(learning_rate=0.0001) # deliberately low learning rate: the model converges very quickly, so we slow the training down

# Same loss and metrics as the GlobalMaxPooling model, so that both runs can be compared
model_lstm.compile(optimizer=optimizer_lstm,
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[f1,tf.keras.metrics.BinaryAccuracy()])

In [34]:
# New TensorBoard callback so that this run is logged in its own time-stamped directory under logs/fit/
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S"), histogram_freq=1)

# Train for 40 epochs, without class weights (see the commented-out argument below)
history_lstm = model_lstm.fit(train_batch,
                    epochs=40,
                    validation_data=val_batch,
                    # class_weight=dic_weights, # after some tests, the LSTM model performs much better with class_weight disabled
                    callbacks=[tensorboard_callback]
                    )

Epoch 1/40


2023-01-19 16:02:29.472603: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-19 16:02:29.750920: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-19 16:02:30.076446: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-19 16:02:30.368258: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-19 16:02:30.756943: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-01-19 16:02:36.405132: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-19 16:02:36.512530: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-01-19 16:02:36.627968: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [35]:
show_evolution(history_lstm, "loss")

In [36]:
show_evolution(history_lstm, "f1")

The best F1 value is reached before epoch 15, at around 0.90, and it is not really stable.

**We keep the GlobalMaxPooling model, which performs better. It looks like the vocabulary alone is enough to detect spam.**