In [61]:
import pandas as pd
import numpy as np
import tensorflow as tf

# Inspired by:
#https://predictivehacks.com/?all-tips=how-to-get-bert-embeddings-with-tensorflow-hub
#https://www.analyticsvidhya.com/blog/2021/09/performing-email-spam-detection-using-bert-in-python/
#https://www.analyticsvidhya.com/blog/2020/10/simple-text-multi-classification-task-using-keras-bert/

In [62]:
!pip install tensorflow-text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [63]:
# Upload spam.csv to the Colab environment before running this cell.
csv_path = 'spam.csv'
df = pd.read_csv(csv_path, encoding='Latin-1')

In [64]:
# Keep only the label and message columns and give them descriptive names.
df = df[['v1', 'v2']].rename(columns={'v1': 'target', 'v2': 'text'})
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [65]:
# Binary-encode the label: ham -> 0, spam -> 1.
label_to_id = {'ham': 0, 'spam': 1}
df['target_encoded'] = df['target'].map(label_to_id)

In [66]:
# Number of rows per encoded class (0 = ham, 1 = spam).
class_counts = df.groupby('target_encoded').agg({'target_encoded': 'count'})
class_counts

Unnamed: 0_level_0,target_encoded
target_encoded,Unnamed: 1_level_1
0,4825
1,747


In [67]:
# Balance the classes by downsampling ham to the number of spam messages.
# (The author reports that performance was worse without this balancing.)
# A fixed random_state makes the shuffle - and therefore the retained ham subset - reproducible.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df_spam = df.query('target_encoded == 1')
# Derive the ham sample size from the spam count instead of hard-coding it.
df_ham = df.query('target_encoded == 0').iloc[:len(df_spam), :]
df = pd.concat([df_ham, df_spam], axis=0)

In [68]:
# Split the data into train and validation sets (80/20).
# stratify keeps the same ham/spam ratio in both sets; random_state makes the split reproducible.
from sklearn.model_selection import train_test_split
xtrain, xval, ytrain, yval = train_test_split(
    df['text'],
    df["target_encoded"],
    test_size=0.20,
    stratify=df["target_encoded"],
    random_state=42,
)

In [69]:
# Preview the first five rows of the balanced frame.
df.head(n=5)

Unnamed: 0,target,text,target_encoded
0,ham,"Sorry, I'll call later",0
1,ham,Okie...,0
2,ham,Lol you won't feel bad when I use her money to...,0
3,ham,We have all rounder:)so not required:),0
4,ham,Have you started in skye,0


In [70]:
# Download the BERT preprocessing layer and the BERT encoder from TensorFlow Hub.
import tensorflow_hub as hub
import tensorflow_text as text  # Imported for its side effect: registers the TF ops used by preprocessing

preprocess_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
encoder_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'

bert_preprocessor = hub.KerasLayer(preprocess_url)
bert_encoder = hub.KerasLayer(encoder_url)

In [71]:
keras_layers = tf.keras.layers

# Scalar string input: each example is one raw text message.
text_input = keras_layers.Input(shape=(), dtype=tf.string, name='Inputs')

# The preprocessor turns raw strings into the dict of tensors the BERT encoder expects.
preprocessed_text = bert_preprocessor(text_input)
embeed = bert_encoder(preprocessed_text)  # main model

# Small classification head on top of BERT's pooled output, built with the
# Keras functional API; dropout between layers helps limit overfitting.
dropout1 = keras_layers.Dropout(0.1, name='Dropout1')(embeed['pooled_output'])

dense1 = keras_layers.Dense(16, activation="relu", name="Dense1")(dropout1)
dropout2 = keras_layers.Dropout(0.1, name='Dropout2')(dense1)

dense2 = keras_layers.Dense(8, activation="relu", name="Dense2")(dropout2)
dropout3 = keras_layers.Dropout(0.1, name='Dropout3')(dense2)

# Single sigmoid unit: binary classification (ham vs. spam).
outputs = keras_layers.Dense(1, activation='sigmoid', name='Output')(dropout3)

model = tf.keras.Model(inputs=[text_input], outputs=[outputs])

In [72]:
import keras.backend as K

def f1(y_true, y_pred):
    """Compute the F1 score of thresholded binary predictions.

    Predictions are rounded to 0/1, then true positives, false positives and
    false negatives are summed along axis 0 (the batch axis). Precision, recall
    and F1 are derived per output column and averaged into a single scalar.

    Args:
        y_true: tensor of ground-truth labels (0 or 1).
        y_pred: tensor of predicted probabilities, same shape as ``y_true``.

    Returns:
        Scalar float32 tensor holding the mean F1 score.

    NOTE(review): when passed to ``model.compile(metrics=...)`` this function is
    presumably evaluated batch by batch, so the reported value would be an
    average of batch-level F1 scores, not the F1 over the full dataset - confirm.
    """
    eps = tf.keras.backend.epsilon()

    # Cast BEFORE multiplying so integer labels and float predictions can be
    # combined without a dtype-mismatch error.
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.round(tf.cast(y_pred, tf.float32))

    tp = tf.reduce_sum(y_true * y_pred, axis=0)
    fp = tf.reduce_sum((1.0 - y_true) * y_pred, axis=0)
    fn = tf.reduce_sum(y_true * (1.0 - y_pred), axis=0)

    # eps keeps the divisions finite when a denominator would be zero.
    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)

    score = 2 * precision * recall / (precision + recall + eps)
    # Defensive: replace any remaining NaN with 0.
    score = tf.where(tf.math.is_nan(score), tf.zeros_like(score), score)
    return tf.reduce_mean(score)

In [73]:
# Adam optimizer; track the custom F1 metric alongside binary accuracy.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
tracked_metrics = [f1, tf.keras.metrics.BinaryAccuracy(name='accuracy')]
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=tracked_metrics,
)

In [74]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Inputs (InputLayer)            [(None,)]            0           []                               
                                                                                                  
 keras_layer_4 (KerasLayer)     {'input_type_ids':   0           ['Inputs[0][0]']                 
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                    

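Only 12,449 parameters are trainable: these are the weights of the small dense head (Dense1, Dense2 and Output), while the BERT encoder itself is not trained. Training should therefore be fast on a GPU.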

In [75]:
# Train for 30 epochs; the returned history holds the per-epoch loss and metrics.
history = model.fit(x=xtrain, y=ytrain, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [83]:
# Evaluate on the held-out validation set; returns [loss, f1, accuracy].
model.evaluate(x=xval, y=yval)



[0.06661155819892883, 0.9661909937858582, 0.9765886068344116]

**The F1 score on the validation set is 0.9662, better than the baseline and LSTM models from the first notebook.** However, this estimate may be somewhat biased: we downsampled the dataset to balance the classes, so there are fewer validation samples to evaluate on.

In [84]:
from plotly import graph_objects as go

def show_evolution(history, score_name="loss"):
    """Plot the per-epoch training curve of one recorded metric.

    Args:
        history: object exposing a ``history`` mapping from metric name to its
            per-epoch values (e.g. the value returned by ``model.fit``).
        score_name: key of the metric to plot; also used in the trace name,
            the figure title and the y-axis label.
    """
    palette = ["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]
    recorded = history.history

    # Single line trace for the training values, drawn in the first palette color.
    training_trace = go.Scatter(
        y=recorded[score_name],
        name="Training "+score_name,
        mode="lines",
        marker={"color": palette[0]},
    )

    fig = go.Figure(data=[training_trace])
    fig.update_layout(
        title=f"Training {score_name} across epochs",
        xaxis_title='epochs',
        yaxis_title=score_name,
    )
    fig.show()

In [None]:
# Training loss per epoch.
show_evolution(history, score_name='loss')

In [86]:
# Training F1 score per epoch.
show_evolution(history, score_name='f1')