In [52]:
import pandas as pd
import numpy as np
import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # only use GPU memory that we need, not allocate all the GPU memory
    tf.config.experimental.set_memory_growth(gpus[0], enable=True)

In [53]:
df = pd.read_csv('spam.csv', encoding='Latin-1')

In [54]:
df = df.loc[:,['v1','v2']]
df.columns = ['target','text']
df

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [55]:
import spacy
nlp = spacy.load('en_core_web_md')

In [56]:
from spacy.lang.en.stop_words import STOP_WORDS

In [57]:
# Remove all non alphanumeric characters except whitespaces
df["text_clean"] = df["text"].apply(lambda x:''.join(ch for ch in x if ch.isalnum() or ch==" "))
# remove double spaces and spaces at the beginning and end of strings
df["text_clean"] = df["text_clean"].apply(lambda x: x.replace(" +"," ").lower().strip())
# remove stop words and replace everyword with their lemma
df["text_clean"] = df["text_clean"].apply(lambda x: " ".join([token.lemma_ for token in nlp(x) if (token.lemma_ not in STOP_WORDS) & (token.text not in STOP_WORDS)]))

In [58]:
df.head()

Unnamed: 0,target,text,text_clean
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think usf live


In [59]:
df['size'] = df['text_clean'].apply(lambda x: len(x))
df.head()

Unnamed: 0,target,text,text_clean,size
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...,75
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni,21
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,131
3,ham,U dun say so early hor... U c already then say...,u dun early hor u c,19
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think usf live,18


In [61]:
df['target'].value_counts()

ham     4825
spam     747
Name: target, dtype: int64

In [66]:
df.loc[df['size'] > 80,'target'].value_counts()

spam    599
ham     415
Name: target, dtype: int64

In [None]:
sizes = df['size'].value_counts()
sizes_ham = df['size'].value_counts()
sizes_scam = df['size'].value_counts()
import plotly.express as px

px.box(x=sizes)

In [None]:
# on regarde si il n'y a pas une classe unique dans les outliers, ce qui pourrait être une information en soit
df.query('size >= 50').value_counts("target")

target
ham     1171
spam     696
dtype: int64

In [68]:
# on supprime les trop grandes chaines pour aviter d'avoir de trop grandes valeurs

#df = df.query('size < 50') # 0.93 au lieu de 0.99

In [69]:
mask = df['text_clean'].apply(lambda x: type(x)==str)
mask.value_counts()

True    5572
Name: text_clean, dtype: int64

In [70]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=1000, oov_token="out_of_vocab") # instanciate the tokenizer # todo
tokenizer.fit_on_texts(df["text_clean"])
df["text_encoded"] = tokenizer.texts_to_sequences(df["text_clean"])

In [71]:
df['target_encoded'] = df['target'].map({'ham':0,'spam':1})

In [72]:
text_pad = tf.keras.preprocessing.sequence.pad_sequences(df["text_encoded"], padding="post")

In [73]:
from sklearn.model_selection import train_test_split

xtrain, xval, ytrain, yval = train_test_split(text_pad, df["target_encoded"], test_size=0.2)

In [74]:
train = tf.data.Dataset.from_tensor_slices((xtrain, ytrain))
val = tf.data.Dataset.from_tensor_slices((xval, yval))

In [75]:
len(train)

4457

In [76]:
train_batch = train.shuffle(len(train)).batch(64)
val_batch = val.shuffle(len(val)).batch(64)

In [77]:
for text, target in val_batch.take(1):
  print(text, target)

tf.Tensor(
[[724   0   0 ...   0   0   0]
 [615 211  29 ...   0   0   0]
 [ 55   1   7 ...   0   0   0]
 ...
 [517   1 923 ...   0   0   0]
 [438   1  42 ...   0   0   0]
 [ 29 524   0 ...   0   0   0]], shape=(64, 72), dtype=int32) tf.Tensor(
[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0], shape=(64,), dtype=int64)


In [78]:
vocab_size = tokenizer.num_words
model = tf.keras.Sequential([
                  # Couche d'Input Word Embedding           
                  tf.keras.layers.Embedding(vocab_size+1, 128, input_shape=[text.shape[1],], name="embedding"),
                  
                  tf.keras.layers.LSTM(units=128, return_sequences=True, dropout=0.2),
                  tf.keras.layers.LSTM(units=64, return_sequences=False, dropout=0.2),

                  # hi speed conversion 0.98, faster convergence
                  # tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
                  # tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
                  #tf.keras.layers.GlobalMaxPooling1D(),
# todo : dropout
                  # Couche Dense classique
                  tf.keras.layers.Dense(64, activation='relu'),
                  tf.keras.layers.Dropout(0.2),
                  tf.keras.layers.Dense(32, activation='relu'),
                  tf.keras.layers.Dropout(0.2),

                  tf.keras.layers.Dense(1, activation="sigmoid")
])

In [79]:
import keras.backend as K

def f1(y_true, y_pred):
    y_pred = K.round(y_pred)
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return K.mean(f1)

# much better
def f1_weighted(true, pred): #shapes (batch, 4)

    #for metrics include these two lines, for loss, don't include them
    #these are meant to round 'pred' to exactly zeros and ones
    # predLabels = K.argmax(pred, axis=-1)
    # pred = K.one_hot(predLabels, 4) 


    ground_positives = K.sum(true, axis=0) + K.epsilon()       # = TP + FN
    pred_positives = K.sum(pred, axis=0) + K.epsilon()         # = TP + FP
    true_positives = K.sum(true * pred, axis=0) + K.epsilon()  # = TP
        #all with shape (4,)
    
    precision = true_positives / pred_positives 
    recall = true_positives / ground_positives
        #both = 1 if ground_positives == 0 or pred_positives == 0
        #shape (4,)

    f1 = 2 * (precision * recall) / (precision + recall + K.epsilon())
        #still with shape (4,)

    weighted_f1 = f1 * ground_positives / K.sum(ground_positives) 
    weighted_f1 = K.sum(weighted_f1)

    
    return 1 - weighted_f1 #for metrics, return only 'weighted_f1'

def f1_weighted_loss(true, pred): #shapes (batch, 4)

    #for metrics include these two lines, for loss, don't include them
    #these are meant to round 'pred' to exactly zeros and ones
    # predLabels = K.argmax(pred, axis=-1)
    # pred = K.one_hot(predLabels, 4) 

    true = tf.cast(true, tf.float32)

    ground_positives = K.sum(true, axis=0) + K.epsilon()       # = TP + FN
    pred_positives = K.sum(pred, axis=0) + K.epsilon()         # = TP + FP
    true_positives = K.sum(true * pred, axis=0) + K.epsilon()  # = TP
        #all with shape (4,)
    
    precision = true_positives / pred_positives 
    recall = true_positives / ground_positives
        #both = 1 if ground_positives == 0 or pred_positives == 0
        #shape (4,)

    f1 = 2 * (precision * recall) / (precision + recall + K.epsilon())
        #still with shape (4,)

    weighted_f1 = f1 * ground_positives / K.sum(ground_positives) 
    weighted_f1 = K.sum(weighted_f1)

    
    return 1 - weighted_f1 #for metrics, return only 'weighted_f1'

In [93]:
optimizer= tf.keras.optimizers.Adam(learning_rate=0.001)

model.compile(optimizer=optimizer,
              #loss=f1_weighted_loss,
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[f1_weighted,tf.keras.metrics.BinaryAccuracy])

In [81]:
targets_count = df['target_encoded'].value_counts()
total = targets_count.sum()
dic_weights = {i : 1 / (targets_count[i]/total) / 2 for i in range(2)}

In [82]:
dic_weights

{0: 0.5774093264248704, 1: 3.72958500669344}

In [83]:
# part = 1 - df['target_encoded'].value_counts() / df.shape[0]
# part.to_dict()

In [95]:
history = model.fit(train_batch,
                    epochs=100,
                    validation_data=val_batch,
                    #class_weight=dic_weights
                    )

Epoch 1/100


2022-11-07 17:07:07.605577: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-11-07 17:07:07.954517: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-11-07 17:07:08.154658: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-11-07 17:07:08.619656: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-11-07 17:07:08.946295: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

In [85]:
from plotly import graph_objects as go
color_chart = ["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]

history_graph = history.history
fig = go.Figure(data=[
                      go.Scatter(
                          y=history_graph["loss"],
                          name="Training loss",
                          mode="lines",
                          marker=dict(
                              color=color_chart[0]
                          )),
                      go.Scatter(
                          y=history_graph["val_loss"],
                          name="Validation loss",
                          mode="lines",
                          marker=dict(
                              color=color_chart[1]
                          ))
])
fig.update_layout(
    title='Training and val loss across epochs',
    xaxis_title='epochs',
    yaxis_title='Cross Entropy'    
)
fig.show()

In [86]:
history_graph = history.history
fig = go.Figure(data=[
                      go.Scatter(
                          y=history_graph["f1_weighted"],
                          name="Training F1",
                          mode="lines",
                          marker=dict(
                              color=color_chart[0]
                          )),
                      go.Scatter(
                          y=history_graph["val_f1_weighted"],
                          name="Validation F1",
                          mode="lines",
                          marker=dict(
                              color=color_chart[1]
                          ))
])
fig.update_layout(
    title='Training and val F1 across epochs',
    xaxis_title='epochs',
    yaxis_title='1'    
)
fig.show()