In [None]:
# Import TensorFlow, pandas, the scikit-learn split helper and the Keras layers
import tensorflow as tf 
import pathlib 
import pandas as pd 
import os
import io
import warnings
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, Dense, LSTM, GRU, Dropout, BatchNormalization
# Uncomment the next line to silence Python warnings:
#warnings.filterwarnings('ignore')

In [None]:
# Load spam.csv into a DataFrame; the file is decoded as ISO-8859-1
dataset = pd.read_csv(filepath_or_buffer="spam.csv", encoding="ISO-8859-1")
# Preview the first five rows
dataset.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# Remove the three "Unnamed" columns, which are empty in the preview above
unused_columns = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
dataset = dataset.drop(columns=unused_columns)

In [None]:
# Number of rows in the dataset
dataset.shape[0]

5572

In [None]:
# Give the label and text columns descriptive names
column_names = {"v1": "type", "v2": "mail"}
dataset = dataset.rename(columns=column_names)

In [None]:
# Preview the first five rows after renaming
dataset.head(n=5)

Unnamed: 0,type,mail
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Count how many messages belong to each class (ham / spam)
type_counts = dataset["type"].value_counts()
type_counts

ham     4825
spam     747
Name: type, dtype: int64

In [None]:
# Encode the target as integers: 1 for "spam", 0 for everything else
is_spam = dataset["type"].eq("spam")
dataset["type"] = is_spam.astype("int64")

In [None]:
# Download the spaCy `en_core_web_md` package so it can be imported and loaded below
!python -m spacy download en_core_web_md -q

2022-12-07 15:03:08.950023: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[K     |████████████████████████████████| 42.8 MB 1.3 MB/s 
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [None]:
# Import the spaCy English pipeline (en_core_web_md) and initialise it;
# `nlp` is used below to tokenise and lemmatise the messages
import en_core_web_md
nlp = en_core_web_md.load()

In [None]:
# Import Stop words 
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
def _keep_alnum_space_apostrophe(text):
    """Drop every character that is not alphanumeric, a space or an apostrophe."""
    return "".join(ch for ch in text if ch.isalnum() or ch in (" ", "'"))


def _normalise(text):
    """Remove the stray "å" character, lowercase, and trim surrounding whitespace."""
    return text.replace("å", "").lower().strip()


def _lemmas_without_stop_words(text):
    """Join the lemmas of `text`, skipping tokens whose lemma or surface form is a stop word."""
    kept_lemmas = []
    for token in nlp(text):
        if token.lemma_ in STOP_WORDS or token.text in STOP_WORDS:
            continue
        kept_lemmas.append(token.lemma_)
    return " ".join(kept_lemmas)


# Apply the three cleaning passes in sequence and store the result in a new column
dataset["mail_clean"] = (
    dataset["mail"]
    .apply(_keep_alnum_space_apostrophe)
    .apply(_normalise)
    .apply(_lemmas_without_stop_words)
)

dataset

Unnamed: 0,type,mail,mail_clean
0,0,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,0,U dun say so early hor... U c already then say...,u dun early hor u c
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think usf live
...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,2nd time try 2 contact u u win 750 pound prize...
5568,0,Will Ì_ b going to esplanade fr home?,ì b esplanade fr home
5569,0,"Pity, * was in mood for that. So...any other s...",pity mood soany suggestion
5570,0,The guy did some bitching but I acted like i'd...,guy bitching act like interested buy week free


In [None]:
# Keep only the rows whose cleaned text is not missing.
# `.copy()` makes the filtered result an independent DataFrame, so adding new
# columns to it afterwards does not trigger pandas' SettingWithCopyWarning.
mask = dataset["mail_clean"].notna()
dataset = dataset[mask].copy()

In [None]:
import numpy as np

# Build the tokenizer (num_words=1000 caps the word ids it will emit) and fit it
# on the cleaned messages
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=1000)
tokenizer.fit_on_texts(dataset["mail_clean"])

# Encode every message as a list of integer word ids and record its length
encoded_mails = tokenizer.texts_to_sequences(dataset["mail_clean"])
dataset["mail_encoded"] = encoded_mails
dataset["len_mail"] = dataset["mail_encoded"].apply(len)

# Discard messages that ended up with no token at all after encoding
has_tokens = dataset["len_mail"] != 0
dataset = dataset[has_tokens]

In [None]:
# Pad every encoded message with trailing zeros up to the longest sequence length
mails_pad = tf.keras.preprocessing.sequence.pad_sequences(
    dataset["mail_encoded"],
    padding="post",
)

In [None]:
# Train / validation split (70 % / 30 %).
# `stratify` keeps the ham/spam proportion the same in both subsets (the two
# classes are strongly imbalanced), and `random_state` makes the split
# reproducible from one run to the next.
xtrain, xval, ytrain, yval = train_test_split(
    mails_pad,
    dataset["type"],
    test_size=0.3,
    stratify=dataset["type"],
    random_state=42,
)

In [None]:
# Wrap the (features, labels) pairs into tf.data datasets, one element per message
train_pairs = (xtrain, ytrain)
val_pairs = (xval, yval)
train = tf.data.Dataset.from_tensor_slices(train_pairs)
val = tf.data.Dataset.from_tensor_slices(val_pairs)

In [None]:
# Shuffle each dataset with a buffer as large as the dataset, then group into batches of 64
BATCH_SIZE = 64
train_shuffled = train.shuffle(buffer_size=len(train))
val_shuffled = val.shuffle(buffer_size=len(val))
train_batch = train_shuffled.batch(BATCH_SIZE)
val_batch = val_shuffled.batch(BATCH_SIZE)

In [None]:
 # Look at one batch: print the padded token ids and the matching labels.
 # NOTE: the loop variable `review` stays defined after this cell and is read by
 # the model-definition cells below (review.shape[1]), so this cell must run first.
for review, star in train_batch.take(1):
  print(review, star)

tf.Tensor(
[[ 51  58  19 ...   0   0   0]
 [426   0   0 ...   0   0   0]
 [ 71   4   1 ...   0   0   0]
 ...
 [272  39  47 ...   0   0   0]
 [  7   9  35 ...   0   0   0]
 [ 18  61 122 ...   0   0   0]], shape=(64, 54), dtype=int32) tf.Tensor(
[0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], shape=(64,), dtype=int64)


In [None]:
vocab_size = len(tokenizer.word_index)
# When `num_words` is set, the tokenizer only emits ids strictly below it, so the
# embedding table needs at most `num_words` rows instead of one row per word
# seen while fitting.
embedding_input_dim = min(tokenizer.num_words or vocab_size + 1, vocab_size + 1)
# Take the sequence length from the padded matrix itself rather than from a loop
# variable left over from the batch-inspection cell.
sequence_length = mails_pad.shape[1]

model_lstm = tf.keras.Sequential([
    Embedding(
        embedding_input_dim,
        64,
        input_shape=[sequence_length,],
        # id 0 is only produced by the post-padding: mask it so the recurrent
        # layers skip padded timesteps instead of treating them as real tokens
        mask_zero=True,
        name="embedding",
    ),
    LSTM(units=64, return_sequences=True),   # keeps one output per timestep
    Dropout(0.2),
    LSTM(units=32, return_sequences=False),  # returns only the last output
    Dense(16, activation="relu"),
    Dense(8, activation="relu"),
    Dense(1, activation="sigmoid", name="last"),  # single probability for the binary target
])

In [None]:
# Print the layers of the LSTM model with their output shapes and parameter counts
model_lstm.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 54, 64)            522944    
                                                                 
 lstm_6 (LSTM)               (None, 54, 64)            33024     
                                                                 
 dropout_5 (Dropout)         (None, 54, 64)            0         
                                                                 
 lstm_7 (LSTM)               (None, 32)                12416     
                                                                 
 dense_24 (Dense)            (None, 16)                528       
                                                                 
 dense_25 (Dense)            (None, 8)                 136       
                                                                 
 last (Dense)                (None, 1)                

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, binary accuracy metric
optimizer = tf.keras.optimizers.Adam()
bce_loss = tf.keras.losses.BinaryCrossentropy()
accuracy_metric = tf.keras.metrics.BinaryAccuracy()

model_lstm.compile(
    loss=bce_loss,
    metrics=[accuracy_metric],
    optimizer=optimizer,
)

In [None]:
# Train the LSTM for 30 epochs, evaluating on the validation batches along the way
model_lstm.fit(
    x=train_batch,
    validation_data=val_batch,
    epochs=30,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f54ef56a040>

In [None]:
# Introduce class weights, because the two target classes are imbalanced:
# each class gets the weight n_samples / (2 * class_count)
class_counts = dataset["type"].value_counts()
weights = (1 / class_counts) * len(dataset) / 2
# Turn the Series into a {class_label: weight} dictionary
weights = dict(zip(weights.index, weights.values))
weights

{0: 0.5801419965576592, 1: 3.619463087248322}

In [None]:
# Train with class weights so that errors on the minority class count more.
# NOTE(review): `model_lstm` is not re-created here, so if it was already fitted
# by an earlier cell this call resumes training from those weights.
model_lstm.fit(
    x=train_batch,
    validation_data=val_batch,
    class_weight=weights,
    epochs=30,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f54e0454f40>

In [None]:
import json

# Download the saved LSTM model and its training history to /content
tf.keras.utils.get_file("/content/model_lstm_reg.h5",
                        origin="https://full-stack-assets.s3.eu-west-3.amazonaws.com/models/M08_Deep_learning/Text_classification/model_lstm_reg.h5")
tf.keras.utils.get_file("/content/LSTM_history_reg.json",
                        origin="https://full-stack-assets.s3.eu-west-3.amazonaws.com/models/M08_Deep_learning/Text_classification/LSTM_history_reg.json")

# Read the history through a context manager so the file handle is always closed
with open("/content/LSTM_history_reg.json", "r") as history_file:
    LSTM_history_reg = json.load(history_file)
model_lstm_reg = tf.keras.models.load_model("/content/model_lstm_reg.h5")

In [None]:
import plotly.graph_objects as go

# Plot the training and validation loss stored in the downloaded LSTM history
fig = go.Figure()
fig.add_trace(go.Scatter(y=LSTM_history_reg["loss"],
                    mode='lines',
                    name='loss'))
fig.add_trace(go.Scatter(y=LSTM_history_reg["val_loss"],
                    mode='lines',
                    name='val_loss'))
# Title and axis labels so the figure can be read on its own
# (x axis assumes one history entry per epoch - TODO confirm for the downloaded file)
fig.update_layout(title="LSTM (reg): training vs validation loss",
                  xaxis_title="epoch",
                  yaxis_title="loss")
fig.show()


In [None]:
# The LSTM does not give good results at all, so try the same architecture with GRU layers
vocab_size = len(tokenizer.word_index)
# When `num_words` is set, the tokenizer only emits ids strictly below it, so the
# embedding table needs at most `num_words` rows instead of one row per word
# seen while fitting.
embedding_input_dim = min(tokenizer.num_words or vocab_size + 1, vocab_size + 1)
# Take the sequence length from the padded matrix itself rather than from a loop
# variable left over from the batch-inspection cell.
sequence_length = mails_pad.shape[1]

model_gru = tf.keras.Sequential([
    Embedding(
        embedding_input_dim,
        64,
        input_shape=[sequence_length,],
        # id 0 is only produced by the post-padding: mask it so the recurrent
        # layers skip padded timesteps instead of treating them as real tokens
        mask_zero=True,
        name="embedding",
    ),
    GRU(units=64, return_sequences=True),   # keeps one output per timestep
    Dropout(0.15),
    GRU(units=32, return_sequences=False),  # returns only the last output
    Dense(16, activation="relu"),
    Dense(8, activation="relu"),
    Dense(1, activation="sigmoid"),  # single probability for the binary target
])

In [None]:
# Configure training for the GRU model: a fresh Adam optimizer, binary
# cross-entropy loss and binary accuracy metric
optimizer = tf.keras.optimizers.Adam()
bce_loss = tf.keras.losses.BinaryCrossentropy()
accuracy_metric = tf.keras.metrics.BinaryAccuracy()

model_gru.compile(
    loss=bce_loss,
    metrics=[accuracy_metric],
    optimizer=optimizer,
)

In [None]:
# Train the GRU model for 20 epochs, evaluating on the validation batches along the way
model_gru.fit(
    x=train_batch,
    validation_data=val_batch,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f54d7bfd4c0>

In [None]:
# Persist the trained GRU model and the history recorded by its last fit() call
model_gru.save("model_gru.h5")
# Write through a context manager so the file is flushed and closed before any
# later cell reads it back
with open("/content/GRU_history.json", "w") as history_file:
    json.dump(model_gru.history.history, history_file)

In [None]:
# Fetch the saved GRU model and its training history into /content
tf.keras.utils.get_file("/content/model_gru.h5",
                        origin="https://full-stack-assets.s3.eu-west-3.amazonaws.com/models/M08_Deep_learning/Text_classification/model_gru.h5")
tf.keras.utils.get_file("/content/GRU_history.json",
                        origin="https://full-stack-assets.s3.eu-west-3.amazonaws.com/models/M08_Deep_learning/Text_classification/GRU_history.json")

# Read the history through a context manager so the file handle is always closed
with open("/content/GRU_history.json", "r") as history_file:
    GRU_history = json.load(history_file)
model_gru = tf.keras.models.load_model("/content/model_gru.h5")

In [None]:
# Plot the training and validation loss stored in the GRU history
fig = go.Figure()
fig.add_trace(go.Scatter(y=GRU_history["loss"],
                    mode='lines',
                    name='loss'))
fig.add_trace(go.Scatter(y=GRU_history["val_loss"],
                    mode='lines',
                    name='val_loss'))
# Title and axis labels so the figure can be read on its own
fig.update_layout(title="GRU: training vs validation loss",
                  xaxis_title="epoch",
                  yaxis_title="loss")
fig.show()

In [None]:
# Plot the training and validation binary accuracy stored in the GRU history
fig = go.Figure()
fig.add_trace(go.Scatter(y=GRU_history["binary_accuracy"],
                    mode='lines',
                    name='accuracy'))
fig.add_trace(go.Scatter(y=GRU_history["val_binary_accuracy"],
                    mode='lines',
                    name='val_accuracy'))
# Title and axis labels so the figure can be read on its own
fig.update_layout(title="GRU: training vs validation accuracy",
                  xaxis_title="epoch",
                  yaxis_title="binary accuracy")
fig.show()