# **LSTM & Bidirectional LSTM Model**

> [**Understanding LSTM Networks**](https://colah.github.io/posts/2015-08-Understanding-LSTMs/)



In [None]:
# Install Kaggle.
!pip install --upgrade --force-reinstall --no-deps kaggle

In [None]:
# Files Upload.
from google.colab import files

# Bind the result instead of leaving the call as the cell's last expression:
# the returned mapping holds the uploaded file contents, and echoing it would
# store the Kaggle API key from kaggle.json in the notebook output.
uploaded = files.upload()
print("Uploaded:", ", ".join(uploaded))

In [None]:
# Create a Kaggle Folder.
!mkdir ~/.kaggle

# Copy the kaggle.json to the folder created.
!cp kaggle.json ~/.kaggle/

# Permission for the json file to act.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Dataset Download.
# Fetches the dataset archive with the Kaggle CLI; the next cell expects it as
# spam-mail-detection-dataset.zip in the current working directory.
# NOTE(review): relies on the credentials placed in ~/.kaggle/kaggle.json above.
!kaggle datasets download -d ue153011/spam-mail-detection-dataset

In [None]:
# Unzip Dataset.
!unzip spam-mail-detection-dataset.zip

## **Spam Mail Classification**

> [**Kaggle - Spam Mail Detection Dataset**](https://www.kaggle.com/datasets/ue153011/spam-mail-detection-dataset)

In [None]:
!pip install texthero
!pip install textblob
!pip install tensorflow_addons
!pip install spacy==3.3

In [None]:
# Import Library.
import pandas as pd
import numpy as np
import nltk
from textblob import TextBlob, Word
import texthero as hero
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import tensorflow as tf
from tensorflow.keras.preprocessing.text import hashing_trick, one_hot
from tensorflow_addons.metrics import CohenKappa
from tensorflow.keras.layers import (
    Embedding,
    LSTM,
    Bidirectional,
    Dense,
    Dropout,
    LayerNormalization,
)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

# WordNet corpora used by the lemmatization step further down.
nltk.download("wordnet")
nltk.download("omw-1.4")

# Model Configuration.
SEED = 1  # Same value as the random_state used for the train/test split.
BATCH_SIZE = 256
NO_EPOCHS = 50
NO_CLASSES = 2
VALIDATION_SPLIT = 0.2
VERBOSITY = 1
VOC_SIZE = 10000  # Number of hash buckets for the token encoding.
MAX_LEN = 20  # Every message is padded/truncated to this many tokens.

# Seed NumPy and TensorFlow so that weight initialisation, dropout and batch
# shuffling are as repeatable as the backend allows after a kernel restart.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Stop when the validation loss has not improved for 25 epochs and roll the
# model back to the best weights seen.
my_callbacks = [
    EarlyStopping(monitor="val_loss", patience=25, restore_best_weights=True)
]

# Read Dataset.
data = pd.read_csv("spam_mail_data.csv")

# Text Cleaning and Preprocessing.
# URLs are removed *before* hero.clean: clean strips punctuation, which would
# otherwise break a link into plain word fragments that remove_urls can no
# longer recognise as a URL.
data["Message"] = data["Message"].pipe(hero.remove_urls).pipe(hero.clean)

# Lemmatize every whitespace-separated token.
data["Message"] = data["Message"].apply(
    lambda x: " ".join([Word(word).lemmatize() for word in x.split()])
)

# Spelling correction with TextBlob (runs once per message).
data["Message"] = data["Message"].apply(lambda x: str(TextBlob(x).correct()))

# Binary target: 1 for "spam", 0 for every other category value.
data["Class"] = data["Category"].eq("spam").astype(int)

# Split Dataset into Dependent and Independent Features.
X = data["Message"]
y = data["Class"]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# One Hot Representation.
# one_hot() hashes words with Python's built-in hash(), which is salted per
# interpreter process, so the same word would get a different index after
# every kernel restart. hashing_trick with md5 gives a stable mapping into
# the same VOC_SIZE buckets.
onehot_repr = [
    hashing_trick(words, VOC_SIZE, hash_function="md5") for words in X
]

# Pad (at the end) or truncate every message to exactly MAX_LEN indices.
embedded_docs = pad_sequences(onehot_repr, padding="post", maxlen=MAX_LEN)
print(embedded_docs)

[[1900 1992 9225 ...    0    0    0]
 [2728 3232   39 ...    0    0    0]
 [3929 6969 7004 ... 9357 9019 9766]
 ...
 [9097 8222 4799 ...    0    0    0]
 [2898 5199 3088 ...    0    0    0]
 [9769 2879 1503 ...    0    0    0]]


In [None]:
# Hold out 25% of the padded sequences as a test set, keeping the class
# ratio of y the same in both parts.
padded_sequences = np.asarray(embedded_docs)
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    y,
    stratify=y,
    test_size=0.25,
    random_state=1,
)

In [None]:
def Simple_LSTM():
    """Build and compile the single-direction LSTM classifier.

    Layer stack: Embedding (VOC_SIZE indices -> 64 dimensions, sequences of
    MAX_LEN tokens), LSTM(100), Dropout(0.25), LayerNormalization and a
    one-unit sigmoid output.

    Returns:
        The compiled Sequential model (Adam optimizer, binary cross-entropy
        loss, accuracy and Cohen's kappa metrics).
    """
    layer_stack = [
        Embedding(input_dim=VOC_SIZE, output_dim=64, input_length=MAX_LEN),
        LSTM(100),
        Dropout(0.25),
        LayerNormalization(),
        Dense(units=1, activation="sigmoid"),
    ]
    network = Sequential(layer_stack)

    tracked_metrics = ["accuracy", CohenKappa(num_classes=NO_CLASSES)]
    network.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=tracked_metrics,
    )
    return network

In [None]:
def Bidirectional_LSTM():
    """Build and compile the bidirectional LSTM classifier.

    Layer stack: Embedding (VOC_SIZE indices -> 64 dimensions, sequences of
    MAX_LEN tokens), Bidirectional(LSTM(100)), Dropout(0.25),
    LayerNormalization and a one-unit sigmoid output.

    Returns:
        The compiled Sequential model (Adam optimizer, binary cross-entropy
        loss, accuracy and Cohen's kappa metrics).
    """
    layer_stack = [
        Embedding(input_dim=VOC_SIZE, output_dim=64, input_length=MAX_LEN),
        Bidirectional(LSTM(100)),
        Dropout(0.25),
        LayerNormalization(),
        Dense(units=1, activation="sigmoid"),
    ]
    network = Sequential(layer_stack)

    tracked_metrics = ["accuracy", CohenKappa(num_classes=NO_CLASSES)]
    network.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=tracked_metrics,
    )
    return network

In [None]:
# Cost Sensitive Learning.
# Per-class loss weights passed to fit(): the spam class (label 1) counts
# seven times as much as the non-spam class (label 0).
weights_assigned = {
    0: 1,
    1: 7,
}

## **Train & Evaluate the LSTM Model**

In [None]:
# Call the LSTM Model Architecture.
lstm_model = Simple_LSTM()

# Build the Model.
# The leading (batch) dimension is left as None; passing X_train.shape would
# declare the number of training rows as the batch size.
lstm_model.build((None, MAX_LEN))
lstm_model.summary()

# Fit the Model.
# class_weight applies the cost-sensitive weights; the validation_split
# fraction of the training data is used for validation, which is what the
# early-stopping callback monitors.
lstm_model.fit(
    X_train,
    y_train,
    class_weight=weights_assigned,
    batch_size=BATCH_SIZE,
    epochs=NO_EPOCHS,
    verbose=VERBOSITY,
    validation_split=VALIDATION_SPLIT,
    callbacks=my_callbacks,
)

# Model Evaluation.
# evaluate() returns the loss followed by the compiled metrics.
print("\n Model Evaluation: ", lstm_model.evaluate(X_test, y_test))

In [None]:
# ROC-AUC of the LSTM model on the held-out test set, computed from the
# predicted sigmoid scores.
y_pred = lstm_model.predict(X_test)
lstm_roc_auc = roc_auc_score(y_test, y_pred)
print("ROC-AUC Score is ", lstm_roc_auc)

ROC-AUC Score is  0.9940138877803496


## **Train & Evaluate the Bidirectional LSTM Model**

In [None]:
# Call the Bidirectional LSTM Model Architecture.
bi_lstm = Bidirectional_LSTM()

# Build the Model.
bi_lstm.build(X_train.shape)
bi_lstm.summary()

# Fit the Model.
bi_lstm.fit(
    X_train,
    y_train,
    class_weight=weights_assigned,
    batch_size=BATCH_SIZE,
    epochs=NO_EPOCHS,
    verbose=VERBOSITY,
    validation_split=VALIDATION_SPLIT,
    callbacks=my_callbacks,
)

# Model Evaluation.
print("\n Model Evaluation: ", bi_lstm.evaluate(X_test, y_test))

In [None]:
# ROC-AUC of the bidirectional LSTM model on the held-out test set, computed
# from the predicted sigmoid scores.
y_pred = bi_lstm.predict(X_test)
bi_lstm_roc_auc = roc_auc_score(y_test, y_pred)
print("ROC-AUC Score is ", bi_lstm_roc_auc)

ROC-AUC Score is  0.989606335523807
