# 04 – LSTM Model for Phishing Email Detection

In this notebook we:
- load the preprocessed dataset (`cleaned.csv`)
- tokenize and pad the email texts
- build a simple LSTM-based classifier
- train the model and evaluate it on a test set
- save the trained model and its classification report so the results can later be compared with the Naive Bayes baseline

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Location of the preprocessed dataset, relative to this notebook.
DATA_PATH = Path("../data/processed/cleaned.csv")

# Load the CSV and normalise the text column in one chained step:
# missing bodies become empty strings and every value is cast to str.
df = pd.read_csv(DATA_PATH).assign(
    clean_body=lambda frame: frame['clean_body'].fillna('').astype(str)
)

# Preview the first rows as the cell output.
df.head()

In [None]:
# Inputs are the cleaned email bodies; targets are the values of the label column.
X, y = df['clean_body'], df['label']

# Hold out 20% of the rows for testing. The fixed random_state makes the split
# repeatable, and stratify=y splits in a stratified fashion using the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Show the size of each partition.
(len(X_train), len(X_test))

In [None]:
MAX_WORDS = 10000    # vocabulary size passed to the tokenizer (num_words)
MAX_LEN = 300        # every email is padded / truncated to this many tokens

# The vocabulary is fitted on the training texts only, so nothing about the
# test set leaks into the word index.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Map each text to a list of integer token ids.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Force every sequence to exactly MAX_LEN ids: shorter ones get padding
# appended at the end, longer ones lose their tail.
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=MAX_LEN, padding='post', truncating='post')
    for seqs in (X_train_seq, X_test_seq)
)

# Show the resulting array shapes.
(X_train_pad.shape, X_test_pad.shape)

In [None]:
EMBED_DIM = 64     # size of each token embedding vector
LSTM_UNITS = 64    # number of units in the LSTM layer

# Binary classifier: token ids -> embeddings -> LSTM -> dropout -> sigmoid.
model = Sequential([
    # The padded inputs are filled with trailing zeros (padding='post'), and the
    # Keras Tokenizer reserves index 0, so 0 only ever means "padding".
    # mask_zero=True propagates a mask so the LSTM skips those timesteps;
    # without it the LSTM's final state is computed after reading the
    # trailing padding of every short email.
    Embedding(input_dim=MAX_WORDS, output_dim=EMBED_DIM, mask_zero=True),
    LSTM(LSTM_UNITS),
    Dropout(0.5),                      # dropout rate of 0.5 on the LSTM output
    Dense(1, activation='sigmoid')     # single output in [0, 1]
])

# Binary cross-entropy pairs with the single sigmoid output.
# NOTE(review): assumes the label column is encoded as 0/1 — confirm.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()

In [None]:
EPOCHS = 5         # passes over the training data
BATCH_SIZE = 64    # samples per gradient update

# Fit on the padded training sequences. validation_split=0.1 sets aside 10% of
# the training data for validation; the returned History object holds the
# per-epoch metrics.
history = model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
    verbose=1,
)

In [None]:
# Training vs. validation accuracy per epoch, drawn on the current axes.
ax = plt.gca()
ax.plot(history.history['accuracy'], label='train_acc')
ax.plot(history.history['val_accuracy'], label='val_acc')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

In [None]:
# Flatten the model output into a 1-D array of scores for the test set.
y_probs = np.ravel(model.predict(X_test_pad))

# Scores of 0.5 or more are assigned class 1, everything else class 0.
y_pred = np.greater_equal(y_probs, 0.5).astype(int)

# Per-class precision / recall / F1 on the held-out test set.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion counts on the test set.
cm = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap; fmt='d' prints the counts as integers.
plt.figure(figsize=(5, 4))
ax = sns.heatmap(cm, cmap='Blues', annot=True, fmt='d')
ax.set_title("LSTM - Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

In [None]:
# Output directory for trained models; created on first run and reused afterwards.
models_path = Path("../models")
models_path.mkdir(exist_ok=True)

# Persist the trained network to ../models/lstm.h5.
model.save(models_path.joinpath("lstm.h5"))

In [11]:
# NOTE(review): StringIO is not referenced in this cell; the import is kept
# in case a later cell relies on it.
from io import StringIO

# Output directory for evaluation artefacts; created on first run.
results_path = Path("../results")
results_path.mkdir(exist_ok=True)

# Build the text report for the test-set predictions and write it to disk as UTF-8.
report_str = classification_report(y_test, y_pred)
with (results_path / "lstm_report.txt").open("w", encoding="utf-8") as report_file:
    report_file.write(report_str)