# Model 2: RNN (LSTM)

This notebook follows the Course 3 workflow:
Tokenized Text Sequences → Embedding → Bidirectional LSTM → Dense → Softmax

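Steps:
1. Load and clean the raw text
2. Tokenize and pad the sequences
3. Define the embedding + LSTM model
4. Train the model
5. Evaluate on the test set
6. Save metrics (and, when `validation.txt` is present, evaluate on it and append those metrics too)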

## 1. Load and preprocess text

In [None]:
from pathlib import Path
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Folder containing the semicolon-separated split files.
DATA_DIR = Path("../data")

# Both splits have the same layout (no header row, a text column followed by an
# emotion column), so they are read with identical options. The loop variable
# lives inside the generator and does not leak into the notebook namespace.
train_df, test_df = (
    pd.read_csv(DATA_DIR / split_file, sep=";", header=None, names=["text", "emotion"])
    for split_file in ("train.txt", "test.txt")
)


def clean_text(text: str) -> str:
    """Normalise one raw sentence for tokenization.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is replaced by a space, runs of whitespace are collapsed to a
    single space, and leading/trailing whitespace is removed.

    Args:
        text: Raw sentence. Non-string values (for example the NaN pandas
            produces for an empty CSV field) are treated as empty text.

    Returns:
        The cleaned sentence; an empty string when nothing usable remains.
    """
    if not isinstance(text, str):
        return ""
    text = re.sub(r"[^a-z\s]", " ", text.lower())
    # Strip last: replacing punctuation at either end introduces new edge
    # spaces, so stripping before the substitutions would leave them behind.
    return re.sub(r"\s+", " ", text).strip()


# Store a normalised copy of each sentence next to the raw one; the original
# "text" column is left untouched.
train_df["clean_text"] = train_df["text"].map(clean_text)
test_df["clean_text"] = test_df["text"].map(clean_text)

# One print call, two lines of output.
print(f"Train shape: {train_df.shape}", f"Test shape: {test_df.shape}", sep="\n")

# Last expression of the cell: preview the first training rows.
train_df.head()

## 2. Tokenize and pad sequences

In [None]:
# Vocabulary cap handed to the tokenizer as `num_words`.
max_words = 20000

# The vocabulary is learned from the training split only; the test split is
# merely converted with it.
text_tokenizer = Tokenizer(num_words=max_words, oov_token="<UNK>")
text_tokenizer.fit_on_texts(train_df["clean_text"])

train_seq = text_tokenizer.texts_to_sequences(train_df["clean_text"])
test_seq = text_tokenizer.texts_to_sequences(test_df["clean_text"])

# Sequence length = 95th percentile of the training sequence lengths.
seq_lengths = np.array(list(map(len, train_seq)))
max_len = int(np.percentile(seq_lengths, 95))

# Pad and truncate both splits at the end of each sequence.
X_train, X_test = (
    pad_sequences(sequences, maxlen=max_len, padding="post", truncating="post")
    for sequences in (train_seq, test_seq)
)

# Integer-encode the emotion labels; the encoder is fitted on the training labels.
label_encoder = LabelEncoder().fit(train_df["emotion"])
y_train = label_encoder.transform(train_df["emotion"])
y_test = label_encoder.transform(test_df["emotion"])
class_names = [*label_encoder.classes_]
num_classes = len(class_names)

# One-hot versions of the integer labels for the categorical cross-entropy loss.
y_train_cat, y_test_cat = (
    tf.keras.utils.to_categorical(encoded, num_classes=num_classes)
    for encoded in (y_train, y_test)
)

print(
    f"Vocab size (limited): {max_words}",
    f"Sequence length (95th percentile): {max_len}",
    f"Train sequences shape: {X_train.shape}",
    sep="\n",
)

## 3. Define embedding + LSTM model

In [None]:
# Seed Python, NumPy and TensorFlow before any weights are created so that the
# metrics appended to the results file can be reproduced on a re-run.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Tunable layer sizes.
embedding_dim = 100
lstm_units = 32
dense_units = 6  # NOTE(review): width of the hidden Dense layer; looks hand-picked, confirm

lstm_model = tf.keras.Sequential(
    [
        # mask_zero=True: the sequences are post-padded with index 0, so without
        # a mask the LSTM keeps stepping over padding and its final forward
        # state is computed after the padded tail rather than the last real token.
        tf.keras.layers.Embedding(
            input_dim=max_words,
            output_dim=embedding_dim,
            mask_zero=True,
        ),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units)),
        tf.keras.layers.Dense(dense_units, activation="relu"),
        # One probability per emotion class.
        tf.keras.layers.Dense(num_classes, activation="softmax"),
    ]
)

# Labels are one-hot encoded, hence categorical (not sparse) cross-entropy.
lstm_model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Build explicitly so summary() can report output shapes and parameter counts.
lstm_model.build((None, max_len))
lstm_model.summary()

## 4. Train the model

In [None]:
# Stop once validation loss has gone two epochs without improving, and restore
# the weights from the best epoch.
early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)

# Up to 10 epochs; a tenth of the training arrays is held out for validation.
lstm_history = lstm_model.fit(
    x=X_train,
    y=y_train_cat,
    validation_split=0.1,
    batch_size=32,
    epochs=10,
    callbacks=[early_stop],
    verbose=1,
)

## 5. Evaluate on test set

In [None]:
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support

# Per-class probabilities for each test sequence; the prediction is the index
# of the largest probability in each row.
lstm_probs = lstm_model.predict(X_test)
lstm_pred = np.argmax(lstm_probs, axis=1)

lstm_accuracy = accuracy_score(y_test, lstm_pred)
# Macro-averaged scores; the fourth returned value is not used.
lstm_precision, lstm_recall, lstm_f1, _ = precision_recall_fscore_support(
    y_test,
    lstm_pred,
    average="macro",
    zero_division=0,
)

# Headline numbers, one per line.
print(
    f"Accuracy: {lstm_accuracy:.4f}",
    f"Precision (macro): {lstm_precision:.4f}",
    f"Recall (macro): {lstm_recall:.4f}",
    f"F1 (macro): {lstm_f1:.4f}",
    sep="\n",
)

# Per-class breakdown labelled with the original emotion names.
print("\nClassification report:\n")
print(classification_report(y_test, lstm_pred, target_names=class_names, zero_division=0))

## 6. Save metrics

In [None]:
from datetime import datetime

# Results folder, created on first use, and the shared metrics file inside it.
results_dir = Path("../results")
results_dir.mkdir(parents=True, exist_ok=True)
metrics_path = results_dir / "metrics.csv"

# One record for this run: test scores rounded to four decimals plus a
# second-resolution local timestamp.
row = dict(
    model="RNN_LSTM",
    accuracy=round(lstm_accuracy, 4),
    precision=round(lstm_precision, 4),
    recall=round(lstm_recall, 4),
    f1=round(lstm_f1, 4),
    timestamp=datetime.now().isoformat(timespec="seconds"),
)

metrics_df = pd.DataFrame([row])

# Append to the file; the header row is written only when the file does not
# exist yet (append mode creates it in that case).
metrics_df.to_csv(metrics_path, mode="a", header=not metrics_path.exists(), index=False)

print(f"Saved metrics to: {metrics_path}")
metrics_df

In [None]:
from datetime import datetime
from pathlib import Path

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Optional extra evaluation on the separate validation file. This cell reuses
# DATA_DIR, clean_text, text_tokenizer, max_len, label_encoder, lstm_model and
# metrics_path from the cells above, so it must run after them.
val_path = DATA_DIR / "validation.txt"
val_metrics_df = None  # stays None (nothing is rendered) when the file is missing

if val_path.exists():
    # Same preprocessing as the train and test splits.
    val_df = pd.read_csv(val_path, sep=";", header=None, names=["text", "emotion"])
    val_df["clean_text"] = val_df["text"].apply(clean_text)
    val_seq = text_tokenizer.texts_to_sequences(val_df["clean_text"])
    X_val = pad_sequences(val_seq, maxlen=max_len, padding="post", truncating="post")
    y_val = label_encoder.transform(val_df["emotion"])

    val_probs = lstm_model.predict(X_val)
    val_pred = val_probs.argmax(axis=1)
    val_accuracy = accuracy_score(y_val, val_pred)
    val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(
        y_val, val_pred, average="macro", zero_division=0
    )

    print("Validation metrics (RNN):")
    print(f"Accuracy: {val_accuracy:.4f}")
    print(f"Precision (macro): {val_precision:.4f}")
    print(f"Recall (macro): {val_recall:.4f}")
    print(f"F1 (macro): {val_f1:.4f}")

    val_row = {
        "model": "RNN_LSTM_VAL",
        "accuracy": round(val_accuracy, 4),
        "precision": round(val_precision, 4),
        "recall": round(val_recall, 4),
        "f1": round(val_f1, 4),
        "timestamp": datetime.now().isoformat(timespec="seconds"),
    }

    val_metrics_df = pd.DataFrame([val_row])
    # Append to the shared metrics file; write the header only if the file is new.
    val_metrics_df.to_csv(
        metrics_path,
        mode="a",
        header=not metrics_path.exists(),
        index=False,
    )
else:
    # Say so explicitly instead of finishing the cell with no output at all.
    print(f"Validation file not found, skipping validation metrics: {val_path}")

# Jupyter renders only a top-level trailing expression; a bare name inside the
# `if` body is silently discarded, so the frame is shown from here.
val_metrics_df