# 05 - Comparison of Naive Bayes and LSTM Models

In this notebook we:
- load the saved evaluation reports of both models (`nb_report.txt` and `lstm_report.txt`)
- parse the classification metrics into structured DataFrames
- compare Naive Bayes and LSTM using accuracy, precision, recall and F1-scores
- visualize overall performance differences between the models
- compare class-wise F1-scores to understand strengths and weaknesses
- summarize which model performs better and by how much

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Make plots show inside the notebook
%matplotlib inline

# Location of the evaluation reports written by the earlier notebooks
results_path = Path("../results")
nb_report_path = results_path / "nb_report.txt"
lstm_report_path = results_path / "lstm_report.txt"

# Read both reports as UTF-8 text
nb_report_str = nb_report_path.read_text(encoding="utf-8")
lstm_report_str = lstm_report_path.read_text(encoding="utf-8")

# Echo the raw reports, separated by a horizontal rule, for a visual sanity check
print("Naive Bayes classification report:\n")
print(nb_report_str)

print("\n" + "=" * 80 + "\n")

print("LSTM classification report:\n")
print(lstm_report_str)

In [2]:
def parse_classification_report(report_str):
    """
    Parse the text output of sklearn.metrics.classification_report.

    Every data row is read from the right: the last four whitespace-separated
    tokens are precision, recall, f1-score and support, and everything before
    them is the label. This keeps labels that contain spaces (e.g. "not spam")
    intact, and handles "macro avg" / "weighted avg" with the same code path.

    Lines that do not end in four numeric tokens (the column header, titles,
    blank lines) are ignored.

    Parameters
    ----------
    report_str : str
        Full text of a classification report.

    Returns
    -------
    df : pandas.DataFrame
        Indexed by "label", with columns precision / recall / f1-score /
        support. Contains the per-class rows plus "macro avg" and
        "weighted avg" when present. Empty (but with the same columns) when
        no row could be parsed.
    accuracy : float or None
        Value from the "accuracy" row, or None when the row is absent or its
        value is not numeric.
    """
    columns = ["label", "precision", "recall", "f1-score", "support"]
    # These averages were never part of the parsed output; keep skipping them
    # so callers that filter only "macro avg" / "weighted avg" out of the
    # index do not mistake them for real classes.
    skipped_averages = ("micro avg", "samples avg")

    rows = []
    accuracy = None

    for raw_line in report_str.splitlines():
        tokens = raw_line.split()
        if not tokens:
            continue

        # 'accuracy' row: "accuracy  <value>  <support>" (fewer columns than a
        # data row, which is what distinguishes it from a class named "accuracy").
        if tokens[0] == "accuracy" and len(tokens) < 5:
            if len(tokens) >= 2:
                try:
                    accuracy = float(tokens[1])
                except ValueError:
                    accuracy = None
            continue

        # A data row needs at least one label token plus four numeric columns.
        # This also drops the 4-token column header.
        if len(tokens) < 5:
            continue

        label = " ".join(tokens[:-4])
        if label in skipped_averages:
            continue

        try:
            precision, recall, f1 = (float(tok) for tok in tokens[-4:-1])
            support = int(tokens[-1])
        except ValueError:
            # Not a metrics row (e.g. a free-text title line); ignore it.
            continue

        rows.append({
            "label": label,
            "precision": precision,
            "recall": recall,
            "f1-score": f1,
            "support": support,
        })

    # Passing `columns` guarantees the "label" column exists even when no row
    # was parsed, so set_index cannot fail on an empty report.
    df = pd.DataFrame(rows, columns=columns).set_index("label")
    return df, accuracy

In [None]:
# Turn both raw report strings into (metrics DataFrame, accuracy) pairs
nb_df, nb_accuracy = parse_classification_report(nb_report_str)
lstm_df, lstm_accuracy = parse_classification_report(lstm_report_str)

# Show the parsed Naive Bayes table and its accuracy (if the report had one)
print("Naive Bayes parsed report:")
display(nb_df)
if nb_accuracy is None:
    print("Naive Bayes accuracy: N/A")
else:
    print(f"Naive Bayes accuracy: {nb_accuracy:.4f}")

# Same for the LSTM report
print("\nLSTM parsed report:")
display(lstm_df)
if lstm_accuracy is None:
    print("LSTM accuracy: N/A")
else:
    print(f"LSTM accuracy: {lstm_accuracy:.4f}")

In [None]:
def overall_summary(df, accuracy, model_name):
    """
    Collect a model's overall metrics into a one-row DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Parsed report indexed by label, with "precision", "recall" and
        "f1-score" columns. The "macro avg" and "weighted avg" rows are
        read when present.
    accuracy : float or None
        Overall accuracy, or None when unknown.
    model_name : str
        Used as the index label of the returned row.

    Returns
    -------
    pandas.DataFrame
        One row indexed by `model_name` with columns accuracy,
        macro_precision, macro_recall, macro_f1, weighted_precision,
        weighted_recall, weighted_f1. Missing values are NaN rather than
        None so every column stays numeric and can be plotted or formatted.
    """
    missing = float("nan")

    row = {"accuracy": missing if accuracy is None else accuracy}

    # (output column prefix, index label of the averaged row in `df`)
    averages = (("macro", "macro avg"), ("weighted", "weighted avg"))
    # (output column suffix, source column in `df`)
    metrics = (("precision", "precision"), ("recall", "recall"), ("f1", "f1-score"))

    for prefix, avg_label in averages:
        has_row = avg_label in df.index
        for suffix, source_column in metrics:
            row[f"{prefix}_{suffix}"] = (
                df.loc[avg_label, source_column] if has_row else missing
            )

    return pd.DataFrame([row], index=[model_name])


# One summary row per model
nb_summary = overall_summary(df=nb_df, accuracy=nb_accuracy, model_name="Naive Bayes")
lstm_summary = overall_summary(df=lstm_df, accuracy=lstm_accuracy, model_name="LSTM")

# Stack the two rows into a single comparison table (models as rows)
overall_comparison = pd.concat((nb_summary, lstm_summary))
overall_comparison

In [None]:
# We'll compare accuracy, macro F1 and weighted F1
metrics_to_plot = ["accuracy", "macro_f1", "weighted_f1"]

plot_df = overall_comparison[metrics_to_plot].copy()

# DataFrame.plot opens a figure of its own unless it is handed an Axes, so a
# preceding plt.figure(...) would only leave an empty stray figure and its
# figsize would be ignored. Create the Axes explicitly and draw onto it.
fig, ax = plt.subplots(figsize=(6, 4))
plot_df.plot(kind="bar", ax=ax, rot=0)
ax.set_title("Overall Performance Comparison")
ax.set_ylabel("Score")
ax.set_ylim(0, 1.0)
ax.legend(title="Metric")
fig.tight_layout()
plt.show()

In [None]:
def class_rows(df):
    """Return only the per-class rows of a parsed report (summary rows removed)."""
    summary_labels = ["accuracy", "macro avg", "weighted avg"]
    is_summary = df.index.isin(summary_labels)
    return df.loc[~is_summary]

nb_classes = class_rows(nb_df)
lstm_classes = class_rows(lstm_df)

# Every class label seen by at least one of the models, in sorted order
all_labels = sorted(set(nb_classes.index).union(lstm_classes.index))

# One record per class holding each model's F1 (None where a model lacks the class)
rows = [
    {
        "label": label,
        "Naive Bayes": (
            nb_classes.loc[label, "f1-score"] if label in nb_classes.index else None
        ),
        "LSTM": (
            lstm_classes.loc[label, "f1-score"] if label in lstm_classes.index else None
        ),
    }
    for label in all_labels
]

class_f1_df = pd.DataFrame(rows).set_index("label")
class_f1_df

In [None]:
# DataFrame.plot opens a figure of its own unless it is handed an Axes, so a
# preceding plt.figure(...) would only leave an empty stray figure and its
# figsize would be ignored. Create the Axes explicitly and draw onto it.
fig, ax = plt.subplots(figsize=(6, 4))
class_f1_df.plot(kind="bar", ax=ax, rot=0)
ax.set_title("Class-wise F1-score Comparison")
ax.set_ylabel("F1-score")
ax.set_ylim(0, 1.0)
ax.legend(title="Model")
fig.tight_layout()
plt.show()

In [None]:
from IPython.display import Markdown

def _format_score(value):
    """Render a metric with four decimals, or "N/A" when it is missing (None/NaN)."""
    return "N/A" if pd.isna(value) else f"{value:.4f}"


def _verdict(metric_name, nb_value, lstm_value):
    """Describe which model scores higher on one metric and by how much."""
    if pd.isna(nb_value) or pd.isna(lstm_value):
        return f"- {metric_name}: comparison not available"
    gap = lstm_value - nb_value
    if gap == 0:
        return f"- {metric_name}: both models are tied"
    winner = "LSTM" if gap > 0 else "Naive Bayes"
    return f"- {metric_name}: **{winner}** is better by **{abs(gap):.4f}**"


nb_acc = overall_comparison.loc["Naive Bayes", "accuracy"]
lstm_acc = overall_comparison.loc["LSTM", "accuracy"]

summary_lines = []

# Missing metrics are shown as "N/A" instead of crashing the format spec,
# matching how the accuracy is printed right after parsing the reports.
summary_lines.append(f"- Naive Bayes accuracy: **{_format_score(nb_acc)}**")
summary_lines.append(f"- LSTM accuracy: **{_format_score(lstm_acc)}**")

macro_nb = overall_comparison.loc["Naive Bayes", "macro_f1"]
macro_lstm = overall_comparison.loc["LSTM", "macro_f1"]

summary_lines.append(f"- Naive Bayes macro F1: **{_format_score(macro_nb)}**")
summary_lines.append(f"- LSTM macro F1: **{_format_score(macro_lstm)}**")

# State which model wins on each headline metric, and by what margin
summary_lines.append(_verdict("Accuracy", nb_acc, lstm_acc))
summary_lines.append(_verdict("Macro F1", macro_nb, macro_lstm))

text = "### Summary\n\n" + "\n".join(summary_lines)
Markdown(text)