# 03 – Naive Bayes Baseline Model

In this notebook:
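- load the preprocessed dataset (`../data/processed/cleaned.csv`)
- convert the cleaned text to TF-IDF features
- train a Multinomial Naive Bayes classifier
- evaluate it with accuracy, precision, recall, and F1
- plot the confusion matrix
- save the trained model and the classification report to disk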

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Location of the preprocessed dataset, relative to the notebook directory.
DATA_PATH = Path("../data/processed/cleaned.csv")

# Load the CSV and, in the same chained expression, replace missing
# `clean_body` values with empty strings so no NaN reaches the vectorizer
# further down.
df = pd.read_csv(DATA_PATH).assign(
    clean_body=lambda frame: frame["clean_body"].fillna("")
)

# Preview the first rows.
df.head()

In [None]:
# Model input is the cleaned text; the target is the `label` column.
X, y = df["clean_body"], df["label"]

# 80/20 split, stratified on the label so both partitions keep the same class
# proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Number of rows in the training and test partitions.
(len(X_train), len(X_test))

In [3]:
# TF-IDF settings: cap the vocabulary at 10,000 terms and drop scikit-learn's
# built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words="english", max_features=10000)

# Learn the vocabulary and IDF weights from the training split only, then
# apply that same fitted transform to the test split.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
# Train a Multinomial Naive Bayes classifier with default hyperparameters on
# the TF-IDF training matrix. `fit` returns the estimator itself, so it can be
# chained onto the constructor.
nb = MultinomialNB().fit(X_train_vec, y_train)

# Display the fitted estimator as the cell output.
nb

In [None]:
# Predict a label for every document in the held-out test split.
y_pred = nb.predict(X_test_vec)

# Text report comparing the predictions against the true test labels.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Pin the class order explicitly so the matrix rows/columns and the tick
# labels are guaranteed to line up. `nb.classes_` holds the labels the
# classifier was fitted on; a plain list keeps seaborn's tick-label handling
# simple.
class_labels = list(nb.classes_)

# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Explicit figure/axes interface instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    # Show the actual class names instead of positional indices 0..n-1.
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title("Naive Bayes - Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

In [None]:
import joblib
from pathlib import Path

# Directory that receives the serialized artifacts; created on first run.
models_path = Path("../models")
models_path.mkdir(exist_ok=True)

# The classifier only understands the TF-IDF feature space it was trained on,
# so persist the fitted vectorizer next to it. Without it the saved model
# cannot be applied to raw text in another session.
joblib.dump(vectorizer, models_path / "tfidf_vectorizer.pkl")

# Persist the trained classifier. Kept as the last expression so the cell
# output (the list of written file names) is unchanged.
joblib.dump(nb, models_path / "naive_bayes.pkl")

In [8]:
from io import StringIO
from pathlib import Path

# Render the classification report for the test-set predictions as text.
report_str = classification_report(y_true=y_test, y_pred=y_pred)

# Make sure the output directory exists before writing into it.
results_path = Path("../results")
results_path.mkdir(exist_ok=True)

# Write the report to disk as UTF-8 text.
with (results_path / "nb_report.txt").open("w", encoding="utf-8") as f:
    f.write(report_str)