In [10]:
import sys
import os
sys.path.insert(0, os.path.abspath('..')) 
import pandas as pd
from src.model import NaiveBayesClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

In [12]:
# Load the preprocessed training split, discard rows where either the cleaned
# text or the label is missing, and renumber the index from zero.
df_train = (
    pd.read_csv("../data/processed/train.csv")
    .dropna(subset=['cleaned', 'label'])
    .reset_index(drop=True)
)

# The classifier is fed plain Python lists rather than pandas Series.
X_train, y_train = df_train['cleaned'].tolist(), df_train['label'].tolist()

print(f"После очистки: {len(df_train)} сообщений")

# Train the project's own Naive Bayes implementation on the cleaned messages.
nb = NaiveBayesClassifier()
nb.fit(X_train, y_train)

# Show the fitted attributes: `V` (printed as the vocabulary size) and the
# two values printed as the class priors P(spam) and P(ham).
print(f"Размер словаря: {nb.V}")
print(f"P(spam) = {nb.p_spam:.4f}, P(ham) = {nb.p_ham:.4f}")

После очистки: 4429 сообщений
Размер словаря: 2815
P(spam) = 0.1350, P(ham) = 0.8650


In [13]:
# Smoke test: classify one hand-written message with the fitted model.
test_msg = "free win cash now"

pred = nb.predict(test_msg)
# predict_proba is unpacked as a (spam, ham) pair; the values are reported
# below as log-scores, not as normalised probabilities.
log_spam, log_ham = nb.predict_proba(test_msg)

# One print call, one line per item.
print(
    f"Сообщение: '{test_msg}' → предсказание: {pred}",
    f"log P(spam|text) = {log_spam:.2f}",
    f"log P(ham|text)  = {log_ham:.2f}",
    sep="\n",
)

Сообщение: 'free win cash now' → предсказание: spam
log P(spam|text) = -16.46
log P(ham|text)  = -21.39


In [16]:
# Load the held-out split and clean it the same way as the training data:
# rows lacking cleaned text or a label are removed and the index is reset.
df_test = (
    pd.read_csv("../data/processed/test.csv")
    .dropna(subset=['cleaned', 'label'])
    .reset_index(drop=True)
)

X_test, y_test = df_test['cleaned'].tolist(), df_test['label'].tolist()

# The classifier is called on one message at a time, so predict is mapped
# over the test texts.
y_pred = list(map(nb.predict, X_test))

# Headline metrics; precision and recall are computed for the 'spam' class.
print("Результаты на тестовой выборке:")
print(f"Accuracy:  {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision (spam): {precision_score(y_test, y_pred, pos_label='spam'):.4f}")
print(f"Recall (spam):    {recall_score(y_test, y_pred, pos_label='spam'):.4f}")

# Per-class breakdown for every label.
print("\nПодробный отчёт:")
print(classification_report(y_test, y_pred))

Результаты на тестовой выборке:
Accuracy:  0.9711
Precision (spam): 0.8980
Recall (spam):    0.8859

Подробный отчёт:
              precision    recall  f1-score   support

         ham       0.98      0.98      0.98       959
        spam       0.90      0.89      0.89       149

    accuracy                           0.97      1108
   macro avg       0.94      0.94      0.94      1108
weighted avg       0.97      0.97      0.97      1108



In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Baseline for comparison: scikit-learn's multinomial Naive Bayes on
# bag-of-words counts, with both estimators left at their default settings.
# NOTE(review): CountVectorizer's default tokenisation may differ from the one
# used inside NaiveBayesClassifier, so the two vocabularies need not match —
# confirm before treating the accuracy gap as a like-for-like comparison.
vectorizer = CountVectorizer()

# The vocabulary is learned from the training texts only; the test texts are
# merely projected onto it.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training are chained.
sk_nb = MultinomialNB().fit(X_train_vec, y_train)
y_pred_sk = sk_nb.predict(X_test_vec)

print(f"\nSklearn MultinomialNB Accuracy: {accuracy_score(y_test, y_pred_sk):.4f}")


Sklearn MultinomialNB Accuracy: 0.9783
