In [5]:
import numpy as np
from collections import defaultdict, Counter

class NaiveBayes:
    """Multinomial Naive Bayes text classifier with additive smoothing.

    Documents are lower-cased, stripped of every character that is neither
    alphanumeric nor whitespace, and split on whitespace. Each class gets a
    prior (its share of the training documents) and a per-word probability
    ``(count + alpha) / (total_words_in_class + alpha * vocab_size)``.

    Attributes:
        alpha: Additive smoothing constant. With ``alpha=0`` a word that never
            occurred in a class gets probability 0 for that class.
        class_probs: Mapping ``class -> prior probability``.
        word_probs: Mapping ``class -> {word: smoothed probability}``.
        vocab: Set of every word seen by the most recent ``fit`` call.
        classes: Sorted unique labels from the most recent ``fit`` call.
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.class_probs = {}
        self.word_probs = {}
        self.vocab = set()
        self.classes = []

    def clean_text(self, text):
        """Return *text* lower-cased with punctuation and symbols removed.

        Only alphanumeric and whitespace characters are kept, so ``"Let's"``
        becomes ``"lets"``.
        """
        text = text.lower()
        return ''.join(c for c in text if c.isalnum() or c.isspace())

    def fit(self, X, y):
        """Estimate class priors and smoothed word probabilities.

        Any state learned by a previous ``fit`` call is discarded first, so
        the model always reflects only the data passed to the latest call.

        Args:
            X: Iterable of raw text documents.
            y: Sequence of labels, one per document (list or array).

        Raises:
            ValueError: If ``X`` and ``y`` differ in length or are empty.
        """
        texts = list(X)
        # Element-wise ``y == c`` below requires an array, not a plain list.
        y = np.asarray(y)
        n_samples = len(y)

        if len(texts) != n_samples:
            raise ValueError(
                f"X and y must have the same length, got {len(texts)} and {n_samples}"
            )
        if n_samples == 0:
            raise ValueError("cannot fit on an empty training set")

        # Start from a clean slate: words or classes left over from an earlier
        # fit would otherwise leak into the vocabulary size used for smoothing.
        self.classes = np.unique(y)
        self.class_probs = {}
        self.word_probs = {}
        self.vocab = set()

        # Prior probabilities
        for c in self.classes:
            self.class_probs[c] = np.sum(y == c) / n_samples

        # Word counts per class
        word_counts = {c: Counter() for c in self.classes}
        class_word_totals = {c: 0 for c in self.classes}

        for text, label in zip(texts, y):
            words = self.clean_text(text).split()
            self.vocab.update(words)
            word_counts[label].update(words)
            class_word_totals[label] += len(words)

        # Word probabilities with smoothing
        vocab_size = len(self.vocab)
        for c in self.classes:
            counts = word_counts[c]
            denominator = class_word_totals[c] + self.alpha * vocab_size
            self.word_probs[c] = {
                word: (counts[word] + self.alpha) / denominator
                for word in self.vocab
            }

    def predict_proba(self, X):
        """Return the posterior class probabilities for each document.

        Words that were not seen during training are ignored.

        Args:
            X: Iterable of raw text documents.

        Returns:
            A list with one ``{class: probability}`` dict per document; the
            probabilities of each dict sum to 1.
        """
        probas = []
        for text in X:
            words = self.clean_text(text).split()
            scores = {}

            for c in self.classes:
                word_probs_c = self.word_probs[c]
                # Work in log space so long documents do not underflow.
                score = np.log(self.class_probs[c])
                for word in words:
                    # Every vocabulary word has an entry for every class, so
                    # no fallback probability is needed for known words.
                    if word in word_probs_c:
                        score += np.log(word_probs_c[word])
                scores[c] = score

            # Subtract the best score before exponentiating (softmax trick)
            # to keep the exponentials in a safe numeric range.
            best_score = max(scores.values())
            exp_scores = {c: np.exp(s - best_score) for c, s in scores.items()}
            total = sum(exp_scores.values())
            probas.append({c: exp_scores[c] / total for c in self.classes})

        return probas

    def predict(self, X):
        """Return an array holding the most probable class for each document."""
        probas = self.predict_proba(X)
        return np.array([max(p, key=p.get) for p in probas])



# Tiny labelled corpus (three spam, three ham messages) for the demo model.
X_train = np.array(
    [
        "Buy cheap meds now",
        "Cheap pills available here",
        "Hello how are you today",
        "Let's meet tomorrow at cafe",
        "Win free money click here",
        "Meeting at 5 pm ok?",
    ]
)
y_train = np.array(["spam", "spam", "ham", "ham", "spam", "ham"])

# Train with add-one smoothing.
model = NaiveBayes(alpha=1.0)
model.fit(X_train, y_train)

# Unseen messages to classify.
test_texts = np.array(
    ["cheap meds available", "how are you doing", "free money click here"]
)

print("پیش‌بینی‌ها:")
predictions = model.predict(test_texts)
probas = model.predict_proba(test_texts)

# Report each message with its predicted label and the class probabilities
# rendered to three decimals, followed by a blank separator line.
for text, pred, pro in zip(test_texts, predictions, probas):
    rounded = {k: f'{v:.3f}' for k, v in pro.items()}
    print(f"متن: {text}")
    print(f"→ پیش‌بینی: {pred}")
    print(f"احتمال‌ها: {rounded}")
    print()

پیش‌بینی‌ها:
متن: cheap meds available
→ پیش‌بینی: spam
احتمال‌ها: {np.str_('ham'): '0.067', np.str_('spam'): '0.933'}

متن: how are you doing
→ پیش‌بینی: ham
احتمال‌ها: {np.str_('ham'): '0.873', np.str_('spam'): '0.127'}

متن: free money click here
→ پیش‌بینی: spam
احتمال‌ها: {np.str_('ham'): '0.033', np.str_('spam'): '0.967'}

