In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# 1. SAMPLE DATA
# ==========================================
# A tiny hand-written corpus standing in for a real CSV of emails.
# Label convention: 1 = Spam, 0 = Ham (Not Spam).

# Each record keeps an email next to its label so the two cannot drift apart.
labelled_emails = [
    ('Win a free lottery ticket now', 1),        # Spam
    ('Meeting regarding the project', 0),        # Ham
    ('Free money for you click here', 1),        # Spam
    ('Can we have lunch today?', 0),             # Ham
    ('Limited time offer buy now', 1),           # Spam
    ('Please review the attached code', 0),      # Ham
    ('Congratulations you won a prize', 1),      # Spam
    ('Let us schedule a call for tomorrow', 0),  # Ham
]

# Unzip the records into the column-oriented layout: one list per column.
data = {
    'text': [body for body, _ in labelled_emails],
    'label': [flag for _, flag in labelled_emails],
}

In [None]:
# Put the raw dictionary into a DataFrame, then pull out the feature column
# (email text) and the target column (spam flag) as separate Series.
df = pd.DataFrame(data)
X, y = df['text'], df['label']

# Preview the first rows under a heading (a single call, one item per line).
print("Raw Data", df.head(), sep="\n")

In [None]:
# Hold out 25% of the emails for evaluation; random_state makes the shuffle
# reproducible. With this few samples an unstratified shuffle can put a single
# class in the test set, which would make the accuracy figure meaningless for
# the other class, so stratify on the labels to keep the spam/ham ratio the
# same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [None]:
# 2. VECTORIZATION (Text -> Numbers)
# ==========================================
# The classifier needs numeric features, so every email is converted into a
# bag-of-words row of word counts using 'CountVectorizer'.

vectorizer = CountVectorizer()

# Learn the vocabulary from the training emails only -- never from the test
# emails -- so nothing about the held-out data leaks into the features.
vectorizer.fit(X_train)

# Encode both splits with that same, already-fitted vocabulary.
X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
# Inspect the vocabulary learned by the vectorizer (heading first, then the
# mapping itself on the next line).
print("\n Vocabulary Map ", vectorizer.vocabulary_, sep="\n")

In [None]:
# 3. TRAIN MULTINOMIAL NAIVE BAYES
# ==========================================
# MultinomialNB is the Naive Bayes variant for discrete counts, which is
# what the vectorized emails contain (integer word counts).

model = MultinomialNB()
model.fit(X=X_train_vec, y=y_train)

In [None]:
# Two hand-written messages that are not part of the dataset, used to try the
# trained model on fresh text.
new_emails = [
    "Hey, are we still meeting for lunch?",
    "You have won a free cash prize click now",
]

# Encode them with the already-fitted vectorizer (transform only, no re-fit)
# so their columns line up with the features the model was trained on.
new_emails_vec = vectorizer.transform(new_emails)
predictions = model.predict(X=new_emails_vec)

In [None]:
# Show the model's verdict next to each hand-written email.
print("\n Custom Predictions ")
for text, pred in zip(new_emails, predictions):
    # Class 1 is spam; anything else is reported as not spam.
    if pred == 1:
        label = "Spam"
    else:
        label = "Not Spam"
    print(f"'{text}' -> {label}")

# Evaluate on Test Set: fraction of held-out emails classified correctly,
# reported as a percentage.
y_pred = model.predict(X_test_vec)
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print(f"\nModel Accuracy: {accuracy_pct:.2f}%")