In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [2]:
# Toy spam / not-spam corpus, written as (message, label) pairs so each text
# sits next to its own label; 1 marks spam and 0 marks not spam.
labelled_messages = [
    ('Get a free iPhone now!', 1),
    ('Congratulations, you have won a lottery.', 1),
    ('Meet me at the park later?', 0),
    ('Can we reschedule our meeting?', 0),
    ('You won a free gift card!', 1),
    ('Limited time offer: 50% off on all products.', 1),
    ('URGENT: Your account has been compromised.', 1),
    ('Hello, we are conducting a survey.', 0),
    ('Claim your prize now!', 1),
    ('Last chance to avail the discount!', 1),
    ('Your package has been shipped.', 0),
    ('Join us for an exciting event this weekend.', 0),
    ('Important update: Please reset your password.', 1),
    ('Invitation to our exclusive VIP party.', 1),
    ('New arrivals: Check out our latest collection.', 0),
    ('Your order has been confirmed.', 0),
    ('Unlock special rewards with our loyalty program.', 1),
    ('Job opportunity: Apply now for a great career.', 0),
    ('Reminder: Your appointment is tomorrow.', 0),
    ('Exclusive membership offer: Don\'t miss out!', 1),
    ('Breaking news: Stay informed with our app.', 1),
    ('Upgrade to premium for enhanced features.', 1),
    ('Hurry, limited stock available!', 1),
    ('Weekly newsletter: Stay updated with our content.', 0),
    ('Schedule of upcoming events.', 0),
    ('Thank you for your recent purchase.', 0),
]

# Column-oriented view of the pairs: one list of texts, one list of labels,
# both in the order given above.
data = {
    'text': [message for message, _flag in labelled_messages],
    'label': [flag for _message, flag in labelled_messages],
}

In [3]:
# Turn the column-oriented dict into a DataFrame: each key of `data` becomes
# a column, each list position becomes a row.
df = pd.DataFrame.from_dict(data)

# Bare trailing expression so the notebook renders the frame.
df

Unnamed: 0,text,label
0,Get a free iPhone now!,1
1,"Congratulations, you have won a lottery.",1
2,Meet me at the park later?,0
3,Can we reschedule our meeting?,0
4,You won a free gift card!,1
5,Limited time offer: 50% off on all products.,1
6,URGENT: Your account has been compromised.,1
7,"Hello, we are conducting a survey.",0
8,Claim your prize now!,1
9,Last chance to avail the discount!,1


In [4]:
# Inputs are the raw message strings; targets are the 0/1 labels.
X, y = df['text'], df['label']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Learn the vocabulary from the training messages only and encode each
# message as a sparse row of word counts.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
print(X_train_vec)

# Train the Naive Bayes classifier on the counts. fit() hands back the
# estimator itself, so it can be bound directly and then echoed as the
# cell's last expression.
model = MultinomialNB().fit(X_train_vec, y_train)
model

  (0, 36)	1
  (0, 78)	1
  (0, 57)	1
  (0, 24)	1
  (0, 84)	1
  (0, 61)	1
  (1, 20)	1
  (1, 89)	1
  (1, 31)	1
  (1, 88)	1
  (1, 42)	1
  (2, 57)	1
  (2, 86)	1
  (2, 50)	1
  (2, 72)	1
  (2, 81)	1
  (2, 87)	1
  (2, 21)	1
  (3, 41)	1
  (3, 77)	1
  (3, 53)	1
  (3, 0)	1
  (3, 52)	1
  (3, 54)	1
  (3, 2)	1
  :	:
  (15, 74)	1
  (16, 90)	1
  (16, 30)	1
  (16, 10)	1
  (16, 59)	1
  (16, 71)	1
  (17, 57)	1
  (17, 48)	1
  (17, 7)	1
  (17, 15)	1
  (17, 58)	1
  (17, 40)	1
  (17, 16)	1
  (18, 24)	1
  (18, 53)	1
  (18, 58)	1
  (18, 46)	1
  (18, 22)	1
  (18, 47)	1
  (19, 90)	1
  (19, 30)	1
  (19, 10)	1
  (19, 83)	1
  (19, 1)	1
  (19, 17)	1


In [6]:
# Encode the held-out messages with the vocabulary fitted on the training
# split (transform only, no refit).
X_test_vec = vectorizer.transform(X_test)

# Predict a label per held-out message and score against the true labels.
y_pred = model.predict(X_test_vec)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Bare trailing expression so the notebook shows the score.
accuracy

0.5

In [7]:
# Print the vocabulary and, for every class the model was trained on, each
# word's log-probability together with the number of training messages that
# contain the word.
vocabulary = vectorizer.get_feature_names_out()
print("Vocabulary=",vocabulary)
print('vocabulary : ', len(vocabulary))

# feature_log_prob_ stores log P(word | class), one row per class and one
# column per vocabulary word. These are log values (hence negative), so the
# output is labelled as log-probabilities rather than probabilities.
class_probabilities = model.feature_log_prob_
print("Log Probabilities Vocabulary=",class_probabilities)

# Display names indexed by the numeric label: 0 -> Not Spam, 1 -> Spam.
class_labels = ['Not Spam', 'Spam']

# Count of non-zero entries per column = number of training messages that
# contain each word. Computed once here instead of slicing the sparse matrix
# for every word inside every class iteration.
doc_frequencies = X_train_vec.getnnz(axis=0)

# Rows of the count matrix are training messages, not words.
total_documents = X_train_vec.shape[0]

# Rows of feature_log_prob_ follow model.classes_, so walk the classes the
# model actually saw and look each one up by its label value; this keeps the
# name and the row aligned even if a class is absent from the training split.
for i, class_value in enumerate(model.classes_):
    label = class_labels[class_value]
    print(f"Class: {label}")
    for j, word in enumerate(vocabulary):
        prob = class_probabilities[i][j]
        word_freq = doc_frequencies[j]
        print(f"Word: {word}, Log Probability: {prob:.2f}, Document Frequency: {word_freq}, Total Documents: {total_documents}")
    print("\n")

Vocabulary= ['50' 'account' 'all' 'app' 'apply' 'appointment' 'are' 'arrivals' 'at'
 'available' 'been' 'breaking' 'can' 'card' 'career' 'check' 'collection'
 'compromised' 'conducting' 'confirmed' 'congratulations' 'content' 'don'
 'enhanced' 'exclusive' 'features' 'for' 'free' 'gift' 'great' 'has'
 'have' 'hello' 'hurry' 'important' 'informed' 'invitation' 'is' 'job'
 'later' 'latest' 'limited' 'lottery' 'me' 'meet' 'meeting' 'membership'
 'miss' 'new' 'news' 'newsletter' 'now' 'off' 'offer' 'on' 'opportunity'
 'order' 'our' 'out' 'package' 'park' 'party' 'password' 'please'
 'premium' 'products' 'purchase' 'recent' 'reminder' 'reschedule' 'reset'
 'shipped' 'stay' 'stock' 'survey' 'thank' 'the' 'time' 'to' 'tomorrow'
 'update' 'updated' 'upgrade' 'urgent' 'vip' 'we' 'weekly' 'with' 'won'
 'you' 'your']
vocabulary :  91
Probabilities Vocabulary= [[-5.00394631 -5.00394631 -5.00394631 -5.00394631 -4.31079913 -4.31079913
  -4.31079913 -4.31079913 -4.31079913 -5.00394631 -3.90533402 -5.0

In [10]:
# Manual Data Testing

# One hand-written message, wrapped in a list because the vectorizer takes a
# collection of documents.
new_text = ["Internship Certificate Completion Letter your been"]

# Encode it with the already-fitted vectorizer, then ask the model for a class.
new_text_vec = vectorizer.transform(new_text)
prediction = model.predict(new_text_vec)

# Translate the numeric class into a readable name: 1 is spam, anything else
# is reported as not spam.
if prediction[0] == 1:
    predicted_label = "Spam"
else:
    predicted_label = "Not Spam"

print(f"The text '{new_text[0]}' is classified as '{predicted_label}'.")

The text 'Internship Certificate Completion Letter your been' is classified as 'Not Spam'.
