In [2]:
import pandas as pd
import os
import zipfile
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Location of the spam dataset CSV. Set the SPAM_CSV_PATH environment variable
# to point at your own copy; when it is unset the original default path is used.
file_path = os.environ.get("SPAM_CSV_PATH", "C:/Users/spam.csv")

# Load dataset (decoded as latin-1).
try:
    data = pd.read_csv(file_path, encoding="latin-1")
    print("File successfully loaded!")
except FileNotFoundError:
    # Report the path that was tried, then re-raise so execution stops here
    # instead of continuing without a `data` frame.
    print(f"File not found! Expected at: {file_path}")
    raise

# Display first few rows
print(data.head())

File successfully loaded!
     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [4]:
# Keep only the first two columns (label and message text) and drop the rest.
# `.copy()` makes the result an independent frame, so the column assignment
# below is not flagged by pandas as a write into a slice of the loaded frame
# (SettingWithCopyWarning).
data = data.iloc[:, [0, 1]].copy()
data.columns = ['label', 'message']  # Rename columns

# Convert labels: 'ham' → 0, 'spam' → 1
label_codes = {'ham': 0, 'spam': 1}
encoded_labels = data['label'].map(label_codes)

# Series.map yields NaN for every value that is not a key of the mapping.
# That happens for unexpected labels, and also when this cell is re-run on a
# frame whose labels were already converted to 0/1. Fail loudly rather than
# silently handing NaN labels to the train/test split.
unmapped = encoded_labels.isna()
if unmapped.any():
    unexpected = sorted(data.loc[unmapped, 'label'].astype(str).unique())
    raise ValueError(
        f"Unexpected label values {unexpected}; expected only 'ham' or 'spam'. "
        "If this cell was already run, reload the dataset before re-running it."
    )
data['label'] = encoded_labels

# Display first few cleaned rows
print("Data cleaned successfully!")
print(data.head())

Data cleaned successfully!
   label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...


In [5]:
# Hold out 20% of the messages for evaluation. Stratifying on the label keeps
# the class proportions the same in both partitions, and the fixed seed makes
# the split repeatable.
message_col = data['message']
label_col = data['label']
X_train, X_test, y_train, y_test = train_test_split(
    message_col,
    label_col,
    test_size=0.2,
    random_state=42,
    stratify=label_col,
)

print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")

Training samples: 4457, Testing samples: 1115


In [6]:
# Learn the vocabulary and IDF weights from the training messages only, with
# English stop words removed.
vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(X_train)

# Project both partitions into the fitted TF-IDF feature space.
X_train_tfidf = vectorizer.transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [7]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features.
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Classify the held-out messages.
y_pred = model.predict(X_test_tfidf)

# Overall accuracy, reported as a percentage.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Trained Successfully! Accuracy: {accuracy * 100:.2f}%\n")

# Per-class precision / recall / F1 (class 0 is ham, class 1 is spam).
print("Classification Report:")
report = classification_report(y_test, y_pred)
print(report)

Model Trained Successfully! Accuracy: 96.86%

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



In [8]:
# A few hand-written messages for a quick sanity check of the trained model.
sample_messages = [
    "Congratulations! You've won a free lottery. Call now to claim your prize.",
    "Hey, are we still meeting for lunch today?",
    "URGENT! Your bank account has been compromised. Click this link to secure it.",
    "Let’s catch up this weekend!",
]

# Vectorize with the already-fitted TF-IDF vocabulary, then classify.
sample_tfidf = vectorizer.transform(sample_messages)
predictions = model.predict(sample_tfidf)

# Print every message next to its predicted class: a predicted label of 1 is
# reported as Spam, anything else as Ham.
for idx, msg in enumerate(sample_messages):
    label = predictions[idx]
    print(f"Message: {msg}\nPrediction: {'Spam' if label == 1 else 'Ham'}\n")

Message: Congratulations! You've won a free lottery. Call now to claim your prize.
Prediction: Spam

Message: Hey, are we still meeting for lunch today?
Prediction: Ham

Message: URGENT! Your bank account has been compromised. Click this link to secure it.
Prediction: Ham

Message: Let’s catch up this weekend!
Prediction: Ham



In [9]:
# A second, larger batch of hand-written messages to probe the classifier.
test_messages = [
    "You have been selected for a $500 gift card! Click the link to claim now.",
    "Don't forget about our team meeting at 3 PM.",
    "Win a brand new iPhone 15! Just answer this quick survey.",
    "Hey, can you send me the report by EOD?",
    "Exclusive deal for you: Get 50% off on all purchases today!",
    "I will call you later. Let’s plan for the weekend.",
    "URGENT: Your PayPal account has been suspended. Verify now to reactivate.",
    "Your OTP for login is 543210. Do not share with anyone.",
    "Limited-time offer! Buy one get one free. Order now before it’s gone!",
    "Hi, just checking in to see how you're doing.",
]

# Reuse the fitted vectorizer so the features line up with what the model saw
# during training.
test_tfidf = vectorizer.transform(test_messages)

# One predicted label per message, in the same order as `test_messages`.
test_predictions = model.predict(test_tfidf)

# Report each message with its predicted class (label 1 → Spam, otherwise Ham).
for position in range(len(test_messages)):
    msg, label = test_messages[position], test_predictions[position]
    print(f"Message: {msg}\nPrediction: {'Spam' if label == 1 else 'Ham'}\n")

Message: You have been selected for a $500 gift card! Click the link to claim now.
Prediction: Spam

Message: Don't forget about our team meeting at 3 PM.
Prediction: Ham

Message: Win a brand new iPhone 15! Just answer this quick survey.
Prediction: Ham

Message: Hey, can you send me the report by EOD?
Prediction: Ham

Message: Exclusive deal for you: Get 50% off on all purchases today!
Prediction: Ham

Message: I will call you later. Let’s plan for the weekend.
Prediction: Ham

Message: URGENT: Your PayPal account has been suspended. Verify now to reactivate.
Prediction: Spam

Message: Your OTP for login is 543210. Do not share with anyone.
Prediction: Ham

Message: Limited-time offer! Buy one get one free. Order now before it’s gone!
Prediction: Ham

Message: Hi, just checking in to see how you're doing.
Prediction: Ham

