<a href="https://colab.research.google.com/github/gj0210/CMP7239/blob/main/parth.......Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# =============================
# Spam Email Detection Project
# =============================

# Import necessary libraries
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# -----------------------------
# 1. Load Dataset
# -----------------------------
# Replace with actual dataset path or Kaggle download
# Example: Enron Spam Dataset
data = pd.read_csv("spam.csv", encoding='latin-1')  # placeholder file name
data = data[['v1', 'v2']]
data.columns = ['label', 'message']

# Convert labels to binary: spam = 1, ham = 0
data['label'] = data['label'].map({'spam': 1, 'ham': 0})

# -----------------------------
# 2. Text Preprocessing
# -----------------------------
def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = text.strip()  # Remove leading/trailing spaces
    return text

data['cleaned_text'] = data['message'].apply(preprocess_text)

# -----------------------------
# 3. Train-Test Split
# -----------------------------
X = data['cleaned_text']
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -----------------------------
# 4. Feature Extraction (TF-IDF)
# -----------------------------
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# -----------------------------
# 5. Train Models
# -----------------------------

# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)
nb_pred = nb_model.predict(X_test_tfidf)

# Support Vector Machine
svm_model = LinearSVC()
svm_model.fit(X_train_tfidf, y_train)
svm_pred = svm_model.predict(X_test_tfidf)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)
rf_pred = rf_model.predict(X_test_tfidf)

# -----------------------------
# 6. Evaluation Function
# -----------------------------
def evaluate_model(y_true, y_pred, model_name):
    print(f"\n--- {model_name} Evaluation ---")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"Recall: {recall_score(y_true, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_true, y_pred):.4f}")
    print(classification_report(y_true, y_pred))

# Evaluate all models
evaluate_model(y_test, nb_pred, "Naive Bayes")
evaluate_model(y_test, svm_pred, "SVM")
evaluate_model(y_test, rf_pred, "Random Forest")

# -----------------------------
# 7. Best Model Recommendation
# -----------------------------
# Compare F1 Scores
f1_scores = {
    'Naive Bayes': f1_score(y_test, nb_pred),
    'SVM': f1_score(y_test, svm_pred),
    'Random Forest': f1_score(y_test, rf_pred)
}

best_model = max(f1_scores, key=f1_scores.get)
print(f"\n✅ Best Performing Model: {best_model} with F1 Score = {f1_scores[best_model]:.4f}")


FileNotFoundError: [Errno 2] No such file or directory: 'spam.csv'

In [5]:
!wget https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip -O sms_spam_collection.zip
!unzip -o sms_spam_collection.zip

--2025-08-22 19:30:29--  https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘sms_spam_collection.zip’

sms_spam_collection     [ <=>                ] 198.65K  1017KB/s    in 0.2s    

2025-08-22 19:30:29 (1017 KB/s) - ‘sms_spam_collection.zip’ saved [203415]

Archive:  sms_spam_collection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  


In [6]:
# =============================
# Spam Email Detection Project
# =============================

# Import necessary libraries
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# -----------------------------
# 1. Load Dataset
# -----------------------------
# Replace with actual dataset path or Kaggle download
# Example: Enron Spam Dataset
data = pd.read_csv("SMSSpamCollection", sep='\t', names=['label', 'message'], encoding='latin-1') # Load the correct file name and specify the separator
# data = data[['v1', 'v2']] # This line is no longer needed as we specified names
# data.columns = ['label', 'message'] # This line is no longer needed as we specified names

# Convert labels to binary: spam = 1, ham = 0
data['label'] = data['label'].map({'spam': 1, 'ham': 0})

# -----------------------------
# 2. Text Preprocessing
# -----------------------------
def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = text.strip()  # Remove leading/trailing spaces
    return text

data['cleaned_text'] = data['message'].apply(preprocess_text)

# -----------------------------
# 3. Train-Test Split
# -----------------------------
X = data['cleaned_text']
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -----------------------------
# 4. Feature Extraction (TF-IDF)
# -----------------------------
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# -----------------------------
# 5. Train Models
# -----------------------------

# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)
nb_pred = nb_model.predict(X_test_tfidf)

# Support Vector Machine
svm_model = LinearSVC()
svm_model.fit(X_train_tfidf, y_train)
svm_pred = svm_model.predict(X_test_tfidf)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)
rf_pred = rf_model.predict(X_test_tfidf)

# -----------------------------
# 6. Evaluation Function
# -----------------------------
def evaluate_model(y_true, y_pred, model_name):
    print(f"\n--- {model_name} Evaluation ---")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"Recall: {recall_score(y_true, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_true, y_pred):.4f}")
    print(classification_report(y_true, y_pred))

# Evaluate all models
evaluate_model(y_test, nb_pred, "Naive Bayes")
evaluate_model(y_test, svm_pred, "SVM")
evaluate_model(y_test, rf_pred, "Random Forest")

# -----------------------------
# 7. Best Model Recommendation
# -----------------------------
# Compare F1 Scores
f1_scores = {
    'Naive Bayes': f1_score(y_test, nb_pred),
    'SVM': f1_score(y_test, svm_pred),
    'Random Forest': f1_score(y_test, rf_pred)
}

best_model = max(f1_scores, key=f1_scores.get)
print(f"\n✅ Best Performing Model: {best_model} with F1 Score = {f1_scores[best_model]:.4f}")


--- Naive Bayes Evaluation ---
Accuracy: 0.9686
Precision: 1.0000
Recall: 0.7651
F1 Score: 0.8669
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115


--- SVM Evaluation ---
Accuracy: 0.9892
Precision: 0.9928
Recall: 0.9262
F1 Score: 0.9583
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.99      0.93      0.96       149

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.98      1115
weighted avg       0.99      0.99      0.99      1115


--- Random Forest Evaluation ---
Accuracy: 0.9749
Precision: 1.0000
Recall: 0.8121
F1 Score: 0.8963
              precision    recall  f1-score   support

