In [None]:
import numpy as np
import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import confusion_matrix, classification_report
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load the SMS dataset: a tab-separated file with no header row, so the two
# column names ('label', 'message') are supplied explicitly.
# NOTE(review): the path is absolute (looks like a Colab location) — adjust it
# when running elsewhere.
# NOTE(review): with pandas' default quoting, a '"' inside a message can be
# read as a field quote; if the row count looks short, check whether
# quoting=csv.QUOTE_NONE is needed for this file.
data = pd.read_csv(
    '/content/SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'message'],
)
print(data.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [None]:
# Download the NLTK resources used by the preprocessing cell.
# 'punkt_tab' is requested in addition to 'punkt' because newer NLTK releases
# load the tokenizer data for word_tokenize() from 'punkt_tab' and raise
# LookupError when only 'punkt' is installed; older releases keep using 'punkt'.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Create a list to hold cleaned messages
corpus = []

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Build the stopword set once: it does not depend on the message, so there is
# no reason to re-read the NLTK stopword corpus on every iteration.
stop_words = set(stopwords.words('english'))

# Start from an empty list in this cell so that re-running it does not append
# to the results of a previous run (which would make the list longer than the
# DataFrame and break the column assignment at the bottom).
corpus = []

# Clean every message: strip non-letters, lowercase, tokenize, drop stopwords,
# stem, lemmatize, and join the tokens back into one string.
for message in data['message']:
    # Keep only ASCII letters and whitespace, then lowercase
    message = re.sub(r'[^a-zA-Z\s]', '', message).lower()

    # Tokenize and remove stopwords.
    # NOTE(review): apostrophes are stripped above, so contractions such as
    # "don't" become "dont" before this filter and may no longer match the
    # stopword list — confirm whether that is intended.
    words = [word for word in word_tokenize(message) if word not in stop_words]

    # Stem first, then lemmatize the stems.
    # NOTE(review): lemmatizing already-stemmed tokens is probably close to a
    # no-op; the step is kept so the resulting features are unchanged.
    words_stemmed = [stemmer.stem(word) for word in words]
    words_lemmatized = [lemmatizer.lemmatize(word) for word in words_stemmed]

    # Join words back into a single string
    corpus.append(' '.join(words_lemmatized))

# Add cleaned messages to the DataFrame
data['cleaned_message'] = corpus

In [None]:
# Features are the cleaned message strings; labels are encoded as
# 0 (ham) / 1 (spam).
X = data['cleaned_message']
y = data['label'].map({'ham': 0, 'spam': 1})  # Convert labels to binary

# Series.map() turns any label that is missing from the dict into NaN.
# Fail here with a clear message instead of handing NaN targets to the model.
if y.isna().any():
    unexpected = sorted(data.loc[y.isna(), 'label'].astype(str).unique())
    raise ValueError(f"Unexpected label values: {unexpected}")

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified; consider stratify=y if the class
# ratio in the test set needs to be guaranteed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [None]:
# Bag-of-words features: the vocabulary is learned from the training messages
# only, and the test messages are then encoded with that same vocabulary.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(raw_documents=X_train)
X_test_vec = vectorizer.transform(raw_documents=X_test)

In [None]:
# Fit a Complement Naive Bayes classifier on the training count vectors
spam_detect_model = ComplementNB()
spam_detect_model.fit(X=X_train_vec, y=y_train)

In [None]:
# Predict a label (0 = ham, 1 = spam) for each held-out message; the bare
# name on the last line makes the notebook display the resulting array.
pred = spam_detect_model.predict(X=X_test_vec)
pred

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Score the predictions against the true test labels.
# In the confusion matrix, rows are true classes and columns are predicted classes.
confusion_m = confusion_matrix(y_true=y_test, y_pred=pred)
report = classification_report(y_true=y_test, y_pred=pred)

print("Confusion Matrix:\n", confusion_m)
print("\nClassification Report:\n", report)

Confusion Matrix:
 [[921  45]
 [  9 140]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.95      0.97       966
           1       0.76      0.94      0.84       149

    accuracy                           0.95      1115
   macro avg       0.87      0.95      0.90      1115
weighted avg       0.96      0.95      0.95      1115

