In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Load the raw SMS dataset. The CSV is read with latin-1 encoding.
DATA_PATH = '/kaggle/input/spamdata/spam.csv'
try:
    df = pd.read_csv(DATA_PATH, encoding='latin-1')
except FileNotFoundError as err:
    # Raise instead of calling exit(): inside a notebook exit() tears down the
    # session rather than just stopping this cell, and it hides the traceback.
    # Re-raising keeps the kernel alive and stops "Run All" at the failing cell.
    raise FileNotFoundError(
        f"'spam.csv' not found at '{DATA_PATH}'. Please check the file path."
    ) from err

In [3]:
# Drop the three 'Unnamed' columns and give the remaining two descriptive names.
# Written as a chain that rebinds `df` (no inplace=True) so the cell can be
# re-run safely: errors='ignore' skips columns that were already dropped, and
# rename() leaves already-renamed columns untouched.
df = (
    df
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

In [4]:
# Show the first rows followed by the frame summary (dtypes / non-null counts).
print(
    "--- Initial Data ---",
    df.head(),
    "\n--- Data Info ---",
    sep="\n",
)
df.info()

--- Initial Data ---
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...

--- Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [5]:
# Make sure the NLTK English stopword corpus is available before it is used.
try:
    stopwords.words('english')
except LookupError:
    # NLTK raises LookupError when a corpus is not installed locally. Catching
    # only that keeps unrelated failures (and KeyboardInterrupt) visible
    # instead of silently triggering a download.
    nltk.download('stopwords')

In [6]:
# Container for the cleaned, stemmed version of every message
corpus = []
# Single stemmer instance, reused by the preprocessing loop and predict_spam()
ps = PorterStemmer()

In [7]:
# Read the stopword list once and keep it as a set. The previous version called
# stopwords.words('english') for every single token, re-reading the list each
# time and doing a linear membership scan.
english_stopwords = set(stopwords.words('english'))

# Rebuild the corpus from scratch so re-running this cell does not append a
# second copy of every message.
corpus = []
# Iterate over the values directly instead of df['message'][i]; label-based
# lookup by i only works while the index is exactly 0..n-1.
for raw_message in df['message']:
    # Replace every character that is not a letter with a space
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_message)
    # Lowercase, then split on whitespace into tokens
    tokens = letters_only.lower().split()
    # Drop stopwords and stem the remaining tokens
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    # Store the cleaned message as a single space-joined string
    corpus.append(' '.join(stems))

In [8]:
# TF-IDF features over the cleaned corpus, capped at 2500 vocabulary terms.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split below, so its vocabulary and IDF weights are computed with
# the test messages included.
tfidf = TfidfVectorizer(max_features=2500)
tfidf_matrix = tfidf.fit_transform(corpus)
# Densify the result into a plain array for the downstream cells
X = tfidf_matrix.toarray()

In [9]:
# One-hot encode the label column and keep only the 'spam' indicator as the
# target vector.
label_indicators = pd.get_dummies(df['label'])
y = label_indicators['spam'].values

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [11]:
# Multinomial Naive Bayes with default settings, fitted on the training split.
# fit() returns the estimator itself, so spam_detect_model is the fitted model.
nb_classifier = MultinomialNB()
spam_detect_model = nb_classifier.fit(X_train, y_train)

In [12]:


# Score the held-out split with the fitted classifier
y_pred = spam_detect_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Headline metric
print("\n--- Model Evaluation ---")
print(f"Accuracy: {accuracy:.4f}")

# Detailed breakdowns: each heading is followed by the corresponding report
for heading, metric_fn in (
    ("\nConfusion Matrix:", confusion_matrix),
    ("\nClassification Report:", classification_report),
):
    print(heading)
    print(metric_fn(y_test, y_pred))


--- Model Evaluation ---
Accuracy: 0.9722

Confusion Matrix:
[[948   1]
 [ 30 136]]

Classification Report:
              precision    recall  f1-score   support

       False       0.97      1.00      0.98       949
        True       0.99      0.82      0.90       166

    accuracy                           0.97      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [13]:
def predict_spam(sms_message):
    """
    Predicts if a given SMS message is spam or not.

    The message is cleaned the same way as the training corpus (non-letters
    replaced by spaces, lowercased, stopwords removed, remaining words
    stemmed), vectorized with the already-fitted ``tfidf`` and classified by
    ``spam_detect_model``. Relies on the module-level ``ps``, ``stopwords``,
    ``tfidf`` and ``spam_detect_model`` objects created in earlier cells.

    Args:
        sms_message: Raw SMS text as a string.

    Returns:
        A human-readable verdict string: "This message is likely SPAM." when
        the model predicts the positive class, otherwise
        "This message is likely NOT spam (ham).".
    """
    # Load the stopword list once per call and keep it as a set; calling
    # stopwords.words('english') inside the comprehension re-read it for
    # every token.
    english_stopwords = set(stopwords.words('english'))

    # Preprocess the input message
    letters_only = re.sub('[^a-zA-Z]', ' ', sms_message)
    tokens = letters_only.lower().split()
    stems = [ps.stem(word) for word in tokens if word not in english_stopwords]
    message = ' '.join(stems)

    # Transform the message using the trained TF-IDF vectorizer
    message_transformed = tfidf.transform([message]).toarray()

    # Predict using the trained model
    prediction = spam_detect_model.predict(message_transformed)

    if prediction[0] == 1:
        return "This message is likely SPAM."
    return "This message is likely NOT spam (ham)."

# Demo: run the classifier on two hand-written messages
new_sms_spam = "Congratulations! You've get job in london"
new_sms_ham = "Hey, are we still on for dinner tonight at 7?"

print("\n--- Predictions on New SMS ---")
for sms in (new_sms_spam, new_sms_ham):
    # Echo the message in quotes next to the model's verdict
    print(f"'{sms}' -> {predict_spam(sms)}")


--- Predictions on New SMS ---
'Congratulations! You've get job in london' -> This message is likely NOT spam (ham).
'Hey, are we still on for dinner tonight at 7?' -> This message is likely NOT spam (ham).
