In [6]:
pip install pandas numpy scikit-learn nltk

Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd 
df = pd.read_csv(r"C:\Users\Indhu\Downloads\spam.csv",encoding='latin-1')

# Keep only the necessary columns
df = df[['v1', 'v2']]
df.columns = ['label', 'message']  # Rename columns

# Convert labels to binary (spam = 1, ham = 0)
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Display dataset info
print(df.head())
print(df['label'].value_counts())


   label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...
label
0    4825
1     747
Name: count, dtype: int64


In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    tokens = word_tokenize(text)  # Tokenization
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords.words('english')]  # Lemmatization & stopword removal
    return ' '.join(tokens)

# Apply preprocessing
df['cleaned_message'] = df['message'].apply(preprocess_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Indhu\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Indhu\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Indhu\AppData\Roaming\nltk_data...


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_message'])

# Labels
y = df['label']

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Naïve Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9739910313901345
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.81      0.89       150

    accuracy                           0.97      1115
   macro avg       0.99      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [11]:
def predict_sms(text):
    text = preprocess_text(text)  # Preprocess the SMS
    text_tfidf = vectorizer.transform([text])  # Convert to TF-IDF
    prediction = model.predict(text_tfidf)[0]  # Predict
    return "Spam" if prediction == 1 else "Not Spam"

# Test example
sms_text = "Congratulations! You've won a free iPhone. Click here to claim now!"
print(predict_sms(sms_text))


Spam


In [12]:
import joblib

# Save model and vectorizer
joblib.dump(model, 'sms_spam_detector.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [16]:
# Load model
model = joblib.load('sms_spam_detector.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Predict spam or not
print(predict_sms("how do you do!"))


Not Spam
