In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [10]:
# Load dataset
df = pd.read_csv('smsspamcollection/SMSSpamCollection.txt',
                    sep='\t',names=["label","message"])

In [11]:
# Convert label to binary
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

#### Text Preprocessing with spaCy

In [12]:
import spacy
# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

In [13]:
def clean_text_spacy(message):
    doc = nlp(message.lower())
    tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
    return ' '.join(tokens)

# Apply to all text
df['cleaned_text'] = df['message'].apply(clean_text_spacy)

#### Feature Extraction using Bag of Words

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Use BoW model
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['cleaned_text'])
y = df['label']

#### Train-Test Split & Model Training

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

####  Evaluation

In [16]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9596412556053812

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.96      0.98       966
           1       0.80      0.94      0.86       149

    accuracy                           0.96      1115
   macro avg       0.89      0.95      0.92      1115
weighted avg       0.96      0.96      0.96      1115

