In [1]:
# Install required libraries
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Download NLTK stopwords
nltk.download("stopwords")

# ---------------------------------------------------
# 1. Download SMS Spam Collection dataset automatically
# ---------------------------------------------------

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"

import urllib.request
import zipfile

urllib.request.urlretrieve(url, "smsspamcollection.zip")

with zipfile.ZipFile("smsspamcollection.zip", "r") as zip_ref:
    zip_ref.extractall()

# Load dataset (tab-separated)
df = pd.read_table("SMSSpamCollection", header=None, names=["label", "text"])


# ---------------------------------------------------
# 2. Preprocessing Labels
# ---------------------------------------------------
df["label"] = df["label"].map({"ham": 0, "spam": 1})


# ---------------------------------------------------
# 3. Text Cleaning Function
# ---------------------------------------------------
def clean_text(text):
    text = text.lower()
    text = re.sub(r"\W+", " ", text)
    words = text.split()
    words = [w for w in words if w not in stopwords.words("english")]
    return " ".join(words)

df["clean_text"] = df["text"].apply(clean_text)


# ---------------------------------------------------
# 4. Train-Test Split
# ---------------------------------------------------
X = df["clean_text"]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)


# ---------------------------------------------------
# 5. Vectorization
# ---------------------------------------------------
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


# ---------------------------------------------------
# 6. Logistic Regression Model
# ---------------------------------------------------
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)


# ---------------------------------------------------
# 7. Evaluation
# ---------------------------------------------------
y_pred = model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Accuracy: 0.9669777458722182

Confusion Matrix:
 [[1205    1]
 [  45  142]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1206
           1       0.99      0.76      0.86       187

    accuracy                           0.97      1393
   macro avg       0.98      0.88      0.92      1393
weighted avg       0.97      0.97      0.97      1393

