# Spam SMS Detection
This notebook covers the task of classifying SMS messages as spam or legitimate.
We will use the SMS Spam Collection dataset.
The notebook includes dataset download, preprocessing, model training, and evaluation.

In [None]:
# Install the third-party packages this notebook relies on.
# The leading `!` hands the line to the system shell from the notebook,
# and `-q` asks pip for quiet output.
!pip install -q pandas scikit-learn numpy


In [None]:
# Download dataset
import os
import urllib.request
import zipfile

dataset_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
dataset_zip = 'smsspamcollection.zip'
dataset_folder = 'smsspamcollection'

# Fetch the archive only when no local copy is present. The transfer is
# written to a temporary name and renamed once it has finished, so an
# interrupted download never leaves a truncated zip that a later run would
# mistake for a complete one.
if not os.path.exists(dataset_zip):
    print('Downloading dataset...')
    partial_zip = dataset_zip + '.part'
    try:
        urllib.request.urlretrieve(dataset_url, partial_zip)
        os.replace(partial_zip, dataset_zip)
    finally:
        # After a successful rename the partial file is gone; this only
        # fires when the download or the rename failed.
        if os.path.exists(partial_zip):
            os.remove(partial_zip)
    print('Download complete.')
else:
    print('Dataset already exists.')

# Extract dataset
# Check for the data file rather than the folder: the folder can already
# exist after an extraction that failed before the file was written.
extracted_file = os.path.join(dataset_folder, 'SMSSpamCollection')
if not os.path.exists(extracted_file):
    with zipfile.ZipFile(dataset_zip, 'r') as zip_ref:
        zip_ref.extractall(dataset_folder)
    print('Dataset extracted.')
else:
    print('Dataset already extracted.')


In [None]:
# Load dataset
import csv
import os

import pandas as pd

data_path = os.path.join(dataset_folder, 'SMSSpamCollection')

# The file is read as tab-separated text with no header row: one label
# column and one message column. Quote handling is switched off because the
# messages are raw text, not CSV-quoted fields: with the default setting a
# message that begins with a double quote is treated as a quoted field and
# can absorb the lines that follow it, silently merging rows.
df = pd.read_csv(
    data_path,
    sep='\t',
    header=None,
    names=['label', 'message'],
    quoting=csv.QUOTE_NONE,
)
df.head()

## Data Preprocessing
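- Encode labels as integers
- Split the data into training (80%) and test (20%) sets
- Use TF-IDF vectorization for text features (fitted on the training set only)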


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Turn the string labels into integer codes. The fitted encoder is kept so
# the original class names (label_encoder.classes_) remain available.
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

X = df['message']
y = df['label_encoded']

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
# Stratifying on the label keeps the class proportions the same in both
# splits; a purely random split can shift the minority-class share of the
# test set and distort the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the vocabulary and IDF weights from the training messages only, then
# reuse them for the test messages so no test-set statistics leak into the
# features. The vocabulary is capped at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


## Model Training
We will train a multinomial Naive Bayes classifier on the TF-IDF features to detect spam messages, then evaluate it on the held-out test set.

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Create the classifier and fit it on the training features in one step;
# fit() hands back the estimator itself.
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predicted labels for the held-out test features.
y_pred = model.predict(X_test_tfidf)

# Overall accuracy on the test split.
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', test_accuracy)

# Per-class metrics, labelled with the original class names.
report_text = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
)
print('\nClassification Report:\n', report_text)