# EDA  


In [1]:
import pandas as pd
import numpy as np

In [4]:
# Load data
df = pd.read_csv('C:\\Users\\dell\\Documents\\sms-spam-detection\\data\\spam.csv', encoding='latin-1')
df = df[['v1', 'v2']]  # Keep only label and text columns
df.columns = ['label', 'message']

In [5]:
# Explore
print(df.head())
print(df['label'].value_counts())
print(f"Dataset shape: {df.shape}")

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
label
ham     4825
spam     747
Name: count, dtype: int64
Dataset shape: (5572, 2)


In [6]:
df.isnull().sum()
df.duplicated().sum()

np.int64(403)

In [7]:
df['char_len'] = df['message'].str.len()
df['word_count'] = df['message'].str.split().str.len()

In [8]:
df.groupby('label')[['char_len', 'word_count']].describe()


Unnamed: 0_level_0,char_len,char_len,char_len,char_len,char_len,char_len,char_len,char_len,word_count,word_count,word_count,word_count,word_count,word_count,word_count,word_count
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
ham,4825.0,71.023627,58.016023,2.0,33.0,52.0,92.0,910.0,4825.0,14.200622,11.424511,1.0,7.0,11.0,19.0,171.0
spam,747.0,138.866131,29.183082,13.0,132.5,149.0,157.0,224.0,747.0,23.851406,5.811898,2.0,22.0,25.0,28.0,35.0


Spam messages have higher average length and word count compared to ham messages.

# data preprpcessing 


## Clean text data

In [9]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

# Download stopwords if needed
nltk.download('stopwords')

def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

df['cleaned_message'] = df['message'].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Encode labels


In [10]:
# Convert labels to binary (spam=1, ham=0)
df['label_encoded'] = df['label'].map({'ham': 0, 'spam': 1})

# Feature Engineering

## Text vectorization

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag of Words
count_vectorizer = CountVectorizer(max_features=5000)
X_count = count_vectorizer.fit_transform(df['cleaned_message'])

# TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(df['cleaned_message'])

## Create additional features


In [12]:
# Text length features
df['message_length'] = df['message'].apply(len)
df['word_count'] = df['message'].apply(lambda x: len(x.split()))

# Check for spam keywords
spam_keywords = ['free', 'win', 'winner', 'click', 'urgent', 'cash', 'prize']
for keyword in spam_keywords:
    df[f'contains_{keyword}'] = df['message'].str.contains(keyword, case=False).astype(int)

# Train/Test Split

In [13]:
from sklearn.model_selection import train_test_split

# Using TF-IDF features
X = X_tfidf
y = df['label_encoded']

# Split data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Model Building & Training

## Implement baseline models

In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

models = {
    'MultinomialNB': MultinomialNB(),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'SVM': SVC(kernel='linear', probability=True),
    'RandomForest': RandomForestClassifier(n_estimators=100)
}

## Train all models

In [15]:
trained_models = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    trained_models[name] = model
    print(f"{name} trained successfully")

MultinomialNB trained successfully
LogisticRegression trained successfully
SVM trained successfully
RandomForest trained successfully


# Model Evaluation

## Create evaluation function

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None
    
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
    }
    
    if y_proba is not None:
        metrics['roc_auc'] = roc_auc_score(y_test, y_proba)
    
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    
    return metrics, cm

## Compare all models

In [17]:
results = {}
for name, model in trained_models.items():
    metrics, cm = evaluate_model(model, X_test, y_test)
    results[name] = metrics
    
    print(f"\n{name}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.3f}")


MultinomialNB:
  accuracy: 0.963
  precision: 1.000
  recall: 0.725
  f1: 0.840
  roc_auc: 0.976

LogisticRegression:
  accuracy: 0.967
  precision: 0.991
  recall: 0.758
  f1: 0.859
  roc_auc: 0.987

SVM:
  accuracy: 0.984
  precision: 0.992
  recall: 0.886
  f1: 0.936
  roc_auc: 0.985

RandomForest:
  accuracy: 0.970
  precision: 1.000
  recall: 0.779
  f1: 0.875
  roc_auc: 0.992


# Hyperparameter Tuning

In [18]:
from sklearn.model_selection import GridSearchCV

# Example: Tune Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.3f}")

Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
Best score: 0.905


# Model Interpretation

## Feature importance

In [19]:
# For Logistic Regression
feature_names = tfidf_vectorizer.get_feature_names_out()
coefs = trained_models['LogisticRegression'].coef_[0]

# Get top 20 features for spam
top_spam_features = pd.DataFrame({
    'feature': feature_names,
    'coefficient': coefs
}).sort_values('coefficient', ascending=False).head(20)

print("Top 20 features indicating spam:")
print(top_spam_features)

Top 20 features indicating spam:
      feature  coefficient
4580      txt     4.776293
474      call     4.311929
1244     free     3.524839
4242     stop     3.376621
4486       to     3.291434
4397     text     3.259726
607     claim     3.108432
3816    reply     2.979442
1265     from     2.749723
2955   mobile     2.715182
4969     your     2.547711
3420       or     2.428619
4823      win     2.414735
3992  service     2.391343
4859      won     2.296210
3648    prize     2.215809
3326      now     2.099242
570      chat     2.059531
4636   urgent     1.986166
1222      for     1.956557


## Error analysis

In [20]:
# Get misclassified examples
model = trained_models['LogisticRegression']
y_pred = model.predict(X_test)

misclassified_indices = np.where(y_pred != y_test)[0]
misclassified_samples = df.iloc[misclassified_indices]

print(f"Number of misclassified samples: {len(misclassified_samples)}")

Number of misclassified samples: 37


# Model Deployment Preparation