In [22]:
import numpy as np
import pandas as pd
import altair as alt

# Use the 'default' data transformer with no row cap (max_rows=None): the charts
# below are built directly from the full DataFrame rather than a sample.
alt.data_transformers.enable('default', max_rows=None)

DataTransformerRegistry.enable('default')

## Load and Prepare Data

In [23]:
def calculate_uppercase_ratio(text: str) -> float:
    """Return the share of alphabetic characters in ``text`` that are uppercase.

    Non-letter characters (digits, punctuation, whitespace) are ignored.
    A string containing no letters yields 0.0. The result is rounded to
    four decimal places.
    """
    total_letters = 0
    upper_letters = 0
    for ch in text:
        if ch.isalpha():
            total_letters += 1
            if ch.isupper():
                upper_letters += 1

    # Guard against division by zero for empty / letter-free strings.
    if total_letters == 0:
        return 0.0
    return round(upper_letters / total_letters, 4)

# Tab-separated file without a header row: first column is the label,
# second column is the message text.
# NOTE(review): default CSV quoting is in effect; if the raw file contains
# unbalanced '"' characters, adjacent lines could be merged — confirm the row
# count against the dataset's documentation.
df = pd.read_csv(
    '../data/SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'message'],
    encoding='utf-8',
)

# Hand-crafted per-message features used by the charts below.
df = df.assign(
    message_length=lambda d: d['message'].str.len(),
    word_count=lambda d: d['message'].str.split().str.len(),
    has_numbers=lambda d: d['message'].str.contains(r'\d').astype(int),
    has_currency=lambda d: d['message'].str.contains(r'[$£€]').astype(int),
    uppercase_ratio=lambda d: d['message'].apply(calculate_uppercase_ratio),
    exclamation_count=lambda d: d['message'].str.count('!'),
    question_count=lambda d: d['message'].str.count(r'\?'),
)

df.head()

Unnamed: 0,label,message,message_length,word_count,has_numbers,has_currency,uppercase_ratio,exclamation_count,question_count
0,ham,"Go until jurong point, crazy.. Available only ...",111,20,0,0,0.0361,0,0
1,ham,Ok lar... Joking wif u oni...,29,6,0,0,0.1111,0,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,28,1,0,0.1031,0,0
3,ham,U dun say so early hor... U c already then say...,49,11,0,0,0.0606,0,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,13,0,0,0.0426,0,0


## Dataset Overview

In [4]:
# Count messages per label and express each count as a share of the total.
label_counts = df['label'].value_counts().reset_index()
label_counts.columns = ['label', 'count']
label_counts['percentage'] = (label_counts['count'] / label_counts['count'].sum() * 100).round(1)

pie_chart = alt.Chart(label_counts).mark_arc(innerRadius=50).encode(
    theta=alt.Theta('count:Q', stack=True),
    color=alt.Color('label:N', scale=alt.Scale(domain=['ham', 'spam'], range=['#4CAF50', '#f44336'])),
    tooltip=['label', 'count', 'percentage']
).properties(
    title='Message Distribution: Ham vs Spam',
    width=300,
    height=300
)

# The labels must be stacked by the same field as the arcs. The arc layer is
# grouped by `label` through its color encoding; this layer uses a constant
# color, so `detail` supplies the grouping field. Without it the two layers can
# stack the rows in different orders and a percentage ends up on the wrong slice.
text = alt.Chart(label_counts).mark_text(radius=85, size=14).encode(
    theta=alt.Theta('count:Q', stack=True),
    detail='label:N',
    text=alt.Text('percentage:Q', format='.1f'),
    color=alt.value('white')
)

pie_chart + text

## Message Length Distribution

In [5]:
# Histogram of message length in characters (at most 50 bins), colored by label.
length_hist = (
    alt.Chart(df)
    .mark_bar(opacity=0.7)
    .encode(
        x=alt.X('message_length:Q', bin=alt.Bin(maxbins=50), title='Message Length (characters)'),
        y=alt.Y('count():Q', title='Count'),
        color=alt.Color(
            'label:N',
            scale=alt.Scale(domain=['ham', 'spam'], range=['#4CAF50', '#f44336']),
        ),
        tooltip=['label', 'count()'],
    )
    .properties(title='Message Length Distribution by Label', width=700, height=350)
)

length_hist

In [6]:
# Box plot of message length per label; whiskers span min to max and the
# y axis is fixed to the 0-400 range.
box_plot = (
    alt.Chart(df)
    .mark_boxplot(extent='min-max')
    .encode(
        x=alt.X('label:N', title='Label'),
        y=alt.Y(
            'message_length:Q',
            title='Message Length',
            scale=alt.Scale(domain=[0, 400]),
        ),
        color=alt.Color(
            'label:N',
            scale=alt.Scale(domain=['ham', 'spam'], range=['#4CAF50', '#f44336']),
        ),
    )
    .properties(title='Message Length: Ham vs Spam', width=300, height=350)
)

box_plot

## Feature Comparison

In [7]:
features = ['message_length', 'word_count', 'uppercase_ratio', 'exclamation_count', 'question_count']

# One record per (feature, label) pair holding that feature's mean within the label.
comparison_data = [
    {
        'feature': feature,
        'label': label,
        'mean': df.loc[df['label'] == label, feature].mean(),
    }
    for feature in features
    for label in ['ham', 'spam']
]

comparison_df = pd.DataFrame(comparison_data)

# Grouped bars: one group per feature, one bar per label inside each group.
feature_chart = (
    alt.Chart(comparison_df)
    .mark_bar()
    .encode(
        x=alt.X('feature:N', title='Feature', axis=alt.Axis(labelAngle=-45)),
        y=alt.Y('mean:Q', title='Mean Value'),
        color=alt.Color(
            'label:N',
            scale=alt.Scale(domain=['ham', 'spam'], range=['#4CAF50', '#f44336']),
        ),
        xOffset='label:N',
        tooltip=['feature', 'label', alt.Tooltip('mean:Q', format='.2f')],
    )
    .properties(title='Feature Comparison: Ham vs Spam', width=500, height=350)
)

feature_chart

In [8]:
# Fixed-seed sample of 1000 messages so the scatter is the same on every run.
scatter_sample = df.sample(n=1000, random_state=42)

scatter = (
    alt.Chart(scatter_sample)
    .mark_circle(size=60, opacity=0.6)
    .encode(
        x=alt.X('word_count:Q', title='Word Count'),
        y=alt.Y('message_length:Q', title='Message Length'),
        color=alt.Color(
            'label:N',
            scale=alt.Scale(domain=['ham', 'spam'], range=['#4CAF50', '#f44336']),
        ),
        tooltip=['label', 'word_count', 'message_length', 'message'],
    )
    .properties(title='Message Length vs Word Count (sample of 1000)', width=600, height=400)
    .interactive()
)

scatter

## Length Bucket Analysis

In [9]:
# Bucket boundaries and their display labels. pd.cut builds right-closed
# intervals by default, so a 50-character message falls in '0-50' and a
# 51-character one in '51-100'.
bucket_edges = [0, 50, 100, 150, 200, np.inf]
bucket_labels = ['0-50', '51-100', '101-150', '151-200', '200+']

df['length_bucket'] = pd.cut(
    df['message_length'],
    bins=bucket_edges,
    labels=bucket_labels
)

# Message count per (bucket, label); observed=True drops unused categories.
bucket_counts = df.groupby(['length_bucket', 'label'], observed=True).size().reset_index(name='count')

# Spam share per bucket. The left merge keeps buckets that have no spam rows;
# their missing spam count is filled with 0.
bucket_totals = bucket_counts.groupby('length_bucket', observed=True)['count'].sum().reset_index(name='total')
bucket_spam = bucket_counts[bucket_counts['label'] == 'spam'][['length_bucket', 'count']].rename(columns={'count': 'spam'})
spam_ratio = bucket_totals.merge(bucket_spam, on='length_bucket', how='left')
spam_ratio['spam'] = spam_ratio['spam'].fillna(0)
spam_ratio['spam_percentage'] = (spam_ratio['spam'] / spam_ratio['total'] * 100).round(1)

stacked_bar = alt.Chart(bucket_counts).mark_bar().encode(
    # Explicit sort: as a nominal field the labels would otherwise be ordered as
    # strings ('101-150' before '51-100') instead of by message length.
    x=alt.X('length_bucket:N', title='Message Length Bucket', sort=bucket_labels),
    y=alt.Y('count:Q', title='Count'),
    color=alt.Color('label:N', scale=alt.Scale(domain=['ham', 'spam'], range=['#4CAF50', '#f44336'])),
    tooltip=['length_bucket', 'label', 'count']
).properties(
    title='Message Count by Length Bucket',
    width=500,
    height=350
)

stacked_bar

In [10]:
# Keep the buckets in length order; as a nominal field they would otherwise be
# sorted as strings ('101-150' before '51-100').
bucket_order = list(df['length_bucket'].cat.categories)

# Overall spam share of the dataset, computed from the data instead of being
# hard-coded, so the reference line stays correct if the input file changes.
overall_spam_pct = round(float((df['label'] == 'spam').mean() * 100), 1)

spam_ratio_chart = alt.Chart(spam_ratio).mark_bar(color='#f44336').encode(
    x=alt.X('length_bucket:N', title='Message Length Bucket', sort=bucket_order),
    y=alt.Y('spam_percentage:Q', title='Spam Percentage (%)', scale=alt.Scale(domain=[0, 100])),
    tooltip=['length_bucket', alt.Tooltip('spam_percentage:Q', format='.1f')]
).properties(
    title='Spam Percentage by Message Length',
    width=500,
    height=300
)

# Dashed horizontal line marking the dataset-wide spam percentage.
rule = alt.Chart(pd.DataFrame({'y': [overall_spam_pct]})).mark_rule(color='orange', strokeDash=[5, 5]).encode(
    y='y:Q'
)

spam_ratio_chart + rule

## Word Frequency Analysis

In [11]:
def get_word_frequency(texts, top_n=15):
    """Return the ``top_n`` most frequent words across ``texts``.

    The texts are joined with spaces, lower-cased and split on whitespace.
    A token is counted only if it is purely alphabetic, longer than two
    characters and not in the built-in stop-word set; tokens with attached
    punctuation or digits are therefore dropped.

    Args:
        texts: Iterable of strings.
        top_n: Maximum number of words to return.

    Returns:
        DataFrame with columns ``word`` and ``count``, most frequent first.
    """
    stop_words = {'i', 'me', 'my', 'you', 'your', 'we', 'the', 'a', 'an', 'is', 'are',
                  'was', 'to', 'of', 'and', 'in', 'it', 'for', 'on', 'with', 'at', 'be',
                  'this', 'that', 'have', 'do', 'will', 'can', 'but', 'or', 'so', 'if',
                  'just', 'not', 'u', 'ur', 'im', 'dont', 'its', 'got', 'get', 'been'}

    def is_countable(token):
        return token.isalpha() and len(token) > 2 and token not in stop_words

    tokens = ' '.join(texts).lower().split()
    kept_tokens = [token for token in tokens if is_countable(token)]
    top_counts = pd.Series(kept_tokens).value_counts().head(top_n)

    return pd.DataFrame({'word': top_counts.index, 'count': top_counts.values})

# Top words per class, each tagged with the class it came from.
spam_words = get_word_frequency(df.loc[df['label'] == 'spam', 'message']).assign(label='spam')
ham_words = get_word_frequency(df.loc[df['label'] == 'ham', 'message']).assign(label='ham')

# Horizontal bar charts, words ordered by descending frequency.
spam_chart = (
    alt.Chart(spam_words)
    .mark_bar(color='#f44336')
    .encode(
        x=alt.X('count:Q', title='Frequency'),
        y=alt.Y('word:N', sort='-x', title='Word'),
        tooltip=['word', 'count'],
    )
    .properties(title='Top 15 Words in Spam Messages', width=350, height=400)
)

ham_chart = (
    alt.Chart(ham_words)
    .mark_bar(color='#4CAF50')
    .encode(
        x=alt.X('count:Q', title='Frequency'),
        y=alt.Y('word:N', sort='-x', title='Word'),
        tooltip=['word', 'count'],
    )
    .properties(title='Top 15 Words in Ham Messages', width=350, height=400)
)

spam_chart | ham_chart

## Percentile Analysis

In [12]:
percentiles = [10, 25, 50, 75, 90, 95, 99]

# Message lengths per label, then one record per (label, percentile).
# Percentile values are truncated to whole characters via int().
lengths_by_label = {
    label: df.loc[df['label'] == label, 'message_length']
    for label in ['ham', 'spam']
}
percentile_data = [
    {
        'label': label,
        'percentile': p,
        'message_length': int(np.percentile(lengths, p)),
    }
    for label, lengths in lengths_by_label.items()
    for p in percentiles
]

percentile_df = pd.DataFrame(percentile_data)

percentile_chart = (
    alt.Chart(percentile_df)
    .mark_line(point=True, strokeWidth=3)
    .encode(
        x=alt.X('percentile:Q', title='Percentile', scale=alt.Scale(domain=[0, 100])),
        y=alt.Y('message_length:Q', title='Message Length'),
        color=alt.Color(
            'label:N',
            scale=alt.Scale(domain=['ham', 'spam'], range=['#4CAF50', '#f44336']),
        ),
        tooltip=['label', 'percentile', 'message_length'],
    )
    .properties(title='Message Length Percentiles: Ham vs Spam', width=600, height=350)
)

percentile_chart

## Spam Indicators: Uppercase & Special Characters

In [13]:
# Histogram of the per-message uppercase ratio (at most 30 bins), colored by label.
uppercase_hist = (
    alt.Chart(df)
    .mark_bar(opacity=0.7)
    .encode(
        x=alt.X('uppercase_ratio:Q', bin=alt.Bin(maxbins=30), title='Uppercase Ratio'),
        y=alt.Y('count():Q', title='Count'),
        color=alt.Color(
            'label:N',
            scale=alt.Scale(domain=['ham', 'spam'], range=['#4CAF50', '#f44336']),
        ),
        tooltip=['label', 'count()'],
    )
    .properties(title='Uppercase Ratio Distribution', width=600, height=300)
)

uppercase_hist

In [14]:
# Messages per (exclamation count, label), keeping only counts of 10 or fewer.
exclaim_data = (
    df.groupby(['exclamation_count', 'label'])
    .size()
    .reset_index(name='count')
    .loc[lambda counts: counts['exclamation_count'] <= 10]
)

exclaim_chart = (
    alt.Chart(exclaim_data)
    .mark_bar()
    .encode(
        x=alt.X('exclamation_count:O', title='Number of Exclamation Marks'),
        y=alt.Y('count:Q', title='Message Count'),
        color=alt.Color(
            'label:N',
            scale=alt.Scale(domain=['ham', 'spam'], range=['#4CAF50', '#f44336']),
        ),
        xOffset='label:N',
        tooltip=['exclamation_count', 'label', 'count'],
    )
    .properties(title='Exclamation Marks Usage: Ham vs Spam', width=600, height=300)
)

exclaim_chart

## Currency & Numbers as Spam Indicators

In [15]:
# Messages per (currency flag, label), with the 0/1 flag replaced by a readable name.
currency_data = (
    df.groupby(['has_currency', 'label'])
    .size()
    .reset_index(name='count')
    .assign(has_currency=lambda counts: counts['has_currency'].map({0: 'No Currency', 1: 'Has Currency'}))
)

currency_chart = (
    alt.Chart(currency_data)
    .mark_bar()
    .encode(
        x=alt.X('has_currency:N', title='Currency Symbol Presence'),
        y=alt.Y('count:Q', title='Message Count'),
        color=alt.Color(
            'label:N',
            scale=alt.Scale(domain=['ham', 'spam'], range=['#4CAF50', '#f44336']),
        ),
        xOffset='label:N',
        tooltip=['has_currency', 'label', 'count'],
    )
    .properties(title='Currency Symbols as Spam Indicator', width=400, height=300)
)

# Same breakdown for the digit flag.
numbers_data = (
    df.groupby(['has_numbers', 'label'])
    .size()
    .reset_index(name='count')
    .assign(has_numbers=lambda counts: counts['has_numbers'].map({0: 'No Numbers', 1: 'Has Numbers'}))
)

numbers_chart = (
    alt.Chart(numbers_data)
    .mark_bar()
    .encode(
        x=alt.X('has_numbers:N', title='Numbers Presence'),
        y=alt.Y('count:Q', title='Message Count'),
        color=alt.Color(
            'label:N',
            scale=alt.Scale(domain=['ham', 'spam'], range=['#4CAF50', '#f44336']),
        ),
        xOffset='label:N',
        tooltip=['has_numbers', 'label', 'count'],
    )
    .properties(title='Numbers as Spam Indicator', width=400, height=300)
)

currency_chart | numbers_chart

## Key Insights Summary

In [16]:
# Display name -> feature column for the three headline metrics.
metric_columns = {
    'Message Length': 'message_length',
    'Uppercase Ratio': 'uppercase_ratio',
    'Exclamation Marks': 'exclamation_count',
}

ham_rows = df[df['label'] == 'ham']
spam_rows = df[df['label'] == 'spam']

# Per-class mean of each metric, plus the spam-to-ham ratio of those means.
insights_data = pd.DataFrame({
    'metric': list(metric_columns),
    'ham': [ham_rows[column].mean() for column in metric_columns.values()],
    'spam': [spam_rows[column].mean() for column in metric_columns.values()],
})

insights_data['spam_vs_ham_ratio'] = insights_data['spam'] / insights_data['ham']

# Bars are red where the metric is higher in spam (ratio > 1), green otherwise.
ratio_chart = (
    alt.Chart(insights_data)
    .mark_bar()
    .encode(
        x=alt.X('spam_vs_ham_ratio:Q', title='Spam / Ham Ratio (1.0 = equal)'),
        y=alt.Y('metric:N', title='', sort='-x'),
        color=alt.condition(
            alt.datum.spam_vs_ham_ratio > 1,
            alt.value('#f44336'),
            alt.value('#4CAF50'),
        ),
        tooltip=['metric', alt.Tooltip('spam_vs_ham_ratio:Q', format='.2f')],
    )
    .properties(title='Key Spam Indicators: How Much More in Spam vs Ham', width=500, height=200)
)

# Dashed vertical reference line at ratio 1 (spam equals ham).
rule = (
    alt.Chart(pd.DataFrame({'x': [1]}))
    .mark_rule(color='black', strokeDash=[3, 3])
    .encode(x='x:Q')
)

ratio_chart + rule

## Machine Learning Model

In [24]:
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# Sentence-embedding model used to turn each message into a vector.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
encoder = SentenceTransformer(model_name)

# Encode every message in a single call; the array's shape is the cell output.
message_texts = df['message'].tolist()
embeddings = encoder.encode(message_texts, show_progress_bar=True)
embeddings.shape

Batches: 100%|██████████| 175/175 [00:24<00:00,  7.06it/s]


(5572, 384)

In [18]:
# Binary target: 1 for spam, 0 for ham.
y = df['label'].eq('spam').astype(int)

# Stratified 85/15 split with a fixed seed, so both partitions keep the
# dataset's spam share and the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    y,
    stratify=y,
    test_size=0.15,
    random_state=42,
)

# Report partition sizes and confirm the spam share matches across them.
print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print(f"Spam ratio in train: {y_train.mean():.2%}")
print(f"Spam ratio in test: {y_test.mean():.2%}")

Training set: 4736 samples
Test set: 836 samples
Spam ratio in train: 13.41%
Spam ratio in test: 13.40%


In [19]:
# Fit a logistic-regression classifier on the training embeddings.
clf = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Hard predictions and the probability of the positive (spam) class on the test set.
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]

# Headline accuracy followed by the per-class precision/recall report.
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2%}")
print(f"\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['ham', 'spam']))

Accuracy: 97.73%

Classification Report:
              precision    recall  f1-score   support

         ham       0.98      0.99      0.99       724
        spam       0.94      0.88      0.91       112

    accuracy                           0.98       836
   macro avg       0.96      0.94      0.95       836
weighted avg       0.98      0.98      0.98       836



In [20]:
# Pin the class order so rows/columns are always [ham (0), spam (1)], even if
# one class happens to be absent from y_test or y_pred; the indexing below
# relies on a 2x2 matrix in exactly this order.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Long-form table for plotting: cm[i, j] counts samples whose actual class is i
# and whose predicted class is j.
cm_df = pd.DataFrame({
    'predicted': ['ham', 'ham', 'spam', 'spam'],
    'actual': ['ham', 'spam', 'ham', 'spam'],
    'count': [cm[0, 0], cm[1, 0], cm[0, 1], cm[1, 1]]
})

# Both layers share the same x/y encodings so the counts sit on their cells.
cm_base = alt.Chart(cm_df).encode(
    x=alt.X('predicted:N', title='Predicted'),
    y=alt.Y('actual:N', title='Actual')
)

cm_heatmap = cm_base.mark_rect().encode(
    color=alt.Color('count:Q', scale=alt.Scale(scheme='blues')),
    tooltip=['actual', 'predicted', 'count']
).properties(
    title='Confusion Matrix',
    width=250,
    height=250
)

cm_labels = cm_base.mark_text(fontSize=20).encode(
    text='count:Q'
)

cm_heatmap + cm_labels

In [21]:
# Hand-written examples for a quick sanity check of the trained classifier.
test_messages = [
    "Congratulations! You've won a free iPhone! Click here now!",
    "Hey, are we still meeting for lunch tomorrow?",
    "URGENT: Your account has been compromised. Call now!",
    "Can you pick up some milk on your way home?"
]

test_embeddings = encoder.encode(test_messages)
# Column 1 of predict_proba is the probability of the positive (spam) class.
test_probs = clf.predict_proba(test_embeddings)[:, 1]

max_preview_chars = 50
for msg, prob in zip(test_messages, test_probs):
    # Only add an ellipsis when the message was actually cut off; previously
    # "..." was appended to short, untruncated messages as well.
    preview = msg if len(msg) <= max_preview_chars else msg[:max_preview_chars] + "..."
    print(f"Spam probability: {prob:.1%} | {preview}")

Spam probability: 77.0% | Congratulations! You've won a free iPhone! Click h...
Spam probability: 0.2% | Hey, are we still meeting for lunch tomorrow?...
Spam probability: 78.2% | URGENT: Your account has been compromised. Call no...
Spam probability: 0.1% | Can you pick up some milk on your way home?...


In [8]:
import os

# Persist the fitted classifier, creating the target directory if it is missing.
model_path = '../models/spam_classifier.joblib'
os.makedirs(os.path.dirname(model_path), exist_ok=True)

joblib.dump(clf, model_path)
print("Model saved to ../models/spam_classifier.joblib")

Model saved to ../models/spam_classifier.joblib
