# Spam Detection - Data Analysis

In [None]:
import sys
import os

# Put the directory two levels above the current working directory on the
# import path (once), so the `src` package imported below can be found.
project_root = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import nltk
import seaborn as sns

# Fetch the Punkt tokenizer data needed by sent_tokenize / word_tokenize.
# Recent NLTK releases (3.8.2 and later) load the 'punkt_tab' resource instead
# of the pickled 'punkt' models, so both are requested to keep the notebook
# working across NLTK versions.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

from src.utils import load_config, get_project_root, print_text
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud

## 1. Data Loading and Exploration
- Load raw training and test data from location in configuration file.

In [None]:
# Read the project configuration, resolve the raw task-1 train/test CSV paths
# relative to the project root, and load both files into DataFrames.
config = load_config()
raw_paths = config['data']['task1']['raw']

train_path = os.path.join(get_project_root(), raw_paths['train'])
test_path = os.path.join(get_project_root(), raw_paths['test'])

train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

- Display basic information about our test and training data:

In [None]:
# Report (rows, columns) for each split.
for shape_caption, split_df in (
    ("Training Data Shape:", train_df),
    ("Test Data Shape:", test_df),
):
    print(shape_caption, split_df.shape)

- Pick one non-spam and one spam message from the training data and print each one, together with its label, using the provided `print_text` helper. This gives a rough idea of what a spam message might look like:

In [None]:
# Show the first message of each class to get a feel for the data.
# The label is read by column name for both samples: integer keys such as
# `row[-1]` on a label-indexed Series rely on deprecated positional fallback.
non_spam_sample = train_df[train_df['label'] == 0].iloc[0]
print("NON-SPAM SAMPLE:")
print_text(non_spam_sample['text'], non_spam_sample['label'])

print("\n")

spam_sample = train_df[train_df['label'] == 1].iloc[0]
print("SPAM SAMPLE:")
print_text(spam_sample['text'], spam_sample['label'])

## 2. Analysing Data
- Pie chart of spam vs. non-spam distribution

In [None]:
def map_labels(series, mapping):
    """Translate the values of *series* through *mapping* and return a NumPy array.

    Args:
        series: pandas Series holding the raw label values.
        mapping: lookup passed to ``Series.map`` (here a dict from raw label
            to display name).

    Returns:
        A NumPy array with one mapped value per element of *series*.
    """
    translated = series.map(mapping)
    return translated.to_numpy()

In [None]:
# Display names for the numeric labels, in the order used by the plots.
mapping = {0: "NotSpam", 1: "Spam"}
plot_labels = ["NotSpam", "Spam"]

# Last column of the training frame holds the label; count each class.
train_labels = map_labels(train_df.iloc[:, -1], mapping)
train_counts = [(train_labels == class_name).sum() for class_name in plot_labels]

print(train_counts)

In [None]:
# Pie chart of the class balance in the training set.
fig, ax = plt.subplots(figsize=(10, 5))

pie_options = dict(
    labels=plot_labels,
    autopct='%1.1f%%',
    colors=['paleturquoise', 'orchid'],
)
ax.pie(train_counts, **pie_options)
ax.set_title('Training Data')

plt.tight_layout()
plt.show()

- Print key information

In [None]:
# train_counts is ordered [NotSpam, Spam], matching plot_labels.
print(f"Training Set - NotSpam: {train_counts[0]}, Spam: {train_counts[1]}")

## 3. Text Length Analysis
- Calculate average text length of a spam message vs a non-spam message.
- Plot histograms of the text-length distributions for spam vs. non-spam messages.

In [None]:
# Character length of every message (first column), split by class.
text_lengths = train_df.iloc[:, 0].apply(len)
spam_lengths = text_lengths[train_labels == "Spam"]
nonspam_lengths = text_lengths[train_labels == "NotSpam"]

spam_avg_len, nonspam_avg_len = spam_lengths.mean(), nonspam_lengths.mean()

# Overlay the two length histograms; alpha keeps the overlap visible.
plt.figure(figsize=(8, 6))
for class_lengths, class_name, class_colour in (
    (nonspam_lengths, 'NotSpam', 'paleturquoise'),
    (spam_lengths, 'Spam', 'orchid'),
):
    plt.hist(class_lengths, bins=30, alpha=0.6, label=class_name, color=class_colour)
plt.title('Text Length Distribution')
plt.xlabel('Text Length (characters)')
plt.ylabel('Frequency')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Mean character length per class, followed by two checks on how long the
# spam messages get: the maximum length and the count above 500 characters.
print(f"Average text length for spam: {spam_avg_len:.2f} characters")
print(f"Average text length for non-spam: {nonspam_avg_len:.2f} characters")
print(f"Max spam length: {spam_lengths.max()}")
print(f"Number of spam messages > 500 characters: {(spam_lengths > 500).sum()}")

## 4. Linguistic Features Analysis
- Average sentence length comparison
- Word count distributions
- Special character usage (e.g., exclamation marks)
- Uppercase word frequency

In [None]:
# First column of the training frame, used as the message text in the
# linguistic analyses below.
texts = train_df.iloc[:, 0]

In [None]:
def length_avg_sentence(text):
    """Return the mean number of tokens per sentence in *text*.

    The text is split with ``sent_tokenize`` and each sentence is tokenized
    with ``word_tokenize``; the result is the mean of the per-sentence token
    counts, or 0 when no sentence is found.
    """
    tokens_per_sentence = [
        len(word_tokenize(sentence)) for sentence in sent_tokenize(text)
    ]
    return np.mean(tokens_per_sentence) if tokens_per_sentence else 0

In [None]:
# Mean tokens-per-sentence for every message.
avg_sent_lens = texts.apply(length_avg_sentence)

# Class names as a Series so comparisons against it yield boolean masks.
labels = pd.Series(map_labels(train_df.iloc[:, -1], mapping))

sent_len_spam_mask = labels == "Spam"
sent_len_nonspam_mask = labels == "NotSpam"
spam_avg_sent_len = avg_sent_lens[sent_len_spam_mask].mean()
nonspam_avg_sent_len = avg_sent_lens[sent_len_nonspam_mask].mean()

print(f"Average Spam Sentence Length: {spam_avg_sent_len:.2f} words")
print(f"Average Not Spam Sentence Length: {nonspam_avg_sent_len:.2f} words")

- Word count distributions

In [None]:
# Number of word_tokenize tokens in every message, split by class.
word_counts = texts.apply(lambda message: len(word_tokenize(message)))
spam_word_counts = word_counts[labels == "Spam"]
nonspam_word_counts = word_counts[labels == "NotSpam"]

# Overlay the two token-count histograms.
plt.figure(figsize=(8, 6))
for class_word_counts, hist_label, hist_colour in (
    (nonspam_word_counts, 'NotSpam', 'paleturquoise'),
    (spam_word_counts, 'Spam', 'orchid'),
):
    plt.hist(class_word_counts, bins=30, alpha=0.6, label=hist_label, color=hist_colour)
plt.title('Word Count Distribution')
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.legend()
plt.tight_layout()
plt.show()

- Usage of special characters

In [None]:
special_chars = ['!', '?', '$', '%', '&', '@', '*']


def count_special_chars(text, chars=special_chars):
    """Return the total number of occurrences in *text* of the characters in *chars*.

    Args:
        text: string to scan.
        chars: iterable of substrings to count; defaults to ``special_chars``.

    Returns:
        Sum of ``text.count(c)`` over every ``c`` in *chars*.
    """
    return sum(map(text.count, chars))

# Special-character total for every message (default character set).
special_char_counts = texts.apply(count_special_chars)

# Per-class means
special_spam_rows = labels == "Spam"
special_nonspam_rows = labels == "NotSpam"
spam_special_char_avg = special_char_counts[special_spam_rows].mean()
nonspam_special_char_avg = special_char_counts[special_nonspam_rows].mean()

print(f"Avg Special Characters per Spam Message: {spam_special_char_avg:.2f}")
print(f"Avg Special Characters per Not Spam Message: {nonspam_special_char_avg:.2f}")


- Punctuation mark frequency

In [None]:
import string

def count_punctuation(text):
    """Return how many characters of *text* appear in ``string.punctuation``."""
    marks = string.punctuation
    return sum(symbol in marks for symbol in text)

# Punctuation total for every message.
punctuation_counts = texts.apply(count_punctuation)

# Per-class means
spam_punctuation_avg = punctuation_counts.loc[labels == "Spam"].mean()
nonspam_punctuation_avg = punctuation_counts.loc[labels == "NotSpam"].mean()

print(f"Avg Punctuation Marks per Spam Message: {spam_punctuation_avg:.2f}")
print(f"Avg Punctuation Marks per Not Spam Message: {nonspam_punctuation_avg:.2f}")


- Exclamation mark density

In [None]:
def exclamation_density(text):
    """Return the number of '!' characters in *text* per ``word_tokenize`` token.

    Returns 0 when the text yields no tokens, which avoids dividing by zero.
    """
    token_total = len(word_tokenize(text))
    if token_total > 0:
        return text.count('!') / token_total
    return 0

In [None]:
# Exclamation marks per token for every message, then averaged per class.
exclamation_density_counts = texts.apply(exclamation_density)

density_spam_rows = labels == "Spam"
density_nonspam_rows = labels == "NotSpam"
spam_excl_density_avg = exclamation_density_counts[density_spam_rows].mean()
nonspam_excl_density_avg = exclamation_density_counts[density_nonspam_rows].mean()

print(f"Average exclamation mark density per message (Spam): {spam_excl_density_avg:.4f}")
print(f"Average exclamation mark density per message (NotSpam): {nonspam_excl_density_avg:.4f}")


## 5. Word Frequency Analysis
- Most common words in spam
- Most common words in legitimate emails
- Create word clouds for visual comparison

In [None]:
def get_most_common_words(texts):
    """Return total counts of the most frequent words across *texts*.

    A ``CountVectorizer`` limited to 20 features with English stop words
    removed is fitted on *texts*; the per-document counts are summed per word.

    Returns:
        A pandas Series indexed by word, sorted by total count in
        descending order.
    """
    top_words_vectorizer = CountVectorizer(stop_words='english', max_features=20)
    doc_term_matrix = top_words_vectorizer.fit_transform(texts)
    vocabulary = top_words_vectorizer.get_feature_names_out()
    per_document = pd.DataFrame(doc_term_matrix.toarray(), columns=vocabulary)
    totals = per_document.sum(axis=0)
    return totals.sort_values(ascending=False)

In [None]:
def generate_word_cloud(texts, title, max_words=20):
    """Draw a word cloud built from all of *texts* joined into one string.

    Args:
        texts: iterable of strings; joined with single spaces.
        title: title shown above the figure.
        max_words: maximum number of words passed to ``WordCloud``.

    No custom stop-word set is supplied (``stopwords=None``). The figure is
    displayed with ``plt.show()``; nothing is returned.
    """
    corpus = " ".join(texts)

    cloud_settings = dict(
        width=800,
        height=400,
        background_color='white',
        stopwords=None,
        max_words=max_words,
    )
    cloud_image = WordCloud(**cloud_settings).generate(corpus)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis('off')  # the image has no meaningful axes
    plt.title(title)
    plt.show()

In [None]:
# Split the messages by class.
spam_texts = texts.loc[labels == "Spam"]
nonspam_texts = texts.loc[labels == "NotSpam"]

# Top-word totals per class (these reuse the names of the earlier
# per-message token counts).
spam_word_counts = get_most_common_words(spam_texts)
nonspam_word_counts = get_most_common_words(nonspam_texts)

print("Most Common Words in Spam:")
print(spam_word_counts)
print("\nMost Common Words in Non-Spam:")
print(nonspam_word_counts)

# One word cloud per class for a visual comparison.
for cloud_texts, cloud_title in (
    (spam_texts, 'Word Cloud for Spam'),
    (nonspam_texts, 'Word Cloud for Non-Spam'),
):
    generate_word_cloud(cloud_texts, cloud_title)

## 6. Correlation Analysis
- Correlation between text length and spam classification
- Correlation between sentence length and spam classification
- Identify other potential correlations

In [None]:
# Per-message features side by side with the class label. Including the label
# ('is_spam') is what lets the matrix show how each feature correlates with
# spam classification, rather than only how the features relate to each other.
# NOTE(review): assumes train_df['label'] is numeric 0/1, as the comparisons
# against 0 and 1 earlier in the notebook suggest.
features_df = pd.DataFrame({
    'text_length': text_lengths,
    'avg_sentence_length': avg_sent_lens,
    'special_char_count': special_char_counts,
    'punctuation_count': punctuation_counts,
    'excl_density': exclamation_density_counts,
    'is_spam': train_df['label'],
})

# Pairwise correlation between all columns (pandas default method).
correlations = features_df.corr()

print("\nCorrelation Matrix:")
print(correlations)

# Fix the colour scale to [-1, 1] so colours are comparable across runs.
plt.figure(figsize=(10, 6))
sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1)
plt.title("Spam Detection Correlation Matrix")
plt.show()


## 7. Summary of Findings
- Key differences between spam and non-spam emails
- Potential features for ML model

In [None]:
# Plain-text recap of the measurements computed in the earlier cells.
print("Summary of Findings")
print("\n")
print("Differences between spam and non-spam emails:")

if spam_avg_len > nonspam_avg_len:
    print(f"- Spam emails tend to be longer ({spam_avg_len:.1f} vs {nonspam_avg_len:.1f} characters)")
else:
    print(f"- Non-spam emails tend to be longer ({nonspam_avg_len:.1f} vs {spam_avg_len:.1f} characters)")

# Report the special-character comparison in both directions, so the summary
# never silently omits the result when spam does not come out ahead.
if spam_special_char_avg > nonspam_special_char_avg:
    print(f"- Spam emails use significantly more special characters ({spam_special_char_avg:.2f} vs {nonspam_special_char_avg:.2f})")
else:
    print(f"- Spam emails do not use more special characters ({spam_special_char_avg:.2f} vs {nonspam_special_char_avg:.2f})")

print("Vocabulary Patterns:")
print(f"- Common in spam: {', '.join(spam_word_counts.head(5).index)}")
print(f"- Common in non-spam: {', '.join(nonspam_word_counts.head(5).index)}")

print("Potential Model Features (based on correlation):")
# Blank out the diagonal so a column is never reported as its own best match.
potential_correlations = correlations.where(~np.eye(correlations.shape[0], dtype=bool))

for feature in correlations.columns:
    # Rank the other columns by absolute correlation with this one.
    related = potential_correlations[feature].dropna().abs().sort_values(ascending=False)
    # Only report the strongest partner, and only if |r| exceeds 0.2.
    if not related.empty and related.iloc[0] > 0.2:
        top_related_feature = related.index[0]
        top_corr_value = correlations.loc[feature, top_related_feature]
        print(f"- {feature} correlates with {top_related_feature} (r = {top_corr_value:.2f})")
