# In [None]:
# SMS Scam Detection - Data Exploration and Analysis
# =====================================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from wordcloud import WordCloud
import emoji
import string
import os

# Global plot appearance: seaborn "whitegrid" theme and a 12x8 default figure.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (12, 8)})

# Download the NLTK resources this analysis relies on; quiet=True keeps the
# download log out of the output.
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)

# Work from the project root so the relative paths used below resolve
# against it.
project_dir = '/content/drive/MyDrive/sms-scam-detection'
os.chdir(project_dir)

# Sub-folders for data, model artifacts and results, grouped by purpose.
directories = (
    ['data/raw', 'data/processed']
    + ['models/baseline', 'models/deep_learning', 'models/llm']
    + ['results/metrics', 'results/visualizations']
)

# exist_ok=True makes re-running this cell harmless when folders already exist.
for directory in directories:
    os.makedirs(directory, exist_ok=True)

print(f"Project directory: {project_dir}")
print("Directory structure created successfully!")

# Read the raw SMS dataset (path is relative to the current working directory).
dataset_path = 'data/raw/spam-fraud-sms-dataset.csv'
df = pd.read_csv(dataset_path)

row_total, column_total = df.shape
print(f"Dataset loaded with {row_total} rows and {column_total} columns")
print(f"Column names: {list(df.columns)}")

# Column dtypes / non-null counts, followed by a peek at the first rows.
print("\nDataset Info:")
df.info()

print("\nFirst 5 rows:")
print(df.head(5))

# Convert labels to binary format if needed.
# Any non-numeric label column is treated as text labels; checking for
# "not numeric" (rather than dtype == object) also covers pandas' dedicated
# string dtype.
if not pd.api.types.is_numeric_dtype(df['label']):
    unique_labels = df['label'].unique()
    print(f"\nUnique label values: {unique_labels}")

    label_map = {'ham': 0, 'spam': 1}

    # Normalise case and surrounding whitespace so variants such as "Spam" or
    # "ham " are mapped instead of turning into NaN.
    normalized_labels = df['label'].astype(str).str.strip().str.lower()
    mapped_labels = normalized_labels.map(label_map)

    # Fail early with a readable message: an unmapped value would otherwise
    # only surface as a cast error in the astype(int) call below.
    unmapped_labels = sorted(normalized_labels[mapped_labels.isna()].unique())
    if unmapped_labels:
        raise ValueError(
            f"Unrecognised label values {unmapped_labels}; "
            f"expected one of {sorted(label_map)}"
        )

    df['label'] = mapped_labels
    print("Labels converted to binary (0 for legitimate/ham, 1 for spam/scam)")

df['label'] = df['label'].astype(int)

# Class balance, expressed as a percentage of all messages.
print("\nClass Distribution:")
label_counts = df['label'].value_counts(normalize=True).mul(100)
print(f"Class 0 (Legitimate): {label_counts.get(0, 0):.2f}%")
print(f"Class 1 (Spam/Scam): {label_counts.get(1, 0):.2f}%")

# Bar chart with the number of messages per label.
plt.figure(figsize=(8, 5))
count_axis = sns.countplot(data=df, x='label')
count_axis.set_title('Distribution of Labels')
count_axis.set_xlabel('Label (0: Legitimate, 1: Spam/Scam)')
count_axis.set_ylabel('Count')
plt.show()

# Placeholder substitutions applied in order (URLs first, then e-mail
# addresses, then phone numbers). The placeholders are upper-case on purpose:
# the text is lower-cased before substitution, so they remain distinguishable.
_TOKEN_SUBSTITUTIONS = (
    (re.compile(r'https?://\S+|www\.\S+|[a-zA-Z0-9.-]+\.ug\b|[a-zA-Z0-9.-]+\.ly\b'), 'URLTOKEN'),
    (re.compile(r'\S+@\S+'), 'EMAILTOKEN'),
    (re.compile(r'\b256(?:\s*\d){9}\b'), 'PHONETOKEN'),
    (re.compile(r'\b0(?:\s*\d){9}\b'), 'PHONETOKEN'),
    (re.compile(r'\b0(?:\s*\d){10,15}\b'), 'PHONETOKEN'),
)

# Amount: plain digits ("50000") or comma-grouped ("50,000"), optional decimals.
_AMOUNT = r'\d+(?:,\d{3})*(?:\.\d+)?'
# Currency words; "ushs" is accepted as a spelling of "shs".
_CURRENCY = r'(?:ugx|u?shs|shillings?)'

# A number counts as money only when a currency word or a "/=" / "/-" suffix
# accompanies it. Everything is matched in ONE pass so that an expression such
# as "ugx 5000/=" produces a single placeholder.
_MONEY_RE = re.compile('|'.join((
    r'\b' + _CURRENCY + r'\.?\s*(' + _AMOUNT + r')(?:\s*/[-=])?',  # ugx 5,000 | shs5000/=
    r'(' + _AMOUNT + r')\s*(?:' + _CURRENCY + r'\b|/[-=])',        # 5000 shs | 5,000/=
    r'\b' + _CURRENCY + r'\b',                                     # bare currency word
    r'/[-=]',                                                      # bare "/=" or "/-"
)))


def _money_replacement(match):
    """Build the MONEYTOKEN replacement, keeping the amount when one was matched."""
    amount = match.group(1) or match.group(2)
    # Padding spaces keep the placeholder from fusing with neighbouring
    # characters; surplus whitespace is collapsed at the end of clean_text().
    return f' MONEYTOKEN {amount} ' if amount else ' MONEYTOKEN '


def clean_text(text):
    """
    Clean and normalize text data while preserving important patterns.

    The text is lower-cased; URLs, e-mail addresses and phone numbers are
    replaced by the placeholders URLTOKEN, EMAILTOKEN and PHONETOKEN; money
    expressions are rewritten as ``MONEYTOKEN <amount>``; emojis and other
    non-ASCII characters are removed; runs of whitespace are collapsed.

    Numbers without a currency word or a ``/=`` / ``/-`` suffix are left
    untouched, so ordinary digits are not reported as money.

    Args:
        text (str): Input text to clean

    Returns:
        str: Cleaned text; an empty string when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # Preserve important patterns with tokens
    for pattern, placeholder in _TOKEN_SUBSTITUTIONS:
        text = pattern.sub(placeholder, text)

    # Normalize money amounts
    text = _MONEY_RE.sub(_money_replacement, text)

    # Remove emojis and non-ASCII characters
    text = emoji.replace_emoji(text, replace='')
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Placeholders inserted by clean_text(); they bypass every filter below.
_SPECIAL_TOKENS = frozenset({'URLTOKEN', 'EMAILTOKEN', 'PHONETOKEN', 'MONEYTOKEN'})
# Currency terms that must never be dropped as stopwords.
_UGANDA_TERMS = frozenset({'ugx', 'shs', 'shilling', 'shillings'})
_PUNCTUATION_CHARS = frozenset(string.punctuation)
# Lazily filled cache so the stopword set and the lemmatizer are built once
# instead of on every call (this function is invoked once per message).
_NLP_RESOURCES = {}


def _get_stop_words():
    """Return the English stopword set without the Ugandan currency terms."""
    if 'stop_words' not in _NLP_RESOURCES:
        _NLP_RESOURCES['stop_words'] = set(stopwords.words('english')) - _UGANDA_TERMS
    return _NLP_RESOURCES['stop_words']


def _get_lemmatizer():
    """Return a shared WordNetLemmatizer instance."""
    if 'lemmatizer' not in _NLP_RESOURCES:
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLP_RESOURCES['lemmatizer']


def _is_punctuation(token):
    """Return True when every character of ``token`` is ASCII punctuation."""
    return all(char in _PUNCTUATION_CHARS for char in token)


def preprocess_text(text, remove_stopwords=True, lemmatize=True):
    """
    Preprocess text for analysis.

    Tokenizes ``text`` with NLTK's ``word_tokenize``, drops tokens made up
    solely of punctuation, and optionally removes English stopwords and
    lemmatizes the remaining tokens. The placeholders URLTOKEN, EMAILTOKEN,
    PHONETOKEN and MONEYTOKEN are always kept unchanged.

    Args:
        text (str): Input text to preprocess
        remove_stopwords (bool): Whether to remove stopwords
        lemmatize (bool): Whether to lemmatize words

    Returns:
        list: List of processed tokens
    """
    # Check character by character: a substring test against
    # string.punctuation lets multi-character tokens such as "..." or "``"
    # slip through.
    tokens = [
        token for token in word_tokenize(text)
        if token in _SPECIAL_TOKENS or not _is_punctuation(token)
    ]

    if remove_stopwords:
        stop_words = _get_stop_words()
        tokens = [
            token for token in tokens
            if token in _SPECIAL_TOKENS or token.lower() not in stop_words
        ]

    if lemmatize:
        lemmatizer = _get_lemmatizer()
        tokens = [
            token if token in _SPECIAL_TOKENS else lemmatizer.lemmatize(token)
            for token in tokens
        ]

    return tokens

# Apply text cleaning
df['cleaned_text'] = df['message'].apply(clean_text)

print("\nExamples of original and cleaned text:")
# head(3) simply yields fewer rows for a dataset with fewer than three
# messages, whereas positional iloc[i] access would raise IndexError there.
for original, cleaned in zip(df['message'].head(3), df['cleaned_text'].head(3)):
    print(f"\nOriginal: {original}")
    print(f"Cleaned: {cleaned}")

# Calculate text statistics
def _char_ratio(text, predicate):
    """Return the share of characters in ``text`` satisfying ``predicate`` (0 if empty)."""
    return sum(1 for char in text if predicate(char)) / len(text) if text else 0


def _avg_word_length(text):
    """Return the mean length of the whitespace-separated words in ``text`` (0 if none)."""
    words = text.split()
    return sum(len(word) for word in words) / len(words) if words else 0


# Non-string entries (e.g. NaN from empty CSV cells) are treated as empty
# text, mirroring clean_text(); the per-character statistics below would
# otherwise fail on them.
messages = df['message'].apply(lambda message: message if isinstance(message, str) else '')

df['text_length'] = messages.str.len()
df['word_count'] = messages.str.split().str.len()
df['avg_word_length'] = messages.apply(_avg_word_length)
df['uppercase_ratio'] = messages.apply(lambda message: _char_ratio(message, str.isupper))
df['digit_ratio'] = messages.apply(lambda message: _char_ratio(message, str.isdigit))
df['special_char_ratio'] = messages.apply(
    lambda message: _char_ratio(message, lambda char: not char.isalnum() and not char.isspace())
)

# Display text statistics by class
text_stats = df.groupby('label')[
    ['text_length', 'word_count', 'avg_word_length', 'uppercase_ratio', 'digit_ratio', 'special_char_ratio']
].mean()

# rename() relabels only the classes that are present; assigning a fixed
# two-element index raises when one class is missing from the data.
# rename_axis(None) drops the "label" index name so the printed table has no
# extra header row.
text_stats = text_stats.rename(index={0: 'Legitimate', 1: 'Spam/Scam'}).rename_axis(None)
print("\nAverage text statistics by class:")
print(text_stats)

# Human-readable class names, used as the hue of the histogram below.
df['label_desc'] = df['label'].map({0: 'Legitimate', 1: 'Spam/Scam'})

# Message-length histogram per class; the x-axis is limited to the 99th
# percentile of the observed lengths.
plt.figure(figsize=(12, 6))
sns.histplot(data=df, x='text_length', hue='label_desc', bins=50, kde=True, element='step')
plt.title('Distribution of Message Lengths')
plt.xlabel('Message Length (characters)')
plt.ylabel('Count')
length_cap = df['text_length'].quantile(0.99)
plt.xlim(0, length_cap)
plt.show()

# One boxplot per text statistic, arranged on a 2x3 grid (filled row by row).
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
stats = ['text_length', 'word_count', 'avg_word_length', 'uppercase_ratio', 'digit_ratio', 'special_char_ratio']
titles = ['Message Length', 'Word Count', 'Average Word Length', 'Uppercase Ratio', 'Digit Ratio', 'Special Character Ratio']

for panel, stat, title in zip(axes.flat, stats, titles):
    sns.boxplot(data=df, x='label', y=stat, ax=panel)
    panel.set_title(title)
    panel.set_xlabel('Class (0: Legitimate, 1: Spam/Scam)')
    panel.set_xticks([0, 1])
    panel.set_xticklabels(['Legitimate', 'Spam/Scam'])

plt.tight_layout()
plt.show()

# Split the cleaned messages by class.
legitimate_texts = df.loc[df['label'] == 0, 'cleaned_text'].tolist()
spam_texts = df.loc[df['label'] == 1, 'cleaned_text'].tolist()

# Flatten the per-message token lists into one token list per class.
legitimate_tokens = [
    token for message in legitimate_texts for token in preprocess_text(message)
]
spam_tokens = [
    token for message in spam_texts for token in preprocess_text(message)
]

# Token frequencies and the 20 most frequent tokens of each class.
legitimate_counter = Counter(legitimate_tokens)
spam_counter = Counter(spam_tokens)

legitimate_common = legitimate_counter.most_common(20)
spam_common = spam_counter.most_common(20)

for heading, ranking in (
    ("\nMost common words in legitimate messages:", legitimate_common),
    ("\nMost common words in spam messages:", spam_common),
):
    print(heading)
    for word, count in ranking:
        print(f"{word}: {count}")

# Side-by-side horizontal bar charts of the most frequent tokens per class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8))

for panel, ranking, heading in (
    (ax1, legitimate_common, 'Most Common Words in Legitimate Messages'),
    (ax2, spam_common, 'Most Common Words in Spam Messages'),
):
    # Split the (word, count) pairs into two parallel sequences.
    words, counts = zip(*ranking)
    sns.barplot(x=list(counts), y=list(words), ax=panel)
    panel.set_title(heading)
    panel.set_xlabel('Count')

plt.tight_layout()
plt.show()

# Word clouds built from the space-joined token lists of each class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8))

# Both clouds share the same rendering options.
cloud_options = {
    'width': 800,
    'height': 400,
    'background_color': 'white',
    'max_words': 100,
}

legitimate_wordcloud = WordCloud(**cloud_options).generate(' '.join(legitimate_tokens))
ax1.imshow(legitimate_wordcloud, interpolation='bilinear')
ax1.set_title('Word Cloud for Legitimate Messages')
ax1.axis('off')

spam_wordcloud = WordCloud(**cloud_options).generate(' '.join(spam_tokens))
ax2.imshow(spam_wordcloud, interpolation='bilinear')
ax2.set_title('Word Cloud for Spam Messages')
ax2.axis('off')

plt.tight_layout()
plt.show()

def get_ngrams(texts, n=2, min_df=5):
    """
    Count the n-grams of exactly ``n`` words occurring in ``texts``.

    English stopwords are removed by the vectorizer. For ``n == 3`` the
    document-frequency threshold is fixed at 2 and ``min_df`` is ignored.

    Returns:
        list: ``(ngram, count)`` pairs sorted by count, highest first; an
        empty list when the vectorizer raises ``ValueError``.
    """
    if n == 3:
        threshold = 2
    else:
        threshold = min_df

    counter = CountVectorizer(ngram_range=(n, n), stop_words='english', min_df=threshold)

    try:
        matrix = counter.fit_transform(texts)
        vocabulary = counter.get_feature_names_out()
        # Column sums give the total number of occurrences of every n-gram.
        totals = matrix.sum(axis=0).A1
        pairs = list(zip(vocabulary, totals))
        pairs.sort(key=lambda pair: pair[1], reverse=True)
        return pairs
    except ValueError:
        return []

# Top 15 bigrams and trigrams per class (only the bigrams are plotted below).
legitimate_bigrams = get_ngrams(legitimate_texts, n=2)[:15]
spam_bigrams = get_ngrams(spam_texts, n=2)[:15]

legitimate_trigrams = get_ngrams(legitimate_texts, n=3)[:15]
spam_trigrams = get_ngrams(spam_texts, n=3)[:15]

# One bar chart per class; a panel without bigrams only receives a title.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8))

bigram_panels = (
    (ax1, legitimate_bigrams,
     'Most Common Bigrams in Legitimate Messages',
     'No Common Bigrams Found in Legitimate Messages'),
    (ax2, spam_bigrams,
     'Most Common Bigrams in Spam Messages',
     'No Common Bigrams Found in Spam Messages'),
)

for panel, bigrams, found_title, empty_title in bigram_panels:
    if not bigrams:
        panel.set_title(empty_title)
        continue
    words, counts = zip(*bigrams)
    sns.barplot(x=list(counts), y=list(words), ax=panel)
    panel.set_title(found_title)
    panel.set_xlabel('Count')

plt.tight_layout()
plt.show()

# TF-IDF analysis
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', min_df=5)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_text'])
feature_names = tfidf_vectorizer.get_feature_names_out()

# Rows of tfidf_matrix are addressed by position, so derive positions from the
# label column. df.index labels coincide with positions only while the frame
# still has its default 0..n-1 index.
label_values = df['label'].to_numpy()
legitimate_indices = np.flatnonzero(label_values == 0)
spam_indices = np.flatnonzero(label_values == 1)

# Best-effort section: any failure is reported and the script continues.
try:
    # Mean TF-IDF weight of every term within each class.
    legitimate_tfidf = tfidf_matrix[legitimate_indices].mean(axis=0).A1
    spam_tfidf = tfidf_matrix[spam_indices].mean(axis=0).A1
    # Positive values lean towards spam, negative values towards legitimate.
    diff_tfidf = spam_tfidf - legitimate_tfidf

    # Sort once (ascending) and take both ends of the ranking.
    ranked_terms = diff_tfidf.argsort()
    top_spam_indices = ranked_terms[-20:][::-1]
    top_legitimate_indices = ranked_terms[:20]

    top_spam_terms = [(feature_names[i], diff_tfidf[i]) for i in top_spam_indices]
    # Negated so the legitimate-leaning scores are plotted as positive bars.
    top_legitimate_terms = [(feature_names[i], -diff_tfidf[i]) for i in top_legitimate_indices]

    # Visualize most distinctive terms
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8))

    words, scores = zip(*top_spam_terms)
    sns.barplot(x=list(scores), y=list(words), ax=ax1)
    ax1.set_title('Most Distinctive Terms for Spam Messages')
    ax1.set_xlabel('TF-IDF Difference')

    words, scores = zip(*top_legitimate_terms)
    sns.barplot(x=list(scores), y=list(words), ax=ax2)
    ax2.set_title('Most Distinctive Terms for Legitimate Messages')
    ax2.set_xlabel('TF-IDF Difference')

    plt.tight_layout()
    plt.show()

except Exception as e:
    print(f"Error in TF-IDF analysis: {e}")

# Pattern analysis functions
def contains_url(text):
    """Return True when the URL placeholder occurs in text, ignoring case."""
    lowered = text.lower()
    return lowered.find('urltoken') >= 0

def contains_phone(text):
    """Return True when the phone-number placeholder occurs in text, ignoring case."""
    lowered = text.lower()
    return lowered.find('phonetoken') >= 0

def contains_money(text):
    """Return True when the money placeholder occurs in text, ignoring case."""
    lowered = text.lower()
    return lowered.find('moneytoken') >= 0

# Keywords treated as urgency / pressure signals.
_URGENT_WORDS = frozenset({
    'urgent', 'now', 'today', 'immediately', 'hurry', 'quick', 'fast',
    'limited', 'offer', 'exclusive', 'expires', 'deadline', 'alert',
    'confirm', 'verify', 'activate', 'suspended', 'blocked', 'expire',
    'winner', 'congratulations', 'prize', 'bonus', 'promotion',
})
# Runs of letters; extracting these separates words from attached punctuation.
_URGENT_WORD_RE = re.compile(r'[a-z]+')


def contains_urgent_words(text):
    """
    Detect urgency-related words in text.

    Matching is case-insensitive and works on whole words: punctuation
    attached to a word is ignored ("URGENT!" matches), while a keyword that is
    merely part of a longer word does not match ("know" does not count as
    "now").

    Args:
        text (str): Text to inspect

    Returns:
        bool: True if at least one urgency keyword occurs in the text
    """
    # Extract the words once and compare as sets, rather than re-splitting
    # the text for every keyword.
    words = set(_URGENT_WORD_RE.findall(text.lower()))
    return not _URGENT_WORDS.isdisjoint(words)

# Apply pattern detection
df['has_url'] = df['cleaned_text'].apply(contains_url)
df['has_phone'] = df['cleaned_text'].apply(contains_phone)
df['has_money'] = df['cleaned_text'].apply(contains_money)
df['has_urgent'] = df['cleaned_text'].apply(contains_urgent_words)

# Count pattern occurrences by class
is_legitimate = df['label'] == 0
is_spam = df['label'] == 1
legitimate_count = int(is_legitimate.sum())
spam_count = int(is_spam.sum())

# Row label of the summary table -> boolean flag column holding the pattern.
pattern_columns = {
    'URLs': 'has_url',
    'Phone Numbers': 'has_phone',
    'Money References': 'has_money',
    'Urgency Words': 'has_urgent',
}

# Percentages need a non-zero denominator, so both classes must be present.
if legitimate_count > 0 and spam_count > 0:
    # Summing a boolean column counts the messages showing the pattern.
    pattern_counts = pd.DataFrame({
        'Legitimate': [
            int(df.loc[is_legitimate, column].sum()) for column in pattern_columns.values()
        ],
        'Spam': [
            int(df.loc[is_spam, column].sum()) for column in pattern_columns.values()
        ],
    }, index=list(pattern_columns))

    pattern_percentages = pd.DataFrame({
        'Legitimate (%)': pattern_counts['Legitimate'] / legitimate_count * 100,
        'Spam (%)': pattern_counts['Spam'] / spam_count * 100
    })

    print("\nPattern occurrences by class:")
    print(pattern_counts)

    print("\nPattern percentages by class:")
    print(pattern_percentages)

    # Visualize pattern percentages.
    # DataFrame.plot() creates its own figure when no axes are passed, so the
    # size is given to it directly; a preceding plt.figure() call would only
    # leave an empty extra figure behind.
    ax = pattern_percentages.plot(kind='bar', rot=0, figsize=(12, 6))
    ax.set_title('Percentage of Messages Containing Specific Patterns')
    ax.set_ylabel('Percentage (%)')
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

# Pairwise correlations between the numeric text features, the pattern flags
# and the class label.
corr_features = [
    'text_length', 'word_count', 'avg_word_length',
    'uppercase_ratio', 'digit_ratio', 'special_char_ratio',
    'has_url', 'has_phone', 'has_money', 'has_urgent', 'label'
]
flag_columns = ['has_url', 'has_phone', 'has_money', 'has_urgent']

# Work on a copy and turn the boolean flags into 0/1 integers.
corr_df = df[corr_features].copy()
corr_df[flag_columns] = corr_df[flag_columns].astype(int)

correlation_matrix = corr_df.corr()

# Heatmap with the upper triangle (diagonal included) masked out, so every
# pair of features is drawn once.
plt.figure(figsize=(12, 10))
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))
heatmap_options = {
    'annot': True,
    'mask': mask,
    'cmap': 'coolwarm',
    'vmin': -1,
    'vmax': 1,
    'fmt': '.2f',
    'linewidths': 0.5,
}
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix of Text Features')
plt.tight_layout()
plt.show()

# Correlation of every feature with the label, strongest positive first.
label_column = correlation_matrix['label']
label_correlations = label_column.drop('label').sort_values(ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x=label_correlations.values, y=label_correlations.index)
plt.title('Feature Correlations with Spam Label')
plt.xlabel('Correlation Coefficient')
# Dashed reference line at zero separates positive from negative correlations.
plt.axvline(x=0, color='black', linestyle='--')
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Print up to five randomly sampled raw messages per class.
print("\nExample Legitimate Messages:")
legitimate_rows = df[df['label'] == 0]
legitimate_examples = legitimate_rows.sample(min(5, legitimate_count))['message'].tolist()
for position, msg in enumerate(legitimate_examples, start=1):
    print(f"\n{position}. {msg}")

print("\n\nExample Spam Messages:")
spam_rows = df[df['label'] == 1]
spam_examples = spam_rows.sample(min(5, spam_count))['message'].tolist()
for position, msg in enumerate(spam_examples, start=1):
    print(f"\n{position}. {msg}")

# Write the frame, including every derived column, to the processed-data folder.
output_file = 'data/processed/sms_dataset_explored.csv'
df.to_csv(output_file, index=False)
print(f"\nPreprocessed dataset saved to {output_file}")

print("\nData exploration completed successfully!")