In [1]:
# Spam SMS Detection - Exploratory Data Analysis
# CodSoft ML Internship - Task 4
# Author: Chandan Kumar

import json
import re
import string
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Title banner: rule, heading, rule - one item per output line.
print("="*70,
      "SPAM SMS DETECTION - EXPLORATORY DATA ANALYSIS",
      "="*70,
      sep="\n")

SPAM SMS DETECTION - EXPLORATORY DATA ANALYSIS


In [2]:
# 1. LOAD DATASET

print("\nüìÇ Loading dataset...")
# The raw CSV is expected to carry the label in 'v1' and the message text in
# 'v2'; it may also contain extra, unused columns (e.g. 'Unnamed: 2').
raw_df = pd.read_csv('../data/spam.csv', encoding='latin-1')

if {'v1', 'v2'}.issubset(raw_df.columns):
    # Pick the two needed columns by name so that any set of extra columns
    # (not only one literally called 'Unnamed: 2') is dropped before renaming.
    df = raw_df[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})
elif raw_df.shape[1] == 2:
    # Fallback for a two-column file with other headers: treat the columns as
    # (label, message) positionally, which is what the previous code did.
    df = raw_df.copy()
    df.columns = ['label', 'message']
else:
    # Fail with an actionable message instead of a bare length-mismatch error.
    raise ValueError(
        "Expected columns 'v1' and 'v2' (or exactly two columns) in "
        f"../data/spam.csv, found: {list(raw_df.columns)}"
    )

print("‚úÖ Dataset loaded successfully!")
print(f"   Shape: {df.shape}")
print(f"   Rows: {df.shape[0]:,}")
print(f"   Columns: {df.shape[1]}")


üìÇ Loading dataset...
‚úÖ Dataset loaded successfully!
   Shape: (5572, 2)
   Rows: 5,572
   Columns: 2


In [3]:
# 2. INITIAL DATA INSPECTION

print("\n" + "="*70)
print("DATA OVERVIEW")
print("="*70)

print("\nüìä First 10 samples:")
print(df.head(10))

print("\nüìã Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() appended a stray "None" line after the report.
df.info()

print("\nüîç Label Distribution:")
print(df['label'].value_counts())


DATA OVERVIEW

üìä First 10 samples:
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
5  spam  FreeMsg Hey there darling it's been 3 week's n...
6   ham  Even my brother is not like to speak with me. ...
7   ham  As per your request 'Melle Melle (Oru Minnamin...
8  spam  WINNER!! As a valued network customer you have...
9  spam  Had your mobile 11 months or more? U R entitle...

üìã Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ 

In [4]:
# 3. MISSING VALUES CHECK

print("\n" + "="*70, "MISSING VALUES ANALYSIS", "="*70, sep="\n")

# Per-column count of null entries.
missing_values = df.isnull().sum()
print(missing_values)

total_missing = missing_values.sum()
if total_missing != 0:
    # Rows containing a null in any column are discarded.
    print(f"‚ö†Ô∏è  Dropping {total_missing} missing values...")
    df = df.dropna()
else:
    print("‚úÖ No missing values found!")

# Rows that repeat an earlier row across all columns are removed as well.
duplicates = df.duplicated().sum()
if duplicates > 0:
    print(f"\nüîÑ Removing {duplicates} duplicate messages...")
    df = df.drop_duplicates()
    print(f"   New shape: {df.shape}")


MISSING VALUES ANALYSIS
label      0
message    0
dtype: int64
‚úÖ No missing values found!

üîÑ Removing 403 duplicate messages...
   New shape: (5169, 2)


In [5]:
# 4. LABEL DISTRIBUTION

print("\n" + "="*70)
print("SPAM vs HAM DISTRIBUTION")
print("="*70)

# value_counts() orders rows by frequency, so its row order depends on the
# data. Reindex to a fixed (ham, spam) order so the hard-coded 'Ham'/'Spam'
# bar and wedge labels below can never be attached to the wrong class.
label_order = ['ham', 'spam']
label_counts = df['label'].value_counts().reindex(label_order, fill_value=0)
label_percentages = (
    df['label'].value_counts(normalize=True).reindex(label_order, fill_value=0) * 100
)

print("\nüìä Message Distribution:")
print(f"   Ham (Legitimate): {label_counts['ham']:,} ({label_percentages['ham']:.2f}%)")
print(f"   Spam: {label_counts['spam']:,} ({label_percentages['spam']:.2f}%)")

# spam_ratio is spam-per-ham; its reciprocal is reported as "1:N".
spam_ratio = label_counts['spam'] / label_counts['ham']
print(f"\nüìà Spam Rate: {label_percentages['spam']:.2f}%")
print(f"   Imbalance Ratio: 1:{1/spam_ratio:.2f}")

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
display_names = ['Ham', 'Spam']  # positionally matches label_order

# Bar plot, with the exact count written above each bar.
axes[0].bar(display_names, label_counts.values,
            color=['green', 'red'], alpha=0.7)
axes[0].set_ylabel('Count')
axes[0].set_title('SMS Message Distribution')
for position, count in enumerate(label_counts.values):
    axes[0].text(position, count, f'{count:,}', ha='center', va='bottom')

# Pie chart
colors = ['lightgreen', 'lightcoral']
axes[1].pie(label_counts.values, labels=display_names,
            autopct='%1.2f%%', colors=colors, startangle=90)
axes[1].set_title('SMS Message Percentage')

fig.tight_layout()
fig.savefig('../images/spam_distribution.png', dpi=300, bbox_inches='tight')
print("\n‚úÖ Visualization saved: ../images/spam_distribution.png")
# Close this specific figure so it is neither re-rendered inline nor kept in memory.
plt.close(fig)


SPAM vs HAM DISTRIBUTION

üìä Message Distribution:
   Ham (Legitimate): 4,516 (87.37%)
   Spam: 653 (12.63%)

üìà Spam Rate: 12.63%
   Imbalance Ratio: 1:6.92

‚úÖ Visualization saved: ../images/spam_distribution.png


In [6]:
# 5. MESSAGE LENGTH ANALYSIS

print("\n" + "="*70)
print("MESSAGE LENGTH ANALYSIS")
print("="*70)

# Two size features per message: character count and the number of
# whitespace-separated tokens.
df['message_length'] = df['message'].map(len)
df['word_count'] = df['message'].map(lambda msg: len(msg.split()))

print("\nüìè Overall Statistics:")
print(f"   Mean length: {df['message_length'].mean():.0f} characters")
print(f"   Median length: {df['message_length'].median():.0f} characters")
print(f"   Mean words: {df['word_count'].mean():.0f} words")

print("\nüìè By Category:")
for category in ('ham', 'spam'):
    in_category = df['label'] == category
    lengths = df.loc[in_category, 'message_length']
    words = df.loc[in_category, 'word_count']
    print(f"\n   {category.upper()}:")
    print(f"      Mean length: {lengths.mean():.0f} chars")
    print(f"      Median length: {lengths.median():.0f} chars")
    print(f"      Mean words: {words.mean():.0f} words")

# Visualization: one panel per feature, ham and spam overlaid in each.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

panels = (
    # (axis, column, x-axis label, title, upper x limit)
    (axes[0], 'message_length', 'Message Length (characters)',
     'Message Length Distribution', 500),
    (axes[1], 'word_count', 'Word Count',
     'Word Count Distribution', 100),
)
series_styles = (('ham', 'Ham', 'green'), ('spam', 'Spam', 'red'))

for ax, column, x_label, title, x_max in panels:
    for category, legend_name, colour in series_styles:
        df.loc[df['label'] == category, column].hist(
            bins=50, alpha=0.7, label=legend_name, color=colour, ax=ax)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.legend()
    # The view is clipped at x_max; values beyond it are not shown.
    ax.set_xlim(0, x_max)

plt.tight_layout()
plt.savefig('../images/message_length_analysis.png', dpi=300, bbox_inches='tight')
print("\n‚úÖ Length analysis saved: ../images/message_length_analysis.png")
plt.close()


MESSAGE LENGTH ANALYSIS

üìè Overall Statistics:
   Mean length: 79 characters
   Median length: 60 characters
   Mean words: 15 words

üìè By Category:

   HAM:
      Mean length: 70 chars
      Median length: 52 chars
      Mean words: 14 words

   SPAM:
      Mean length: 138 chars
      Median length: 149 chars
      Mean words: 24 words

‚úÖ Length analysis saved: ../images/message_length_analysis.png


In [7]:
# 6. TEXT PREPROCESSING

# Section banner: blank line, rule, heading, rule.
print("\n" + "="*70, "TEXT PREPROCESSING", "="*70, sep="\n")

def preprocess_text(text):
    """Normalise one raw message into lowercase, space-separated word tokens.

    Steps, in order: lowercase; delete tokens starting with ``http`` or
    ``www`` up to the next whitespace; delete every run of digits; delete all
    ASCII punctuation (``string.punctuation``); collapse whitespace runs to a
    single space and trim both ends.

    Characters are deleted, not replaced by spaces, so text on either side of
    a removed digit or punctuation mark is joined (``"don't"`` -> ``"dont"``).

    Args:
        text: The message as a ``str``.

    Returns:
        The cleaned string; empty if nothing survives the filters.
    """
    lowered = text.lower()
    without_urls = re.sub(r'http\S+|www\S+', '', lowered)
    without_digits = re.sub(r'\d+', '', without_urls)

    # Translation table that maps every ASCII punctuation character to "delete".
    drop_punctuation = str.maketrans('', '', string.punctuation)
    letters_only = without_digits.translate(drop_punctuation)

    # split() with no argument discards leading/trailing/repeated whitespace.
    return ' '.join(letters_only.split())

print("\nüîÑ Applying preprocessing...")
# Store the cleaned text alongside the original message.
df['cleaned_message'] = df['message'].map(preprocess_text)

print("\nüìù Sample before/after:")
# Show the first three rows (by position), each truncated to 80 characters.
for position in range(3):
    sample = df.iloc[position]
    before, after = sample['message'], sample['cleaned_message']
    print(f"\nOriginal: {before[:80]}...")
    print(f"Cleaned:  {after[:80]}...")


TEXT PREPROCESSING

üîÑ Applying preprocessing...

üìù Sample before/after:

Original: Go until jurong point, crazy.. Available only in bugis n great world la e buffet...
Cleaned:  go until jurong point crazy available only in bugis n great world la e buffet ci...

Original: Ok lar... Joking wif u oni......
Cleaned:  ok lar joking wif u oni...

Original: Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 8...
Cleaned:  free entry in a wkly comp to win fa cup final tkts st may text fa to to receive ...


In [8]:
# 7. WORD FREQUENCY ANALYSIS

print("\n" + "="*70)
print("WORD FREQUENCY ANALYSIS")
print("="*70)

# Flatten each class's cleaned messages into one list of
# whitespace-separated tokens.
ham_mask = df['label'] == 'ham'
spam_mask = df['label'] == 'spam'
ham_words = ' '.join(df.loc[ham_mask, 'cleaned_message']).split()
spam_words = ' '.join(df.loc[spam_mask, 'cleaned_message']).split()

# The 20 most frequent tokens per class, as (word, count) pairs.
ham_common = Counter(ham_words).most_common(20)
spam_common = Counter(spam_words).most_common(20)

for heading, ranking in (("\nüî§ Top 10 Ham Words:", ham_common),
                         ("\nüî§ Top 10 Spam Words:", spam_common)):
    print(heading)
    for word, count in ranking[:10]:
        print(f"   {word}: {count}")

# Visualization - horizontal bars for the top 15 tokens of each class.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

panels = (
    (axes[0], ham_common, 'green', 'Top 15 Words in Ham Messages'),
    (axes[1], spam_common, 'red', 'Top 15 Words in Spam Messages'),
)
for ax, ranking, colour, title in panels:
    top_fifteen = ranking[:15]
    tokens = [token for token, freq in top_fifteen]
    frequencies = [freq for token, freq in top_fifteen]
    ax.barh(tokens, frequencies, color=colour, alpha=0.7)
    ax.set_xlabel('Frequency')
    ax.set_title(title)
    # Flip the axis so the most frequent token is drawn at the top.
    ax.invert_yaxis()

plt.tight_layout()
plt.savefig('../images/word_frequency.png', dpi=300, bbox_inches='tight')
print("\n‚úÖ Word frequency saved: ../images/word_frequency.png")
plt.close()


WORD FREQUENCY ANALYSIS

üî§ Top 10 Ham Words:
   i: 2075
   you: 1773
   to: 1474
   the: 1048
   a: 960
   u: 890
   and: 818
   in: 753
   me: 712
   my: 668

üî§ Top 10 Spam Words:
   to: 594
   a: 332
   call: 305
   you: 259
   your: 241
   √•¬£: 221
   free: 190
   for: 184
   the: 181
   now: 157

‚úÖ Word frequency saved: ../images/word_frequency.png


In [9]:
# 8. WORD CLOUDS

print("\n" + "="*70)
print("GENERATING WORD CLOUDS")
print("="*70)

# One entry per class:
# (label value, progress message, colormap, figure title, output file).
cloud_jobs = (
    ('ham', "\n‚òÅÔ∏è  Generating Ham word cloud...", 'Greens',
     'Word Cloud - Ham Messages', '../images/wordcloud_ham.png'),
    ('spam', "‚òÅÔ∏è  Generating Spam word cloud...", 'Reds',
     'Word Cloud - Spam Messages', '../images/wordcloud_spam.png'),
)

for category, progress_message, palette, title, output_path in cloud_jobs:
    print(progress_message)

    # All cleaned messages of this class joined into a single text.
    corpus = ' '.join(df.loc[df['label'] == category, 'cleaned_message'])
    cloud = WordCloud(width=800, height=400,
                      background_color='white',
                      colormap=palette).generate(corpus)

    plt.figure(figsize=(12, 6))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title, fontsize=16, pad=20)
    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()

print("‚úÖ Word clouds saved!")


GENERATING WORD CLOUDS

‚òÅÔ∏è  Generating Ham word cloud...
‚òÅÔ∏è  Generating Spam word cloud...
‚úÖ Word clouds saved!


In [10]:
# 9. SPECIAL CHARACTERS ANALYSIS

print("\n" + "="*70)
print("SPECIAL CHARACTERS ANALYSIS")
print("="*70)

messages = df['message']
ten_digit_run = re.compile(r'\d{10}')
currency_markers = ('$', '¬£', '‚Ç¨')

# 1/0 flags computed on the raw (uncleaned) message text.
df['has_url'] = messages.map(
    lambda msg: int('http' in msg.lower() or 'www' in msg.lower()))
df['has_currency'] = messages.map(
    lambda msg: int(any(marker in msg for marker in currency_markers)))
# A "phone number" here is any run of at least ten consecutive digits.
df['has_phone'] = messages.map(
    lambda msg: int(ten_digit_run.search(msg) is not None))

# Per-message counts of '!' characters and uppercase characters.
df['exclamation_count'] = messages.map(lambda msg: msg.count('!'))
df['capital_count'] = messages.map(lambda msg: sum(ch.isupper() for ch in msg))

print("\nüîç Special Character Patterns:")
# Split once per class (after the new columns exist), then report each flag
# as "count (share of that class)".
rows_by_label = {name: df[df['label'] == name] for name in ('ham', 'spam')}
for feature in ['has_url', 'has_currency', 'has_phone']:
    print(f"\n   {feature}:")
    for name, rows in rows_by_label.items():
        count = rows[feature].sum()
        total = len(rows)
        print(f"      {name}: {count} ({count/total*100:.1f}%)")


SPECIAL CHARACTERS ANALYSIS

üîç Special Character Patterns:

   has_url:
      ham: 2 (0.0%)
      spam: 91 (13.9%)

   has_currency:
      ham: 18 (0.4%)
      spam: 217 (33.2%)

   has_phone:
      ham: 1 (0.0%)
      spam: 355 (54.4%)


In [None]:
# 10. SAVE PROCESSED DATA
# NOTE: `json` is imported in the notebook's top import cell.

print("\n" + "="*70)
print("SAVING PROCESSED DATA")
print("="*70)

# Encode labels: ham -> 0, spam -> 1 (any other label value becomes NaN).
df['label_encoded'] = df['label'].map({'ham': 0, 'spam': 1})

# Save processed data
df.to_csv('../data/spam_processed.csv', index=False)
print("\n‚úÖ Processed data saved: ../data/spam_processed.csv")

# Derive the class statistics from the frame that was just written instead of
# from variables left behind by an earlier cell, so the summary cannot go
# stale (or raise NameError) when this cell is run on its own.
ham_rows = df['label'] == 'ham'
spam_rows = df['label'] == 'spam'

# Save summary (plain Python scalars so json can serialise them)
summary = {
    'total_messages': int(len(df)),
    'ham_messages': int(ham_rows.sum()),
    'spam_messages': int(spam_rows.sum()),
    'spam_percentage': float(spam_rows.mean() * 100),
    'avg_message_length': float(df['message_length'].mean()),
    'avg_word_count': float(df['word_count'].mean()),
    'spam_avg_length': float(df.loc[spam_rows, 'message_length'].mean()),
    'ham_avg_length': float(df.loc[ham_rows, 'message_length'].mean())
}

with open('../artifacts/eda_summary.json', 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=4)
print("‚úÖ Summary saved: ../artifacts/eda_summary.json")


SAVING PROCESSED DATA

‚úÖ Processed data saved: ../data/spam_processed.csv
‚úÖ Summary saved: ../artifacts/eda_summary.json


In [12]:
# 11. KEY INSIGHTS
# Prints the narrative summary. The figures interpolated below come from `df`
# and from `label_counts`, `label_percentages` and `spam_ratio`, which this
# cell does not define and expects to already exist.

print("\n" + "="*70)
print("KEY INSIGHTS")
print("="*70)

print(f"""
üìä Dataset Overview:
   - Total Messages: {len(df):,}
   - Ham Messages: {label_counts['ham']:,} ({label_percentages['ham']:.2f}%)
   - Spam Messages: {label_counts['spam']:,} ({label_percentages['spam']:.2f}%)

üîç Key Findings:

1. CLASS IMBALANCE
   - Moderate imbalance: {label_percentages['spam']:.1f}% spam
   - Ratio 1:{1/spam_ratio:.0f}

2. MESSAGE LENGTH
   - Spam messages are longer: {df[df['label']=='spam']['message_length'].mean():.0f} chars
   - Ham messages are shorter: {df[df['label']=='ham']['message_length'].mean():.0f} chars

3. SPAM INDICATORS
   - More URLs in spam messages
   - More currency symbols ($, ¬£) in spam
   - More phone numbers in spam
   - More exclamation marks in spam

4. COMMON SPAM WORDS
   - free, call, prize, win, urgent
   - txt, claim, guaranteed, cash

5. COMMON HAM WORDS
   - ok, thanks, got, will, can
   - love, home, day, time

üí° RECOMMENDATIONS:
   - Focus on text features (TF-IDF)
   - Handle class imbalance (class weights)
   - Use character/special char features
   - Consider n-grams (unigrams + bigrams)
""")

print("\n" + "="*70)
print("‚úÖ EXPLORATORY DATA ANALYSIS COMPLETED!")
print("="*70)

# The figures are written under ../images/ by the plotting cells; only the
# JSON summary goes to ../artifacts/. The list previously reported the PNGs
# under ../artifacts/, where they are never saved.
print("\nüìÅ Generated Files:")
print("   ‚úÖ ../images/spam_distribution.png")
print("   ‚úÖ ../images/message_length_analysis.png")
print("   ‚úÖ ../images/word_frequency.png")
print("   ‚úÖ ../images/wordcloud_ham.png")
print("   ‚úÖ ../images/wordcloud_spam.png")
print("   ‚úÖ ../artifacts/eda_summary.json")
print("   ‚úÖ ../data/spam_processed.csv")

print("\nüöÄ Next Steps:")
print("   1. Run model_training.ipynb")
print("   2. Use TF-IDF for feature extraction")
print("   3. Test multiple classifiers")


KEY INSIGHTS

üìä Dataset Overview:
   - Total Messages: 5,169
   - Ham Messages: 4,516 (87.37%)
   - Spam Messages: 653 (12.63%)

üîç Key Findings:

1. CLASS IMBALANCE
   - Moderate imbalance: 12.6% spam
   - Ratio 1:7

2. MESSAGE LENGTH
   - Spam messages are longer: 138 chars
   - Ham messages are shorter: 70 chars

3. SPAM INDICATORS
   - More URLs in spam messages
   - More currency symbols ($, ¬£) in spam
   - More phone numbers in spam
   - More exclamation marks in spam

4. COMMON SPAM WORDS
   - free, call, prize, win, urgent
   - txt, claim, guaranteed, cash

5. COMMON HAM WORDS
   - ok, thanks, got, will, can
   - love, home, day, time

üí° RECOMMENDATIONS:
   - Focus on text features (TF-IDF)
   - Handle class imbalance (class weights)
   - Use character/special char features
   - Consider n-grams (unigrams + bigrams)


‚úÖ EXPLORATORY DATA ANALYSIS COMPLETED!

üìÅ Generated Files:
   ‚úÖ ../artifacts/spam_distribution.png
   ‚úÖ ../artifacts/message_length_analysis.pn