# Email Spam Detection - Exploratory Data Analysis

*This notebook performs comprehensive EDA on the emails dataset to understand spam vs ham characteristics.*

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
# Silence every warning category for the rest of the session (keeps cell output tidy,
# but also hides deprecation notices from pandas/matplotlib).
warnings.simplefilter('ignore')

# Plot defaults shared by all figures below: seaborn grid theme, 12x6 inch canvas.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

: 

## 1. Load and Explore Data

In [None]:
# Load the email corpus. The path is relative to the kernel's working directory.
df = pd.read_csv('Data/emails.csv')

# Fail fast with a readable message: later cells index df['text'] and df['spam'],
# and would otherwise die with a bare KeyError far from the cause.
missing_columns = sorted({'text', 'spam'} - set(df.columns))
assert not missing_columns, f"Data/emails.csv is missing expected columns: {missing_columns}"

print("Dataset Shape:", df.shape)
print("\nColumn Names and Types:")
print(df.dtypes)
print("\nFirst few rows:")
# Bare last expression so the notebook renders the preview as a rich table.
df.head()

## 2. Check for Missing Values

In [None]:
# Tally null entries per column, then express the same tally as a share of all rows.
null_counts = df.isnull().sum()
print("Missing Values:")
print(null_counts)

null_share = (null_counts / len(df) * 100).round(2)
print("\nMissing Percentage:")
print(null_share)

## 3. Statistical Summary

In [None]:
# Summary statistics via pandas' describe(). With default arguments a frame that
# mixes numeric and non-numeric columns is summarised on its numeric columns only,
# so a free-text column would not appear here (pass include='all' to see it).
df.describe()

## 4. Spam Distribution Analysis

In [None]:
# Class balance: how many rows carry each value of the `spam` label.
spam_counts = df['spam'].value_counts()
total_emails = len(df)

print("Spam Distribution:")
print(spam_counts)

# Label 1 is reported as spam and label 0 as ham, each as a share of all rows.
spam_share = spam_counts[1] / total_emails * 100
print(f"\nSpam Percentage: {spam_share:.2f}%")
ham_share = spam_counts[0] / total_emails * 100
print(f"Ham Percentage: {ham_share:.2f}%")

## 5. Text Feature Engineering

In [None]:
# Create text features
df['text_length'] = df['text'].str.len()
df['word_count'] = df['text'].str.split().str.len()

print("Text Length Statistics:")
print(df['text_length'].describe())
print("\nWord Count Statistics:")
print(df['word_count'].describe())

## 6. Spam vs Ham Characteristics

In [None]:
# Per-class summary of the engineered size features; one row per value of `spam`.
size_features = ['text_length', 'word_count']
summary_stats = ['mean', 'median', 'std', 'min', 'max']
spam_analysis = df.groupby('spam')[size_features].agg(summary_stats)

print("Spam vs Ham Characteristics:")
print(spam_analysis)

## 7. Visualizations

In [None]:
# Four-panel overview: class balance, text-length and word-count histograms,
# and a text-length box plot. The figure is also written to eda_visualizations.png.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

spam_labels = ['Ham', 'Spam']      # display names for spam == 0 and spam == 1, in that order
colors = ['#2ecc71', '#e74c3c']    # green for ham, red for spam

# Split once per class instead of re-filtering for every panel.
ham_rows = df[df['spam'] == 0]
spam_rows = df[df['spam'] == 1]

# 1. Spam Distribution
# value_counts() orders by frequency, not by label, so its raw .values only line up
# with ['Ham', 'Spam'] when ham happens to be the majority. Reindexing to [0, 1]
# pins the slice order to the labels and colours regardless of class balance.
class_counts = df['spam'].value_counts().reindex([0, 1], fill_value=0)
axes[0, 0].pie(class_counts.values, labels=spam_labels, autopct='%1.1f%%', colors=colors, startangle=90)
axes[0, 0].set_title('Spam vs Ham Distribution', fontsize=12, fontweight='bold')

# 2. Text Length Distribution
# dropna(): rows with missing text have NaN length, which makes hist() fail when
# it tries to auto-detect the bin range.
length_by_class = [ham_rows['text_length'].dropna(), spam_rows['text_length'].dropna()]
axes[0, 1].hist(length_by_class, label=spam_labels, bins=50, color=colors, alpha=0.7)
axes[0, 1].set_xlabel('Text Length (characters)')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Text Length Distribution', fontsize=12, fontweight='bold')
axes[0, 1].legend()

# 3. Word Count Distribution
words_by_class = [ham_rows['word_count'].dropna(), spam_rows['word_count'].dropna()]
axes[1, 0].hist(words_by_class, label=spam_labels, bins=50, color=colors, alpha=0.7)
axes[1, 0].set_xlabel('Word Count')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].set_title('Word Count Distribution', fontsize=12, fontweight='bold')
axes[1, 0].legend()

# 4. Box plot comparison
# The `labels=` keyword of boxplot() is deprecated in recent matplotlib releases;
# setting the tick labels afterwards works on both old and new versions.
data_to_plot = length_by_class
axes[1, 1].boxplot(data_to_plot)
axes[1, 1].set_xticklabels(spam_labels)
axes[1, 1].set_ylabel('Text Length (characters)')
axes[1, 1].set_title('Text Length Comparison (Box Plot)', fontsize=12, fontweight='bold')

plt.tight_layout()
# Save before show(): some backends clear the figure once it has been displayed.
plt.savefig('eda_visualizations.png', dpi=300, bbox_inches='tight')
plt.show()

# The original message contained a mis-encoded check mark; restored to the intended glyph.
print("✓ Visualization saved as 'eda_visualizations.png'")

## 8. Additional Insights

In [None]:
def _top_first_words(label, n=10):
    """Return the `n` most frequent first tokens among emails whose `spam` equals `label`."""
    first_tokens = df.loc[df['spam'] == label, 'text'].str.split().str[0]
    return first_tokens.value_counts().head(n)


print("Top 10 most common starting words in Spam emails:")
spam_texts = _top_first_words(1)
print(spam_texts)

print("\nTop 10 most common starting words in Ham emails:")
ham_texts = _top_first_words(0)
print(ham_texts)

## 9. Summary Report

In [None]:
# Final report. Counts, averages and the KEY FINDINGS are all computed from `df`
# in this cell, so the text cannot contradict the numbers printed above it and the
# cell does not depend on variables left behind by other cells.
# Requires the 'text_length' and 'word_count' columns to exist on `df`.
total_emails = len(df)
is_spam = df['spam'] == 1
is_ham = df['spam'] == 0
n_spam = int(is_spam.sum())
n_ham = int(is_ham.sum())


def _pct(count):
    """Share of `count` in the whole dataset, in percent (0.0 for an empty frame)."""
    return count / total_emails * 100 if total_emails else 0.0


spam_len_mean = df.loc[is_spam, 'text_length'].mean()
ham_len_mean = df.loc[is_ham, 'text_length'].mean()
spam_words_mean = df.loc[is_spam, 'word_count'].mean()
ham_words_mean = df.loc[is_ham, 'word_count'].mean()

print("="*80)
print("EMAIL SPAM DETECTION - EDA SUMMARY")
print("="*80)
print(f"\nTotal Emails: {total_emails}")
print(f"Spam Emails: {n_spam} ({_pct(n_spam):.2f}%)")
print(f"Ham Emails: {n_ham} ({_pct(n_ham):.2f}%)")

print(f"\nAverage Text Length (all): {df['text_length'].mean():.2f} characters")
print(f"Average Text Length (spam): {spam_len_mean:.2f} characters")
print(f"Average Text Length (ham): {ham_len_mean:.2f} characters")

print(f"\nAverage Word Count (all): {df['word_count'].mean():.2f} words")
print(f"Average Word Count (spam): {spam_words_mean:.2f} words")
print(f"Average Word Count (ham): {ham_words_mean:.2f} words")

# Length finding: a mean is NaN when a class has no rows (or no non-null text), in
# which case no direction can be reported.
if pd.isna(spam_len_mean) or pd.isna(ham_len_mean):
    length_finding = "- Average length could not be compared (a class has no usable emails)"
elif spam_len_mean > ham_len_mean:
    length_finding = "- Spam emails tend to be longer than ham emails"
elif spam_len_mean < ham_len_mean:
    length_finding = "- Spam emails tend to be shorter than ham emails"
else:
    length_finding = "- Spam and ham emails have the same average length"

# Balance finding: name whichever class actually has more rows.
if n_spam == n_ham:
    balance_finding = "- The dataset is balanced between spam and ham"
else:
    majority, minority = ('spam', 'ham') if n_spam > n_ham else ('ham', 'spam')
    balance_finding = f"- The dataset is imbalanced with more {majority} than {minority}"

print("\nKEY FINDINGS:")
print(length_finding)
print(balance_finding)
print("- Text length and word count are potential features for classification")
print("="*80)