# 📧 Email Classification - Feature Selection Practice

**Learning Goal:** Practice feature selection techniques using a real-world email classification problem.

## Prerequisites

In [77]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif, RFE, VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

---

## 🎯 STEP 1: Create Email Dataset

### Q1: What makes a good classification dataset?

**Answer:** A dataset with a naturally categorical target variable that we want to predict.

**Our Problem:** Classify emails as **SPAM**, **PROMOTIONAL**, or **PERSONAL**

In [78]:
# Create an email dataset
email_data = {
    'word_count': [
        # Personal emails (10 samples) - typically shorter, casual
        45, 15, 60, 35, 40, 55, 30, 65, 38, 42,
        # Promotional emails (10 samples) - medium length, business-like
        120, 80, 150, 90, 170, 110, 95, 140, 85, 130,
        # Spam emails (10 samples) - often longer, sales-heavy
        200, 300, 250, 280, 220, 350, 180, 320, 270, 240
    ],
    'exclamation_marks': [
        # Personal emails - few exclamation marks
        0, 1, 1, 1, 0, 2, 0, 1, 1, 0,
        # Promotional emails - moderate exclamation marks
        3, 2, 4, 3, 5, 2, 3, 4, 2, 3,
        # Spam emails - many exclamation marks
        8, 12, 9, 11, 7, 15, 6, 13, 10, 8
    ],
    'capital_letters': [
        # Personal emails - few capitals
        5, 3, 8, 6, 7, 9, 4, 10, 6, 5,
        # Promotional emails - moderate capitals
        25, 12, 20, 15, 28, 18, 14, 22, 16, 24,
        # Spam emails - many capitals (shouting)
        45, 60, 50, 55, 40, 70, 35, 65, 58, 48
    ],
    'links_count': [
        # Personal emails - very few links
        0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
        # Promotional emails - some links
        2, 1, 3, 2, 3, 2, 1, 3, 2, 2,
        # Spam emails - many links
        5, 8, 6, 7, 4, 9, 3, 8, 6, 5
    ],
    'time_of_day': [
        # Personal emails - varied times, including evenings/weekends
        9, 22, 20, 21, 7, 19, 8, 23, 18, 6,
        # Promotional emails - business hours
        14, 16, 15, 17, 14, 10, 13, 16, 11, 15,
        # Spam emails - often odd hours
        10, 11, 13, 12, 2, 3, 1, 4, 23, 0
    ],
    'sender_domain': [
        # Personal emails - personal domains
        'gmail', 'gmail', 'gmail', 'hotmail', 'gmail', 'yahoo', 'hotmail', 'gmail', 'yahoo', 'gmail',
        # Promotional emails - company domains
        'company', 'work', 'company', 'work', 'company', 'work', 'company', 'work', 'company', 'work',
        # Spam emails - suspicious domains
        'promo', 'ads', 'promo', 'ads', 'promo', 'ads', 'promo', 'ads', 'promo', 'ads'
    ],
    'has_attachment': [
        # Personal emails - sometimes attachments
        0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
        # Promotional emails - often have attachments (brochures, etc.)
        1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        # Spam emails - rarely legitimate attachments
        0, 0, 0, 0, 1, 0, 0, 0, 0, 1
    ],
    'weekend_sent': [
        # Personal emails - often sent on weekends
        1, 1, 1, 1, 1, 0, 1, 0, 0, 1,
        # Promotional emails - usually business days
        0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
        # Spam emails - sent any time
        0, 1, 0, 0, 1, 0, 1, 0, 1, 0
    ],
    'email_type': [
        # 10 Personal emails
        'Personal', 'Personal', 'Personal', 'Personal', 'Personal', 
        'Personal', 'Personal', 'Personal', 'Personal', 'Personal',
        # 10 Promotional emails
        'Promotional', 'Promotional', 'Promotional', 'Promotional', 'Promotional',
        'Promotional', 'Promotional', 'Promotional', 'Promotional', 'Promotional',
        # 10 Spam emails
        'Spam', 'Spam', 'Spam', 'Spam', 'Spam',
        'Spam', 'Spam', 'Spam', 'Spam', 'Spam'
    ]
}

df = pd.DataFrame(email_data)

print("📧 Email Classification Dataset:")
print(df.head(10))
print(f"\nDataset shape: {df.shape}")
print(f"\nTarget distribution:")
print(df['email_type'].value_counts())

📧 Email Classification Dataset:
   word_count  exclamation_marks  capital_letters  links_count  time_of_day  \
0          45                  0                5            0            9   
1          15                  1                3            0           22   
2          60                  1                8            1           20   
3          35                  1                6            0           21   
4          40                  0                7            1            7   
5          55                  2                9            1           19   
6          30                  0                4            0            8   
7          65                  1               10            1           23   
8          38                  1                6            0           18   
9          42                  0                5            0            6   

  sender_domain  has_attachment  weekend_sent email_type  
0         gmail               0        

---

## 🔍 STEP 2: Understand Your Features

### Q2: What information helps identify spam emails?

**Answer:** Look at patterns that distinguish between email types.

In [79]:
print("\nFeature descriptions:")
feature_descriptions = {
    'word_count': 'Number of words in email',
    'exclamation_marks': 'Number of ! marks',
    'capital_letters': 'Number of CAPITAL letters',
    'links_count': 'Number of hyperlinks',
    'time_of_day': 'Hour email was sent (0-23)',
    'sender_domain': 'Email domain type',
    'has_attachment': 'Has file attachment (0/1)',
    'weekend_sent': 'Sent on weekend (0/1)'
}

for feature, description in feature_descriptions.items():
    print(f"• {feature}: {description}")

# Quick analysis by email type
print("\n📊 Feature patterns by email type:")
for email_type in df['email_type'].unique():
    subset = df[df['email_type'] == email_type]
    print(f"\n{email_type} emails:")
    print(f"  Avg exclamation marks: {subset['exclamation_marks'].mean():.1f}")
    print(f"  Avg capital letters: {subset['capital_letters'].mean():.1f}")
    print(f"  Avg links: {subset['links_count'].mean():.1f}")


Feature descriptions:
• word_count: Number of words in email
• exclamation_marks: Number of ! marks
• capital_letters: Number of CAPITAL letters
• links_count: Number of hyperlinks
• time_of_day: Hour email was sent (0-23)
• sender_domain: Email domain type
• has_attachment: Has file attachment (0/1)
• weekend_sent: Sent on weekend (0/1)

📊 Feature patterns by email type:

Personal emails:
  Avg exclamation marks: 0.7
  Avg capital letters: 6.3
  Avg links: 0.4

Promotional emails:
  Avg exclamation marks: 3.1
  Avg capital letters: 19.4
  Avg links: 2.1

Spam emails:
  Avg exclamation marks: 9.9
  Avg capital letters: 52.6
  Avg links: 6.1


**Key Insight:** Spam emails have more exclamation marks, capital letters, and links!

---

## 🔧 STEP 3: Handle Categorical Variables

### Q3: How do we convert text features to numbers?

**Answer:** Use dummy variables (one-hot encoding) to convert categories into 0/1 columns.

In [80]:
print("Before encoding:")
print(f"Columns: {list(df.columns)}")
print(f"Unique domains: {df['sender_domain'].unique()}")

# Convert sender_domain to dummy variables
df_encoded = pd.get_dummies(df, columns=['sender_domain'], prefix='domain')

print("\nAfter encoding:")
print(f"Columns: {list(df_encoded.columns)}")
print(f"New domain columns: {[col for col in df_encoded.columns if col.startswith('domain_')]}")

# Show the encoding result
print("\nSample of encoded domain features:")
domain_cols = [col for col in df_encoded.columns if col.startswith('domain_')]
print(df_encoded[['sender_domain'] + domain_cols].head() if 'sender_domain' in df_encoded.columns else df_encoded[domain_cols].head())

Before encoding:
Columns: ['word_count', 'exclamation_marks', 'capital_letters', 'links_count', 'time_of_day', 'sender_domain', 'has_attachment', 'weekend_sent', 'email_type']
Unique domains: ['gmail' 'hotmail' 'yahoo' 'company' 'work' 'promo' 'ads']

After encoding:
Columns: ['word_count', 'exclamation_marks', 'capital_letters', 'links_count', 'time_of_day', 'has_attachment', 'weekend_sent', 'email_type', 'domain_ads', 'domain_company', 'domain_gmail', 'domain_hotmail', 'domain_promo', 'domain_work', 'domain_yahoo']
New domain columns: ['domain_ads', 'domain_company', 'domain_gmail', 'domain_hotmail', 'domain_promo', 'domain_work', 'domain_yahoo']

Sample of encoded domain features:
   domain_ads  domain_company  domain_gmail  domain_hotmail  domain_promo  \
0       False           False          True           False         False   
1       False           False          True           False         False   
2       False           False          True           False         False   

---

## 🗑️ STEP 4: Remove Useless Features

### Q4: What features should we always remove?

**Answer:** ID columns, random noise, and duplicate information.

In [81]:
# Let's add some bad features for practice
df_encoded['email_id'] = range(1, len(df_encoded) + 1)  # ID column
df_encoded['random_noise'] = np.random.randint(1, 100, len(df_encoded))  # Random numbers
df_encoded['duplicate_word_count'] = df_encoded['word_count']  # Duplicate feature

print("❌ Added some useless features:")
print("• email_id: Just an ID number")
print("• random_noise: Random meaningless numbers") 
print("• duplicate_word_count: Same as word_count")

print(f"\nBefore removal: {len(df_encoded.columns)} features")
print(f"All features: {list(df_encoded.columns)}")

# Remove the obvious bad features
features_to_remove = ['email_id', 'random_noise', 'duplicate_word_count']
df_clean = df_encoded.drop(columns=features_to_remove)

print(f"\nAfter removal: {len(df_clean.columns)} features")
print(f"✅ Remaining: {list(df_clean.columns)}")

❌ Added some useless features:
• email_id: Just an ID number
• random_noise: Random meaningless numbers
• duplicate_word_count: Same as word_count

Before removal: 18 features
All features: ['word_count', 'exclamation_marks', 'capital_letters', 'links_count', 'time_of_day', 'has_attachment', 'weekend_sent', 'email_type', 'domain_ads', 'domain_company', 'domain_gmail', 'domain_hotmail', 'domain_promo', 'domain_work', 'domain_yahoo', 'email_id', 'random_noise', 'duplicate_word_count']

After removal: 15 features
✅ Remaining: ['word_count', 'exclamation_marks', 'capital_letters', 'links_count', 'time_of_day', 'has_attachment', 'weekend_sent', 'email_type', 'domain_ads', 'domain_company', 'domain_gmail', 'domain_hotmail', 'domain_promo', 'domain_work', 'domain_yahoo']


**Key Rule:** If a human can't explain why a feature would help, remove it!

---

## 📊 STEP 5: Statistical Feature Selection

### Q5: How do we find the most important features automatically?

**Answer:** Use statistical tests to measure how well each feature predicts the target.

In [82]:
# Prepare features and target
feature_columns = [col for col in df_clean.columns if col != 'email_type']
X = df_clean[feature_columns]
y = df_clean['email_type']

print(f"Features to analyze: {len(X.columns)}")
print(f"Target classes: {list(y.unique())}")

Features to analyze: 14
Target classes: ['Personal', 'Promotional', 'Spam']


### Method A: Filter Method (SelectKBest)


In [83]:
print("\n🔍 Method A: Statistical Filter (SelectKBest)")
selector = SelectKBest(score_func=f_classif, k=5)
X_selected = selector.fit_transform(X, y)

print("Feature importance scores:")
feature_scores = selector.scores_
for feature, score in zip(X.columns, feature_scores):
    print(f"  {feature}: {score:.2f}")

selected_features = X.columns[selector.get_support()]
print(f"\n✅ Top 5 features selected: {list(selected_features)}")


🔍 Method A: Statistical Filter (SelectKBest)
Feature importance scores:
  word_count: 92.23
  exclamation_marks: 71.58
  capital_letters: 110.25
  links_count: 57.51
  time_of_day: 4.52
  has_attachment: 5.26
  weekend_sent: 2.80
  domain_ads: 9.00
  domain_company: 9.00
  domain_gmail: 13.50
  domain_hotmail: 2.25
  domain_promo: 9.00
  domain_work: 9.00
  domain_yahoo: 2.25

✅ Top 5 features selected: ['word_count', 'exclamation_marks', 'capital_letters', 'links_count', 'domain_gmail']


### Method B: Wrapper Method (RFE)


In [84]:
print("\n🔄 Method B: Wrapper Method (RFE)")
rfe_model = LogisticRegression(random_state=42, max_iter=10000)
rfe_selector = RFE(rfe_model, n_features_to_select=5)
rfe_selector.fit(X, y)

rfe_selected = X.columns[rfe_selector.support_]
print(f"✅ RFE selected features: {list(rfe_selected)}")

# Show feature rankings
print("\nFeature rankings (1 = best):")
for feature, rank in zip(X.columns, rfe_selector.ranking_):
    print(f"  {feature}: {rank}")


🔄 Method B: Wrapper Method (RFE)
✅ RFE selected features: ['word_count', 'exclamation_marks', 'capital_letters', 'time_of_day', 'has_attachment']

Feature rankings (1 = best):
  word_count: 1
  exclamation_marks: 1
  capital_letters: 1
  links_count: 7
  time_of_day: 1
  has_attachment: 1
  weekend_sent: 6
  domain_ads: 10
  domain_company: 3
  domain_gmail: 2
  domain_hotmail: 9
  domain_promo: 5
  domain_work: 4
  domain_yahoo: 8


### Method C: Embedded Method (Random Forest)


In [85]:
print("\n🌳 Method C: Embedded Method (Random Forest)")
rf = RandomForestClassifier(n_estimators=3000, random_state=42)
rf.fit(X, y)

print("Random Forest feature importance:")
importance_scores = rf.feature_importances_
feature_importance = list(zip(X.columns, importance_scores))
feature_importance.sort(key=lambda x: x[1], reverse=True)

for feature, importance in feature_importance:
    print(f"  {feature}: {importance:.3f}")

# Select features above threshold
threshold = 0.05
important_features = [feature for feature, importance in feature_importance if importance > threshold]
print(f"\n✅ Important features (>{threshold}): {important_features}")


🌳 Method C: Embedded Method (Random Forest)
Random Forest feature importance:
  word_count: 0.230
  capital_letters: 0.214
  exclamation_marks: 0.190
  links_count: 0.140
  time_of_day: 0.056
  domain_gmail: 0.032
  domain_company: 0.028
  domain_work: 0.027
  domain_promo: 0.024
  has_attachment: 0.020
  domain_ads: 0.019
  weekend_sent: 0.010
  domain_yahoo: 0.006
  domain_hotmail: 0.004

✅ Important features (>0.05): ['word_count', 'capital_letters', 'exclamation_marks', 'links_count', 'time_of_day']


---

## 🚀 STEP 6: Build Complete Pipeline

### Q6: How do we combine all preprocessing steps?

**Answer:** Use a Pipeline to chain scaling, selection, and classification together.

In [86]:
# Create comprehensive pipeline
pipeline = Pipeline([
    ('variance_filter', VarianceThreshold(threshold=0)),  # Remove zero variance
    ('scaler', StandardScaler()),                         # Scale features
    ('selector', SelectKBest(score_func=f_classif, k=5)), # Select top 5
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Train the pipeline
pipeline.fit(X_train, y_train)

# Evaluate the pipeline
accuracy = pipeline.score(X_test, y_test)
print(f"\n✅ Pipeline accuracy: {accuracy:.3f}")

# Show which features were selected by the pipeline
# First get features that survived variance filtering
variance_mask = pipeline.named_steps['variance_filter'].get_support()
features_after_variance = X.columns[variance_mask]

# Then apply selector mask to the remaining features
selector_mask = pipeline.named_steps['selector'].get_support()
final_selected = features_after_variance[selector_mask]

print(f"📋 Pipeline selected features: {list(final_selected)}")

Training set size: 21
Test set size: 9

✅ Pipeline accuracy: 1.000
📋 Pipeline selected features: ['word_count', 'exclamation_marks', 'capital_letters', 'links_count', 'domain_gmail']


---

## 🎯 STEP 7: Make Predictions

### Q7: How well does our model perform?

**Answer:** Test it on new data and analyze the results.

In [87]:
# Make predictions
predictions = pipeline.predict(X_test)
probabilities = pipeline.predict_proba(X_test)

print("Sample predictions:")
class_names = pipeline.classes_
for i in range(len(X_test)):
    actual = y_test.iloc[i]
    predicted = predictions[i]
    confidence = max(probabilities[i])
    print(f"Email {i+1}: Actual={actual:12} | Predicted={predicted:12} | Confidence={confidence:.2f}")


Sample predictions:
Email 1: Actual=Spam         | Predicted=Spam         | Confidence=0.99
Email 2: Actual=Promotional  | Predicted=Promotional  | Confidence=0.72
Email 3: Actual=Spam         | Predicted=Spam         | Confidence=0.97
Email 4: Actual=Promotional  | Predicted=Promotional  | Confidence=0.71
Email 5: Actual=Personal     | Predicted=Personal     | Confidence=0.62
Email 6: Actual=Personal     | Predicted=Personal     | Confidence=0.96
Email 7: Actual=Spam         | Predicted=Spam         | Confidence=0.96
Email 8: Actual=Spam         | Predicted=Spam         | Confidence=0.72
Email 9: Actual=Promotional  | Predicted=Promotional  | Confidence=0.72
