In [6]:
import sys
from pathlib import Path

# Add parent directory to path so we can import src module
sys.path.insert(0, str(Path.cwd().parent))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress warnings from this point on: no category or module filter is given,
# so the 'ignore' action applies to every warning raised afterwards.
# NOTE(review): this also hides deprecation/numerical warnings from the
# libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Import custom modules from src directory for better organization
from src.preprocessing import (
    load_data,
    missing_value_handling,
    labels_encoding,
    clean_text,
    preprocess_data,
    get_text_stats,
    data_split,
    get_class_distribution,
    get_sample_message
)

from src.modeling import (
    vectorize_text,
    train_baseline_model,
    train_logistic_regression,
    train_decision_tree,
    save_model,
    get_feature_importance
)

from src.evaluation import (
    evaluate_model,
    print_evaluation_report,
    plot_confusion_matrix,
    analyze_errors,
    print_error_analysis,
    plot_roc_curve,
    compare_models,
    save_metrics_to_csv
)

# Reproducibility: seed NumPy's legacy global random number generator.
np.random.seed(42)

# Directory for figures, expressed relative to the notebook's working directory.
figures_path = Path("..") / "results" / "figures"

In [8]:
# Load Data & Basic check
#
# Reads the processed CSV into `df` and prints a quick structural summary:
# dimensions, column names, a preview of the first rows, and column dtypes.

df = pd.read_csv('../data/processed/spam_with_features.csv')

# Dimensions: the raw shape tuple, then rows and columns individually.
n_rows, n_cols = df.shape
print(f"Shape: {df.shape}")
print(f"Rows: {n_rows:,}")
print(f"Columns: {n_cols}")

# The remaining sections share one layout: a heading (preceded by a blank
# line) followed by the value itself.
for heading, value in (
    ("\nColumn names:", df.columns.tolist()),
    ("\nFirst few rows:", df.head()),
    ("\nData types:", df.dtypes),
):
    print(heading)
    print(value)

Shape: (5572, 25)
Rows: 5,572
Columns: 25

Column names:
['label', 'message', 'message_length', 'word_count', 'avg_word_length', 'num_exclamation', 'num_question', 'num_capitals', 'capital_ratio', 'has_numbers', 'has_currency', 'has_phone', 'has_url', 'urgency_count', 'money_count', 'action_count', 'has_time_words', 'starts_with_free', 'congratulations', 'youve_won', 'click_here', 'call_now', 'txt_to', 'reply_to', 'label_numeric']

First few rows:
  label                                            message  message_length  \
0   ham  Go until jurong point, crazy.. Available only ...             111   
1   ham                      Ok lar... Joking wif u oni...              29   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...             155   
3   ham  U dun say so early hor... U c already then say...              49   
4   ham  Nah I don't think he goes to usf, he lives aro...              61   

   word_count  avg_word_length  num_exclamation  num_question  num_capitals  \


In [10]:
# Data Prep & Preprocessing
#
# Brings `df` into the shape this cell declares as required:
#   1. reports missing values and, if any exist, passes `df` through
#      missing_value_handling,
#   2. creates the derived columns `cleaned_message` / `label_numeric` when
#      they are absent,
#   3. prints the class distribution of `label`.
# Raises KeyError when a source column needed for these steps is absent and
# ValueError when a label cannot be encoded, instead of continuing silently.

# Check for missing values
missing = df.isnull().sum()
print("\nMissing values:")
print(missing[missing > 0])

if missing.sum() > 0:
    df = missing_value_handling(df)
    print("✓ Missing values handled")

# Ensure we have required columns
required_cols = ['label', 'message', 'cleaned_message', 'label_numeric']
missing_cols = [col for col in required_cols if col not in df.columns]

# 'label' is read unconditionally further down (class distribution) and is the
# source for label_numeric, so stop here with a descriptive error rather than
# printing a warning and hitting a bare KeyError later.
if 'label' in missing_cols:
    raise KeyError(
        f"Required column 'label' not found; available columns: {df.columns.tolist()}"
    )

if missing_cols:
    print(f"\n⚠ Missing required columns: {missing_cols}")

    # Create cleaned_message if it doesn't exist
    if 'cleaned_message' not in df.columns:
        # preprocess_data is pointed at 'message'; without it there is nothing to clean.
        if 'message' not in df.columns:
            raise KeyError(
                "Cannot create 'cleaned_message': source column 'message' not found"
            )
        print("Creating cleaned_message column...")
        df = preprocess_data(df, text_col='message')
        print("✓ Text cleaned")

    # Create label_numeric if it doesn't exist
    if 'label_numeric' not in df.columns:
        print("Creating label_numeric column...")
        encoded_labels = df['label'].map({'ham': 0, 'spam': 1})
        # Series.map yields NaN for every value that is not a key of the
        # mapping (e.g. 'Spam', ' ham', or a missing label). Surface those
        # values instead of storing NaN targets and reporting success.
        unmapped = df.loc[encoded_labels.isna(), 'label'].unique().tolist()
        if unmapped:
            raise ValueError(
                f"Cannot encode labels: unexpected value(s) in 'label': {unmapped}"
            )
        df['label_numeric'] = encoded_labels
        print("✓ Labels encoded")
else:
    print("✓ All required columns present")

# Verify class distribution
class_dist = df['label'].value_counts()
print("\nClass distribution:")
print(class_dist)
print("\nClass balance:")
for label, count in class_dist.items():
    # Share of all rows in df (value_counts itself skips missing labels).
    pct = count / len(df) * 100
    print(f"  {label}: {count:,} ({pct:.1f}%)")


Missing values:
Series([], dtype: int64)

⚠ Missing required columns: ['cleaned_message']
Creating cleaned_message column...
✓ Text cleaned

Class distribution:
label
ham     4825
spam     747
Name: count, dtype: int64

Class balance:
  ham: 4,825 (86.6%)
  spam: 747 (13.4%)


In [12]:
# Feature Selection

# Hand-picked feature columns chosen from the EDA. The author describes them
# as spanning three tiers (must have, strong, moderate); the list itself does
# not mark where one tier ends and the next begins.
custom_features = [
    'has_phone', 'has_numbers', 'money_count', 'action_count',
    'has_currency', 'message_length', 'has_url', 'word_count',
]

print("\nCustom features selected (from EDA):")
for position, feature_name in enumerate(custom_features, start=1):
    print(f"  {position}. {feature_name}")

# Verify all features exist: split the candidates, in their original order,
# by whether the column is present in df.
available_features, missing_features = [], []
for feature_name in custom_features:
    bucket = available_features if feature_name in df.columns else missing_features
    bucket.append(feature_name)

print(f"\n Available: {len(available_features)}/{len(custom_features)}")
if missing_features:
    print(f"Missing: {missing_features}")
    # Continue with only the columns that are actually present.
    custom_features = available_features



Custom features selected (from EDA):
  1. has_phone
  2. has_numbers
  3. money_count
  4. action_count
  5. has_currency
  6. message_length
  7. has_url
  8. word_count

 Available: 8/8
