In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import pickle
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from scipy import sparse
import matplotlib.pyplot as plt

In [2]:
# Load dataset
print("Loading data...")
data = pd.read_csv('spam_emails.csv')
print(f"Data loaded: {data.shape}")
print(data.head())

Loading data...
Data loaded: (84, 3)
                                               title  \
0                          ?? the secrets to SUCCESS   
1                    ?? You Earned 500 GCLoot Points   
2                         ?? Your GitHub launch code   
3  [The Virtual Reward Center] Re: ** Clarifications   
4  10-1 MLB Expert Inside, Plus Everything You Ne...   

                                                text      type  
0  Hi James,\n\nHave you claim your complimentary...      spam  
1  \nalt_text\nCongratulations, you just earned\n...  not spam  
2  Here's your GitHub launch code, @Mortyj420!\n ...  not spam  
3  Hello,\n \nThank you for contacting the Virtua...  not spam  
4  Hey Prachanda Rawal,\n\nToday's newsletter is ...      spam  


In [3]:
def preprocess_and_clean(data):
    """Preprocess and clean spam data"""
    # Your CSV has columns: title, text, type
    text_col = 'text'
    label_col = 'type'
    
    print(f"Original shape: {data.shape}")
    print(f"Columns: {data.columns.tolist()}")

    # Clean text - LESS aggressive cleaning
    def clean_text(text):
        if pd.isna(text):
            return ""
        text = str(text).lower()
        text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
        text = re.sub(r'\\n', ' ', text)  # Replace \n with space
        text = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', text)  # Keep more characters
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    data['cleaned_text'] = data[text_col].apply(clean_text)
    
    # Check if we have valid text
    print(f"Texts before filtering: {len(data)}")
    print(f"Sample cleaned text: {data['cleaned_text'].iloc[0][:100]}")
    
    # Only remove truly empty texts
    data = data[data['cleaned_text'].str.len() > 5]  # Keep texts with at least 5 chars
    print(f"Texts after filtering: {len(data)}")

    # Convert labels
    print(f"\nUnique labels found: {data[label_col].unique()}")
    
    # Map 'spam' and 'not spam' to 0 and 1
    label_mapping = {
        'spam': 1,
        'not spam': 0,
        'ham': 0
    }
    
    data['label'] = data[label_col].map(label_mapping)
    
    # Check for unmapped labels
    if data['label'].isna().any():
        print(f"Warning: Some labels couldn't be mapped.")
        print(f"Unmapped values: {data[data['label'].isna()][label_col].unique()}")
        data = data.dropna(subset=['label'])
        print(f"Dropped {data['label'].isna().sum()} rows with invalid labels")
    
    data['label'] = data['label'].astype(int)
    print(f"Label distribution:\n{data['label'].value_counts()}")

    # Extract features
    data['text_length'] = data['cleaned_text'].str.len()
    data['word_count'] = data['cleaned_text'].apply(lambda x: len(str(x).split()))
    data['uppercase_ratio'] = data[text_col].apply(
        lambda x: sum(1 for c in str(x) if c.isupper()) / max(len(str(x)), 1)
    )

    # Remove duplicates
    initial = len(data)
    data = data.drop_duplicates(subset=['cleaned_text'])
    print(f"Removed {initial - len(data)} duplicates")

    # Handle outliers
    for col in ['text_length', 'word_count']:
        Q1 = data[col].quantile(0.25)
        Q3 = data[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        data = data[(data[col] >= lower) & (data[col] <= upper)]

    print(f"Final shape: {data.shape}")
    return data

cleaned_data = preprocess_and_clean(data)

Original shape: (84, 3)
Columns: ['title', 'text', 'type']
Texts before filtering: 84
Sample cleaned text: hi james, have you claim your complimentary gift yet? ive compiled in here a special astrology gift 
Texts after filtering: 84

Unique labels found: ['spam' 'not spam']
Label distribution:
label
0    58
1    26
Name: count, dtype: int64
Removed 2 duplicates
Final shape: (72, 8)


In [None]:
# Vectorize features
print("\nVectorizing features...")

# Check cleaned text
print(f"Sample cleaned texts:")
print(cleaned_data['cleaned_text'].head())
print(f"\nNon-empty texts: {(cleaned_data['cleaned_text'].str.len() > 0).sum()}")

# TF-IDF with more lenient settings
tfidf = TfidfVectorizer(
    max_features=3000, 
    stop_words='english', 
    ngram_range=(1,2),
    min_df=1,  # Include words that appear at least once
    max_df=0.95  # Exclude words that appear in >95% of documents
)

try:
    X_tfidf = tfidf.fit_transform(cleaned_data['cleaned_text'])
    print(f"TF-IDF vocabulary size: {len(tfidf.vocabulary_)}")
except ValueError as e:
    print(f"Error: {e}")
    print("Trying without stop words...")
    # Retry without stop words if vocabulary is empty
    tfidf = TfidfVectorizer(
        max_features=3000, 
        ngram_range=(1,2),
        min_df=1
    )
    X_tfidf = tfidf.fit_transform(cleaned_data['cleaned_text'])

# Numeric features - DON'T scale for Naive Bayes compatibility
X_numeric = cleaned_data[['text_length', 'word_count', 'uppercase_ratio']].values

# Combine
X = sparse.hstack([X_tfidf, X_numeric])
y = cleaned_data['label'].values

print(f"Features shape: {X.shape}")

# Save preprocessing tools
with open('tfidf_vectoriser.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

# Save a scaler for the API (even though we don't use it for training)
scaler = StandardScaler()
scaler.fit(cleaned_data[['text_length', 'word_count', 'uppercase_ratio']])
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)


Vectorizing features...
Sample cleaned texts:
0    hi james, have you claim your complimentary gi...
1    alttext congratulations, you just earned 500 y...
2    heres your github launch code, mortyj420! an o...
3    hello, thank you for contacting the virtual re...
5    model casting call thank you for taking the ti...
Name: cleaned_text, dtype: object

Non-empty texts: 72
TF-IDF vocabulary size: 3000
Features shape: (72, 3003)
Saved: tfidf_vectoriser.pkl, scaler.pkl


In [5]:
# Load and train ML models
print("\n" + "="*50)
print("TRAINING MODELS")
print("="*50)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train: {X_train.shape}, Test: {X_test.shape}")

# Define models
models = {
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42)
}

results = {}

# Train each model
for name, model in models.items():
    print(f"\n=== {name} ===")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    results[name] = {
        'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'model': model
    }
    
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}")

# Summary
print("\n" + "="*70)
print("MODEL COMPARISON SUMMARY")
print("="*70)
print(f"{'Model':<20} {'Accuracy':<12} {'Precision':<12} {'Recall':<12} {'F1-Score':<12}")
print("-"*70)
for name, res in results.items():
    print(f"{name:<20} {res['acc']:<12.4f} {res['prec']:<12.4f} {res['rec']:<12.4f} {res['f1']:<12.4f}")

# Best model
best_name = max(results, key=lambda x: results[x]['f1'])
best_model = results[best_name]['model']
print(f"\nBEST MODEL: {best_name} (F1-Score: {results[best_name]['f1']:.4f})")

# Final evaluation
y_pred = best_model.predict(X_test)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Ham', 'Spam']))



TRAINING MODELS
Train: (57, 3003), Test: (15, 3003)

=== Naive Bayes ===
Accuracy : 0.6667
Precision: 0.0000
Recall   : 0.0000
F1-score : 0.0000

=== Random Forest ===
Accuracy : 0.7333
Precision: 1.0000
Recall   : 0.2000
F1-score : 0.3333

=== Logistic Regression ===
Accuracy : 0.8000
Precision: 0.7500
Recall   : 0.6000
F1-score : 0.6667

MODEL COMPARISON SUMMARY
Model                Accuracy     Precision    Recall       F1-Score    
----------------------------------------------------------------------
Naive Bayes          0.6667       0.0000       0.0000       0.0000      
Random Forest        0.7333       1.0000       0.2000       0.3333      
Logistic Regression  0.8000       0.7500       0.6000       0.6667      

BEST MODEL: Logistic Regression (F1-Score: 0.6667)

Confusion Matrix:
[[9 1]
 [2 3]]

Classification Report:
              precision    recall  f1-score   support

         Ham       0.82      0.90      0.86        10
        Spam       0.75      0.60      0.67       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [6]:
# Save the best model
with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)
print("\nBest model saved as 'best_model.pkl'")
print("All files ready for API deployment!")


Best model saved as 'best_model.pkl'
All files ready for API deployment!
