In [27]:
# Import necessary libraries
import pandas as pd
import numpy as np
import pickle
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from scipy import sparse
import matplotlib.pyplot as plt

In [36]:
# Load dataset
print("Loading data...")
data = pd.read_csv('dataset_2.csv')
print(f"Data loaded: {data.shape}")
print(data.head())

Loading data...
Data loaded: (30240, 4)
   Message ID                       Subject  \
0           1           vastar resource inc   
1           2  calpine daily gas nomination   
2           3                      re issue   
3           4              meter allocation   
4           5        mcmullen gas for 11 99   

                                             Message  Spam/Ham  
0  gary production from the high island larger bl...         0  
1                   calpine daily gas nomination doc         0  
2  see note below already done stella stella this...         0  
3  kimberly vaughn lauri i have put this on stran...         0  
4  jackie since the inlet to river plant is shut ...         0  


In [40]:
# Cell 2: Preprocess and clean
def preprocess_and_clean(data):
    """Preprocess and clean spam data"""
    import re
    import pandas as pd

    text_col = 'Message'
    label_col = 'Spam/Ham'  # 0 = ham, 1 = spam

    print(f"Original shape: {data.shape}")
    print(f"Columns: {data.columns.tolist()}")

    # Clean text
    def clean_text(text):
        if pd.isna(text):
            return ""
        text = str(text).lower()
        text = re.sub(r'http\S+|www\S+', '', text)
        text = re.sub(r'\\n', ' ', text)
        text = re.sub(r'[^a-zA-Z0-9\s.,!?@:/-]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    # Create cleaned_text
    data['cleaned_text'] = data[text_col].apply(clean_text)

    print(f"Texts before filtering: {len(data)}")

    # Keep only non-empty messages
    data = data[data['cleaned_text'].str.len() > 1]
    print(f"Texts after filtering: {len(data)}")

    # Fix label mapping for numeric dataset
    data['label'] = data[label_col].astype(int)

    print(f"Label distribution:\n{data['label'].value_counts()}")

    # Feature extraction
    data['text_length'] = data['cleaned_text'].str.len()
    data['word_count'] = data['cleaned_text'].apply(lambda x: len(str(x).split()))
    data['uppercase_ratio'] = data[text_col].apply(
        lambda x: sum(1 for c in str(x) if c.isupper()) / max(len(str(x)), 1)
    )

    # Remove duplicates
    initial = len(data)
    data = data.drop_duplicates(subset=['cleaned_text'])
    print(f"Removed {initial - len(data)} duplicates")

    # Handle outliers gently
    for col in ['text_length', 'word_count']:
        if data[col].nunique() > 1:
            Q1 = data[col].quantile(0.25)
            Q3 = data[col].quantile(0.75)
            IQR = Q3 - Q1
            lower = Q1 - 3 * IQR
            upper = Q3 + 3 * IQR
            data = data[(data[col] >= lower) & (data[col] <= upper)]

    print(f"Final shape after outlier removal: {data.shape}")

    # Balance dataset
    print("\n=== BALANCING DATASET ===")
    spam_data = data[data['label'] == 1]
    ham_data = data[data['label'] == 0]

    print(f"Spam samples: {len(spam_data)}")
    print(f"Ham samples: {len(ham_data)}")

    if len(spam_data) > 0 and len(ham_data) > 0:
        min_samples = min(len(spam_data), len(ham_data))
        spam_balanced = spam_data.sample(n=min_samples, random_state=42)
        ham_balanced = ham_data.sample(n=min_samples, random_state=42)
        data = pd.concat([spam_balanced, ham_balanced]).sample(frac=1, random_state=42).reset_index(drop=True)
        print(f"Balanced dataset shape: {data.shape}")
    else:
        print("Skipping balancing (one label class missing).")

    return data


cleaned_data = preprocess_and_clean(data)

Original shape: (30240, 5)
Columns: ['Message ID', 'Subject', 'Message', 'Spam/Ham', 'cleaned_text']
Texts before filtering: 30240
Texts after filtering: 30239
Label distribution:
label
1    15159
0    15080
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['label'] = data[label_col].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['text_length'] = data['cleaned_text'].str.len()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['word_count'] = data['cleaned_text'].apply(lambda x: len(str(x).split()))


Removed 4852 duplicates
Final shape after outlier removal: (25349, 9)

=== BALANCING DATASET ===
Spam samples: 12341
Ham samples: 13008
Balanced dataset shape: (24682, 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['uppercase_ratio'] = data[text_col].apply(


In [55]:
# Vectorize features
from scipy.sparse import hstack

print("\nVectorizing features...")

# Check cleaned text
print(f"Sample cleaned texts:")
print(cleaned_data['cleaned_text'].head())
print(f"\nNon-empty texts: {(cleaned_data['cleaned_text'].str.len() > 0).sum()}")

# TF-IDF with more lenient settings
tfidf = TfidfVectorizer(
    max_features=10000,
    stop_words='english',
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.95
)

try:
    X_tfidf = tfidf.fit_transform(cleaned_data['cleaned_text'])
    print(f"TF-IDF vocabulary size: {len(tfidf.vocabulary_)}")
except ValueError as e:
    print(f"Error: {e}")
    print("Trying without stop words...")
    # Retry without stop words if vocabulary is empty
    tfidf = TfidfVectorizer( 
        max_features=3000, 
        ngram_range=(1,2),
        min_df=1
    )
    X_tfidf = tfidf.fit_transform(cleaned_data['cleaned_text'])

# Numeric features - DON'T scale for Naive Bayes compatibility
X_numeric = cleaned_data[['text_length', 'word_count', 'uppercase_ratio']].values

# Combine
X = sparse.hstack([X_tfidf, X_numeric])
y = cleaned_data['label'].values

print(f"Features shape: {X.shape}")

# Save preprocessing tools
with open('tfidf_vectoriser.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

# Save a scaler for the API (even though we don't use it for training)
scaler = StandardScaler()
scaler.fit(cleaned_data[['text_length', 'word_count', 'uppercase_ratio']])
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

numeric_features = cleaned_data[['text_length', 'word_count', 'uppercase_ratio']].values
X_combined = hstack([X_tfidf, sparse.csr_matrix(numeric_features)])
print("Combined feature shape:", X_combined.shape)

# Use combined features going forward
X = X_combined


Vectorizing features...
Sample cleaned texts:
0    attached is the open season posting for tw s p...
1    hey better than all other spam filter only del...
2    ford salute the military this offer is extende...
3    won t take our name aep mirant powerex limited...
4    increase your cum volume and orgasm length mai...
Name: cleaned_text, dtype: object

Non-empty texts: 24682
TF-IDF vocabulary size: 10000
Features shape: (24682, 10003)
Combined feature shape: (24682, 10003)


In [56]:
# Load and train ML models
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier

print("\n" + "="*50)
print("TRAINING MODELS")
print("="*50)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train: {X_train.shape}, Test: {X_test.shape}")
print(f"Train spam: {sum(y_train == 1)}, ham: {sum(y_train == 0)}")
print(f"Test spam: {sum(y_test == 1)}, ham: {sum(y_test == 0)}")

print("\nRunning Grid Search for Logistic Regression...")
param_grid = {
    'C': [0.1, 1, 5, 10],
    'solver': ['liblinear', 'lbfgs']
}
grid = GridSearchCV(
    LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1
)
grid.fit(X_train, y_train)
print("Best Logistic Regression params:", grid.best_params_)
best_lr = grid.best_estimator_

# Define models
models = {
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(
        n_estimators=100, 
        random_state=42,
        class_weight='balanced',  # Handle imbalanced data
        max_depth=20
    ),
    'Logistic Regression': best_lr,
}

results = {}

# Train each model
for name, model in models.items():
    print(f"\n=== {name} ===")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    results[name] = {
        'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'model': model
    }
    
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}")

print("\nBuilding ensemble model...")
ensemble = VotingClassifier(
    estimators=[
        ('lr', best_lr),
        ('rf', RandomForestClassifier(n_estimators=300, max_depth=30, random_state=42)),
        ('nb', MultinomialNB())
    ],
    voting='soft'
)
ensemble.fit(X_train, y_train)
ensemble_pred = ensemble.predict(X_test)

print("\nEnsemble Performance:")
print(classification_report(y_test, ensemble_pred))


# Summary
print("\n" + "="*70)
print("MODEL COMPARISON SUMMARY")
print("="*70)
print(f"{'Model':<20} {'Accuracy':<12} {'Precision':<12} {'Recall':<12} {'F1-Score':<12}")
print("-"*70)
for name, res in results.items():
    print(f"{name:<20} {res['acc']:<12.4f} {res['prec']:<12.4f} {res['rec']:<12.4f} {res['f1']:<12.4f}")

# Best model
best_name = max(results, key=lambda x: results[x]['f1'])
best_model = results[best_name]['model']
print(f"\nBEST MODEL: {best_name} (F1-Score: {results[best_name]['f1']:.4f})")

# Final evaluation
y_pred = best_model.predict(X_test)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Ham', 'Spam']))


TRAINING MODELS
Train: (19745, 10003), Test: (4937, 10003)
Train spam: 9873, ham: 9872
Test spam: 2468, ham: 2469

Running Grid Search for Logistic Regression...
Best Logistic Regression params: {'C': 5, 'solver': 'liblinear'}

=== Naive Bayes ===
Accuracy : 0.9682
Precision: 0.9857
Recall   : 0.9502
F1-score : 0.9676

=== Random Forest ===
Accuracy : 0.9526
Precision: 0.9196
Recall   : 0.9919
F1-score : 0.9544

=== Logistic Regression ===
Accuracy : 0.9844
Precision: 0.9799
Recall   : 0.9891
F1-score : 0.9845

Building ensemble model...

Ensemble Performance:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      2469
           1       0.98      0.99      0.98      2468

    accuracy                           0.98      4937
   macro avg       0.98      0.98      0.98      4937
weighted avg       0.98      0.98      0.98      4937


MODEL COMPARISON SUMMARY
Model                Accuracy     Precision    Recall       F1-Score    
------

In [52]:
# Cell 5: Test the model
def test_model_predictions():
    """Test model on sample emails"""
    print("\n=== MODEL SANITY CHECK ===")
    
    test_cases = [
        ("Hi, let's meet for coffee tomorrow at 3pm", "ham"),
        ("Meeting reminder: Project review at 2pm", "ham"),
        ("Your package has been delivered successfully", "ham"),
        ("WINNER! Claim your $1000000 prize NOW!!!", "spam"),
        ("Congratulations! You won the lottery! Send bank details", "spam"),
        ("URGENT: Verify your account or it will be closed", "spam"),
    ]
    
    correct = 0
    total = len(test_cases)
    
    for text, expected in test_cases:
        # Clean and process the text
        cleaned = text.lower()
        cleaned = re.sub(r'http\S+|www\S+', '', cleaned)
        cleaned = re.sub(r'\\n', ' ', cleaned)
        cleaned = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', cleaned)
        cleaned = re.sub(r'\s+', ' ', cleaned).strip()
        
        # Extract features
        X_tfidf_test = tfidf.transform([cleaned])
        text_length = len(cleaned)
        word_count = len(cleaned.split())
        uppercase_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)
        X_numeric_test = np.array([[text_length, word_count, uppercase_ratio]])
        X_test = sparse.hstack([X_tfidf_test, X_numeric_test])
        
        # Predict
        prediction = best_model.predict(X_test)[0]
        probability = best_model.predict_proba(X_test)[0]
        pred_label = 'spam' if prediction == 1 else 'ham'
        
        # Check if correct
        is_correct = pred_label == expected
        if is_correct:
            correct += 1
        
        status = "✓" if is_correct else "✗"
        print(f"{status} Text: {text[:50]}...")
        print(f"   Expected: {expected}, Got: {pred_label} (confidence: {probability[prediction]:.2f})")
        print()
    
    accuracy = (correct / total) * 100
    print(f"Test Accuracy: {correct}/{total} ({accuracy:.1f}%)")
    
    if accuracy < 50:
        print("\nWARNING: Model is performing poorly! Consider:")
        print("   1. Balancing your dataset")
        print("   2. Adding more training data")
        print("   3. Adjusting model parameters")

# Run the test
test_model_predictions()


=== MODEL SANITY CHECK ===
✓ Text: Hi, let's meet for coffee tomorrow at 3pm...
   Expected: ham, Got: ham (confidence: 0.58)

✓ Text: Meeting reminder: Project review at 2pm...
   Expected: ham, Got: ham (confidence: 0.94)

✗ Text: Your package has been delivered successfully...
   Expected: ham, Got: spam (confidence: 0.79)

✓ Text: WINNER! Claim your $1000000 prize NOW!!!...
   Expected: spam, Got: spam (confidence: 0.92)

✓ Text: Congratulations! You won the lottery! Send bank de...
   Expected: spam, Got: spam (confidence: 0.87)

✓ Text: URGENT: Verify your account or it will be closed...
   Expected: spam, Got: spam (confidence: 0.83)

Test Accuracy: 5/6 (83.3%)


In [53]:
# Save the best model
with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)
print("\nBest model saved as 'best_model.pkl'")


Best model saved as 'best_model.pkl'
