In [11]:
import os
import re
import warnings
from collections import Counter

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score

# Silence every warning for the rest of the session. Note that this also hides
# deprecation / version-mismatch warnings emitted while loading the model.
warnings.filterwarnings('ignore')

model_path = '../models/best_spam_classifier.joblib'
model_2_path = '../models/best_tuned_spam_classifier.joblib'  # defined here but not loaded in this cell
data_path = '../data/test/spam_or_not_spam.csv'

# --- Load the trained classifier -------------------------------------------
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load model files produced by a trusted training run.
try:
    model = joblib.load(model_path)
    print(f"Model loaded successfully from '{model_path}'")
except FileNotFoundError:
    print(f"Error: Model file not found at '{model_path}'")
    print("Please run the model training notebook first to generate the model file.")
    # Downstream cells check for None and skip the evaluation.
    model = None

# --- Load the labelled emails to evaluate on -------------------------------
try:
    new_data_df = pd.read_csv(data_path)

    # Column 0 is treated as the raw email text and column 1 as the label.
    # astype(str) guarantees string input for feature extraction; missing
    # values become the literal text 'nan'.
    X_new_text = new_data_df.iloc[:, 0].astype(str)
    y_new = new_data_df.iloc[:, 1]

    print(f"New data loaded: {len(X_new_text)} emails and {len(y_new)} labels.")
except FileNotFoundError as e:
    print(f"Error loading new data: {e}")
    # Build the hint from data_path so it always names the directory that was
    # actually searched (the old message pointed at '../data/').
    print(
        f"Please make sure '{os.path.basename(data_path)}' is in the "
        f"'{os.path.dirname(data_path)}/' directory."
    )
    X_new_text, y_new = None, None




Model loaded successfully from '../models/best_spam_classifier.joblib'
New data loaded: 3000 emails and 3000 labels.


In [12]:
# Feature names in the order the function emits them: 48 word frequencies,
# 6 character frequencies, then 3 capital-run statistics (57 in total).
# Built once at definition time instead of on every call.
_SPAMBASE_FEATURE_COLUMNS = [
    'word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d',
    'word_freq_our', 'word_freq_over', 'word_freq_remove', 'word_freq_internet',
    'word_freq_order', 'word_freq_mail', 'word_freq_receive', 'word_freq_will',
    'word_freq_people', 'word_freq_report', 'word_freq_addresses', 'word_freq_free',
    'word_freq_business', 'word_freq_email', 'word_freq_you', 'word_freq_credit',
    'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money',
    'word_freq_hp', 'word_freq_hpl', 'word_freq_george', 'word_freq_650',
    'word_freq_lab', 'word_freq_labs', 'word_freq_telnet', 'word_freq_857',
    'word_freq_data', 'word_freq_415', 'word_freq_85', 'word_freq_technology',
    'word_freq_1999', 'word_freq_parts', 'word_freq_pm', 'word_freq_direct',
    'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project',
    'word_freq_re', 'word_freq_edu', 'word_freq_table', 'word_freq_conference',
    'char_freq_;', 'char_freq_(', 'char_freq__', 'char_freq_!',
    'char_freq_$', 'char_freq_#', 'capital_run_length_average',
    'capital_run_length_longest', 'capital_run_length_total'
]

_WORD_FREQ_PREFIX = 'word_freq_'
_CHAR_FREQ_PREFIX = 'char_freq_'


def extract_features_from_text(text):
    """
    Extracts the 57 features required by the Spambase model from a raw text string.

    Features, in output order:
      * ``word_freq_<w>``: percentage of the alphanumeric tokens of the
        lower-cased text that equal ``<w>``.
      * ``char_freq_<c>``: percentage of the characters of the original text
        that equal ``<c>``.
      * ``capital_run_length_{average,longest,total}``: statistics over the
        runs of consecutive ASCII capitals (``[A-Z]+``) in the original text.

    Args:
        text: The raw email text. ``None`` is treated as an empty string and
            any other non-string value is converted with ``str()``.

    Returns:
        pandas.Series indexed by the 57 feature names. Every feature is 0 for
        empty input.
    """
    # Coerce instead of failing on text.lower() for None / non-string cells.
    if text is None:
        text = ''
    elif not isinstance(text, str):
        text = str(text)

    words = re.findall(r'[a-zA-Z0-9]+', text.lower())
    # One pass over the tokens; lookups of absent words yield 0.
    word_counts = Counter(words)
    # Fall back to 1 so empty input divides cleanly and produces 0 frequencies.
    total_words = len(words) or 1
    total_chars = len(text) or 1

    features = {}
    for col in _SPAMBASE_FEATURE_COLUMNS:
        if col.startswith(_WORD_FREQ_PREFIX):
            word = col[len(_WORD_FREQ_PREFIX):]
            features[col] = (word_counts[word] / total_words) * 100
        elif col.startswith(_CHAR_FREQ_PREFIX):
            char = col[len(_CHAR_FREQ_PREFIX):]
            # The column named 'char_freq__' counts '[' rather than '_'.
            # NOTE(review): presumably the training data renamed the '['
            # column to '_' - confirm against the training notebook.
            if char == '_':
                char = '['
            features[col] = (text.count(char) / total_chars) * 100

    # Capital run lengths are measured on the original (not lower-cased) text.
    run_lengths = [len(run) for run in re.findall(r'[A-Z]+', text)]
    if run_lengths:
        features['capital_run_length_average'] = np.mean(run_lengths)
        features['capital_run_length_longest'] = np.max(run_lengths)
        features['capital_run_length_total'] = np.sum(run_lengths)
    else:
        features['capital_run_length_average'] = 0
        features['capital_run_length_longest'] = 0
        features['capital_run_length_total'] = 0

    return pd.Series(features, index=_SPAMBASE_FEATURE_COLUMNS)


In [13]:

# Evaluate only when both the model and the labelled emails were loaded.
if model is not None and X_new_text is not None:
    print("\nExtracting features from new emails *this may take a moment*")

    # Applying a Series-returning function to a Series yields a DataFrame:
    # one row per email, one column per extracted feature.
    X_new_features = X_new_text.apply(extract_features_from_text)
    print("Feature extraction complete.")

    y_pred_new = model.predict(X_new_features)

    print("\nModel Evaluation on New Data")

    accuracy = accuracy_score(y_new, y_pred_new)
    # precision_score uses its default positive label (1), i.e. spam.
    precision = precision_score(y_new, y_pred_new)

    print(f"Overall Accuracy: {accuracy:.4f}")
    print(f"Spam Precision:   {precision:.4f}\n")

    # Pin the label set (0 = not spam, 1 = spam) so the report rows and the
    # confusion matrix are always 2-class, matching the hard-coded names
    # below. Without this, a class absent from both the truth and the
    # predictions shrinks the outputs and the code below raises.
    class_labels = [0, 1]

    print("Classification Report:")
    print(classification_report(
        y_new, y_pred_new, labels=class_labels, target_names=['Not Spam', 'Spam']
    ))

    print("Confusion Matrix:")
    cm = confusion_matrix(y_new, y_pred_new, labels=class_labels)
    cm_df = pd.DataFrame(
        cm,
        index=['Actual Not Spam', 'Actual Spam'],
        columns=['Predicted Not Spam', 'Predicted Spam'],
    )
    display(cm_df)
else:
    print("\nEvaluation skipped due to missing model or data.")


Extracting features from new emails *this may take a moment*
Feature extraction complete.

Model Evaluation on New Data
Overall Accuracy: 0.8957
Spam Precision:   0.7367

Classification Report:
              precision    recall  f1-score   support

    Not Spam       0.92      0.96      0.94      2500
        Spam       0.74      0.58      0.65       500

    accuracy                           0.90      3000
   macro avg       0.83      0.77      0.79      3000
weighted avg       0.89      0.90      0.89      3000

Confusion Matrix:


Unnamed: 0,Predicted Not Spam,Predicted Spam
Actual Not Spam,2396,104
Actual Spam,209,291
