In [2]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import pandas as pd
import json
import pickle
import os
from datetime import datetime
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import string
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dark_Coder\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:

class EmailClassifier:
    """A flexible text classifier with pickle-based model persistence.

    Wraps a two-step scikit-learn ``Pipeline`` (a text vectorizer followed by
    a classifier). Text can optionally be pre-processed with ``clean_text``
    (punctuation removal, English stop-word removal, Snowball stemming); the
    choice made at training time is remembered in ``is_clean`` and re-applied
    at prediction time so both stages see the same kind of input.
    """

    # Lazily-built text-cleaning resources, shared by all instances so that
    # they are not re-created for every document (see ``_text_tools``).
    _stop_words = None
    _stemmer = None

    def __init__(self, model_name='naive_bayes', vectorizer_type='tfidf',
                 max_features=5000, ngram_range=(1, 2)):
        """
        Configure the classifier and build its (unfitted) pipeline.

        Args:
            model_name (str): One of 'naive_bayes', 'svm', 'random_forest',
                'logistic'.
            vectorizer_type (str): 'tfidf' selects ``TfidfVectorizer``; any
                other value selects ``CountVectorizer``.
            max_features (int): Forwarded to the vectorizer.
            ngram_range (tuple): Forwarded to the vectorizer.

        Raises:
            ValueError: If ``model_name`` is not recognised.
        """
        self.model_name = model_name
        self.vectorizer_type = vectorizer_type
        self.max_features = max_features
        self.ngram_range = ngram_range
        self.is_trained = False
        # Whether ``clean_text`` is applied to inputs; updated by ``train``.
        self.is_clean = True

        self._initialize_pipeline()

    def _initialize_pipeline(self):
        """Initialize the pipeline with selected vectorizer and model."""
        # Select vectorizer
        if self.vectorizer_type == 'tfidf':
            vectorizer = TfidfVectorizer(
                max_features=self.max_features,
                ngram_range=self.ngram_range,
                stop_words='english'
            )
        else:  # count vectorizer
            vectorizer = CountVectorizer(
                max_features=self.max_features,
                ngram_range=self.ngram_range,
                stop_words='english'
            )

        # Select model
        if self.model_name == 'naive_bayes':
            model = MultinomialNB()
        elif self.model_name == 'svm':
            model = LinearSVC(random_state=42)
        elif self.model_name == 'random_forest':
            model = RandomForestClassifier(n_estimators=100, random_state=42)
        elif self.model_name == 'logistic':
            model = LogisticRegression(random_state=42)
        else:
            raise ValueError(f"Unknown model name: {self.model_name}")

        self.pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', model)
        ])

    def save_model_pickle(self, filepath=None, include_timestamp=True, protocol=pickle.HIGHEST_PROTOCOL):
        """
        Save the trained model using pickle.

        Args:
            filepath (str, optional): Path to save the model. If None, a name
                of the form ``models/<model>_<vectorizer>[_<timestamp>].pkl``
                is generated.
            include_timestamp (bool): Whether to include a timestamp in the
                generated filename (ignored when ``filepath`` is given).
            protocol (int): Pickle protocol version to use.

        Returns:
            str: Path where the model was saved.

        Raises:
            ValueError: If the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before saving")

        if filepath is None:
            # Build the name from parts so that omitting the timestamp does
            # not leave a dangling underscore before the extension.
            name_parts = [self.model_name, self.vectorizer_type]
            if include_timestamp:
                name_parts.append(datetime.now().strftime('%Y%m%d_%H%M%S'))
            filepath = 'models/' + '_'.join(name_parts) + '.pkl'

        # Make sure the target directory exists for generated and
        # caller-supplied paths alike.
        parent_dir = os.path.dirname(filepath)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Prepare model data
        model_data = {
            'pipeline': self.pipeline,
            'config': {
                'model_name': self.model_name,
                'vectorizer_type': self.vectorizer_type,
                'max_features': self.max_features,
                'ngram_range': self.ngram_range
            },
            'metadata': {
                'save_date': datetime.now().isoformat(),
                'is_trained': self.is_trained,
                # Persist the preprocessing choice so a reloaded model
                # prepares its inputs the same way it was trained.
                'is_clean': self.is_clean
            }
        }

        # Save using pickle
        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f, protocol=protocol)

        return filepath

    @classmethod
    def load_model_pickle(cls, filepath):
        """
        Load a saved model using pickle.

        SECURITY: ``pickle.load`` can execute arbitrary code embedded in the
        file. Only load model files that come from a trusted source.

        Args:
            filepath (str): Path to the saved model

        Returns:
            EmailClassifier: Loaded classifier instance
        """
        # Load the saved model data (trusted files only -- see docstring).
        with open(filepath, 'rb') as f:
            model_data = pickle.load(f)

        # Create new instance with saved configuration
        instance = cls(**model_data['config'])
        instance.pipeline = model_data['pipeline']

        metadata = model_data['metadata']
        instance.is_trained = metadata['is_trained']
        # Files written before 'is_clean' was stored lack the key; fall back
        # to the constructor default (cleaning enabled).
        instance.is_clean = metadata.get('is_clean', True)

        return instance

    def train(self, texts, labels, is_clean=True):
        """
        Cross-validate and then fit the pipeline on the full dataset.

        Args:
            texts: Iterable of raw text documents.
            labels: Target labels aligned with ``texts``.
            is_clean (bool): If True, every text is passed through
                ``clean_text`` first. The value is remembered on the instance
                and re-used by ``predict_``.

        Returns:
            dict: Model/vectorizer names plus the mean, standard deviation and
            per-fold values of the 5-fold cross-validation scores.
        """
        # Clean first so that cross-validation and the final fit are run on
        # exactly the same representation of the data.
        if is_clean:
            texts = [self.clean_text(text) for text in texts]

        # Perform cross-validation
        cv_scores = cross_val_score(self.pipeline, texts, labels, cv=5)

        # Train the model on full dataset
        self.pipeline.fit(texts, labels)
        self.is_clean = bool(is_clean)
        self.is_trained = True

        return {
            "model_name": self.model_name,
            "vectorizer_type": self.vectorizer_type,
            "cross_val_scores": {
                "mean": float(cv_scores.mean()),
                "std": float(cv_scores.std()),
                "scores": [float(score) for score in cv_scores]
            }
        }

    def predict_(self, text):
        """
        Predict for a single text.

        Args:
            text (str): The document to classify.

        Returns:
            dict: ``text`` (the text fed to the pipeline, truncated to 100
            characters plus "..." when longer), ``prediction`` (int label),
            ``classification`` ("positive" when the label equals 1, otherwise
            "negative") and, when the classifier exposes ``predict_proba``,
            ``confidence`` (the highest class probability).

        Raises:
            ValueError: If the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before making predictions")

        # Mirror the preprocessing that was used when the model was trained.
        if self.is_clean:
            text = self.clean_text(text)
        pred = self.pipeline.predict([text])[0]

        result = {
            "text": text[:100] + "..." if len(text) > 100 else text,
            "prediction": int(pred),
            "classification": "positive" if pred == 1 else "negative"
        }

        if hasattr(self.pipeline.named_steps['classifier'], 'predict_proba'):
            prob = self.pipeline.predict_proba([text])[0]
            result["confidence"] = float(max(prob))

        return result

    @classmethod
    def _text_tools(cls):
        """Return the cached (stop-word set, stemmer) pair, building it on first use."""
        if cls._stop_words is None:
            # A frozenset gives constant-time membership tests.
            cls._stop_words = frozenset(stopwords.words('english'))
        if cls._stemmer is None:
            cls._stemmer = SnowballStemmer('english')
        return cls._stop_words, cls._stemmer

    def clean_text(self, text):
        """
        Normalise text for the vectorizer.

        Strips ASCII punctuation, splits on whitespace, drops English stop
        words (compared case-insensitively) and stems the remaining words.

        Args:
            text (str | list): A document, or a list of documents (cleaned
                element by element).

        Returns:
            str | list: The cleaned document(s), words joined by single spaces.
        """
        if isinstance(text, list):
            return [self.clean_text(t) for t in text]

        stop_words, stemmer = self._text_tools()

        # Remove the punctuations
        text = text.translate(str.maketrans('', '', string.punctuation))

        # Tokenize, remove stopwords and apply stemming
        words = [stemmer.stem(word) for word in text.split()
                 if word.lower() not in stop_words]
        return ' '.join(words)

In [15]:
# Example usage: fit a classifier on the spam/ham dataset and persist it.

# Supported model names: 'naive_bayes', 'svm', 'random_forest', 'logistic'
classifier = EmailClassifier(model_name='naive_bayes')

# The CSV provides the e-mail body in 'text' and the numeric target in 'label_num'.
data = pd.read_csv('spam_ham_dataset.csv')
texts, labels = data['text'], data['label_num']

# Train; the returned dict summarises the cross-validation scores.
results = classifier.train(texts, labels)
print("Training results:", json.dumps(results, indent=2))

# Save using pickle (default location: the local 'models' directory).
saved_path = classifier.save_model_pickle()
print(f"Model saved to: {saved_path}")

Texts before the cleaning 0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: text, Length: 5171, dtype: object
Texts After the cleaning ['subject enron methanol meter 988291 follow note gave monday 4 3 00 preliminari flow data provid daren pleas overrid pop daili volum present zero reflect daili activ obtain gas control chang need asap econom purpos', 'subject hpl nom januari 9 2001 see attach file hplno

In [196]:
# Load the model written by the training cell so this cell works after
# "Restart Kernel -> Run All". To load a model from an earlier session,
# replace `saved_path` with its path, e.g.
# 'models/naive_bayes_tfidf_20241025_181158.pkl'.
model_path = saved_path

# NOTE: unpickling executes code stored in the file -- only load trusted models.
loaded_classifier = EmailClassifier.load_model_pickle(model_path)

# Test the loaded model
example_emails = [
    "Congratulations! You've won a $1,000 gift card. Click here to claim your prize.",
    "Please see the attached file for the monthly report. Let me know if you have any questions.",
    "Get Viagra at 50% off. No prescription needed!",
    "Team meeting scheduled for tomorrow at 10 AM. Please confirm your availability."
]

for email in example_emails:
    # Predict with the *reloaded* model, so the save/load round trip is
    # actually exercised rather than the in-memory `classifier`.
    batch_results = loaded_classifier.predict_(email)
    print("\nBatch Prediction Results:")
    print(json.dumps(batch_results['text'], indent=2))

    if batch_results["prediction"] == 0:
        print("Not Spam.")
    else:
        print("Spam")



Batch Prediction Results:
{
  "text": "Congratulations! You've won a $1,000 gift card. Click here to claim your prize.",
  "prediction": 1,
  "classification": "positive",
  "confidence": 0.9039914198611662
}
Spam

Batch Prediction Results:
{
  "text": "Please see the attached file for the monthly report. Let me know if you have any questions.",
  "prediction": 0,
  "classification": "negative",
  "confidence": 0.9966224543172951
}
Not Spam.

Batch Prediction Results:
{
  "text": "Get Viagra at 50% off. No prescription needed!",
  "prediction": 1,
  "classification": "positive",
  "confidence": 0.9714840387990622
}
Spam

Batch Prediction Results:
{
  "text": "Team meeting scheduled for tomorrow at 10 AM. Please confirm your availability.",
  "prediction": 0,
  "classification": "negative",
  "confidence": 0.9453725869513427
}
Not Spam.
