In [None]:
import nltk
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from nltk.corpus import stopwords
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Fetch the 'punkt' and 'stopwords' NLTK resources, then keep the English
# stop-word list as a set (clean_text tests membership against it).
# Note: no lemmatizer is created here.
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)
stop_words = set(stopwords.words('english'))

# Load the dataset
def load_data(file_path):
    """Read a CSV file and discard every row that has a missing value.

    Args:
        file_path: Location of the CSV file, forwarded to ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: The parsed table with all rows containing at least
        one missing value removed. Surviving rows keep their original index
        labels (the index is not reset).
    """
    frame = pd.read_csv(file_path)
    # Prune in place so the object returned is the very frame read_csv built.
    frame.dropna(axis=0, how='any', inplace=True)
    return frame

# Extract URLs from the text
def extract_urls(text):
    """Return every http/https URL found in ``text``.

    Args:
        text: Value to scan. Anything that is not a ``str`` (for example
            ``None`` or a float) is treated as containing no URLs.

    Returns:
        list[str]: Matched URLs in order of appearance; an empty list when
        nothing matches or ``text`` is not a string.
    """
    if isinstance(text, str):
        # Scheme followed by one or more characters drawn from the permitted
        # classes (letters, digits, selected punctuation, %XX escapes).
        # Whitespace is not in any class, so a match stops at the first space.
        return re.findall(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            text,
        )
    return []

# Clean email body
def clean_text(text):
    """Normalise an email body into lowercase text without stop words.

    Processing order: strip ``http...`` tokens, strip ``>====``-style
    separator runs, drop every character that is neither an ASCII letter nor
    whitespace, lowercase, split on whitespace, and discard tokens present in
    the module-level ``stop_words`` set.

    Args:
        text: Raw body. Any non-string value is treated as an empty string.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    body = text if isinstance(text, str) else ''
    # Each pattern's matches are deleted outright (no replacement character),
    # in this exact order: URLs, separators, then non-letter characters.
    for pattern in (r'http\S+', r'>+=+=+=+=+', r'[^a-zA-Z\s]'):
        body = re.sub(pattern, '', body)
    kept = (token for token in body.lower().split() if token not in stop_words)
    return ' '.join(kept)

# Preprocessing pipeline
def _column_to_str(column):
    """Cast every entry of a selected column to ``str``."""
    return column.astype(str)


def _column_lengths(column):
    """Return ``len`` of each entry as an ``(n_rows, 1)`` array."""
    return np.array(column.apply(len)).reshape(-1, 1)


def create_preprocessor():
    """Build the feature-extraction stage shared by every classifier.

    Three feature groups are produced and concatenated by the
    ``ColumnTransformer``:

    * ``body``: string cast followed by TF-IDF (terms appearing in more than
      90% of documents are dropped via ``max_df=0.9``).
    * ``extracted_urls``: string cast followed by TF-IDF with default
      settings. The column holds lists in ``main``, so the cast yields each
      list's string representation.
    * ``message_length``: ``len`` of each ``body`` entry as a single numeric
      column.

    The transformers wrap named module-level functions rather than lambdas
    so that a fitted pipeline can be serialised with ``pickle``/``joblib``
    (lambdas cannot be pickled).

    Returns:
        sklearn.compose.ColumnTransformer: A new, unfitted transformer.
    """
    return ColumnTransformer(
        transformers=[
            ('body', Pipeline([
                ('convert_to_str', FunctionTransformer(_column_to_str, validate=False)),
                ('tfidf', TfidfVectorizer(min_df=1, max_df=0.9))
            ]), 'body'),
            ('extracted_urls', Pipeline([
                ('convert_to_str', FunctionTransformer(_column_to_str, validate=False)),
                ('tfidf', TfidfVectorizer())
            ]), 'extracted_urls'),
            # Second pass over 'body': a scalar column name hands the
            # function a 1-D column, hence the reshape to a 2-D feature.
            ('message_length', Pipeline([
                ('length', FunctionTransformer(_column_lengths))
            ]), 'body')
        ]
    )

# Train models
def train_models(X_train, y_train):
    """Fit one preprocessing + classifier pipeline per algorithm.

    Args:
        X_train: Training features; must provide the columns consumed by
            ``create_preprocessor`` (``body`` and ``extracted_urls``).
        y_train: Training labels aligned with ``X_train``.

    Returns:
        dict: Maps 'RandomForest', 'NaiveBayes', 'SVM' and
        'LogisticRegression' to their fitted ``Pipeline``.
    """
    estimators = (
        ('RandomForest', RandomForestClassifier()),
        ('NaiveBayes', MultinomialNB()),
        # probability=True makes predict_proba available on the SVM.
        ('SVM', SVC(probability=True)),
        ('LogisticRegression', LogisticRegression()),
    )

    fitted = {}
    for label, estimator in estimators:
        # Every pipeline gets its own fresh preprocessor instance.
        steps = [
            ('preprocessor', create_preprocessor()),
            ('classifier', estimator),
        ]
        fitted[label] = Pipeline(steps).fit(X_train, y_train)
    return fitted

# Evaluate models and display results
def evaluate_models(models, X_test, y_test):
    """Score each fitted model on the test set and print a report.

    Args:
        models: Mapping of model name to fitted estimator/pipeline.
        X_test: Held-out features.
        y_test: True labels for ``X_test``.

    Returns:
        dict: For every model name, a dict with keys ``'y_pred'``
        (predicted labels), ``'roc_auc'`` (ROC AUC, or ``None``) and
        ``'y_prob'`` (second column of ``predict_proba``, or ``None`` when
        the model exposes no ``predict_proba``).
    """
    results = {}
    for name, model in models.items():
        predictions = model.predict(X_test)

        if hasattr(model, 'predict_proba'):
            # Column 1 of the probability matrix is used as the score.
            positive_scores = model.predict_proba(X_test)[:, 1]
        else:
            positive_scores = None

        print(f"Results for {name}:")
        print(classification_report(y_test, predictions))

        if positive_scores is None:
            auc = None
        else:
            auc = roc_auc_score(y_test, positive_scores)

        results[name] = {
            'y_pred': predictions,
            'roc_auc': auc,
            'y_prob': positive_scores,
        }
    return results

# Plot ROC curves
def plot_roc_curves(results, y_test):
    """Draw the ROC curve of every model that produced probability scores.

    Args:
        results: Mapping of model name to a dict holding ``'y_prob'`` and
            ``'roc_auc'`` (as returned by ``evaluate_models``).
        y_test: True labels the scores are compared against.
    """
    # Dashed diagonal from (0, 0) to (1, 1) as a visual baseline.
    plt.plot([0, 1], [0, 1], 'k--')

    for name, res in results.items():
        scores = res['y_prob']
        if scores is None:
            # No probability scores were recorded for this model.
            continue
        fpr, tpr, _ = roc_curve(y_test, scores)
        plt.plot(fpr, tpr, label=f"{name} ROC AUC: {res['roc_auc']:.2f}")

    plt.legend()
    plt.title("ROC Curves")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.show()

# Plot confusion matrices
def plot_confusion_matrices(results, y_test):
    """Show one confusion-matrix heatmap per model, side by side.

    Args:
        results: Mapping of model name to a dict holding ``'y_pred'``
            (predicted labels for the test set).
        y_test: True labels the predictions are compared against.

    Returns:
        None. Nothing is drawn when ``results`` is empty.
    """
    if not results:
        # plt.subplots rejects a zero-column grid; there is nothing to plot.
        return

    # squeeze=False always yields a 2-D array of Axes, so a single model
    # (1x1 grid) is handled the same way as several.
    _, axes = plt.subplots(1, len(results), figsize=(15, 5), squeeze=False)
    for axis, (name, res) in zip(axes[0], results.items()):
        cm = confusion_matrix(y_test, res['y_pred'])
        sns.heatmap(cm, annot=True, cmap="Oranges", fmt='g', ax=axis)
        axis.set_title(f"{name} Confusion Matrix")
    plt.show()

# Main execution
def main(file_path='C:/Users/HP/Desktop/ml/project1 classification/CEAS_08.csv',
         nltk_data_path='C:/Users/HP/Desktop/ml/project1 classification/nltk_data'):
    """Run the full workflow: load, clean, train, evaluate, plot, predict.

    Args:
        file_path: CSV file to train on. It must contain a ``body`` column
            (email text) and a ``label`` column (target).
        nltk_data_path: Directory the NLTK resources are downloaded into;
            it is also appended to ``nltk.data.path``.

    Both parameters default to the previously hard-coded locations, so
    calling ``main()`` with no arguments behaves as before.
    """
    # Register the custom NLTK data directory and make sure the resources
    # are present there.
    nltk.data.path.append(nltk_data_path)
    # NOTE(review): only 'stopwords' is consumed by the code visible in this
    # file (and the stop-word set is already built at import time); the
    # other downloads are kept to preserve existing behaviour.
    for resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
        nltk.download(resource, download_dir=nltk_data_path)

    # Load the dataset once; load_data also drops rows with missing values.
    df = load_data(file_path)

    # URLs must be extracted before cleaning, because clean_text strips them.
    df['extracted_urls'] = df['body'].apply(extract_urls)
    df['body'] = df['body'].apply(clean_text)

    # Features and labels
    X = df[['body', 'extracted_urls']]
    y = df['label']

    # Hold out 20% of the rows; the fixed seed keeps the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train, evaluate and visualise every model.
    models = train_models(X_train, y_train)
    results = evaluate_models(models, X_test, y_test)
    plot_roc_curves(results, y_test)
    plot_confusion_matrices(results, y_test)

    # ---- New Data for Testing ----
    print("\n--- Testing on New Data ---")

    # Example new data (replace this with actual new email data)
    new_data = pd.DataFrame({
        'body': ["Congratulations! You've won a lottery.", "Please reply to confirm the meeting."],
        'extracted_urls': ["http://lotterywin.com", ""]
    })

    # Apply the same transformations as for training. Here the URL column
    # starts as raw strings, so extract_urls turns each into a list.
    new_data['body'] = new_data['body'].apply(clean_text)
    new_data['extracted_urls'] = new_data['extracted_urls'].apply(extract_urls)

    # Predict the label for new data using RandomForest model
    model_to_test = models['RandomForest']  # Example: Using RandomForest
    predictions = model_to_test.predict(new_data)

    # Output the predictions
    print("Predictions for the new data:", predictions)

# Run the workflow only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:/Users/HP/Desktop/ml/project1
[nltk_data]     classification/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:/Users/HP/Desktop/ml/project1
[nltk_data]     classification/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:/Users/HP/Desktop/ml/project1
[nltk_data]     classification/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:/Users/HP/Desktop/ml/project1
[nltk_data]     classification/nltk_dat