In [1]:
import nltk
import re
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
import seaborn as sns
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier



# Folder in which the NLTK resources used by this notebook are stored
nltk_data_path = 'C:/Users/HP/Desktop/ml/project1 classification/nltk_data'

# Let NLTK look in that folder when it resolves a resource
nltk.data.path.append(nltk_data_path)

# Download each required NLTK resource into the folder (order is preserved)
for resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4', 'punkt_tab'):
    nltk.download(resource, download_dir=nltk_data_path)

# Read the dataset from CSV and discard every row that has a missing value
file_path = 'C:/Users/HP/Desktop/ml/project1 classification/CEAS_08.csv'
df = pd.read_csv(file_path).dropna()

# Per-column count of remaining nulls (all zeros after the drop above)
print(df.isnull().sum())

# Lemmatizer and English stop-word set shared by the text-cleaning function
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to
[nltk_data]     C:/Users/HP/Desktop/ml/project1
[nltk_data]     classification/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:/Users/HP/Desktop/ml/project1
[nltk_data]     classification/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:/Users/HP/Desktop/ml/project1
[nltk_data]     classification/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:/Users/HP/Desktop/ml/project1
[nltk_data]     classification/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:/Users/HP/Desktop/ml/project1
[nltk_data]     classification/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


sender      0
receiver    0
date        0
subject     0
body        0
label       0
urls        0
dtype: int64


In [2]:
# Function to extract URLs from text
def extract_urls(text):
    """Return every http/https URL found in ``text``, in order of appearance.

    Parameters
    ----------
    text : object
        Value to scan. Anything that is not a ``str`` yields an empty list.

    Returns
    -------
    list of str
        The matched URL strings (possibly empty).

    Notes
    -----
    Inside the character class ``[$-_@.&+]`` the ``$-_`` part is a *range*
    (``$`` through ``_``), so characters such as ``/ ? = : ' < > ;`` are
    accepted as part of a URL. A match therefore runs until the first
    character outside the pattern's classes (e.g. whitespace or ``"``).
    """
    if isinstance(text, str):
        pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        # The pattern has only non-capturing groups, so each match is the full URL.
        return [match.group(0) for match in re.finditer(pattern, text)]
    return []

In [None]:
# Class balance of the target. The frame has no 'category_column'; its columns
# are sender, receiver, date, subject, body, label and urls, and 'label' is the
# column later used as y for the classifier.
sns.countplot(x='label', data=df)
plt.title('Distribution of Categories')
plt.show()

In [3]:
# Function to clean email text (body, subject)
def clean_text(text):
    """Normalise a piece of text into a space-joined string of lemmas.

    Steps, in order:
      1. Non-string input is treated as the empty string.
      2. Every run starting with ``http`` up to the next whitespace is removed.
      3. Every character that is not an ASCII letter or whitespace is deleted
         (deleted, not replaced by a space, so words separated only by
         punctuation or digits end up joined).
      4. The text is lower-cased and tokenised with ``word_tokenize``.
      5. Tokens found in ``stop_words`` are dropped; the rest are lemmatised.

    Parameters
    ----------
    text : object
        Raw text to clean.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    raw = text if isinstance(text, str) else ''
    without_urls = re.sub(r'http\S+', '', raw)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_urls)

    lemmas = []
    for token in word_tokenize(letters_only.lower()):
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

In [4]:
# Apply URL extraction and text cleaning.
# clean_text strips URLs, so overwriting 'body' directly would make a second run
# of this cell extract URLs from already-cleaned text and return empty lists.
# The untouched text is therefore kept once in '<column>_raw' and every derived
# column is always computed from that copy, which makes the cell safe to re-run.
for column in ('body', 'subject'):
    raw_column = column + '_raw'
    if raw_column not in df.columns:
        df[raw_column] = df[column]

df['extracted_urls'] = df['body_raw'].apply(extract_urls)
df['body'] = df['body_raw'].apply(clean_text)
df['subject'] = df['subject_raw'].apply(clean_text)

In [5]:
# Ensure columns are strings
def column_as_string(X):
    """Cast ``X`` to ``str`` via its ``astype`` method and return the result.

    Used as the function of a ``FunctionTransformer`` so that each selected
    column reaches the vectorizer as text.
    """
    as_text = X.astype(str)
    return as_text

# Model inputs (cleaned body and subject, extracted URLs, sender) and the target
feature_columns = ['body', 'extracted_urls', 'subject', 'sender']
X = df[feature_columns]
y = df['label']

In [6]:
# Hold out 20% of the rows for testing; stratifying on y keeps the label
# proportions the same in the train and test parts. The seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Preprocessor: 'body', 'extracted_urls', 'subject' and 'sender' are each handled
# separately by their own "cast to string -> TF-IDF" sub-pipeline. The transformer
# name is the same as the column it is applied to.
text_columns = ['body', 'extracted_urls', 'subject', 'sender']
preprocessor = ColumnTransformer(
    transformers=[
        (
            column,
            Pipeline([
                ('convert_to_str', FunctionTransformer(column_as_string, validate=False)),
                ('tfidf', TfidfVectorizer()),
            ]),
            column,
        )
        for column in text_columns
    ]
)


In [8]:
# Create the Random Forest pipeline: feature extraction followed by the classifier.
# The classifier step is named 'nb' even though it is a Random Forest; the name is
# kept because the hyperparameter grid addresses it through the 'nb__' prefix.
# random_state is fixed (same seed as the train/test split) so that repeated runs
# of the notebook build the same forests and report the same scores.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('nb', RandomForestClassifier(random_state=42))
])



In [9]:
# Hyperparameter grid for GridSearchCV. Each key is '<pipeline step>__<parameter>',
# here targeting the 'nb' step. 3 * 3 * 3 * 3 * 2 = 162 candidate combinations.
param_grid = dict(
    nb__n_estimators=[100, 200, 500],
    nb__max_depth=[10, 20, None],
    nb__min_samples_split=[2, 5, 10],
    nb__min_samples_leaf=[1, 2, 4],
    nb__bootstrap=[True, False],
)



In [15]:
# Exhaustive search over param_grid, scoring each candidate with 3-fold
# cross-validation; n_jobs=-1 runs the fits in parallel.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)



Fitting 3 folds for each of 162 candidates, totalling 486 fits


KeyboardInterrupt: 

In [17]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Treat +/-inf as missing and drop incomplete rows. Features and labels are
# filtered together by index: dropping rows from X_train and y_train
# independently would leave them with different lengths whenever a row is
# removed from only one of them. New objects are assigned instead of mutating
# the split results in place.
X_train = X_train.replace([np.inf, -np.inf], np.nan).dropna()
y_train = y_train.loc[X_train.index].dropna()
X_train = X_train.loc[y_train.index]

# Use StratifiedKFold to ensure balanced class distribution in folds
skf = StratifiedKFold(n_splits=3)

# Create the GridSearchCV with StratifiedKFold
grid_search = GridSearchCV(model, param_grid, cv=skf, n_jobs=-1, verbose=1)

# Fit the grid search
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 162 candidates, totalling 486 fits


KeyboardInterrupt: 