# In [None]:
import nltk
import re
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import RandomizedSearchCV
import numpy as np


from scipy.stats import uniform
from scipy.stats import randint


import seaborn as sns
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB




from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import FunctionTransformer


from sklearn.tree import export_graphviz
import graphviz


from sklearn.pipeline import Pipeline




# --- NLTK resources ---------------------------------------------------------
# Lemmatizer instance plus the English stopword set read by clean_text().
lemmatizer = WordNetLemmatizer()
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Register the project-local NLTK data directory and fetch the corpora into it.
# NOTE(review): 'stopwords' is downloaded twice - once above into NLTK's default
# location and once here into the project directory.
nltk_data_path = 'C:/Users/HP/Desktop/ml/project1 classification/nltk_data'
nltk.data.path.append(nltk_data_path)
for _nltk_resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource, download_dir=nltk_data_path)

# --- Dataset ----------------------------------------------------------------
# Read the CSV and discard every row that contains a missing value.
file_path = 'C:/Users/HP/Desktop/ml/project1 classification/CEAS_08.csv'
df = pd.read_csv(file_path)
df.dropna(inplace=True)




def extract_urls(text):
    """Return every http/https URL found in ``text``.

    Args:
        text: The value to scan. Anything that is not a ``str`` is treated
            as containing no URLs.

    Returns:
        A list of the matched URL strings, in order of appearance; empty
        when nothing matches or ``text`` is not a string.
    """
    if isinstance(text, str):
        return re.findall(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            text,
        )
    return []


# Function to clean email body
def clean_text(text):
    """Normalise an e-mail body into lowercase, stopword-free words.

    Steps, in order:
      1. Drop every token that starts with ``http`` (whole URL, not just
         its punctuation), so URL fragments do not leak into the text.
      2. Remove every character that is not an ASCII letter or whitespace.
         This already covers digits, punctuation and ``>====``-style
         separators, so no dedicated separator pass is needed.
      3. Lowercase the text.
      4. Split on whitespace and drop words found in the module-level
         ``stop_words`` set.

    Args:
        text: The raw body. Any non-``str`` value is treated as empty.

    Returns:
        The remaining words joined by single spaces ('' if none remain).
    """
    if not isinstance(text, str):
        return ''

    without_urls = re.sub(r'http\S+', '', text)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_urls).lower()

    return ' '.join(
        word for word in letters_only.split() if word not in stop_words
    )


# Derive the URL feature from the raw bodies first, then overwrite 'body'
# with its cleaned text (cleaning strips URLs, so the order matters).
raw_bodies = df['body']
df['extracted_urls'] = raw_bodies.apply(extract_urls)
df['body'] = raw_bodies.apply(clean_text)

# Quick look at the first rows after preprocessing.
preview_columns = ['sender', 'subject', 'body', 'extracted_urls', 'label']
print(df[preview_columns].head())

# Model inputs (cleaned text + extracted URL list) and the target column.
feature_columns = ['body', 'extracted_urls']
X = df[feature_columns]
y = df['label']


def message_length(X):
    """Return the result of applying ``len`` to ``X`` through its ``apply`` method."""
    lengths = X.apply(len)
    return lengths




def column_as_string(X):
    """Return ``X`` cast to ``str`` via its ``astype`` method."""
    as_text = X.astype(str)
    return as_text




print("AAA")
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def _body_length_column(X):
    """Return the per-row lengths of ``X`` as an (n_rows, 1) array.

    The ColumnTransformer stacks each transformer's output horizontally, so
    the 1-D lengths are reshaped into a single column.
    """
    return np.asarray(message_length(X)).reshape(-1, 1)


# Preprocessor: three feature groups are built and concatenated.
#   * 'body'           -> TF-IDF over the cleaned text
#   * 'extracted_urls' -> TF-IDF over the stringified URL list
#   * 'message_length' -> length of the cleaned body
# Named functions are used instead of lambdas so the fitted pipeline can be
# pickled and so the file's shared helpers are actually exercised.
preprocessor = ColumnTransformer(
    transformers=[
        ('body', Pipeline([
            ('convert_to_str', FunctionTransformer(column_as_string, validate=False)),
            ('tfidf', TfidfVectorizer(min_df=1, max_df=0.9)),
        ]), 'body'),
        ('extracted_urls', Pipeline([
            ('convert_to_str', FunctionTransformer(column_as_string, validate=False)),
            ('tfidf', TfidfVectorizer()),
        ]), 'extracted_urls'),
        ('message_length', Pipeline([
            ('length', FunctionTransformer(_body_length_column)),
        ]), 'body'),
    ])


# Full pipeline: preprocessing followed by a random forest. The forest is
# seeded like the split and the search so repeated runs give the same model.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('rf', RandomForestClassifier(random_state=42)),
])


# Hyperparameter distributions sampled by the random search.
# scipy's randint(low, high) draws integers from [low, high).
param_dist = {
    'rf__n_estimators': randint(50, 500),
    'rf__max_depth': randint(1, 20),
}


# Random search: 6 sampled configurations, each scored with 5-fold CV.
rand_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=6,
    cv=5,
    random_state=42,  # fixed seed so the sampled configurations are repeatable
)


# Fit the random search object to the training data
rand_search.fit(X_train, y_train)


# Hard class predictions for both splits
y_pred_train = rand_search.predict(X_train)
y_pred_test = rand_search.predict(X_test)


# Per-class precision / recall / F1
print("Train Set Performance:")
print(classification_report(y_train, y_pred_train))


print("\nTest Set Performance:")
print(classification_report(y_test, y_pred_test))


# Predicted probability taken from column 1 of predict_proba
# (the second entry of the classifier's classes_).
pred_prob_train = rand_search.predict_proba(X_train)[:, 1]
pred_prob_test = rand_search.predict_proba(X_test)[:, 1]


# ROC AUC must be computed from scores, not hard labels: with 0/1 predictions
# the curve collapses to a single operating point and the number no longer
# matches the ROC curve plotted below.
roc_auc_train = roc_auc_score(y_train, pred_prob_train)
roc_auc_test = roc_auc_score(y_test, pred_prob_test)
print("\nTrain ROC AUC:", roc_auc_train)
print("Test ROC AUC:", roc_auc_test)


# Plot the ROC curves against the chance diagonal
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, pred_prob_train)
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, pred_prob_test)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_train, tpr_train, label=f"Train ROC AUC: {roc_auc_train:.2f}")
plt.plot(fpr_test, tpr_test, label=f"Test ROC AUC: {roc_auc_test:.2f}")
plt.legend()
plt.title("ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()


# Confusion matrices for both splits
cm_train = confusion_matrix(y_train, y_pred_train)
cm_test = confusion_matrix(y_test, y_pred_test)


fig, ax = plt.subplots(1, 2, figsize=(11, 4))


print("\nConfusion Matrix:")
class_names = ['Negative', 'Positive']
for axis, matrix, split_name in zip(ax, (cm_train, cm_test), ("Train", "Test")):
    # fmt='d' prints the integer counts in full; '.4g' would switch to
    # scientific notation for any cell of 10000 or more.
    sns.heatmap(matrix, annot=True, xticklabels=class_names, yticklabels=class_names,
                cmap="Oranges", fmt='d', ax=axis)
    axis.set_xlabel("Predicted Label")
    axis.set_ylabel("True Label")
    axis.set_title(f"{split_name} Confusion Matrix")


plt.tight_layout()
plt.show()


raw_email_body = """hey mate...."""


# Apply the same preprocessing as for the training data, in the same order:
# URLs are extracted from the RAW body first, because clean_text() strips
# every URL and would otherwise leave nothing to extract.
new_email_urls = extract_urls(raw_email_body)
new_email_body = clean_text(raw_email_body)


# Single-row frame with the two columns the pipeline was fitted on. The
# message length is derived from 'body' inside the pipeline, so it is not
# supplied here.
new_email_df = pd.DataFrame({
    'body': [new_email_body],  # Cleaned email body
    'extracted_urls': [' '.join(new_email_urls)],  # Extracted URLs joined into a single string
})


# Make a prediction using the fitted search (delegates to the best pipeline)
prediction = rand_search.predict(new_email_df)


# predict() returns an array with one entry per row; inspect the single entry
if prediction[0] == 0:
    print("This is not a spam Email!")
else:
    print("This is a Spam Email!")



