Dataset Collection
In this notebook, we build the dataset from two sources: the Enron email dataset (legitimate emails) and the MillerSmiles archive (phishing emails).

In [None]:
# Installing all the necessary libraries
!pip install requests
!pip install re
!pip install langdetect
!pip install nltk
!pip install autocorrect
!pip install googletrans==3.1.0a0 
!pip install wordninja
!pip install tqdm

In [2]:
# Importing all the necessary libraries
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup
import nltk
import pandas as pd
import email
import langdetect
import wordninja
from tqdm import tqdm
from dateutil import parser

from autocorrect import Speller
from email import message_from_string
from nltk.corpus import words
from nltk.corpus import stopwords
from googletrans import Translator


In [4]:
# Setting the stopwords in English language.
# The stopword corpus is not shipped with the nltk package itself: on a fresh
# environment the lookup raises LookupError, so download it once and retry.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

In [5]:
# Empty placeholder frame for the phishing emails with one column each for
# the entry name, the subject line and the body text
phish_emails_df = pd.DataFrame(
    data=None,
    columns=['Name', 'Subject', 'Text'],
)

In [None]:
# Web scraping of the MillerSmiles archive pages (1..295).
# Each archive entry is located through its orange <font> tag (the name); the
# subject is the next hyperlink and the body is the next <blockquote>.
spam_email_data = []
count_spam_emails = 0
# NOTE: despite its name, this is the upper bound on the number of scraped
# (phishing) emails to collect; the name is kept for backward compatibility.
count_ham_emails = 51000
for i in range(1, 296, 1):
    # Stop requesting further archive pages once the cap has been reached
    if count_spam_emails >= count_ham_emails:
        break
    url = "http://www.millersmiles.co.uk/archives/"+str(i)
    try:
        # A timeout keeps one stalled connection from hanging the whole cell
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Skipping {url}: {exc}")
        continue
    if not response.ok:
        print(f"Skipping {url}: HTTP status {response.status_code}")
        continue
    soup = BeautifulSoup(response.text, 'html.parser')
    # Finding all occurrences with orange font and skipping the first one
    orange_fonts = soup.find_all('font', {'color': 'orange'})[1:]
    for orange_font in orange_fonts:
        # Subject is a hyperlink, text is enclosed within blockquote
        subject_tag = orange_font.find_next('a')
        text_tag = orange_font.find_next('blockquote')
        if subject_tag is None or text_tag is None:
            # Malformed entry: skip it instead of failing on None.get_text()
            continue
        spam_email_data.append({
            'Name': orange_font.get_text(strip=True),
            'Subject': subject_tag.get_text(strip=True),
            'Text': text_tag.get_text(strip=True),
        })
        count_spam_emails += 1
        if count_spam_emails >= count_ham_emails:
            break
# Conversion of scraped data into a DataFrame; the explicit column list keeps
# the expected schema even when nothing could be scraped.
phish_emails_df = pd.DataFrame(spam_email_data, columns=['Name', 'Subject', 'Text'])

In [None]:
# Concatenation of Subject and Text into a new column 'text'.
# A single space separates the two parts so that the last word of the subject
# does not fuse with the first word of the body.
phish_emails_df['text'] = phish_emails_df['Subject'] + ' ' + phish_emails_df['Text']
# Every scraped email belongs to the phishing (positive) class
phish_emails_df['label'] = 1
phish_emails_df = phish_emails_df[['text', 'label']]

In [None]:
# Set the number of spam emails in the training dataset to 24000; the rest go to testing.
train_val = 24000
total_len = phish_emails_df.shape[0]
if total_len < train_val:
    # Fail with an explicit message instead of the generic sampling error
    raise ValueError(
        f"Only {total_len} phishing emails available, need at least {train_val} for training"
    )
# Fixed seed so the train/test split is reproducible across kernel restarts
org_phish_train = phish_emails_df.sample(n=train_val, random_state=42)
org_phish_test = phish_emails_df.drop(org_phish_train.index)

In [None]:
# Load the Enron dataset from emails.csv (pre-downloaded from kaggle)
emails = pd.read_csv("emails.csv")
print(emails.head())
# Report how many emails were loaded
print("The number of emails are:", emails.shape[0], sep="\n")

In [None]:
# Helper function to parse dates safely
def safe_parse_date(date_string):
    """Parse a date string with dateutil, returning None when it cannot be parsed.

    Args:
        date_string: Raw date text (e.g. an email ``Date`` header). Falsy
            values (None, empty string) are treated as missing.

    Returns:
        The parsed ``datetime`` or None if the value is missing or unparseable.
    """
    if not date_string:
        return None
    try:
        return parser.parse(date_string)
    except (ValueError, OverflowError, TypeError):
        # ValueError also covers dateutil's ParserError (a ValueError subclass);
        # OverflowError is raised for out-of-range dates and TypeError for
        # inputs that are not strings/streams.
        return None

In [None]:
def _message_body(message):
    """Return the body of a parsed email as one string.

    For a non-multipart message ``get_payload()`` is already a string. For a
    multipart message it is a list of sub-messages, so the text parts are
    collected and joined with newlines.
    """
    payload = message.get_payload()
    if isinstance(payload, str):
        return payload
    text_parts = [
        part.get_payload()
        for part in message.walk()
        if not part.is_multipart() and part.get_content_maintype() == 'text'
    ]
    return "\n".join(part for part in text_parts if isinstance(part, str))


def _email_record(raw_message):
    """Parse one raw message string into a Content/Subject/Date/Label record."""
    message = message_from_string(raw_message)
    headers = dict(message.items())
    return {
        'Content': _message_body(message),
        'Subject': headers.get('Subject', 'No Subject'),
        'Date': headers.get('Date'),
        'Label': 0
    }


# Extract subject, body, date from each message.
# Iterating the column directly avoids one row lookup per message.
df_email = pd.DataFrame([_email_record(raw_message) for raw_message in emails['message']])

# Parse date column and extract year.
# utc=True converts the mixed UTC offsets found in the headers into one
# tz-aware column, which the .dt accessor requires.
df_email['Date'] = pd.to_datetime(
    df_email['Date'].apply(safe_parse_date), errors='coerce', utc=True
)
df_email['Year'] = df_email['Date'].dt.year

# Combine Subject and Content into one text column; the space keeps the last
# word of the subject from fusing with the first word of the body.
df_email['text'] = df_email['Subject'] + ' ' + df_email['Content']
df_email['label'] = 0

In [None]:
# Filter the DataFrame for emails from January 1997 onwards till July 2001.
# utc=True guarantees a tz-aware UTC column, which is required to compare it
# against the tz-aware bounds below (naive vs. aware comparison raises).
df_email['Date'] = pd.to_datetime(df_email['Date'], utc=True)
start_date = pd.Timestamp('1997-01-01', tz='UTC')
end_date = pd.Timestamp('2001-07-31 23:59:59', tz='UTC')
in_date_window = (df_email['Date'] >= start_date) & (df_email['Date'] <= end_date)
# .copy() makes the result an independent frame rather than a slice of df_email
filtered_emails = df_email[in_date_window].copy()

In [None]:
# Sample 120K legitimate emails (fixed seed so the sample is reproducible)
df_email = filtered_emails.sample(n=120000, random_state=42)
# NOTE: despite the 'phish' prefix this frame holds the legitimate emails;
# the name is kept for backward compatibility.
phish_emails_df_final = df_email[['text', 'label']]
num_of_train_emails = 96000

# Split into train and test sets: the first 96000 sampled rows are used for
# training and the remaining rows for testing
leg_train_df = phish_emails_df_final.iloc[:num_of_train_emails]
leg_test_df = phish_emails_df_final.iloc[num_of_train_emails:]

In [None]:
# Final Train and Test Datasets (Dataset1).
# ignore_index=True gives each combined frame a fresh 0..n-1 index; the
# phishing and legitimate frames otherwise carry overlapping index labels,
# which makes later label-based access (.loc) ambiguous.
Dataset1_Train = pd.concat([org_phish_train, leg_train_df], axis=0, ignore_index=True)
Dataset1_Test = pd.concat([org_phish_test, leg_test_df], axis=0, ignore_index=True)

Traditional Preprocessing - The following functions are involved in the preprocessing stage
1) Removal of special characters
2) Conversion to lowercase
3) Removal of stop words
4) Removal of Numbers
5) Identification of URLs, attachments, phone numbers and email addresses (each replaced with a keyword)
6) Detection and translation of languages (if not in English)
These preprocessing steps are followed to obtain the Dataset1_1.

In [None]:
def remove_special_characters(text):
    """Strip special characters from a text string.

    Two passes are applied in order: first every character that is neither a
    word character nor whitespace is deleted, then runs of underscores (which
    count as word characters) are deleted as well.
    """
    for pattern in (r'[^\w\s]', r'_+'):
        text = re.sub(pattern, '', text)
    return text

In [None]:
def convert_to_lowercase(text):
    """Lowercase every whitespace-separated word and re-join with single spaces.

    Because the text is split on whitespace and re-joined, runs of whitespace
    (including newlines) collapse to one space and leading/trailing
    whitespace is dropped.
    """
    return ' '.join(map(str.lower, text.split()))

In [None]:
# Lazily-built cache of the English stop words, shared by all calls
_ENGLISH_STOP_WORDS = None


def remove_stop_words(text, stop_words=None):
    """Remove stop words from text and return the remaining words lowercased.

    Args:
        text: Input string; it is split on whitespace.
        stop_words: Optional collection of lowercase words to drop. Defaults
            to the NLTK English stop-word list, which is loaded once and
            cached instead of being re-read from the corpus on every call.

    Returns:
        The remaining lowercased words joined by single spaces.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS
    lowered_words = (word.lower() for word in text.split())
    return ' '.join(word for word in lowered_words if word not in stop_words)

In [None]:
def remove_numbers(text):
    """Remove digit characters from every whitespace-separated word of the text.

    Words that consist only of digits disappear entirely, so they do not leave
    empty tokens (stray double or leading/trailing spaces) in the result.

    Returns:
        The remaining non-empty words joined by single spaces.
    """
    cleaned_words = []
    for word in text.split():
        without_digits = "".join(char for char in word if not char.isdigit())
        if without_digits:
            cleaned_words.append(without_digits)
    return " ".join(cleaned_words)

In [None]:
def url_attachment_identification(text):
    """Replace emails, attachments, URLs and phone numbers with keywords.

    Replacements, applied in this order:
      1. email addresses    -> "email"        (first, because the URL pattern
                                               would match their domain part)
      2. attachment names   -> "attachment"   (before URLs, because the URL
                                               pattern also matches names such
                                               as "report.pdf")
      3. URLs / domains     -> "link"
      4. phone numbers      -> "phone number"

    Args:
        text: Raw email text.

    Returns:
        The text with every match replaced by its keyword.
    """
    url_pattern = r'\b(?:https?:\/\/)?(?:[\w-]+\.)+[a-z]{2,}(?:\/[^\s]*)?'
    attachment_pattern = r'\b\w+\.(?:pdf|docx|jpg|png|xls|xlsx|ppt|pptx|txt|zip)\b'
    # [a-zA-Z] instead of [a-z|A-Z]: the '|' inside a character class is a
    # literal pipe, not an alternation
    email_pattern = r'\b[a-zA-Z0-9_.%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,7}\b'
    # Not anchored with ^...$, so numbers embedded in running text are found.
    # Requires area code + 3-4 digits + 4 digits so that plain years, dates or
    # short numbers are not mistaken for phone numbers.
    phone_pattern = (
        r'(?<![\w.])(?:\+?\d{1,3}[-.\s]?)?(?:\(\d{2,4}\)|\d{2,4})[-.\s]?'
        r'\d{3,4}[-.\s]?\d{4}(?:\s?(?:ext|x|#)\s?\d{1,5})?(?!\w)'
    )

    cleaned_text = re.sub(email_pattern, "email", text)
    cleaned_text = re.sub(attachment_pattern, "attachment", cleaned_text)
    cleaned_text = re.sub(url_pattern, "link", cleaned_text)
    cleaned_text = re.sub(phone_pattern, "phone number", cleaned_text)
    return cleaned_text



In [None]:
# Lazily-created translator client, reused across calls
_TRANSLATOR = None


def detect_and_translate(text):
    """Translate non-English text to English (best effort).

    Args:
        text: Text to inspect. Non-string or blank input is returned unchanged
            without attempting detection.

    Returns:
        The original text when it is detected as English, when it is blank or
        not a string, or when detection/translation fails for any reason;
        otherwise the English translation.
    """
    global _TRANSLATOR
    if not isinstance(text, str) or not text.strip():
        return text
    try:
        # langdetect is non-deterministic unless a seed is fixed
        langdetect.DetectorFactory.seed = 0
        if langdetect.detect(text) == 'en':
            return text
        if _TRANSLATOR is None:
            _TRANSLATOR = Translator()
        translation = _TRANSLATOR.translate(text, dest='en')
        # Fall back to the input if the service returned no text
        return translation.text or text
    except Exception:
        # Deliberate best effort: detection errors (e.g. text without usable
        # features) and network/service failures must not abort the pipeline.
        return text

In [None]:
def _normalize_text(text):
    """Apply special-character removal, lowercasing, stop-word and digit removal."""
    text = remove_special_characters(text)
    text = convert_to_lowercase(text)
    text = remove_stop_words(text)
    return remove_numbers(text)


# Function to preprocess the emails
def preprocess(text):
    """Run the traditional preprocessing pipeline on one email text.

    Steps: keyword replacement of URLs/attachments/emails/phone numbers,
    normalisation (special characters, lowercase, stop words, digits), then
    language detection and translation to English. When the translation step
    changed the text, its output is normalised again, because translated text
    can reintroduce capitals, punctuation, stop words and digits.

    Args:
        text: Raw email text. Non-string values (e.g. NaN) are treated as an
            empty string.

    Returns:
        The cleaned text.
    """
    if not isinstance(text, str):
        text = ''
    cleaned_text = url_attachment_identification(text)
    cleaned_text = _normalize_text(cleaned_text)
    translated_text = detect_and_translate(cleaned_text)
    if translated_text != cleaned_text:
        translated_text = _normalize_text(translated_text)
    return translated_text

In [None]:
# Run the preprocessing pipeline over the raw "text" column of the train and
# test splits of Dataset1
Dataset1_Train['cleaned_text'] = Dataset1_Train.loc[:, "text"].apply(preprocess)
Dataset1_Test['cleaned_text'] = Dataset1_Test.loc[:, "text"].apply(preprocess)

# Dataset1_1 keeps only the cleaned text together with its label
Dataset1_1Train, Dataset1_1Test = (
    split[["cleaned_text", "label"]] for split in (Dataset1_Train, Dataset1_Test)
)

In [None]:
# Display the cleaned training split (Dataset1_1) as this cell's output
Dataset1_1Train

In [None]:
# Display the cleaned testing split (Dataset1_1) as this cell's output
Dataset1_1Test

Traditional + Split_Words + SpellChecker
Along with the traditional methods used, we also use the second level of preprocessing functions - 
1) Splitting of Words
2) Spelling Correction

In [None]:
def _load_cleaned_split(csv_path):
    """Load one saved Dataset1_1 split from CSV.

    Missing ``cleaned_text`` values are replaced with empty strings, and the
    ``Unnamed: 0`` column (the index written by ``to_csv``) is dropped when
    present, so files saved with or without an index both load.
    """
    split = pd.read_csv(csv_path)
    split['cleaned_text'] = split['cleaned_text'].fillna('')
    return split.drop(columns=['Unnamed: 0'], errors='ignore')


# Load previously saved Dataset1_1
# NOTE(review): the save cell at the end of the notebook writes "*_new2.csv"
# for Dataset1_1 while "*_new3.csv" is read here -- confirm the file names.
Dataset1_1Train = _load_cleaned_split("Dataset1_1Train_new3.csv")
Dataset1_1Test = _load_cleaned_split("Dataset1_1Test_new3.csv")

In [None]:
# Lazily-created English spell checker, shared by all calls
_SPELLER = None


# Spelling correction
def autocorrect_email(text):
    """Return the text with spelling corrected; non-string input is returned as is.

    The ``Speller`` is created once on first use and then reused, instead of
    being rebuilt for every email.
    """
    global _SPELLER
    if not isinstance(text, str):
        return text
    if _SPELLER is None:
        _SPELLER = Speller(lang="en")
    return _SPELLER(text)

In [None]:
def _spellcheck_and_split(text):
    """Spell-correct one text, then split run-together words with wordninja."""
    corrected = autocorrect_email(text)
    if not isinstance(corrected, str):
        # Missing values (e.g. NaN) have nothing to split
        return ''
    return " ".join(wordninja.split(corrected))


# Apply autocorrect and word splitting on train and test datasets.
# The new column is built in one pass and assigned positionally, which avoids
# one label-based .loc write per row (slow, and wrong if labels repeat).
Dataset1_1Train['final_cleaned_text'] = [
    _spellcheck_and_split(text)
    for text in tqdm(Dataset1_1Train['cleaned_text'], total=len(Dataset1_1Train), desc="Processing Training Data")
]

Dataset1_1Test['final_cleaned_text'] = [
    _spellcheck_and_split(text)
    for text in tqdm(Dataset1_1Test['cleaned_text'], total=len(Dataset1_1Test), desc="Processing Testing Data")
]

# Final datasets
Dataset1_2Train = Dataset1_1Train[["final_cleaned_text", "label"]]
Dataset1_2Test = Dataset1_1Test[["final_cleaned_text", "label"]]


In [None]:
# Dataframe to csv conversion
#Dataset1_Train.to_csv("Dataset1_Train_new2.csv")
#Dataset1_Test.to_csv("Dataset1_Test_new2.csv")
#Dataset1_1Train.to_csv("Dataset1_1Train_new2.csv")
#Dataset1_1Test.to_csv("Dataset1_1Test_new2.csv")
#Dataset1_2Train.to_csv("Dataset1_2Train_new3.csv")
#Dataset1_2Test.to_csv("Dataset1_2Test_new3.csv")