# Import libraries

In [2]:
import numpy as np 
import pandas as pd 
import sklearn
from sklearn.cluster import KMeans 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from scipy.stats import multivariate_normal as mvn
import nltk
import os
import random
import string
import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')

import os, sys, email,re

# DATASET  LOADING 
"""The code above imports the pandas library and reads two CSV files into dataframes. 
The read_csv function from pandas is used to read the CSV files. In the case of 'emails.csv', 
the number of rows is limited to 3900 using the nrows parameter. In the case of 'nazario_recent.csv', 
the index column is set to the first column using the index_col parameter, and the data type of the 'body' column 
is set to a generic object using the dtype parameter."""

In [3]:
# Read 'emails.csv' file into a dataframe with a limit of 3900 rows(Benign Corpus)
df = pd.read_csv('emails.csv', nrows=3900)

# Read 'nazario_recent.csv' file into a dataframe with the first column as the index column
# Set the data type of the 'body' column as a generic object(Phishing Corpus)
jf = pd.read_csv('nazario_recent.csv', index_col=0, dtype={'body': 'object'})


# EMAIL PARSING FOR BENIGN DATASET

In [4]:
"""Extract information such as the subject, sender, and recipient of each email
  then create new columns in the DataFrame for each piece of extracted information.""" #Code reference(https://www.kaggle.com/code/jxm222/clustering-mail/notebook)


# Parse each email message in the "message" column of the DataFrame
emails = list(map(email.parser.Parser().parsestr, df['message']))

# extract headings from the first email object in the list of emails such as subject, from, to etc..
headings = emails[0].keys()

"""Loop through each heading, extract the corresponding values from each email object in the list, 
and create a new column in the DataFrame with the heading as the column name and the extracted 
values as the column values""" 
for key in headings:
    df[key] = [doc[key] for doc in emails]

    
# Define a function to extract the raw text from each email object in the list of emails
def get_raw_text(emails):
    email_text = []
    for email in emails.walk():
        if email.get_content_type() == 'text/plain':
            email_text.append(email.get_payload())
    return ''.join(email_text)

"""Apply the "get_raw_text" function to each email object in the list of emails,and store 
the resulting strings in a new column called "message" in the DataFrame
"""
df['message'] = list(map(get_raw_text, emails))

"""Extract the username from the file path of each email,and store the usernames in a new column called
"user" in the DataFrame"""

df['user'] = df['file'].map(lambda x: x.split('/')[0])

In [5]:
print(df.iloc[1])


file                                                    allen-p/_sent_mail/10.
message                      Traveling to have a business meeting takes the...
Message-ID                       <15464986.1075855378456.JavaMail.evans@thyme>
Date                                      Fri, 4 May 2001 13:51:00 -0700 (PDT)
From                                                   phillip.allen@enron.com
To                                                     john.lavorato@enron.com
Subject                                                                    Re:
Mime-Version                                                               1.0
Content-Type                                      text/plain; charset=us-ascii
Content-Transfer-Encoding                                                 7bit
X-From                                                         Phillip K Allen
X-To                         John J Lavorato <John J Lavorato/ENRON@enronXg...
X-cc                                                

In [101]:
#view the new columns after parsing the email method on the Benign Dataset
df.columns

Index(['file', 'message', 'Message-ID', 'Date', 'From', 'To', 'Subject',
       'Mime-Version', 'Content-Type', 'Content-Transfer-Encoding', 'X-From',
       'X-To', 'X-cc', 'X-bcc', 'X-Folder', 'X-Origin', 'X-FileName', 'user'],
      dtype='object')

In [74]:
#Read the first index of the message column of the benign dataset
print(df['message'].iloc[1])

Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.

As far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not.  Too often the presenter speaks and the others are quiet just waiting for their turn.   The meetings might be better if held in a round table discussion format.  

My suggestion for where to go is Austin.  Play golf and rent a ski boat and jet ski's.  Flying somewhere takes too much time.



# PHISHING CLEANUP 

In [75]:
#Get information about the Phishing Dataset before cleaning
jf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1918 entries, 0 to 2162
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   body    1917 non-null   object
dtypes: object(1)
memory usage: 30.0+ KB


In [76]:
#import necessary preprocessing libraries for the cleaning(Reference:https://github.com/liakoyras/thesis-phishing-email-detection)

from raw_utils import save_to_csv
import preprocessing as util

In [102]:
#Removing Duplicate rows
jf = jf[jf.duplicated(keep='first') == False]

#View the new shape
jf.shape

(1918, 1)

# LEGITIMATE(BENIGN) EMAILS CLEANUP

In [78]:

#Dropping Columns not needed in our ML model building

df = df.drop(['file', 'Message-ID', 'Date', 'From', 'To', 'Subject',
       'Mime-Version', 'Content-Type', 'Content-Transfer-Encoding', 'X-From',
       'X-To', 'X-cc', 'X-bcc', 'X-Folder', 'X-Origin', 'X-FileName', 'user'], axis=1)

In [79]:
#Get new information of the Benign Corpus
df.info

#View the new shape
df.shape

(3900, 1)

In [80]:
#Drop Null Rows and Rename the column


#Rename the column message to body
df = df.rename(columns={'message':'body'})
print(df)

                                                   body
0                             Here is our forecast\n\n 
1     Traveling to have a business meeting takes the...
2                        test successful.  way to go!!!
3     Randy,\n\n Can you send me a schedule of the s...
4                   Let's shoot for Tuesday at 11:45.  
...                                                 ...
3895  asshole\n\n\n\n\nJohn J Lavorato@ENRON\n12/24/...
3896  the market had to get to a price whereby these...
3897  ---------------------- Forwarded by Sarah-Joy ...
3898  ---------------------- Forwarded by John Arnol...
3899  i know\n\n\n\n\nJennifer Shipos\n12/21/2000 12...

[3900 rows x 1 columns]


In [81]:
#Drop null Rows
df = df.dropna()

In [82]:
#View the new shape
df.shape

(3900, 1)

In [83]:
#Check for empty emails
df = df[df['body'].apply(util.check_empty) == False]
df.shape

(3900, 1)

In [84]:
#Check for Duplicates
df = df[df.duplicated(keep='first') == False]
df.shape

(2210, 1)

In [85]:
#View the Column of the new Dataframe
df.columns

Index(['body'], dtype='object')

# Creating a Mixed Balanced Dataset

In [86]:
"""This Step creates a combined dataset of the two dataframes and add an extra column named "class" 
to classify the emails as either phishing or legitimate.The Balanced dataset will be 1:1"""


#Create the column for the phishing dataset with value 1 to represent phishing
jf['class'] = 1


In [87]:
#Sample a random set of 1918 rows from the benign dataframe to match the number of rows 
#in the phishing dataframe and set create an extra column and set the class to 0 to indicate legitimate emails

df = df.sample(n=1918, random_state = 1746)
df['class'] = 0

In [88]:
#View the dataframes

print("\033[1m" + "This is for the Legitimate emails:\n" + "\033[0;0m",df)
print("\033[1m" + "This is for the Phishing emails:\n" + "\033[0;0m",jf)

[1mThis is for the Legitimate emails:
[0;0m                                                    body  class
1196  Scott,\n\n Thanks for the email.  I have two q...      0
1412  [IMAGE]\nNCI Marketing Web Alert \t\n\n\n[IMAG...      0
1295  phillip.\n\nlooking into it now, i'll move the...      0
2274  \n\n -----Original Message-----\nFrom: \t"JEFF...      0
442   I would like to have a copy of the appraisal. ...      0
...                                                 ...    ...
3848  Jennifer,\nI just checked with Carolyn on your...      0
1232  \n \n>From: "Greg Thorse" \n>To: \n>CC: "Phill...      0
3521  planning on going.  which night are you inviti...      0
1475  The following expense report is ready for appr...      0
2206  Tim,\n\nIs something wrong with the database? ...      0

[1918 rows x 2 columns]
[1mThis is for the Phishing emails:
[0;0m                                                    body  class
0       Microsoft Failure Delivery Notice.\n  User: ...      1
2  

In [89]:
"""Create the Mixed dataframe and insert a new column id at the left-most column 
to help create an easy reference on individual rows incase we need to perform operations on them."""

balanced = pd.concat ([jf, df])

#Shuffle rows in a random order from the initial combined order
balanced = balanced.sample(frac=1, random_state=1746).reset_index(drop=True)

#Create the new column id from the first index
balanced.insert(0, 'id', balanced.index)

In [90]:
#View the new comibined dataframe
balanced.shape

(3836, 3)

In [None]:
#Create a csv file path and save the balanced dataset for further processing
csv_path = os.path.join('/Users/mimidubcys/Desktop/Projects/A.I Powered Phishing Detection System/Project Files/Phishing Datasets/Phishing Detection Coding Task')

save_to_csv(balanced,csv_path,'balanced.csv')