# Summary

The following is the step taken to clean the datasets
- Removing duplicates based on 'raw_mail' column
- Change the "None" to an actuall None value to all column
- Remove email address that is not in the same format
- Fill empty/none email address with bfill and ffill
- Fill empty/none subject with bffill and ffill
- Update the 'date' so it is in one format
- Add malicious column

# import libraries

In [1]:
import pandas as pd
from datetime import datetime
from dateutil import parser
import re

Parse the datasets into dataframe

In [2]:
fraudDataframe = pd.read_json('datasets/raw/fradulent_emails.json', orient='index')
phishingDataframe = pd.read_json('datasets/raw/phishing-chorpus.json', orient='index')
enronDataframe = pd.read_csv('datasets/raw/enron-emails.csv')

Remove duplicates

In [3]:
fraudDataframe = fraudDataframe.drop_duplicates(subset="raw_mail")
phishingDataframe = phishingDataframe.drop_duplicates(subset="raw_mail")
enronDataframe = enronDataframe.drop_duplicates(subset="raw_mail")

In [4]:
print(fraudDataframe.info())
fraudDataframe.sample(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3939 entries, 0 to 3977
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   raw_mail  3939 non-null   object
 1   subject   3939 non-null   object
 2   from      3939 non-null   object
 3   to        3939 non-null   object
 4   status    3939 non-null   object
 5   date      3939 non-null   object
 6   body      3939 non-null   object
dtypes: object(7)
memory usage: 246.2+ KB
None


Unnamed: 0,raw_mail,subject,from,to,status,date,body
296,Return-Path: <basher39@lycos.com>\nMessage-Id:...,ASSISTANCE NEEDED,BASHER MOBUTU <basher39@lycos.com>,R@M,O,"Tue, 01 Jul 2003 01:29:15 +0200","Good Day,\nYou may be surprise to receive this..."
3857,Return-Path: <web391@jenny.webhoster.ag>\nX-Si...,MUTUAL PARTNERSHIP.,"""LUISA ESTRADA"" <luisa@blue-conn.de>",,O,"Sun, 22 Jul 2007 16:57:01 +0200 (CEST)","Dear Friend,\n\nThe political unrest in my cou..."
2885,Return-Path: <barripikolo@adinet.com.uy>\nX-Si...,GET BACK TO ME.,barri pikolo <barripikolo@adinet.com.uy>,,RO,"Sat, 29 Jul 2006 16:07:15 -0300 (UYT)",FROM THE DESK OF:MR.BARR PIKOLO COKER\nGENERAL...
3727,Return-Path: <sussanbien44@yahoo.ca>\nX-Sieve:...,SORROW AND TEARS FROM SUSSAN BIEN,,sussanbien44@yahoo.ca,RO,,"Dearest One,\n\nThanks for your mail, I got yo..."
3035,Return-Path: <hassan_abdoulaye3@yahoo.co.uk>\n...,URGENTLY REPLY,"""hassan abdoulaye"" <hassan_abdoulaye3@yahoo....",,O,"Thu, 14 Sep 2006 14:12:13 +0200",My Dear=2C \n\nGood day to you=2C I hope fine=...


In [5]:
print(phishingDataframe.info())
phishingDataframe.sample(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4190 entries, 0 to 4195
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   raw_mail  4190 non-null   object
 1   subject   4190 non-null   object
 2   from      4190 non-null   object
 3   to        4190 non-null   object
 4   status    4190 non-null   object
 5   date      4190 non-null   object
 6   body      4190 non-null   object
dtypes: object(7)
memory usage: 261.9+ KB
None


Unnamed: 0,raw_mail,subject,from,to,status,date,body
2126,Return-Path: <member@ebay.com>\nX-Original-To:...,Question about Item -- Respond Now,"""Question from ebay member: jancortina"" <membe...",user@example.com,RO,"Mon, 25 Sep 2006 08:43:28 -0700",<TABLE cellSpacing=3D0 cellPadding=3D5 width=3...
1796,Return-Path: <administrator@paypal.com>\nX-Ori...,Update your PayPal account,"""PayPal"" <postmaster@paypal.com>",nobody@example.com,RO,"Mon, 27 Mar 2006 23:40:21 -0100",<div id=3Dmessage>\r\n\r\n\r\n\r\n\r\n<BR>Dear...
380,Return-Path: <wsykespg@ebay.com>\nX-Original-T...,=?ISO-8859-1?b?RWJheSBBY2NvdW50IFJldmlldyAgICA...,"""Bernadine W. Sykes"" <wsykespg@ebay.com>",username@domain.com,O,"Thu, 17 Jun 2004 23:09:00 +0000","<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01 T..."
3532,Return-Path: <user@mail.example.com>\nDelivere...,eBay - eBay automatically invites qualified us...,"""eBay Team"" <aw-confirm36@ebay.com>",undisclosed-recipients: ;,RO,"Sat, 16 Jul 2005 02:13:58 -0700","<style type=""text/css"">\r\n<!--\r\n.style1 {fo..."
2665,Return-Path: <Visa@visa.com>\nX-Original-To: u...,Verified by Visa enrollment,Visa@visa.com,user@example.com,RO,15 Feb 2007 04:28:07 -0700,"<!doctype html public ""-//W3C//DTD HTML 4.0 Tr..."


In [6]:
print(enronDataframe.info())
enronDataframe.sample(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 517401 entries, 0 to 517400
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  517401 non-null  int64  
 1   raw_mail    517401 non-null  object 
 2   subject     498214 non-null  object 
 3   from        517401 non-null  object 
 4   to          495554 non-null  object 
 5   status      0 non-null       float64
 6   date        517401 non-null  object 
 7   body        517401 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 35.5+ MB
None


Unnamed: 0.1,Unnamed: 0,raw_mail,subject,from,to,status,date,body
276465,276465,Message-ID: <7124113.1075858127346.JavaMail.ev...,"Fw: FW: Winners of the ""I Look Like My Dog"" co...",matthew.lenhart@enron.com,debbielatham@realtor.com,,"Mon, 7 Aug 2000 07:03:00 -0700 (PDT)",---------------------- Forwarded by Matthew Le...
411528,411528,Message-ID: <19577259.1075841559934.JavaMail.e...,FW: OASIS Posting - Real Power Loss Return Met...,cara.semperger@enron.com,donald.robinson@enron.com,,"Mon, 26 Nov 2001 10:06:19 -0800 (PST)",\n\n-----Original Message-----\nFrom: Steve Hu...
249294,249294,Message-ID: <6069463.1075846355144.JavaMail.ev...,Re: Dabhol Reg Risk,james.steffes@enron.com,steven.kean@enron.com,,"Fri, 2 Jun 2000 00:55:00 -0700 (PDT)",FYI. Jane is doing a great job supporting the...
31426,31426,Message-ID: <9021295.1075855904437.JavaMail.ev...,Re: excitement,sally.beck@enron.com,mary.gray@enron.com,,"Thu, 10 Aug 2000 04:13:00 -0700 (PDT)","David is doing well, too. He is working contr..."
67531,67531,Message-ID: <31906215.1075859195299.JavaMail.e...,RE: Doubletree PowerPoint Presentation,michael.tribolet@enron.com,jeff.dasovich@enron.com,,"Thu, 20 Dec 2001 15:08:17 -0800 (PST)",Yes it is.\n\n-----Original Message-----\nFrom...


From a quick glance from all the 3 datasets, there are multiple inconsistensy that can be found in the format of the values. 

- from and to columns contains not only the emails
- datetime isn't in one format

In [7]:
print(fraudDataframe.isna().sum(), '\n') # contains none but inst register as one
print(phishingDataframe.isna().sum(), '\n') # contains none but isnt register as one
print(enronDataframe.isna().sum()) # contains null values

raw_mail    0
subject     0
from        0
to          0
status      0
date        0
body        0
dtype: int64 

raw_mail    0
subject     0
from        0
to          0
status      0
date        0
body        0
dtype: int64 

Unnamed: 0         0
raw_mail           0
subject        19187
from               0
to             21847
status        517401
date               0
body               0
dtype: int64


As the results show above only the enron sets register having a null values while in fact all 3 datasets does contains a null values

If we check the values for a "None" in a string format we will infact found that the rest of the datasets does in fact contains a Null value 

In [8]:
print((fraudDataframe == "None").sum())
print((phishingDataframe == "None").sum())
print((enronDataframe == "None").sum())

raw_mail      0
subject      17
from        365
to          948
status        0
date        534
body          0
dtype: int64
raw_mail     0
subject     49
from         4
to           9
status       5
date         3
body         0
dtype: int64
Unnamed: 0    0
raw_mail      0
subject       0
from          0
to            0
status        0
date          0
body          0
dtype: int64


Update the datasets to change the "None" values to an actual None

In [9]:
def updateToNone(val):
    if val == "None":
        return None
    else:
        return val

fraudDataframe['subject'] = fraudDataframe['subject'].apply(updateToNone)
fraudDataframe['to'] = fraudDataframe['to'].apply(updateToNone)
fraudDataframe['from'] = fraudDataframe['to'].apply(updateToNone)
fraudDataframe['status'] = fraudDataframe['status'].apply(updateToNone)
fraudDataframe['date'] = fraudDataframe['date'].apply(updateToNone)

print((fraudDataframe == "None").sum())
print(fraudDataframe.isna().sum()) 

raw_mail    0
subject     0
from        0
to          0
status      0
date        0
body        0
dtype: int64
raw_mail      0
subject      17
from        948
to          948
status        0
date        534
body          0
dtype: int64


In [10]:
phishingDataframe['subject'] = phishingDataframe['subject'].apply(updateToNone)
phishingDataframe['to'] = phishingDataframe['to'].apply(updateToNone)
phishingDataframe['from'] = phishingDataframe['to'].apply(updateToNone)
phishingDataframe['status'] = phishingDataframe['status'].apply(updateToNone)
phishingDataframe['date'] = phishingDataframe['date'].apply(updateToNone)

print((fraudDataframe == "None").sum())
print(fraudDataframe.isna().sum()) 

raw_mail    0
subject     0
from        0
to          0
status      0
date        0
body        0
dtype: int64
raw_mail      0
subject      17
from        948
to          948
status        0
date        534
body          0
dtype: int64


To fix the inconsistensy format of the email in columns "from" and "to" , we will used regex to extract the valid emails first then, fill the empty values with valid values from the datasets

In [11]:
notValidEmail = phishingDataframe['from'].str.contains('[a-zA-Z0-9-_.]*@a-zA-Z0-9-]*(\.[a-zA-Z]*)*', regex=True) == False
phishingDataframe[notValidEmail]['from'].count()

  notValidEmail = phishingDataframe['from'].str.contains('[a-zA-Z0-9-_.]*@a-zA-Z0-9-]*(\.[a-zA-Z]*)*', regex=True) == False


4181

In [12]:
notValidEmail = fraudDataframe['from'].str.contains('([a-zA-Z0-9-_.])*@([a-zA-Z0-9-])*(\.[a-zA-Z]*)*', regex=True) == False
fraudDataframe[notValidEmail]['from'].count()

  notValidEmail = fraudDataframe['from'].str.contains('([a-zA-Z0-9-_.])*@([a-zA-Z0-9-])*(\.[a-zA-Z]*)*', regex=True) == False


545

In [13]:
notValidEmail = enronDataframe['to'].str.contains('([a-zA-Z0-9-_.])*@([a-zA-Z0-9-])*(\.[a-zA-Z]*)*', regex=True) == False
enronDataframe[notValidEmail]['to'].count()

  notValidEmail = enronDataframe['to'].str.contains('([a-zA-Z0-9-_.])*@([a-zA-Z0-9-])*(\.[a-zA-Z]*)*', regex=True) == False


16

In [14]:
parsedFrom = fraudDataframe['from'].str.extract('([a-zA-Z0-9-_.]*@[a-zA-Z0-9-]*(\.[a-zA-Z]*)*)')
parsedTo = fraudDataframe['to'].str.extract('([a-zA-Z0-9-_.]*@[a-zA-Z0-9-]*(\.[a-zA-Z]*)*)')

fraudDataframe['parsed_from'] = parsedFrom[0]
fraudDataframe['parsed_to'] = parsedTo[0]

In [15]:
parsedFrom = phishingDataframe['from'].str.extract('([a-zA-Z0-9-_.]*@[a-zA-Z0-9-]*(\.[a-zA-Z]*)*)')
parsedTo = phishingDataframe['to'].str.extract('([a-zA-Z0-9-_.]*@[a-zA-Z0-9-]*(\.[a-zA-Z]*)*)')

phishingDataframe['parsed_from'] = parsedFrom[0]
phishingDataframe['parsed_to'] = parsedTo[0]

In [16]:
parsedFrom = enronDataframe['from'].str.extract('([a-zA-Z0-9-_.]*@[a-zA-Z0-9-]*(\.[a-zA-Z]*)*)')
parsedTo = enronDataframe['to'].str.extract('([a-zA-Z0-9-_.]*@[a-zA-Z0-9-]*(\.[a-zA-Z]*)*)')

enronDataframe['parsed_from'] = parsedFrom[0]
enronDataframe['parsed_to'] = parsedTo[0]

Bellow is the kind of fields that wasnt register as an email

In [17]:
print(fraudDataframe[fraudDataframe['parsed_from'].isna()]['from'].unique())
print(fraudDataframe[fraudDataframe['parsed_to'].isna()]['to'].unique())
print(phishingDataframe[phishingDataframe['parsed_from'].isna()]['from'].unique())
print(phishingDataframe[phishingDataframe['parsed_to'].isna()]['to'].unique())

[None 'undisclosed-recipients: ;' 'undisclosed-recipients:;' ''
 'undisclosed recipients: ;' 'N/A <>, N/A <>' 'N/A <>']
[None 'undisclosed-recipients: ;' 'undisclosed-recipients:;' ''
 'undisclosed recipients: ;' 'N/A <>, N/A <>' 'N/A <>']
['undisclosed-recipients: ;' '[removed]' None 'undisclosed-recipients:;'
 'unlisted-recipients:; (no To-header on input)'
 '<Undisclosed-Recipient:;>' '=?euc-kr?B?u+e2+7nnu/W6rsbtwfawocG3?=' '']
['undisclosed-recipients: ;' '[removed]' None 'undisclosed-recipients:;'
 'unlisted-recipients:; (no To-header on input)'
 '<Undisclosed-Recipient:;>' '=?euc-kr?B?u+e2+7nnu/W6rsbtwfawocG3?=' '']


In [18]:
phishingDataframe['parsed_from'] = phishingDataframe['parsed_from'].ffill().bfill()
fraudDataframe['parsed_from'] = fraudDataframe['parsed_from'].ffill().bfill()
enronDataframe['parsed_from'] = enronDataframe['parsed_from'].ffill().bfill()

In [19]:
phishingDataframe['parsed_to'] = phishingDataframe['parsed_to'].ffill().bfill()
fraudDataframe['parsed_to'] = fraudDataframe['parsed_to'].ffill().bfill()
enronDataframe['parsed_to'] = enronDataframe['parsed_to'].ffill().bfill()

all the datasets subject column contains a null value, we will fill this value using existing fields in the datasets

In [20]:
phishingDataframe['subject'] = phishingDataframe.subject.ffill().bfill()
fraudDataframe['subject'] = fraudDataframe.subject.ffill().bfill()
enronDataframe['subject'] = enronDataframe.subject.ffill().bfill()

In [21]:
print('Number of row that have empty subject for phishingDataframe:', phishingDataframe.subject.isnull().sum())
print('Number of row that have empty subject for fraudDataframe:', fraudDataframe.subject.isnull().sum())
print('Number of row that have empty subject for enronDataframe:', enronDataframe.subject.isnull().sum())

Number of row that have empty subject for phishingDataframe: 0
Number of row that have empty subject for fraudDataframe: 0
Number of row that have empty subject for enronDataframe: 0


In [22]:
enronDataframe['parsed_date'] = enronDataframe.date.apply(lambda date: parser.parse(date).isoformat())

Fill empty fields so no null exist by doing backward and forward fill

In [23]:
fraudDataframe.date = fraudDataframe.date.ffill().bfill()
phishingDataframe.date = phishingDataframe.date.ffill().bfill()

In [24]:
diff = phishingDataframe.shape[0] - phishingDataframe.date.str.contains('[A-Za-z]{0,3}, \d* [A-Za-z]{0,3} \d{4}').sum()
print("Total date row that are not in format for phishingDataframe:", diff)
diff = fraudDataframe.shape[0] - fraudDataframe.date.str.contains('[A-Za-z]{0,3}, \d* [A-Za-z]{0,3} \d{4}').sum()
print("Total date row that are not in format for phishingDataframe:", diff)

Total date row that are not in format for phishingDataframe: 444
Total date row that are not in format for phishingDataframe: 151


In [25]:
def parseDate(date):
    try:
        return parser.parse(date).isoformat()
    except Exception as e:
        return None

In [26]:
phishingDataframe['parsed_date'] = phishingDataframe.date.str.replace('\.', ':', regex=True)
phishingDataframe['parsed_date'] = phishingDataframe['parsed_date'].apply(parseDate)




Manual cleaning for cases that are to few to automate

In [27]:
phishingDataframe.loc[821].parsed_date = parser.parse("Fri, 09 Jun 2006 08:23:29 +0500 (EST)").isoformat()
phishingDataframe.loc[892].parsed_date = parser.parse("Fri, 23 Jun 2006 13:25:46 -0100 (EST)").isoformat()
phishingDataframe.loc[896].parsed_date = parser.parse("Fri, 23 Jun 2006 21:36:05 +0800").isoformat()
phishingDataframe.loc[1066].parsed_date = parser.parse("Wed, 26 Jul 2006 09:48:28 -0800").isoformat()
phishingDataframe.loc[1067].parsed_date = parser.parse("Wed, 26 Jul 2006 12:50:48 -0600").isoformat()
phishingDataframe.loc[1072].parsed_date = parser.parse("Thu, 27 Jul 2006 03:06:10 -0800").isoformat()
phishingDataframe.loc[1074].parsed_date = parser.parse("Wed, 26 Jul 2006 15:24:52 -0500").isoformat()
phishingDataframe.loc[1075].parsed_date = parser.parse("Wed, 26 Jul 2006 15:43:42 -0500").isoformat()
phishingDataframe.loc[1076].parsed_date = parser.parse("Wed, 26 Jul 2006 19:03:49 -0300").isoformat()
phishingDataframe.loc[1077].parsed_date = parser.parse("Wed, 26 Jul 2006 19:35:02 -0300").isoformat()
phishingDataframe.loc[1095].parsed_date = parser.parse("31.07.2006").isoformat()
phishingDataframe.loc[1173].parsed_date = parser.parse("Thu, 3 Aug 2006 00:13:00 -0530").isoformat()
phishingDataframe.loc[2421].parsed_date = parser.parse("Tue, 09 Jan 2007 14:00:44 +0430").isoformat()
phishingDataframe.loc[3540].parsed_date = parser.parse("Sun, 10 Sep 2006 14:00:47 +0000").isoformat()
phishingDataframe.loc[3643].parsed_date = parser.parse("Fri, 09 Mar 2007 18:11:57 +0530").isoformat()
phishingDataframe.loc[3896].parsed_date = parser.parse("07.08.2006").isoformat()
phishingDataframe.loc[3963].parsed_date = parser.parse("Mon, 24 Feb 2003 17:32:08 +0000").isoformat()
phishingDataframe.loc[4117].parsed_date = parser.parse("Sun, 10 Sep 2006 12:08:54 -0300").isoformat()

In [28]:
fraudDataframe['parsed_date'] = fraudDataframe.date.str.replace('\.', ':', regex=True)
fraudDataframe['parsed_date'] = fraudDataframe['parsed_date'].apply(parseDate)



In [29]:
def myfunc(row):
    if row.parsed_date == None:
        try:
            row.parsed_date = parser.parse(
                re.search("([A-Za-z]{1,3}, \d{0,2} [A-Za-z]* \d{2,4} \d{2}:\d{2}:\d{2} ((\+|\-)?\d{4})?)", 
                          row.date).group(1)).isoformat()
            return row
        except Exception as e:
            return row
    else:
        return row

fraudDataframe = fraudDataframe.apply(myfunc, axis=1)

Manual Updates

In [30]:
fraudDataframe.loc[542].parsed_date = parser.parse("Sun, 09 nov 2003 21:18:28").isoformat()
fraudDataframe.loc[1236].parsed_date = parser.parse("Tue, 09 nov 2004 15:38:35 -0300").isoformat()

Add prediction label

In [31]:
fraudDataframe['malicious'] = True
phishingDataframe['malicious'] = True
enronDataframe['malicious'] = False

In [32]:
fraudDataframe.to_csv(path_or_buf='datasets/clean/fraud-emails.csv', index=False)
phishingDataframe.to_csv(path_or_buf='datasets/clean/phishing-emails.csv', index=False)
enronDataframe.to_csv(path_or_buf='datasets/clean/enron-emails.csv', index=False)