## Import libraries

In [2]:
import pandas as pd
from datetime import datetime
from dateutil import parser
import re
import email


## Import Datasets to Pandas

In [3]:
fraudDataframe = pd.read_csv('datasets/clean/fraud-emails.csv')
phishingDataframe = pd.read_csv('datasets/clean/phishing-emails.csv')
enronDataframe = pd.read_csv('datasets/clean/enron-emails.csv')

In [4]:
malicious_df = pd.concat([fraudDataframe, phishingDataframe], ignore_index=True)
enron_df = enronDataframe

Run this if the index is save to the CSV

In [5]:
malicious_df = malicious_df.drop(columns='Unnamed: 0')
enron_df = enron_df.drop(columns=['Unnamed: 0','Unnamed: 0.1'])

Extract extra informations that can be used as a features

In [6]:
# this method is really inefficient and will take too long for larger datasets
# 

def getExtraInfo(row):
    try:
        message = email.message_from_string(row.raw_mail)
        row['content_type'] = message.get_content_type()      
        row['charset'] = message.get_content_charset()
        row['content_transfer_encoding'] = message['Content-Transfer-Encoding']
        return row
    except Exception as e:
        return row

malicious_df = malicious_df.apply(getExtraInfo, axis=1)

If we used the method above for the enron_df it will take a lot longer (30+ minutes) then doing it like in the code bellow 

In [7]:
enron_df['content_type'] = enron_df.raw_mail.apply(lambda raw_mail: email.message_from_string(raw_mail).get_content_type())

In [8]:
enron_df['charset'] = enron_df.raw_mail.apply(lambda raw_mail: email.message_from_string(raw_mail).get_content_charset())

In [9]:
enron_df['content_transfer_encoding'] = enron_df.raw_mail.apply(lambda raw_mail: email.message_from_string(raw_mail)['Content-Transfer-Encoding'])

Get the email domain used to send the email and the domain of the email for the receiver

In [10]:
malicious_df['from_domain'] = malicious_df.parsed_from.str.split('@', expand=True)[1]
malicious_df['to_domain'] = malicious_df.parsed_from.str.split('@', expand=True)[1]

In [11]:
(malicious_df.groupby(['to_domain']).from_domain.count()).sort_values(ascending=False)

to_domain
example.com         3204
M                   1328
domain.com           733
S                    358
aclweb.org           297
                    ... 
portugalmail.com       1
alibi.com.mk           1
jrocha.com.br          1
bluemail.ch            1
123.                   1
Name: from_domain, Length: 321, dtype: int64

In [12]:
enron_df['from_domain'] = enron_df.parsed_from.str.split('@', expand=True)[1]
enron_df['to_domain'] = enron_df.parsed_from.str.split('@', expand=True)[1]

In [13]:
(enron_df.groupby(['to_domain']).from_domain.count()).sort_values(ascending=False)

to_domain
enron.com                 427784
aol.com                     2803
hotmail.com                 2427
mailman.enron.com           1775
txu.com                     1653
                           ...  
learningstrategies.com         1
leasinggroup.com               1
lexgen.com                     1
lexis-nexis.com                1
zzz2.net                       1
Name: from_domain, Length: 5289, dtype: int64

Clean some inconsistensy in the content_type and content_transfer_encoding columns

In [14]:
print(malicious_df.content_type.unique())
print(malicious_df.isna().sum())

['text/plain' 'multipart/mixed' 'multipart/alternative' 'text/html'
 'multipart/related' 'text/html content-transfer-encoding: 8bit\\r\\n'
 'text/htmlcontent-transfer-encoding:8bitrn']
raw_mail                        0
subject                        38
from                         1043
to                           1043
status                        363
date                            0
body                            0
parsed_from                     0
parsed_to                       0
parsed_date                     0
malicious                       0
content_type                    0
charset                      4217
content_transfer_encoding    4021
from_domain                     0
to_domain                       0
dtype: int64


In [15]:
print(enron_df.content_type.unique())
print(enron_df.isna().sum())

['text/plain']
raw_mail                          0
subject                           0
from                              0
to                            21847
status                       517401
date                              0
body                              0
parsed_from                       0
parsed_to                         0
parsed_date                       0
malicious                         0
content_type                      0
charset                          29
content_transfer_encoding        29
from_domain                       0
to_domain                         0
dtype: int64


In [16]:
malicious_df.content_transfer_encoding.loc[
    (malicious_df.content_type == 'text/htmlcontent-transfer-encoding:8bitrn') | 
    (malicious_df.content_type == 'text/html content-transfer-encoding: 8bit\\r\\n')] = '8bit'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malicious_df.content_transfer_encoding.loc[


In [17]:
malicious_df.content_type.loc[
    (malicious_df.content_type == 'text/htmlcontent-transfer-encoding:8bitrn') | 
    (malicious_df.content_type == 'text/html content-transfer-encoding: 8bit\\r\\n')] = "text/html"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malicious_df.content_type.loc[


Adding Numeric, Boolean and others features

In [18]:
# Get email that contains html
malicious_df['html'] = malicious_df.content_type.str.contains('text/html', case=False, regex=True)
# Get email that contains javascript
malicious_df['javascript'] = malicious_df.raw_mail.str.contains('(<script|.js)', case=False, regex=True)
# Get email that contains css
malicious_df['css'] = malicious_df.raw_mail.str.contains('(<style|\.css)', case=False, regex=True)
# Get email that contains html form
malicious_df['html_form'] = malicious_df.raw_mail.str.contains('(<form)', case=False, regex=True)
malicious_df['html_iframe'] = malicious_df.raw_mail.str.contains('<iframe', case=False, regex=True)

  malicious_df['javascript'] = malicious_df.raw_mail.str.contains('(<script|.js)', case=False, regex=True)
  malicious_df['css'] = malicious_df.raw_mail.str.contains('(<style|\.css)', case=False, regex=True)
  malicious_df['html_form'] = malicious_df.raw_mail.str.contains('(<form)', case=False, regex=True)


In [24]:
def getURLs(text):
    count = len(re.findall(r'(https?://\S+)', text))
    return count

malicious_df['URLs_in_message'] = malicious_df.body.apply(getURLs)

In [26]:
# Get email that contains html
enron_df['html'] = enron_df.content_type.str.contains('text/html', case=False, regex=True)
# Get email that contains javascript
enron_df['javascript'] = enron_df.raw_mail.str.contains('(<script|.js)', case=False, regex=True)
# Get email that contains css
enron_df['css'] = enron_df.raw_mail.str.contains('(<style|\.css)', case=False, regex=True)
# Get email that contains html form
enron_df['html_form'] = enron_df.raw_mail.str.contains('(<form)', case=False, regex=True)
enron_df['html_iframe'] = enron_df.raw_mail.str.contains('<iframe', case=False, regex=True)

  enron_df['javascript'] = enron_df.raw_mail.str.contains('(<script|.js)', case=False, regex=True)
  enron_df['css'] = enron_df.raw_mail.str.contains('(<style|\.css)', case=False, regex=True)
  enron_df['html_form'] = enron_df.raw_mail.str.contains('(<form)', case=False, regex=True)


In [None]:
enron_df['URLs_in_message'] = enron_df.body.apply(getURLs)


save results

In [27]:
enron_df.to_csv('datasets/explored/enron-emails-explored.csv', index=False)
malicious_df.to_csv('datasets/explored/malicious-emails-explored.csv', index=False)