In [64]:
%cd
%rm -rf ml-project
!git clone https://github.com/filipkosecek/ml-project.git
!pip install pyahocorasick
%cd ml-project
%ls

/root
Cloning into 'ml-project'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 12 (delta 2), reused 12 (delta 2), pack-reused 0 (from 0)[K
Receiving objects: 100% (12/12), 18.00 MiB | 17.26 MiB/s, done.
Resolving deltas: 100% (2/2), done.
/root/ml-project
negative_keywords.txt  Phishing_Email.csv  stop_words.csv


In [65]:
# imports
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from nltk.stem.porter import PorterStemmer
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split

In [66]:
# load the required data
df_data = pd.read_csv("Phishing_Email.csv")
df_data.rename(columns={'Email Text': 'email_body', 'Email Type': 'email_type'}, inplace=True)
df_data.drop(df_data.columns[0], axis=1, inplace=True)

stop_words = np.loadtxt('stop_words.csv', delimiter=',', dtype='str')
stop_words.append(['re', 'disc, '])
# TODO: remove link, email addresses, abbreviations, etc
with open('negative_keywords.txt', 'r') as f:
    trigger_words = [line.strip().lower() for line in f]

In [73]:
classes = {'Phishing Email': 1, 'Safe Email': 0}
classes_inv = {1: 'Phishing Email', 0: 'Safe Email'}

def get_label_vec(y, classes):
    result = []
    for yi in y:
        result.append(classes[yi])
    return result

df_data['email_type'] = get_label_vec(df_data['email_type'].to_list(), classes)

In [69]:
# count web URLs
def count_urls(email_bodies):
    result = []
    for body in email_bodies:
        result.append(str(body).count('http'))
    return result

df_data['url_count'] = count_urls(df_data['email_body'].to_list())

In [68]:
# check whether the email body contains embedded HTML
def contains_html(email_bodies):
    result = []
    for body in email_bodies:
        # it is usually <html language...> so
        # this should not create many false positives
        if "<html" in str(body):
            result.append(1)
        else:
            result.append(0)
    return result

df_data['contains_html'] = contains_html(df_data['email_body'].to_list())

In [70]:
# compute how many exclamation marks the body contains
def count_exclamation_marks(email_bodies):
    result = []
    for body in email_bodies:
        result.append(str(body).count('!'))
    return result

df_data['exclamation_mark_count'] = count_exclamation_marks(df_data['email_body'])

In [72]:
# include the total email body length
df_data['email_body_length'] = df_data['email_body'].apply(lambda x : len(str(x)))

In [67]:
# preprocess the data
import ahocorasick

def count_trigger_words(email_bodies):
    automaton = ahocorasick.Automaton()
    ix = 0
    for word in trigger_words:
        automaton.add_word(word, ix)
        ix += 1
    automaton.make_automaton()

    trigger_word_counts = []
    for body in email_bodies:
        count = 0
        used = [False] * len(trigger_words)
        for end_index, ix in automaton.iter(str(body).lower()):
            if used[ix]:
                continue
            count += 1
            used[ix] = True
        trigger_word_counts.append(count)
    return trigger_word_counts

df_data['trigger_word_count'] = count_trigger_words(df_data['email_body'].to_list())

In [71]:
# count words which are in all caps
def count_all_caps(email_bodies):
    result = []
    for body in email_bodies:
        tmp = 0
        for word in str(body):
            if word.isupper():
                tmp += 1
        result.append(tmp)
    return result

df_data['all_caps_word_count'] = count_all_caps(df_data['email_body'])

In [74]:
def remove_stop_words(data, stop_words):
    result = []
    #data = str(data)
    stop_words = set(stop_words)
    for d in data:
        d = str(d)
        split_data = d.lower().split(' ')
        result.append(" ".join([word for word in split_data if word not in stop_words]))
    return result

df_data['email_body'] = remove_stop_words(df_data['email_body'].to_list(), stop_words)

In [75]:
df_data.head(10)

Unnamed: 0,email_body,email_type,trigger_word_count,contains_html,url_count,exclamation_mark_count,all_caps_word_count,email_body_length
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",0,0,0,0,2,0,1030
1,side * galicismos * * galicismo * spanish term...,0,0,0,0,0,0,479
2,re : equistar deal tickets still available ass...,0,0,0,0,0,0,1245
3,\nhello hot lil horny toy.\n one dream abou...,1,0,0,1,1,39,688
4,software incredibly low prices ( 86 % lower ) ...,1,0,0,0,0,0,441
5,global risk management operations sally congra...,0,1,0,0,0,0,3295
6,"sun, aug 11, 2002 11:17:47am +0100, wintermute...",0,0,0,1,0,33,908
7,"entourage , stockmogul newsletter ralph velez ...",1,0,0,0,5,0,7653
8,"owe lots money dear applicant , review upon re...",1,0,0,1,0,0,613
9,re : coastal deal - exxon participation projec...,0,0,0,0,1,0,1822
