In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score, roc_auc_score

In [16]:
combined_data = pd.read_csv("phishing_data/combined_data.csv",  lineterminator='\n')
combined_data.head()

Unnamed: 0,sender,receiver,date,subject,body,urls,label,date_day_of_month,date_month,date_day_of_week,...,body_question_count,body_special_char_count,body_sentiment_polarity,body_sentiment_subjectivity,body_url_count,body_shortened_url_count,body_spam_count,send_word_count,send_char_count,send_diffSenderReceiver
0,"""Hu, Sylvia"" <Sylvia.Hu@ENRON.com>","""Acevedo, Felecia"" <Felecia.Acevedo@ENRON.com>...",2001-06-29 13:36:09+00:00,"FW: June 29 -- BNA, Inc. Daily Labor Report",User ID: enrondlr\nPW: bnaweb22\n\n\n ...,1,0,1.725612,-0.235795,0.845494,...,-0.148146,0.016733,-0.430243,-0.10863,5.465202,-0.033873,1.198552,0.12592,-0.46738,0
1,"""Webb, Jay"" <Jay.Webb@ENRON.com>","""Lambie, Chris"" <Chris.Lambie@ENRON.com>",2001-06-29 14:37:04+00:00,NGX failover plan.,"\nHi Chris, \n\nTonight we are rolling out a ...",0,0,1.725612,-0.235795,0.845494,...,-0.148146,-0.033274,-0.373288,-0.267096,-0.231379,-0.033873,-0.095955,0.12592,-0.657927,0
2,"""Symms, Mark"" <Mark.Symms@ENRON.com>","""Thomas, Paul D."" <Paul.D.Thomas@ENRON.com>",2001-06-29 13:39:30+00:00,RE: Intranet Site,Rika r these new?\n\n -----Original Message---...,1,0,1.725612,-0.235795,0.845494,...,-0.047293,-0.033274,-0.425876,-0.764598,0.185444,-0.033873,-0.239789,0.12592,-0.276833,0
3,"""Thorne, Judy"" <Judy.Thorne@ENRON.com>","""Grass, John"" <John.Grass@ENRON.com>, ""Nemec, ...",2001-06-29 15:35:17+00:00,FW: ENA Upstream Company information,"John/Gerald,\n\nWe are currently trading under...",0,0,1.725612,-0.235795,0.845494,...,-0.047293,-0.014521,-0.380623,-0.27787,-0.231379,-0.033873,0.191713,0.12592,-0.086286,0
4,"""Williams, Jason R (Credit)"" <Jason.R.Williams...","""Nemec, Gerald"" <Gerald.Nemec@ENRON.com>, ""Dic...",2001-06-29 15:40:02+00:00,New Master Physical,Gerald and Stacy -\n\nAttached is a worksheet ...,0,0,1.725612,-0.235795,0.845494,...,-0.148146,-0.033274,-0.029388,-0.615755,-0.231379,-0.033873,-0.239789,2.48685,1.723908,0


In [17]:

# get numeric attributes since decision tree feature must be continuous
def get_numeric_attributes(df):
    attributes = list(df.select_dtypes(include = ["float", "int"]).columns)
    attributes.remove("label")
    return attributes

attributes = get_numeric_attributes(combined_data)
print(f"Number of columns: {len(attributes)}")
print(attributes)


Number of columns: 28
['urls', 'date_day_of_month', 'date_month', 'date_day_of_week', 'date_hour_of_day', 'date_is_weekend', 'date_is_business_hours', 'subj_reply', 'subj_forward', 'subj_word_count', 'subj_char_count', 'body_char_count', 'body_word_count', 'body_distinct_word_count', 'body_average_word_length', 'body_uppercase_word_count', 'body_numeric_char_count', 'body_exclamation_count', 'body_question_count', 'body_special_char_count', 'body_sentiment_polarity', 'body_sentiment_subjectivity', 'body_url_count', 'body_shortened_url_count', 'body_spam_count', 'send_word_count', 'send_char_count', 'send_diffSenderReceiver']


In [18]:
y = combined_data["label"].to_numpy().astype(float)
X = combined_data[attributes].to_numpy().astype(float)
# use 20% of data for testing, 80% for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [19]:
# baseline behaviour
# set initial seed (random_state) to 0 for deterministic behaviour
baseline_decision_tree = DecisionTreeClassifier(random_state = 0)
baseline_decision_tree.fit(X_train, y_train)
y_pred = baseline_decision_tree.predict(X_test)

In [20]:
def get_evaluation_metrics(y_test, y_pred):
    tp = tn = fp = fn = 0
    for i in range(len(y_pred)):
        if y_pred[i] == y_test[i]:
            if y_pred[i] == 1.0:
                tp += 1
            else:
                tn += 1
        else:
            if y_pred[i] == 1.0:
                fp += 1 
            else:
                fn += 1
    sk_tn, sk_fp, sk_fn, sk_tp = confusion_matrix(y_test, y_pred).ravel()
    # cross reference with sklearn implementation
    assert(tp == sk_tp and tn == sk_tn and fp == sk_fp and fn == sk_fn)
    # compute F1 score
    f1  = (2 * tp) / ((2 * tp) + fp + fn)
    sk_f1 = f1_score(y_test, y_pred)
    # compute precision
    precision = tp / (tp + fp)
    sk_precision = precision_score(y_test, y_pred)
    # compute precision
    recall = tp / (tp + fn)
    sk_recall = recall_score(y_test, y_pred)
    # compute accuracy
    accuracy = (tp + tn) / len(y_pred)
    sk_accuracy = accuracy_score(y_test, y_pred)
    # cross reference with sklearn implementation to check if metrics tally
    assert(f1 == sk_f1 and precision == sk_precision and recall == sk_recall and accuracy == sk_accuracy)
    roc_auc = roc_auc_score(y_test, y_pred)
    return sk_f1, sk_precision, sk_recall, sk_accuracy,roc_auc

def print_metrics(metrics):
    print(f"F1 = {metrics[0]}")
    print(f"precision = {metrics[1]}")
    print(f"recall = {metrics[2]}")
    print(f"accuracy = {metrics[3]}")
    print(f"ROC_AUC = {metrics[4]}")

baseline_metrics = get_evaluation_metrics(y_test, y_pred)
print_metrics(baseline_metrics)

F1 = 0.9192751235584844
precision = 0.913115968877362
recall = 0.9255179320272893
accuracy = 0.9244904486378042
ROC_AUC = 0.9245584881682928


In [11]:
# perform grid search to fine tune hyperparameters
# setup parameter space
import time

parameters = {'criterion': ['gini','entropy'],
              'max_depth': [i for i in range(5, 10)],
              'max_features': [None, 'sqrt', 'log2'],
              'max_leaf_nodes': [i for i in range(5, 20, 2)]}

# create an instance of the grid search object
# default scorign is accuracy
grid_searcher = GridSearchCV(DecisionTreeClassifier(), parameters, cv = 5, n_jobs = -1)

# conduct grid search over the parameter space
start_time = time.time()
grid_searcher.fit(X_train,y_train)
duration = time.time() - start_time
print(f"Duration taken for grid search: {duration}")
optimal_params = grid_searcher.best_params_
print(optimal_params)

Duration taken for grid search: 215.53607654571533
{'criterion': 'gini', 'max_depth': 8, 'max_features': None, 'max_leaf_nodes': 19}


In [21]:
# obtain evaluation metrics for fined tuned model
optimal_model = grid_searcher.best_estimator_
optimal_pred = optimal_model.predict(X_test)
optimal_metrics = get_evaluation_metrics(y_test, optimal_pred)
print_metrics(optimal_metrics)

F1 = 0.8184824659927618
precision = 0.8159875583203733
recall = 0.8209926769731489
accuracy = 0.8308376704561975
ROC_AUC = 0.8301857390078604
