# **Importing the Data**

In [1]:
import numpy as np
import pandas as pd

pd.set_option('display.max_colwidth', None)

FILE_NAME = 'Phishing Dataset/phishing_out_2022.csv'
df = pd.read_csv(FILE_NAME)

In [2]:
desiredcols = ['received1',
 'received2',
 'received3',
 'received4',
 'received5',
 'received6',
 'received7',
 'hops',
 'subject',
 'date',
 'message-id',
 'from',
 'return-path',
 'to',
 'content-type',
 'mime-version',
 'x-mailer',
 'content-transfer-encoding',
 'x-mimeole',
 'x-priority',
 'list-id',
 'x-virus-scanned',
 'status',
 'content-length',
 'delivered-to',
 'list-unsubscribe',
 'x-msmail-priority',
 'x-spam-status',
 'sender',
 'reply-to',
 'x-virus-status',
 'x-spam-level',
 'x-spam-checker-version',
 'references',
 'in-reply-to',
 'user-agent',
 'thread-index',
 'received-spf',
 'content-disposition',
 'domainkey-signature',
 'importance',
 'label',
 'precedence',
 'received8',
 'cc']

In [3]:
# Make sure every expected header column exists: any column from `desiredcols`
# that this CSV does not provide is created and filled entirely with NaN.
for col in desiredcols:
    if col in df.columns:
        continue  # already supplied by the CSV, leave it untouched
    df[col] = np.nan

In [4]:
df.shape

(245, 45)

# **Basic Information on the Dataset**

In [5]:
df['label'].value_counts()

label
2    245
Name: count, dtype: int64

In [6]:
df.columns.values

array(['received1', 'received2', 'received3', 'received4', 'received5',
       'received6', 'received7', 'received8', 'hops', 'subject', 'date',
       'message-id', 'from', 'return-path', 'to', 'content-type',
       'mime-version', 'x-mailer', 'content-transfer-encoding',
       'x-mimeole', 'x-priority', 'x-virus-scanned', 'status',
       'delivered-to', 'list-unsubscribe', 'x-msmail-priority',
       'x-spam-status', 'sender', 'reply-to', 'x-spam-level',
       'x-spam-checker-version', 'thread-index', 'cc', 'received-spf',
       'domainkey-signature', 'importance', 'label', 'list-id',
       'content-length', 'x-virus-status', 'references', 'in-reply-to',
       'user-agent', 'content-disposition', 'precedence'], dtype=object)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245 entries, 0 to 244
Data columns (total 45 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   received1                  245 non-null    object 
 1   received2                  138 non-null    object 
 2   received3                  65 non-null     object 
 3   received4                  19 non-null     object 
 4   received5                  9 non-null      object 
 5   received6                  7 non-null      object 
 6   received7                  1 non-null      object 
 7   received8                  1 non-null      object 
 8   hops                       245 non-null    int64  
 9   subject                    244 non-null    object 
 10  date                       245 non-null    object 
 11  message-id                 235 non-null    object 
 12  from                       245 non-null    object 
 13  return-path                245 non-null    object 

In [8]:
df.nunique()

received1                    245
received2                    138
received3                     65
received4                     19
received5                      9
received6                      7
received7                      1
received8                      1
hops                           7
subject                      177
date                         245
message-id                   235
from                         223
return-path                  204
to                            12
content-type                 144
mime-version                   2
x-mailer                       9
content-transfer-encoding      4
x-mimeole                      4
x-priority                     3
x-virus-scanned                5
status                         2
delivered-to                   1
list-unsubscribe               8
x-msmail-priority              1
x-spam-status                197
sender                         4
reply-to                      16
x-spam-level                   3
x-spam-che

In [9]:
df.describe()

Unnamed: 0,hops,x-priority,label,list-id,content-length,x-virus-status,references,in-reply-to,user-agent,content-disposition,precedence
count,245.0,8.0,245.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,1.979592,2.375,2.0,,,,,,,,
std,1.185462,0.744024,0.0,,,,,,,,
min,1.0,1.0,2.0,,,,,,,,
25%,1.0,2.0,2.0,,,,,,,,
50%,2.0,2.5,2.0,,,,,,,,
75%,3.0,3.0,2.0,,,,,,,,
max,8.0,3.0,2.0,,,,,,,,


In [10]:
df.shape

(245, 45)

Most columns have missing values:

In [11]:
df.columns[df.isna().any()].tolist()

['received2',
 'received3',
 'received4',
 'received5',
 'received6',
 'received7',
 'received8',
 'subject',
 'message-id',
 'to',
 'mime-version',
 'x-mailer',
 'content-transfer-encoding',
 'x-mimeole',
 'x-priority',
 'x-virus-scanned',
 'list-unsubscribe',
 'x-msmail-priority',
 'sender',
 'reply-to',
 'x-spam-level',
 'x-spam-checker-version',
 'thread-index',
 'cc',
 'received-spf',
 'domainkey-signature',
 'importance',
 'list-id',
 'content-length',
 'x-virus-status',
 'references',
 'in-reply-to',
 'user-agent',
 'content-disposition',
 'precedence']

These do not have missing values:

In [12]:
df.columns[~df.isna().any()].tolist()

['received1',
 'hops',
 'date',
 'from',
 'return-path',
 'content-type',
 'status',
 'delivered-to',
 'x-spam-status',
 'label']

In [13]:
df.columns

Index(['received1', 'received2', 'received3', 'received4', 'received5',
       'received6', 'received7', 'received8', 'hops', 'subject', 'date',
       'message-id', 'from', 'return-path', 'to', 'content-type',
       'mime-version', 'x-mailer', 'content-transfer-encoding', 'x-mimeole',
       'x-priority', 'x-virus-scanned', 'status', 'delivered-to',
       'list-unsubscribe', 'x-msmail-priority', 'x-spam-status', 'sender',
       'reply-to', 'x-spam-level', 'x-spam-checker-version', 'thread-index',
       'cc', 'received-spf', 'domainkey-signature', 'importance', 'label',
       'list-id', 'content-length', 'x-virus-status', 'references',
       'in-reply-to', 'user-agent', 'content-disposition', 'precedence'],
      dtype='object')

# **---------- Data Preprocessing -----------------------------**

# **Initial Setup**

In [14]:
initial_features_list = ['received1', 'received2', 'received3', 'received4', 'received5',
       'received6', 'received7', 'received8', 'hops', 'subject', 'date',
       'message-id', 'from', 'return-path', 'to', 'content-type',
       'mime-version', 'x-mailer', 'content-transfer-encoding', 'x-mimeole',
       'x-priority', 'x-virus-scanned', 'status', 'delivered-to',
       'list-unsubscribe', 'x-msmail-priority', 'x-spam-status', 'sender',
       'reply-to', 'x-spam-level', 'x-spam-checker-version', 'thread-index',
       'cc', 'received-spf', 'domainkey-signature', 'importance', 'label']

label_name = 'label'

final_features_list = ['hops']

# **Feature: "Missing Fields"**

Feature extraction: creates one new feature per header field indicating whether that field is missing (NaN) from an email. A value of 0 is assigned if the field is present, and 1 if it is missing.

---



In [15]:
df.shape

(245, 45)

In [16]:
# One indicator column per raw header field, named 'missing_<field>'.
missing_feature_names = ['missing_' + name for name in initial_features_list]

# 1.0 where the header is NaN (absent from the e-mail), 0.0 where it is present.
for feature, name in zip(initial_features_list, missing_feature_names):
  df[name] = df[feature].isnull().astype(float)

# Register the indicators as model features.
final_features_list.extend(missing_feature_names)

In [17]:
df.shape

(245, 82)

In [18]:
df['missing_return-path'].value_counts()

missing_return-path
0.0    245
Name: count, dtype: int64

In [19]:
df['missing_x-mailer'].value_counts()

missing_x-mailer
1.0    235
0.0     10
Name: count, dtype: int64

# **Dealing with NaN:**

In [20]:
# Replace every remaining NaN with an empty string so that the string-based
# checks further down (.str.contains / re.search on header columns) can run on
# every row. This has to happen AFTER the 'missing_*' indicators are built,
# because those rely on NaN to detect absent headers.
df = df.replace(np.nan, '', regex=True)

# **Feature: "String Content Matching"**

Checks are done for the following items, where 1 means the field contains the item and 0 means it does NOT contain it:

'From': contains '?', '!', or '<>'


---

'To': contains '<>'

---

'Message-ID': contains '$'

---

'Return-Path': contains 'bounce'

---

'Reply-To': contains: '?'

---

'Received-SPF': contains 'bad', 'softfail', 'fail'

---

'Content-Type': contains 'text/html'

---

Precedence: contains 'list'

---

Received: Contains 'forged'


**Content-Transfer-Encoding:**

Returns 0 if contains 7bit or 8bit, otherwise 1. (case insensitive)

In [21]:
df[~(df['content-transfer-encoding'].str.contains('(?i)8bit')) & ~(df['content-transfer-encoding'].str.contains('(?i)7bit'))]['label'].value_counts()

label
2    228
Name: count, dtype: int64

In [22]:
import re

def content_encoding_val(row):
  """Flag e-mails whose Content-Transfer-Encoding is neither 7bit nor 8bit.

  Args:
    row: Mapping-like row (e.g. a pandas Series from ``df.apply(axis=1)``)
      exposing the 'content-transfer-encoding' header value.

  Returns:
    0 if the header contains '7bit' or '8bit' (case-insensitive), otherwise 1.
    A non-string header value (NaN / None) counts as "neither" and yields 1.
  """
  val = row['content-transfer-encoding']

  # A non-string value cannot name an encoding; treat it like an empty header
  # instead of letting re.search raise TypeError.
  if not isinstance(val, str):
    return 1

  # A single case-insensitive pattern covers both accepted encodings.
  return 0 if re.search(r'[78]bit', val, re.IGNORECASE) else 1

In [23]:
df['content-encoding-val'] = df.apply(content_encoding_val, axis=1)

In [24]:
final_features_list.append('content-encoding-val')

**Received check:**

In [25]:
def check_received_forged(row):
  """Report whether any Received header of the e-mail mentions 'forged'.

  Inspects row['received1'] .. row['received<N>'], where N is row['hops'],
  and stops at the first header containing the (case-sensitive) substring
  'forged'.

  Args:
    row: Mapping-like row holding 'hops' and the 'received<i>' string columns.
      NOTE(review): assumes 'hops' is still the raw hop count here, not a
      discretised bin -- confirm the cell order.

  Returns:
    1 if at least one inspected header contains 'forged', otherwise 0.
  """
  received_keys = ('received' + str(hop) for hop in range(1, row['hops'] + 1))
  return 1 if any('forged' in row[key] for key in received_keys) else 0

In [26]:
df['received_str_forged'] = df.apply(check_received_forged, axis=1)
final_features_list.append('received_str_forged')

In [27]:
df[df['received_str_forged'] == 1]['label'].value_counts()

Series([], Name: count, dtype: int64)

**Other string checks:**

In [28]:
def str_based_features_add(old_col_name, new_col_names, items_to_check):
  """Add 0/1 string-match features derived from one header column of `df`.

  For each (new column, pattern) pair a float column is written to the global
  `df`: 1.0 where the header matches, 0.0 where it does not.

  * An empty pattern ('') means "the header value is empty": the whole value
    must match the empty string. A NaN header counts as empty.
  * Any other pattern is a regular expression searched case-insensitively
    anywhere in the header value. A NaN header counts as "no match".

  The new column names are registered in the global `final_features_list`;
  names that are already registered are not added again, so re-running a cell
  does not create duplicate features.

  Args:
    old_col_name: Name of the existing string column in `df` to inspect.
    new_col_names: Names of the feature columns to create, one per pattern.
    items_to_check: Regex patterns (or '' for the emptiness check), aligned
      with `new_col_names`.
  """
  for col_name, item_to_check in zip(new_col_names, items_to_check):
    # Evaluate each pattern once and reuse the boolean mask.
    if item_to_check == '':
      matches = df[old_col_name].str.fullmatch(item_to_check, na=True)
    else:
      matches = df[old_col_name].str.contains('(?i)' + item_to_check, na=False)

    # Stored as float (1.0 / 0.0) to keep the dtype these features have
    # always had.
    df[col_name] = matches.astype(float)

  for col_name in new_col_names:
    if col_name not in final_features_list:
      final_features_list.append(col_name)

In [29]:
# Table of string checks: (source header, [(new feature column, regex), ...]).
# An empty regex means "the header value is empty"; every other regex is
# searched case-insensitively by str_based_features_add.
# Patterns with backslashes are raw strings so that \? and \$ reach the regex
# engine unchanged without Python warning about invalid escape sequences.
string_checks = [
  # Content-Transfer-Encoding
  ('content-transfer-encoding', [('str_content-encoding_empty', '')]),
  # From
  ('from', [('str_from_question', r'\?'),
            ('str_from_exclam', '!'),
            ('str_from_chevron', '<.+>')]),
  # To
  ('to', [('str_to_chevron', '<.+>'),
          ('str_to_undisclosed', 'Undisclosed Recipients'),
          ('str_to_empty', '')]),
  # Message-ID
  ('message-id', [('str_message-ID_dollar', r'\$')]),
  # Return-Path
  ('return-path', [('str_return-path_bounce', 'bounce'),
                   ('str_return-path_empty', '')]),
  # Reply-To
  ('reply-to', [('str_reply-to_question', r'\?')]),
  # Received-SPF
  # NOTE: 'fail' is a substring of 'softfail', so 'str_received-SPF_fail' is
  # also set for soft failures.
  ('received-spf', [('str_received-SPF_bad', 'bad'),
                    ('str_received-SPF_softfail', 'softfail'),
                    ('str_received-SPF_fail', 'fail')]),
  # Content-Type
  ('content-type', [('str_content-type_texthtml', 'text/html')]),
]

for feature, checks in string_checks:
  new_col_names = [col_name for col_name, _ in checks]
  items_to_check = [pattern for _, pattern in checks]
  str_based_features_add(feature, new_col_names, items_to_check)

# **Feature: "Count-Based features"**

**The following features are extracted (and also discretized):**

Hops: The number of received fields in an email. (This was extracted during the creation of the data set.)

---

Length: The total number of characters in the 'From' field.

---

Number of recipients: The number of email addresses specified in the 'To' field.

Number of recipients: The number of email addresses specified in the 'Cc' field.

Number of Senders: The number of email addresses specified in the 'From' field.

---

Number of replies: The number of message-ID's contained within the 'References' field. (Each message-ID is enclosed in a pair of '<>'). [PROPOSED]





**Hops:**

In [30]:
df['hops'].describe()

count    245.000000
mean       1.979592
std        1.185462
min        1.000000
25%        1.000000
50%        2.000000
75%        3.000000
max        8.000000
Name: hops, dtype: float64

In [31]:
df[df['hops'] > 5]['label'].value_counts()

label
2    7
Name: count, dtype: int64

In [32]:
df['hops'] = df['hops'].apply(lambda x: 0 if x <= 2 else 1 if x <= 5 else 2)

In [33]:
df['hops'].value_counts()

hops
0    180
1     58
2      7
Name: count, dtype: int64

**Length:**

In [34]:
def count_chars(field_names, new_col_names):
  """Add character-count features for string columns of the global `df`.

  For each (field, new column) pair, the new column receives the number of
  characters in the field's string value (via ``Series.str.len``).

  The new column names are registered in the global `final_features_list`;
  names already present are not added again, so re-running a cell does not
  create duplicate features.

  Args:
    field_names: Names of existing string columns in `df`.
    new_col_names: Names of the length columns to create, aligned with
      `field_names`.
  """
  for field_name, new_col_name in zip(field_names, new_col_names):
    df[new_col_name] = df[field_name].str.len()

  for new_col_name in new_col_names:
    if new_col_name not in final_features_list:
      final_features_list.append(new_col_name)

In [35]:
fields_to_find_lengths = ['from']
new_col_names_lengths = []

for val in fields_to_find_lengths:
  new_col_names_lengths.append('length_' + val)

count_chars(fields_to_find_lengths, new_col_names_lengths)

In [36]:
df['length_from'].describe()

count    245.000000
mean      47.808163
std       21.123173
min       10.000000
25%       35.000000
50%       44.000000
75%       55.000000
max      195.000000
Name: length_from, dtype: float64

In [37]:
df[df['length_from'] > 40]['label'].value_counts()

label
2    148
Name: count, dtype: int64

In [38]:
df['length_from'] = df['length_from'].apply(lambda x: 0 if x > 40 else 1)

**Number of recipients:**

In [39]:
import re

#https://stackoverflow.com/questions/42407785/regex-extract-email-from-strings
email_address_pattern = r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)'

# Count the e-mail addresses found in the 'To', 'Cc' and 'From' headers.
# Each count lands in 'num_recipients_<header>' and is registered as a feature
# (note that 'num_recipients_from' therefore holds the number of senders).
for header in ('to', 'cc', 'from'):
  count_col = 'num_recipients_' + header
  df[count_col] = df[header].apply(
      lambda text: len(re.findall(email_address_pattern, text)))

for header in ('to', 'cc', 'from'):
  final_features_list.append('num_recipients_' + header)

In [40]:
df['num_recipients_to'].describe()

count    245.000000
mean       0.991837
std        0.202279
min        0.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        2.000000
Name: num_recipients_to, dtype: float64

In [41]:
df[df['num_recipients_to'] == 0]['label'].value_counts()

label
2    6
Name: count, dtype: int64

In [42]:
df['num_recipients_to'] = df['num_recipients_to'].apply(lambda x: 0 if x == 0 else 1 if x == 1 else 2)

In [43]:
df['num_recipients_to'].value_counts()

num_recipients_to
1    235
0      6
2      4
Name: count, dtype: int64

In [44]:
df['num_recipients_cc'] = df['num_recipients_cc'].apply(lambda x: 0 if x == 0 else 1 if x == 1 else 2)

In [45]:
df[df['num_recipients_from'] == 0]['label'].value_counts()

label
2    11
Name: count, dtype: int64

In [46]:
df['num_recipients_from'].describe()

count    245.000000
mean       0.967347
std        0.237290
min        0.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        2.000000
Name: num_recipients_from, dtype: float64

In [47]:
df['num_recipients_from'].value_counts()

num_recipients_from
1    231
0     11
2      3
Name: count, dtype: int64

In [48]:
df['num_recipients_from'] = df['num_recipients_from'].apply(lambda x: 0 if x == 0 else 1 if x == 1 else 2)

**Number of replies:**

In [49]:
import re

def extract_num_replies(row):
  """Count the message-IDs listed in an e-mail's References header.

  A message-ID is recognised as an address-like token enclosed in angle
  brackets, e.g. ``<abc123@mail.example.com>``.

  Args:
    row: Mapping-like row (e.g. a pandas Series) exposing the 'references'
      header value.

  Returns:
    The number of bracketed message-IDs found; 0 when the header is empty or
    is not a string (NaN / None).
  """
  references_val = row['references']

  # A missing header may arrive as NaN/None rather than ''; it has no IDs.
  if not isinstance(references_val, str):
    return 0

  # Named `message_ids` rather than `all` to avoid shadowing the builtin.
  message_ids = re.findall(
      r'<([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)>',
      references_val)
  return len(message_ids)

In [50]:
df['number_replies'] = df.apply(extract_num_replies, axis=1)

In [51]:
df['number_replies'].describe()

count    245.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: number_replies, dtype: float64

In [52]:
df['number_replies'].value_counts()

number_replies
0    245
Name: count, dtype: int64

In [53]:
df[df['number_replies'] >= 1]['label'].value_counts()

Series([], Name: count, dtype: int64)

In [54]:
df['number_replies'] = df['number_replies'].apply(lambda x: 0 if x >= 1 else 1)

In [55]:
final_features_list.append('number_replies')

# **Feature: "Extraction-based features"**

The following were extracted as features:

Time zone: The time zone listed in the date field.

---

X-Priority: The numeric value is used as a feature.

---

Lines: The numeric value is discretized into bins, and then used as a feature.

---

Content-Length: The numeric value is discretized into bins, and then used as a feature.

---

Lines: The numeric value is discretized into bins, and then used as a feature.

**Time Zone:**

In [56]:
from dateutil.parser import parse
from datetime import datetime, timedelta
import email.utils

In [57]:
def extract_time_zone(row):
  """Encode the UTC offset of the e-mail's Date header as an hour in 0..23.

  The offset in seconds reported by ``email.utils.parsedate_tz`` is converted
  to hours, truncated toward zero and wrapped modulo 24, so e.g. -0700 becomes
  17 and +0200 becomes 2.

  Args:
    row: Mapping-like row exposing the 'date' header string.

  Returns:
    The wrapped hour offset as an int, or the string 'NA' when the Date header
    cannot be parsed.
  """
  parsed_date = email.utils.parsedate_tz(row['date'])
  if parsed_date is not None:
    offset_seconds = parsed_date[9]  # tenth element: offset from UTC in seconds
    return int(offset_seconds / (60*60)) % 24
  return 'NA'

In [58]:
df['time_zone'] = df.apply(extract_time_zone, axis=1)

In [59]:
# Rows whose Date header could not be parsed carry the placeholder 'NA'.
# Impute them with the most frequent *successfully parsed* offset. The mode is
# taken over parsed rows only, so that 'NA' can never be chosen as its own
# replacement when unparsable dates happen to be the most common value.
unparsed_tz = df['time_zone'].astype(str) == 'NA'
if unparsed_tz.any() and not unparsed_tz.all():
  most_common_tz = df.loc[~unparsed_tz, 'time_zone'].value_counts().index[0]
  df.loc[unparsed_tz, 'time_zone'] = most_common_tz

In [60]:
# Test
d1 = email.utils.parsedate_tz('Mon, 9 Apr 2007 14:31:03 02300')
d2 = email.utils.parsedate_tz('Mon, 8 Apr 2007 14:31:03 -0100')

print((email.utils.mktime_tz(d2)) - (email.utils.mktime_tz(d1)))

0


In [61]:
df['time_zone'].value_counts()

time_zone
0     127
17     37
16     21
2      21
1      16
8       5
20      4
9       3
19      3
3       3
7       3
5       1
21      1
Name: count, dtype: int64

In [62]:
# Collapse the offset into a binary flag: 1 for the encoded offset 20
# (what extract_time_zone yields for e.g. UTC-04:00), 0 for everything else.
# The mask is computed once, before any value is overwritten.
is_offset_20 = df['time_zone'] == 20
df.loc[~is_offset_20, 'time_zone'] = 0
df.loc[is_offset_20, 'time_zone'] = 1

In [63]:
final_features_list.append('time_zone')

**X-Priority:**

In [64]:
df['x-priority'] = df['x-priority'].astype(str).str.extract('(\\d+)')
df['x-priority'] = pd.to_numeric(df['x-priority'], errors='coerce')
df['x-priority'] = df['x-priority'].fillna(0)

In [65]:
df[df['x-priority'] > 0]['x-priority'].describe()

count    8.000000
mean     2.375000
std      0.744024
min      1.000000
25%      2.000000
50%      2.500000
75%      3.000000
max      3.000000
Name: x-priority, dtype: float64

In [66]:
df['x-priority'].value_counts()

x-priority
0.0    237
3.0      4
2.0      3
1.0      1
Name: count, dtype: int64

In [67]:
df['x-priority'] = df['x-priority'].apply(lambda x: 0 if x != 3 else 1)

In [68]:
final_features_list.append('x-priority')

**Content-Length:**

In [69]:
df['content-length'] = df['content-length'].astype(str).str.extract('(\\d+)')
df['content-length'] = pd.to_numeric(df['content-length'], errors='coerce')
df['content-length'] = df['content-length'].fillna(0)

In [70]:
df[df['content-length'] > 0]['content-length'].describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: content-length, dtype: float64

In [71]:
df['content-length'] = df['content-length'].apply(lambda x: 0 if x < 1 else 1 if x < 1274 else 2 if x < 2348 else 3 if x < 5798 else 4)

In [72]:
final_features_list.append('content-length')

**Lines:**

In [73]:
df['lines'] = np.nan

In [74]:
df['lines'] = df['lines'].astype(str).str.extract('(\\d+)')
df['lines'] = pd.to_numeric(df['lines'], errors='coerce')

In [75]:
df['lines'].describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: lines, dtype: float64

In [76]:
df['lines'] = df['lines'].fillna(0)

In [77]:
df['lines'] = df['lines'].apply(lambda x: 0 if x == 0 else 1 if x <= 30 else 2 if x <= 54 else 3 if x <= 119 else 4)

In [78]:
df['lines'].value_counts()

lines
0    245
Name: count, dtype: int64

In [79]:
final_features_list.append('lines')

In [80]:
df['last_received'] = ''
df['first_received'] = ''

**Day of the Week:**

In [81]:
df['last_received_date'] = df['last_received'].str.replace('\n\t', ';').str.split(r';').str[-1]
df['first_received_date'] = df['first_received'].str.replace('\n\t', ';').str.split(r';').str[-1]

In [82]:
def get_day_week(row):
  """Return the weekday name ('Monday' .. 'Sunday') of the e-mail's Date header.

  Args:
    row: Mapping-like row exposing the 'date' header string.

  Returns:
    The weekday name as produced by ``strftime("%A")``, or the string 'NA' when
    the header cannot be parsed or the parsed date cannot be converted to a
    timestamp (e.g. a year outside the supported range).
  """
  d1 = email.utils.parsedate_tz(row['date'])
  if d1 is None:
    return 'NA'

  try:
    timestamp = email.utils.mktime_tz(d1)
    # NOTE: fromtimestamp() renders the instant in the local time zone of the
    # machine running the notebook, so dates close to midnight can map to a
    # different weekday on differently configured machines.
    return datetime.fromtimestamp(timestamp).strftime("%A")
  except (ValueError, OverflowError, OSError):
    # Only conversion failures are treated as "unknown day"; interrupts and
    # programming errors are no longer silently swallowed.
    return 'NA'

In [83]:
df['day_of_week'] = df.apply(get_day_week, axis=1)

In [84]:
df['day_of_week'].value_counts()

day_of_week
Wednesday    58
Thursday     47
Monday       40
Tuesday      35
Friday       30
Saturday     20
Sunday       15
Name: count, dtype: int64

In [85]:
df['day_of_week'] = df['day_of_week'].apply(lambda x: ['NA', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'].index(x))

In [86]:
final_features_list.append('day_of_week')

# **Feature: "Comparison-based features"**


Fields that contain only a single email: return-path, from (check inside <>), sender, reply-to (check inside <>, otherwise same as null), errors-to.

Refers to features that are derived by comparing other features. The following features are extracted:

**Span time**: Time difference between the first and last received fields. (Discretized)

---

**Date and time conflict** (Consecutive received fields): Checks if consecutive received fields have a time difference of zero or greater. Output values = 0,1

---

**Date Validity** (Received): Checks the difference in time between the first Received fields date, and the date field, checks if zero or positive. Output values = 0,1

---



Domain matching between (if more than one domain per field, just check if any of them match): 


*   Message-ID + From
*   Message-ID + Return-Path
*   Message-ID + Sender
*   Message-ID + first Received
*   Return-Path + From
*   Return-Path + Reply-To
*   Reply-To + To
*   Error-To + Message-ID
*   Error-To + From
*   Error-To + Sender
*   Sender + From
*   Reference + Reply-To
*   Reference + InReply-To
*   To + last Received
*   InReply-To + To
*   Reference + To
*   Sender + first Received ('from' part of the first Received)
*   Return-Path + first Received ('from' part of the first Received)
*   Reply-To + last Received ('for' part of the last Received)
*   InReply-To + last Received ('for' part of the last Received)

---

Email address matching between (if more than one email per field, just check if any of them match):

*   To + Cc
*   Cc

---

Date and time checks: 

*   Date and last Received field (check if difference is greater than or equal to zero, or not. Also can check for a large gap in time, greater than some value.)
*   Span time: The time difference between the first and last received fields. This gets converted to 0 if the last received time is later than the first, 1 otherwise (also 1 if there is trouble reading the value).

---

Received Consecutive Checks:

*   Consecutive Received fields: checks if consecutive received fields have a matching domain for the 'by' part of the earlier received field and the 'from' part of the later one. Value is 1 if there is an issue, otherwise 0 if all match.


---

**Date Validity: Comparing the 'Date' field and last received's "date" value:**

---


In [87]:
import time

def date_received_date_comp(row):
  """Seconds between the e-mail's Date header and its last Received date.

  Args:
    row: Mapping-like row exposing the 'date' and 'last_received_date'
      date strings.

  Returns:
    ``last_received_date - date`` in seconds (positive when the Received
    timestamp is later than the Date header). Returns -1 when either value
    cannot be parsed or converted to a timestamp; note that -1 is therefore
    indistinguishable from a genuine one-second negative difference.
  """
  d1 = email.utils.parsedate_tz(row['date'])
  d2 = email.utils.parsedate_tz(row['last_received_date'])

  if d1 is None or d2 is None:
    return -1

  try:
    date_ts = email.utils.mktime_tz(d1)
    received_ts = email.utils.mktime_tz(d2)
  except (ValueError, OverflowError):
    # Parsed, but not representable as a timestamp (e.g. year out of range).
    return -1

  # Reuse the timestamps computed above instead of converting a second time.
  return received_ts - date_ts

In [88]:
df['date_comp_date_received'] = df.apply(date_received_date_comp, axis=1)

In [89]:
df[df['date_comp_date_received'] > 0]['date_comp_date_received'].describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: date_comp_date_received, dtype: float64

In [90]:
df['date_comp_date_received'] = df['date_comp_date_received'].apply(lambda x: 0 if x < 0 else 1)

In [91]:
df['date_comp_date_received'].value_counts()

date_comp_date_received
0    245
Name: count, dtype: int64

In [92]:
final_features_list.append('date_comp_date_received')

---

**Span time:**

---

In [93]:
test1 = email.utils.parsedate_tz('Mon, 12 Jul 2021 19:28:19 +0000')
test2 = email.utils.parsedate_tz('Mon, 12 Jul 2021 19:39:12 +0000')

print(((email.utils.mktime_tz(test2)) - (email.utils.mktime_tz(test1))) / (60*60))
print(((email.utils.mktime_tz(test2)) - (email.utils.mktime_tz(test1))) )

0.18138888888888888
653


In [94]:
def span_time_finder(row):
  """Seconds elapsed between the first and the last Received timestamps.

  Args:
    row: Mapping-like row exposing the 'first_received_date' and
      'last_received_date' date strings.

  Returns:
    ``last_received_date - first_received_date`` in seconds. Returns -1 when
    either value cannot be parsed or converted to a timestamp; note that -1 is
    therefore indistinguishable from a genuine one-second negative span.
  """
  d1 = email.utils.parsedate_tz(row['first_received_date'])
  d2 = email.utils.parsedate_tz(row['last_received_date'])

  if d1 is None or d2 is None:
    return -1

  try:
    first_ts = email.utils.mktime_tz(d1)
    last_ts = email.utils.mktime_tz(d2)
  except (ValueError, OverflowError):
    # Parsed, but not representable as a timestamp (e.g. year out of range).
    return -1

  # Reuse the timestamps computed above instead of converting a second time.
  return last_ts - first_ts

In [95]:
df['span_time'] = df.apply(span_time_finder, axis=1)

In [96]:
df[df['span_time'] > 0]['span_time'].describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: span_time, dtype: float64

In [97]:
df['span_time'] = df['span_time'].apply(lambda x: 0 if x < 0 else 1 if x < 10 else 2 if x < 47 else 3 if x < 1100 else 4)

In [98]:
df['span_time'].value_counts()

span_time
0    245
Name: count, dtype: int64

In [99]:
final_features_list.append('span_time')

---

**Consecutive Received Field 'by' and 'from' matching:**

---

In [100]:
# Source: https://github.com/Te-k/pyreceived/blob/master/pyreceived/parser.py

import re
class ReceivedParser(object):
    """Parse a single ``Received:`` header value with a list of MTA-specific regexes.

    ``regexes`` is an ordered list of tuples whose first element is the pattern
    and whose second element is the server label reported for a match. Patterns
    are tried in order and the first match wins.

    All patterns are raw strings: they rely on regex escapes such as ``\s`` and
    ``\w`` which are invalid *string* escapes and trigger a SyntaxWarning on
    recent Python versions when written in ordinary string literals.
    """
    regexes = [
        (r"from\s+(mail\s+pickup\s+service|(?P<from_name>[\[\]\w\.\-]*))\s*(\(\s*\[?(?P<from_ip>[a-f\d\.\:]+)(\%\d+|)\]?\s*\)|)\s*by\s*(?P<by_hostname>[\w\.\-]+)\s*(\(\s*\[?(?P<by_ip>[\d\.\:a-f]+)(\%\d+|)\]?\)|)\s*(over\s+TLS\s+secured\s+channel|)\s*with\s*(mapi|Microsoft\s+SMTP\s+Server|Microsoft\s+SMTPSVC(\((?P<server_version>[\d\.]+)\)|))\s*(\((TLS|version=(?P<tls>[\w\.]+)|)\,?\s*(cipher=(?P<cipher>[\w\_]+)|)\)|)\s*(id\s+(?P<id>[\d\.]+)|)", "MS SMTP Server"), #exchange
        (r"(from\s+(?P<from_name>[\[\S\]]+)\s+\(((?P<from_hostname>[\S]*)|)\s*\[(IPv6\:(?P<from_ipv6>[a-f\d\:]+)\:|)((?P<from_ip>[\d\.\:]+)|)\]\s*(\(may\s+be\s+forged\)|)\)\s*(\(using\s+(?P<tls>[\w\.]+)\s+with\s+cipher\s+(?P<cipher>[\w\-]+)\s+\([\w\/\s]+\)\)\s+(\(No\s+client\s+certificate\s+requested\)|)|)|)\s*(\(Authenticated\s+sender\:\s+(?P<authenticated_sender>[\w\.\-\@]+)\)|)\s*by\s+(?P<by_hostname>[\S]+)\s*(\((?P<by_hostname2>[\S]*)\s*\[((?P<by_ipv6>[a-f\:\d]+)|)(?P<by_ip>[\d\.]+)\]\)|)\s*(\([^\)]*\)|)\s*(\(Postfix\)|)\s*(with\s+(?P<protocol>\w*)|)\s*id\s+(?P<id>[\w\-]+)\s*(for\s+\<(?P<envelope_for>[\w\.\@]+)\>|)", "postfix"), #postfix
        (r"(from\s+(?P<from_name>[\[\S\]]+)\s+\(((?P<from_hostname>[\S]*)|)\s*\[(IPv6\:(?P<from_ipv6>[a-f\d\:]+)|)\]\)\s*(\(using\s+(?P<tls>[\w\.]+)\s+with\s+cipher\s+(?P<cipher>[\w\-]+)\s+\([\w\/\s]+\)\)\s+(\(No\s+client\s+certificate\s+requested\)|)|)|)\s*(\(Authenticated\s+sender\:\s+(?P<authenticated_sender>[\w\.\-\@]+)\)|)\s*by\s+(?P<by_hostname>[\S]+)\s*(\((?P<by_hostname2>[\S]*)\s*\[((?P<by_ipv6>[a-f\:\d]+)|)(?P<by_ip>[\d\.]+)\]\)|)\s*(\([^\)]*\)|)\s*(\(Postfix\)|)\s*(with\s+(?P<protocol>\w+)|)\s*id\s+(?P<id>[\w\-]+)\s*(for\s+\<(?P<envelope_for>[\w\.\@]+)\>|)", "postfix"),#POSTFIX
        (r"\s*from\s+\[?(?P<from_ip>[\d\.\:]+)\]?\s*(\((port=\d+|)\s*helo=(?P<from_name>[\[\]\w\.\:\-]+)\)|)\s+by\s+(?P<by_hostname>[\w\-\.]+)\s+with\s+(?P<protocol>\w+)\s*(\((?P<cipher>[\w\.\:\_\-]+)\)|)\s*(\(Exim\s+(?P<exim_version>[\d\.\_]+)\)|)\s*\(envelope-from\s+<?(?P<envelope_from>[\w\@\-\.]*)>?\s*\)\s*id\s+(?P<id>[\w\-]+)\s*\s*(for\s+<?(?P<envelope_for>[\w\.\@]+)>?|)", "exim"), #exim
        (r"\s*from\s+(?P<from_hostname>[\w\.]+)\s+\(\[?(?P<from_ip>[\d\.\:a-f]+)\]?(\:\d+|)\s*(helo\=\[?(?P<from_name>[\w\.\:\-]+)|)\]?\)\s+by\s+(?P<by_hostname>[\w\-\.]+)\s+with\s+(?P<protocol>\w+)\s+(\((?P<cipher>[\w\.\:\_]+)\)|)\s*\(Exim\s+(?P<exim_version>[\d\.\_]+)\)\s*\(envelope-from\s+\<(?P<envelope_from>[\w\@\-\.]+)\>\s*\)\s*id\s+(?P<id>[\w\-]+)\s*(for\s+(?P<envelope_for>[\w\.\@]+)|)", "exim"),# exim
        (r"from\s+(?P<from_name>[\w\.\-]+)\s+by\s+(?P<by_hostname>[\w\.\-]+)\s+with\s+(?P<protocol>\w+)\s+\(Exim\s+(?P<version>[\d\.]+)\)\s+\(envelope-from\s+<*(?P<envelope_from>[\w\.\-\@]+)>*\)\s+id\s+(?P<id>[\w\.\-]+)\s+for\s+<?(?P<envelope_for>[\w\.\-\@]+)>?", "exim"), #exim
        (r"from\s+(?P<from_name>[\[\]\w\-\.]+)\s+\(((?P<from_hostname>[\w\.\-]+)|)\s*\[(?P<from_ip>[\da-f\.\:]+)\]\)\s+by\s+(?P<by_hostname>[\w\.\-]+)\s+\(Oracle\s+Communications\s+Messaging\s+Server\s+(?P<oracle_version>[\w\.\-]+)(\([\d\.]+\)|)\s+(32bit|64bit|)\s*(\([^\)]+\)|)\)\s*with\s+(?P<protocol>\w+)\s+id\s+\<?(?P<id>[\w\@\.\-]+)\>?", "Oracle Communication Messaging Server"), #Oracle
        (r"from\s+(?P<from_hostname>[\w\-\.]+)\s+\(\[(?P<from_ip>[\d\.\:a-f]+)\]\s+helo=(?P<from_name>[\w\.\-]+)\)\s+by\s+(?P<by_hostname>[\w\.\-]+)\s+with\s+(?P<protocol>\w+)\s+\(ASSP\s+(?P<assp_version>[\d\.]+)\s*\)", "ASSP"), #ASSP
        (r"from\s+(?P<from_hostname>[\[\]\d\w\.\-]+)\s+\(\[\[?(?P<from_ip>[\d\.]+)(\:\d+|)\]\s*(helo=(?P<from_name>[\w\.\-]+)|)\s*\)\s+by\s+(?P<by_hostname>[\w\.\-]+)\s+\(envelope-from\s+\<?(?P<envelope_from>[^>]+)\>?\)\s+\(ecelerity\s+(?P<version>[\d\.]+)\s+r\([\w\-\:\.]+\)\)\s+with\s+(?P<protocol>\w+)\s*(\(cipher=(?P<cipher>[\w\-\_]+)\)|)\s*id\s+(?P<id>[\.\-\w\/]+)", "ecelerity"), #ecelerity
        (r"from\s+(?P<from_name>[\[\]\w\.\-]+)\s+\(((?P<from_hostname>[\w\.\-]+)|)\s*(\[(?P<from_ip>[\d\.\:a-f]+)\]|)\)\s*by\s+(?P<by_hostname>[\w\.\-]+)\s+(\([\w\.\-\=]+\)|)\s+with\s+(?P<protocol>\w+)\s+\(Nemesis\)\s+id\s+(?P<id>[\w\.\-]+)\s*(for\s+\<?(?P<envelope_for>[\w\.\@\-]+)\>?|)", "nemesis"), #nemesis
        (r"\(qmail\s+\d+\s+invoked\s+(from\s+network|)(by\s+uid\s+\d+|)\)", "qmail"), #WTF qmail
        (r"from\s+\[?(?P<from_ip>[\d\.a-f\:]+)\]?\s+\(account\s+<?(?P<envelope_from>[\w\.\@\-]+)>?\s+HELO\s+(?P<from_name>[\w\.\-]+)\)\s+by\s+(?P<by_hostname>[\w\.\-]*)\s+\(CommuniGate\s+Pro\s+SMTP\s+(?P<version>[\d\.]+)\)\s+with\s+(?P<protocol>\w+)\s+id\s+(?P<id>[\w\-\.]+)\s+for\s+<?(?P<envelope_for>[\w\.\-\@]+)>?", "CommuniGate"), #CommuniGate
        (r"from\s+(?P<from_ip>[\d\.\:a-f]+)\s+\(SquirrelMail\s+authenticated\s+user\s+(?P<envelope_from>[\w\@\.\-]+)\)\s+by\s+(?P<by_hostname>[\w\.\-]+)\s+with\s+(?P<protocol>\w+)", "SquirrelMail"),
        (r"by\s+(?P<by_hostname>[\w\.\-]+)\s+\((?P<protocol>\w+)\s+sendmail\s*(emulation|)\)", "sendmail"), #sendmail
        (r"from\s+(?P<from_name>[\[\]\w\.\-]+)\s+\(\[(?P<from_hostname>[\w\.\-]+)\]\s+\[(?P<from_ip>[\d\.a-f\:]+)\]\)\s+by\s+(?P<by_hostname>[\w\.\-]+)\s+\(Sun\s+Java\(tm\)\s+System\s+Messaging\s+Server\s+(?P<version>[\w\.\-]+)\s+\d+bit\s+\(built\s+\w+\s+\d+\s+\d+\)\)\s+with\s+(?P<protocol>\w+)\s+id\s+<?(?P<id>[\w\.\-\@]+)>?", "Sun Java System Messaging Server"), # Sun Java System Messaging Server
        (r"from\s+(?P<from_name>[\w\.\-\[\]]+)\s+\((?P<from_ip>[\d\.a-f\:]+)\)\s+by\s+(?P<by_hostname>[\w\.\-]+)\s+\(Axigen\)\s+with\s+(?P<protocol>\w+)\s+id\s+(?P<id>[\w\.\-]+)", "Axigen"), #axigen
        (r"from\s+(?P<from_name>[\w\.\-]+)\s+\((?P<from_hostname>[\w\.\-]+)\s+\[(?P<from_ip>[\d\.a-f\:]+)\]\)\s+by\s+(?P<by_hostname>[\w\.\-]+)\s+\(Horde\s+MIME\s+library\)\s+with\s+(?P<protocol>\w+)", "Horde MIME library"), #Horde
        (r"from\s+(?P<from_name>[\w\.\-\[\]]+)\s+by\s+(?P<by_hostname>[\w\.\-]+)\s+\(PGP\s+Universal\s+Service\)", "PGP Universal Service", "local"), # PGP Universal Service
        (r"from\s+(?P<from_name>[\w\.\-]+)\s+by\s+(?P<by_hostname>[\w\.\-]+)\s+with\s+(?P<protocol>\w+)\s+\(Sophos\s+PureMessage\s+Version\s+(?P<version>[\d\.\-]+)\)\s+id\s+(?P<id>[\w\.\-]+)\s+for\s+(?P<envelope_for>[\w\.\-\@]+)", "Sophos PureMessage"), #Sophos PureMessage
        (r"by\s+(?P<by_ip>[\d\.\:a-f]+)\s+with\s+(?P<protocol>\w+)", "unknown"), # other
        (r"from\s+(?P<from_name>[\w\.\-]+)\s+\#?\s*(\(|\[|\(\[)\s*(?P<from_ip>[\d\.\:a-f]+)\s*(\]|\)|\]\))\s+by\s+(?P<by_hostname>[\w\.\-]+)(\s+\([\w\.\s\/]+\)|)\s*(with\s+(?P<protocol>\w+)|)\s*(id\s+(?P<id>[\w]+)|)(\(\-\)|)\s*(for\s+\<(?P<envelope_for>[\w\@\.]+)\>?|)", "unknown"), #unknown
        (r"from\s+(?P<from_hostname>[\w\.\-]+)\s*\(HELO\s+(?P<from_name>[\w\.\-]+)\)\s*\(\[?(?P<from_ip>[\d\.\:a-f]+)\]?\)\s+by\s+(?P<by_hostname>[\w\.\-]+)(\s+\([\d\.]+\)|)\s*(with\s+(?P<protocol>\w+)|)\s*(id\s+(?P<id>[\w]+)|)(\(\-\)|)", "unknown"), #other other
        (r"from\s+([\(\[](?P<from_ip>[\d\.\:a-f]+)[\)\]]|)\s+by\s+(?P<by_hostname>[\w\.\-]+)\s+id\s+(?P<id>\w+)\s*(with\s+(?P<protocol>\w+)|)\s*\s*(for\s+\<(?P<envelope_for>[\w\@\.\-]+)\>|)", "unknown"),#other
        (r"from\s+(?P<from_hostname>[\w\.]+)\s+(\(HELO\s+(?P<from_name>[\w\.\-]+)\)|)\s*(\((?P<from_ip>[\da-f\.\:]+)\)|)\s*by\s+(?P<by_hostname>[\w\.\-]+)\s+with\s+(?P<cipher>[\w\-]+)\s+encrypted\s+SMTP", "unknown"), #unknown
        (r"from\s+(?P<from_hostname>[\w\.\-]+)\s+(\(HELO\s+(?P<from_name>[\w\.\-]+)\)|)\s+\((?P<envelope_from>[\w\.]+\@[\w\.]+)\@(?P<from_ip>[\da-d\.\:]+)\)\s+by\s+(?P<by_hostname>[\w\.\-]+)\s+with\s+(?P<protocol>\w+)", "unknown"), #unknown
        (r"from\s+(?P<from_hostname>[\w\.\-]+)\s+\(HELO\s+(?P<from_name>[\w\.\-\?]+)\)\s+\(\w+\@[\w\.]+\@(?P<from_ip>[\d\.a-f\-]+)_\w+\)\s+by\s+(?P<by_hostname>[\w\.\-\:]+)\s+with\s+(?P<protocol>\w+)", "unknown"), #unknown
        (r"from\s+(?P<from_name>[\w\.\-\[\]]+)\s+\(\[(?P<from_ip>[\da-f\.\:]+)\]\)\s+by\s+(?P<by_hostname>[\w\.\-]+)\s+\(\[(?P<by_ip>[\d\.a-f\:]+)\]\)\s+with\s+(?P<protocol>\w+)", "unknown"), #unknown
        ]

    @staticmethod
    def parse(header):
        """Parse one Received header value.

        Args:
            header: the raw header value. It must contain exactly one ';'
                separating the routing part from the date.

        Returns:
            A dict with key 'server' (label of the first matching pattern) plus
            every named group of that pattern (unmatched groups are None), or
            None when ``header`` is not a string, does not split into exactly
            two ';'-separated parts, or matches no pattern.
        """
        # Missing cells arrive as NaN (float); treat them as unparseable
        # instead of failing on .split().
        if not isinstance(header, str):
            return None

        parts = header.split(";")
        if len(parts) != 2:
            return None

        # Only the routing part (before ';') is matched; first hit wins.
        for regex in ReceivedParser.regexes:
            match = re.match(regex[0], parts[0], re.IGNORECASE)
            if match:
                return {'server': regex[1], **match.groupdict()}

        return None

In [101]:
received_parser = ReceivedParser()

In [102]:
def check_if_valid(dict_to_check, str_val):
  """Return True only if `dict_to_check` exists and maps `str_val` to a non-None value."""
  return (
    dict_to_check is not None
    and str_val in dict_to_check
    and dict_to_check[str_val] is not None
  )

In [103]:
# One-Hot encoding, new columns: conseq_num_received_is_one, conseq_received_good, conseq_received_bad, conseq_received_unknown (All values are 0 or 1)

def check_conseq_received_domain(row):
  """Check that each Received 'from' host matches the 'by' host of the next Received field.

  Returns a 4-element pd.Series laid out as
  [num_received_is_one, good, bad, unknown] with exactly one entry set to 1.

  * row['hops'] == 1 -> [1,0,0,0] without inspecting any header.
  * For i in 1 .. hops-1, received<i> is compared against received<i+1>:
    the parsed 'from_name' / 'from_hostname' of the former must equal the
    parsed 'by_hostname' of the latter. When the parser cannot supply the
    needed fields, `my_checks` decides instead.
  * The first failing pair ends the scan; if every pair passes (or the range
    is empty, e.g. hops == 0) the result is [0,1,0,0].
  """
  hop_count = row['hops']
  if hop_count == 1:
    return pd.Series([1,0,0,0])

  for idx in range(1, hop_count):
    curr_val = row['received' + str(idx)]
    next_val = row['received' + str(idx + 1)]

    from_vals_dict = received_parser.parse(curr_val)
    by_val_dict = received_parser.parse(next_val)

    by_ok = check_if_valid(by_val_dict, 'by_hostname')
    # Whichever of the two 'from' identifiers the parser managed to extract.
    usable_keys = [key for key in ('from_name', 'from_hostname')
                   if check_if_valid(from_vals_dict, key)]

    if not by_ok or not usable_keys:
      # Parser gave us nothing to compare: fall back to the regex heuristics.
      outcome = my_checks(curr_val, next_val)
      if isinstance(outcome, str):
        continue
      return outcome

    by_host = by_val_dict['by_hostname']
    if any(from_vals_dict[key] == by_host for key in usable_keys):
      continue
    return pd.Series([0,0,1,0])

  # All checks worked out, return a good result
  return pd.Series([0,1,0,0])


def my_checks(curr_val, next_val):
  """Regex fallback for comparing a Received 'from' host with the next 'by' host.

  Two candidate tokens are taken from `curr_val`: the token right after
  'from ' and the text between the first '(' and the following '['. The token
  right after 'by ' is taken from `next_val`.

  Returns:
    'continue' when any candidate equals the 'by' token;
    pd.Series([0,0,1,0]) when candidates exist but none matches;
    pd.Series([0,0,0,1]) when there is no 'by' token, no candidate at all, or
    either input is not a string.
  """
  # Missing headers arrive as NaN (float); re.search would raise TypeError.
  if not isinstance(curr_val, str) or not isinstance(next_val, str):
    return pd.Series([0,0,0,1])

  first_domain_from = re.search(r'((?<=\bfrom\s)[^\s]+)', curr_val)
  second_domain_from = re.search(r'((?<=\().*?(?=\[))', curr_val)
  domain_by = re.search(r'((?<=\bby\s)[^\s]+)', next_val)

  if domain_by is None:
    return pd.Series([0,0,0,1])

  # The parenthesised capture runs up to '[' and therefore carries the
  # separating space ("host.example.com "); strip so it can actually match.
  candidates = [m.group(0).strip()
                for m in (first_domain_from, second_domain_from)
                if m is not None]

  if not candidates:
    return pd.Series([0,0,0,1])

  by_host = domain_by.group(0)
  if any(candidate == by_host for candidate in candidates):
    return 'continue'
  return pd.Series([0,0,1,0])

In [104]:
df[['conseq_num_received_is_one', 'conseq_received_good', 
   'conseq_received_bad', 'conseq_received_unknown']] = df.apply(check_conseq_received_domain, axis=1)

In [105]:
final_features_list.extend(['conseq_num_received_is_one', 'conseq_received_good', 
   'conseq_received_bad', 'conseq_received_unknown'])

**Date and time conflict (consecutive Received fields):**

In [106]:
def conseq_received_date(row):
  """Flag a date conflict between consecutive Received fields.

  For i in 1 .. hops-1 the date of received<i> is compared with the date of
  received<i+1>. The date is the text after the last ';' (a '\\n\\t' fold is
  treated like a ';').

  Returns:
    0 when row['hops'] == 1 or no pair conflicts;
    1 as soon as a pair has a missing / non-string header, an unparseable
    date, or a second date that is earlier than the first.
  """
  num_received = row['hops']
  if num_received == 1:
    return 0

  col_name_base = 'received'
  for i in range(1, num_received):
    curr_val = row[col_name_base + str(i)]
    next_val = row[col_name_base + str(i + 1)]

    # A missing header (NaN) cannot be dated; treat it like an unparseable date.
    if not isinstance(curr_val, str) or not isinstance(next_val, str):
      return 1

    curr_date = curr_val.replace('\n\t', ';').split(r';')[-1]
    next_date = next_val.replace('\n\t', ';').split(r';')[-1]

    d1 = email.utils.parsedate_tz(curr_date)
    d2 = email.utils.parsedate_tz(next_date)

    if d1 is None or d2 is None:
      return 1

    try:
      val1 = email.utils.mktime_tz(d1)
      val2 = email.utils.mktime_tz(d2)
    except (OverflowError, ValueError, TypeError):
      # Out-of-range or malformed date components.
      return 1

    if (val2 - val1) < 0:
      return 1
  return 0

In [107]:
df['conseq_received_date'] = df.apply(conseq_received_date, axis=1)

In [108]:
df['conseq_received_date'].value_counts()

conseq_received_date
0    238
1      7
Name: count, dtype: int64

In [109]:
df[df['conseq_received_date'] == 1]['label'].value_counts()

label
2    7
Name: count, dtype: int64

In [110]:
final_features_list.append('conseq_received_date')

---

**Email Similarity:**

---

If more than one email, then all are tested, and the highest score is the output.

In [111]:
from difflib import SequenceMatcher

def simScore(a, b):
  """Return the difflib similarity ratio of `a` and `b`, or 0 if either is a float (e.g. NaN)."""
  if any(isinstance(operand, float) for operand in (a, b)):
    return 0
  matcher = SequenceMatcher(None, a, b)
  return matcher.ratio()

In [112]:
print(simScore('ab', 'ba'))

0.5


---
**Extracting Emails**:

---

In [113]:
# emails in brackets '<>' are matched first, and if none, then other emails are matched
def extract_emails(row, col_name):
  """Return the e-mail addresses found in row[col_name].

  Addresses wrapped in '<...>' take precedence: if at least one exists only
  those are returned; otherwise every bare address is returned. An empty list
  means nothing matched.
  """
  text = row[col_name]

  bracketed = re.findall(r'<([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)>', text)
  if bracketed:
    return bracketed

  return re.findall(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', text)

In [114]:
df['errors-to'] = ''

In [115]:
emails_from = df.apply(extract_emails, col_name='from', axis=1)
emails_message_id = df.apply(extract_emails, col_name='message-id', axis=1)
emails_return_path = df.apply(extract_emails, col_name='return-path', axis=1)
emails_reply_to = df.apply(extract_emails, col_name='reply-to', axis=1)
emails_errors_to = df.apply(extract_emails, col_name='errors-to', axis=1)
emails_in_reply_to = df.apply(extract_emails, col_name='in-reply-to', axis=1)
emails_references = df.apply(extract_emails, col_name='references', axis=1)
emails_to = df.apply(extract_emails, col_name='to', axis=1)
emails_cc = df.apply(extract_emails, col_name='cc', axis=1)
emails_sender = df.apply(extract_emails, col_name='sender', axis=1)

#simScores = domains_df[['return', 'from']].apply(lambda x: simScore(*x), axis=1)
#df['SimScore_return_from'] = simScores

In [116]:
emails_df = pd.concat([emails_from, emails_message_id, emails_return_path, 
                        emails_errors_to, emails_reply_to, emails_in_reply_to, 
                        emails_references, emails_to, emails_cc, emails_sender], axis=1)
emails_df = emails_df.set_axis(['from', 'message-id', 'return-path', 'errors-to', 'reply-to',
                     'in-reply-to', 'references', 'to', 'cc', 'sender'], 
                    axis=1)

In [117]:
def email_same_check(row, first_col, second_col):
  """Return 1 if any address in row[first_col] also appears in row[second_col], else 0."""
  left_addresses = row[first_col]
  right_addresses = row[second_col]

  shared = any(left == right
               for left in left_addresses
               for right in right_addresses)
  return 1 if shared else 0

In [118]:
emails_to_check = [('from', 'reply-to')]

for val in emails_to_check:
  first_field = val[0]
  second_field = val[1]
  new_col_name = 'email_match_' + first_field + '_' + second_field

  df[new_col_name] = emails_df.apply(email_same_check, first_col=first_field, 
                  second_col=second_field, axis=1)
  final_features_list.append(new_col_name)

**Extracting Domains:**

In [119]:
def extract_domains(row, col_name):
  """Reduce each address in row[col_name] to its last two domain labels, lower-cased.

  Example: 'User@Mail.Example.COM' -> 'example.com'. Non-word characters are
  stripped from the final label. Addresses whose domain part has fewer than
  two dot-separated labels are skipped.

  Returns:
    A list of domains, possibly empty, in input order.
  """
  domains_list = []
  # `address`, not `email`: avoid shadowing the `email` module used elsewhere.
  for address in row[col_name]:
    labels = address.split('@')[-1].split('.')
    # Check the *domain* for a dot; a dot in the local part alone
    # ('a.b@localhost') would otherwise lead to an IndexError below.
    if len(labels) < 2:
      continue
    main_domain = labels[-2] + '.' + re.sub(r'\W+', '', labels[-1])
    domains_list.append(main_domain.lower())
  return domains_list

In [120]:
domains_from = emails_df.apply(extract_domains, col_name='from', axis=1)
domains_message_id = emails_df.apply(extract_domains, col_name='message-id', axis=1)
domains_return_path = emails_df.apply(extract_domains, col_name='return-path', axis=1)
domains_reply_to = emails_df.apply(extract_domains, col_name='reply-to', axis=1)
domains_errors_to = emails_df.apply(extract_domains, col_name='errors-to', axis=1)
domains_in_reply_to = emails_df.apply(extract_domains, col_name='in-reply-to', axis=1)
domains_references = emails_df.apply(extract_domains, col_name='references', axis=1)
domains_to = emails_df.apply(extract_domains, col_name='to', axis=1)
domains_cc = emails_df.apply(extract_domains, col_name='cc', axis=1)
domains_sender = emails_df.apply(extract_domains, col_name='sender', axis=1)

In [121]:
domains_df = pd.concat([domains_from, domains_message_id, domains_return_path, 
                        domains_errors_to, domains_reply_to, domains_in_reply_to, 
                        domains_references, domains_to, domains_cc, domains_sender], axis=1)
domains_df = domains_df.set_axis(['from_domains', 'message-id_domains', 'return-path_domains', 'errors-to_domains', 'reply-to_domains',
                     'in-reply-to_domains', 'references_domains', 'to_domains', 'cc_domains', 'sender_domains'], 
                    axis=1)

df = pd.concat([df,domains_df], axis=1)

In [122]:
df.shape

(245, 130)

Adding in the value of message-id domain (1 if contains uwaterloo.ca, otherwise 0)

In [123]:
def extract_domain_message_id(row):
  """Return the first entry of row['message-id_domains'], or '' when the list is empty."""
  domains = row['message-id_domains']
  return domains[0] if len(domains) != 0 else ''

In [124]:
df['domain_val_message-id'] = domains_df.apply(extract_domain_message_id, axis=1)
df['domain_val_message-id'].value_counts()

domain_val_message-id
amazonses.com         50
                      49
monkey.org            10
serverpod.net          9
com.br                 5
                      ..
medelite.org           1
alibaba.com            1
sumsjobz.sbs           1
mailcarrier-sl.com     1
sofamekar.com          1
Name: count, Length: 107, dtype: int64

In [125]:
# Flag rows whose message-id domain contains the literal text 'uwaterloo.ca'
# (1) versus everything else (0).
# regex=False: str.contains treats the pattern as a regular expression by
# default, where '.' is a wildcard and would also accept e.g. 'uwaterlooXca'.
is_uwaterloo = df['domain_val_message-id'].astype(str).str.contains('uwaterloo.ca', regex=False)
df['domain_val_message-id'] = is_uwaterloo.astype(int)

In [126]:
df['domain_val_message-id'].value_counts()

domain_val_message-id
0    245
Name: count, dtype: int64

In [127]:
final_features_list.append('domain_val_message-id')

---

**Domain Matching:**

---

In [128]:
# Returns 0 if no matches, 1 if at least one match
def domain_match_check(row, first_col, second_col):
  """Return 1 if the domain lists in the two columns share at least one entry, else 0.

  An empty list on either side yields 0.
  """
  left_domains = row[first_col]
  right_domains = row[second_col]

  overlap = any(left == right
                for left in left_domains
                for right in right_domains)
  return int(overlap)

In [129]:
domain_fields_to_check = [('message-id_domains', 'from_domains'), ('from_domains', 'return-path_domains'), ('message-id_domains', 'return-path_domains'), ('message-id_domains', 'sender_domains'), ('message-id_domains', 'reply-to_domains'),
                          ('return-path_domains', 'reply-to_domains'), ('reply-to_domains', 'to_domains'), ('to_domains', 'in-reply-to_domains'), ('errors-to_domains', 'message-id_domains'), ('errors-to_domains', 'from_domains'), ('errors-to_domains', 'sender_domains'),
                          ('errors-to_domains', 'reply-to_domains'), ('sender_domains', 'from_domains'), ('references_domains', 'reply-to_domains'), ('references_domains', 'in-reply-to_domains'), ('references_domains', 'to_domains'), ('from_domains', 'reply-to_domains'),
                          ('to_domains', 'from_domains'), ('to_domains', 'message-id_domains')]

for val in domain_fields_to_check:
  first_field = val[0].replace('_domains', '')
  second_field = val[1].replace('_domains', '')
  new_col_name = 'domain_match_' + first_field + '_' + second_field 

  df[new_col_name] = domains_df.apply(domain_match_check, first_col = val[0], 
                              second_col= val[1], axis=1)
  final_features_list.append(new_col_name)


**Domain Matching (involving 'Received' fields):**

In [130]:
parser = ReceivedParser()

In [131]:
df.head(5)

Unnamed: 0,received1,received2,received3,received4,received5,received6,received7,received8,hops,subject,...,domain_match_errors-to_from,domain_match_errors-to_sender,domain_match_errors-to_reply-to,domain_match_sender_from,domain_match_references_reply-to,domain_match_references_in-reply-to,domain_match_references_to,domain_match_from_reply-to,domain_match_to_from,domain_match_to_message-id
0,"from out.smarshmail.com (out.smarshmail.com [199.193.206.151])\n\tby imf04.b.hostedemail.com (Postfix) with ESMTP id AD63B100335B4\n\tfor <jose@monkey.org>; Wed, 12 Jan 2022 14:09:37 +0000 (UTC)","from ggg.mqn1tzeu2jze1gwcnkzezpilvh.phxx.internal.cloudapp.net\n (20.150.148.255) by MBX125-W8-CO-3.exch125.serverpod.net (10.224.13.216) with\n Microsoft SMTP Server (version=TLS1_2,\n cipher=TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384_P521) id 15.1.2375.17; Wed, 12\n Jan 2022 06:09:35 -0800",,,,,,,0,Mail Delivery Failed: Returning Message to Sender,...,0,0,0,0,0,0,0,0,0,0
1,"from out.smarshmail.com (out.smarshmail.com [199.193.206.142])\n\tby imf28.b.hostedemail.com (Postfix) with ESMTP id 291DC18577D10\n\tfor <jose@monkey.org>; Wed, 12 Jan 2022 15:26:33 +0000 (UTC)","from ggg.mqn1tzeu2jze1gwcnkzezpilvh.phxx.internal.cloudapp.net\n (20.150.148.255) by MBX125-W3-CO-4.exch125.serverpod.net (10.224.13.158) with\n Microsoft SMTP Server (version=TLS1_2,\n cipher=TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384_P521) id 15.1.2375.17; Wed, 12\n Jan 2022 07:26:32 -0800",,,,,,,0,Mail Delivery Failed: Returning Message to Sender,...,0,0,0,0,0,0,0,0,0,0
2,"from out.smarshmail.com (out.smarshmail.com [199.193.206.151])\n\tby imf01.b.hostedemail.com (Postfix) with ESMTP id 72ED618473512\n\tfor <jose@monkey.org>; Wed, 12 Jan 2022 16:56:22 +0000 (UTC)","from jjj.1wziks11ksnudjck0lsghgz2cd.phxx.internal.cloudapp.net\n (20.118.128.143) by MBX125-W8-CO-3.exch125.serverpod.net (10.224.13.216) with\n Microsoft SMTP Server (version=TLS1_2,\n cipher=TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384_P521) id 15.1.2375.17; Wed, 12\n Jan 2022 08:56:21 -0800",,,,,,,0,Undeliverable: Message failed!,...,0,0,0,0,0,0,0,0,0,0
3,"from out.smarshmail.com (out.smarshmail.com [199.193.206.151])\n\tby imf14.b.hostedemail.com (Postfix) with ESMTP id EC77110627451\n\tfor <jose@monkey.org>; Thu, 13 Jan 2022 11:26:10 +0000 (UTC)","from chc.tqfdehd2iccepi0sf1x2bncwsc.ex.internal.cloudapp.net\n (20.98.22.69) by MBX125-W8-CO-2.exch125.serverpod.net (10.224.13.214) with\n Microsoft SMTP Server (version=TLS1_2,\n cipher=TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384_P521) id 15.1.2375.17; Thu, 13\n Jan 2022 03:26:09 -0800",,,,,,,0,Mail delivery failed: returning message to sender - jose@monkey.org,...,0,0,0,0,0,0,0,0,0,0
4,"from out.smarshmail.com (out.smarshmail.com [199.193.206.148])\n\tby imf06.b.hostedemail.com (Postfix) with ESMTP id 06B9A1B2\n\tfor <jose@monkey.org>; Tue, 18 Jan 2022 15:37:33 +0000 (UTC)","from MBX125-W8-CO-1.exch125.serverpod.net (10.224.13.212) by\n MBX125-W6-CO-3.exch125.serverpod.net (10.224.13.192) with Microsoft SMTP\n Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384_P521) id\n 15.1.2375.17; Tue, 18 Jan 2022 07:37:33 -0800","from green.o1ltdsospz4erb4ns0lx4klihd.cx.internal.cloudapp.net\n (52.167.216.166) by MBX125-W8-CO-1.exch125.serverpod.net (10.224.13.212) with\n Microsoft SMTP Server (version=TLS1_2,\n cipher=TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384_P521) id 15.1.2375.17; Tue, 18\n Jan 2022 07:37:29 -0800",,,,,,1,Mail Delivery Failed: Returning Message to Sender,...,0,0,0,0,0,0,0,0,0,0


In [132]:
test = parser.parse('from cnnimail22.cnn.com (cnnimail22.cnn.com [64.236.25.79])\n\tby speedy.uwaterloo.ca (8.12.8/8.12.5) with ESMTP id l39IYQ0I018124\n\tfor <ktwarwic@SPEEDY.UWATERLOO.CA>; Mon, 9 Apr 2007 14:34:26 -0400')

In [133]:
def get_for_domain_last_received(row):
  """Return the two-label domain of the envelope 'for' address in row['last_received'].

  The header is parsed with `parser`; the 'envelope_for' address is reduced to
  its last two domain labels (non-word characters stripped from the final
  label) and lower-cased.

  Returns:
    The domain string, or 'NA' when the header cannot be parsed, has no
    'envelope_for', or the address domain has fewer than two labels.
  """
  parsed_val = parser.parse(row['last_received'])

  if not check_if_valid(parsed_val, 'envelope_for'):
    return 'NA'

  labels = parsed_val['envelope_for'].split('@')[-1].split('.')
  # e.g. 'root@localhost' has no second label; avoid an IndexError.
  if len(labels) < 2:
    return 'NA'

  main_domain = labels[-2] + '.' + re.sub(r'\W+', '', labels[-1])
  return main_domain.lower()

In [134]:
def check_for_received_domain_equal(row, field_name):
  """Return 1 if any domain in row[field_name] equals the last Received 'for' domain, else 0.

  Args:
    row: DataFrame row holding the domain list column and 'last_received'.
    field_name: name of the column containing a list of domains.
  """
  field_vals = row[field_name]

  # Nothing to compare: skip parsing the header altogether.
  if len(field_vals) == 0:
    return 0

  # The header-derived domain does not depend on the item; compute it once
  # instead of re-parsing the Received header for every list entry.
  received_domain = get_for_domain_last_received(row)

  return 1 if any(item == received_domain for item in field_vals) else 0

In [135]:
df['domain_match_to_received'] = df.apply(check_for_received_domain_equal, field_name='to_domains', axis=1)
df['domain_match_to_received'].value_counts()

domain_match_to_received
0    245
Name: count, dtype: int64

In [136]:
df['domain_match_reply-to_received'] = df.apply(check_for_received_domain_equal, field_name='reply-to_domains', axis=1)
df['domain_match_reply-to_received'].value_counts()

domain_match_reply-to_received
0    245
Name: count, dtype: int64

In [137]:
def get_from_domain_first_received(row):
  """Collect two-label domains from the 'from' fields of row['first_received'].

  The header is parsed with `parser`; 'from_hostname' and then 'from_name' are
  each considered. A value contributes a domain only when it contains exactly
  one '@' and the part after it has at least two dot-separated labels; the
  domain is the last two labels (non-word characters stripped from the final
  label), lower-cased.

  Returns:
    A list with zero, one or two domains (hostname first, then name).
  """
  parsed_val = parser.parse(row['first_received'])

  domains_list = []
  for key in ('from_hostname', 'from_name'):
    if not check_if_valid(parsed_val, key):
      continue

    # NOTE(review): this requires an '@' in the host value, so a plain host
    # name such as 'mail.example.com' never yields a domain — confirm that
    # this is intended.
    pieces = parsed_val[key].split('@')
    if len(pieces) != 2:
      continue

    labels = pieces[-1].split('.')
    if len(labels) >= 2:
      main_domain = labels[-2] + '.' + re.sub(r'\W+', '', labels[-1])
      domains_list.append(main_domain.lower())

  return domains_list

In [138]:
def check_received_from_domain_equal(row, field_name):
  """Return 1 if any domain in row[field_name] equals a first-Received 'from' domain, else 0."""
  field_vals = row[field_name]
  received_domains = get_from_domain_first_received(row)

  found = any(candidate == received
              for candidate in field_vals
              for received in received_domains)
  return 1 if found else 0

In [139]:
df['domain_match_from_received'] = df.apply(check_received_from_domain_equal, field_name='from_domains', axis=1)
df['domain_match_from_received'].value_counts()

domain_match_from_received
0    245
Name: count, dtype: int64

In [140]:
df['domain_match_return-path_received'] = df.apply(check_received_from_domain_equal, field_name='return-path_domains', axis=1)
df['domain_match_return-path_received'].value_counts()

domain_match_return-path_received
0    245
Name: count, dtype: int64

In [141]:
final_features_list.extend(['domain_match_reply-to_received', 'domain_match_to_received', 'domain_match_return-path_received', 'domain_match_from_received'])

# **Outputting the processed data:**

In [142]:
for item in final_features_list:
  print(item)
  print(df[item].value_counts())
  print('\n')

hops
hops
0    180
1     58
2      7
Name: count, dtype: int64


missing_received1
missing_received1
0.0    245
Name: count, dtype: int64


missing_received2
missing_received2
0.0    138
1.0    107
Name: count, dtype: int64


missing_received3
missing_received3
1.0    180
0.0     65
Name: count, dtype: int64


missing_received4
missing_received4
1.0    226
0.0     19
Name: count, dtype: int64


missing_received5
missing_received5
1.0    236
0.0      9
Name: count, dtype: int64


missing_received6
missing_received6
1.0    238
0.0      7
Name: count, dtype: int64


missing_received7
missing_received7
1.0    244
0.0      1
Name: count, dtype: int64


missing_received8
missing_received8
1.0    244
0.0      1
Name: count, dtype: int64


missing_hops
missing_hops
0.0    245
Name: count, dtype: int64


missing_subject
missing_subject
0.0    244
1.0      1
Name: count, dtype: int64


missing_date
missing_date
0.0    245
Name: count, dtype: int64


missing_message-id
missing_message-id
0.0    2

In [143]:
df[df['str_to_chevron'] == 0]['label'].value_counts()

label
2    215
Name: count, dtype: int64

In [144]:
# Removes missing_received fields since received1 is always there, and the other ones
# encode the same information as 'hops'. The other removed features have only one value,
# or a strong majority towards one value.
remove_list = ['missing_received1', 'missing_received2', 'missing_received3',
 'missing_received4', 'missing_received5', 'missing_received6',
 'missing_received7', 'missing_received8', 'missing_received9',
 'missing_received10', 'missing_received11', 'missing_received12',
 'missing_received13', 'missing_received14', 'missing_received15',
 'missing_received16', 'missing_date', 'missing_message-id', 'missing_from',
 'missing_return-path', 'str_to_undisclosed', 'str_return-path_empty',
 'str_from_exclam', 'str_reply-to_question', 'str_received-SPF_bad', 
 'str_received-SPF_softfail', 'str_received-SPF_fail',
 'conseq_received_unknown', 'num_recipients_from', 
 'domain_match_reply-to_received', 'domain_match_return-path_received',
 'domain_match_from_received']

# Filter in place so that *every* occurrence of a removed feature is dropped
# (list.remove only drops the first one, which leaves stragglers when an
# earlier cell was executed more than once).
features_to_drop = set(remove_list)
final_features_list[:] = [name for name in final_features_list
                          if name not in features_to_drop]

# Keep the cell idempotent: re-running it must not append 'label' again.
if 'label' not in final_features_list:
  final_features_list.append('label')

In [145]:
df_final = df[final_features_list]

In [146]:
df_final.shape

(245, 74)

In [None]:
df_final

In [147]:
df_final.to_csv('preprocessed_phishing_2022.csv', index=False)