## Import data 

In [1]:
import numpy as np
import pandas as pd
import re
import time
import email.utils
import os
from email.parser import BytesParser
from email import policy
from bs4 import BeautifulSoup

In [2]:
# Load the phishing emails
PHISHING_FILE = '../jose_phishing.csv'

df_phishing = pd.read_csv(PHISHING_FILE)

print(len(df_phishing))

pd.set_option('display.max_columns', None)
df_phishing.head(1)

2544


Unnamed: 0,received1,received2,received3,label,return-path,delivered-to,content-type,mime-version,subject,to,from,date,message-id,hops,body,content-transfer-encoding,received4,received5,received6,received7,received8,references,in-reply-to,importance,reply-to,received-spf,domainkey-signature,sender,content-disposition,received9,received10,cc,mailing-list,content-length
0,from dukecmmtar04.coxmail.com (dukecmmtar04.co...,from dukecmimpo03.coxmail.com ([68.99.120.137]...,from BERMLEE-TERM.BermLee.local ([98.175.100.1...,1,acmim@up.edu,jose@monkey.org,"multipart/mixed; boundary=""===============0249...",1.0,Important Security Message,Recipients <acmim@up.edu>,"""American .Express "" <acmim@up.edu>","Thu, 03 Jan 2019 17:22:43 -0500",<20190103224313.GZSI16483.dukecmmtar04.coxmail...,3,You will not see this in a MIME-aware mail rea...,,,,,,,,,,,,,,,,,,,


In [3]:
# Check for duplicates
phishing_duplicates = df_phishing.duplicated().sum()

# Remove duplicate rows
df_phishing = df_phishing.drop_duplicates()

print(f'Number of duplicate rows removed: {phishing_duplicates}')

Number of duplicate rows removed: 10


In [4]:
HAM_FILE = '../trec_train.csv'

df_ham = pd.read_csv(HAM_FILE)

df_ham.head(1)

Unnamed: 0,received1,received2,received3,received4,label,return-path,in-reply-to,references,mime-version,message-id,from,date,to,content-disposition,cc,subject,content-type,content-transfer-encoding,sender,content-length,hops,body,reply-to,importance,received5,received6,received7,received8,received9,mailing-list,delivered-to,received-spf,domainkey-signature,received10,received11,received12,received13,received14,received15,received16
0,from hypatia.math.ethz.ch (hypatia.math.ethz.c...,from hypatia.math.ethz.ch (hypatia [129.132.14...,from korteweg.uva.nl (korteweg.uva.nl [146.50....,from [145.18.153.42] ([145.18.153.42]) by kort...,0,<r-help-bounces@stat.math.ethz.ch>,<10175603.post@talk.nabble.com>,<10175603.post@talk.nabble.com>,1.0 (Apple Message framework v752.2),<B6373E84-D1B4-407D-A558-74800C86AC97@uva.nl>,Ingmar Visser <i.visser@uva.nl>,"Wed, 25 Apr 2007 09:35:24 +0200",francogrex <francogrex@mail.com>,inline,r-help@stat.math.ethz.ch,Re: [R] How to solve difficult equations?,"text/plain; charset=""us-ascii""",7bit,r-help-bounces@stat.math.ethz.ch,1468.0,4,"plot(fn,-1,1)\n\ncould be helpful, hth, Ingmar...",,,,,,,,,,,,,,,,,,


In [5]:
# Check for duplicates
ham_duplicates = df_ham.duplicated().sum()

# Remove duplicate rows
df_ham = df_ham.drop_duplicates()

print(f'Number of duplicate rows removed: {ham_duplicates}')

Number of duplicate rows removed: 0


In [6]:
df_combined = pd.concat([df_ham, df_phishing], ignore_index=True)

df_combined.tail(1)

Unnamed: 0,received1,received2,received3,received4,label,return-path,in-reply-to,references,mime-version,message-id,from,date,to,content-disposition,cc,subject,content-type,content-transfer-encoding,sender,content-length,hops,body,reply-to,importance,received5,received6,received7,received8,received9,mailing-list,delivered-to,received-spf,domainkey-signature,received10,received11,received12,received13,received14,received15,received16
5077,from dukecmfep01.coxmail.com (dukecmfep01.coxm...,from dukecmimpo03.coxmail.com ([68.99.120.137]...,from WIN-DBEVR53C4H6.us-east-2.compute.interna...,,1,david234@wsu.edu,,,1.0,<20181230001751.LXSQ17347.dukecmfep04.coxmail....,"""American Express "" <david234@wsu.edu>","Sat, 29 Dec 2018 18:17:16 -0600",Recipients <david234@wsu.edu>,,,Confirm This Transaction,"multipart/alternative; boundary=""=============...",,,,3,You will not see this in a MIME-aware mail rea...,,,,,,,,,jose@monkey.org,,,,,,,,,


## Dataset Information

In [7]:
df_combined['label'].value_counts()

label
0    2544
1    2534
Name: count, dtype: int64

In [8]:
df_combined.columns.values

array(['received1', 'received2', 'received3', 'received4', 'label',
       'return-path', 'in-reply-to', 'references', 'mime-version',
       'message-id', 'from', 'date', 'to', 'content-disposition', 'cc',
       'subject', 'content-type', 'content-transfer-encoding', 'sender',
       'content-length', 'hops', 'body', 'reply-to', 'importance',
       'received5', 'received6', 'received7', 'received8', 'received9',
       'mailing-list', 'delivered-to', 'received-spf',
       'domainkey-signature', 'received10', 'received11', 'received12',
       'received13', 'received14', 'received15', 'received16'],
      dtype=object)

In [9]:
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5078 entries, 0 to 5077
Data columns (total 40 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   received1                  5072 non-null   object 
 1   received2                  4579 non-null   object 
 2   received3                  2899 non-null   object 
 3   received4                  1823 non-null   object 
 4   label                      5078 non-null   int64  
 5   return-path                5072 non-null   object 
 6   in-reply-to                1156 non-null   object 
 7   references                 1186 non-null   object 
 8   mime-version               4695 non-null   object 
 9   message-id                 4730 non-null   object 
 10  from                       5078 non-null   object 
 11  date                       5023 non-null   object 
 12  to                         4963 non-null   object 
 13  content-disposition        579 non-null    objec

In [10]:
df_combined.describe()

Unnamed: 0,label,content-length,hops
count,5078.0,675.0,5078.0
mean,0.499015,6440.96,3.444269
std,0.500048,20821.842358,2.088816
min,0.0,14.0,0.0
25%,0.0,948.5,2.0
50%,0.0,1711.0,3.0
75%,1.0,3930.5,4.0
max,1.0,422233.0,16.0


In [11]:
df_combined.nunique()

received1                    5072
received2                    4572
received3                    2898
received4                    1822
label                           2
return-path                  2552
in-reply-to                  1118
references                   1151
mime-version                   12
message-id                   4722
from                         3073
date                         5011
to                           1141
content-disposition             2
cc                            292
subject                      4043
content-type                 2124
content-transfer-encoding       9
sender                         74
content-length                615
hops                           14
body                         5019
reply-to                      410
importance                      6
received5                    1249
received6                     874
received7                     386
received8                     272
received9                     164
mailing-list  

In [12]:
df_combined['hops'].unique()

array([ 4,  2,  3,  6,  5,  9, 10,  7,  8, 16,  1, 12, 11,  0])

In [13]:
# Columns with no missing values
df_combined.columns[~df_combined.isna().any()].tolist()

['label', 'from', 'hops', 'body']

In [14]:
# Columns with missing values
df_combined.columns[df_combined.isna().any()].tolist()

['received1',
 'received2',
 'received3',
 'received4',
 'return-path',
 'in-reply-to',
 'references',
 'mime-version',
 'message-id',
 'date',
 'to',
 'content-disposition',
 'cc',
 'subject',
 'content-type',
 'content-transfer-encoding',
 'sender',
 'content-length',
 'reply-to',
 'importance',
 'received5',
 'received6',
 'received7',
 'received8',
 'received9',
 'mailing-list',
 'delivered-to',
 'received-spf',
 'domainkey-signature',
 'received10',
 'received11',
 'received12',
 'received13',
 'received14',
 'received15',
 'received16']

## Data preprocessing

In [15]:
initial_features_list = df_combined.columns.values.tolist()

print(len(initial_features_list))

label_name = 'label'

final_features_list = ['hops']

40


## **Feature - Handling Missing Values in Features**

In this step, we create binary indicator features for missing values in the initial features list. For each feature in the `initial_features_list`, a new feature is created to indicate whether the value is missing (1) or not missing (0). This is useful for many machine learning models that may benefit from knowing whether a value was imputed.

In [16]:
df_combined.shape

(5078, 40)

In [17]:
missing_feature_names = ['missing_' + name for name in initial_features_list]

for feature, missing_feature in zip(initial_features_list, missing_feature_names):
    df_combined[missing_feature] = df_combined[feature].isnull().astype(int)

final_features_list.extend(missing_feature_names)

In [18]:
df_combined.shape

(5078, 80)

In [19]:
df_combined['missing_from'].value_counts()

missing_from
0    5078
Name: count, dtype: int64

In [20]:
df_combined['missing_to'].value_counts()

missing_to
0    4963
1     115
Name: count, dtype: int64

### Replacing NaN Values in the DF

In this step, we replace all `NaN` values in the `df_combined` DataFrame with empty strings. This operation ensures that there are no `NaN` values in the DataFrame, which can be particularly useful when preparing data for models that do not handle `NaN` values well or when converting the DataFrame to formats that do not support `NaN`.

The code below performs this replacement using the `replace` method from Pandas.

In [21]:
nan_count = df_combined.isna().sum()

print(nan_count)

received1                6
received2              499
received3             2179
received4             3255
label                    0
                      ... 
missing_received12       0
missing_received13       0
missing_received14       0
missing_received15       0
missing_received16       0
Length: 80, dtype: int64


In [22]:
df_combined = df_combined.replace(np.nan, '', regex=True)

## **Feature - String Content Matching**

In [23]:
def str_based_features_add(old_col_name, new_col_names, items_to_check):

  if all(col not in df_combined.columns for col in new_col_names):
    for col_name, item_to_check in zip(new_col_names, items_to_check):
      if item_to_check == '':
        df_combined.loc[(df_combined[old_col_name].str.fullmatch(item_to_check)), col_name] = 1
        df_combined.loc[~(df_combined[old_col_name].str.fullmatch(item_to_check)), col_name] = 0
      else:
        df_combined.loc[(df_combined[old_col_name].str.contains('(?i)' + item_to_check)), col_name] = 1
        df_combined.loc[~(df_combined[old_col_name].str.contains('(?i)' + item_to_check)), col_name] = 0

    final_features_list.extend(new_col_names)

**Received-SPF:**

'Received-SPF': contains 'bad', 'softfail', 'fail'

In [24]:
new_col_names = ['str_received-SPF_bad', 'str_received-SPF_softfail', 
                 'str_received-SPF_fail']
items_to_check = ['bad', 'softfail', 'fail']
feature = 'received-spf'
str_based_features_add(feature, new_col_names, items_to_check)

print(final_features_list)
df_combined.columns

['hops', 'missing_received1', 'missing_received2', 'missing_received3', 'missing_received4', 'missing_label', 'missing_return-path', 'missing_in-reply-to', 'missing_references', 'missing_mime-version', 'missing_message-id', 'missing_from', 'missing_date', 'missing_to', 'missing_content-disposition', 'missing_cc', 'missing_subject', 'missing_content-type', 'missing_content-transfer-encoding', 'missing_sender', 'missing_content-length', 'missing_hops', 'missing_body', 'missing_reply-to', 'missing_importance', 'missing_received5', 'missing_received6', 'missing_received7', 'missing_received8', 'missing_received9', 'missing_mailing-list', 'missing_delivered-to', 'missing_received-spf', 'missing_domainkey-signature', 'missing_received10', 'missing_received11', 'missing_received12', 'missing_received13', 'missing_received14', 'missing_received15', 'missing_received16', 'str_received-SPF_bad', 'str_received-SPF_softfail', 'str_received-SPF_fail']


Index(['received1', 'received2', 'received3', 'received4', 'label',
       'return-path', 'in-reply-to', 'references', 'mime-version',
       'message-id', 'from', 'date', 'to', 'content-disposition', 'cc',
       'subject', 'content-type', 'content-transfer-encoding', 'sender',
       'content-length', 'hops', 'body', 'reply-to', 'importance', 'received5',
       'received6', 'received7', 'received8', 'received9', 'mailing-list',
       'delivered-to', 'received-spf', 'domainkey-signature', 'received10',
       'received11', 'received12', 'received13', 'received14', 'received15',
       'received16', 'missing_received1', 'missing_received2',
       'missing_received3', 'missing_received4', 'missing_label',
       'missing_return-path', 'missing_in-reply-to', 'missing_references',
       'missing_mime-version', 'missing_message-id', 'missing_from',
       'missing_date', 'missing_to', 'missing_content-disposition',
       'missing_cc', 'missing_subject', 'missing_content-type',
       

In [25]:
df_combined['str_received-SPF_fail'].value_counts()

str_received-SPF_fail
0.0    5064
1.0      14
Name: count, dtype: int64

**Received check:**

Detect if any of the 'received' columns contain the word 'forged'. If found, it returns 1; otherwise, it continues checking. A new feature, `received_str_forged`, is created in the DataFrame to store these binary indicators.


In [26]:
def check_received_forged(row):
  num_iters = row['hops']
  col_name_base = 'received'

  for i in range(1, num_iters+1):
    curr_val = row[col_name_base + str(i)]
    if 'forged' in curr_val:
      return 1
    else:
      continue
  return 0

df_combined['received_str_forged'] = df_combined.apply(check_received_forged, axis=1)
final_features_list.append('received_str_forged')

In [27]:
df_combined[df_combined['received_str_forged'] == 1]['label'].value_counts()

label
0    14
1    13
Name: count, dtype: int64

**From:**

'From': contains '?', '!', or '<>'

In [28]:
new_col_names = ['str_from_question', 'str_from_exclam', 'str_from_chevron']
items_to_check = ['\?', '!', '<.+>']
feature = 'from'
str_based_features_add(feature, new_col_names, items_to_check)

**To:**

'To': contains '<>'

In [29]:
new_col_names = ['str_to_chevron', 'str_to_undisclosed', 'str_to_empty']
items_to_check = ['<.+>', 'Undisclosed Recipients', '']
feature = 'to'
str_based_features_add(feature, new_col_names, items_to_check)

**Message-ID:**

'Message-ID': contains '$'

In [30]:
new_col_names = ['str_message-ID_dollar']
items_to_check = ['\$']
feature = 'message-id'
str_based_features_add(feature, new_col_names, items_to_check)

**Return-path:**

'Return-Path': contains 'bounce'

In [31]:
new_col_names = ['str_return-path_bounce', 'str_return-path_empty']
items_to_check = ['bounce', '']
feature = 'return-path'
str_based_features_add(feature, new_col_names, items_to_check)

**Reply-to:**

'Reply-To': contains: '?'

In [32]:
new_col_names = ['str_reply-to_question']
items_to_check = ['\?']
feature = 'reply-to'
str_based_features_add(feature, new_col_names, items_to_check)

## **Feature - Count Based features**



**Hops:**

Hops: The number of received fields in an email.

In [33]:
df_combined['hops'].value_counts()

hops
2     1680
3     1076
4      574
1      493
6      488
5      375
7      114
8      108
10      85
9       54
11      12
12       7
16       6
0        6
Name: count, dtype: int64

In [34]:
df_combined[df_combined['hops'] > 1]['label'].value_counts()

label
0    2521
1    2058
Name: count, dtype: int64

In [35]:
#df_combined['hops'] = df_combined['hops'].apply(lambda x: 0 if x <= 3 else 1)

**Length-from:**

Length-from: The total number of characters in the 'From' field.

In [36]:
def count_chars(field_names, new_col_names):
  for field_name, new_col_name in zip(field_names, new_col_names):
    df_combined[new_col_name] = df_combined[field_name].str.len()

  final_features_list.extend(new_col_names)

fields_to_find_lengths = ['from']
new_col_names_lengths = []

for val in fields_to_find_lengths:
  new_col_names_lengths.append('length_' + val)

count_chars(fields_to_find_lengths, new_col_names_lengths)

In [37]:
df_combined[df_combined['length_from'] > 40]['label'].value_counts()

label
1    1227
0     890
Name: count, dtype: int64

In [38]:
#df_combined['length_from'] = df_combined['length_from'].apply(lambda x: 0 if x > 40 else 1)

**Number of recipients - To**

The number of email addresses specified in the 'To' field.

In [39]:
df_combined['num_recipients_to'] = df_combined.apply(lambda x: len(re.findall(
    r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', x['to'])), axis=1)

In [40]:
df_combined['num_recipients_to'].value_counts()

num_recipients_to
1       4611
2        252
0        194
3         15
4          3
908        1
4544       1
97         1
Name: count, dtype: int64

In [41]:
# df_combined['num_recipients_to'] = df_combined['num_recipients_to'].apply(lambda x: 0 if x == 0 else 1)
# df_combined['num_recipients_to'].value_counts()

In [42]:
final_features_list.append('num_recipients_to')

**Number of recipients - Cc**

The number of email addresses specified in the 'Cc' field.

In [43]:
df_combined['num_recipients_cc'] = df_combined.apply(lambda x: len(re.findall(
    r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', x['cc'])), axis=1)

In [44]:
df_combined['num_recipients_cc'].value_counts()

num_recipients_cc
0    4328
1     524
2     153
3      51
4      11
5       6
7       3
9       1
6       1
Name: count, dtype: int64

In [45]:
# df_combined['num_recipients_cc'] = df_combined['num_recipients_cc'].apply(lambda x: 0 if x == 0 else 1)
# df_combined['num_recipients_cc'].value_counts()

In [46]:
final_features_list.append('num_recipients_cc')

**Number of recipients - From**

The number of email addresses specified in the 'From' field.

In [47]:
df_combined['num_recipients_from'] = df_combined.apply(lambda x: len(re.findall(
    r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', x['cc'])), axis=1)

In [48]:
df_combined['num_recipients_from'].value_counts()

num_recipients_from
0    4328
1     524
2     153
3      51
4      11
5       6
7       3
9       1
6       1
Name: count, dtype: int64

In [49]:
# df_combined['num_recipients_from'] = df_combined['num_recipients_from'].apply(lambda x: 0 if x == 0 else 1)
# df_combined['num_recipients_from'].value_counts()

In [50]:
final_features_list.append('num_recipients_from')

**Number of replies:**

The number of message-ID's contained within the 'References' field. (Each message-ID is enclosed in a pair of '<>').

In [51]:
def extract_num_replies(row):
  references_val = row['references']
  all = re.findall(r'<([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)>', 
                  references_val)
  return len(all)

df_combined['number_replies'] = df_combined.apply(extract_num_replies, axis=1)

In [52]:
df_combined['number_replies'].value_counts()

number_replies
0     3947
1      532
2      259
3      152
4       70
5       41
6       25
7       19
8       15
9        8
10       6
11       2
12       1
14       1
Name: count, dtype: int64

In [53]:
df_combined[df_combined['number_replies'] >= 1]['label'].value_counts()

label
0    1053
1      78
Name: count, dtype: int64

In [54]:
#df_combined['number_replies'] = df_combined['number_replies'].apply(lambda x: 1 if x >= 1 else 0)

In [55]:
final_features_list.append('number_replies')

## **Feature - Comparison Based Features**

Refers to features that are derived by comparing other features.

**Date Validity (Received)**: 

Comparing the 'Date' field and last received's "date" value:

In [56]:
# Assuming df is your DataFrame
received_cols = ['received1', 'received2', 'received3', 'received4', 'received5', 
                 'received6', 'received7', 'received8', 'received9', 'received10']

# Function to get the last non-empty 'received' field in each row
def get_last_received(row):
    # Loop through the columns from the last to the first
    for col in reversed(received_cols):
        if row[col] != '':  # Check if the value is not an empty string
            return row[col]
    return ''  # Return empty string if all fields are empty

In [57]:
def date_received_date_comp(row):
  date_date = row['date']
  date_received = row['last_received_date']

  d1 = email.utils.parsedate_tz(date_date)
  d2 = email.utils.parsedate_tz(date_received)

  if d1 is None or d2 is None:
    return -1

  try:
    val1 = email.utils.mktime_tz(d1)
    val2 = email.utils.mktime_tz(d2)
  except:
    return -1

  return (email.utils.mktime_tz(d2)) - (email.utils.mktime_tz(d1))

In [58]:
df_combined['last_received'] = df_combined.apply(get_last_received, axis=1)

df_combined['last_received_date'] = df_combined['last_received'].str.replace('\n\t', ';').str.split(r';').str[-1]

df_combined['date_comp_date_received'] = df_combined.apply(date_received_date_comp, axis=1)

In [59]:
df_combined[df_combined['date_comp_date_received'] > 0]['date_comp_date_received'].describe()


count    2.383000e+03
mean     2.552471e+03
std      3.321444e+04
min      1.000000e+00
25%      2.000000e+00
50%      1.000000e+01
75%      8.900000e+01
max      1.398147e+06
Name: date_comp_date_received, dtype: float64

In [60]:
final_features_list.append('date_comp_date_received')

**Domain matching**: 

- Message-ID + From
- Message-ID + Return-Path
- Message-ID + Sender
- Message-ID + first Received
- Return-Path + From
- Return-Path + Reply-To
- Reply-To + To
- Error-To + Message-ID
- Error-To + From
- Error-To + Sender
- Sender + From
- Reference + Reply-To
- Reference + InReply-To
- To + last Received
- InReply-To + To
- Reference + To
- Sender + first Received ('from' part of the first Received)
- Return-Path + first Recieved ('from' part of the first Received)
- Reply-To + last Received ('for' part of the last Received)
- InReply-To + last Received ('for' part of the last Received)

Extracting emails and domains:

In [61]:
# emails in brackets '<>' are matched first, and if none, then other emails are matched
def extract_emails(row, col_name):

  in_brackets = re.findall(r'<([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)>', row[col_name])

  if len(in_brackets) == 0:
    not_in_brackets = re.findall(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', row[col_name])
    if len(not_in_brackets) == 0:
      return []
    else:
      return not_in_brackets
  else:
    return in_brackets

def extract_domains(row, col_name):
  emails_list = row[col_name]

  if len(emails_list) == 0:
    return []
  else:
    domains_list = []
    for email in emails_list:
      if len(email.split('.')) < 2:
        continue
      else:
        main_domain = email.split('@')[-1]
        main_domain = main_domain.split('.')[-2:]
        main_domain = main_domain[0] + '.' + re.sub('\W+','', main_domain[1])
        domains_list.append(main_domain.lower())
    return domains_list

In [62]:
emails_from = df_combined.apply(extract_emails, col_name='from', axis=1)
emails_message_id = df_combined.apply(extract_emails, col_name='message-id', axis=1)
emails_return_path = df_combined.apply(extract_emails, col_name='return-path', axis=1)
emails_reply_to = df_combined.apply(extract_emails, col_name='reply-to', axis=1)
#emails_errors_to = df_combined.apply(extract_emails, col_name='errors-to', axis=1)
emails_in_reply_to = df_combined.apply(extract_emails, col_name='in-reply-to', axis=1)
emails_references = df_combined.apply(extract_emails, col_name='references', axis=1)
emails_to = df_combined.apply(extract_emails, col_name='to', axis=1)
emails_cc = df_combined.apply(extract_emails, col_name='cc', axis=1)
emails_sender = df_combined.apply(extract_emails, col_name='sender', axis=1)

In [63]:
emails_df = pd.concat([emails_from, emails_message_id, emails_return_path, 
                        emails_reply_to, emails_in_reply_to, 
                        emails_references, emails_to, emails_cc, emails_sender], axis=1)

emails_df.columns = ['from', 'message-id', 'return-path', 'reply-to',
                     'in-reply-to', 'references', 'to', 'cc', 'sender']

In [64]:
emails_df.columns

Index(['from', 'message-id', 'return-path', 'reply-to', 'in-reply-to',
       'references', 'to', 'cc', 'sender'],
      dtype='object')

In [65]:
emails_df.tail(5)

Unnamed: 0,from,message-id,return-path,reply-to,in-reply-to,references,to,cc,sender
5073,[priemna@leluk.org.ua],[],[priemna@leluk.org.ua],[],[],[],[jose@monkey.org],[],[]
5074,[priemna@leluk.org.ua],[],[priemna@leluk.org.ua],[],[],[],[jose@monkey.org],[],[]
5075,[mailing@ml.ussnews.net],[],[mailing@ml.ussnews.net],[],[],[],[jose@monkey.org],[],[]
5076,[michael502@wsu.edu],[20181228001729.YTRM16524.dukecmfep05.coxmail....,[michael502@wsu.edu],[],[],[],[michael502@wsu.edu],[],[]
5077,[david234@wsu.edu],[20181230001751.LXSQ17347.dukecmfep04.coxmail....,[david234@wsu.edu],[],[],[],[david234@wsu.edu],[],[]


In [66]:
domains_from = emails_df.apply(extract_domains, col_name='from', axis=1)
domains_message_id = emails_df.apply(extract_domains, col_name='message-id', axis=1)
domains_return_path = emails_df.apply(extract_domains, col_name='return-path', axis=1)
domains_reply_to = emails_df.apply(extract_domains, col_name='reply-to', axis=1)
#domains_errors_to = emails_df.apply(extract_domains, col_name='errors-to', axis=1)
domains_in_reply_to = emails_df.apply(extract_domains, col_name='in-reply-to', axis=1)
domains_references = emails_df.apply(extract_domains, col_name='references', axis=1)
domains_to = emails_df.apply(extract_domains, col_name='to', axis=1)
domains_cc = emails_df.apply(extract_domains, col_name='cc', axis=1)
domains_sender = emails_df.apply(extract_domains, col_name='sender', axis=1)

domains_df = pd.concat([domains_from, domains_message_id, domains_return_path, 
                        domains_reply_to, domains_in_reply_to, 
                        domains_references, domains_to, domains_cc, domains_sender], axis=1)

domains_df.columns = ['from_domains', 'message-id_domains', 'return-path_domains', 'reply-to_domains',
                     'in-reply-to_domains', 'references_domains', 'to_domains', 'cc_domains', 'sender_domains']
                    
# domains_df.set_axis(['from_domains', 'message-id_domains', 'return-path_domains', 'reply-to_domains',
#                      'in-reply-to_domains', 'references_domains', 'to_domains', 'cc_domains', 'sender_domains'], 
#                     axis=1, inplace=True)

In [67]:
def email_same_check(row, first_col, second_col):
  vals1 = row[first_col]
  vals2 = row[second_col]

  for val1 in vals1:
    for val2 in vals2:
      if val1 == val2:
        return 1

  return 0

emails_to_check = [('from', 'reply-to'), ('from', 'return-path'),]

for val in emails_to_check:
  first_field = val[0]
  second_field = val[1]
  new_col_name = 'email_match_' + first_field + '_' + second_field

  df_combined[new_col_name] = emails_df.apply(email_same_check, first_col=first_field, 
                  second_col=second_field, axis=1)
  final_features_list.append(new_col_name)

In [68]:
# Returns 0 if no matches, 1 if at least one match
def domain_match_check(row, first_col, second_col):

  first_domain_list = row[first_col]
  second_domain_list = row[second_col]

  if len(first_domain_list) == 0 or len(second_domain_list) == 0:
    return 0
  else:
    for d1 in first_domain_list:
      for d2 in second_domain_list:
        if d1 == d2:
          return 1
    return 0
  
domain_fields_to_check = [('message-id_domains', 'from_domains'), ('from_domains', 'return-path_domains'), ('message-id_domains', 'return-path_domains'), ('message-id_domains', 'sender_domains'), ('message-id_domains', 'reply-to_domains'),
                          ('return-path_domains', 'reply-to_domains'), ('reply-to_domains', 'to_domains'), ('to_domains', 'in-reply-to_domains'), ('sender_domains', 'from_domains'), ('references_domains', 'reply-to_domains'), ('references_domains', 'in-reply-to_domains'), ('references_domains', 'to_domains'), ('from_domains', 'reply-to_domains'),
                          ('to_domains', 'from_domains'), ('to_domains', 'message-id_domains')]

for val in domain_fields_to_check:
  first_field = val[0].replace('_domains', '')
  second_field = val[1].replace('_domains', '')
  new_col_name = 'domain_match_' + first_field + '_' + second_field 

  df_combined[new_col_name] = domains_df.apply(domain_match_check, first_col = val[0], 
                              second_col= val[1], axis=1)
  final_features_list.append(new_col_name)

  df_combined[new_col_name] = domains_df.apply(domain_match_check, first_col = val[0],
  df_combined[new_col_name] = domains_df.apply(domain_match_check, first_col = val[0],
  df_combined[new_col_name] = domains_df.apply(domain_match_check, first_col = val[0],
  df_combined[new_col_name] = domains_df.apply(domain_match_check, first_col = val[0],
  df_combined[new_col_name] = domains_df.apply(domain_match_check, first_col = val[0],
  df_combined[new_col_name] = domains_df.apply(domain_match_check, first_col = val[0],
  df_combined[new_col_name] = domains_df.apply(domain_match_check, first_col = val[0],
  df_combined[new_col_name] = domains_df.apply(domain_match_check, first_col = val[0],
  df_combined[new_col_name] = domains_df.apply(domain_match_check, first_col = val[0],
  df_combined[new_col_name] = domains_df.apply(domain_match_check, first_col = val[0],
  df_combined[new_col_name] = domains_df.apply(domain_match_check, first_col = val[0],
  df_combined[new_col_name] = domains_df.ap

**Receieved Consequentive Checks**:

Consequentive Received fields: checks if consequentive received fields have a matching domain for the 'by' of the eariler received field and the 'from' part of the later one. Value is 1 if there is an issue, otherwise 0 if all match.

## **Feature - External lookup features**

Refers to features that cannot be calculated offline, such as features requiring DNS queries.

In [69]:
import requests
import dns.resolver

# Function to extract IP addresses from the 'Received' fields of a single row (email)
def extract_ips_from_row(row):
    # Regular expression to capture IP addresses in Received headers
    ip_regex = re.compile(r'\[(\d{1,3}(?:\.\d{1,3}){3})\]')

    # List to hold all extracted IPs
    ips = []

    # Iterate over the received columns for the row
    for col in row.index:
        if 'received' in col:
            # Extract IPs from the current 'received' field
            ips += ip_regex.findall(str(row[col]))

    # Remove duplicates
    return list(set(ips))

# Function to check if any IP in a row is blacklisted
def check_row_for_blacklist(row):
    ips = extract_ips_from_row(row)
    
    if not ips:
        return 0

    servers_blacklisted = 0
    
    # Check each IP and return 1 if any IP is blacklisted
    for ip in ips:
        if check_ip_spamhaus(ip):
            servers_blacklisted += 1

    return servers_blacklisted

def extract_host_ip(row):
    # Regular expression to capture IP addresses in Received headers
    ip_regex = re.compile(r'\[(\d{1,3}(?:\.\d{1,3}){3})\]')

    host = row['received1']
    # Extract IP from the host 'received' field
    host_ip = ip_regex.findall(str(host))
    return host_ip

# Function to check if any IP in a row is blacklisted
def check_host_ip(row):
    host_ip = extract_host_ip(row)
    
    if not host_ip:
        return 0
    
    # Check each IP and return 1 if IP is blacklisted
    if check_ip_spamhaus(host_ip[0]):
        return 1
    return 0

# Function to check if an IP is blacklisted using Spamhaus DNSBL
def check_ip_spamhaus(ip):
    reverse_ip = '.'.join(reversed(ip.split('.')))
    query = f"{reverse_ip}.zen.spamhaus.org"
    
    try:
        # Perform a DNS query to check if the IP is blacklisted
        answers = dns.resolver.resolve(query, 'A')
        print(f"IP {ip} is blacklisted by Spamhaus.")
        return True
    except dns.resolver.NXDOMAIN:
        print(f"IP {ip} is NOT blacklisted by Spamhaus.")
        return False
    except Exception as e:
        print(f"Error querying Spamhaus for IP {ip}: {str(e)}")
        return False

In [70]:
# Apply the blacklist check for each row (email) and store the result in a new column
df_combined['num_servers_blacklisted'] = df_combined.apply(lambda row: check_row_for_blacklist(row), axis=1)

IP 129.132.145.15 is NOT blacklisted by Spamhaus.
IP 145.18.153.42 is NOT blacklisted by Spamhaus.
IP 146.50.98.70 is NOT blacklisted by Spamhaus.
IP 64.236.25.87 is blacklisted by Spamhaus.
IP 66.37.210.87 is NOT blacklisted by Spamhaus.
IP 87.165.137.180 is blacklisted by Spamhaus.
IP 212.227.126.188 is NOT blacklisted by Spamhaus.
IP 192.168.178.21 is NOT blacklisted by Spamhaus.
IP 129.100.249.132 is NOT blacklisted by Spamhaus.
IP 127.0.0.1 is NOT blacklisted by Spamhaus.
IP 170.224.5.38 is blacklisted by Spamhaus.
IP 129.132.145.15 is NOT blacklisted by Spamhaus.
IP 66.249.82.229 is NOT blacklisted by Spamhaus.
IP 167.206.4.198 is NOT blacklisted by Spamhaus.
IP 10.240.3.196 is NOT blacklisted by Spamhaus.
IP 10.240.4.136 is NOT blacklisted by Spamhaus.
IP 129.132.145.15 is NOT blacklisted by Spamhaus.
IP 63.251.223.186 is NOT blacklisted by Spamhaus.
IP 154.20.156.154 is blacklisted by Spamhaus.
IP 127.0.0.1 is NOT blacklisted by Spamhaus.
IP 66.70.73.150 is NOT blacklisted by S

  df_combined['num_servers_blacklisted'] = df_combined.apply(lambda row: check_row_for_blacklist(row), axis=1)


In [71]:
df_combined['host_blacklisted'] = df_combined.apply(lambda row: check_host_ip(row), axis=1)

IP 129.132.145.15 is NOT blacklisted by Spamhaus.
IP 64.236.25.87 is blacklisted by Spamhaus.
IP 66.37.210.87 is NOT blacklisted by Spamhaus.
IP 129.100.249.132 is NOT blacklisted by Spamhaus.
IP 170.224.5.38 is blacklisted by Spamhaus.
IP 129.132.145.15 is NOT blacklisted by Spamhaus.
IP 129.132.145.15 is NOT blacklisted by Spamhaus.
IP 63.251.223.186 is NOT blacklisted by Spamhaus.
IP 66.70.73.150 is NOT blacklisted by Spamhaus.
IP 69.28.153.197 is NOT blacklisted by Spamhaus.
IP 64.233.162.176 is NOT blacklisted by Spamhaus.
IP 66.70.73.150 is NOT blacklisted by Spamhaus.
IP 66.70.73.150 is NOT blacklisted by Spamhaus.
IP 129.132.145.15 is NOT blacklisted by Spamhaus.
IP 129.132.145.15 is NOT blacklisted by Spamhaus.
IP 63.251.223.186 is NOT blacklisted by Spamhaus.
IP 129.132.145.15 is NOT blacklisted by Spamhaus.
IP 63.251.223.186 is NOT blacklisted by Spamhaus.
IP 66.70.73.150 is NOT blacklisted by Spamhaus.
IP 129.132.145.15 is NOT blacklisted by Spamhaus.
IP 8.15.20.150 is NOT 

  df_combined['host_blacklisted'] = df_combined.apply(lambda row: check_host_ip(row), axis=1)


In [72]:
df_combined[df_combined['num_servers_blacklisted'] == 1]['label'].value_counts()

label
0    474
1    404
Name: count, dtype: int64

In [73]:
df_combined[df_combined['host_blacklisted'] == 0]['label'].value_counts()

label
1    2410
0    2284
Name: count, dtype: int64

In [74]:
final_features_list.append('num_servers_blacklisted')
final_features_list.append('host_blacklisted')

## Processed data to a file

In [75]:
for item in final_features_list:
  print(item)
  print(df_combined[item].value_counts())
  print('\n')

hops
hops
2     1680
3     1076
4      574
1      493
6      488
5      375
7      114
8      108
10      85
9       54
11      12
12       7
16       6
0        6
Name: count, dtype: int64


missing_received1
missing_received1
0    5072
1       6
Name: count, dtype: int64


missing_received2
missing_received2
0    4579
1     499
Name: count, dtype: int64


missing_received3
missing_received3
0    2899
1    2179
Name: count, dtype: int64


missing_received4
missing_received4
1    3255
0    1823
Name: count, dtype: int64


missing_label
missing_label
0    5078
Name: count, dtype: int64


missing_return-path
missing_return-path
0    5072
1       6
Name: count, dtype: int64


missing_in-reply-to
missing_in-reply-to
1    3922
0    1156
Name: count, dtype: int64


missing_references
missing_references
1    3892
0    1186
Name: count, dtype: int64


missing_mime-version
missing_mime-version
0    4695
1     383
Name: count, dtype: int64


missing_message-id
missing_message-id
0    4730
1     

In [76]:
df_combined[df_combined['str_to_chevron'] == 0]['label'].value_counts()

label
1    1774
0    1382
Name: count, dtype: int64

In [77]:
df_combined.shape

(5078, 121)

In [78]:
final_features_list

['hops',
 'missing_received1',
 'missing_received2',
 'missing_received3',
 'missing_received4',
 'missing_label',
 'missing_return-path',
 'missing_in-reply-to',
 'missing_references',
 'missing_mime-version',
 'missing_message-id',
 'missing_from',
 'missing_date',
 'missing_to',
 'missing_content-disposition',
 'missing_cc',
 'missing_subject',
 'missing_content-type',
 'missing_content-transfer-encoding',
 'missing_sender',
 'missing_content-length',
 'missing_hops',
 'missing_body',
 'missing_reply-to',
 'missing_importance',
 'missing_received5',
 'missing_received6',
 'missing_received7',
 'missing_received8',
 'missing_received9',
 'missing_mailing-list',
 'missing_delivered-to',
 'missing_received-spf',
 'missing_domainkey-signature',
 'missing_received10',
 'missing_received11',
 'missing_received12',
 'missing_received13',
 'missing_received14',
 'missing_received15',
 'missing_received16',
 'str_received-SPF_bad',
 'str_received-SPF_softfail',
 'str_received-SPF_fail',
 're

In [79]:
df_combined[df_combined['missing_domainkey-signature'] == 0]['label'].value_counts()

label
0    170
1     81
Name: count, dtype: int64

In [80]:
# Removes missing_received fields since encode the same information as 'hops'.
# The other removed features have only one value,
# or a strong majority towards one value.
remove_list = ['missing_received1', 'missing_received2', 'missing_received3',
 'missing_received4', 'missing_received5', 'missing_received6',
 'missing_received7', 'missing_received8', 'missing_received9', 'missing_received10',
 'missing_x-msmail-priority', 'missing_x-beenthere', 'missing_x-virus-status', 'missing_x-spam-level', 'missing_x-spam-checker-version', 'missing_x-mailer', 'missing_x-priority',
# 'missing_subject', 'missing_date', 'missing_message-id', 'missing_from',
# 'missing_return-path', 'missing_to', 'missing_content_type', 'missing_mime-version',
#  'missing_x-mimeole', 'missing_x-priority', 'missing_x-virus-scanned', 
#  'missing_content-length', 'missing_delivered-to', 'missing_list-post',
#  'missing_list-help', 'missing_list-archive',
#  'missing_references', 'missing_in-reply-to', 'missing_user-agent', 'missing_thread-index', 'missing_cc',
#  'missing_content-disposition', 'missing_mailing-list', 'missing_domainkey-signature', 'missing_importance', 
#  'str_from_chevron', 'str_to_undisclosed', 'str_return-path_empty',
#  'str_from_exclam', 'str_reply-to_question', 'str_message-ID_dollar', 'str_received-SPF_bad', 
#  'str_received-SPF_softfail', 'str_received-SPF_fail', 'str_reply-to_question', 
#  'num_recipients_from'
]

for v in remove_list:
  if v in final_features_list:
    final_features_list.remove(v)
    
final_features_list.append('label')

In [81]:
duplicates = df_combined.duplicated().sum()
print(f"Number of duplicates: {duplicates}")

Number of duplicates: 0


In [82]:
len(final_features_list)

71

In [83]:
df_combined.shape

(5078, 121)

In [84]:
df_final = df_combined[final_features_list]

In [85]:
duplicates = df_final.duplicated().sum()
print(f"Number of duplicates: {duplicates}")

Number of duplicates: 805


In [86]:
df_final.shape

(5078, 71)

In [87]:
df_final.to_csv('preprocessed_data.csv', index=False)