## Import data 

In [1]:
import numpy as np
import pandas as pd
import re
import time
import email.utils

In [2]:
# Load the phishing emails
source_directory = '../../Data/JosePhishingEmails/extracted/'

PHISHING_FILE = source_directory + 'extracted_headers.csv'

df_phishing = pd.read_csv(PHISHING_FILE)

df_phishing.head(1)

df_phishing = df_phishing.drop_duplicates()

In [3]:
# Load the non-phishing emails
HAM_FILE = '../../Data/HamEmails/extracted/extracted_2544_headers.csv'

df_ham = pd.read_csv(HAM_FILE)

df_ham.head(1)

Unnamed: 0,received1,received2,received3,received4,hops,subject,date,message-id,from,return-path,...,list-id,precedence,delivered-to,list-unsubscribe,sender,reply-to,received-spf,x-original-to,content-disposition,label
0,from mda3cf.sendnode.com (mda3cf.sendnode.com ...,,,,1,=?UTF-8?Q?Bitte_best=C3=A4tigen_Sie_Ihr_Newsle...,"Thu, 27 Aug 2020 18:54:29 +0200",<6cc.4.199@sendnode.com>,=?UTF-8?Q?Sparkasse_zu_L=C3=BCbeck_AG?=\n <new...,<hbbj.d.afbi=bounce@bounces.sendnode.com>,...,<201c.4.sendnode.com>,,mailbox@privacy-mail.org,,,<newsletter@sparkasse-luebeck.de>,pass (mx2e90: domain of bounces.sendnode.com d...,christopher.casto@privacy-mail.org,,0


In [4]:
# DataFrame with the most columns and the column count
dfs = {
    'df': df_ham,
    'df_phishing': df_phishing,
}
max_columns_name, max_columns_df = max(dfs.items(), key=lambda x: x[1].shape[1])

print(f"The DataFrame with the most columns is {max_columns_name} with {max_columns_df.shape[1]} columns.")

The DataFrame with the most columns is df_phishing with 52 columns.


In [5]:
# Ensure all DataFrames have the same columns as df2015, adding missing ones with NaN values
dfs_aligned = [d[1].reindex(columns=df_phishing.columns, fill_value=np.nan) for d in dfs.items()]

# Concatenate all DataFrames into one, ignoring the original index
df_combined = pd.concat(dfs_aligned, ignore_index=True)

df_combined.tail(1)

Unnamed: 0,received1,received2,received3,received4,received5,received6,received7,received8,received9,received10,...,in-reply-to,user-agent,thread-index,cc,received-spf,content-disposition,mailing-list,domainkey-signature,importance,label
5077,from omr2.cc.vt.edu (outbound.smtp.vt.edu [198...,from mr5.cc.vt.edu (mr5.cc.ipv6.vt.edu [IPv6:2...,from Unknown (h80ad7345.dhcp.vt.edu [128.173.1...,,,,,,,,...,,,,,,,,,,1


## Dataset Information

In [6]:
df_combined['label'].value_counts()

0    2544
1    2534
Name: label, dtype: int64

In [7]:
df_combined.columns.values

array(['received1', 'received2', 'received3', 'received4', 'received5',
       'received6', 'received7', 'received8', 'received9', 'received10',
       'hops', 'subject', 'date', 'message-id', 'from', 'return-path',
       'to', 'content-type', 'mime-version', 'x-mailer',
       'content-transfer-encoding', 'x-mimeole', 'x-priority', 'list-id',
       'x-virus-scanned', 'status', 'content-length', 'precedence',
       'delivered-to', 'list-unsubscribe', 'list-post', 'list-help',
       'x-msmail-priority', 'x-spam-status', 'sender', 'x-beenthere',
       'list-archive', 'reply-to', 'x-virus-status', 'x-spam-level',
       'x-spam-checker-version', 'references', 'in-reply-to',
       'user-agent', 'thread-index', 'cc', 'received-spf',
       'content-disposition', 'mailing-list', 'domainkey-signature',
       'importance', 'label'], dtype=object)

In [8]:
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5078 entries, 0 to 5077
Data columns (total 52 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   received1                  5072 non-null   object 
 1   received2                  3327 non-null   object 
 2   received3                  992 non-null    object 
 3   received4                  400 non-null    object 
 4   received5                  215 non-null    object 
 5   received6                  155 non-null    object 
 6   received7                  45 non-null     object 
 7   received8                  15 non-null     object 
 8   received9                  4 non-null      object 
 9   received10                 1 non-null      object 
 10  hops                       5078 non-null   int64  
 11  subject                    5073 non-null   object 
 12  date                       5023 non-null   object 
 13  message-id                 4730 non-null   objec

In [9]:
df_combined.describe()

Unnamed: 0,hops,content-length,label
count,5078.0,2.0,5078.0
mean,2.013785,276635.0,0.499015
std,1.153651,205906.666254,0.500048
min,0.0,131037.0,0.0
25%,1.0,203836.0,0.0
50%,2.0,276635.0,0.0
75%,2.0,349434.0,1.0
max,10.0,422233.0,1.0


In [10]:
df_combined.nunique()

received1                    5072
received2                    3326
received3                     992
received4                     400
received5                     215
received6                     155
received7                      45
received8                      15
received9                       4
received10                      1
hops                           11
subject                      4134
date                         4766
message-id                   4725
from                         2200
return-path                  4092
to                            451
content-type                 3740
mime-version                    5
x-mailer                      161
content-transfer-encoding       7
x-mimeole                      20
x-priority                      8
list-id                        21
x-virus-scanned               149
status                          3
content-length                  2
precedence                      2
delivered-to                    6
list-unsubscri

In [11]:
df_combined['hops'].unique()

array([ 1,  2,  3,  4,  6,  5,  8,  0,  9,  7, 10])

In [12]:
# Columns with no missing values
df_combined.columns[~df_combined.isna().any()].tolist()

['hops', 'from', 'label']

In [13]:
# Columns with missing values
df_combined.columns[df_combined.isna().any()].tolist()

['received1',
 'received2',
 'received3',
 'received4',
 'received5',
 'received6',
 'received7',
 'received8',
 'received9',
 'received10',
 'subject',
 'date',
 'message-id',
 'return-path',
 'to',
 'content-type',
 'mime-version',
 'x-mailer',
 'content-transfer-encoding',
 'x-mimeole',
 'x-priority',
 'list-id',
 'x-virus-scanned',
 'status',
 'content-length',
 'precedence',
 'delivered-to',
 'list-unsubscribe',
 'list-post',
 'list-help',
 'x-msmail-priority',
 'x-spam-status',
 'sender',
 'x-beenthere',
 'list-archive',
 'reply-to',
 'x-virus-status',
 'x-spam-level',
 'x-spam-checker-version',
 'references',
 'in-reply-to',
 'user-agent',
 'thread-index',
 'cc',
 'received-spf',
 'content-disposition',
 'mailing-list',
 'domainkey-signature',
 'importance']

## Data preprocessing

In [14]:
initial_features_list = ['received1', 'received2', 'received3', 'received4', 'received5',
       'received6', 'received7', 'received8', 'subject', 'date',
       'message-id', 'from', 'return-path', 'to', 'content-type',
       'mime-version', 'x-mailer', 'content-transfer-encoding',
       'x-mimeole', 'x-priority', 'list-id', 'x-virus-scanned', 'status',
       'content-length', 'precedence', 'delivered-to', 'list-unsubscribe',
       'list-post', 'list-help', 'x-msmail-priority', 'x-spam-status',
       'sender', 'x-beenthere', 'list-archive', 'reply-to',
       'x-virus-status', 'x-spam-level', 'x-spam-checker-version',
       'references', 'in-reply-to', 'user-agent', 'thread-index', 'cc',
       'received-spf', 'content-disposition', 'mailing-list',
       'domainkey-signature', 'importance']

label_name = 'label'

final_features_list = ['hops']

## **Feature - Handling Missing Values in Features**

In this step, we create binary indicator features for missing values in the initial features list. For each feature in the `initial_features_list`, a new feature is created to indicate whether the value is missing (1) or not missing (0). This is useful for many machine learning models that may benefit from knowing whether a value was imputed.

In [15]:
df_combined.shape

(5078, 52)

In [16]:
missing_feature_names = ['missing_' + name for name in initial_features_list]

for feature, missing_feature in zip(initial_features_list, missing_feature_names):
    df_combined[missing_feature] = df_combined[feature].isnull().astype(int)

final_features_list.extend(missing_feature_names)

In [17]:
df_combined.shape

(5078, 100)

In [18]:
df_combined['missing_from'].value_counts()

0    5078
Name: missing_from, dtype: int64

In [19]:
df_combined['missing_to'].value_counts()

0    4965
1     113
Name: missing_to, dtype: int64

### Replacing NaN Values in the DF

In this step, we replace all `NaN` values in the `df_combined` DataFrame with empty strings. This operation ensures that there are no `NaN` values in the DataFrame, which can be particularly useful when preparing data for models that do not handle `NaN` values well or when converting the DataFrame to formats that do not support `NaN`.

The code below performs this replacement using the `replace` method from Pandas.

In [20]:
df_combined = df_combined.replace(np.nan, '', regex=True)

## **Feature - String Content Matching**

In [21]:
def str_based_features_add(old_col_name, new_col_names, items_to_check):

  for col_name, item_to_check in zip(new_col_names, items_to_check):
    if item_to_check == '':
      df_combined.loc[(df_combined[old_col_name].str.fullmatch(item_to_check)), col_name] = 1
      df_combined.loc[~(df_combined[old_col_name].str.fullmatch(item_to_check)), col_name] = 0
    else:
      df_combined.loc[(df_combined[old_col_name].str.contains('(?i)' + item_to_check)), col_name] = 1
      df_combined.loc[~(df_combined[old_col_name].str.contains('(?i)' + item_to_check)), col_name] = 0

  final_features_list.extend(new_col_names)

**Received-SPF:**

'Received-SPF': contains 'bad', 'softfail', 'fail'

In [22]:
new_col_names = ['str_received-SPF_bad', 'str_received-SPF_softfail', 
                 'str_received-SPF_fail']
items_to_check = ['bad', 'softfail', 'fail']
feature = 'received-spf'
str_based_features_add(feature, new_col_names, items_to_check)

In [23]:
df_combined['str_received-SPF_softfail'].value_counts()

0.0    5075
1.0       3
Name: str_received-SPF_softfail, dtype: int64

**Received check:**

Detect if any of the 'received' columns contain the word 'forged'. If found, it returns 1; otherwise, it continues checking. A new feature, `received_str_forged`, is created in the DataFrame to store these binary indicators.


In [24]:
def check_received_forged(row):
  num_iters = row['hops']
  col_name_base = 'received'

  for i in range(1, num_iters+1):
    curr_val = row[col_name_base + str(i)]
    if 'forged' in curr_val:
      return 1
    else:
      continue
  return 0

df_combined['received_str_forged'] = df_combined.apply(check_received_forged, axis=1)
final_features_list.append('received_str_forged')

In [25]:
df_combined[df_combined['received_str_forged'] == 1]['label'].value_counts()

1    13
Name: label, dtype: int64

**From:**

'From': contains '?', '!', or '<>'

In [26]:
new_col_names = ['str_from_question', 'str_from_exclam', 'str_from_chevron']
items_to_check = ['\?', '!', '<.+>']
feature = 'from'
str_based_features_add(feature, new_col_names, items_to_check)

**To:**

'To': contains '<>'

In [27]:
new_col_names = ['str_to_chevron', 'str_to_undisclosed', 'str_to_empty']
items_to_check = ['<.+>', 'Undisclosed Recipients', '']
feature = 'to'
str_based_features_add(feature, new_col_names, items_to_check)

**Message-ID:**

'Message-ID': contains '$'

In [28]:
new_col_names = ['str_message-ID_dollar']
items_to_check = ['\$']
feature = 'message-id'
str_based_features_add(feature, new_col_names, items_to_check)

**Return-path:**

'Return-Path': contains 'bounce'

In [29]:
new_col_names = ['str_return-path_bounce', 'str_return-path_empty']
items_to_check = ['bounce', '']
feature = 'return-path'
str_based_features_add(feature, new_col_names, items_to_check)

**Reply-to:**

'Reply-To': contains: '?'

In [30]:
new_col_names = ['str_reply-to_question']
items_to_check = ['\?']
feature = 'reply-to'
str_based_features_add(feature, new_col_names, items_to_check)

## **Feature - Count Based features**



**Hops:**

Hops: The number of received fields in an email.

In [31]:
df_combined['hops'].value_counts()

2     2335
1     1745
3      592
4      185
6      110
5       60
7       30
8       11
0        6
9        3
10       1
Name: hops, dtype: int64

In [32]:
df_combined[df_combined['hops'] > 3]['label'].value_counts()

1    398
0      2
Name: label, dtype: int64

In [33]:
df_combined['hops'] = df_combined['hops'].apply(lambda x: 0 if x <= 3 else 1)

**Length-from:**

Length-from: The total number of characters in the 'From' field.

In [34]:
def count_chars(field_names, new_col_names):
  for field_name, new_col_name in zip(field_names, new_col_names):
    df_combined[new_col_name] = df_combined[field_name].str.len()

  final_features_list.extend(new_col_names)

fields_to_find_lengths = ['from']
new_col_names_lengths = []

for val in fields_to_find_lengths:
  new_col_names_lengths.append('length_' + val)

count_chars(fields_to_find_lengths, new_col_names_lengths)

In [35]:
df_combined[df_combined['length_from'] > 40]['label'].value_counts()

1    1227
0     929
Name: label, dtype: int64

In [36]:
df_combined['length_from'] = df_combined['length_from'].apply(lambda x: 0 if x > 40 else 1)

**Number of recipients - To**

The number of email addresses specified in the 'To' field.

In [37]:
df_combined['num_recipients_to'] = df_combined.apply(lambda x: len(re.findall(
    r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', x['to'])), axis=1)

In [38]:
df_combined['num_recipients_to'].value_counts()

1       4774
0        181
2        120
97         1
908        1
4544       1
Name: num_recipients_to, dtype: int64

In [39]:
df_combined['num_recipients_to'] = df_combined['num_recipients_to'].apply(lambda x: 0 if x == 0 else 1)
df_combined['num_recipients_to'].value_counts()

1    4897
0     181
Name: num_recipients_to, dtype: int64

In [40]:
final_features_list.append('num_recipients_to')

**Number of recipients - Cc**

The number of email addresses specified in the 'Cc' field.

In [41]:
df_combined['num_recipients_cc'] = df_combined.apply(lambda x: len(re.findall(
    r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', x['cc'])), axis=1)

In [42]:
df_combined['num_recipients_cc'].value_counts()

0    5045
3      22
1       7
2       2
5       2
Name: num_recipients_cc, dtype: int64

In [43]:
df_combined['num_recipients_cc'] = df_combined['num_recipients_cc'].apply(lambda x: 0 if x == 0 else 1)
df_combined['num_recipients_cc'].value_counts()

0    5045
1      33
Name: num_recipients_cc, dtype: int64

In [44]:
final_features_list.append('num_recipients_cc')

**Number of recipients - From**

The number of email addresses specified in the 'From' field.

In [45]:
df_combined['num_recipients_from'] = df_combined.apply(lambda x: len(re.findall(
    r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', x['cc'])), axis=1)

In [46]:
df_combined['num_recipients_from'].value_counts()

0    5045
3      22
1       7
2       2
5       2
Name: num_recipients_from, dtype: int64

In [47]:
df_combined['num_recipients_from'] = df_combined['num_recipients_from'].apply(lambda x: 0 if x == 0 else 1)
df_combined['num_recipients_from'].value_counts()

0    5045
1      33
Name: num_recipients_from, dtype: int64

In [48]:
final_features_list.append('num_recipients_from')

**Number of replies:**

The number of message-ID's contained within the 'References' field. (Each message-ID is enclosed in a pair of '<>').

In [49]:
def extract_num_replies(row):
  references_val = row['references']
  all = re.findall(r'<([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)>', 
                  references_val)
  return len(all)

df_combined['number_replies'] = df_combined.apply(extract_num_replies, axis=1)

In [50]:
df_combined['number_replies'].value_counts()

0     5000
1       61
2        9
3        4
8        2
14       1
4        1
Name: number_replies, dtype: int64

In [51]:
df_combined[df_combined['number_replies'] >= 1]['label'].value_counts()

1    78
Name: label, dtype: int64

In [52]:
df_combined['number_replies'] = df_combined['number_replies'].apply(lambda x: 1 if x >= 1 else 0)

In [53]:
final_features_list.append('number_replies')

## **Feature - Comparison Based Features**

Refers to features that are derived by comparing other features.

**Date Validity (Received)**: 

Comparing the 'Date' field and last received's "date" value:

In [54]:
# def date_received_date_comp(row):
#   date_date = row['date']
#   date_received = row['last_received_date']

#   d1 = email.utils.parsedate_tz(date_date)
#   d2 = email.utils.parsedate_tz(date_received)

#   if d1 is None or d2 is None:
#     return -1

#   try:
#     val1 = email.utils.mktime_tz(d1)
#     val2 = email.utils.mktime_tz(d2)
#   except:
#     return -1

#   return (email.utils.mktime_tz(d2)) - (email.utils.mktime_tz(d1))

In [55]:
# df_combined['date_comp_date_received'] = df_combined.apply(date_received_date_comp, axis=1)

In [56]:
# df_combined[df_combined['date_comp_date_received'] > 0]['date_comp_date_received'].describe()


**Domain matching**: 

- Message-ID + From
- Message-ID + Return-Path
- Message-ID + Sender
- Message-ID + first Received
- Return-Path + From
- Return-Path + Reply-To
- Reply-To + To
- Error-To + Message-ID
- Error-To + From
- Error-To + Sender
- Sender + From
- Reference + Reply-To
- Reference + InReply-To
- To + last Received
- InReply-To + To
- Reference + To
- Sender + first Received ('from' part of the first Received)
- Return-Path + first Recieved ('from' part of the first Received)
- Reply-To + last Received ('for' part of the last Received)
- InReply-To + last Received ('for' part of the last Received)

Extracting emails and domains:

In [57]:
# emails in brackets '<>' are matched first, and if none, then other emails are matched
def extract_emails(row, col_name):

  in_brackets = re.findall(r'<([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)>', row[col_name])

  if len(in_brackets) == 0:
    not_in_brackets = re.findall(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', row[col_name])
    if len(not_in_brackets) == 0:
      return []
    else:
      return not_in_brackets
  else:
    return in_brackets

def extract_domains(row, col_name):
  emails_list = row[col_name]

  if len(emails_list) == 0:
    return []
  else:
    domains_list = []
    for email in emails_list:
      if len(email.split('.')) < 2:
        continue
      else:
        main_domain = email.split('@')[-1]
        main_domain = main_domain.split('.')[-2:]
        main_domain = main_domain[0] + '.' + re.sub('\W+','', main_domain[1])
        domains_list.append(main_domain.lower())
    return domains_list

In [58]:
emails_from = df_combined.apply(extract_emails, col_name='from', axis=1)
emails_message_id = df_combined.apply(extract_emails, col_name='message-id', axis=1)
emails_return_path = df_combined.apply(extract_emails, col_name='return-path', axis=1)
emails_reply_to = df_combined.apply(extract_emails, col_name='reply-to', axis=1)
#emails_errors_to = df_combined.apply(extract_emails, col_name='errors-to', axis=1)
emails_in_reply_to = df_combined.apply(extract_emails, col_name='in-reply-to', axis=1)
emails_references = df_combined.apply(extract_emails, col_name='references', axis=1)
emails_to = df_combined.apply(extract_emails, col_name='to', axis=1)
emails_cc = df_combined.apply(extract_emails, col_name='cc', axis=1)
emails_sender = df_combined.apply(extract_emails, col_name='sender', axis=1)

emails_df = pd.concat([emails_from, emails_message_id, emails_return_path, 
                        emails_reply_to, emails_in_reply_to, 
                        emails_references, emails_to, emails_cc, emails_sender], axis=1)
emails_df.set_axis(['from', 'message-id', 'return-path', 'reply-to',
                     'in-reply-to', 'references', 'to', 'cc', 'sender'], 
                    axis=1, inplace=True)

In [59]:
domains_from = emails_df.apply(extract_domains, col_name='from', axis=1)
domains_message_id = emails_df.apply(extract_domains, col_name='message-id', axis=1)
domains_return_path = emails_df.apply(extract_domains, col_name='return-path', axis=1)
domains_reply_to = emails_df.apply(extract_domains, col_name='reply-to', axis=1)
#domains_errors_to = emails_df.apply(extract_domains, col_name='errors-to', axis=1)
domains_in_reply_to = emails_df.apply(extract_domains, col_name='in-reply-to', axis=1)
domains_references = emails_df.apply(extract_domains, col_name='references', axis=1)
domains_to = emails_df.apply(extract_domains, col_name='to', axis=1)
domains_cc = emails_df.apply(extract_domains, col_name='cc', axis=1)
domains_sender = emails_df.apply(extract_domains, col_name='sender', axis=1)

domains_df = pd.concat([domains_from, domains_message_id, domains_return_path, 
                        domains_reply_to, domains_in_reply_to, 
                        domains_references, domains_to, domains_cc, domains_sender], axis=1)
domains_df.set_axis(['from_domains', 'message-id_domains', 'return-path_domains', 'reply-to_domains',
                     'in-reply-to_domains', 'references_domains', 'to_domains', 'cc_domains', 'sender_domains'], 
                    axis=1, inplace=True)

In [60]:
def email_same_check(row, first_col, second_col):
  vals1 = row[first_col]
  vals2 = row[second_col]

  for val1 in vals1:
    for val2 in vals2:
      if val1 == val2:
        return 1

  return 0

emails_to_check = [('from', 'reply-to'), ('from', 'return-path'),]

for val in emails_to_check:
  first_field = val[0]
  second_field = val[1]
  new_col_name = 'email_match_' + first_field + '_' + second_field

  df_combined[new_col_name] = emails_df.apply(email_same_check, first_col=first_field, 
                  second_col=second_field, axis=1)
  final_features_list.append(new_col_name)

In [61]:
# Returns 0 if no matches, 1 if at least one match
def domain_match_check(row, first_col, second_col):

  first_domain_list = row[first_col]
  second_domain_list = row[second_col]

  if len(first_domain_list) == 0 or len(second_domain_list) == 0:
    return 0
  else:
    for d1 in first_domain_list:
      for d2 in second_domain_list:
        if d1 == d2:
          return 1
    return 0
  
domain_fields_to_check = [('message-id_domains', 'from_domains'), ('from_domains', 'return-path_domains'), ('message-id_domains', 'return-path_domains'), ('message-id_domains', 'sender_domains'), ('message-id_domains', 'reply-to_domains'),
                          ('return-path_domains', 'reply-to_domains'), ('reply-to_domains', 'to_domains'), ('to_domains', 'in-reply-to_domains'), ('sender_domains', 'from_domains'), ('references_domains', 'reply-to_domains'), ('references_domains', 'in-reply-to_domains'), ('references_domains', 'to_domains'), ('from_domains', 'reply-to_domains'),
                          ('to_domains', 'from_domains'), ('to_domains', 'message-id_domains')]

for val in domain_fields_to_check:
  first_field = val[0].replace('_domains', '')
  second_field = val[1].replace('_domains', '')
  new_col_name = 'domain_match_' + first_field + '_' + second_field 

  df_combined[new_col_name] = domains_df.apply(domain_match_check, first_col = val[0], 
                              second_col= val[1], axis=1)
  final_features_list.append(new_col_name)

**Receieved Consequentive Checks**:

Consequentive Received fields: checks if consequentive received fields have a matching domain for the 'by' of the eariler received field and the 'from' part of the later one. Value is 1 if there is an issue, otherwise 0 if all match.

## Processed data to a file

In [62]:
for item in final_features_list:
  print(item)
  print(df_combined[item].value_counts())
  print('\n')

hops
0    4678
1     400
Name: hops, dtype: int64


missing_received1
0    5072
1       6
Name: missing_received1, dtype: int64


missing_received2
0    3327
1    1751
Name: missing_received2, dtype: int64


missing_received3
1    4086
0     992
Name: missing_received3, dtype: int64


missing_received4
1    4678
0     400
Name: missing_received4, dtype: int64


missing_received5
1    4863
0     215
Name: missing_received5, dtype: int64


missing_received6
1    4923
0     155
Name: missing_received6, dtype: int64


missing_received7
1    5033
0      45
Name: missing_received7, dtype: int64


missing_received8
1    5063
0      15
Name: missing_received8, dtype: int64


missing_subject
0    5073
1       5
Name: missing_subject, dtype: int64


missing_date
0    5023
1      55
Name: missing_date, dtype: int64


missing_message-id
0    4730
1     348
Name: missing_message-id, dtype: int64


missing_from
0    5078
Name: missing_from, dtype: int64


missing_return-path
0    5072
1       6
Name

In [63]:
df_combined[df_combined['str_to_chevron'] == 0]['label'].value_counts()

0    2021
1    1774
Name: label, dtype: int64

In [64]:
df_combined.shape

(5078, 136)

In [65]:
# Removes missing_received fields since received1 is always there, and the other ones
# encode the same information as 'hops'. The other removed features have only one value,
# or a strong majority towards one value.
remove_list = ['missing_received1', 'missing_received2', 'missing_received3',
 'missing_received4', 'missing_received5', 'missing_received6',
 'missing_received7', 'missing_received8', 'missing_received9',
 'missing_received10', 'missing_subject', 'missing_date', 'missing_message-id', 'missing_from',
 'missing_return-path', 'missing_to', 'missing_content_type', 'missing_mime-version', 'missing_x-mimeole',
 'missing_x-priority', 'missing_x-virus-scanned', 'missing_content-length', 'missing_delivered-to', 'missing_list-post',
 'missing_list-help', 'missing_x-msmail-priority', 'missing_x-beenthere', 'missing_list-archive', 'missing_x-virus-status', 'missing_x-spam-level', 'missing_x-spam-checker-version', 'missing_references', 'missing_in-reply-to', 'missing_user-agent', 'missing_thread-index', 'missing_cc', 'missing_content-disposition', 'missing_mailing-list', 'missing_domainkey-signature', 'missing_importance', 'str_from_chevron', 'str_to_undisclosed', 'str_return-path_empty',
 'str_from_exclam', 'str_reply-to_question', 'str_message-ID_dollar', 'str_received-SPF_bad', 
 'str_received-SPF_softfail', 'str_received-SPF_fail', 'str_reply-to_question', 
 'num_recipients_from']

# for v in remove_list:
#   if v in final_features_list:
#     final_features_list.remove(v)
    
final_features_list.append('label')

In [66]:
duplicates = df_combined.duplicated().sum()
print(f"Number of duplicates: {duplicates}")

Number of duplicates: 0


In [67]:
df_final = df_combined[final_features_list]

In [68]:
duplicates = df_final.duplicated().sum()
print(f"Number of duplicates: {duplicates}")

Number of duplicates: 4036


In [69]:
df_final.shape

(5078, 86)

In [70]:
df_final.to_csv('preprocessed_ham_phishing.csv', index=False)