# Metadata Analysis

Notebook created to perform the pre-processing, feature extraction, training and testing of metadata from .eml files on some Deep Learning models.

In [None]:
# install dependencies
!pip install gensim
!pip install nltk

In [1]:
# imports
import email
import os
from email.policy import default
import re
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

### Import phishing and ham data

In [2]:
# headers to be extracted
selected_headers = ['Subject', 'Date', 'From', 'To', 'Return-Path', 'Reply-To', 'Message-ID', 'Received-SPF', 'Authentication-Results', 'Content-Type', 'MIME-Version']

In [3]:
# function to extract the headers from the ham .eml files
def extract_eml_headers(source_folders):    
    
    # List to hold all headers from all emails
    headers_list = []
    counter = 0

    for website_folder in source_folders:
        # List all .eml files in the current source folder
        for email_folder in os.listdir(website_folder):
            if os.path.isdir(os.path.join(website_folder, email_folder)):
                for filename in os.listdir(os.path.join(website_folder, email_folder)):
                    if filename.endswith('.eml') and counter < 500:
                        source_path = os.path.join(website_folder, email_folder, filename)
                        
                        # Open and read the .eml file
                        with open(source_path, 'r', encoding='utf-8') as file:
                            eml_content = file.read()

                        # Parse the .eml content to get the message object
                        msg = email.message_from_string(eml_content, policy=default)

                        # Extract only selected headers/metadata into a dictionary
                        headers_dict = {header: msg[header] for header in selected_headers if header in msg}
                        
                        # Add the dictionary to the list for later processing
                        headers_list.append(headers_dict)

                        counter += 1

                        # Your preprocessing steps could go here, on headers_list or headers_dict before saving
                        # Example preprocessing could be:
                        # - Filtering specific headers
                        # - Cleaning header values
                        # - Etc.
    
    return headers_list

# get headers from the ham emails
source_path = '../../privacymail-dataset/data/'
source_folders = [source_path + folder for folder in os.listdir(source_path) if os.path.isdir(os.path.join(source_path, folder))]
ham_headers_list = extract_eml_headers(source_folders)

In [4]:
# get the already extracted headers from the phishing emails
def get_eml_headers(source_folder):
    # List to hold all headers from all emails
    headers_list = []

    for filename in os.listdir(source_folder):
        if filename.endswith('.eml'):
            source_path = os.path.join(source_folder, filename)

            # Open and read the .eml file
            with open(source_path, 'r', encoding='utf-8') as file:
                eml_content = file.read()

            # Parse the .eml content to get the message object
            msg = email.message_from_string(eml_content, policy=default)

            # Extract only selected headers/metadata into a dictionary
            headers_dict = {header: msg[header] for header in selected_headers if header in msg}

            # Add the dictionary to the list for later processing
            headers_list.append(headers_dict)

    return headers_list


# get headers from the phishing emails
source_folder = '../../EmailsHeaders'
phishing_headers_list = get_eml_headers(source_folder)


In [9]:
print(ham_headers_list[0])
print(phishing_headers_list[0])
print(len(ham_headers_list))
print(len(phishing_headers_list))

with open('phishing_subjects.txt', 'w') as f:
    for item in phishing_headers_list:
        f.write("%s\n" % item.get('Subject'))

{'Subject': 'Bitte bestätigen Sie Ihr Newsletter-Abonnement', 'Date': 'Thu, 27 Aug 2020 18:54:29 +0200', 'From': 'Sparkasse zu Lübeck AG <newsletter@sparkasse-luebeck.de>', 'To': 'christopher.casto@privacy-mail.org', 'Return-Path': '<hbbj.d.afbi=bounce@bounces.sendnode.com>', 'Reply-To': 'newsletter@sparkasse-luebeck.de', 'Message-ID': '<6cc.4.199@sendnode.com>', 'Received-SPF': 'pass (mx2e90: domain of bounces.sendnode.com designates 185.98.184.207 as permitted sender) client-ip=185.98.184.207; envelope-from=hbbj.d.afbi=bounce@bounces.sendnode.com; helo=mda3cf.sendnode.com;', 'Authentication-Results': 'mx2e90;\tdkim=pass header.d=sendnode.com;        spf=pass (sender IP is 185.98.184.207) smtp.mailfrom=hbbj.d.afbi=bounce@bounces.sendnode.com smtp.helo=mda3cf.sendnode.com', 'Content-Type': 'text/plain; charset="UTF-8"', 'MIME-Version': '1.0'}
{'Subject': 'Verify your email address to avoid deactivation', 'Date': 'Tue, 23 Jan 2024 08:32:08 +0200', 'From': 'Det IT Support <n-reply@safeso

In [50]:
def extract_email(header_value):
    """Extracts email address from a header value."""
    match = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', header_value)
    if match:
        return match.group(0).lower()  # Convert to lower case for case-insensitive comparison
    return ""

def preprocess_email_headers_for_phishing_detection(headers_list):

    # Placeholder for the dataset
    dataset = []

    for headers in headers_list:
        print(headers)
        # Start with an empty data point
        data_point = {
            'from_return_path_mismatch': 0,
            'reply_to_from_mismatch': 0,
            # 'subject_suspicious': 0,
            # 'date_inconsistent': 0,
            # 'to_unspecific': 0
            # Additional features can be added here
        }

        # Normalize and compare "From" and "Return-Path"
        from_email = extract_email(headers.get('From', ''))
        return_path_email = extract_email(headers.get('Return-Path', ''))
        print("from ", from_email)
        print("return ", return_path_email)
        if from_email != return_path_email:
            data_point['from_return_path_mismatch'] = 1

        # Normalize and compare "Reply-To" and "From"
        reply_to_email = extract_email(headers.get('Reply-To', ''))
        print("reply ", reply_to_email)
        if reply_to_email and from_email and reply_to_email != from_email:
            data_point['reply_to_from_mismatch'] = 1

        
        # Simplified check for suspicious "Subject" (can be enhanced)
        # if 'urgent' in headers.get('Subject', '').lower() or 'important' in headers.get('Subject', '').lower():
        #     data_point['subject_suspicious'] = 1
        
        # Example check for "Date" inconsistency (simplified)
        # This should be expanded based on your logic for date checking
        
        # Example check for unspecific "To" (simplified)
        # if headers.get('To') and ('undisclosed recipients' in headers.get('To').lower() or 'all' in headers.get('To').lower()):
        #     data_point['to_unspecific'] = 1

        dataset.append(data_point)

    # Save or return the dataset for DL model
    # For demonstration, let's just return it
    return dataset

# Example usage
ham_dataset = preprocess_email_headers_for_phishing_detection(phishing_headers_list)

print(ham_dataset)

# You can then save this dataset to a file or use it directly for training/testing your DL model

{'Subject': 'Verify your email address to avoid deactivation', 'Date': 'Tue, 23 Jan 2024 08:32:08 +0200', 'From': 'Det IT Support <n-reply@safesopkoco.com>', 'To': 'vieira@det.ua.pt', 'Return-Path': 'n-reply@safesopkoco.com', 'Message-ID': '<20240123083208.13F75BEDDE511554@safesopkoco.com>', 'Received-SPF': 'PermError (mx4.ua.pt: cannot correctly interpret  sender authenticity information from domain of  n-reply@safesopkoco.com) identity=mailfrom;  client-ip=176.117.76.112; receiver=mx4.ua.pt;  envelope-from="n-reply@safesopkoco.com";  x-sender="n-reply@safesopkoco.com"; x-conformance=spf_only', 'Authentication-Results': 'mx4.ua.pt; spf=PermError smtp.mailfrom=n-reply@safesopkoco.com; spf=None smtp.helo=postmaster@mail.safesopkoco.com; dkim=pass (signature verified) header.i=@safesopkoco.com; dmarc=pass (p=none dis=none) d=safesopkoco.com', 'Content-Type': 'multipart/alternative; boundary="B_3788843362_2715154253"', 'MIME-Version': '1.0'}
from  n-reply@safesopkoco.com
return  n-reply@s