In [1]:
import email
from bs4 import BeautifulSoup
import pandas as pd
import os
import quopri

def decode_payload(payload):
    """Turn a raw message payload into text and count undecodable bytes.

    The payload is passed through quoted-printable decoding and then decoded
    as UTF-8 with ``errors='replace'``, so every byte sequence that is not
    valid UTF-8 shows up as U+FFFD (the replacement character) in the result.

    Because ``errors='replace'`` never raises ``UnicodeDecodeError``, no other
    codec is ever attempted; UTF-8 is the only encoding that takes effect.

    Args:
        payload: Bytes to decode, or ``None``.

    Returns:
        A ``(text, replacement_count)`` tuple. ``replacement_count`` is the
        number of U+FFFD characters in ``text`` (this also counts any U+FFFD
        that was genuinely present in the input). Returns ``(None, 0)`` when
        ``payload`` is ``None``.
    """
    if payload is None:
        return None, 0

    # NOTE(review): the callers in this file pass the result of
    # get_payload(decode=True), which has already undone the part's
    # Content-Transfer-Encoding. This second quoted-printable pass will
    # rewrite any literal "=XX" hex pair left in the text (e.g. inside URLs)
    # -- confirm that this is intended.
    raw_bytes = quopri.decodestring(payload)
    decoded_text = raw_bytes.decode('utf-8', errors='replace')
    return decoded_text, decoded_text.count('\ufffd')
def extract_info_from_email(file_path):
    """Extract unsubscribe / tracking features from a single e-mail file.

    The message is parsed from raw bytes. Every non-attachment part (or the
    message itself when it is not multipart) is decoded with
    ``decode_payload`` and parsed with BeautifulSoup/html5lib. Link counts,
    unsubscribe links and tracking-pixel detection are accumulated over all
    parsed parts.

    Args:
        file_path: Path to the e-mail file on disk.

    Returns:
        A dict with the keys ``filename``, ``number of unsubscribe links``,
        ``number of undecodable characters``, ``tracking pixel present``,
        ``total links in email``, ``email size (bytes)`` and
        ``dkim-signature``; or ``None`` (after printing a notice) when no
        body text could be extracted.
    """
    # Parse from raw bytes. Decoding the whole file as cp437 text before MIME
    # parsing rewrites every non-ASCII byte, so 8-bit bodies no longer match
    # what the sender transmitted by the time they are decoded.
    with open(file_path, 'rb') as f:
        msg = email.message_from_binary_file(f)

    is_multipart = msg.is_multipart()
    if is_multipart:
        # str() turns a missing header (None) into "None", which simply
        # does not contain "attachment".
        candidate_parts = [
            part for part in msg.walk()
            if "attachment" not in str(part.get("Content-Disposition"))
        ]
    else:
        candidate_parts = [msg]

    body_chunks = []
    unknown_chars_count = 0
    unsubscribe_links = []
    total_links = 0
    tracking_pixel = False

    for part in candidate_parts:
        part_body, unknown_count = decode_payload(part.get_payload(decode=True))
        if not part_body:
            # Multipart containers yield no payload of their own; empty
            # parts contribute nothing.
            continue

        body_chunks.append(part_body)
        unknown_chars_count += unknown_count

        if not part_body.strip():
            continue  # whitespace only: nothing worth parsing

        try:
            part_soup = BeautifulSoup(part_body, 'html5lib')
            part_unsubscribe = [
                link['href']
                for link in part_soup.find_all('a', href=True)
                if "unsubscribe" in link.text.lower()
            ]
            part_link_count = len(part_soup.find_all('a'))
            part_has_pixel = len(part_soup.find_all('img', width='1', height='1')) > 0
        except Exception as e:
            scope = "part of " if is_multipart else ""
            print(f"Failed to parse {scope}{file_path} with html5lib due to: {e}")
            continue

        # Aggregate over every part rather than only the first one parsed:
        # in a multipart message the HTML alternative is often not the first
        # part, and its links / pixels must be counted as well, consistent
        # with how unsubscribe links are collected.
        unsubscribe_links.extend(part_unsubscribe)
        total_links += part_link_count
        tracking_pixel = tracking_pixel or part_has_pixel

    body = ''.join(body_chunks)
    if not body:
        print(f"No body content found for {file_path}")
        return None

    # Despite the column name, this is the length of the decoded text in
    # characters, not the on-disk byte size.
    email_size = len(body)

    dkim_signature = 'Present' if msg.get('DKIM-Signature') else 'Absent'

    return {
        'filename': os.path.basename(file_path),
        'number of unsubscribe links': len(unsubscribe_links),
        'number of undecodable characters': unknown_chars_count,
        'tracking pixel present': tracking_pixel,
        'total links in email': total_links,
        'email size (bytes)': email_size,
        'dkim-signature': dkim_signature
    }

# Input tree of e-mail files and the folder that receives the result CSV.
# NOTE(review): these are machine-specific absolute paths; consider moving
# them to a configurable constant before sharing the notebook.
directory = 'C:\\Users\\ericb\\Desktop\\Research\\Primary@gmail.com\\Cleaned_Mail\\2023_test\\08\\'
output_directory = 'C:\\Users\\ericb\\Desktop\\Research\\542_Project\\data\\test\\warranted_data_test_output\\'

infos = []
failed_files = []  # Paths of files that raised or yielded no usable body

# Ensure output directory exists (no error if it is already there)
os.makedirs(output_directory, exist_ok=True)

for dirpath, dirnames, filenames in os.walk(directory):
    for file_name in filenames:
        file_path = os.path.join(dirpath, file_name)
        try:
            info = extract_info_from_email(file_path)
        except Exception as e:
            # One unreadable file should not abort the whole walk; record it
            # so it ends up in the failed-files report below.
            print(f"Failed to process {file_path} due to: {e}")
            info = None

        if info:
            infos.append(info)
        else:
            failed_files.append(file_path)

# A non-empty list of records always produces a non-empty DataFrame, so a
# single check on `infos` is sufficient.
if infos:
    df = pd.DataFrame(infos)
    csv_path = os.path.join(output_directory, "extracted_email_and_unsubscribe_info.csv")
    print('saving to excel')
    df.to_csv(csv_path, index=False)

# Saving the failed filenames to a CSV (written to the current working directory)
if failed_files:
    failed_df = pd.DataFrame({'Failed Files': failed_files})
    failed_df.to_csv("failed_files.csv", index=False)

print("Script execution completed.")

saving to excel
Script execution completed.


In [None]:
# Note: the unsubscribe links reveal the account names, so they must be redacted before the data is shared.

