In [1]:
import email
from bs4 import BeautifulSoup
import pandas as pd
import os
import quopri

def decode_payload(payload):
    """Turn a raw message payload into text and count undecodable characters.

    Args:
        payload: Bytes (or an ASCII-only str) holding the message body, or
            ``None`` when the message part carries no decodable payload.

    Returns:
        A ``(text, replacement_count)`` tuple. ``text`` is the payload after a
        quoted-printable pass and UTF-8 decoding, with invalid byte sequences
        replaced by U+FFFD; ``replacement_count`` is the number of U+FFFD
        characters in ``text``. Returns ``(None, 0)`` when ``payload`` is
        ``None`` so callers can treat that as "nothing to decode".
    """
    # quopri.decodestring() raises TypeError on None; report "no body" instead
    # of crashing the caller.
    if payload is None:
        return None, 0

    # NOTE(review): callers in this file pass get_payload(decode=True) output,
    # which should already have the Content-Transfer-Encoding undone. This
    # second quoted-printable pass is kept for backward compatibility, but it
    # may alter literal "=XX" sequences in the body -- confirm it is wanted.
    raw_bytes = quopri.decodestring(payload)

    # The previous implementation looped over utf-8 / cp437 / ISO-8859-1, but
    # with errors='replace' the first attempt can never raise
    # UnicodeDecodeError, so only UTF-8 was ever used. That effective behavior
    # is kept here without the unreachable fallbacks.
    decoded_text = raw_bytes.decode('utf-8', errors='replace')
    return decoded_text, decoded_text.count('\ufffd')

def extract_info_from_email(file_path):
    """Parse one stored email and return a dict of header and body features.

    The first non-attachment leaf part of the message (or the whole payload
    for a non-multipart message) is decoded with ``decode_payload`` and parsed
    as HTML to count links, unsubscribe links and 1x1 tracking pixels.

    Args:
        file_path: Path of the email file to read.

    Returns:
        A dict with one entry per output column, or ``None`` when no body
        could be decoded or the HTML parser failed. On a parser failure the
        path is also appended to the module-level ``failed_files`` list.
    """
    # message_from_file() consumes the whole file, so the handle can be
    # released before the (slower) analysis below.
    with open(file_path, 'r', encoding='cp437', errors='replace') as f:
        msg = email.message_from_file(f)

    unknown_chars_count = 0
    body = None

    if msg.is_multipart():
        for part in msg.walk():
            # walk() also yields the multipart containers (the message itself
            # comes first); their get_payload(decode=True) is None, so only
            # leaf parts can supply a body.
            if part.is_multipart():
                continue
            # Skip parts explicitly marked as attachments.
            if part.get_content_disposition() == 'attachment':
                continue
            body, unknown_count = decode_payload(part.get_payload(decode=True))
            unknown_chars_count += unknown_count
            break  # Only the first eligible part is analysed.
    else:
        body, unknown_count = decode_payload(msg.get_payload(decode=True))
        unknown_chars_count += unknown_count

    if body is None:
        print(f"Decoding failed for {file_path}")
        return None

    try:
        soup = BeautifulSoup(body, 'html5lib')
    except Exception as e:
        print(f"Failed to parse {file_path} with html5lib due to: {e}")
        print(f"Email body: {body}")
        failed_files.append(file_path)  # Track failed files
        return None

    # Headers may be missing (None) or come back as Header objects rather
    # than str; normalise to str so parseaddr()/lower() cannot fail on them.
    sender = email.utils.parseaddr(str(msg.get('From', '')))[1]
    recipient = email.utils.parseaddr(str(msg.get('To', '')))[1]
    # The domain is whatever follows the last '@' of the address.
    domain_of_sender = sender.rpartition('@')[2] if '@' in sender else None
    subject = str(msg.get('subject', '')).lower()

    links = soup.find_all('a')
    # An "unsubscribe link" is an anchor with an href whose visible text
    # mentions "unsubscribe" (case-insensitive).
    unsubscribe_links = [
        link['href']
        for link in soup.find_all('a', href=True)
        if "unsubscribe" in link.text.lower()
    ]
    tracking_pixel = bool(soup.find_all('img', width='1', height='1'))

    return {
        'FileName': os.path.basename(file_path),
        'To': recipient,
        'From': sender,
        'Domain of Sender': domain_of_sender,
        'Subject': subject,
        'Date and Time': msg['Date'],
        'Number of Unsubscribe Links': len(unsubscribe_links),
        'Number of Undecodable Characters': unknown_chars_count,
        'Tracking Pixel Present': tracking_pixel,
        'Total Links in Email': len(links),
        # len(body) counts characters of the decoded text, not raw bytes,
        # despite the column name.
        'Email Size (bytes)': len(body),
        'Return Path': msg.get('Return-Path', ''),
        'Received-SPF': msg.get('Received-SPF', 'Not available'),
        'DKIM-Signature': 'Present' if msg.get('DKIM-Signature') else 'Absent',
        'Authentication-Results': msg.get('Authentication-Results', 'Not available'),
    }

# Root folder of the mail export; every file below it is treated as an email.
directory = 'C:\\Users\\ericb\\Desktop\\Research\\Primary@gmail.com\\Cleaned_Mail\\2023\\'

infos = []
failed_files = []  # List to track files that failed to parse

for dirpath, dirnames, filenames in os.walk(directory):
    for file_name in filenames:
        file_path = os.path.join(dirpath, file_name)
        try:
            info = extract_info_from_email(file_path)
        except Exception as e:
            # One malformed message must not abort the whole batch and lose
            # every row collected so far; record it and move on.
            print(f"Failed to process {file_path} due to: {e}")
            failed_files.append(file_path)
            continue
        if info:
            infos.append(info)

df = pd.DataFrame(infos)
if not df.empty:
    print('saving to csv')
    df.to_csv("extracted_email_and_unsubscribe_info_primary.csv", index=False)

# Saving the failed filenames to a CSV
if failed_files:
    failed_df = pd.DataFrame({'Failed Files': failed_files})
    failed_df.to_csv("failed_files.csv", index=False)




KeyboardInterrupt: 

In [None]:
# Note: the unsubscribe links reveal the account names, so they must be redacted before sharing the data.

