In [None]:
import os
import json
import email
from email import policy
from email.parser import BytesParser
from concurrent.futures import ThreadPoolExecutor

def parse_eml(file_path):
    """Parse one .eml file into a dictionary of selected fields.

    The file is read in binary mode and parsed with ``policy.default``.
    The resulting dictionary holds the Date, To, Reply-To, CC, Bcc and
    Subject headers, the body as returned by ``get_email_body``, a boolean
    telling whether any MIME part carries a filename, and a fixed
    ``"label"`` of ``"inbox"``.

    Reply-To, CC and Bcc fall back to ``""`` when absent or empty; the other
    headers are stored as returned by the message (``None`` when missing).

    Any exception is reported with ``print`` and turned into a ``None``
    return value instead of propagating.
    """
    try:
        with open(file_path, 'rb') as handle:
            message = BytesParser(policy=policy.default).parse(handle)

        def optional_header(name):
            # Absent or empty optional headers are stored as "".
            return message[name] or ""

        record = {}
        record["datetime"] = message["Date"]
        record["to"] = message["To"]
        record["reply-to"] = optional_header("Reply-To")
        record["CC"] = optional_header("CC")
        record["Bcc"] = optional_header("Bcc")
        record["subject"] = message["Subject"]
        record["body"] = get_email_body(message)

        # True as soon as one MIME part declares a filename.
        has_attachment = False
        for part in message.walk():
            if part.get_filename():
                has_attachment = True
                break
        record["attachments"] = has_attachment

        record["label"] = "inbox"  # Modify this based on email folder location
        return record
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

def get_email_body(msg):
    """Return the text body of an email message as a string.

    For a multipart message, the first ``text/plain`` part that is not
    marked ``Content-Disposition: attachment`` is used, so a plain-text
    attachment is never mistaken for the body. For a non-multipart message
    the message's own payload is used, whatever its content type.

    The payload is transfer-decoded and then decoded with the charset the
    part declares (UTF-8 when none is declared). If the declared charset is
    not a codec Python knows, UTF-8 is used instead of discarding the body.
    Undecodable bytes are dropped (``errors="ignore"``).

    Args:
        msg: An ``email.message.Message`` (or subclass) instance.

    Returns:
        The decoded body text, or ``""`` when there is no suitable part,
        the payload is empty, or an unexpected error occurs (the error is
        reported with ``print``).
    """
    try:
        if msg.is_multipart():
            body_part = next(
                (
                    part
                    for part in msg.walk()
                    if part.get_content_type() == "text/plain"
                    and part.get_content_disposition() != "attachment"
                ),
                None,
            )
            if body_part is None:
                return ""
        else:
            body_part = msg

        payload = body_part.get_payload(decode=True)
        if payload is None:
            return ""

        charset = body_part.get_content_charset() or "utf-8"
        try:
            return payload.decode(charset, errors="ignore")
        except LookupError:
            # The declared charset is not a known codec; best-effort UTF-8
            # is better than losing the whole body.
            return payload.decode("utf-8", errors="ignore")
    except Exception as e:
        print(f"Error extracting body: {e}")
    return ""

def convert_eml_folder_to_json(folder_path, output_folder):
    """Convert every .eml file in a folder into a single JSON file.

    Files directly inside ``folder_path`` whose name ends in ``.eml``
    (matched case-insensitively, so ``.EML`` is included) are parsed with
    ``parse_eml`` on a thread pool. Files are processed in sorted name
    order so the output is reproducible; entries for which ``parse_eml``
    returned a falsy value are left out. The resulting list is written to
    ``<output_folder>/emails.json`` with an indent of 4.

    Args:
        folder_path: Directory containing the .eml files (not searched
            recursively).
        output_folder: Directory for ``emails.json``; created if missing.

    Returns:
        None. Progress and problems are reported with ``print``; a missing
        input directory or the absence of .eml files ends the function
        early, and any unexpected exception is printed rather than raised.
    """
    try:
        # isdir also rejects a path that exists but is a regular file.
        if not os.path.isdir(folder_path):
            print("Error: The specified input folder does not exist.")
            return

        os.makedirs(output_folder, exist_ok=True)
        output_file = os.path.join(output_folder, "emails.json")

        candidates = (
            os.path.join(folder_path, name)
            for name in sorted(os.listdir(folder_path))
            if name.lower().endswith(".eml")
        )
        # Skip directories that merely happen to be named "*.eml".
        eml_files = [path for path in candidates if os.path.isfile(path)]

        if not eml_files:
            print("No .eml files found in the specified folder.")
            return

        with ThreadPoolExecutor() as executor:
            emails = list(filter(None, executor.map(parse_eml, eml_files)))

        with open(output_file, "w", encoding="utf-8") as json_file:
            json.dump(emails, json_file, indent=4)
        print(f"Extraction completed. Data saved to {output_file}")
    except Exception as e:
        print(f"Unexpected error: {e}")

# Example usage
# Change folder_path to the directory that holds your .eml files.
folder_path = "/Users/dextercyberlabs/Desktop/LLM/Emails"
# output_folder is created automatically if it doesn't exist.
output_folder = "output_emails"
convert_eml_folder_to_json(folder_path=folder_path, output_folder=output_folder)

Extraction completed. Data saved to output_emails/emails.json
