In [2]:
import os
import random
import email
from email.parser import Parser

In [6]:
# Root directory that is walked recursively for email files.
# The path is relative, so it is resolved against the current working
# directory at run time.
# NOTE(review): presumably laid out as <mailbox>/<folder>/<message file> --
# confirm against the dataset on disk.
base_directory = '../datasets/enron/maildir'

# Function to recursively collect all file paths in the directory
def collect_files(directory):
    """Return the paths of all files found anywhere beneath ``directory``.

    The tree is traversed with ``os.walk``; each file name is joined to the
    directory it was found in, so returned paths carry the same prefix as
    ``directory``. Directories themselves are not included in the result.

    Args:
        directory: Path of the directory tree to scan.

    Returns:
        A list of file paths, in the order ``os.walk`` reports them.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
    ]

# Collect all file paths
all_file_paths = collect_files(base_directory)

# Filter files based on size constraints (1 byte to 10,000 bytes);
# the lower bound of 1 excludes empty files.
filtered_files = [file for file in all_file_paths if 1 <= os.path.getsize(file) <= 10000]

# Randomly pick up to 10 files from the filtered list.
# random.sample raises ValueError when asked for more items than the
# population holds, so the sample size is clamped to what is available.
selected_files = random.sample(filtered_files, min(10, len(filtered_files)))

# Parse each selected file into a dict with keys "Path", "Headers", "Content"
email_details = []
for file_path in selected_files:
    # errors='ignore' silently drops bytes that are not valid UTF-8 instead of
    # raising. Only the read happens inside the `with`, so the handle is
    # released before parsing starts.
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        file_content = file.read()

    # Parse the email content
    email_message = Parser().parsestr(file_content)

    # Extract the email body
    if email_message.is_multipart():
        # Keep only text/plain parts that are not marked as attachments; each
        # kept payload is followed by a newline.
        body_parts = []
        for part in email_message.walk():
            content_type = part.get_content_type()
            content_disposition = part.get('Content-Disposition', '')
            if content_type == 'text/plain' and 'attachment' not in content_disposition:
                body_parts.append(part.get_payload() + "\n")
        body = "".join(body_parts)
    else:
        body = email_message.get_payload()

    # Store email details
    email_info = {
        "Path": file_path,
        "Headers": {key: value for key, value in email_message.items()},
        "Content": body,
    }
    email_details.append(email_info)

# Display the content of the emails.
# The loop variable is deliberately NOT called `email`: that name is the
# imported `email` module, and rebinding it here would break any later use of
# the module.
for index, email_entry in enumerate(email_details, start=1):
    print(f"Email {index} Content:")
    print(f"Path: {email_entry['Path']}")
    print("Headers:")
    for header, value in email_entry['Headers'].items():
        print(f"{header}: {value}")
    print("Content:")
    print(email_entry['Content'])
    print('-'*100)


Email 1 Content:
Path: ../datasets/enron/maildir/williams-w3/inbox/80.
Headers:
Message-ID: <21083505.1075839942375.JavaMail.evans@thyme>
Date: Tue, 29 Jan 2002 20:20:03 -0800 (PST)
From: john.oh@enron.com
To: shift.dl-portland@enron.com
Subject: New EL Paso URL
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Oh, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=JOH2>
X-To: DL-Portland Real Time Shift </O=ENRON/OU=NA/CN=RECIPIENTS/CN=DL-PortlandRealTimeShift>
X-cc: 
X-bcc: 
X-Folder: \ExMerge - Williams III, Bill\Inbox
X-Origin: WILLIAMS-W3
X-FileName: 
Content:
RT-

The new El Paso URL is:
http://172.17.172.62/rt/

Click on current.asp to have the page automatically refresh.  
The file !current.txt contains the data.

All historical data is in the folders listed on this site.  All past historical data that is not available on the current site ison the P drive under P:\trading\california\realtime.

Going forward, all El Paso data WILL be collect