In [None]:
import networkx as nx
import os
import re
import datetime

In [None]:
# Index of every email found on disk: Message-ID -> {'path': <file holding the full email>}.
# !!! WARNING: this WILL NOT work unless the Enron dataset has been extracted to the path below. !!!
base_path = f"{os.getcwd()}/maildir"

# Filled in while walking the dataset.
messages = dict()
# Paths that could not be indexed, grouped by reason.
errors = dict(failures=[], not_emails=[])

def navigate_directories(path, handler=None):
    """Recursively visit every file below ``path`` and pass its path to ``handler``.

    Entries are visited in sorted name order. ``os.listdir`` returns names in
    arbitrary order, and ``build_dict`` keeps only the first path it sees for a
    given Message-ID, so without sorting the path recorded for a duplicated
    email could differ from one machine or run to the next.

    Args:
        path: Directory to walk.
        handler: Callable invoked with the path of each non-directory entry.
            Defaults to ``build_dict``, which preserves the original behaviour.
    """
    if handler is None:
        handler = build_dict

    for entry in sorted(os.listdir(path)):
        entry_path = path + '/' + entry
        if os.path.isdir(entry_path):
            # Descend into sub-directories, carrying the same callback along.
            navigate_directories(entry_path, handler)
        else:
            handler(entry_path)

def build_dict(path):
    """Register the email stored at ``path`` in ``messages`` under its Message-ID.

    The whole file is read as text. A file that cannot be decoded is recorded
    in ``errors['failures']``; a file whose text does not begin with a
    ``Message-ID: <...>`` header is recorded in ``errors['not_emails']``.
    When a Message-ID is already present in ``messages`` the existing entry is
    left untouched, so the first path seen for an id is the one that is kept.
    """
    with open(path, 'r') as doc:
        try:
            text = doc.read()
        except UnicodeDecodeError:
            errors['failures'].append(path)
            return

    # The id must be the very first thing in the file.
    header = re.match(r'^Message-ID: <(.*?)>.*', text)
    if header is None:
        errors['not_emails'].append(path)
        return

    # Only adds the entry when the id has not been indexed yet.
    messages.setdefault(header.group(1), {'path': path})



In [None]:
# Walk everything under base_path; build_dict is called for each file and fills
# `messages` (Message-ID -> {'path': ...}) and `errors` as a side effect.
navigate_directories(base_path)

In [None]:
# For each indexed email, add the sender ('from'), the recipients ('to') and the
# send time ('time') to its entry in the messages dict.

# The patterns do not depend on the message, so they are built and compiled once
# instead of on every iteration.
# Characters allowed in an address. The hyphen is escaped: written as "\+-/" it
# formed a range from "+" to "/", which also admitted "," into addresses.
address_chars = r"[\w!#\$%&'\*\+\-/=\?\^_`\{\}\|~\.]+"
address = address_chars + "@" + address_chars + r"\." + address_chars
from_line = r"From: .*?(" + address + r")"
# Raw string throughout: "\s" in a normal string literal is an invalid escape.
to_line = r"To: ((" + address + r"[\s,]+)+)"
date_line = r"Date: ([a-zA-Z]+, \d+ [a-zA-Z]+ \d+ \d+:\d+:\d+ [-\+]?\d+)"

from_pattern = re.compile(from_line)
to_pattern = re.compile(to_line)
date_pattern = re.compile(date_line)

for message in messages:
    with open(messages[message]['path'], 'r') as doc:
        email = doc.read()

    # Sender: the first address following "From: ". The key is left unset when
    # nothing matches, exactly as before.
    re_from = from_pattern.search(email)
    if re_from:
        messages[message]['from'] = re_from.group(1)
    else:
        print(f"No from email. Message path: {messages[message]['path']}")

    # Recipients: a run of addresses, each followed by whitespace and/or commas.
    # The captured text therefore always ends with a separator, so the split
    # yields a trailing empty string that [:-1] drops.
    re_to = to_pattern.search(email)
    if re_to:
        messages[message]['to'] = re.split(r"[\s,]+", re_to.group(1))[:-1]
    else:
        messages[message]['to'] = []

    # Send time, converted to a timezone-aware datetime. Only a missing header
    # or an unparseable value is reported; other exceptions now propagate.
    re_date = date_pattern.search(email)
    msg_time = None
    if re_date:
        try:
            msg_time = datetime.datetime.strptime(re_date.group(1), '%a, %d %b %Y %H:%M:%S %z')
        except ValueError:
            msg_time = None
    if msg_time is not None:
        messages[message]['time'] = msg_time
    else:
        print(f"Could not parse date for {messages[message]['path']}")

In [None]:
import pprint

# Summary: number of distinct Message-IDs indexed, then the paths that were skipped.
print(len(messages))
print(pprint.pformat(errors))