In this experiment, I want to get familiar with more advanced mail parsing so that we can include more mails in our converted dataset.

In [14]:
import email.parser as parser
import pandas as pd
import numpy as np
import re
import os

def read_maildir(path):
    """Collect the header summary of every mail stored below ``path``.

    Thin entry point: hands ``path`` to ``read_folder`` and returns the
    mapping it produces unchanged.
    """
    mails_by_path = read_folder(path)
    return mails_by_path
        
def read_folder(path):
    """Recursively gather selected headers of every file below ``path``.

    Entries are visited in ``os.listdir`` order; sub-directories are
    descended into as they are encountered, and every other entry is
    opened in binary mode and parsed as a mail (headers only).

    Returns a dict keyed by file path whose values are dicts with the
    keys "from", "to", "subject" and "date". A header that is absent from
    the file is stored as ``None``.
    """
    collected = {}
    for entry in os.listdir(path):
        entry_path = os.path.join(path, entry)
        if os.path.isdir(entry_path):
            # Merge everything found in the sub-directory into this level.
            collected.update(read_folder(entry_path))
            continue
        with open(entry_path, "rb") as handle:
            # headersonly=True: the body is not parsed into MIME parts.
            headers = parser.BytesParser().parse(handle, headersonly=True)
        collected[entry_path] = {
            field: headers[field] for field in ("from", "to", "subject", "date")
        }
    return collected
    
# Parse the headers of every mail in the maildir dump (path is relative to this notebook).
mails = read_maildir(path="../data/maildir")

In [15]:
# `mails` maps file path -> header dict, so the raw frame has one column per
# mail; flip it to get one row per mail with the header fields as columns.
df = pd.DataFrame(mails).T
df.head()

Unnamed: 0,from,to,subject,date
../data/maildir/mckay-b/all_documents/44.,brad.mckay@enron.com,glaguarta@cadencemcshane.com,Re: Dove hunt,"Tue, 18 Jul 2000 08:04:00 -0700 (PDT)"
../data/maildir/mckay-b/all_documents/11.,brad.mckay@enron.com,rex@arn.net,Re: The Deal!,"Thu, 7 Dec 2000 07:05:00 -0800 (PST)"
../data/maildir/mckay-b/all_documents/61.,brad.mckay@enron.com,8915473@archwireless.net,Re: Wireless Message,"Tue, 8 May 2001 08:34:00 -0700 (PDT)"
../data/maildir/mckay-b/all_documents/57.,brad.mckay@enron.com,8915473@archwireless.net,Re: Wireless Message,"Wed, 9 May 2001 09:14:00 -0700 (PDT)"
../data/maildir/mckay-b/all_documents/38.,brad.mckay@enron.com,ghathaway@mmcable.com,Re: Follow-up from Friday meetings with Enron,"Mon, 18 Sep 2000 04:55:00 -0700 (PDT)"


In [16]:
# Total number of parsed mails (rows of the frame).
df.shape[0]

517401

In [18]:
# Keep only mails whose sender contains the literal text "enron.com"
# (case-insensitive).
#  - regex=False: without it the pattern is a regular expression in which "."
#    matches any character, so e.g. "enronXcom" would also be accepted.
#  - na=False: a missing or non-string "from" value yields NaN from the .str
#    accessor, and a mask containing NaN cannot be used for boolean indexing;
#    treat such rows as non-matches instead.
is_from_enron = df["from"].str.contains("enron.com", case=False, regex=False, na=False)
mails_from_enron = df[is_from_enron]
print(len(mails_from_enron))

# Rows sharing sender, subject and date are treated as copies of one mail;
# only the first occurrence of each combination is kept.
mails_from_enron_without_duplicates = mails_from_enron.drop_duplicates(subset=["from", "subject", "date"])
print(len(mails_from_enron_without_duplicates))

429754
197960


In [20]:
# Spot-check the de-duplication on one subject: print the matching rows before
# and after dropping duplicates so the two listings can be compared.
subject_to_check = "Overview of Investor Conference Call"
for label, frame in (
    ("Without trimming", mails_from_enron),
    ("With trimming", mails_from_enron_without_duplicates),
):
    print(label)
    print(frame[frame["subject"] == subject_to_check])

Without trimming
                                                                     from  \
../data/maildir/ybarbo-p/inbox/604.                chairman.ken@enron.com   
../data/maildir/reitmeyer-j/inbox/438.             chairman.ken@enron.com   
../data/maildir/ruscitti-k/inbox/253.              chairman.ken@enron.com   
../data/maildir/whalley-g/inbox/349.               chairman.ken@enron.com   
../data/maildir/weldon-c/deleted_items/205.        chairman.ken@enron.com   
...                                                                   ...   
../data/maildir/stepenovitch-j/deleted_items/216.  chairman.ken@enron.com   
../data/maildir/campbell-l/inbox/1455.             chairman.ken@enron.com   
../data/maildir/hodge-j/inbox/673.                 chairman.ken@enron.com   
../data/maildir/buy-r/inbox/480.                   chairman.ken@enron.com   
../data/maildir/heard-m/inbox/227.                 chairman.ken@enron.com   

                                                          