In [3]:
import pyspark as ps

Initialize our message directories

In [13]:
# Glob for the sampled mail directory: matches entries two directory levels
# below the sample root whose names end in '.'.
EMAIL_PARENT_DIR_GLOB_SAMPLE = "./data/maildir_sample/*/*/*."
# Full dataset -- kept disabled because it is probably too large to run on a laptop:
# EMAIL_PARENT_DIR_GLOB = './data/maildir/*/*/*.'

Load all messages to an RDD

In [5]:
# Read every file matched by the glob as one record each; the records are
# consumed later as key/value pairs with the file content as the value.
# NOTE(review): `sc` is not defined in this notebook -- presumably the
# SparkContext injected by the PySpark kernel/shell; confirm before running
# on a plain Python kernel.
messages_rdd = sc.wholeTextFiles(EMAIL_PARENT_DIR_GLOB_SAMPLE)

In [6]:
# Sanity check: how many files did the glob pick up?
messages_rdd.count()

22208

Write a parser function that takes in a message `m` and parses out the requested data, returning each message as a tuple

In [7]:
def parse_message(m):
    """Extract the main header fields from one raw e-mail message.

    Only the header section (everything before the first blank line) is
    inspected, so header-like lines quoted in the body -- for example the
    "From:" / "To:" lines of a forwarded message -- cannot overwrite the
    real values.  Header names are matched case-insensitively and must be
    followed by a colon.  Folded headers (continuation lines starting with
    a space or tab) are unfolded into a single space-separated value.  If
    a header occurs more than once, the first occurrence is kept.

    Args:
        m: full text of the message (headers, blank line, body).

    Returns:
        Tuple ``(message_id, message_date, message_to, message_from, m)``.
        A field that is missing from the headers is returned as ``''``;
        the last element is the unmodified input text.
    """
    # Lower-cased header name -> position in the returned tuple.
    positions = {"message-id": 0, "date": 1, "to": 2, "from": 3}
    values = ['', '', '', '']
    seen = set()
    current = None  # slot of the wanted header a continuation line extends

    # Leading blank lines are skipped so a stray newline at the top of the
    # file does not make the whole header section look empty.
    for line in m.lstrip('\r\n').split('\n'):
        line = line.rstrip('\r')  # tolerate CRLF line endings
        if not line:
            break  # first blank line: end of headers, the body follows

        if line[0] in ' \t':
            # Folded continuation of the previous header line.
            if current is not None:
                values[current] = (values[current] + ' ' + line.strip()).strip()
            continue

        # Split on the first colon only, so values such as times keep theirs.
        name, colon, value = line.partition(':')
        slot = positions.get(name.rstrip().lower()) if colon else None
        if slot is None or slot in seen:
            current = None  # uninteresting or repeated header
            continue

        seen.add(slot)
        values[slot] = value.strip()
        current = slot

    return (values[0], values[1], values[2], values[3], m)

Parse each message out to a new RDD

In [8]:
# Each record is a key/value pair and only the value (the message text) is
# parsed.  The pair is indexed rather than unpacked in the lambda signature:
# tuple parameters are Python 2-only syntax and a SyntaxError on Python 3.
vmes_rdd = messages_rdd.map(lambda path_and_text: parse_message(path_and_text[1]))

Make sure our message parser works

In [9]:
# Fetch the first three parsed tuples for a visual check of the parser.
vmes_rdd.take(3)

[(u'<18782981.1075855378110.JavaMail.evans@thyme>',
  u'Mon, 14 May 2001 16:39:00 -0700 (PDT)',
  u'tim.belden@enron.com',
  u'phillip.allen@enron.com',
  u"Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>\nDate: Mon, 14 May 2001 16:39:00 -0700 (PDT)\nFrom: phillip.allen@enron.com\nTo: tim.belden@enron.com\nSubject: \nMime-Version: 1.0\nContent-Type: text/plain; charset=us-ascii\nContent-Transfer-Encoding: 7bit\nX-From: Phillip K Allen\nX-To: Tim Belden <Tim Belden/Enron@EnronXGate>\nX-cc: \nX-bcc: \nX-Folder: \\Phillip_Allen_Jan2002_1\\Allen, Phillip K.\\'Sent Mail\nX-Origin: Allen-P\nX-FileName: pallen (Non-Privileged).pst\n\nHere is our forecast\n\n "),
 (u'<15464986.1075855378456.JavaMail.evans@thyme>',
  u'Fri, 4 May 2001 13:51:00 -0700 (PDT)',
  u'john.lavorato@enron.com',
  u'phillip.allen@enron.com',
  u"Message-ID: <15464986.1075855378456.JavaMail.evans@thyme>\nDate: Fri, 4 May 2001 13:51:00 -0700 (PDT)\nFrom: phillip.allen@enron.com\nTo: john.lavorato@enron.com\nSubj

Convert to a DataFrame and make sure it looks OK, so we can use the CSV writer with its quote options

In [10]:
# Give the five tuple positions produced by parse_message column names so
# the RDD can be handled as a DataFrame.
messages_df = sqlContext.createDataFrame(
    vmes_rdd,
    schema=[
        'message_id',
        'message_date',
        'message_to',
        'message_from',
        'message_text',
    ],
)

In [11]:
# Print the first five rows; long cell values are truncated in the display.
messages_df.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|          message_id|        message_date|          message_to|        message_from|        message_text|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|<18782981.1075855...|Mon, 14 May 2001 ...|tim.belden@enron.com|phillip.allen@enr...|Message-ID: <1878...|
|<15464986.1075855...|Fri, 4 May 2001 1...|john.lavorato@enr...|phillip.allen@enr...|Message-ID: <1546...|
|<24216240.1075855...|Wed, 18 Oct 2000 ...|leah.arsdall@enro...|phillip.allen@enr...|Message-ID: <2421...|
|<13505866.1075863...|Mon, 23 Oct 2000 ...|randall.gay@enron...|phillip.allen@enr...|Message-ID: <1350...|
|<30922949.1075863...|Thu, 31 Aug 2000 ...|greg.piper@enron.com|phillip.allen@enr...|Message-ID: <3092...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



Write to CSV with quotes around text

In [12]:
# coalesce(1) collapses the data into one partition before writing.
# "quoteAll" quotes every field, which the free-form message text needs.
# mode("overwrite") replaces the output of a previous run: the default save
# mode raises when the target path already exists, which would make this
# cell fail every time the notebook is re-run.
(
    messages_df
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .option("quoteAll", True)
    .csv('./data/parsed_messages.csv')
)