In [1]:
import pandas as pd
import os
import email
import time
import csv
import sys

#unique delim to separate columns without messing up content
DELIMETER = chr(255)

#needed because a few of the emails were too large for the default csv cell size
csv.field_size_limit(sys.maxsize)

131072

In [2]:
#set paths (currently pathing for remote AWS EC2 Instance)
root_path = '/home/ubuntu/ssl/notebooks/data/maildir'
saved_file = '/home/ubuntu/ssl/notebooks/processed enron emails.csv'
testing_batch = False

In [17]:
def process_emails_into_df(origin, destination, testbool):
    #safety check to see if the file has been created so I don't overwrite a finished file
    already_processed = os.path.isfile(destination) 

    #if no file already exists start the writing process
    if not already_processed:

        #list for holding file paths
        list_of_files = []    

        #get the path for all the email files
        for path, subdirs, files in os.walk(origin):
            for name in files:
                list_of_files.append(os.path.join(path, name))

        email_count = len(list_of_files)
        #caluculate 1% of the number of files, used for progress report printing.
        onepcofemail = (email_count//100)



        #create dataframe to store data to be written
        emails = pd.DataFrame()
        print('Starting. {}'.format(time.strftime("%I:%M:%S")))

        #counter for testing batches
        x = 0

        #iterrate through all the files found
        for i, myfile in enumerate(list_of_files):

            #if running a testing session to break before reading all file to inspect the saved csv
            if x > 6000 and testbool: break

            #read individual email file
            with open(myfile, 'r', encoding='utf-8', errors='replace') as filepath:
                #read in the email
                message=email.message_from_string(filepath.read())

                #save the important components
                emails.loc[i,'Message-ID']=message['Message-ID']
                emails.loc[i,'from']=message['from']
                emails.loc[i,'subject']=message['subject']
                emails.loc[i,'to']=message['to']
                emails.loc[i,'cc']=message['cc']
                emails.loc[i,'bcc']=message['Bcc']
                emails.loc[i,'date']=message['date']
                emails.loc[i,'file']=message['X-FileName']

                #check that the body is one or multiple sections and save it
                if message.is_multipart():
                    string = ''
                    for payload in message.get_payload():
                        print(payload.get_payload())
                        string = string + payload.get_payload()
                        emails.loc[i,'body'] = string
                else:
                    emails.loc[i,'body'] =  message.get_payload()

            #append entry to the csv file
            with open(destination, 'a') as f:
                #if this is the first entry use the headers, if not, dont.True
                if i == 0: emails.to_csv(f, header=emails.columns, index = False, sep=DELIMETER)
                else: emails.to_csv(f, header=None, index = False, sep=DELIMETER)

            #clear the dataframe to save memory
            emails = pd.DataFrame()        

            #print statement updating progress report
            if i % onepcofemail == 0: 
                print('{}% finished. {}'.format(i//onepcofemail, time.strftime("%I:%M:%S")), end="\r")

            #itterate counter for testing batches
            x = x+1


        #process complete
        print('Done! Exported {} lines to CSV {}'.format(x, time.strftime("%I:%M:%S")))

    #prints if the file already exists and there is no need to process the individual emails into a csv
    else: print('****************File Previously Processed, Delete File to Process Again********************')

In [19]:
process_emails_into_df(root_path, saved_file, testing_batch)

****************File Previously Processed, Delete File to Process Again********************


In [4]:
print('Loading File. {}'.format(time.strftime("%I:%M:%S")))
df = pd.read_csv(saved_file, sep=DELIMETER, engine='python')
print('File Loaded! {}'.format(time.strftime("%I:%M:%S")))

Loading File. 01:16:01
File Loaded! 01:16:22


In [141]:
def filter_small_senders(dataframe, min_email_count):
    unique_senders_inc_small = dataframe['from'].unique()
    filtered_df = pd.DataFrame(columns = dataframe.columns)
    
    for i, sender in enumerate(unique_senders_inc_small):
        temp = dataframe[dataframe['from'] == sender]
        print('{}% done'.format(round(i/len(unique_senders_inc_small)*100,2)) , end="\r")
        
        if (temp.shape[0] > min_email_count) and '@enron.com' in sender:
            filtered_df = filtered_df.append(temp, ignore_index = True)
    return filtered_df

In [142]:
new_df = filter_small_senders(df, 50)

100.0% done

In [143]:
new_df.shape[0]

378382

In [232]:
from sklearn.model_selection import train_test_split
from math import isnan

In [233]:
X_train, X_test = train_test_split(new_df, test_size= 0.25)

In [348]:
def get_sender_recip_list(dataframe):
    sender_by_recip = pd.DataFrame(columns = ['sender','recipients'])
    unique_senders = dataframe['from'].unique()
    senders = []
    recipients = []
    for i, sender in enumerate(unique_senders):
        users_emails = X_test[X_test['from'] == sender]
        users_emails = users_emails.dropna(subset = ['to'])
        list_of_people = []
        
        for recips in users_emails['to']:
            if recips.count('@') == 1: list_of_people.append(recips)
            else: list_of_people.append(recips.replace('\n\t','').split(','))
        
        list_of_people = [val for sublist in list_of_people for val in sublist]
        
        unique_email_recips = list(set(list_of_people))
        senders.append(sender)
        recipients.append(' '.join(unique_email_recips))
        
        
        
        print('{}% done'.format(round(i/len(unique_senders)*100,2)) , end="\r")
    sender_by_recip['sender'] = senders
    sender_by_recip['recipients'] = recipients
    return sender_by_recip

In [349]:
senders_to = get_sender_recip_list(X_test)

99.89% done

In [372]:
senders_to.head()

Unnamed: 0,sender,recipients
0,kaye.ellis@enron.com,sstack@enron.com jarrod.cyprow@enron.com kay...
1,william.bradford@enron.com,o jpiccone@hsresources.com james.steffes@enr...
2,carol.clair@enron.com,renz22@erols.com harry.collins@enron.com john...
3,kate.symes@enron.com,chris.foster@enron.com greg.wolfe@enron.com ...
4,vince.kaminski@enron.com,robert.johnston@enron.com zimin.lu@enron.com ...


In [351]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [352]:
tfidfv = TfidfVectorizer(min_df=1,stop_words='english')

In [353]:
x_tfidf=tfidfv.fit_transform(senders_to['recipients'])

In [354]:
from sklearn.cluster import DBSCAN
db = DBSCAN()
db.fit(x_tfidf)

DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
    min_samples=5, n_jobs=1, p=None)

In [366]:
import numpy as np
labels = db.labels_
len(np.unique(labels))

3

In [371]:
senders_to[labels == 0]

Unnamed: 0,sender,recipients
0,kaye.ellis@enron.com,sstack@enron.com jarrod.cyprow@enron.com kay...
3,kate.symes@enron.com,chris.foster@enron.com greg.wolfe@enron.com ...
5,joseph.alamo@enron.com,linda.sietzema@enron.com james.steffes@enron...
7,mary.hain@enron.com,pobenchain@idahopower.com raa@vnf.com jim.b...
8,kimberly.brown@enron.com,janet.wallis@enron.com john.arnold@enron.com ...
10,mike.grigsby@enron.com,phillip.allen@enron.com curtis.hutcheson@comp...
11,no.address@enron.com,yuan.tian@enron.com s..bradford@enron.com ti...
13,lorna.brennan@enron.com,yuan.tian@enron.com sean.bolks@enron.com ti...
14,sara.shackleton@enron.com,ellen.su@enron.com s..bradford@enron.com rajar...
15,jeff.dasovich@enron.com,lloyd.will@enron.com a..gomez@enron.com aod...
