In [345]:
import pandas as pd
import os
import email
import time
import csv
import sys
from sklearn.model_selection import train_test_split
import math
from sklearn.feature_extraction.text import TfidfVectorizertfidfv = TfidfVectorizer(min_df=1,stop_words='english')
#unique delim to separate columns without messing up content
DELIMETER = chr(255)

#needed because a few of the emails were too large for the default csv cell size
csv.field_size_limit(sys.maxsize)

9223372036854775807

In [2]:
#set paths (currently pathing for remote AWS EC2 Instance)
root_path = '/home/ubuntu/ssl/notebooks/data/maildir'
saved_file = '/home/ubuntu/ssl/notebooks/processed enron emails.csv'
testing_batch = False

In [3]:
def process_emails_into_df(origin, destination, testbool):
    #safety check to see if the file has been created so I don't overwrite a finished file
    already_processed = os.path.isfile(destination) 

    #if no file already exists start the writing process
    if not already_processed:

        #list for holding file paths
        list_of_files = []    

        #get the path for all the email files
        for path, subdirs, files in os.walk(origin):
            for name in files:
                list_of_files.append(os.path.join(path, name))

        email_count = len(list_of_files)
        #caluculate 1% of the number of files, used for progress report printing.
        onepcofemail = (email_count//100)



        #create dataframe to store data to be written
        emails = pd.DataFrame()
        print('Starting. {}'.format(time.strftime("%I:%M:%S")))

        #counter for testing batches
        x = 0

        #iterrate through all the files found
        for i, myfile in enumerate(list_of_files):

            #if running a testing session to break before reading all file to inspect the saved csv
            if x > 6000 and testbool: break

            #read individual email file
            with open(myfile, 'r', encoding='utf-8', errors='replace') as filepath:
                #read in the email
                message=email.message_from_string(filepath.read())

                #save the important components
                emails.loc[i,'Message-ID']=message['Message-ID']
                emails.loc[i,'from']=message['from']
                emails.loc[i,'subject']=message['subject']
                emails.loc[i,'to']=message['to']
                emails.loc[i,'cc']=message['cc']
                emails.loc[i,'bcc']=message['Bcc']
                emails.loc[i,'date']=message['date']
                emails.loc[i,'file']=message['X-FileName']

                #check that the body is one or multiple sections and save it
                if message.is_multipart():
                    string = ''
                    for payload in message.get_payload():
                        print(payload.get_payload())
                        string = string + payload.get_payload()
                        emails.loc[i,'body'] = string
                else:
                    emails.loc[i,'body'] =  message.get_payload()

            #append entry to the csv file
            with open(destination, 'a') as f:
                #if this is the first entry use the headers, if not, dont.True
                if i == 0: emails.to_csv(f, header=emails.columns, index = False, sep=DELIMETER)
                else: emails.to_csv(f, header=None, index = False, sep=DELIMETER)

            #clear the dataframe to save memory
            emails = pd.DataFrame()        

            #print statement updating progress report
            if i % onepcofemail == 0: 
                print('{}% finished. {}'.format(i//onepcofemail, time.strftime("%I:%M:%S")), end="\r")

            #itterate counter for testing batches
            x = x+1


        #process complete
        print('Done! Exported {} lines to CSV {}'.format(x, time.strftime("%I:%M:%S")))

    #prints if the file already exists and there is no need to process the individual emails into a csv
    else: print('****************File Previously Processed, Delete File to Process Again********************')

In [4]:
process_emails_into_df(root_path, saved_file, testing_batch)

****************File Previously Processed, Delete File to Process Again********************


In [5]:
print('Loading File. {}'.format(time.strftime("%I:%M:%S")))
df = pd.read_csv(saved_file, sep=DELIMETER, engine='python')
print('File Loaded! {}'.format(time.strftime("%I:%M:%S")))

Loading File. 09:42:02
File Loaded! 09:42:22


In [15]:
def filter_small_senders(dataframe, min_email_count, new_file_dest):
    path='/home/ubuntu/ssl/notebooks/' + new_file_dest
    if not os.path.isfile(path): 
        unique_senders_inc_small = dataframe['from'].unique()
        filtered_df = pd.DataFrame(columns = dataframe.columns)

        for i, sender in enumerate(unique_senders_inc_small):
            temp = dataframe[dataframe['from'] == sender]
            print('{}% done'.format(round(i/len(unique_senders_inc_small)*100,2)) , end="\r")

            if temp.shape[0] > min_email_count and '@enron.com' in sender:
                filtered_df = filtered_df.append(temp, ignore_index = True)
        filtered_df.to_csv(path, sep=DELIMETER, engine='python')
    
    else: 
        filtered_df = pd.read_csv(path, sep=DELIMETER, engine='python')
        print('Loaded a previously processed csv, to reprocess delete {}'.format(new_file_dest))
        
    return filtered_df

In [325]:
new_df = filter_small_senders(df, 50, 'small_senders_filtered')

Loaded a previously processed csv, to reprocess delete small_senders_filtered


In [407]:
new_df.tail()

Unnamed: 0.1,Unnamed: 0,Message-ID,from,subject,to,cc,bcc,date,file,body
378377,378377,<27555612.1075844385188.JavaMail.evans@thyme>,jeff.nogid@enron.com,Copies of ISDA documents,sara.shackleton@enron.com,,,"Fri, 11 Aug 2000 06:18:00 -0700 (PDT)",sshackle.nsf,Sara-\n\nAnnMarie Tiller has asked for:\n\n1) ...
378378,378378,<10479620.1075844378616.JavaMail.evans@thyme>,jeff.nogid@enron.com,Issuer Forward Transactions,sara.shackleton@enron.com,,,"Mon, 24 Jul 2000 07:16:00 -0700 (PDT)",sshackle.nsf,Sara-\n\nHere is a term sheet from another pot...
378379,378379,<26219107.1075855402800.JavaMail.evans@thyme>,jeff.nogid@enron.com,FW: ENE equity swap for 6/18/01,"sara.shackleton@enron.com, steve.ross@enron.com",,,"Tue, 19 Jun 2001 07:28:11 -0700 (PDT)",sshackl (Non-Privileged).pst,Here is the new roll of this Equity Swap.\n\n ...
378380,378380,<16324989.1075863201108.JavaMail.evans@thyme>,jeff.nogid@enron.com,FW: OTC physical expiration notice,sara.shackleton@enron.com,,,"Thu, 1 Nov 2001 08:33:48 -0800 (PST)",SSHACKL (Non-Privileged).pst,"\n\n-----Original Message-----\nFrom: Smoller,..."
378381,378381,<29386777.1075858812938.JavaMail.evans@thyme>,jeff.nogid@enron.com,FW: Forward Amendment,sara.shackleton@enron.com,steve.ross@enron.com,steve.ross@enron.com,"Fri, 21 Sep 2001 13:46:30 -0700 (PDT)",SSHACKL (Non-Privileged).pst,\n\n -----Original Message-----\nFrom: \tChris...


In [423]:
new_df = new_df.dropna(subset = ['to', 'cc', 'bcc'], how = 'all')
new_df = new_df.dropna(subset = ['body'])

In [424]:
X_train, X_test = train_test_split(new_df, test_size= 0.25)

In [456]:
def get_sender_recip_list(dataframe):
    
    #empty df for leading up addresses
    sender_by_recip = pd.DataFrame(columns = ['sender','recipients', 'booksize'])
    
    #list of unique senders
    unique_senders = dataframe['from'].unique()
    
    #arrays for holding values to be stored
    senders = []
    recipients = []
    addressbooksize = []
    
    #itterate through each unique email writer
    for i, sender in enumerate(unique_senders):
        
        #make a list of the samples from this itterations sender
        users_emails = dataframe[dataframe['from'] == sender]
        
        all_recips = users_emails['to'].tolist() + users_emails['cc'].tolist() + users_emails['bcc'].tolist() 
        
        
        
        senders.append(sender)
        
        name = []
        for l in all_recips:
            if not isinstance(l, str): pass
            else: 
                l = l.replace('\t','').replace('\n','')
                name.append(l.split(','))
        
        name = [item for sublist in name for item in sublist]
        name = [item for sublist in name for item in sublist]
        unique_recips = list(set(name))
        recipients.append(unique_recips)
        
        addressbooksize.append(len(unique_recips))
        
        print('{}% done'.format(round(i/len(unique_senders)*100,2)) , end="\r")
    
    sender_by_recip['sender'] = senders
    sender_by_recip['recipients'] = recipients
    sender_by_recip['booksize'] = addressbooksize
    return sender_by_recip

In [457]:
senders_to = get_sender_recip_list(X_train)

99.89% done

In [475]:
def calc_mean_idfs(addressbook, emails):
    tfidfv = TfidfVectorizer(min_df=1)
    if not os.path.isfile('/home/ubuntu/ssl/notebooks/meaned_tf'):
        unique_senders = addressbook['sender'].unique()

        senders = []
        recipients = []
        tf_idf_body = []

        for i, sender in enumerate(unique_senders):
            this_guys_recips = addressbook[addressbook['sender'] == sender].recipients
            this_guys_recips = [item for sublist in this_guys_recips for item in sublist]
            for j, name in enumerate(this_guys_recips):
                recip = name.strip(' ')
                combo_s_r = emails[(emails['from']==sender) & 
                             ((emails['to'].str.contains(recip)) | 
                              (emails['cc'].str.contains(recip)) | 
                              (emails['bcc'].str.contains(recip)))]
                print('{} / {} senders of {} / {} recipients done'.format(
                                                        i, len(unique_senders),
                                                        j, len(this_guys_recips)
                                                        ), end="\r")
                senders.append(sender)
                recipients.append(recip)
                tf_idf_body.append(np.mean(tfidfv.fit_transform(combo_s_r.body)))

        meaned_tf = pd.DataFrame(columns = ['sender', 'recipient', 'mean_tf_idf'])
        meaned_tf['sender'] = senders
        meaned_tf['recipient'] = recipients
        meaned_tf['mean_tf_idf'] = tf_idf_body
        
        print('Saving CSV, please wait.')
        meaned_tf.to_csv('/home/ubuntu/ssl/notebooks/meaned_tf')
        
    else: 
        print('There is already a processed and saved file, delete it to repeat process. Loading File!')
        meaned_tf = pd.read_csv('/home/ubuntu/ssl/notebooks/meaned_tf')
    return meaned_tf

In [None]:
print('Starting {}'.format(time.strftime("%I:%M:%S")))
meaned_idf = calc_mean_idfs(senders_to, new_df)
print('Finished {}'.format(time.strftime("%I:%M:%S")))

Starting 01:31:16
7 / 922 senders of 31 / 41 recipients done