In [1]:
import ast
import csv
import email
import math
import os
import re
import sys
import time

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

#column separator for the intermediate CSV files: chr(255) is the single
#character 'ÿ' (U+00FF), chosen so that commas, tabs and newlines inside the
#email text are not mistaken for column breaks
DELIMETER = chr(255)

#raise the csv module's per-field size limit; a few of the emails are larger
#than the default limit allows
csv.field_size_limit(sys.maxsize)
#absolute paths on the remote AWS EC2 instance: root_path is the directory that
#is walked for raw email files, saved_file is the CSV the parsed emails go to
root_path = '/home/ubuntu/ssl/notebooks/data/maildir'
saved_file = '/home/ubuntu/ssl/notebooks/processed enron emails.csv'
#passed to process_emails_into_df as testbool; when True that function stops
#after roughly 6000 files so the resulting CSV can be inspected
testing_batch = False

In [None]:
def _email_body_text(message):
    """Return the text body of ``message``.

    A single-part message returns its payload unchanged. For multipart mail the
    string payloads of all leaf parts are concatenated in document order.
    """
    if not message.is_multipart():
        return message.get_payload()

    #walk() also descends into nested multiparts; the payload of a multipart
    #container is a list of messages, not text, so only leaf parts are joined
    pieces = []
    for part in message.walk():
        if part.is_multipart():
            continue
        text = part.get_payload()
        if isinstance(text, str):
            pieces.append(text)
    return ''.join(pieces)


def process_emails_into_df(origin, destination, testbool):
    """Parse every email file under ``origin`` into a CSV and return it as a DataFrame.

    If ``destination`` already exists it is treated as a finished export and is
    simply loaded. Otherwise every file found below ``origin`` is parsed with
    the ``email`` module and written, one row per email, to ``destination``
    using ``DELIMETER`` as the column separator.

    Parameters
    ----------
    origin : str
        Directory that is walked recursively for email files.
    destination : str
        Path of the CSV to create or, if it already exists, to load.
    testbool : bool
        When true, stop after the first 6001 files (testing batch).

    Returns
    -------
    pandas.DataFrame
        Columns Message-ID, from, subject, to, cc, bcc, date, file, body, read
        back from ``destination`` so both code paths return the same thing.
        An empty frame with those columns is returned, and nothing is written,
        when ``origin`` contains no files.
    """
    columns = ['Message-ID', 'from', 'subject', 'to', 'cc', 'bcc', 'date', 'file', 'body']
    #number of files after which a testing batch stops
    test_batch_limit = 6000

    #safety check so a finished file is never overwritten
    if not os.path.isfile(destination):

        #get the path for all the email files
        list_of_files = [os.path.join(path, name)
                         for path, _subdirs, files in os.walk(origin)
                         for name in files]
        email_count = len(list_of_files)

        if email_count == 0:
            print('No email files found under {}'.format(origin))
            return pd.DataFrame(columns=columns)

        #progress is reported about once per percent; never let the step be 0,
        #which would make the modulo below divide by zero for small folders
        onepcofemail = max(1, email_count // 100)

        print('Starting. {}'.format(time.strftime("%I:%M:%S")))

        #write to a scratch file and rename it once complete, so an interrupted
        #run cannot leave a half-written file that later looks finished
        partial_path = destination + '.partial'

        #counter for testing batches
        x = 0

        #utf-8 matches the default encoding pd.read_csv uses when loading the file
        with open(partial_path, 'w', encoding='utf-8', newline='') as out:
            for i, myfile in enumerate(list_of_files):

                #if running a testing session, stop early to inspect the csv
                if x > test_batch_limit and testbool:
                    break

                #read individual email file
                with open(myfile, 'r', encoding='utf-8', errors='replace') as handle:
                    message = email.message_from_string(handle.read())

                #save the important components
                record = {
                    'Message-ID': message['Message-ID'],
                    'from': message['from'],
                    'subject': message['subject'],
                    'to': message['to'],
                    'cc': message['cc'],
                    'bcc': message['Bcc'],
                    'date': message['date'],
                    'file': message['X-FileName'],
                    'body': _email_body_text(message),
                }

                #one row per email keeps memory flat; only the first row carries the header
                pd.DataFrame([record], columns=columns).to_csv(
                    out, header=(i == 0), index=False, sep=DELIMETER)

                #print statement updating progress report
                if i % onepcofemail == 0:
                    print('{}% finished. {}'.format(100 * i // email_count, time.strftime("%I:%M:%S")), end="\r")

                x = x + 1

        os.replace(partial_path, destination)

        #process complete
        print('Done! Exported {} lines to CSV {}'.format(x, time.strftime("%I:%M:%S")))
        return pd.read_csv(destination, sep=DELIMETER, engine='python')

    #the file already exists, so there is no need to process the individual emails again
    print('**********There is already a Process CSV of All Emails, loading Now************', end="\r")
    all_emails = pd.read_csv(destination, sep=DELIMETER, engine='python')
    print('***Processed Emails into DF Loaded From CSV, Delete File to Process Again******')
    return all_emails

def filter_unwanted_senders(dataframe, new_file_dest):
    """Keep only the emails whose sender address contains '@enron.com'.

    The filtered frame is cached under /home/ubuntu/ssl/notebooks/ with the
    file name ``new_file_dest``; if that file already exists it is loaded
    instead of being recomputed. In both cases rows with no recipient at all
    (to, cc and bcc all missing) and rows without a body are dropped before
    returning.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Emails with at least the columns from, to, cc, bcc and body.
    new_file_dest : str
        File name of the cache, relative to the notebooks directory.

    Returns
    -------
    pandas.DataFrame
        The Enron-sender rows, grouped by sender in order of first appearance.
    """
    path='/home/ubuntu/ssl/notebooks/' + new_file_dest

    #check to see if file has already been processed and saved
    if not os.path.isfile(path):

        #a missing sender is read back from CSV as NaN (a float), so test the
        #type first instead of applying `in` to a non-string
        is_enron = dataframe['from'].map(
            lambda sender: isinstance(sender, str) and '@enron.com' in sender
        ).astype(bool)
        enron_rows = dataframe[is_enron]

        #keep each sender's emails together, senders ordered by first
        #appearance; mergesort is stable, so emails keep their relative order
        sender_rank = pd.factorize(enron_rows['from'])[0]
        grouped_order = np.argsort(sender_rank, kind='mergesort')
        filtered_df = enron_rows.iloc[grouped_order].reset_index(drop=True)

        #save new list of senders
        filtered_df.to_csv(path, sep=DELIMETER, index = False)

    #if it has already been done don't do it again
    else:
        print('*********Previously Filtered Senders List. Loading Old File!*********', end="\r")
        filtered_df = pd.read_csv(path, sep=DELIMETER, engine='python')
        print('****To Reprocess Filtered Senders Delete {}.  Loaded Old File!*******'.format(new_file_dest))

    filtered_df = filtered_df.dropna(subset = ['to', 'cc', 'bcc'], how = 'all')
    filtered_df = filtered_df.dropna(subset = ['body'])
    return filtered_df

def _split_recipient_header(header_value):
    """Split one to/cc/bcc header value into a list of cleaned recipient names.

    Non-string values (missing headers) give an empty list. For entries of the
    form ``Name <address>`` only the part between the angle brackets is kept;
    entries with '<' but no closing '>' are skipped. Quotes and surrounding
    whitespace are removed and empty entries are dropped.
    """
    if not isinstance(header_value, str):
        return []

    cleaned = []
    flattened = header_value.replace('\t', '').replace('\n', '')
    for name in flattened.split(','):
        if '<' in name:
            bracketed = re.search('<(.*)>', name)
            if bracketed is None:
                #mistyped address without a closing bracket, ignore it
                continue
            name = bracketed.group(1)
        #strip so ' a@x' and 'a@x' count as the same recipient
        name = name.replace('"', '').replace("'", '').strip()
        if name:
            cleaned.append(name)
    return cleaned


def _parse_saved_list(cell):
    """Turn the text form of a list, as stored in the CSV cache, back into a list."""
    if not isinstance(cell, str):
        return cell
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        return cell
    return list(parsed) if isinstance(parsed, (list, tuple, set)) else cell


def get_sender_recip_list(dataframe):
    """Build one row per sender holding the list of everyone they wrote to.

    The result is cached in /home/ubuntu/ssl/notebooks/sender_recip_df_list;
    when that file exists it is loaded and its ``recipients`` column is parsed
    back into lists, so both code paths return the same structure.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Emails with the columns from, to, cc and bcc.

    Returns
    -------
    pandas.DataFrame
        Columns ``sender``, ``recipients`` (sorted list of unique addresses
        taken from to, cc and bcc) and ``booksize`` (length of that list).
        Rows whose sender is missing are not represented.
    """
    new_file = '/home/ubuntu/ssl/notebooks/sender_recip_df_list'

    if not os.path.isfile(new_file):

        #arrays for holding values to be stored
        senders = []
        recipients = []
        addressbooksize = []

        #one pass over the data; sort=False keeps senders in order of first appearance
        emails_by_sender = dataframe.groupby('from', sort=False)
        sender_count = emails_by_sender.ngroups

        for i, (sender, users_emails) in enumerate(emails_by_sender):
            all_recips = users_emails['to'].tolist() + users_emails['cc'].tolist() + users_emails['bcc'].tolist()

            names = set()
            for header_value in all_recips:
                names.update(_split_recipient_header(header_value))

            #sorted so the output does not depend on set iteration order
            unique_recips = sorted(names)

            senders.append(sender)
            recipients.append(unique_recips)
            addressbooksize.append(len(unique_recips))

            print('{}% of sender + recip dataframe completed        '.format(round(i/sender_count*100,2)) , end="\r")

        sender_by_recip = pd.DataFrame(
            {'sender': senders, 'recipients': recipients, 'booksize': addressbooksize},
            columns = ['sender','recipients', 'booksize'])
        print('********************* Sender Recip Dataframe Completed ******************')
        sender_by_recip.to_csv(new_file, sep=DELIMETER, index = False)
        return sender_by_recip
    else:
        print('********************* Sender Recip Already Processed ********************', end="\r")
        sender_by_recip = pd.read_csv(new_file, sep=DELIMETER, engine = 'python')
        #the CSV stores each list as text; convert it back so callers can iterate recipients
        sender_by_recip['recipients'] = sender_by_recip['recipients'].map(_parse_saved_list)
        print('************ Sender Recip Dataframe with Lists Already Loaded ***********')
        return sender_by_recip

def shrink_addressbooks(sender_recip_df, low, high):
    """Return the rows whose ``booksize`` is strictly greater than ``low`` and strictly less than ``high``."""
    sizes = sender_recip_df['booksize']
    within_bounds = (sizes > low) & (sizes < high)
    return sender_recip_df[within_bounds]

def _recipients_as_list(cell):
    """Return the recipients stored in one ``recipients`` cell as a list of strings.

    The cell is a real list when the address book was just computed, but the
    text form of a list (for example "['a@x', 'b@x']") when it was read back
    from a CSV cache. A string that is not a list literal is treated as a
    single recipient; any other value gives an empty list.
    """
    if isinstance(cell, (list, tuple, set)):
        return list(cell)
    if isinstance(cell, str):
        try:
            parsed = ast.literal_eval(cell)
        except (ValueError, SyntaxError):
            return [cell]
        return list(parsed) if isinstance(parsed, (list, tuple, set)) else [cell]
    return []


def list_of_s_r_combos(addressbook, emails):
    """Expand the address book into one row per (sender, recipient) pair.

    The result is cached in /home/ubuntu/ssl/notebooks/list_of_s_r and loaded
    from there when the file exists.

    Parameters
    ----------
    addressbook : pandas.DataFrame
        Columns ``sender`` and ``recipients``; each ``recipients`` cell is a
        list of addresses or the text form of such a list.
    emails : object
        Not used; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``sender`` and ``recipient``. Recipient names are stripped of
        surrounding spaces; empty names and repeated pairs are left out.
    """
    cache_path = '/home/ubuntu/ssl/notebooks/list_of_s_r'

    if not os.path.isfile(cache_path):
        senders = []
        recipients = []

        #sort=False keeps the senders in the order they appear in the address book
        for sender, recipient_cells in addressbook.groupby('sender', sort=False)['recipients']:
            already_paired = set()
            for cell in recipient_cells:
                for name in _recipients_as_list(cell):
                    recip = name.strip(' ')
                    #an empty recipient would later match every email when used
                    #as a substring, and a repeated one would be scored twice
                    if not recip or recip in already_paired:
                        continue
                    already_paired.add(recip)
                    senders.append(sender)
                    recipients.append(recip)

        s_r_combos = pd.DataFrame({'sender': senders, 'recipient': recipients},
                                  columns = ['sender', 'recipient'])
        print('Processed All Sender/Reciever Combos')

        s_r_combos.to_csv(cache_path, index = False)
        print('***************Sender/Reciever Pair DataFrame Processed ***************.')
    else:
        print('**********Send/Recieve Pair DF already processed. Loading CSV!**********', end="\r")
        s_r_combos = pd.read_csv(cache_path)
        print('*****Send/Recieve Pair CSV loaded! Delete list_of_s_r to reprocess******')
    return s_r_combos

def tfidf(txt, vectorizer):
    """Return the TF-IDF weights of ``txt`` rescaled so that they sum to 1.

    Parameters
    ----------
    txt : iterable of str
        The documents to score.
    vectorizer : object
        Anything with a ``fit_transform`` method returning a sparse matrix
        that supports ``tocoo()``; it is refitted on ``txt`` at every call.

    Returns
    -------
    list
        One value per stored entry of the fitted matrix, each divided by the
        sum of all entries. ``[0]`` is returned when the vectorizer raises
        ValueError or when there is nothing to normalise by.
    """
    try:
        X = vectorizer.fit_transform(txt)
    except ValueError:
        return [0]

    weights = list(X.tocoo().data)
    #sum once instead of once per element
    total = sum(weights)
    if total == 0:
        return [0]
    return [weight / total for weight in weights]

def _parse_saved_scores(cell):
    """Turn the text form of a score list, as stored in the CSV cache, back into a list."""
    if not isinstance(cell, str):
        return cell
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        return cell
    return list(parsed) if isinstance(parsed, (list, tuple)) else cell


def add_tf_score(combo_list, emails):
    """Score every sender/recipient pair with the normalised TF-IDF weights of their emails.

    For each row of ``combo_list`` the bodies of all emails sent by ``sender``
    whose to, cc or bcc header contains ``recipient`` are passed to ``tfidf``,
    and the resulting list is stored in a new ``tf_idf_mean`` column. The
    result is cached in /home/ubuntu/ssl/notebooks/scored_w_mean_idf and
    loaded from there when that file exists.

    Parameters
    ----------
    combo_list : pandas.DataFrame
        Columns ``sender`` and ``recipient``. When the scores are computed the
        ``tf_idf_mean`` column is added to this frame in place.
    emails : pandas.DataFrame
        Emails with the columns from, to, cc, bcc and body.

    Returns
    -------
    pandas.DataFrame
        ``combo_list`` with the ``tf_idf_mean`` column of score lists.
    """
    #saved file location
    new_file = '/home/ubuntu/ssl/notebooks/scored_w_mean_idf'

    #if already completed calculations before do not recalculate or overwrite file
    if not os.path.isfile(new_file):

        #list for holding the score lists
        tf_idf_body =[]
        tfidfv = TfidfVectorizer(min_df=1)

        pair_count = len(combo_list)

        #emails of the sender currently being processed; refreshed only when
        #the sender changes, so the full frame is not rescanned for every pair
        current_sender = None
        sender_emails = None

        #iterate through all sender/recipient combos in the dataframe
        for position, (index, row) in enumerate(combo_list.iterrows()):
            #current sender
            from_ = row['sender']
            #current recipient
            to_ = row['recipient']

            if sender_emails is None or from_ != current_sender:
                current_sender = from_
                sender_emails = emails[emails['from'] == from_]

            if isinstance(to_, str) and to_:
                #regex=False: an address is literal text, not a pattern ('.' and
                #'+' must not act as regex operators); na=False: a missing
                #header simply does not match
                addressed = (sender_emails['to'].str.contains(to_, regex=False, na=False)
                             | sender_emails['cc'].str.contains(to_, regex=False, na=False)
                             | sender_emails['bcc'].str.contains(to_, regex=False, na=False))
                combo_s_r = sender_emails.loc[addressed, 'body']
            else:
                #missing or empty recipient: nothing can be selected
                combo_s_r = sender_emails['body'].iloc[0:0]

            #calculate the normalised tf_idf weights
            tf_idf_mean = tfidf(combo_s_r, tfidfv)

            #store plain Python numbers so the list can be parsed back from the CSV text
            tf_idf_body.append([value.item() if hasattr(value, 'item') else value
                                for value in tf_idf_mean])

            #print status updates
            print('{} % done Time: {}                   '.format(round(100 * position / pair_count, 2),time.strftime("%I:%M:%S")), end="\r")

        #append tf_idf scores to dataframe in new column
        combo_list['tf_idf_mean'] = tf_idf_body

        #save file with unique delimeter; index=False so a reload has the same columns
        combo_list.to_csv(new_file, sep=DELIMETER, index=False)
        return combo_list
    else:

        #let user know process was previously completed and open previously processed file
        print('Scores were previously calculated.  Opening saved File')
        combo_list = pd.read_csv(new_file, sep=DELIMETER, engine = 'python')
        #the CSV stores each score list as text; convert it back to a list
        if 'tf_idf_mean' in combo_list.columns:
            combo_list['tf_idf_mean'] = combo_list['tf_idf_mean'].map(_parse_saved_scores)
        print('*********TF_IDF scores file Opened *******************')
        return combo_list


In [46]:
#seed for the train/test split; the later steps cache results on disk that
#depend on which emails end up in the training set, so the split must be repeatable
SPLIT_SEED = 42

#read in all files to process emails into dataframe
all_emails = process_emails_into_df(root_path, saved_file, testing_batch)

#filter out the non enron and make a list of their email addresses
#and the people they sent emails to
wanted_emails = filter_unwanted_senders(all_emails, 'small_senders_filtered')

#split data into train and test sets
X_train, X_test = train_test_split(wanted_emails, test_size= 0.25, random_state=SPLIT_SEED)

#compile a dataframe of all senders and all unique recipients in second column
senders_to = get_sender_recip_list(X_train)

#eliminate senders who have fewer than low or more than high number of recipients
smaller_book = shrink_addressbooks(senders_to, low = 50, high = 300)

#parse lists of recipients into unique entrys, each row has 1 sender and 1 reciever
list_of_s_r = list_of_s_r_combos(smaller_book, wanted_emails)

***Processed Emails into DF Loaded From CSV, Delete File to Process Again******
****To Reprocess Filtered Senders Delete small_senders_filtered.  Loaded Old File!*******
************ Sender Recip Dataframe with Lists Already Loaded ***********
*****Send/Recieve Pair CSV loaded! Delete list_of_s_r to reprocess******


In [49]:
#free the two large intermediates that are no longer needed; pop() is used
#instead of del so that running this cell a second time does not raise NameError
globals().pop('all_emails', None)
globals().pop('senders_to', None)

In [None]:
#score every sender/recipient pair, reporting the wall-clock time before and after
started_at = time.strftime("%I:%M:%S")
print('Starting {}'.format(started_at))

scores = add_tf_score(list_of_s_r, wanted_emails)

finished_at = time.strftime("%I:%M:%S")
print('Finished {}'.format(finished_at))

Starting 04:45:58
0.04 % done Time: 04:46:49                   