# Question 1 - Analysis 2
#### Use a list of suspected words that people have used in stemmed format
#### Check sent, delete and inbox folders of all people and get the payload of all the emails in these folders.
#### Remove numbers, punctuation and empty strings from the message body. Use nltk word_tokenize to tokenize the email body and the Porter Stemmer to stem the words from the email body.
#### Check if any email has used these suspected words. If they are present, get the person whose mail box we just scanned.
#### In a dictionary, store name of person as key and list of all suspected emails sent/received or deleted as value
#### Save dictionary in json file format

In [32]:
# Import statements
import json, os, glob, email, fnmatch, string, nltk, re, operator
from email.message import EmailMessage

In [33]:
# Build the list of every file found under the email data directory.
# NOTE: '__file__' is a string literal here, so dirname() returns '' and
# data_dir ends up relative to the current working directory.
current_dir = os.path.dirname('__file__')
data_dir = os.path.join(current_dir, '..', 'data', 'enron')

# Walk the tree top-down; the '*' pattern keeps every file name in each folder
matches = []
for root, dirnames, filenames in os.walk(data_dir):
    matches.extend(os.path.join(root, name) for name in fnmatch.filter(filenames, '*'))

In [34]:
# Translation table that deletes every ASCII punctuation character from a token
translator = str.maketrans('', '', string.punctuation)

#Suspected words and phrases as listed by the analyst (several are already in stem format)
suspected_words = ['off-balance-sheet', 'special purpose vehicles', 'SPV', 'special purposes entities', 'SPE', 'balance sheet', 'Arthur Andersen', 'Duncan', 'investig','bankruptci','fraud','feder prosecut','shred','scandal','conspiraci','insid trade']
count = 0  #Not used in this cell; kept in case another cell reads it
porter = nltk.PorterStemmer()
message_of_interest=[]


def _stem_token(token):
    """Return the Porter stem of ``token``, or ``token`` unchanged if stemming fails."""
    try:
        return porter.stem(token)
    except Exception:
        #Best effort: one token the stemmer cannot handle must not disable
        #stemming for the rest of the email (the old bare except did that)
        return token


def _stemmed_tokens(text):
    """Tokenize ``text``, strip punctuation and digits, drop empty tokens, stem the rest."""
    cleaned = (re.sub(r'\d+', '', word.translate(translator)) for word in nltk.word_tokenize(text))
    return [_stem_token(token) for token in cleaned if token]


def _payload_text(message):
    """Return the body of ``message`` as one string.

    ``get_payload()`` returns a list of sub-messages for multipart mail, which
    cannot be tokenized directly, so the string payloads of all parts are joined.
    """
    payload = message.get_payload()
    if isinstance(payload, str):
        return payload
    return '\n'.join(
        part.get_payload() for part in message.walk()
        if isinstance(part.get_payload(), str)
    )


#Run every suspected entry through the same cleaning/stemming pipeline as the
#email bodies. Multi-word entries become tuples of stems; previously they were
#compared against single tokens and could never match.
suspected_phrases = {tuple(_stemmed_tokens(entry)) for entry in suspected_words}
suspected_phrases.discard(())
phrase_lengths = sorted({len(phrase) for phrase in suspected_phrases})


def _mentions_suspected_phrase(stems):
    """Return True if any suspected phrase occurs as consecutive items of ``stems``."""
    return any(
        tuple(stems[start:start + size]) in suspected_phrases
        for size in phrase_lengths
        for start in range(len(stems) - size + 1)
    )


#Iterate over all the emails
for filename in matches:
    #Consider only emails sent, received or deleted (substring test on the full path)
    if not ('sent' in filename or 'delete' in filename or 'inbox' in filename):
        continue

    #errors='replace' keeps one undecodable byte from aborting the whole scan
    with open(filename, 'r', errors='replace') as f:
        data_from_file = f.read()
    message = email.message_from_string(data_from_file)

    #Save the entire message in a list to work on later
    if _mentions_suspected_phrase(_stemmed_tokens(_payload_text(message))):
        message_of_interest.append(message)

In [35]:
suspected_people = {}
#Group the suspected emails per person: the title-cased X-Origin header is the key
#and the value is a list of [From, To, Date, Message-ID, body] entries
for message in message_of_interest:
    body = message.get_payload()
    if not isinstance(body, str):
        #Multipart payloads are lists of messages, which json cannot serialise;
        #keep only the text of the individual parts
        body = '\n'.join(
            part.get_payload() for part in message.walk()
            if isinstance(part.get_payload(), str)
        )
    list_of_details_mailed = [message['From'], message['To'], message['Date'], message['Message-ID'], body]

    #A message without an X-Origin header returns None; file it under 'Unknown'
    #instead of failing on None.title()
    origin = message['X-Origin']
    person = origin.title() if origin is not None else 'Unknown'
    suspected_people.setdefault(person, []).append(list_of_details_mailed)

In [40]:
#Order the (person, emails) pairs by person name, descending.
#The result is a list of pairs rather than a dictionary.
sorted_mails_dict = list(suspected_people.items())
sorted_mails_dict.sort(reverse=True)

In [41]:
#Serialise data to a json file
def write_to_json_file(file_path, json_data):
    """Write ``json_data`` to ``file_path`` as JSON indented by two spaces.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(file_path, mode='w') as handle:
        json.dump(obj=json_data, fp=handle, indent=2)

#Make sure the output folder exists and hand back its relative path
def create_directory_for_output():
    """Return the path of the 'ana_2' output folder, creating it if it is missing.

    '__file__' is passed as a string literal, so dirname() yields '' and the
    folder is resolved relative to the current working directory.
    """
    data_folder = os.path.join(os.path.dirname('__file__'), 'ana_2')
    if os.path.exists(data_folder):
        return data_folder
    os.mkdir(data_folder)
    return data_folder

#Create (or reuse) the output folder
output_folder = create_directory_for_output()

#The json file holding the suspected people and their respective emails
file_name = 'suspected_users_by_email'
file_path = os.path.join(output_folder, file_name) + '.json'

#Persist the sorted (person, emails) pairs
write_to_json_file(file_path, sorted_mails_dict)