In [1]:
import pandas as pd
import requests
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (20,8)
import numpy as np
from datetime import datetime
from textblob import TextBlob

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import linear_kernel


import pandas as pd



In [49]:
all_emails =  pd.read_csv('enron_email/emails.csv')

## Helper Module

In [8]:
def parse_into_emails(messages):
    emails = [parse_raw_message(message) for message in messages]
    bodies = []
    tos = []
    froms = []
    subjects = []
    dates = []
    xfroms = []
    xtos = []
    for email in emails:
        froms.append(email['from'])
        tos.append(email.get('to', None))
        bodies.append(email['body'])
        subjects.append(email.get('subject', None))
        dates.append(email.get('date', None))
        xfroms.append(email.get('x-from', None))
        xtos.append(email.get('x-to', None))
    return {
        'body': bodies, 
        'to': tos, 
        'from_': froms,
        'subject': subjects,
        'date': dates,
        'xfroms': xfroms,
        'xtos': xtos
    }

def parse_raw_message(raw_message):
    lines = raw_message.split('\n')
    email = {}
    message = ''
    keys_to_extract = ['from', 'to', 'subject', 'date', 'x-from', 'x-to']
    for line in lines:
        if ':' not in line:
            message += line.strip()
            email['body'] = message
        else:
            pairs = line.split(':')
            key = pairs[0].lower()
            val = pairs[1].strip()
            val = val.replace('.', '. ')
            if key in keys_to_extract:
                email[key] = val
    return email

In [52]:
all_email_df = pd.DataFrame(parse_into_emails(all_emails.message))

#### Email Searching Class

In [10]:
def read_email_bodies():
    emails = pd.read_csv('enron_email/emails.csv')
    email_df = pd.DataFrame(parse_into_emails(emails.message))
    email_df.drop(email_df.query("body == '' | to == '' | from_ == ''").index, inplace=True)
    email_df.drop_duplicates(inplace=True)
    return email_df['body']


class EmailDataset(): 
    def __init__(self):
        stopwords = ENGLISH_STOP_WORDS.union(['ect', 'hou', 'com', 'recipient'])
        self.vec = TfidfVectorizer(analyzer='word', stop_words=stopwords, max_df=0.3, min_df=2)
        self.emails = read_email_bodies() 

    # train on the given email data.

    def train(self):
        self.vec_train = self.vec.fit_transform(self.emails)

    def query(self, keyword, limit):
        vec_keyword = self.vec.transform([keyword])
        cosine_sim = linear_kernel(vec_keyword, self.vec_train).flatten()
        related_email_indices = cosine_sim.argsort()[:-limit:-1]
        return related_email_indices

    def find_email_by_index(self, i):
        return self.emails.to_numpy()[i]

In [12]:
ds = EmailDataset()
ds.train()
results = ds.query('project', 100)
# print out the first result.
print(ds.find_email_by_index(results[0]))

AccomplishmentsCurrent Project List


In [26]:
results

array([115537, 232339,  97460, 126459, 228801, 199663, 232337, 179097,
        88453, 154064, 232314, 231748,  24438,  85575, 173571,  60833,
       121123, 210469,  55318, 179134, 205039, 204302, 136878, 153831,
        77981, 205582,  80318, 173360,  81856,  27748,  27882, 118019,
        27883,  27962,  27972,  27639,  27640,  51050, 237645, 237652,
       137158,  36343, 227886,  12347, 121121, 144578, 121787,  37380,
       120990, 222740, 108094, 192235,  21390, 152600, 121987, 149515,
       183118, 130727, 202223, 238882, 208255,   1470,  87794, 153741,
        28372,  81047,  48628, 232341, 144264,  12931, 225417, 101055,
       179824, 165904, 237249, 225519,  88849,  60852, 153740, 109645,
        93702, 238193, 237755, 189763,  89373, 208238,  87668,  81854,
       109327,  77417,  76367, 165560,   4036,  81859, 220940, 179041,
       152504, 238161, 220937])

In [33]:
ds.emails[['115537']].tolist()

KeyError: "None of [Index(['115537'], dtype='object')] are in the [index]"

In [50]:
ds.emails.to_numpy()[115537]

'AccomplishmentsCurrent Project List'

In [51]:
ds.email_df[ds.email_df['body'] == 'AccomplishmentsCurrent Project List']

AttributeError: 'EmailDataset' object has no attribute 'email_df'

In [37]:
ds.emails[ds.emails[ds.emails.to_numpy()[202223]

'(if .....) (iii)   definitive agreements acceptable to Party A in its solediscretion regarding the debt and all other aspects of [the project to beowned by Party B], including but not limited to the senior debt facility,commitments and all project documents, have been fully negotiated andexecuted and the [project] has been brought to financial close.'

In [None]:
all_email_df.drop_duplicates()

In [53]:
all_email_df[all_email_df['body'] == 'AccomplishmentsCurrent Project List']

Unnamed: 0,body,to,from_,subject,date,xfroms,xtos
238472,AccomplishmentsCurrent Project List,"richard. shapiro@enron. com, d. . steffes@enro...",harry. kingerski@enron. com,Accomplishments/Activities,"Mon, 10 Sep 2001 11","Kingerski, Harry </O=ENRON/OU=NA/CN=RECIPIENTS...","Shapiro, Richard </O=ENRON/OU=NA/CN=RECIPIENTS..."
440208,AccomplishmentsCurrent Project List,"richard. shapiro@enron. com, d. . steffes@enro...",harry. kingerski@enron. com,Accomplishments/Activities,"Mon, 10 Sep 2001 11","Kingerski, Harry </O=ENRON/OU=NA/CN=RECIPIENTS...","Shapiro, Richard </O=ENRON/OU=NA/CN=RECIPIENTS..."
453837,AccomplishmentsCurrent Project List,"richard. shapiro@enron. com, d. . steffes@enro...",harry. kingerski@enron. com,Accomplishments/Activities,"Mon, 10 Sep 2001 11","Kingerski, Harry </O=ENRON/OU=NA/CN=RECIPIENTS...","Shapiro, Richard </O=ENRON/OU=NA/CN=RECIPIENTS..."
