# This notebook is a helper for finding related emails

#### Importing modules

In [1]:
import pandas as pd
import requests
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (20,8)
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from datetime import datetime
from textblob import TextBlob
from sklearn.metrics.pairwise import linear_kernel

from parse_utils import parse_raw_message, parse_into_emails


#### Loading And Parsing All emails

In [13]:
all_email_df = pd.read_csv('enron_email/emails.csv')
all_email_df = pd.DataFrame(parse_into_emails(all_email_df.message))

#### Email Searching Class

In [2]:
def read_email_bodies():
    emails = pd.read_csv('enron_email/emails.csv') 
    email_df = pd.DataFrame(parse_into_emails(emails.message))
    email_df.drop(email_df.query("body == '' | to == '' | from_ == ''").index, inplace=True)
    email_df.drop_duplicates('body',inplace=True)
    return email_df['body']


class EmailDataset(): 
    def __init__(self):
        stopwords = ENGLISH_STOP_WORDS.union(['ect', 'hou', 'com', 'recipient'])
        self.vec = TfidfVectorizer(analyzer='word', stop_words=stopwords, max_df=0.3, min_df=2)
        self.emails = read_email_bodies() 

    # train on the given email data.

    def train(self):
        self.vec_train = self.vec.fit_transform(self.emails)

    def query(self, keyword, limit):
        vec_keyword = self.vec.transform([keyword])
        cosine_sim = linear_kernel(vec_keyword, self.vec_train).flatten()
        related_email_indices = cosine_sim.argsort()[:-limit:-1]
        return related_email_indices

    def find_email_by_index(self, i):
        return self.emails.to_numpy()[i]

#### Searching with a keyword sample

In [3]:
ds = EmailDataset()
ds.train()
results = ds.query('project', 100)
# print out the first result.
print(ds.find_email_by_index(results[0]))

attached is my project report .


array([217905, 119846,  92600, 214502, 168507, 187815,  93120, 163060,
        57466, 217903,  93121,  22933, 146025, 116544, 217882, 114742,
       217346, 116539, 163261,  81236,  76897, 114987, 198226,  77673,
        83858, 168544, 129593,  11884, 193068, 192366, 145806, 193554,
        34552, 169226,  26107, 213595, 141853,  26007, 136647, 109953,
        48180, 114985, 104083,  82473, 190351,  83216, 223823, 115624,
       114856,  35581,  20041, 172325,  28408,  20657, 102938,  76883,
       168453,  73426,  84677,  72401, 123962, 136961,  52227, 145717,
       144601,   1402,  26681, 115821,  73975, 101746,  68368, 196126,
       168513, 217907, 168546,  46056, 222524, 156102,  23020,  96037,
        73776,  12446,  45895, 195954, 222332, 168243,   9332, 182814,
       104340, 181584,  77671, 145716,  84239,  76203,  77676, 147314,
       198810,  80722, 223152])

In [5]:
ds.emails.to_numpy()[results[0]]

'following the discussions i have had with louise and jay , this is the proposed solution to the interest rate risk management in newco . 1 . for the first few weeks ( to a few months) we continue to use infinity to update the curves at 2 pm . i will need to have one trader to execute , mark curves and do any deals for foreign exchange (cd$ etc . ) . louise , i hope you have this in your budget and headcount . i will need one junior trader(and maybe two as things grow) to help do this job . i have lined up one of the guys in the old trading desk , if this goes forward . 2 . in the first few weeks , jay webb and myself are hoping to acquire a lighter , and much simpler interest rate package which will allow us to do what we do , with significantly lower costs . we also will re-design the way we aggregate interest rate risk in the commodity books , to make it simpler and easier . to ensure there is no more discussion ever of drift-reallocation , i propose a method where , every desk head

In [14]:
all_email_df[all_email_df['body'] == 'AccomplishmentsCurrent Project List']

Unnamed: 0,body,to,from_,subject,date,xfroms,xtos
