In [2]:
import pandas as pd
from collections import Counter
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim import corpora, models
import gensim
import numpy as np
import re

In [3]:
# Read the complete email dump from the CSV file in the working directory.
allemails = pd.read_csv("emails.csv")
# Keep only the first 100 rows as the working sample for this notebook.
emails = allemails.head(100)

In [4]:
def _first_header(pattern, text):
    """Return the first capture group of `pattern` in `text`, or '' if absent.

    The pattern is applied with re.MULTILINE so a leading `^` anchors it to
    the start of a line.
    """
    match = re.search(pattern, text, flags=re.MULTILINE)
    return match.group(1) if match else ''


def extract_data(emails_df=None):
    """Parse raw email messages into a DataFrame of header fields and body.

    Parameters
    ----------
    emails_df : pandas.DataFrame, optional
        Frame with a 'message' column holding the raw text of each email.
        Defaults to the notebook-level `emails` frame.

    Returns
    -------
    pandas.DataFrame
        One row per input row (same index labels) with the columns
        date, sender_address, recipient, subject, sender_name,
        recipient_name, cc, bcc, folder, body. Header fields that are not
        present in a message are returned as '' instead of raising.
        `folder` holds the list of regex matches (possibly empty).

    Side effects
    ------------
    Prints the shape of the resulting frame.
    """
    if emails_df is None:
        emails_df = emails  # fall back to the frame loaded earlier in the notebook

    columns = ['date',
               'sender_address',
               'recipient',
               'subject',
               'sender_name',
               'recipient_name',
               'cc',
               'bcc',
               'folder',
               'body']

    row_labels = []
    records = []
    for index, details in emails_df.iterrows():
        raw_email_info = details['message']

        # NOTE(review): this pattern is unchanged from the original; it looks
        # like it was written for a raw CSV line (quoted file path followed by
        # "Message-ID") rather than the message text -- confirm intent.
        folder = re.findall(r'"[a-zA-z-]*/(.*)/.*,"Message-ID.*>', raw_email_info)

        # Everything after the first "X-FileName: " marker; dropping the first
        # line of that remainder discards the file name itself.
        after_filename = raw_email_info.partition('X-FileName: ')[2]
        body = " ".join(after_filename.split('\n')[1:])

        records.append({
            # Line-anchored patterns so "From: "/"To: " cannot match the
            # "X-From: "/"X-To: " lines.
            'date': _first_header(r'^Date: (.*)', raw_email_info),
            'sender_address': _first_header(r'^From: (.*)', raw_email_info),
            'recipient': _first_header(r'^To: (.*)', raw_email_info),
            'subject': _first_header(r'^Subject: (.*)', raw_email_info),
            'sender_name': _first_header(r'^X-From: (.*)', raw_email_info),
            # Only the leading run of letters and spaces is kept.
            'recipient_name': _first_header(r'^X-To: ([ A-Za-z]*)', raw_email_info),
            'cc': _first_header(r'^X-cc: (.*)', raw_email_info),
            'bcc': _first_header(r'^X-bcc: (.*)', raw_email_info),
            'folder': folder,
            'body': body,
        })
        row_labels.append(index)

    # Build the frame once rather than growing it one row at a time.
    df = pd.DataFrame(records, index=row_labels, columns=columns)

    print(df.shape)
    return df
                         
# Parse the loaded emails into a structured frame (one row per message);
# extract_data() also prints the resulting shape.
df = extract_data()



(100, 10)


In [5]:
# Shuffle the rows with a fixed seed so that the document order -- and hence
# the positions inspected later (e.g. texts[1], texts[4]) -- is the same on
# every Restart & Run All.
data = shuffle(df, random_state=42)

## Cleaning the text

In [6]:
# Accumulator for the cleaned token list of each email body
texts = []

# Porter stemmer instance used to reduce tokens to their stems
p_stemmer = PorterStemmer()

# English stop-word list from the stop_words package
en_stop = get_stop_words('en')

# Tokenizer that splits a string into runs of word characters
tokenizer = RegexpTokenizer(r'\w+')

In [8]:
def _clean_document(document, stop_words):
    """Tokenize one email body and return its list of stemmed tokens.

    Steps, in order: split into word-character runs, drop purely numeric and
    single-character tokens, lowercase, drop tokens containing 'http', drop
    tokens made only of underscores, drop stop words, then Porter-stem.

    Parameters
    ----------
    document : str
        Raw text of one email body.
    stop_words : set of str
        Lowercase stop words to remove.
    """
    stemmed = []
    for token in tokenizer.tokenize(document):
        # Remove numbers and one-character tokens. The tokenizer only yields
        # word characters, so no token can start with '-'.
        if token.isdigit() or len(token) <= 1:
            continue
        token = token.lower()
        if 'http' in token:
            continue
        # Drop separator artefacts such as "____". The original test
        # `x not in "_"` was a substring check that could never reject a
        # token longer than one character.
        if not token.strip('_'):
            continue
        if token in stop_words:
            continue
        stemmed.append(p_stemmer.stem(token))
    return stemmed


# Set membership is O(1) per token, versus a linear scan of the list.
stop_word_set = set(en_stop)

# Rebuild `texts` from scratch so that re-running this cell does not append
# the same documents a second time.
texts = [_clean_document(body, stop_word_set) for body in data['body']]

# Map each distinct token to an integer id.
dictionaryall = corpora.Dictionary(texts)

# Bag-of-words representation of every document.
corpusall = [dictionaryall.doc2bow(text) for text in texts]

texts[1]

['brad',
 'regard',
 'tori',
 'kuykendal',
 'like',
 'promot',
 'commerci',
 'manag',
 'instead',
 'convert',
 'commerci',
 'support',
 'manag',
 'associ',
 'duti',
 'sinc',
 'begin',
 'year',
 'commerci',
 'manag',
 'doubt',
 'will',
 'compar',
 'favor',
 'other',
 'categori',
 'year',
 'end',
 'martin',
 'cuilla',
 'central',
 'desk',
 'similiar',
 'situat',
 'tori',
 'hunter',
 'like',
 'martin',
 'handl',
 'tori',
 'let',
 'know',
 'issu',
 'phillip']

In [9]:
# Display the cleaned, stemmed token list stored at position 4 of `texts`.
texts[4]

['forward',
 'phillip',
 'allen',
 'hou',
 'ect',
 'pm',
 'enron',
 'admin',
 'fsddatasvc',
 'com',
 'pallen',
 'enron',
 'com',
 'cc',
 'subject',
 'time',
 'sensit',
 'execut',
 'impact',
 'influenc',
 'program',
 'survey',
 'execut',
 'impact',
 'influenc',
 'program',
 'immedi',
 'action',
 'requir',
 'delet',
 'part',
 'execut',
 'impact',
 'influenc',
 'program',
 'particip',
 'ask',
 'gather',
 'input',
 'particip',
 'manag',
 'style',
 'practic',
 'experienc',
 'immedi',
 'manag',
 'direct',
 'report',
 'eight',
 'peer',
 'colleagu',
 'request',
 'provid',
 'feedback',
 'particip',
 'attend',
 'next',
 'program',
 'input',
 'self',
 'assess',
 'manag',
 'assess',
 'direct',
 'report',
 'assess',
 'peer',
 'colleagu',
 'assess',
 'will',
 'combin',
 'input',
 'other',
 'use',
 'program',
 'particip',
 'develop',
 'action',
 'plan',
 'improv',
 'manag',
 'style',
 'practic',
 'import',
 'complet',
 'assess',
 'later',
 'close',
 'busi',
 'thursday',
 'septemb',
 'sinc',
 'feedbac

In [10]:
# Fit a 7-topic LDA model on the bag-of-words corpus (20 passes).
# minimum_probability=0 is passed so that low-probability topics are not
# filtered out when querying the model.
# random_state pins the otherwise random initialisation so the learned topics
# are reproducible across runs.
ldamodelall = gensim.models.ldamodel.LdaModel(corpusall, num_topics=7, id2word=dictionaryall, passes=20,
                                              minimum_probability=0, random_state=42)

In [11]:
# Summarise 5 topics by their 5 highest-weighted words each.
# NOTE(review): the model was trained with 7 topics, so this prints only a
# subset of them -- confirm that is intended.
topic_summaries = ldamodelall.print_topics(num_topics=5, num_words=5)
print(topic_summaries)

[(3, '0.013*"phillip" + 0.013*"can" + 0.012*"will" + 0.012*"ga" + 0.012*"price"'), (5, '0.151*"enron" + 0.042*"ect" + 0.036*"na" + 0.034*"corp" + 0.022*"hou"'), (1, '0.021*"project" + 0.014*"will" + 0.013*"austin" + 0.012*"phillip" + 0.009*"properti"'), (4, '0.026*"ect" + 0.016*"com" + 0.016*"phillip" + 0.013*"hou" + 0.012*"enron"'), (6, '0.022*"phillip" + 0.016*"hotmail" + 0.015*"com" + 0.014*"can" + 0.014*"luci"')]
