In [3]:
import pymongo
import time
import random
import string
import re
import os
import gensim


In [19]:
# Module-level MongoDB handles shared by the helper functions below.
MongoClient = pymongo.MongoClient
conn = MongoClient('mongo-enron', 27017)
# Select the enron database and the two collections this script works with.
db = conn.enron
messages = db['messages']
messageswords = db['messageswords']


def generate_messagewords():
    """Aggregate the spaCy tokens into the `messageswords` summary collection.

    Runs an aggregation on `messages` that keeps only documents having a
    `spacy.entities` field, projects a reduced set of fields and writes the
    result out to the `messageswords` collection.

    Returns:
        Always True.
    """
    # Only messages that have a spacy.entities field are summarised.
    match_stage = {"$match": {"spacy.entities": {"$exists": True}}}
    # Keep the original id, message id, sender and the spaCy tokens.
    project_stage = {
        "$project": {
            "idOrig": "$_id",
            "message-id": "$message-id",
            "from": "$from",
            "spacy": "$spacy.tokens",
        }
    }
    # Save the result in a new collection.
    out_stage = {"$out": "messageswords"}

    messages.aggregate([match_stage, project_stage, out_stage], allowDiskUse=True)
    return True

def get_words():
    """Return the `spacy` field of every document in `messageswords`.

    Returns:
        A list with one entry per document, in cursor order.
    """
    token_lists = []
    for document in messageswords.find({}):
        token_lists.append(document['spacy'])
    return token_lists

def get_author_docs():
    """Group the `messageswords` documents by sender.

    Returns:
        A dict mapping each distinct `from` value to the list of `rownum`
        values pushed for that sender by the aggregation.
    """
    # NOTE(review): `rownum` is not among the fields projected by
    # generate_messagewords in this file -- confirm the collection carries it.
    group_stage = {
        "$group": {"_id": "$from", "documents": {"$push": "$rownum"}}
    }
    grouped = messageswords.aggregate([group_stage], allowDiskUse=True)
    return {entry['_id']: entry['documents'] for entry in grouped}


def get_dictionary(records):
    """Build a gensim Dictionary from *records* and prune extreme tokens.

    See http://radimrehurek.com/gensim/corpora/dictionary.html

    Args:
        records: iterable of token lists, one per document.

    Returns:
        The filtered `gensim.corpora.Dictionary`.
    """
    vocabulary = gensim.corpora.Dictionary(records)
    vocabulary.filter_extremes(
        no_below=10,   # keep tokens contained in at least 10 documents
        no_above=0.5,  # keep tokens contained in no more than 50% of documents
    )
    return vocabulary

def get_corpus(dictionary, records):
    """Convert every record to its bag-of-words form via *dictionary*.

    Args:
        dictionary: object exposing `doc2bow(record)`.
        records: iterable of documents to convert.

    Returns:
        A list holding `dictionary.doc2bow(record)` for each record, in order.
    """
    bags = []
    for record in records:
        bags.append(dictionary.doc2bow(record))
    return bags

def get_lda_model(dictionary, corpus, num_topics, passes):
    """Train an LDA model on *corpus* and print its topics and coherence.

    Args:
        dictionary: id-to-word mapping passed to gensim as `id2word`.
        corpus: bag-of-words corpus to train on.
        num_topics: number of topics to fit.
        passes: number of training passes over the corpus.

    Returns:
        The trained `LdaMulticore` model.
    """
    start_time = time.time()
    # random_state is fixed so repeated runs are comparable.
    model = gensim.models.ldamulticore.LdaMulticore(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=1,
    )
    #get the Umass topic coherence
    #http://radimrehurek.com/gensim/models/ldamodel.html#gensim.models.ldamodel.LdaModel.top_topics
    top_topics = model.top_topics(corpus)
    # The second element of each top_topics entry is summed as the score.
    topic_score = sum(topic[1] for topic in top_topics)
    print("--model done %s seconds. topic score %d ---" % (time.time() - start_time, topic_score))
    print_topics(model, num_topics)
    print("topic coherence score: %d" % topic_score)
    # Sample output recorded from earlier runs:
    #
    # topic coherence score: -3544
    # Words: market provide time base cost year include risk company business
    # Words: gas lng oil natural shackleton contact canada sara crude LNG
    # Words: attach review information receive document request report change question date
    # Words: click email receive free send mail information service new web
    # Words: charlie checkout game football player team play league reports report
    # Words: thank know deal let power gas question price change day
    # Words: state energy california say power news report enpower industry utility
    # Words: thank know enron let work time meeting like group week
    # Words: fax enron phone houston thank north america smith street master
    # Words: good day time like get great come know think look
    #
    # topic coherence score: -3479
    # Topic 1 Words: market follow power time energy issue day enron report new
    # Topic 2 Words: thank meeting time new think good week information enron deal
    # Topic 3 Words: thank attach know let question agreement enron deal gas send
    # Topic 4 Words: time meeting think good day week want follow mail change
    # Topic 5 Words: gas click email information available market receive report view day
    # Topic 6 Words: thank know enron let time fax phone work good houston
    # Topic 7 Words: start date hour schedule hourahead award variance detect ancillary good
    # Topic 8 Words: thank enron follow question year attach information new help send
    # Topic 9 Words: new year company energy enron state plan buy york stock
    # Topic 10 Words: click send receive mail email free message information time new
    return model


def print_topics(model,num_topics):
    """Print one numbered line per topic listing that topic's words.

    Args:
        model: object exposing `show_topics(num_topics)` and `show_topic(id)`.
        num_topics: how many topics to request from `show_topics`.
    """
    for position, topic in enumerate(model.show_topics(num_topics), 1):
        # Each word is followed by a single space, including the last one.
        words = ''.join(word + ' ' for word, _ in model.show_topic(topic[0]))
        print("Topic %s Words: %s" % (position, words))
        
        
def main():
    """Build or load the LDA dictionary and model, then print the topics.

    When either the dictionary file or the model file is missing, the
    summary collection is regenerated, the dictionary, corpus and model are
    rebuilt from it, and the dictionary and model are saved. Otherwise both
    are loaded from disk.
    """
    start_time = time.time()
    dictionary_file = '../gensim/lda/dictionary'
    model_file = '../gensim/lda/model'
    num_topics=10
    passes=1

    if not os.path.isfile(dictionary_file) or not os.path.isfile(model_file):
        generate_messagewords()
        records = get_words()
        author2doc = get_author_docs()
        # Was `print author2docs`: an undefined name in a print statement.
        print(author2doc)
        print("---query done %s seconds ---" % (time.time() - start_time))
        dictionary = get_dictionary(records)
        dictionary.save(dictionary_file)
        print("--dictionary saved %s seconds ---" % (time.time() - start_time))
        corpus = get_corpus(dictionary, records)
        print("--corpus done %s seconds ---" % (time.time() - start_time))
        model = get_lda_model(dictionary, corpus, num_topics, passes)
        model.save(model_file)
    else:
        dictionary = gensim.corpora.Dictionary.load(dictionary_file)
        model = gensim.models.ldamodel.LdaModel.load(model_file)

    print_topics(model,num_topics)

    print("---done %s seconds ---" % (time.time() - start_time))
    
# Run the whole pipeline when this cell/script is executed.
main()


{u'maria.beccaccini@enron.com': [73607L, 115894L, 132672L], u'steve_williams@eogresources.com': [47371L], u'about-freestuffextra@aboutdirect.com': [143726L], u'maokotani@aol.com': [152288L], u'winston.jia@enron.com': [45083L, 117663L, 127949L], u'marc_graubart@enron.net': [130999L, 147695L], u'andrew.ralston@enron.com': [23386L, 91641L], u'kgh1021@yahoo.com': [77390L], u'newsforyou@mymeetings.com': [11586L, 23845L, 101589L], u'majordomo@stat.math.ethz.ch': [74550L, 94478L], u'insiderdi@resdata.com': [139070L], u'vnguyen@greenash.com': [29919L], u'caroll2506@aol.com': [31423L], u'jhavila@firstunion1.com': [15806L, 19702L, 35356L, 60310L, 137971L], u'warren@epis.com': [79843L], u'leblanc@enron.com': [92836L], u'julian.draven@turner.com': [147608L], u'michael.herman@enron.com': [16139L], u'scott.hendrickson@enron.com': [2487L, 6792L, 22137L, 30312L, 35765L, 41270L, 45567L, 47605L, 59876L, 69760L, 72084L, 84564L, 85869L, 97310L, 100721L, 102258L, 106871L, 109560L, 111709L, 114219L, 126323L

[{u'_id': u'bmwna@bmwna.rgc2.net', u'documents': [153709L]}, {u'_id': u'kudym@enron.com', u'documents': [153693L]}, {u'_id': u'nikole.vander@enron.com', u'documents': [153555L]}]
{u'nikole.vander@enron.com': [153555L], u'bmwna@bmwna.rgc2.net': [153709L], u'kudym@enron.com': [153693L]}
