In [3]:
import pandas as pd
import numpy as np
import nltk
import re
import os
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import multiprocessing



In [4]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

# Preparing the corpus

### Stopwords, stemming and tokenizing
#### 1. load list of English stop words

In [32]:
stopwords = nltk.corpus.stopwords.words('english')
english_vocab = set(w.lower() for w in nltk.corpus.words.words())

In [34]:
# extend set of stopwords modals with modal verbs
stopwords = stopwords + ['could', 'may', 'might', 'must', 'ought to', 'shall', 'would']
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your']

In [7]:
emails_file = os.path.join('hillary-clinton-emails', 'Emails.csv')
emails_df = pd.read_csv(emails_file)
emails_df.head(2)

Unnamed: 0,Id,DocNumber,MetadataSubject,MetadataTo,MetadataFrom,SenderPersonId,MetadataDateSent,MetadataDateReleased,MetadataPdfLink,MetadataCaseNumber,...,ExtractedTo,ExtractedFrom,ExtractedCc,ExtractedDateSent,ExtractedCaseNumber,ExtractedDocNumber,ExtractedDateReleased,ExtractedReleaseInPartOrFull,ExtractedBodyText,RawText
0,1,C05739545,WOW,H,"Sullivan, Jacob J",87.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739545...,F-2015-04841,...,,"Sullivan, Jacob J <Sullivan11@state.gov>",,"Wednesday, September 12, 2012 10:16 AM",F-2015-04841,C05739545,05/13/2015,RELEASE IN FULL,,UNCLASSIFIED\r\nU.S. Department of State\r\nCa...
1,2,C05739546,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,H,,,2011-03-03T05:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH1/DOC_0C05739546...,F-2015-04841,...,,,,,F-2015-04841,C05739546,05/13/2015,RELEASE IN PART,"B6\r\nThursday, March 3, 2011 9:45 PM\r\nH: La...",UNCLASSIFIED\r\nU.S. Department of State\r\nCa...


In [9]:
topic_modeling_df = emails_df.copy()
topic_modeling_df.drop(['Id'
                        , 'DocNumber'
                        , 'MetadataTo'
                        , 'MetadataFrom'
                        , 'SenderPersonId'
                        , 'MetadataDateSent'
                        , 'MetadataDateReleased'
                        , 'MetadataPdfLink'
                        , 'MetadataCaseNumber'
                        , 'ExtractedTo'
                        , 'ExtractedFrom'
                        , 'ExtractedCc'
                        , 'ExtractedDateSent'
                        , 'ExtractedCaseNumber'
                        , 'ExtractedDocNumber'
                        , 'ExtractedDateReleased'
                        , 'ExtractedReleaseInPartOrFull'], axis=1, inplace=True)
topic_modeling_df.head(2)

Unnamed: 0,MetadataSubject,MetadataDocumentClass,ExtractedSubject,ExtractedBodyText,RawText
0,WOW,HRC_Email_296,FW: Wow,,UNCLASSIFIED\r\nU.S. Department of State\r\nCa...
1,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,HRC_Email_296,,"B6\r\nThursday, March 3, 2011 9:45 PM\r\nH: La...",UNCLASSIFIED\r\nU.S. Department of State\r\nCa...


In [10]:
topic_modeling_df = topic_modeling_df.dropna(subset=['ExtractedBodyText'], how='all')

In [11]:
topic_modeling_df.head(2)

Unnamed: 0,MetadataSubject,MetadataDocumentClass,ExtractedSubject,ExtractedBodyText,RawText
1,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,HRC_Email_296,,"B6\r\nThursday, March 3, 2011 9:45 PM\r\nH: La...",UNCLASSIFIED\r\nU.S. Department of State\r\nCa...
2,CHRIS STEVENS,HRC_Email_296,Re: Chris Stevens,Thx,UNCLASSIFIED\r\nU.S. Department of State\r\nCa...


In [12]:
def re_longer_than(N):
    return re.compile('^[a-z]{' + '{0},'.format(N) + '}')

In [13]:
re_word_filter = re_longer_than(4)

def preprocess_msg(msg):
    sentences = nltk.sent_tokenize(msg)
    del sentences[:6]
    del sentences[-7:]
    
    tokens = []
    for s in sentences:
        curr_tokens = nltk.word_tokenize(s)
        curr_tokens = [word for word in curr_tokens if word in english_vocab]
        curr_tokens = [word for word in curr_tokens if word not in stopwords]
        curr_tokens = list(filter(lambda x: re_word_filter.match(x) ,curr_tokens))
        tokens = tokens + curr_tokens

    return tokens    

### Stemming

Import the Porter Stemmer which is actually part of NLTK. We should do **stemming** - breaking a word down into its root.

In [38]:
p_stemmer = PorterStemmer()
curr_stem_map = {}

def cleanup_stem(): curr_stem_map.clear()

def bind_stem(token):
    t_stem = p_stemmer.stem(token)
    s = curr_stem_map.get(t_stem, set())
    if not s:
        curr_stem_map[t_stem] = s
        curr_stem_map[t_stem].add(token)
    return t_stem

In [15]:
def steamming(tokens_per_msg):
    return [[bind_stem(t) for t in tokens] for tokens in tokens_per_msg]

### Constructing a document-term matrix

In [61]:
# to generate an LDA model, we need to understand how frequently each term occurs within each document;
# to do that, we need to construct a document-term matrix

def calculate_topics(message_group):
    tokens_per_msg = [preprocess_msg(msg) for msg in message_group]
    tokens_per_msg = steamming(tokens_per_msg)
    
    # the Dictionary() function traverses texts, assigning a unique integer id to each unique token 
    # while also collecting word counts and relevant statistics. 
    dictionary = corpora.Dictionary(tokens_per_msg)
    
    if not bool(dictionary):
        return None
    
    # The doc2bow() function converts dictionary into a bag-of-words.
    # The result, corpus, is a list of vectors equal to the number of documents;
    # in each document vector is a series of tuples
    # The tuples are (term ID, term frequency) pairs. doc2bow() only includes terms 
    # that actually occur: terms that do not occur in a document will not 
    # appear in that document’s vector.
    corpus = [dictionary.doc2bow(tokens) for tokens in tokens_per_msg]
    N = 5 if len(message_group) == 1 else 25

    # Applying the LDA model
    ldamodel = models.ldamodel.LdaModel(corpus, num_topics=25, id2word = dictionary, passes=N)

    # topics_matrix
    return ldamodel.show_topics(formatted=False, num_words=5)    

In [17]:
tmdf_topic_by_group_df = topic_modeling_df.copy()
tmdf_topic_by_group_df['MsgTopics'] = ""
tmdf_topic_by_group_df.head(1)

Unnamed: 0,MetadataSubject,MetadataDocumentClass,ExtractedSubject,ExtractedBodyText,RawText,MsgTopics
1,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,HRC_Email_296,,"B6\r\nThursday, March 3, 2011 9:45 PM\r\nH: La...",UNCLASSIFIED\r\nU.S. Department of State\r\nCa...,


In [18]:
grouped_msgs_by_subject = tmdf_topic_by_group_df.groupby(['MetadataSubject'])
gb = grouped_msgs_by_subject.groups

## Run calculation for every group

In [40]:
def destem_topics(topics_matrix):
    if not topics_matrix:
        return ''
    
    # we take first corresponding word from pre-mapped stem-words
    return ' '.join([min(curr_stem_map[w]) for w, p in topics_matrix[0][1]])

In [43]:
from tqdm import tqdm

In [52]:
for key, indices in tqdm(gb.items(), ncols=len(gb)):
    group = grouped_msgs_by_subject.get_group(key)
    msg_dict = group['RawText'].to_dict()
    topics_matrix = calculate_topics(list(msg_dict.values()))
    
    topics_str = destem_topics(topics_matrix)
    cleanup_stem()
    
    for i in indices:
        tmdf_topic_by_group_df.ix[i, 'MsgTopics'] = topics_str

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [53]:
tmdf_topic_by_group_df.head(3)

Unnamed: 0,MetadataSubject,MetadataDocumentClass,ExtractedSubject,ExtractedBodyText,RawText,MsgTopics
1,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,HRC_Email_296,,"B6\r\nThursday, March 3, 2011 9:45 PM\r\nH: La...",UNCLASSIFIED\r\nU.S. Department of State\r\nCa...,military zone support stated convinced
2,CHRIS STEVENS,HRC_Email_296,Re: Chris Stevens,Thx,UNCLASSIFIED\r\nU.S. Department of State\r\nCa...,sorry surely morning front former
4,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,HRC_Email_296,,"H <hrod17@clintonemail.com>\r\nFriday, March 1...",B6\r\nUNCLASSIFIED\r\nU.S. Department of State...,military zone support stated convinced


In [62]:
test_list = topic_modeling_df['RawText'].tolist()

In [64]:
topics_matrix = calculate_topics(test_list)

In [65]:
topics_str = destem_topics(topics_matrix)
cleanup_stem()
    
print(topics_str)

diplomacy military powerful public work


### Examining the results

In [455]:
print(ldamodel.print_topics(num_topics=5, num_words=3))
ldamodel.show_topics()

[(29, '0.138*"per" + 0.110*"cent" + 0.022*"public"'), (26, '0.034*"election" + 0.033*"party" + 0.028*"would"'), (17, '0.016*"said" + 0.014*"reconstruction" + 0.012*"local"'), (21, '0.014*"military" + 0.013*"would" + 0.011*"former"'), (23, '0.036*"update" + 0.025*"family" + 0.020*"time"')]


[(45,
  '0.025*"people" + 0.024*"get" + 0.020*"plan" + 0.019*"could" + 0.018*"kids" + 0.018*"school" + 0.015*"education" + 0.014*"work" + 0.012*"though" + 0.011*"help"'),
 (40,
  '0.044*"percent" + 0.022*"said" + 0.020*"enough" + 0.017*"would" + 0.015*"need" + 0.012*"margin" + 0.012*"climate" + 0.011*"opinion" + 0.010*"help" + 0.010*"change"'),
 (4,
  '0.026*"shall" + 0.022*"train" + 0.020*"nomination" + 0.016*"would" + 0.016*"turnout" + 0.016*"coup" + 0.014*"hold" + 0.014*"lift" + 0.014*"candidate" + 0.014*"recommend"'),
 (19,
  '0.027*"staff" + 0.021*"president" + 0.017*"adviser" + 0.017*"former" + 0.016*"chief" + 0.015*"secretary" + 0.015*"earthquake" + 0.014*"year" + 0.013*"team" + 0.011*"campaign"'),
 (37,
  '0.017*"new" + 0.016*"policy" + 0.013*"long" + 0.012*"video" + 0.012*"growth" + 0.012*"bank" + 0.011*"like" + 0.011*"society" + 0.009*"foreign" + 0.009*"ever"'),
 (33,
  '0.043*"aid" + 0.033*"money" + 0.026*"government" + 0.024*"tax" + 0.022*"speech" + 0.019*"spending" + 0.019

In [453]:
topics_matrix = ldamodel.show_topics(formatted=False, num_words=20)
for i, words in topics_matrix:
    print([w for w, p in words], '\n')

['movement', 'group', 'right', 'company', 'conservative', 'police', 'cable', 'white', 'front', 'oil', 'race', 'one', 'onto', 'death', 'hill', 'liberal', 'media', 'left', 'black', 'campaign'] 

['percent', 'would', 'religious', 'world', 'treaty', 'nation', 'new', 'faith', 'government', 'long', 'economic', 'like', 'growth', 'one', 'many', 'religion', 'debt', 'toward', 'face', 'political'] 

['deal', 'said', 'party', 'agreement', 'would', 'assembly', 'statement', 'last', 'new', 'peace', 'two', 'justice', 'vote', 'leader', 'see', 'support', 'press', 'community', 'government', 'minister'] 

['call', 'tomorrow', 'time', 'know', 'morning', 'today', 'back', 'update', 'office', 'like', 'route', 'speak', 'work', 'confirmed', 'schedule', 'talk', 'let', 'memo', 'available', 'need'] 

['administration', 'policy', 'nuclear', 'foreign', 'president', 'world', 'new', 'strategy', 'public', 'diplomacy', 'war', 'time', 'peace', 'would', 'state', 'official', 'even', 'one', 'settlement', 'senior'] 

['say',