In [65]:
import pandas as pd
import numpy as np
import nltk
import re
import os
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
from tqdm import tqdmtopics_data = lda.print_topics(num_topics=5, num_words=10)

In [227]:
# packages and collections to download
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

## Get data from csv and preliminary clean-up

In [68]:
emails_file = os.path.join('hillary-clinton-emails', 'Emails.csv')
emails_df = pd.read_csv(emails_file)
emails_df.head(2)

Unnamed: 0,Id,DocNumber,MetadataSubject,MetadataTo,MetadataFrom,SenderPersonId,MetadataDateSent,MetadataDateReleased,MetadataPdfLink,MetadataCaseNumber,...,ExtractedTo,ExtractedFrom,ExtractedCc,ExtractedDateSent,ExtractedCaseNumber,ExtractedDocNumber,ExtractedDateReleased,ExtractedReleaseInPartOrFull,ExtractedBodyText,RawText
0,1,C05739545,WOW,H,"Sullivan, Jacob J",87.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739545...,F-2015-04841,...,,"Sullivan, Jacob J <Sullivan11@state.gov>",,"Wednesday, September 12, 2012 10:16 AM",F-2015-04841,C05739545,05/13/2015,RELEASE IN FULL,,UNCLASSIFIED\r\nU.S. Department of State\r\nCa...
1,2,C05739546,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,H,,,2011-03-03T05:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH1/DOC_0C05739546...,F-2015-04841,...,,,,,F-2015-04841,C05739546,05/13/2015,RELEASE IN PART,"B6\r\nThursday, March 3, 2011 9:45 PM\r\nH: La...",UNCLASSIFIED\r\nU.S. Department of State\r\nCa...


In [239]:
# remove all redundant information from data frameprint_topics(lda.show_topics(formatted=False, num_words=20))
topic_modeling_df = emails_df.copy()
topic_modeling_df.drop(['Id'
                        , 'DocNumber'
                        , 'MetadataTo'
                        , 'MetadataFrom'
                        , 'SenderPersonId'
                        , 'MetadataDateSent'
                        , 'MetadataDateReleased'
                        , 'MetadataPdfLink'
                        , 'MetadataCaseNumber'
                        , 'ExtractedTo'
                        , 'ExtractedFrom'
                        , 'ExtractedCc'
                        , 'ExtractedDateSent'
                        , 'ExtractedCaseNumber'
                        , 'ExtractedDocNumber'
                        , 'ExtractedDateReleased'
                        , 'ExtractedReleaseInPartOrFull'], axis=1, inplace=True)
topic_modeling_df.head(2)

Unnamed: 0,MetadataSubject,MetadataDocumentClass,ExtractedSubject,ExtractedBodyText,RawText
0,WOW,HRC_Email_296,FW: Wow,,UNCLASSIFIED\r\nU.S. Department of State\r\nCa...
1,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,HRC_Email_296,,"B6\r\nThursday, March 3, 2011 9:45 PM\r\nH: La...",UNCLASSIFIED\r\nU.S. Department of State\r\nCa...


# Preparing the corpus

### Stopwords, msg preprocessing, stemming and tokenizing
#### Load list of English stop words

In [66]:
stopwords = nltk.corpus.stopwords.words('english')
english_vocab = set(w.lower() for w in nltk.corpus.words.words())

In [67]:
# extend set of stopwords modals with modal verbs
stopwords = stopwords + ['could', 'may', 'might', 'must', 'ought to', 'shall', 'would']
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your']

In [70]:
def re_longer_than(N):
    return re.compile('^[a-z]{' + '{0},'.format(N) + '}')

###### Every message is preprocessed accordng to belonging to english vocabulary and not belonging to stopwords;
Also we drop all words with length less than 4 (empirically detected). Of course there are some number of meaningful
words, however the majority is considered as a noise.
In addition first 6 and last 7 sentences are removed (also empirically detected), because they contain only meta-information

In [71]:
re_word_filter = re_longer_than(4)

def preprocess_msg(msg):
    sentences = nltk.sent_tokenize(msg)
    del sentences[:6]
    del sentences[-7:]
    
    tokens = []
    for s in sentences:
        curr_tokens = nltk.word_tokenize(s)
        curr_tokens = [word for word in curr_tokens if word in english_vocab]
        curr_tokens = [word for word in curr_tokens if word not in stopwords]
        curr_tokens = list(filter(lambda x: re_word_filter.match(x) ,curr_tokens))
        tokens = tokens + curr_tokens

    return tokens    

#### Stemming

Import the Porter Stemmer which is actually part of NLTK. We should do **stemming** - breaking a word down into its root, in order to eliminate an influence of identical words.

In [72]:
p_stemmer = PorterStemmer()

# we use stem_map to undo stem and get meaningful word from stem_token. (rough operation)
curr_stem_map = {}

# as we recalculate stem_map for every group of msgs, we should clean it up
def cleanup_stem(): curr_stem_map.clear()

# executes stemming and mapping for a token
def bind_stem(token):
    t_stem = p_stemmer.stem(token)
    s = curr_stem_map.get(t_stem, set())
    if not s:
        curr_stem_map[t_stem] = s
        curr_stem_map[t_stem].add(token)
    return t_stem

In [240]:
# stems a list of tokens and returns two-dim list of tokens (list per msg)
def steamming(tokens_per_msg):
    return [[bind_stem(t) for t in tokens] for tokens in tokens_per_msg]

#### Generate LDA model

In [241]:
# function performs:
#                    1) preprocessing for every message within a group in order to obtain appropriate tokens
#                    2) stemming of tokens
#                    3) the Dictionary() function which traverses texts, assigning a unique integer id to each unique 
#                       token while also collecting word counts and relevant statistics.
#                    4) the doc2bow() function which converts dictionary into a bag-of-words. The result, corpus, is a 
#                       list of vectors equal to the number of documents; doc2bow() only includes terms 
#                       that actually occur: terms that do not occur in a document will not appear in that document’s 
#                       vector.
#                    5) applying the LDA model

# returns calculated lda_model or None in case we have an empty dictionary (possible case)

def calculate_topics(message_group, n_topics):
    tokens_per_msg = [preprocess_msg(msg) for msg in message_group]
    tokens_per_msg = steamming(tokens_per_msg)
    dictionary = corpora.Dictionary(tokens_per_msg)
    
    if not bool(dictionary):
        return None
    
    corpus = [dictionary.doc2bow(tokens) for tokens in tokens_per_msg]
    
    # tiny optimization, we reguse a number of passes in case group consists of only message
    N = 5 if len(message_group) == 1 else 25

    return models.ldamodel.LdaModel(corpus, num_topics = n_topics, id2word = dictionary, passes=N)

#### Create a copy of dataframe and prepare column 'MsgTopics' for a topics set

In [75]:
tmdf_topic_by_group_df = topic_modeling_df.copy()
tmdf_topic_by_group_df['MsgTopics'] = ""
tmdf_topic_by_group_df.head(1)

Unnamed: 0,MetadataSubject,MetadataDocumentClass,ExtractedSubject,ExtractedBodyText,RawText,MsgTopics
0,WOW,HRC_Email_296,FW: Wow,,UNCLASSIFIED\r\nU.S. Department of State\r\nCa...,


#### Group messages by subject
We want to calculate topics modulation for a chain of messages with the same subject, so as we grouped by 'MetadataSubject'

In [76]:
grouped_msgs_by_subject = tmdf_topic_by_group_df.groupby(['MetadataSubject'])
gb = grouped_msgs_by_subject.groups

In [243]:
# auxiliary function that remaps stemmed tokens back to words;
# for simplification we take first corresponding word from pre-mapped stem-words

def destem_topics(topics_matrix):
    res = '' if not topics_matrix else ' '.join([min(curr_stem_map[w]) for w, p in topics_matrix[0][1]])
    cleanup_stem()
    return res

## Run calculation for every group and fill the DF

In [92]:
for key, indices in tqdm(gb.items(), ncols=len(gb)):
    group = grouped_msgs_by_subject.get_group(key)
    msg_dict = group['RawText'].to_dict()
    lda = calculate_topics(list(msg_dict.values()), n_topics=5)
    
    # get topics from lda model and perform destemming
    topics_str = '' if not lda else destem_topics(lda.show_topics(formatted=False, num_words=5))
    
    # fill 'MsgTopics' column with appropriate topics string
    for i in indices:
        tmdf_topic_by_group_df.ix[i, 'MsgTopics'] = topics_str

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [93]:
tmdf_topic_by_group_df.head(3)

Unnamed: 0,MetadataSubject,MetadataDocumentClass,ExtractedSubject,ExtractedBodyText,RawText,MsgTopics
0,WOW,HRC_Email_296,FW: Wow,,UNCLASSIFIED\r\nU.S. Department of State\r\nCa...,government also still resent million
1,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,HRC_Email_296,,"B6\r\nThursday, March 3, 2011 9:45 PM\r\nH: La...",UNCLASSIFIED\r\nU.S. Department of State\r\nCa...,military convinced zone support former
2,CHRIS STEVENS,HRC_Email_296,Re: Chris Stevens,Thx,UNCLASSIFIED\r\nU.S. Department of State\r\nCa...,corn front sorry devotedly former


# Topic modeling over the entire corpus

We decided to calculate topics for all messages in dataframe. So get 'RawText' message data as a list and apply previously described pipeline: preprocessing -> stemming -> dictionary -> corpus -> lda

In [170]:
test_list = topic_modeling_df['RawText'].tolist()

In [175]:
tokens = [preprocess_msg(msg) for msg in test_list]
tokens = steamming(tokens)
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(t) for t in tokens]

In [244]:
# auxiliary function; processes topics data obtained from lda model

def unstem(topics_data):
    topics_data_unstem = []
    for topics_item in topics_data:
        l = topics_item[1].split(" + ")
        curr_data = []
        for i in l:
            unstem = min(curr_stem_map[re.search('"([^"]*)"', i).group(1)])
            curr_data.append(re.sub('"([^"]*)"', '\"{0}\"'.format(unstem), i))
        topics_data_unstem.append((topics_item[0], '+'.join(curr_data)))
    return topics_data_unstem

In [235]:
def print_topics(topics_matrix):
    pure_topics_list = [[min(curr_stem_map[w]) for w, p in words] for i, words in topics_matrix]
    for pt in pure_topics_list:
        print(pt, '\n')

### Run LDA calculation for [5, 25, 50] topic number

##### Result for topics_number = 5

In [226]:
lda_5 = models.ldamodel.LdaModel(corpus, num_topics = 5, id2word = dictionary, passes=15)

In [246]:
topics_data_lda5 = lda_5.print_topics(num_topics=5, num_words=10)
unstem(topics_data_lda5)[:10]

[(0,
  '0.018*"secure"+0.017*"government"+0.010*"also"+0.009*"military"+0.008*"region"+0.007*"stated"+0.007*"country"+0.006*"situation"+0.006*"support"+0.006*"peace"'),
 (1,
  '0.020*"said"+0.008*"political"+0.007*"president"+0.006*"like"+0.006*"time"+0.006*"deal"+0.005*"government"+0.005*"administration"+0.005*"last"+0.005*"party"'),
 (2,
  '0.010*"freedom"+0.010*"world"+0.009*"calling"+0.008*"right"+0.008*"people"+0.007*"peace"+0.007*"speech"+0.007*"religious"+0.006*"human"+0.006*"history"'),
 (3,
  '0.015*"work"+0.014*"know"+0.014*"calling"+0.012*"time"+0.009*"like"+0.009*"meeting"+0.009*"said"+0.007*"also"+0.007*"think"+0.006*"today"'),
 (4,
  '0.010*"work"+0.009*"developer"+0.008*"needs"+0.007*"government"+0.007*"publicity"+0.006*"people"+0.006*"policy"+0.006*"economic"+0.006*"support"+0.006*"also"')]

In [238]:
print_topics(lda_5.show_topics(formatted=False, num_words=20))

['secure', 'government', 'also', 'military', 'region', 'stated', 'country', 'situation', 'support', 'peace', 'time', 'source', 'political', 'attack', 'international', 'force', 'concerned', 'well', 'effort', 'work'] 

['said', 'political', 'president', 'like', 'time', 'deal', 'government', 'administration', 'last', 'party', 'making', 'also', 'former', 'told', 'people', 'even', 'first', 'policy', 'think', 'campaign'] 

['freedom', 'world', 'calling', 'right', 'people', 'peace', 'speech', 'religious', 'human', 'history', 'opening', 'want', 'text', 'powerful', 'needs', 'like', 'democracy', 'making', 'know', 'last'] 

['work', 'know', 'calling', 'time', 'like', 'meeting', 'said', 'also', 'think', 'today', 'personal', 'coming', 'morning', 'good', 'people', 'backing', 'week', 'going', 'well', 'talk'] 

['work', 'developer', 'needs', 'government', 'publicity', 'people', 'policy', 'economic', 'support', 'also', 'world', 'global', 'health', 'diplomacy', 'international', 'million', 'foreign', 'he

##### Result for topics_number = 25

In [233]:
lda_25 = models.ldamodel.LdaModel(corpus, num_topics = 25, id2word = dictionary, passes=15)

In [245]:
topics_data_lda25 = lda_25.print_topics(num_topics=5, num_words=10)
unstem(topics_data_lda25)[:10]

[(21,
  '0.019*"said"+0.014*"secure"+0.009*"information"+0.009*"providing"+0.009*"force"+0.008*"ordered"+0.007*"police"+0.007*"also"+0.007*"report"+0.006*"casing"'),
 (8,
  '0.019*"government"+0.018*"source"+0.016*"also"+0.014*"sensitive"+0.014*"individual"+0.012*"situation"+0.012*"secure"+0.011*"opinion"+0.011*"region"+0.011*"stated"'),
 (2,
  '0.026*"political"+0.020*"conservative"+0.017*"company"+0.016*"movement"+0.015*"group"+0.015*"million"+0.012*"corporate"+0.009*"business"+0.009*"liberate"+0.008*"like"'),
 (6,
  '0.024*"publicity"+0.020*"work"+0.014*"engagement"+0.013*"developer"+0.011*"media"+0.009*"people"+0.009*"health"+0.009*"officer"+0.008*"technology"+0.007*"diplomacy"'),
 (13,
  '0.022*"government"+0.012*"country"+0.011*"military"+0.009*"political"+0.009*"said"+0.007*"report"+0.006*"stated"+0.006*"time"+0.006*"secure"+0.006*"local"')]

In [237]:
print_topics(lda_25.show_topics(formatted=False, num_words=20))

['said', 'attack', 'cent', 'secure', 'intelligence', 'official', 'media', 'administration', 'week', 'terrorist', 'campaign', 'last', 'cover', 'told', 'foreign', 'people', 'point', 'source', 'terror', 'meeting'] 

['freedom', 'history', 'symbolic', 'opening', 'future', 'wall', 'white', 'world', 'people', 'city', 'turn', 'courage', 'century', 'last', 'without', 'alliance', 'built', 'celebration', 'revolution', 'point'] 

['much', 'think', 'like', 'talk', 'wrote', 'always', 'time', 'good', 'surely', 'know', 'right', 'done', 'work', 'powerful', 'looking', 'party', 'play', 'something', 'backing', 'making'] 

['government', 'economic', 'percent', 'money', 'needs', 'economy', 'spending', 'like', 'people', 'publicity', 'financial', 'political', 'growth', 'also', 'international', 'first', 'chronic', 'budget', 'debt', 'effect'] 

['personal', 'registered', 'information', 'confidential', 'press', 'message', 'opinion', 'intended', 'enough', 'best', 'also', 'favor', 'part', 'number', 'course', 'ope

##### Result for topics_number = 50

In [184]:
lda = models.ldamodel.LdaModel(corpus, num_topics = 50, id2word = dictionary, passes=15)

In [232]:
topics_data_lda50 = lda.print_topics(num_topics=5, num_words=10)
unstem(topics_data_lda50)[:10]

[(28,
  '0.028*"million"+0.025*"money"+0.022*"company"+0.015*"corporate"+0.013*"said"+0.012*"funds"+0.012*"spending"+0.010*"business"+0.008*"also"+0.008*"climate"'),
 (33,
  '0.033*"family"+0.023*"police"+0.016*"said"+0.014*"casing"+0.014*"home"+0.012*"work"+0.012*"child"+0.010*"legal"+0.010*"last"+0.010*"take"'),
 (46,
  '0.164*"percent"+0.029*"disease"+0.022*"rating"+0.018*"support"+0.018*"primary"+0.016*"population"+0.014*"terror"+0.014*"program"+0.013*"handling"+0.013*"health"'),
 (19,
  '0.077*"time"+0.062*"route"+0.031*"hotel"+0.027*"room"+0.026*"photo"+0.024*"opening"+0.024*"delegation"+0.015*"regime"+0.014*"media"+0.014*"staff"'),
 (49,
  '0.020*"cent"+0.019*"military"+0.017*"conflict"+0.015*"government"+0.011*"police"+0.010*"needs"+0.009*"major"+0.009*"publicity"+0.008*"making"+0.008*"support"')]

In [236]:
print_topics(lda.show_topics(formatted=False, num_words=20))

['secure', 'attack', 'government', 'violence', 'protect', 'country', 'religious', 'threat', 'also', 'region', 'armed', 'consulate', 'condemned', 'group', 'people', 'violent', 'seen', 'personnel', 'response', 'investigating'] 

['said', 'director', 'film', 'executive', 'book', 'work', 'festive', 'censorship', 'professional', 'production', 'lead', 'also', 'correct', 'attention', 'political', 'writer', 'many', 'think', 'making', 'action'] 

['bill', 'vote', 'reform', 'carefully', 'health', 'legislative', 'floor', 'amend', 'debate', 'outright', 'time', 'pass', 'million', 'committee', 'said', 'next', 'major', 'first', 'lunch', 'even'] 

['former', 'military', 'week', 'think', 'invasion', 'also', 'said', 'inquiry', 'memo', 'like', 'making', 'coming', 'without', 'last', 'take', 'made', 'foreign', 'speech', 'time', 'secretary'] 

['also', 'death', 'concerned', 'attempt', 'intimidate', 'paper', 'information', 'believe', 'deal', 'coming', 'support', 'dangerous', 'press', 'short', 'attack', 'nati

### Conclusion
We can say that after processing topics modulation on corpus we obtained reasonable results of lda. However, we noticed that thematically tokens are more close for topics number = 50. Concerning messages groupped by subject, we see that, the larger group is, the more related tokens are. Since the corpus of groupped messages is not large, a high value of of topics number is meaningless.