In [None]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
import string

# HW5
### 1 Word clouds

Let's first look at a raw word cloud:

In [None]:
# Load the e-mail table, then glue the `RawText` column into a single
# newline-separated string that serves as the unprocessed corpus.
emails = pd.read_csv('hillary-clinton-emails/Emails.csv')
raw_corpus = '\n'.join(emails.loc[:, 'RawText'])

In [None]:
# Word cloud generated directly from the raw corpus, drawn on the current
# axes with ticks and frame hidden.
wordcloud = WordCloud().generate(raw_corpus)
ax = plt.gca()
ax.imshow(wordcloud)
ax.axis('off')
plt.show()

Now let's create a pipeline and process the text again:

In [None]:
# Don't forget to run nltk.download() if not already done...
from nltk.tokenize import RegexpTokenizer #, StanfordTokenizer

def tokenize(str):
    """Split a text into sentences, and each sentence into word tokens.

    Sentences are found with ``nltk.sent_tokenize``; inside each sentence
    only runs of word characters are kept, so punctuation is dropped.

    Args:
        str: The text to tokenize. (The name shadows the built-in ``str``
            inside this function; it is kept for caller compatibility.)

    Returns:
        A list with one entry per sentence, each entry being the list of
        word tokens found in that sentence.
    """
    # Raw string: '\w' in a plain literal is an invalid escape sequence.
    tokenizer = RegexpTokenizer(r'\w+')
    return [tokenizer.tokenize(s) for s in nltk.sent_tokenize(str)]

def remove_stopwords(words, stop_words=None):
    """Return the words that are not stop words (case-insensitive).

    Args:
        words: Iterable of word strings.
        stop_words: Optional iterable of stop words to filter out. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        A list of the words from ``words``, in their original order and
        casing, whose lower-cased form is not a stop word.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup set once instead of re-reading the stop-word list
    # (and scanning it linearly) for every single word.
    stop_set = frozenset(s.lower() for s in stop_words)
    return [w for w in words if w.lower() not in stop_set]

def stemming(words):
    """Apply NLTK's Porter stemmer to every word.

    Args:
        words: Iterable of word strings.

    Returns:
        A new list holding the stem of each word, in input order.
    """
    stem = nltk.PorterStemmer().stem
    return [stem(token) for token in words]

def pipeline(str, return_as_str=False, do_remove_stopwords=False):
    """Tokenize a text, optionally drop stop words, and stem what is left.

    Args:
        str: The text to process.
        return_as_str: If true, return one space-joined string of all stems;
            otherwise return a list of per-sentence stem lists.
        do_remove_stopwords: If true, filter each sentence through
            ``remove_stopwords`` before stemming.

    Returns:
        Either a single string or a list of lists of stems, depending on
        ``return_as_str``.
    """
    processed = []
    for sentence in tokenize(str):
        kept = remove_stopwords(sentence) if do_remove_stopwords else sentence
        stems = stemming(kept)
        processed.append(' '.join(stems) if return_as_str else stems)
    return ' '.join(processed) if return_as_str else processed

#tokenize = lambda email: nltk.word_tokenize(email)
#not_stopword = lambda word: word not in stopwords.words('english')
# not_punctuation = lambda word: word not in string.punctuation

#tokens_list = []
#for email in emails['RawText']:
#    tokens = pipeline(email, do_remove_stopwords=True)
#    tokens_list.append(tokens)

In [None]:
# Run the whole corpus through the pipeline (stop words removed, stems
# joined back into one string) so it can be fed to WordCloud.
stemmed = pipeline(
    raw_corpus,
    do_remove_stopwords=True,
    return_as_str=True,
)

In [None]:
# Word cloud generated from the pre-processed (stemmed) corpus, drawn on
# the current axes with ticks and frame hidden.
wordcloud_2 = WordCloud().generate(stemmed)
ax = plt.gca()
ax.imshow(wordcloud_2)
ax.axis('off')
plt.show()

The two word clouds are very similar.  
The first approach is faster and more straightforward, but it may lack fine-grained control over the language processing.  
The second one is much slower because we run through several pre-processing steps, but its main advantage is the ability to tune some parameters (stop words, stemming, etc.).

### Part 2

In [None]:
from pycountry import countries

def get_country(text):
    """Return the pycountry name matching ``text``, or ``None``.

    Only strings longer than three characters are looked up, so two- and
    three-letter codes are never treated as country names. The text is
    title-cased before the lookup by name.

    Args:
        text: Candidate country name (typically a single token).

    Returns:
        The ``name`` of the matching country record, or ``None`` when the
        input is not a string, is too short, or matches no country.
    """
    if not isinstance(text, str) or len(text) <= 3:
        return None
    try:
        match = countries.get(name=text.title())
    except LookupError:
        # Some pycountry releases raise KeyError on a miss instead of
        # returning None; both mean "no such country".
        return None
    return match.name if match is not None else None

In [None]:
# Build the per-email token lists here so the cell runs on a fresh kernel.
# Tokens are left unstemmed because stemming would alter country names
# before the lookup. Sentences are flattened to one word list per email.
# NOTE(review): single tokens cannot match multi-word country names — confirm
# this limitation is acceptable.
tokens = [
    [word for sentence in tokenize(email) for word in sentence]
    for email in emails['RawText']
]

# Map each country name to the indices of the emails mentioning it. An index
# is recorded once per mention, so it can appear several times.
mentioned = {}
for i, email_tokens in enumerate(tokens):
    for token in email_tokens:
        country = get_country(token)
        if country:
            mentioned.setdefault(country, []).append(i)

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
# Score each email as a whole. Mapping polarity_scores over the string itself
# would score it character by character. Only the 'compound' value is kept so
# that every email is a single number that can be averaged later on.
sentiments = [sid.polarity_scores(text)['compound'] for text in emails['RawText']]

In [None]:
# Display the first five entries as a quick sanity check.
sentiments[:5]

In [None]:
# Average sentiment per country over the emails that mention it.
# `.items()` replaces the Python-2-only `.iteritems()`.
total = {
    country: np.mean([sentiments[i] for i in idx])
    for country, idx in mentioned.items()
}

### Part 3

In [None]:
# `from gensim import models.ldamodel` is a syntax error; import the class itself.
from gensim.models.ldamodel import LdaModel

# NOTE(review): `corpus` is not defined anywhere in the visible notebook;
# LdaModel presumably expects a bag-of-words corpus built beforehand — confirm.
lda = LdaModel(corpus, num_topics=10)  # number of topics to explore: 5 to 50