In [None]:
from os import path
from wordcloud import WordCloud, STOPWORDS
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sys
import nltk
from scipy.stats import itemfreq

In [None]:
# Directory (relative to the notebook's working directory) that holds the CSV files.
current_directory = 'hillary-clinton-emails/'

# Read the emails table, using the first CSV column as the index.
emails_csv = path.join(current_directory, 'Emails.csv')
df = pd.read_csv(emails_csv, index_col=0)
df.head()

---
# Word Cloud

## Generate word cloud without preprocessing

We use the `wordcloud` package to generate a word cloud. The text comes from the `ExtractedBodyText` column of this dataframe.

In [None]:
# Join every non-missing email body into one string, one email per line.
# An explicit separator matters: `Series.sum()` concatenates with no delimiter,
# which fuses the last word of one email onto the first word of the next and
# builds the result through repeated string concatenation.
all_body_text = '\n'.join(df.loc[:, 'ExtractedBodyText'].dropna().astype(str))

In [None]:
# Build the cloud directly from the concatenated text, with no preprocessing of our own.
wordcloud = WordCloud(max_font_size=40).generate(all_body_text)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots()
ax.imshow(wordcloud)
ax.axis("off")
plt.show()

## Generate word cloud with preprocessing

### Preprocessing Strategies

1. Stopword filtering: remove common email words such as `fw`, `fwd`, `subject`, and so on
2. Case folding: convert all letters to lower case
3. Punctuation removal: strip all punctuation characters

TODO: (maybe)

1. Try Lemmatization
2. Try Stemming

##### Remove Punctuation

In [None]:
import string

# Translation table that deletes every character in `string.punctuation`.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return `text` with every ASCII punctuation character deleted.

    Characters are removed outright (not replaced by a space), so
    "don't" becomes "dont" and "a,b" becomes "ab".
    """
    return text.translate(_PUNCTUATION_TABLE)

##### Generate stopwords

In [None]:
# Extra words to exclude from the cloud, on top of wordcloud's built-in STOPWORDS.
email_word_list = [
    'fw', 'fwd', 'subject', 'to', 're', 'pm', 'will', 'case', 'also', 're',
    'call', 'may', 'mr', 'ms', 'mrs',
]

# Union of the built-in stopwords and the custom list above.
stopwords = set(STOPWORDS) | set(email_word_list)

##### Processing words

In [None]:
# Concatenate all the extracted body text, one email per line. Joining with a
# separator (rather than `Series.sum()`) keeps the last word of one email from
# being glued onto the first word of the next.
txt = '\n'.join(df.loc[:, 'ExtractedBodyText'].dropna().astype(str))
txt = remove_punctuation(txt)

# Case-fold each token once, then drop the stopwords.
lowered_tokens = (token.lower() for token in nltk.word_tokenize(txt))
tokens = [token for token in lowered_tokens if token not in stopwords]

# Count occurrences of each distinct token. `np.unique(..., return_counts=True)`
# is the documented replacement for `scipy.stats.itemfreq`, which is deprecated
# and no longer available in newer SciPy releases. It returns the sorted unique
# tokens and their integer counts.
tokens, tokens_freq = np.unique(tokens, return_counts=True)
tokens_freq = tokens_freq.astype(int)

# (token, count) pairs, in the sorted token order returned by np.unique.
tokens_x_freq = list(zip(tokens, tokens_freq))

##### Create word cloud

In [None]:
# `generate_from_frequencies` expects a mapping of word -> frequency, so convert
# the (token, count) pairs to a dict first. The `stopwords` argument is omitted
# because WordCloud ignores it on this code path; filtering was already done
# when the tokens were built.
word_frequencies = dict(tokens_x_freq)
wordcloud = WordCloud(max_font_size=40).generate_from_frequencies(word_frequencies)

# Render the cloud as an image with the axes hidden.
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

#### Stemming

In [None]:
from nltk.stem import PorterStemmer, SnowballStemmer

# Quick demo: run the Porter stemmer over a whitespace-tokenised sentence and
# print one stem per line.
# NOTE(review): the sample sentence is Portuguese while the rest of this
# notebook processes English email text - confirm this is the intended sample.
pt = PorterStemmer()
text = 'Clarissa risca com giz no quadro-negro a paisagem que os alunos devem copiar . Uma casinha de porta e janela , em cima duma coxilha .'
stems = [pt.stem(token) for token in text.split()]
print('\n'.join(stems))

In [None]:
# Rebind `pt` to the English Snowball stemmer; compare_stem reads this global.
pt = SnowballStemmer('english')


def compare_stem(word1, word2):
    """Print the stems of two words and report whether they are equal.

    Both words are stemmed with the module-level stemmer `pt`. Each stem is
    printed on its own line (word1 first, then word2).

    Returns:
        True if both words reduce to the same stem, False otherwise.
    """
    stem1, stem2 = pt.stem(word1), pt.stem(word2)
    print(stem1, stem2, sep='\n')
    return stem1 == stem2