In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import show
%matplotlib inline

import glob # to find all files in folder

import pycountry
import re # regex
from nltk.sentiment.util import *
import nltk as nl
from nltk.corpus import stopwords as nlstopw
import string

## Import data


In [None]:
# Directory (relative to the notebook) that holds the CSV files loaded below.
folder = 'hillary-clinton-emails/'

List all files in the hillary-clinton-emails folder

In [None]:
# Show every path matched directly inside the data folder.
glob.glob(folder + '*')

In [None]:
# Load Aliases.csv and preview its first two rows.
aliases = pd.read_csv(folder + 'Aliases.csv')
aliases.head(2)

In [None]:
# Load EmailReceivers.csv and preview its first two rows.
receivers = pd.read_csv(folder + 'EmailReceivers.csv')
receivers.head(2)

In [None]:
# Load Emails.csv (the table analysed in the rest of the notebook) and preview its first two rows.
emails = pd.read_csv(folder + 'Emails.csv')
emails.head(2)

In [None]:
# Load Persons.csv and preview its first two rows.
persons = pd.read_csv(folder + 'Persons.csv')
persons.head(2)

# Analyse Emails

In [None]:
# Preview the first rows of the emails table.
emails.head()

The column ExtractedBodyText is supposed to hold the content of the mail, but some mails have ExtractedBodyText = NaN even though the Rawtext seems to contain something

In [None]:
# List the column labels available in the emails table.
emails.columns

In [None]:
# Total number of rows (one row per email) in the emails table.
print('Number of emails: ', emails.shape[0])

In [None]:
# Count the emails whose extracted body is missing.
bodyNaN = emails.ExtractedBodyText.isnull().sum()
# The ratio is scaled by 100 so that the number printed in front of '%' really is a percentage
# (the raw fraction would show e.g. 0.15% for 15%).
print('Number of emails with ExtractedBodyText=NaN: {}, ({:.2f}%)'.format(bodyNaN, 100 * bodyNaN / len(emails)))

We could also use the subject since it is usually a summary of the mail

In [None]:
# Count the emails whose extracted subject is missing. A dedicated name is used so the
# body count stored in `bodyNaN` is not silently overwritten with subject figures.
subjectNaN = emails.ExtractedSubject.isnull().sum()
# Report the subject count (not the body count) and scale the ratio to a real percentage.
print('Number of emails with ExtractedSubject=NaN: {}, ({:.2f}%)'.format(subjectNaN, 100 * subjectNaN / len(emails)))

Now let's try to combine the subject and the body, and drop the mails that have both subject = NaN and body = NaN

In [None]:
# Emails for which neither a body nor a subject could be extracted.
bothMissing = emails.ExtractedBodyText.isnull() & emails.ExtractedSubject.isnull()
subBodyNan = emails[bothMissing]
nbBothMissing = len(subBodyNan)
# The second figure is the share of such emails, expressed as a fraction of all emails.
print('Number of email where both subject and body is NaN: {}({:.2f})'.format(nbBothMissing, nbBothMissing / len(emails)))

Well, that number is small enough to drop all emails where both the extracted subject and the extracted body are NaN.

Let's drop them and create a new column SubjectBody that is the concatenation of the two columns ExtractedSubject and ExtractedBodyText. From now on we will work with that column.

In [None]:
# Keep an email as soon as at least one of its body / subject was extracted
# (i.e. drop only the rows where both are NaN).
hasBody = emails.ExtractedBodyText.notnull()
hasSubject = emails.ExtractedSubject.notnull()
emails = emails[hasBody | hasSubject]
len(emails)

In [None]:
# Replace the missing bodies / subjects with empty strings. The columns are re-assigned on a
# new frame instead of calling fillna(inplace=True) on an attribute of a filtered frame:
# that is chained assignment, which raises SettingWithCopyWarning and may leave `emails` untouched.
emails = emails.assign(
    ExtractedBodyText=emails.ExtractedBodyText.fillna(''),
    ExtractedSubject=emails.ExtractedSubject.fillna(''),
)
# Join body and subject with a space so the last word of the body and the first word of the
# subject stay two distinct tokens; strip the padding left when one of the two is empty.
emails['SubjectBody'] = (emails.ExtractedBodyText + ' ' + emails.ExtractedSubject).str.strip()
emails.SubjectBody.head()

Last check to be sure that our column of interest no longer contains any NaN

In [None]:
# Count the rows whose combined text is still missing.
nbMissingSubjectBody = emails['SubjectBody'].isnull().sum()
print('Number of NaN in columns SubjectBody: ', nbMissingSubjectBody)

# Keep only mail that mentions a country

Structure of a country in pycountry.countries

In [None]:
# Display the first entry yielded by pycountry.countries to inspect its fields.
next(iter(pycountry.countries))

First we create a dataframe with one line per country and we count, for each country, its occurrences in the mails.

Since a country can be referenced in many ways (Switzerland, switzerland, CH), we need to consider all the possible forms.

We may have a problem with words that have several meanings, like US (country) and us (pronoun), so we can't just put all the country names and all the mails in lower case and compare them.

Here are the considerations we use:
    1. the country name can appear either in lower case, with the first letter in uppercase, or all in uppercase
    2. the alpha_2 and alpha_3 codes are always used in uppercase


In [None]:
# One row per pycountry entry holding three spellings of its name:
# all lower case, all upper case and title case.
country_name = np.array([
    [entry.name.lower(), entry.name.upper(), entry.name.title()]
    for entry in pycountry.countries
])
country_name[:5]

In [None]:
# alpha_2 attribute of every pycountry entry, in iteration order.
alpha_2 = np.array([entry.alpha_2 for entry in pycountry.countries])
alpha_2[:5]

In [None]:
# alpha_3 attribute of every pycountry entry, in iteration order.
alpha_3 = np.array([entry.alpha_3 for entry in pycountry.countries])
alpha_3[:5]

In [None]:
# Check the dimensions of the name array (one row per country, one column per spelling).
country_name.shape

In [None]:
# Put the two code arrays side by side, then append them to the three name spellings
# so that every row lists all the forms of one country.
countryCodes = np.column_stack((alpha_2, alpha_3))
countries = pd.DataFrame(
    np.hstack([country_name, countryCodes]),
    # Column labels are kept verbatim (including their spelling).
    columns=['name', 'NAME', 'Name', 'Alhpa_2', 'Alph_3'],
)
countries.head()



In [None]:
# Sanity check: is the string 'aruba' present in any cell of the countries table?
countries.isin(['aruba']).any().any()

In [None]:
def check_country(row):
    """Tell whether an email mentions at least one known country form.

    The text in ``row.SubjectBody`` is split on whitespace and every cell of the
    module-level ``countries`` table is looked up among the resulting tokens.

    Returns a truthy value when at least one cell of ``countries`` equals a token.
    """
    tokens = row.SubjectBody.split()
    matches = countries.isin(tokens)
    return matches.values.any()
    

In [None]:
# Boolean mask: True for the emails in which check_country finds a country form.
mentionsCountry = emails.apply(check_country, axis=1)
emails_country = emails[mentionsCountry]
len(emails_country)


# Sentiment analysis

In [None]:
# Start the sentiment table from the combined text of the emails that mention a country.
sentiments = emails_country['SubjectBody'].to_frame()
sentiments.head()

## Cleaning

To apply the sentiment analysis we first need to clean the data

We will now use the same pipeline as in exercise 1 to clean the emails:
    - cleaning, tokenization, stopword removal, stemming

First we need to remove the punctuation from each email


In [None]:
# remove punctuation
# Translation table deleting every character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def removePunctuation(row):
    """Return ``row.SubjectBody`` with every ASCII punctuation character removed.

    A translation table is used instead of the regex ``'[' + string.punctuation + ']'``:
    inside that character class the backslash of ``string.punctuation`` escapes the
    following ``]``, so backslashes were silently left in the text.

    row: object exposing the email text as the ``SubjectBody`` attribute.
    Returns the cleaned text as a new string.
    """
    return row.SubjectBody.translate(_PUNCTUATION_TABLE)

# Store the punctuation-free version of every email in its own column.
sentiments['Punctuation'] = sentiments.apply(removePunctuation, axis='columns')
sentiments['Punctuation'].head()

In [None]:
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import stopwords
# Both downloads go through the `nl` alias bound by `import nltk as nl`; the bare name
# `nltk` is never imported explicitly and only resolves if a star import happens to leak it.
nl.download('punkt') # needed for word tokenization
nl.download('stopwords') # needed for stopwords.words('english')

Tokenize

In [None]:
# Split every cleaned email into a list of word tokens.
sentiments['Tokens'] = sentiments['Punctuation'].map(nl.word_tokenize)
sentiments['Tokens'].head()


Stopword

We also need to include the stop words specific to the subject field

In [None]:
# English stop words, extended with the reply / forward markers found in subjects.
stop = set(stopwords.words('english'))
email_stopwords = stop | {'re', 'fw', 'fvv', 'fwd'}

def removeStopWords(row):
    """Return the tokens of ``row`` whose lower-cased form is not in ``email_stopwords``.

    row: iterable of string tokens. The original casing and order of the kept
    tokens are preserved.
    """
    kept = []
    for token in row:
        if token.lower() in email_stopwords:
            continue
        kept.append(token)
    return kept

sentiments['StopWords'] = sentiments['Tokens'].apply(removeStopWords)
sentiments['StopWords'].head()

Stemmatize


In [None]:
# A single stemmer instance shared by all calls.
stemmer_E = EnglishStemmer()

def stemmatize(row):
    """Return the list of stems obtained by passing every token of ``row`` to ``stemmer_E``."""
    return list(map(stemmer_E.stem, row))

sentiments['Stem'] = sentiments['StopWords'].apply(stemmatize)
sentiments['Stem'].head()

## Analysis

We will do a sentiment analysis on each sentence and then compute a score for each country

We will compare different module:
    - nltk.sentiment.util
    - nltk.sentiment.vader

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# SentimentIntensityAnalyzer loads the 'vader_lexicon' resource; without this download
# a fresh environment fails with a LookupError.
nl.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()
def sentiment_analysis(row):
    """Score one email with VADER.

    row: list of tokens; they are joined with single spaces before scoring.
    Returns a pandas Series holding the 'pos' and 'neg' components of the VADER score.
    """
    # NOTE(review): the tokens scored here are stemmed and punctuation-free; VADER is
    # lexicon based, so scoring the raw text may match more entries - confirm the intent.
    score = sid.polarity_scores(' '.join(row))
    return pd.Series({'pos': score['pos'], 'neg': score['neg'] })

emailScores = sentiments.Stem.apply(sentiment_analysis)
# Drop any 'pos' / 'neg' columns left by a previous run of this cell so that re-running it
# does not append duplicated columns.
sentiments = pd.concat([sentiments.drop(['pos', 'neg'], axis=1, errors='ignore'), emailScores], axis=1)
sentiments.head()

Save the result to a CSV file since it takes quite some time to compute the scores


In [None]:
# Persist the scored emails so the sentiment scores do not have to be recomputed.
sentiments.to_csv('mailScore.csv')

## Aggregate by countries

In [None]:
# Reload the cached scores. The first CSV column is the index written by to_csv; reading it
# back as the index restores the original labels and avoids a stray 'Unnamed: 0' column.
sentiments = pd.read_csv('mailScore.csv', index_col=0)

In [None]:
def aggScoreByCountry(country):
    """Aggregate the sentiment of all the emails mentioning one country.

    country: one row of the ``countries`` table; each of its string cells is a form
        (name spelling or code) under which the country may appear.

    An email mentions the country when one of the whitespace-separated tokens of its
    ``SubjectBody`` equals one of these forms (case-sensitive).

    Returns 0 when no email of the module-level ``sentiments`` table mentions the
    country, otherwise ``(mean(pos) - mean(neg)) / 2`` over the matching emails.
    """
    # Only string cells can match a token; a set gives a constant-time lookup per token.
    forms = {value for value in country.values if isinstance(value, str)}
    mentioned = sentiments.SubjectBody.apply(
        lambda text: not forms.isdisjoint(text.split())
    ).astype(bool)
    sent = sentiments[mentioned]
    if sent.empty:
        return 0
    return (sent.pos.mean() - sent.neg.mean()) / 2

In [None]:
# Compute one score per row of `countries`; aggScoreByCountry receives each row in turn.
countries['Score'] = countries.apply(aggScoreByCountry, axis=1)

Drop all countries that have a score of 0 (either they never appear in the mails or they have a neutral sentiment)

In [None]:
# Keep only the countries whose aggregated score differs from 0.
hasScore = countries['Score'] != 0
countries = countries[hasScore]
len(countries)

In [None]:
# DataFrame.sort(columns=...) was removed from pandas; sort_values(by=...) is its replacement.
countries_sorted = countries.sort_values(by='Score')
f, axs = plt.subplots(1,1,figsize=(15,5))
# One bar per country, ordered by increasing score.
index = np.arange(len(countries_sorted))
bar_width = 0.95
axs.bar(index, countries_sorted.Score, width=bar_width)
# Label the figure so that it can be read on its own.
axs.set_xticks(index)
axs.set_xticklabels(countries_sorted.Name, rotation=90)
axs.set_xlabel('Country')
axs.set_ylabel('Score')
axs.set_title('Sentiment score by country')


In [None]:
# Country names in the order used by the bar chart (sorted by score).
countries_sorted['Name']