# Women Who Code
# NLP exercise - Hillary Clinton's Emails Subject Analysis

## Exploratory Analysis: Getting and Cleaning Data


In [1]:
# Loading libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

Emails.csv

1. Id - unique identifier for internal reference
2. DocNumber - FOIA document number
3. MetadataSubject - Email SUBJECT field (from the FOIA metadata)
4. MetadataTo - Email TO field (from the FOIA metadata)
5. MetadataFrom - Email FROM field (from the FOIA metadata)
6. SenderPersonId - PersonId of the email sender (linking to Persons table)
7. MetadataDateSent - Date the email was sent (from the FOIA metadata)
8. MetadataDateReleased - Date the email was released (from the FOIA metadata)
9. MetadataPdfLink - Link to the original PDF document (from the FOIA metadata)
10. MetadataCaseNumber - Case number (from the FOIA metadata)
11. MetadataDocumentClass - Document class (from the FOIA metadata)
12. ExtractedSubject - Email SUBJECT field (extracted from the PDF)
13. ExtractedTo - Email TO field (extracted from the PDF)
14. ExtractedFrom - Email FROM field (extracted from the PDF)
15. ExtractedCc - Email CC field (extracted from the PDF)
16. ExtractedDateSent - Date the email was sent (extracted from the PDF)
17. ExtractedCaseNumber - Case number (extracted from the PDF)
18. ExtractedDocNumber - Doc number (extracted from the PDF)
19. ExtractedDateReleased - Date the email was released (extracted from the PDF)
20. ExtractedReleaseInPartOrFull - Whether the email was partially censored (extracted from the PDF)
21. ExtractedBodyText - Attempt to only pull out the text in the body that the email sender wrote (extracted from the PDF)
22. RawText - Raw email text (extracted from the PDF)

In [2]:
# Loading data into dataframe

emails = pd.read_csv("~/Documents/WWC/NLP_PYTHON/Emails.csv")



In [3]:
emails.head()

Unnamed: 0,Id,DocNumber,MetadataSubject,MetadataTo,MetadataFrom,SenderPersonId,MetadataDateSent,MetadataDateReleased,MetadataPdfLink,MetadataCaseNumber,...,ExtractedTo,ExtractedFrom,ExtractedCc,ExtractedDateSent,ExtractedCaseNumber,ExtractedDocNumber,ExtractedDateReleased,ExtractedReleaseInPartOrFull,ExtractedBodyText,RawText
0,1,C05739545,WOW,H,"Sullivan, Jacob J",87.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739545...,F-2015-04841,...,,"Sullivan, Jacob J <Sullivan11@state.gov>",,"Wednesday, September 12, 2012 10:16 AM",F-2015-04841,C05739545,05/13/2015,RELEASE IN FULL,,UNCLASSIFIED\nU.S. Department of State\nCase N...
1,2,C05739546,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,H,,,2011-03-03T05:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH1/DOC_0C05739546...,F-2015-04841,...,,,,,F-2015-04841,C05739546,05/13/2015,RELEASE IN PART,"B6\nThursday, March 3, 2011 9:45 PM\nH: Latest...",UNCLASSIFIED\nU.S. Department of State\nCase N...
2,3,C05739547,CHRIS STEVENS,;H,"Mills, Cheryl D",32.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739547...,F-2015-04841,...,B6,"Mills, Cheryl D <MillsCD@state.gov>","Abedin, Huma","Wednesday, September 12, 2012 11:52 AM",F-2015-04841,C05739547,05/14/2015,RELEASE IN PART,Thx,UNCLASSIFIED\nU.S. Department of State\nCase N...
3,4,C05739550,CAIRO CONDEMNATION - FINAL,H,"Mills, Cheryl D",32.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739550...,F-2015-04841,...,,"Mills, Cheryl D <MillsCD@state.gov>","Mitchell, Andrew B","Wednesday, September 12,2012 12:44 PM",F-2015-04841,C05739550,05/13/2015,RELEASE IN PART,,UNCLASSIFIED\nU.S. Department of State\nCase N...
4,5,C05739554,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,"Abedin, Huma",H,80.0,2011-03-11T05:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH1/DOC_0C05739554...,F-2015-04841,...,,,,,F-2015-04841,C05739554,05/13/2015,RELEASE IN PART,"H <hrod17@clintonemail.com>\nFriday, March 11,...",B6\nUNCLASSIFIED\nU.S. Department of State\nCa...


In [4]:
cols = ['Id', 'DocSubject', 'To', 'From', 'PersonId','DateSent', 'DateReleased', 'pdfLink',
        'CaseNumber', 'DocClass' 'pdfSubject', 'pdfTo', 'pdfFrom', 'pdfCc', 'pdfDateSent',
        'pdfCaseNumber', 'pdfDocNumber', 'pdfDateReleased', 'RinPartorFull', 'pdfBodyTest',
        'pdfRawEmail', 'x']

no_headers = pd.read_csv('~/Documents/WWC/NLP_PYTHON/Emails.csv', sep=',', header=0,
                         names=cols)

In [5]:
no_headers.head() 

Unnamed: 0,Id,DocSubject,To,From,PersonId,DateSent,DateReleased,pdfLink,CaseNumber,DocClasspdfSubject,...,pdfFrom,pdfCc,pdfDateSent,pdfCaseNumber,pdfDocNumber,pdfDateReleased,RinPartorFull,pdfBodyTest,pdfRawEmail,x
1,C05739545,WOW,H,"Sullivan, Jacob J",87.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739545...,F-2015-04841,HRC_Email_296,...,,"Sullivan, Jacob J <Sullivan11@state.gov>",,"Wednesday, September 12, 2012 10:16 AM",F-2015-04841,C05739545,05/13/2015,RELEASE IN FULL,,UNCLASSIFIED\nU.S. Department of State\nCase N...
2,C05739546,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,H,,,2011-03-03T05:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH1/DOC_0C05739546...,F-2015-04841,HRC_Email_296,...,,,,,F-2015-04841,C05739546,05/13/2015,RELEASE IN PART,"B6\nThursday, March 3, 2011 9:45 PM\nH: Latest...",UNCLASSIFIED\nU.S. Department of State\nCase N...
3,C05739547,CHRIS STEVENS,;H,"Mills, Cheryl D",32.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739547...,F-2015-04841,HRC_Email_296,...,B6,"Mills, Cheryl D <MillsCD@state.gov>","Abedin, Huma","Wednesday, September 12, 2012 11:52 AM",F-2015-04841,C05739547,05/14/2015,RELEASE IN PART,Thx,UNCLASSIFIED\nU.S. Department of State\nCase N...
4,C05739550,CAIRO CONDEMNATION - FINAL,H,"Mills, Cheryl D",32.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739550...,F-2015-04841,HRC_Email_296,...,,"Mills, Cheryl D <MillsCD@state.gov>","Mitchell, Andrew B","Wednesday, September 12,2012 12:44 PM",F-2015-04841,C05739550,05/13/2015,RELEASE IN PART,,UNCLASSIFIED\nU.S. Department of State\nCase N...
5,C05739554,H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...,"Abedin, Huma",H,80.0,2011-03-11T05:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH1/DOC_0C05739554...,F-2015-04841,HRC_Email_296,...,,,,,F-2015-04841,C05739554,05/13/2015,RELEASE IN PART,"H <hrod17@clintonemail.com>\nFriday, March 11,...",B6\nUNCLASSIFIED\nU.S. Department of State\nCa...


In [6]:
no_headers.ndim # Display DataFrame attributes (Number of dimensions)

2

In [7]:
no_headers.shape # Number of elements in the dataframe

(7945, 21)

In [8]:
no_headers.dtypes # Types of elements

Id                     object
DocSubject             object
To                     object
From                   object
PersonId              float64
DateSent               object
DateReleased           object
pdfLink                object
CaseNumber             object
DocClasspdfSubject     object
pdfTo                  object
pdfFrom                object
pdfCc                  object
pdfDateSent            object
pdfCaseNumber          object
pdfDocNumber           object
pdfDateReleased        object
RinPartorFull          object
pdfBodyTest            object
pdfRawEmail            object
x                      object
dtype: object

### Slicing Dataframe to extract Subject ###

*Object > Type Selection >	  Return Value Type*  

Series	 >  series[label] > scalar value  

DataFrame > frame[colname] >  Series corresponding to colname  

Panel >  panel[itemname] > DataFrame corresponding to the itemname

In [9]:
emailsSubjects = no_headers['DocSubject']

In [10]:
emailsSubjects[0:5]  

1                                                  WOW
2    H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...
3                                        CHRIS STEVENS
4                           CAIRO CONDEMNATION - FINAL
5    H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...
Name: DocSubject, dtype: object

In [11]:
emailsSubjects.dtypes

dtype('O')

In [12]:
emailsSubjects.size

7945

In [13]:
emailsSubjects.ndim 

1

## Pre-Processing Data ##

The NLTK module is a Python kit to perform Natural Language Processing (NLP). With NLTK you would be able to split sentences from paragraphs, split up words, recognize the part of speech of those words, and highlight the main subjects. In this series, we're going to focus on subject/topic mining and sentiment analysis.

### Vocabulary ###

Corpus - Body of text, singular. Corpora is the plural of this. Example: A collection of medical journals.

Lexicon - Words and their meanings. Example: English dictionary. Consider, however, that various fields will have different lexicons. For example: To a financial investor, the first meaning for the word "Bull" is someone who is confident about the market, as compared to the common English lexicon, where the first meaning for the word "Bull" is an animal. As such, there is a special lexicon for financial investors, doctors, children, mechanics, and so on.

Token - Each "entity" that is a part of whatever was split up based on rules. For examples, each word is a token when a sentence is "tokenized" into words. Each sentence can also be a token, if you tokenized the sentences out of a paragraph.

From:https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/

### Cleaning ###

1. Eliminating punctuation
2. Eliminating stopwords
2. Normalizing data: converting to lower case
3. Tokenizing words


In [14]:
import nltk 

In [15]:
type(emailsSubjects)

pandas.core.series.Series

In [16]:
Subject = emailsSubjects[2]

In [17]:
nltk.sent_tokenize(Subject)

['H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MORE... SID']

In [18]:
nltk.word_tokenize(Subject)

['H',
 ':',
 'LATEST',
 ':',
 'HOW',
 'SYRIA',
 'IS',
 'AIDING',
 'QADDAFI',
 'AND',
 'MORE',
 '...',
 'SID']

In [19]:
from nltk.corpus import stopwords

In [20]:
set(stopwords.words('english'))

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 'd',
 'did',
 'didn',
 'do',
 'does',
 'doesn',
 'doing',
 'don',
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 'has',
 'hasn',
 'have',
 'haven',
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 'it',
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 'more',
 'most',
 'mustn',
 'my',
 'myself',
 'needn',
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 're',
 's',
 'same',
 'shan',
 'she',
 'should',
 'shouldn',
 'so',
 'some',
 'such',
 't',
 'than',
 'that',
 'the',
 'their',
 'theirs',
 'them',
 

In [21]:
stopWords = set(stopwords.words('english'))

In [22]:
tokens = nltk.word_tokenize(Subject) # Getting all the words within the subject

In [23]:
type(tokens)

list

In [24]:
tokens

['H',
 ':',
 'LATEST',
 ':',
 'HOW',
 'SYRIA',
 'IS',
 'AIDING',
 'QADDAFI',
 'AND',
 'MORE',
 '...',
 'SID']

In [25]:
mylist = emailsSubjects[0:5]
mylist

1                                                  WOW
2    H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...
3                                        CHRIS STEVENS
4                           CAIRO CONDEMNATION - FINAL
5    H: LATEST: HOW SYRIA IS AIDING QADDAFI AND MOR...
Name: DocSubject, dtype: object

In [26]:
tokenStrg = '\n'.join(map(str, mylist))
tokenStrg

'WOW\nH: LATEST: HOW SYRIA IS AIDING QADDAFI AND MORE... SID\nCHRIS STEVENS\nCAIRO CONDEMNATION - FINAL\nH: LATEST: HOW SYRIA IS AIDING QADDAFI AND MORE... SID'

In [27]:
type(tokenStrg)

str

In [28]:
from nltk.tokenize import RegexpTokenizer # Eliminating punctuation

tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(tokenStrg)

In [29]:
tokens

['WOW',
 'H',
 'LATEST',
 'HOW',
 'SYRIA',
 'IS',
 'AIDING',
 'QADDAFI',
 'AND',
 'MORE',
 'SID',
 'CHRIS',
 'STEVENS',
 'CAIRO',
 'CONDEMNATION',
 'FINAL',
 'H',
 'LATEST',
 'HOW',
 'SYRIA',
 'IS',
 'AIDING',
 'QADDAFI',
 'AND',
 'MORE',
 'SID']

In [30]:
cleanup = [token.lower() for token in tokens if token.lower() not in stopWords and  len(token)>2]

In [31]:
cleanup # Display normalized tokens in slice 0:5

['wow',
 'latest',
 'syria',
 'aiding',
 'qaddafi',
 'sid',
 'chris',
 'stevens',
 'cairo',
 'condemnation',
 'final',
 'latest',
 'syria',
 'aiding',
 'qaddafi',
 'sid']

### Exploring Data using a WordCloud

In [32]:
type(cleanup)

list

In [33]:
tokenStrgCln = ' '.join(map(str, cleanup))
tokenStrgCln

'wow latest syria aiding qaddafi sid chris stevens cairo condemnation final latest syria aiding qaddafi sid'

In [34]:
tokensCln = tokenizer.tokenize(tokenStrgCln)

In [35]:
set(tokensCln)

{'aiding',
 'cairo',
 'chris',
 'condemnation',
 'final',
 'latest',
 'qaddafi',
 'sid',
 'stevens',
 'syria',
 'wow'}

In [36]:
import wordcloud as wc

In [37]:
from wordcloud import WordCloud, STOPWORDS

In [38]:
import matplotlib.pyplot as plt

In [39]:
#Convert all the required text into a single string here 
#and store them in word_string (tokenStrgCln)

#you can specify stopwords, background color and other options

wordcloud = WordCloud(stopwords=STOPWORDS,
                      background_color='white',width=1200,height=1000).generate(tokenStrgCln)

In [40]:
plt.imshow(wordcloud)

<matplotlib.image.AxesImage at 0x11193c978>

In [41]:
plt.axis('off')

(-0.5, 1199.5, 999.5, -0.5)

In [None]:
plt.show()

### Frequency Distribution

How can we automatically identify the words of a text that are the most informative about the topics on Hillary Clinton's emails?



In [42]:
from nltk.probability import *

In [43]:
tokens = nltk.word_tokenize(tokenStrgCln)

In [44]:
fdist = nltk.FreqDist(tokens)

In [45]:
fdist

FreqDist({'aiding': 2,
          'cairo': 1,
          'chris': 1,
          'condemnation': 1,
          'final': 1,
          'latest': 2,
          'qaddafi': 2,
          'sid': 2,
          'stevens': 1,
          'syria': 2,
          'wow': 1})

In [46]:
vocabulary = fdist.keys()

In [47]:
vocabulary

dict_keys(['syria', 'sid', 'final', 'wow', 'cairo', 'aiding', 'latest', 'chris', 'stevens', 'qaddafi', 'condemnation'])

### Cumulative frequency

Do any words produced in the last example help us grasp the topic or genre of this text? (Eliminate English "plumbing")

In [None]:
fdist.plot(5, cumulative = True)