# Pos, Stem, and Ngrams: Oh My!
<img src="images/fdr.jpg" height="200" width="200" align="left">
<center>
<h3>In This Worksheet</h3> We will parse a new text using the methods described up to this point and introduce some new ways to characterize words and sentences in texts.
<h3>The Data</h3> <strong>Pearl Harbor Address to the Nation</strong><br><i>President Franklin Delano Roosevelt, December 8th, 1941</i><br>
This is FDR's call to war to the American people after Pearl Harbor.<br>
https://youtu.be/3VqQAf74fsE
</center>

In [10]:
from nltk import sent_tokenize, word_tokenize
import pandas as pd
from nltk.corpus import stopwords
import string

In [11]:
fp = 'data/FDR-PearlHarbor_parsed.csv'
speech = pd.read_csv(fp, index_col=0)
speech.head()

Unnamed: 0,is_punct,is_stop,sent_id,token
0,False,False,1,mr.
1,False,False,1,vice
2,False,False,1,president
3,True,False,1,","
4,False,False,1,mr.


In [12]:
from nltk import pos_tag

tokenized_sentence = word_tokenize('I saw a dog running.'.lower())
print(pos_tag(tokenized_sentence))

[('i', 'NN'), ('saw', 'VBD'), ('a', 'DT'), ('dog', 'NN'), ('running', 'NN'), ('.', '.')]


|Tag|Description|Example|
|---|---|---|
|CC|conjunction, coordinating|and, or, but|
|CD|cardinal number|five, three, 13%|
|DT|determiner|the, a, these |
|EX|existential there|there were six boys |
|FW|foreign word|mais |
|IN|conjunction, subordinating or preposition|of, on, before, unless |
|JJ|adjective|nice, easy|
|JJR|adjective, comparative|nicer, easier|
|JJS|adjective, superlative|nicest, easiest |
|LS|list item marker| |
|MD|verb, modal auxillary|may, should |
|NN|noun, singular or mass|tiger, chair, laughter |
|NNS|noun, plural|tigers, chairs, insects |
|NNP|noun, proper singular|Germany, God, Alice |
|NNPS|noun, proper plural|we met two Christmases ago |
|PDT|predeterminer|both his children |
|POS|possessive ending|'s|
|PRP|pronoun, personal|me, you, it |
|PRP\$|pronoun, possessive|my, your, our |
|RB|adverb|extremely, loudly, hard  |
|RBR|adverb, comparative|better |
|RBS|adverb, superlative|best |
|RP|adverb, particle|about, off, up |
|SYM|symbol|None|
|TO|infinitival to|what to do? |
|UH|interjection|oh, oops, gosh |
|VB|verb, base form|think |
|VBZ|verb, 3rd person singular present|she thinks |
|VBP|verb, non-3rd person singular present|I think |
|VBD|verb, past tense|they thought |
|VBN|verb, past participle|a sunken ship |
|VBG|verb, gerund or present participle|thinking is fun |
|WDT|wh-determiner|which, whatever, whichever |
|WP|wh-pronoun, personal|what, who, whom |
|WP\$|wh-pronoun, possessive|whose, whosever |
|WRB|wh-adverb|where, when |
|.|punctuation mark, sentence closer|.;?* |
|,|punctuation mark, comma|, |
|:|punctuation mark, colon|: |
|(|contextual separator, left paren|( |
|)|contextual separator, right paren|) |

In [14]:
def get_pos(word):
    return pos_tag([word])[0][1]

In [15]:
#much faster when given as list
pos_tags = pos_tag(parsed_speech.token.tolist())
just_tags = [x[1] for x in pos_tags]
parsed_speech['pos'] = pd.Series( just_tags )

NameError: name 'parsed_speech' is not defined

In [16]:
parsed_speech.head()

NameError: name 'parsed_speech' is not defined

In [17]:
parsed_speech.pos.value_counts()

NameError: name 'parsed_speech' is not defined

In [18]:
noun_pos = ['NN', 'NNS', 'NNP', 'NNPS']
parsed_speech[parsed_speech.pos.isin(noun_pos)].token.value_counts().head(10)

NameError: name 'parsed_speech' is not defined

In [19]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')
parsed_speech['stem'] = parsed_speech.token.apply(stemmer.stem)

NameError: name 'parsed_speech' is not defined

In [13]:
parsed_speech.head()

Unnamed: 0,sent_id,token,is_stop,is_punct,pos,stem
0,0,mr.,False,False,JJ,mr.
1,0,vice,False,False,NN,vice
2,0,president,False,False,NN,presid
3,0,",",False,True,",",","
4,0,mr.,False,False,NN,mr.


In [25]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('aardwolves')

'aardwolf'

In [14]:
no_stops = parsed_speech[ (parsed_speech['is_stop'] == False) 
                         & (parsed_speech.is_punct == False) ]

In [15]:
 no_stops.token.value_counts().head(10)

japanese     10
american      6
attacked      6
forces        6
united        6
states        6
people        5
japan         5
attack        5
yesterday     4
Name: token, dtype: int64

In [16]:
no_stops.stem.value_counts().head(10)

attack      11
japanes     10
state        9
unit         6
forc         6
american     6
japan        5
island       5
peopl        5
last         4
Name: stem, dtype: int64

In [None]:
#CONVERT EVERYTHING TO SENTENCE thing

In [21]:
#ngrams on sentences
from nltk import ngrams
first_tokens = parsed_speech[parsed_speech.sent_id==0].token.tolist()
bigrams = ngrams(first_tokens, 2)
print(list(bigrams))

[('mr.', 'vice'), ('vice', 'president'), ('president', ','), (',', 'mr.'), ('mr.', 'speaker'), ('speaker', ','), (',', 'members'), ('members', 'of'), ('of', 'the'), ('the', 'senate'), ('senate', ','), (',', 'and'), ('and', 'of'), ('of', 'the'), ('the', 'house'), ('house', 'of'), ('of', 'representatives'), ('representatives', ':'), (':', 'yesterday'), ('yesterday', ','), (',', 'december'), ('december', '7th'), ('7th', ','), (',', '1941'), ('1941', '--'), ('--', 'a'), ('a', 'date'), ('date', 'which'), ('which', 'will'), ('will', 'live'), ('live', 'in'), ('in', 'infamy'), ('infamy', '--'), ('--', 'the'), ('the', 'united'), ('united', 'states'), ('states', 'of'), ('of', 'america'), ('america', 'was'), ('was', 'suddenly'), ('suddenly', 'and'), ('and', 'deliberately'), ('deliberately', 'attacked'), ('attacked', 'by'), ('by', 'naval'), ('naval', 'and'), ('and', 'air'), ('air', 'forces'), ('forces', 'of'), ('of', 'the'), ('the', 'empire'), ('empire', 'of'), ('of', 'japan'), ('japan', '.')]


In [22]:
trigrams = ngrams(first_tokens, 3)
print(list(trigrams))

[('mr.', 'vice', 'president'), ('vice', 'president', ','), ('president', ',', 'mr.'), (',', 'mr.', 'speaker'), ('mr.', 'speaker', ','), ('speaker', ',', 'members'), (',', 'members', 'of'), ('members', 'of', 'the'), ('of', 'the', 'senate'), ('the', 'senate', ','), ('senate', ',', 'and'), (',', 'and', 'of'), ('and', 'of', 'the'), ('of', 'the', 'house'), ('the', 'house', 'of'), ('house', 'of', 'representatives'), ('of', 'representatives', ':'), ('representatives', ':', 'yesterday'), (':', 'yesterday', ','), ('yesterday', ',', 'december'), (',', 'december', '7th'), ('december', '7th', ','), ('7th', ',', '1941'), (',', '1941', '--'), ('1941', '--', 'a'), ('--', 'a', 'date'), ('a', 'date', 'which'), ('date', 'which', 'will'), ('which', 'will', 'live'), ('will', 'live', 'in'), ('live', 'in', 'infamy'), ('in', 'infamy', '--'), ('infamy', '--', 'the'), ('--', 'the', 'united'), ('the', 'united', 'states'), ('united', 'states', 'of'), ('states', 'of', 'america'), ('of', 'america', 'was'), ('ameri