# Exploring Natural Language Processing (NLP) with the nltk package
### References:
* https://realpython.com/nltk-nlp-python/
* https://realpython.com/python-nltk-sentiment-analysis/
* https://www.nltk.org/book/

In [1]:
# Dependencies
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk import FreqDist

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from pprint import pp


In [2]:
# NLTK data identifiers fetched up front, in the order they are requested.
# "book" is handed to the same downloader call as the individual packages.
NLTK_RESOURCES_TO_DOWNLOAD = [
    "names",
    "stopwords",
    "words",
    "state_union",
    "twitter_samples",
    "movie_reviews",
    "book",
    "maxent_ne_chunker",
    "averaged_perceptron_tagger",
    "vader_lexicon",
    "punkt",
]
nltk.download(NLTK_RESOURCES_TO_DOWNLOAD)

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\Jeff\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jeff\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Jeff\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package state_union to
[nltk_data]     C:\Users\Jeff\AppData\Roaming\nltk_data...
[nltk_data]   Package state_union is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\Jeff\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Jeff\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading collection 'bo

True

In [3]:
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


## Tokenizing by Word, by Sentence

In [4]:
# Multi-sentence sample text for the tokenizer demos.
# The value starts with a newline and has no trailing newline.
example_string = (
    "\n"
    "Muad'Dib learned rapidly because his first training was in how to learn.\n"
    "And the first lesson of all was the basic trust that he could learn.\n"
    "It's shocking to find how many people do not believe they can learn,\n"
    "and how many more believe learning to be difficult."
)

In [5]:
# Split the sample text into sentences; as the cell's last expression the
# resulting list is displayed by the notebook.
sent_tokenize(example_string)

["\nMuad'Dib learned rapidly because his first training was in how to learn.",
 'And the first lesson of all was the basic trust that he could learn.',
 "It's shocking to find how many people do not believe they can learn,\nand how many more believe learning to be difficult."]

In [6]:
# Split the same text into word and punctuation tokens and print them.
print( word_tokenize(example_string) )

["Muad'Dib", 'learned', 'rapidly', 'because', 'his', 'first', 'training', 'was', 'in', 'how', 'to', 'learn', '.', 'And', 'the', 'first', 'lesson', 'of', 'all', 'was', 'the', 'basic', 'trust', 'that', 'he', 'could', 'learn', '.', 'It', "'s", 'shocking', 'to', 'find', 'how', 'many', 'people', 'do', 'not', 'believe', 'they', 'can', 'learn', ',', 'and', 'how', 'many', 'more', 'believe', 'learning', 'to', 'be', 'difficult', '.']


## Filtering Stop Words

In [7]:
# Short quote used as input for the stop-word filtering example.
worf_quote = "Sir, I protest. I am not a merry man!"
# Tokenize into words and punctuation marks.
words_in_quote = word_tokenize(worf_quote)
print(words_in_quote)

['Sir', ',', 'I', 'protest', '.', 'I', 'am', 'not', 'a', 'merry', 'man', '!']


In [8]:
# Build the English stop-word list as a set so membership tests are O(1).
stop_words = set(stopwords.words("english"))
# Print the stop words alphabetically (capped at the first 300 entries).
# The former bare `len(stop_words)` line was dropped: as a mid-cell
# expression its value was discarded and never displayed.
print(sorted(stop_words)[:300])

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', "shouldn't", 'so', 'some',

In [9]:
# Keep only the tokens whose case-folded form is not an English stop word.
# Punctuation tokens are not stop words, so they pass through.
filtered_list = [
    token
    for token in words_in_quote
    if token.casefold() not in stop_words
]
print(filtered_list)

['Sir', ',', 'protest', '.', 'merry', 'man', '!']


## Stemming with PorterStemmer and SnowballStemmer

In [10]:
# Sample text with several inflections of "discover" for the stemming demos.
# The value starts with a newline and has no trailing newline.
string_for_stemming = (
    "\n"
    "The crew of the USS Discovery discovered many discoveries.\n"
    "Discovering is what explorers do."
)

In [11]:
# Tokenize the stemming sample; `words` is the input to both stemmer cells.
words = word_tokenize(string_for_stemming)
print(words)

['The', 'crew', 'of', 'the', 'USS', 'Discovery', 'discovered', 'many', 'discoveries', '.', 'Discovering', 'is', 'what', 'explorers', 'do', '.']


In [12]:
# Reduce every token to its Porter stem. The stems are lower-cased and are
# not necessarily dictionary words (e.g. 'discoveri', 'discov').
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, words))
print(stemmed_words)

['the', 'crew', 'of', 'the', 'uss', 'discoveri', 'discov', 'mani', 'discoveri', '.', 'discov', 'is', 'what', 'explor', 'do', '.']


In [13]:
# Repeat the stemming with the English Snowball stemmer. This rebinds both
# `stemmer` and `stemmed_words` from the Porter cell.
stemmer = SnowballStemmer("english")
stemmed_words = list(map(stemmer.stem, words))
print(stemmed_words)

['the', 'crew', 'of', 'the', 'uss', 'discoveri', 'discov', 'mani', 'discoveri', '.', 'discov', 'is', 'what', 'explor', 'do', '.']


## Tagging Parts of Speech

In [14]:
# Sample sentence for the part-of-speech tagging demos.
# The value starts with a newline and has no trailing newline.
sagan_quote = (
    "\n"
    "If you wish to make an apple pie from scratch,\n"
    "you must first invent the universe."
)

In [15]:
# Tokenize the quote; the tagger in the next step works on a list of tokens.
words_in_sagan_quote = word_tokenize( sagan_quote )
print(words_in_sagan_quote)

['If', 'you', 'wish', 'to', 'make', 'an', 'apple', 'pie', 'from', 'scratch', ',', 'you', 'must', 'first', 'invent', 'the', 'universe', '.']


In [16]:
# Tag each token with its part of speech; the result is a list of
# (word, tag) tuples.
sagan_pos = nltk.pos_tag(words_in_sagan_quote)
print( sagan_pos )

[('If', 'IN'), ('you', 'PRP'), ('wish', 'VBP'), ('to', 'TO'), ('make', 'VB'), ('an', 'DT'), ('apple', 'NN'), ('pie', 'NN'), ('from', 'IN'), ('scratch', 'NN'), (',', ','), ('you', 'PRP'), ('must', 'MD'), ('first', 'VB'), ('invent', 'VB'), ('the', 'DT'), ('universe', 'NN'), ('.', '.')]


In [17]:
# One row per token with its POS tag. The displayed view is transposed so
# the sentence reads left to right; `sagan_df` itself stays in long form.
sagan_df = pd.DataFrame(data=sagan_pos, columns=["word", "pos"])
sagan_df.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
word,If,you,wish,to,make,an,apple,pie,from,scratch,",",you,must,first,invent,the,universe,.
pos,IN,PRP,VBP,TO,VB,DT,NN,NN,IN,NN,",",PRP,MD,VB,VB,DT,NN,.


In [18]:
# Load the tagset help text
# NOTE(review): this path is specific to one Windows user profile; point it at
# the nltk_data directory of the machine running the notebook.
TAGSET_FILEN = r"C:\Users\Jeff\AppData\Roaming\nltk_data\help\tagsets\upenn_tagset.pickle"
# NOTE(review): unpickling runs arbitrary code; presumably this file comes from
# the NLTK data download, so only load it from a trusted nltk_data directory.
# The context manager closes the file handle as soon as the load finishes
# (the previous bare open() call left it open).
with open(TAGSET_FILEN, 'rb') as tagset_file:
    tagset_dict = pickle.load(tagset_file)
# Number of tag entries loaded (displayed as the cell result).
len(tagset_dict)

45

In [19]:
# Turn the tagset mapping into a table with one row per tag. The two values
# stored per tag become the 'pos_desc' and 'example' rows, the frame is
# transposed so tags form the index, and the index is then sorted and moved
# into a regular 'pos' column.
tagset_df = (
    pd.DataFrame(tagset_dict, index=['pos_desc', 'example'])
    .transpose()
    .sort_index(ignore_index=False)
    .reset_index(drop=False)
    .rename(columns={'index': 'pos'})
)
tagset_df.head()

Unnamed: 0,pos,pos_desc,example
0,$,dollar,$ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
1,'',closing quotation mark,' ''
2,(,opening parenthesis,( [ {
3,),closing parenthesis,) ] }
4,",",comma,","


In [20]:
# Show every tag whose description contains the substring 'noun'
# (case-insensitive). This is a substring match, so 'pronoun' entries
# are included as well.
tagset_df.loc[tagset_df['pos_desc'].str.lower().str.contains('noun')]

Unnamed: 0,pos,pos_desc,example
19,NN,"noun, common, singular or mass",common-carrier cabbage knuckle-duster Casino a...
20,NNP,"noun, proper, singular",Motown Venneboerger Czestochwa Ranzer Conchita...
21,NNPS,"noun, proper, plural",Americans Americas Amharas Amityvilles Amuseme...
22,NNS,"noun, common, plural",undergraduates scotches bric-a-brac products b...
25,PRP,"pronoun, personal",hers herself him himself hisself it itself me ...
26,PRP$,"pronoun, possessive",her his mine my our ours their thy your
41,WP,WH-pronoun,that what whatever whatsoever which who whom w...
42,WP$,"WH-pronoun, possessive",whose


In [21]:
# Attach the human-readable tag description to every token. The left join
# keeps all rows of sagan_df even if a tag has no description.
pd.merge(
    sagan_df,
    tagset_df[['pos', 'pos_desc']],
    on='pos',
    how='left',
    suffixes=['', '_desc'],
)

Unnamed: 0,word,pos,pos_desc
0,If,IN,"preposition or conjunction, subordinating"
1,you,PRP,"pronoun, personal"
2,wish,VBP,"verb, present tense, not 3rd person singular"
3,to,TO,"""to"" as preposition or infinitive marker"
4,make,VB,"verb, base form"
5,an,DT,determiner
6,apple,NN,"noun, common, singular or mass"
7,pie,NN,"noun, common, singular or mass"
8,from,IN,"preposition or conjunction, subordinating"
9,scratch,NN,"noun, common, singular or mass"


In [22]:
# Text made mostly of invented words, used to see how the POS tagger copes
# with vocabulary it cannot look up.
jabberwocky_excerpt = """
'Twas brillig, and the slithy toves did gyre and gimble in the wabe:
all mimsy were the borogoves, and the mome raths outgrabe."""
# Tokenize into words and punctuation marks.
words_in_excerpt = word_tokenize( jabberwocky_excerpt )
print( words_in_excerpt )

["'Twas", 'brillig', ',', 'and', 'the', 'slithy', 'toves', 'did', 'gyre', 'and', 'gimble', 'in', 'the', 'wabe', ':', 'all', 'mimsy', 'were', 'the', 'borogoves', ',', 'and', 'the', 'mome', 'raths', 'outgrabe', '.']


In [23]:
# Tag the excerpt. Despite its name, `j_pos_dict` holds the list of
# (word, tag) tuples returned by nltk.pos_tag, not a dict.
j_pos_dict = nltk.pos_tag(words_in_excerpt)
j_pos_df = pd.DataFrame(data=j_pos_dict, columns=["word", "pos"])
# Left-join the tag descriptions so every token row is kept.
pd.merge(
    j_pos_df,
    tagset_df[['pos', 'pos_desc']],
    on='pos',
    how='left',
    suffixes=['', '_desc'],
)

Unnamed: 0,word,pos,pos_desc
0,'Twas,CD,"numeral, cardinal"
1,brillig,NN,"noun, common, singular or mass"
2,",",",",comma
3,and,CC,"conjunction, coordinating"
4,the,DT,determiner
5,slithy,JJ,"adjective or numeral, ordinal"
6,toves,NNS,"noun, common, plural"
7,did,VBD,"verb, past tense"
8,gyre,NN,"noun, common, singular or mass"
9,and,CC,"conjunction, coordinating"


## Lemmatizing

In [24]:
# Create the WordNet-based lemmatizer reused by the following cells.
lemmatizer = WordNetLemmatizer()
# Unlike a stem, the lemma is a real word: 'scarves' -> 'scarf'.
lemmatizer.lemmatize("scarves")

'scarf'

In [25]:
# Sample sentence for lemmatizing a whole token list.
string_for_lemmatizing = "The friends of DeSoto love scarves."
# NOTE: this rebinds `words`, which earlier held the stemming tokens.
words = word_tokenize(string_for_lemmatizing)
print( words )

['The', 'friends', 'of', 'DeSoto', 'love', 'scarves', '.']


In [26]:
# Lemmatize every token with the lemmatizer's default part of speech.
# Casing is preserved ('The', 'DeSoto'), unlike the stemmer output.
lemmatized_words = list(map(lemmatizer.lemmatize, words))
print(lemmatized_words)

['The', 'friend', 'of', 'DeSoto', 'love', 'scarf', '.']


In [27]:
# With no part-of-speech argument, 'worst' comes back unchanged.
lemmatizer.lemmatize("worst")

'worst'

In [28]:
# Passing the adjective part of speech ("a") lets the lemmatizer resolve
# 'worst' to its lemma 'bad'.
lemmatizer.lemmatize("worst", pos="a")

'bad'

## Chunking: Identify phrases

In [43]:
# Sample sentence for the chunking and chinking examples.
lotr_quote = "It's a dangerous business, Frodo, going out your door."
# Tokenize into words and punctuation marks.
words_in_lotr_quote = word_tokenize(lotr_quote)
print(words_in_lotr_quote)

['It', "'s", 'a', 'dangerous', 'business', ',', 'Frodo', ',', 'going', 'out', 'your', 'door', '.']


In [44]:
# Chunk grammars match on POS tags, so tag the tokens first.
lotr_pos_tags = nltk.pos_tag(words_in_lotr_quote)
print(lotr_pos_tags)

[('It', 'PRP'), ("'s", 'VBZ'), ('a', 'DT'), ('dangerous', 'JJ'), ('business', 'NN'), (',', ','), ('Frodo', 'NNP'), (',', ','), ('going', 'VBG'), ('out', 'RP'), ('your', 'PRP$'), ('door', 'NN'), ('.', '.')]


In [45]:
# Chunk rule for a noun phrase (NP): an optional determiner (<DT>?),
# any number of adjectives (<JJ>*), then one singular noun (<NN>).
grammar = "NP: {<DT>?<JJ>*<NN>}"
print(grammar)

NP: {<DT>?<JJ>*<NN>}


In [46]:
# Build a chunk parser from the tag-pattern grammar.
chunk_parser = nltk.RegexpParser(grammar)

In [47]:
# Parse the tagged tokens; spans matching the NP rule become 'NP' subtrees
# under the root 'S' node, and all other tokens stay as plain (word, tag) leaves.
tree = chunk_parser.parse(lotr_pos_tags)
pp(tree)

Tree('S', [('It', 'PRP'), ("'s", 'VBZ'), Tree('NP', [('a', 'DT'), ('dangerous', 'JJ'), ('business', 'NN')]), (',', ','), ('Frodo', 'NNP'), (',', ','), ('going', 'VBG'), ('out', 'RP'), ('your', 'PRP$'), Tree('NP', [('door', 'NN')]), ('.', '.')])


## Chinking: Patterns to exclude from a chunk

In [53]:
# Show the tagged tokens again as the input for the chinking example.
print(lotr_pos_tags)

[('It', 'PRP'), ("'s", 'VBZ'), ('a', 'DT'), ('dangerous', 'JJ'), ('business', 'NN'), (',', ','), ('Frodo', 'NNP'), (',', ','), ('going', 'VBG'), ('out', 'RP'), ('your', 'PRP$'), ('door', 'NN'), ('.', '.')]


In [54]:
# Two-step grammar: `{<.*>+}` first chunks every token into one 'Chunk',
# then `}<JJ>{` chinks (removes) adjectives from it, splitting the chunk
# wherever an adjective occurs.
grammar = """
Chunk: {<.*>+}
       }<JJ>{"""

In [55]:
# Rebuild the parser with the chinking grammar (rebinds `chunk_parser` and `tree`).
chunk_parser = nltk.RegexpParser(grammar)
# The adjective ('dangerous', JJ) ends up outside the chunks, leaving one
# 'Chunk' subtree on each side of it.
tree = chunk_parser.parse(lotr_pos_tags)
pp(tree)

Tree('S', [Tree('Chunk', [('It', 'PRP'), ("'s", 'VBZ'), ('a', 'DT')]), ('dangerous', 'JJ'), Tree('Chunk', [('business', 'NN'), (',', ','), ('Frodo', 'NNP'), (',', ','), ('going', 'VBG'), ('out', 'RP'), ('your', 'PRP$'), ('door', 'NN'), ('.', '.')])])
