# Introduction to NLTK


## Part 1 - Analyzing Moby Dick

In [1]:
import nltk

# Corpora, tokenizer models and tagger data used by this notebook, fetched in
# a fixed order with one download call per resource.
for _resource in (
    'gutenberg',
    'genesis',
    'inaugural',
    'nps_chat',
    'webtext',
    'treebank',
    'punkt',
    'punkt_tab',
    'wordnet',
    'averaged_perceptron_tagger_eng',
    'udhr',
    'tagsets_json',
):
    nltk.download(_resource)

# Brings text1..text9 and sent1..sent9 into the notebook namespace.
from nltk.book import *

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package genesis to /root/nltk_data...
[nltk_data]   Unzipping corpora/genesis.zip.
[nltk_data] Downloading package inaugural to /root/nltk_data...
[nltk_data]   Unzipping corpora/inaugural.zip.
[nltk_data] Downloading package nps_chat to /root/nltk_data...
[nltk_data]   Unzipping corpora/nps_chat.zip.
[nltk_data] Downloading package webtext to /root/nltk_data...
[nltk_data]   Unzipping corpora/webtext.zip.
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng t

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [2]:
import pandas as pd
import numpy as np

# Rebuild Moby Dick as a single space-joined string, tokenize that string with
# word_tokenize, and wrap the resulting tokens in a new nltk.Text.
# NOTE(review): text1 is both read and rebound in this cell, so re-running it
# feeds the already re-tokenized text back in; the result may then differ from
# a fresh Restart & Run All - confirm before executing out of order.
moby_raw = ' '.join(text1)
moby_tokens = nltk.word_tokenize(moby_raw)
text1 = nltk.Text(moby_tokens)

### Example 1

How many tokens (words and punctuation symbols) are in text1?

*This function should return an integer.*

In [3]:
def example_one():
    """Return the total number of tokens (words and punctuation) in text1.

    Uses the module-level ``moby_tokens`` list, which is the result of
    ``nltk.word_tokenize(moby_raw)`` that ``text1`` was built from, instead of
    tokenizing the whole novel a second time on every call.

    Returns:
        int: number of tokens.
    """
    return len(moby_tokens)

example_one()


263333

### Example 2

How many unique tokens (unique words and punctuation) does text1 have?

*This function should return an integer.*

In [None]:
def example_two():
    """Return the number of unique tokens (words and punctuation) in text1.

    Uses the module-level ``moby_tokens`` list, which is the result of
    ``nltk.word_tokenize(moby_raw)`` that ``text1`` was built from, instead of
    tokenizing the whole novel again just to deduplicate it.

    Returns:
        int: number of distinct tokens (case-sensitive).
    """
    return len(set(moby_tokens))

example_two()

### Example 3

After lemmatizing the verbs, how many unique tokens does text1 have?

*This function should return an integer.*

In [4]:
from nltk.stem import WordNetLemmatizer

def example_three():
    """Count the distinct tokens of text1 after verb lemmatization.

    Each token is passed through ``WordNetLemmatizer.lemmatize`` with the
    part-of-speech argument ``'v'``; the distinct results are then counted.

    Returns:
        int: number of unique lemmatized tokens.
    """
    wordnet = WordNetLemmatizer()
    verb_lemmas = {wordnet.lemmatize(token, 'v') for token in text1}
    return len(verb_lemmas)

example_three()

15419

### Question 1

What is the lexical diversity of the given text input? (i.e. ratio of unique tokens to the total number of tokens)

*This function should return a float.*

In [None]:
def answer_one():
    """Return the lexical diversity of text1.

    Lexical diversity is the ratio of unique tokens to the total number of
    tokens.

    Returns:
        float: ``unique / total``, or ``0.0`` when text1 contains no tokens.
    """
    unique_tokens = len(set(text1))
    total_tokens = len(text1)
    # Guard the division so an empty text yields 0.0 instead of raising
    # ZeroDivisionError.
    lexical_diversity = unique_tokens / total_tokens if total_tokens else 0.0

    return lexical_diversity

answer_one()

### Question 2

What percentage of tokens is 'whale' or 'Whale'?

*This function should return a float.*

In [5]:
from nltk import FreqDist
def answer_two():
    """Return the percentage of tokens in text1 that are 'whale' or 'Whale'.

    Returns:
        float: combined count of both spellings divided by the total number
        of tokens, multiplied by 100.
    """
    token_counts = FreqDist(text1)
    whale_hits = sum(token_counts[form] for form in ('Whale', 'whale'))
    return (whale_hits / len(text1)) * 100

answer_two()

0.4511398115693817

### Question 3

What are the 20 most frequently occurring (unique) tokens in the text? What is their frequency? (Using FreqDist function)

*This function should return a list of 20 tuples where each tuple is of the form `(token, frequency)`. The list should be sorted in descending order of frequency.*

In [6]:
def answer_three():
    """Return the 20 most frequent tokens of text1 with their counts.

    Returns:
        list: 20 ``(token, frequency)`` tuples in descending order of
        frequency, as produced by ``FreqDist.most_common``.
    """
    return FreqDist(text1).most_common(20)


answer_three()

[(',', 19229),
 ('the', 13721),
 ('.', 7514),
 ('of', 6536),
 ('and', 6024),
 ('a', 4569),
 ('to', 4542),
 (';', 4173),
 ('in', 3916),
 ('that', 2982),
 ("'", 2919),
 ('-', 2555),
 ('his', 2459),
 ('it', 2209),
 ('I', 2124),
 ('!', 1767),
 ('s', 1739),
 ('--', 1713),
 ('is', 1695),
 ('he', 1661)]

### Question 4

Which tokens have a length greater than 5 and a frequency of more than 150?

*This function should return a sorted list of the tokens that match the above constraints. To sort your list, use `sorted()`*

In [None]:
def answer_four():
    """Return the tokens of text1 longer than 5 characters that occur more than 150 times.

    Returns:
        list: matching tokens, sorted in ascending order.
    """
    token_counts = FreqDist(text1)
    return sorted(
        token
        for token, count in token_counts.items()
        if count > 150 and len(token) > 5
    )

answer_four()

### Question 5

Find the longest word in text1 and that word's length.

*This function should return a tuple `(longest_word, length)`.*

In [7]:
def answer_five():
    """Find the longest token in text1 and its length.

    A single ``max`` pass replaces sorting the entire token list. The tokens
    are scanned in reverse because ``max`` keeps the first maximal element it
    meets; this reproduces the earlier "stable sort, take the last element"
    result, so ties are still resolved in favour of the token that occurs
    last in the text.

    Returns:
        tuple: ``(longest_word, length)``.

    Raises:
        ValueError: if text1 contains no tokens (the sort-based version
            raised IndexError in that case).
    """
    tokens = list(text1)
    longest_word = max(reversed(tokens), key=len)
    return (longest_word, len(longest_word))

answer_five()

('uninterpenetratingly', 20)

### Question 6

What unique words have a frequency of more than 2000? What is their frequency?

"Hint:  you may want to use `isalpha()` to check if the token is a word and not punctuation."

*This function should return a list of tuples of the form `(frequency, word)` sorted in descending order of frequency.*

In [8]:
def answer_six():
    """Return the alphabetic tokens of text1 that occur more than 2000 times.

    Tokens are filtered with ``str.isalpha`` so punctuation is excluded.

    Returns:
        list: ``(frequency, word)`` tuples sorted in descending order.
    """
    token_counts = FreqDist(text1)
    frequent_words = [
        (count, token)
        for token, count in token_counts.items()
        if token.isalpha() and count > 2000
    ]
    frequent_words.sort(reverse=True)

    return frequent_words

answer_six()

[(13721, 'the'),
 (6536, 'of'),
 (6024, 'and'),
 (4569, 'a'),
 (4542, 'to'),
 (3916, 'in'),
 (2982, 'that'),
 (2459, 'his'),
 (2209, 'it'),
 (2124, 'I')]

### Question 7

What is the average number of tokens per sentence?

*This function should return a float.*

In [10]:
import nltk
def answer_seven():
    """Return the average number of tokens per sentence in moby_raw.

    The raw text is split with ``nltk.sent_tokenize``; each sentence is then
    tokenized with ``nltk.word_tokenize`` and the token counts are averaged.

    Returns:
        float: total tokens over all sentences divided by the sentence count.
    """
    sentence_list = nltk.sent_tokenize(moby_raw)
    token_total = sum(len(nltk.word_tokenize(sentence)) for sentence in sentence_list)

    return token_total / len(sentence_list)

answer_seven()

26.314879584290995

### Question 8

What are the 5 most frequent parts of speech in this text? What is their frequency?

*This function should return a list of tuples of the form `(part_of_speech, frequency)` sorted in descending order of frequency.*

In [11]:
from nltk import pos_tag, FreqDist

def answer_eight():
    """Return the 5 most frequent part-of-speech tags in text1.

    The tokens are tagged with ``pos_tag`` and only the tag of each
    ``(word, tag)`` pair is counted.

    Returns:
        list: 5 ``(part_of_speech, frequency)`` tuples in descending order of
        frequency.
    """
    tag_frequencies = FreqDist(tag for _word, tag in pos_tag(text1))
    return tag_frequencies.most_common(5)

answer_eight()

[('NN', 35651), ('IN', 28891), ('DT', 25870), (',', 19229), ('JJ', 17962)]

## Basic NLP Tasks with NLTK

In [12]:
# The names from nltk.book were already star-imported in the first cell.
# Repeating `from nltk.book import *` here would rebind text1 to the book's
# original Text and silently discard the re-tokenized text1 built in the
# second cell, so only the plain module import is kept.
import nltk

### Normalization and stemming

In [13]:
# Normalization: split the phrase on single spaces and lower-case every piece
# so that "List" and "list" become the same token.
input1 = "List listed lists listing listings"
words1 = [piece.lower() for piece in input1.split(' ')]
words1

['list', 'listed', 'lists', 'listing', 'listings']

In [14]:
# Stem every normalized word with the Porter stemmer.
porter = nltk.PorterStemmer()
list(map(porter.stem, words1))

['list', 'list', 'list', 'list', 'list']

### Lemmatization

In [15]:
# Load the words of the 'English-Latin1' file from the udhr corpus and preview
# the first 20 of them.
udhr = nltk.corpus.udhr.words('English-Latin1')
udhr[:20]

['Universal',
 'Declaration',
 'of',
 'Human',
 'Rights',
 'Preamble',
 'Whereas',
 'recognition',
 'of',
 'the',
 'inherent',
 'dignity',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalienable',
 'rights',
 'of']

In [16]:
# This cell applies the Porter *stemmer* (not a lemmatizer) to the same 20
# words: results such as 'univers' and 'digniti' are truncated stems rather
# than dictionary words.
list(map(porter.stem, udhr[:20]))

['univers',
 'declar',
 'of',
 'human',
 'right',
 'preambl',
 'wherea',
 'recognit',
 'of',
 'the',
 'inher',
 'digniti',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalien',
 'right',
 'of']

In [None]:
# Lemmatize the same 20 words with the WordNet lemmatizer, using its default
# part-of-speech setting.
WNlemma = nltk.WordNetLemmatizer()
list(map(WNlemma.lemmatize, udhr[:20]))

### Tokenization

In [None]:
# Naive tokenization: splitting on single spaces leaves punctuation and
# contractions attached to the neighbouring word (e.g. "bed." and "shouldn't").
text11 = "Children shouldn't drink a sugary drink before bed."
text11.split(' ')

In [None]:
# Tokenize the same sentence with NLTK's word tokenizer, for comparison with
# the plain str.split result.
nltk.word_tokenize(text11)

In [None]:
# Sentence splitting: text12 contains periods inside 'U.S.' and '$2.99' that
# do not end a sentence. Split it with NLTK's sentence tokenizer and count the
# sentences it finds.
text12 = "This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!"
sentences = nltk.sent_tokenize(text12)
len(sentences)

In [None]:
# Display the individual sentences returned by sent_tokenize.
sentences

## Advanced NLP Tasks with NLTK

### POS tagging

In [18]:
# Print the Penn Treebank tagset description and examples for the tag 'MD'.
nltk.help.upenn_tagset('MD')

MD: modal auxiliary
    can cannot could couldn't dare may might must need ought shall should
    shouldn't will would


In [19]:
# Tokenize text12 into words, then tag every token with its part of speech.
# text12 is assigned again here so this cell does not depend on the
# tokenization cells above having been run.
text12 = "This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!"
text13 = nltk.word_tokenize(text12)
nltk.pos_tag(text13)

[('This', 'DT'),
 ('is', 'VBZ'),
 ('the', 'DT'),
 ('first', 'JJ'),
 ('sentence', 'NN'),
 ('.', '.'),
 ('A', 'DT'),
 ('gallon', 'NN'),
 ('of', 'IN'),
 ('milk', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('U.S.', 'NNP'),
 ('costs', 'VBZ'),
 ('$', '$'),
 ('2.99', 'CD'),
 ('.', '.'),
 ('Is', 'VBZ'),
 ('this', 'DT'),
 ('the', 'DT'),
 ('third', 'JJ'),
 ('sentence', 'NN'),
 ('?', '.'),
 ('Yes', 'UH'),
 (',', ','),
 ('it', 'PRP'),
 ('is', 'VBZ'),
 ('!', '.')]

In [20]:
# Parsing sentence structure
# Tokenize the sentence, define a small context-free grammar whose terminals
# are exactly the three words of that sentence, and print every parse tree the
# chart parser returns for it.
text15 = nltk.word_tokenize("Alice loves Bob")
grammar = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP
NP -> 'Alice' | 'Bob'
V -> 'loves'
""")

parser = nltk.ChartParser(grammar)
trees = parser.parse_all(text15)
for tree in trees:
    print(tree)

(S (NP Alice) (VP (V loves) (NP Bob)))


In [21]:
from nltk.corpus import treebank
# Take the first parsed sentence of the treebank file 'wsj_0001.mrg' and print
# its bracketed parse tree.
text16 = treebank.parsed_sents('wsj_0001.mrg')[0]
print(text16)

(S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .))


### POS tagging and parsing ambiguity

In [22]:
# Garden-path sentence: in the intended reading 'man' is the verb ("the old
# [people] man the boat"). Compare that reading with the tags the tagger
# assigns to 'old' and 'man'.
text18 = nltk.word_tokenize("The old man the boat")
nltk.pos_tag(text18)

[('The', 'DT'), ('old', 'JJ'), ('man', 'NN'), ('the', 'DT'), ('boat', 'NN')]