In [1]:
# Import libraries

import pandas as pd
import numpy as np
import nltk

In [2]:
# Retrieve the data

from nltk.corpus import treebank


In [6]:
# Store the tagged words

treebank_tagged_words = treebank.tagged_words()
print(len(treebank_tagged_words))
print(treebank_tagged_words[0:4])

100676
[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD')]


In [7]:
# Get classes of tags

tag_classes_fd = nltk.FreqDist(tag[0] for (word, tag) in treebank_tagged_words)
print(len(tag_classes_fd))
print(tag_classes_fd.keys())

24
dict_keys(['N', ',', 'C', 'J', 'M', 'V', 'D', 'I', '.', '-', 'R', 'T', 'P', 'W', '`', 'E', "'", ':', '$', 'F', 'U', 'S', 'L', '#'])


In [8]:
# Distribution of tags

for tag, freq in tag_classes_fd.most_common():
    print(tag, freq)

N 28867
V 12637
I 9857
D 8165
- 6838
J 6397
C 5811
, 4886
. 3874
P 3333
R 3209
T 2179
M 927
W 878
$ 724
` 712
' 694
: 563
E 88
# 16
L 13
F 4
U 3
S 1


In [12]:
# Store the tagged sents

treebank_tagged = treebank.tagged_sents()
print(len(treebank_tagged_words))
print(treebank_tagged_words[0:4])

100676
[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD')]


In [16]:
# Get the treebank tokens

treebank_tokens = treebank.words()
print(treebank_tokens[:20])

['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.', 'Mr.', 'Vinken']


In [14]:
# Seperate test and train

size = int(len(treebank_tagged) * 0.9)
treebank_train = treebank_tagged[:size]
treebank_test = treebank_tagged[size:]
print(size)

3522


In [17]:
# Start off with a default tagger

t0 = nltk.DefaultTagger("NN")
print(t0.tag(treebank_tokens[:50]))

[('Pierre', 'NN'), ('Vinken', 'NN'), (',', 'NN'), ('61', 'NN'), ('years', 'NN'), ('old', 'NN'), (',', 'NN'), ('will', 'NN'), ('join', 'NN'), ('the', 'NN'), ('board', 'NN'), ('as', 'NN'), ('a', 'NN'), ('nonexecutive', 'NN'), ('director', 'NN'), ('Nov.', 'NN'), ('29', 'NN'), ('.', 'NN'), ('Mr.', 'NN'), ('Vinken', 'NN'), ('is', 'NN'), ('chairman', 'NN'), ('of', 'NN'), ('Elsevier', 'NN'), ('N.V.', 'NN'), (',', 'NN'), ('the', 'NN'), ('Dutch', 'NN'), ('publishing', 'NN'), ('group', 'NN'), ('.', 'NN'), ('Rudolph', 'NN'), ('Agnew', 'NN'), (',', 'NN'), ('55', 'NN'), ('years', 'NN'), ('old', 'NN'), ('and', 'NN'), ('former', 'NN'), ('chairman', 'NN'), ('of', 'NN'), ('Consolidated', 'NN'), ('Gold', 'NN'), ('Fields', 'NN'), ('PLC', 'NN'), (',', 'NN'), ('was', 'NN'), ('named', 'NN'), ('*-1', 'NN'), ('a', 'NN')]


In [18]:
# Run evaluation on the test set

t0.evaluate(treebank_test)

0.14697201017811704

In [20]:
# Train a unigram tagger

t1 = nltk.UnigramTagger(treebank_tagged)
print(t1.tag(treebank_tokens[:50]))

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.'), ('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'JJ'), ('publishing', 'NN'), ('group', 'NN'), ('.', '.'), ('Rudolph', 'NNP'), ('Agnew', 'NNP'), (',', ','), ('55', 'CD'), ('years', 'NNS'), ('old', 'JJ'), ('and', 'CC'), ('former', 'JJ'), ('chairman', 'NN'), ('of', 'IN'), ('Consolidated', 'NNP'), ('Gold', 'NNP'), ('Fields', 'NNP'), ('PLC', 'NNP'), (',', ','), ('was', 'VBD'), ('named', 'VBN'), ('*-1', '-NONE-'), ('a', 'DT')]


In [21]:
# Evaluate the unigram tagger

print(t1.evaluate(treebank_test))

0.96


In [22]:
# Bigram tagger with backoffs

t0 = nltk.DefaultTagger("NN")
t1 = nltk.UnigramTagger(treebank_train, backoff = t0)
t2 = nltk.BigramTagger(treebank_train, backoff = t1)

In [23]:
# Evaluate on the test data

print(t2.evaluate(treebank_test))

0.8905852417302799


In [34]:
# Use the bigram tagger on some new text

text = ''' Three Calgarians have found a rather unusual way of leaving snow and ice behind. They set off this week on foot and by camels on a grueling trek across the burning Arabian desert.
'''

In [35]:
# Split into sentences

textsplit = nltk.sent_tokenize(text)
print(textsplit)

[' Three Calgarians have found a rather unusual way of leaving snow and ice behind.', 'They set off this week on foot and by camels on a grueling trek across the burning Arabian desert.']


In [36]:
# Apply the word tokenizer to each sentence

tokentext = [nltk.word_tokenize(sent) for sent in textsplit]
print(tokentext)

[['Three', 'Calgarians', 'have', 'found', 'a', 'rather', 'unusual', 'way', 'of', 'leaving', 'snow', 'and', 'ice', 'behind', '.'], ['They', 'set', 'off', 'this', 'week', 'on', 'foot', 'and', 'by', 'camels', 'on', 'a', 'grueling', 'trek', 'across', 'the', 'burning', 'Arabian', 'desert', '.']]


In [38]:
# Apply the bigram tagger to tag each sentence tokens

taggedtext = [t2.tag(tokens) for tokens in tokentext]
print(taggedtext)

[[('Three', 'CD'), ('Calgarians', 'NN'), ('have', 'VBP'), ('found', 'VBN'), ('a', 'DT'), ('rather', 'RB'), ('unusual', 'JJ'), ('way', 'NN'), ('of', 'IN'), ('leaving', 'VBG'), ('snow', 'NN'), ('and', 'CC'), ('ice', 'NN'), ('behind', 'IN'), ('.', '.')], [('They', 'PRP'), ('set', 'VBN'), ('off', 'RP'), ('this', 'DT'), ('week', 'NN'), ('on', 'IN'), ('foot', 'NN'), ('and', 'CC'), ('by', 'IN'), ('camels', 'NN'), ('on', 'IN'), ('a', 'DT'), ('grueling', 'NN'), ('trek', 'NN'), ('across', 'IN'), ('the', 'DT'), ('burning', 'NN'), ('Arabian', 'NN'), ('desert', 'NN'), ('.', '.')]]


In [39]:
# Use the stanford POS tagger to tag each sentence tokens

taggedtextStanford = [nltk.pos_tag(tokens) for tokens in tokentext]
print(taggedtextStanford)

[[('Three', 'CD'), ('Calgarians', 'NNPS'), ('have', 'VBP'), ('found', 'VBN'), ('a', 'DT'), ('rather', 'RB'), ('unusual', 'JJ'), ('way', 'NN'), ('of', 'IN'), ('leaving', 'VBG'), ('snow', 'NN'), ('and', 'CC'), ('ice', 'NN'), ('behind', 'NN'), ('.', '.')], [('They', 'PRP'), ('set', 'VBD'), ('off', 'RP'), ('this', 'DT'), ('week', 'NN'), ('on', 'IN'), ('foot', 'NN'), ('and', 'CC'), ('by', 'IN'), ('camels', 'NNS'), ('on', 'IN'), ('a', 'DT'), ('grueling', 'NN'), ('trek', 'NN'), ('across', 'IN'), ('the', 'DT'), ('burning', 'NN'), ('Arabian', 'JJ'), ('desert', 'NN'), ('.', '.')]]


In [41]:
# Flatten the list of tagged tokens from a nested list to a list

taggedtextflat = [pair for sent in taggedtext for pair in sent]
print(taggedtextflat)

[('Three', 'CD'), ('Calgarians', 'NN'), ('have', 'VBP'), ('found', 'VBN'), ('a', 'DT'), ('rather', 'RB'), ('unusual', 'JJ'), ('way', 'NN'), ('of', 'IN'), ('leaving', 'VBG'), ('snow', 'NN'), ('and', 'CC'), ('ice', 'NN'), ('behind', 'IN'), ('.', '.'), ('They', 'PRP'), ('set', 'VBN'), ('off', 'RP'), ('this', 'DT'), ('week', 'NN'), ('on', 'IN'), ('foot', 'NN'), ('and', 'CC'), ('by', 'IN'), ('camels', 'NN'), ('on', 'IN'), ('a', 'DT'), ('grueling', 'NN'), ('trek', 'NN'), ('across', 'IN'), ('the', 'DT'), ('burning', 'NN'), ('Arabian', 'NN'), ('desert', 'NN'), ('.', '.')]


<p>&nbsp;</p>

### **4.6.6 Running a POS Tagger on a Large Text and Looking at Tag Frequencies**

<p>&nbsp;</p>

In [42]:
# Choose the asuten-emma text from gutenberg corpus

filename = nltk.corpus.gutenberg.fileids()[0]
emmatext = nltk.corpus.gutenberg.raw(filename)

# Tokenize and POS tag the text with Stanford tagger

emmatokens = nltk.word_tokenize(emmatext)
emmatagged = nltk.pos_tag(emmatokens)

In [46]:
# Print the distribution of the POS tags

tagclasses = nltk.FreqDist(tag[0] for (word, tag) in emmatagged)
for tag, freq in tagclasses.most_common():
    print(tag, freq)

N 32045
V 31349
P 21611
I 17899
R 13926
D 12643
, 12016
J 11289
. 8039
C 7735
: 5627
T 5181
M 4410
' 2557
W 2556
` 1847
E 456
U 362
( 107
) 107
F 22
$ 1


In [55]:
# Split the text into sentences

textsplit = nltk.sent_tokenize(emmatext)
print(textsplit[0], "\n")
print(textsplit[1], "\n")

[Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her. 

She was the youngest of the two daughters of a most affectionate,
indulgent father; and had, in consequence of her sister's marriage,
been mistress of his house from a very early period. 



In [58]:
# Tokenize the list of sentences

tokentext = [nltk.word_tokenize(sent) for sent in textsplit]
print(tokentext[0], "\n")
print(tokentext[1], "\n")

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'I', 'CHAPTER', 'I', 'Emma', 'Woodhouse', ',', 'handsome', ',', 'clever', ',', 'and', 'rich', ',', 'with', 'a', 'comfortable', 'home', 'and', 'happy', 'disposition', ',', 'seemed', 'to', 'unite', 'some', 'of', 'the', 'best', 'blessings', 'of', 'existence', ';', 'and', 'had', 'lived', 'nearly', 'twenty-one', 'years', 'in', 'the', 'world', 'with', 'very', 'little', 'to', 'distress', 'or', 'vex', 'her', '.'] 

['She', 'was', 'the', 'youngest', 'of', 'the', 'two', 'daughters', 'of', 'a', 'most', 'affectionate', ',', 'indulgent', 'father', ';', 'and', 'had', ',', 'in', 'consequence', 'of', 'her', 'sister', "'s", 'marriage', ',', 'been', 'mistress', 'of', 'his', 'house', 'from', 'a', 'very', 'early', 'period', '.'] 



In [60]:
# Run the tagger on the list of sentences

taggedtext = [nltk.pos_tag(tokens) for tokens in tokentext]
print(taggedtext[0], "\n")
print(taggedtext[1], "\n")

[('[', 'NNS'), ('Emma', 'NNP'), ('by', 'IN'), ('Jane', 'NNP'), ('Austen', 'NNP'), ('1816', 'CD'), (']', 'NNP'), ('VOLUME', 'NNP'), ('I', 'PRP'), ('CHAPTER', 'VBP'), ('I', 'PRP'), ('Emma', 'NNP'), ('Woodhouse', 'NNP'), (',', ','), ('handsome', 'NN'), (',', ','), ('clever', 'NN'), (',', ','), ('and', 'CC'), ('rich', 'JJ'), (',', ','), ('with', 'IN'), ('a', 'DT'), ('comfortable', 'JJ'), ('home', 'NN'), ('and', 'CC'), ('happy', 'JJ'), ('disposition', 'NN'), (',', ','), ('seemed', 'VBD'), ('to', 'TO'), ('unite', 'VB'), ('some', 'DT'), ('of', 'IN'), ('the', 'DT'), ('best', 'JJS'), ('blessings', 'NNS'), ('of', 'IN'), ('existence', 'NN'), (';', ':'), ('and', 'CC'), ('had', 'VBD'), ('lived', 'VBN'), ('nearly', 'RB'), ('twenty-one', 'CD'), ('years', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('world', 'NN'), ('with', 'IN'), ('very', 'RB'), ('little', 'JJ'), ('to', 'TO'), ('distress', 'VB'), ('or', 'CC'), ('vex', 'VB'), ('her', 'PRP'), ('.', '.')] 

[('She', 'PRP'), ('was', 'VBD'), ('the', 'DT'), ('you

In [61]:
# Flatten the list to get just one list of (word, tag)

taggedtextflat = [pair for sent in taggedtext for pair in sent]
print(taggedtextflat[0], "\n")
print(taggedtextflat[1], "\n")

('[', 'NNS') 

('Emma', 'NNP') 



In [62]:
# Print the distribution of the POS tags

tagclasses = nltk.FreqDist(tag[0] for (word, tag) in taggedtextflat)
for tag, freq in tagclasses.most_common():
    print(tag, freq)

N 31998
V 31297
P 21612
I 17880
R 14008
D 12743
, 12016
J 11227
. 8041
C 7724
: 5627
T 5181
M 4426
W 2570
' 2558
` 1847
E 456
U 344
( 107
) 107
F 17
$ 1
