In [1]:
# Import nltk
import nltk

In [2]:
# Define a feature extraction function for each name
def gender_features(word):
    """Return the feature dict used to classify a first name by gender.

    word: the name as a string.

    Returns a dict with the single key 'last_letter', holding the final
    character of ``word``. A slice is used instead of an index so that an
    empty string yields '' rather than raising IndexError.
    """
    return {'last_letter': word[-1:]}

In [3]:
# Try out the feature extraction function for a name
print(gender_features('Shrek'))

{'last_letter': 'k'}


In [4]:
# Resource for male and female first names
from nltk.corpus import names
print(names.words('male.txt')[:20])
print(names.words('female.txt')[:20])

['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot', 'Abbott', 'Abby', 'Abdel', 'Abdul', 'Abdulkarim', 'Abdullah', 'Abe', 'Abel', 'Abelard', 'Abner', 'Abraham', 'Abram', 'Ace', 'Adair', 'Adam']
['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', 'Abby', 'Abigael', 'Abigail', 'Abigale', 'Abra', 'Acacia', 'Ada', 'Adah', 'Adaline', 'Adara', 'Addie', 'Addis', 'Adel', 'Adela']


In [5]:
# make list of male and female names paired with gender
namesgender = ([(name, 'male') for name in names.words('male.txt')] +
               [(name, 'female') for name in names.words('female.txt')])
print(len(namesgender))
# preview the first and the last 20 pairs; the negative slice avoids
# hard-coding an offset that is only right for one particular corpus size
print(namesgender[:20])
print(namesgender[-20:])

7944
[('Aamir', 'male'), ('Aaron', 'male'), ('Abbey', 'male'), ('Abbie', 'male'), ('Abbot', 'male'), ('Abbott', 'male'), ('Abby', 'male'), ('Abdel', 'male'), ('Abdul', 'male'), ('Abdulkarim', 'male'), ('Abdullah', 'male'), ('Abe', 'male'), ('Abel', 'male'), ('Abelard', 'male'), ('Abner', 'male'), ('Abraham', 'male'), ('Abram', 'male'), ('Ace', 'male'), ('Adair', 'male'), ('Adam', 'male')]
[('Zena', 'female'), ('Zenia', 'female'), ('Zia', 'female'), ('Zilvia', 'female'), ('Zita', 'female'), ('Zitella', 'female'), ('Zoe', 'female'), ('Zola', 'female'), ('Zonda', 'female'), ('Zondra', 'female'), ('Zonnya', 'female'), ('Zora', 'female'), ('Zorah', 'female'), ('Zorana', 'female'), ('Zorina', 'female'), ('Zorine', 'female'), ('Zsa Zsa', 'female'), ('Zsazsa', 'female'), ('Zulema', 'female'), ('Zuzana', 'female')]


In [6]:
# put the list into random order
# the generator is seeded first so that a fresh Restart & Run All repeats the
# same shuffle, and with it the same train/test split taken from this list
import random
random.seed(42)
random.shuffle(namesgender)
print(namesgender[:20])

[('Phillipe', 'male'), ('Tibold', 'male'), ('Silvano', 'male'), ('Ronald', 'male'), ('Torr', 'male'), ('Linoel', 'male'), ('Brook', 'male'), ('Brodie', 'male'), ('Valma', 'female'), ('Vikki', 'female'), ('Dawn', 'female'), ('Marcille', 'female'), ('Kaleb', 'male'), ('Mag', 'female'), ('Godart', 'male'), ('Dixie', 'female'), ('Bidget', 'female'), ('Noam', 'male'), ('Ivie', 'female'), ('Meris', 'female')]


In [7]:
# separate the names into training and test
train_names = namesgender[500:]
test_names = namesgender[:500]

In [8]:
# use our features to train a classifier and test on the development test set
train_set = [(gender_features(n), g) for (n, g) in train_names]
test_set = [(gender_features(n), g) for (n, g) in test_names]
print(train_set[:20])

[({'last_letter': 'a'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'e'}, 'male'), ({'last_letter': 'i'}, 'male'), ({'last_letter': 'l'}, 'male'), ({'last_letter': 'n'}, 'male'), ({'last_letter': 'b'}, 'male'), ({'last_letter': 'n'}, 'male'), ({'last_letter': 'n'}, 'male'), ({'last_letter': 'e'}, 'male'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'y'}, 'male'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'y'}, 'male'), ({'last_letter': 'm'}, 'male'), ({'last_letter': 'y'}, 'male'), ({'last_letter': 'e'}, 'male')]


In [9]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [10]:
# classify new instances
print(classifier.classify(gender_features('Neo')))
print(classifier.classify(gender_features('Trinity')))

male
female


In [11]:
# classify accuracy function runs the classifier on the test set and reports
#   comparisons between predicted labels and actual/gold labels
print(nltk.classify.accuracy(classifier, test_set))

0.73


In [12]:
# this function is available for naive bayes classifiers
# it prints the table itself and returns None, so it is called directly;
# wrapping the call in print() would add a stray "None" line to the output
classifier.show_most_informative_features(20)

Most Informative Features
             last_letter = 'a'            female : male   =     34.3 : 1.0
             last_letter = 'k'              male : female =     31.1 : 1.0
             last_letter = 'f'              male : female =     15.4 : 1.0
             last_letter = 'p'              male : female =     12.7 : 1.0
             last_letter = 'm'              male : female =     10.7 : 1.0
             last_letter = 'd'              male : female =     10.1 : 1.0
             last_letter = 'v'              male : female =      9.9 : 1.0
             last_letter = 'o'              male : female =      8.7 : 1.0
             last_letter = 'r'              male : female =      7.2 : 1.0
             last_letter = 'g'              male : female =      5.7 : 1.0
             last_letter = 'w'              male : female =      5.4 : 1.0
             last_letter = 'b'              male : female =      4.4 : 1.0
             last_letter = 's'              male : female =      4.4 : 1.0

In [13]:
# define a function that will compare the classifier labels with the gold standard labels
def geterrors(test, model=None, feature_func=None):
    """Collect the items of ``test`` that the classifier labels incorrectly.

    test: iterable of (name, gold_label) pairs.
    model: object with a ``classify(featureset)`` method. When omitted, the
        notebook-level ``classifier`` is used, as bound at call time.
    feature_func: callable turning a name into a feature dict. When omitted,
        ``gender_features`` is used.

    Returns a list of (gold_label, guessed_label, name) tuples, one per
    misclassified item, in the order the items appear in ``test``.
    """
    # ``classifier`` is rebound by later cells of this notebook, so callers
    # that run this after those cells should pass the intended model explicitly
    if model is None:
        model = classifier
    if feature_func is None:
        feature_func = gender_features
    errors = []
    for (name, tag) in test:
        guess = model.classify(feature_func(name))
        if guess != tag:
            errors.append((tag, guess, name))
    return errors

errors = geterrors(test_names)
print(len(errors))

135


In [14]:
# define a function to print the errors
def printerrors(errors):
    """Print one aligned line per misclassified item, in sorted order.

    errors: iterable of (gold_label, guessed_label, name) string tuples.
    """
    row_layout = 'correct={:<8s} guess={:<8s} name={:<30s}'
    ordered = list(errors)
    ordered.sort()
    for gold, predicted, who in ordered:
        print(row_layout.format(gold, predicted, who))

printerrors(errors)

correct=female   guess=male     name=Abagael                       
correct=female   guess=male     name=Aleen                         
correct=female   guess=male     name=Anne-Mar                      
correct=female   guess=male     name=Bess                          
correct=female   guess=male     name=Bidget                        
correct=female   guess=male     name=Bill                          
correct=female   guess=male     name=Brett                         
correct=female   guess=male     name=Cal                           
correct=female   guess=male     name=Camel                         
correct=female   guess=male     name=Carilyn                       
correct=female   guess=male     name=Carlen                        
correct=female   guess=male     name=Carrol                        
correct=female   guess=male     name=Cass                          
correct=female   guess=male     name=Charil                        
correct=female   guess=male     name=Charlott   

## Part of Speech Classifier

In [15]:
## classify part of speech based on sentence context
from nltk.corpus import brown

# define features for the "i"th word in the sentence, including three types of suffix 
#     and one pre-word
# the pos features function takes the sentence of untagged words and the index of a word i
#   it creates features for word i, including the previous word i-1
def pos_features(sentence, i):
    """Build the feature dict for the word at index ``i`` of ``sentence``.

    sentence: sequence of untagged word strings.
    i: index of the word to describe.

    The features are the last one, two and three characters of the word,
    plus the word that precedes it ("<START>" when ``i`` is 0).
    """
    token = sentence[i]
    previous = "<START>" if i == 0 else sentence[i-1]
    return {
        "suffix(1)": token[-1:],
        "suffix(2)": token[-2:],
        "suffix(3)": token[-3:],
        "prev-word": previous,
    }

In [16]:
# look at features of a specific word in a specific sentence
# first sentence of brown corpus
sentence0 = brown.sents()[0]
print(sentence0)
# word 8 of sentence 0
print(sentence0[8])

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
investigation


In [17]:
# pos features of the word 8 
print(pos_features(sentence0, 8))

{'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}


In [18]:
# get the POS tagged sentences with categories of news
tagged_sents = brown.tagged_sents(categories='news')
tag_sent0 = tagged_sents[0]
tag_sent0

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN'),
 ("Atlanta's", 'NP$'),
 ('recent', 'JJ'),
 ('primary', 'NN'),
 ('election', 'NN'),
 ('produced', 'VBD'),
 ('``', '``'),
 ('no', 'AT'),
 ('evidence', 'NN'),
 ("''", "''"),
 ('that', 'CS'),
 ('any', 'DTI'),
 ('irregularities', 'NNS'),
 ('took', 'VBD'),
 ('place', 'NN'),
 ('.', '.')]

In [19]:
# the function nltk.tag.untag will take the tags off
nltk.tag.untag(tag_sent0)

['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of',
 "Atlanta's",
 'recent',
 'primary',
 'election',
 'produced',
 '``',
 'no',
 'evidence',
 "''",
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 '.']

In [20]:
# the python enumerate function generates an index number for each item in a list
for i,(word,tag) in enumerate(tag_sent0):
    print (i, word, tag)

0 The AT
1 Fulton NP-TL
2 County NN-TL
3 Grand JJ-TL
4 Jury NN-TL
5 said VBD
6 Friday NR
7 an AT
8 investigation NN
9 of IN
10 Atlanta's NP$
11 recent JJ
12 primary NN
13 election NN
14 produced VBD
15 `` ``
16 no AT
17 evidence NN
18 '' ''
19 that CS
20 any DTI
21 irregularities NNS
22 took VBD
23 place NN
24 . .


In [21]:
# get feature sets of words appearing in the corpus, from untagged sentences.
# and then get their tags from corresponding tagged sentence
# use the Python function enumerate to pair the index numbers with sentence words 
#   for the pos features function
# one (features, tag) pair per word: features come from the untagged sentence,
# the label is the tag the word carries in the tagged sentence
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    featuresets.extend(
        (pos_features(untagged_sent, position), label)
        for position, (_, label) in enumerate(tagged_sent)
    )

In [22]:
# look at the feature sets of the first 10 words
for f in featuresets[:10]:
	print (f)

({'suffix(1)': 'e', 'suffix(2)': 'he', 'suffix(3)': 'The', 'prev-word': '<START>'}, 'AT')
({'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ton', 'prev-word': 'The'}, 'NP-TL')
({'suffix(1)': 'y', 'suffix(2)': 'ty', 'suffix(3)': 'nty', 'prev-word': 'Fulton'}, 'NN-TL')
({'suffix(1)': 'd', 'suffix(2)': 'nd', 'suffix(3)': 'and', 'prev-word': 'County'}, 'JJ-TL')
({'suffix(1)': 'y', 'suffix(2)': 'ry', 'suffix(3)': 'ury', 'prev-word': 'Grand'}, 'NN-TL')
({'suffix(1)': 'd', 'suffix(2)': 'id', 'suffix(3)': 'aid', 'prev-word': 'Jury'}, 'VBD')
({'suffix(1)': 'y', 'suffix(2)': 'ay', 'suffix(3)': 'day', 'prev-word': 'said'}, 'NR')
({'suffix(1)': 'n', 'suffix(2)': 'an', 'suffix(3)': 'an', 'prev-word': 'Friday'}, 'AT')
({'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}, 'NN')
({'suffix(1)': 'f', 'suffix(2)': 'of', 'suffix(3)': 'of', 'prev-word': 'investigation'}, 'IN')


In [23]:
# using naive Bayesian as classifier
# split data into a training set and a test set, using a 90%/10% split
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
print(len(train_set))
print(len(test_set))

90499
10055


In [24]:
# train classifier on the training set
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [25]:
# evaluate the accuracy (this will take a little while)
print(nltk.classify.accuracy(classifier, test_set))
# the result is reasonable for features without the previous tag

0.7891596220785678


## Document Classification with keywords

In [26]:
### classify documents based on keywords
from nltk.corpus import movie_reviews
import random

In [27]:
# movie reviews are labeled either positive or negative (by human annotators)
print(movie_reviews.categories())

['neg', 'pos']


In [28]:
# for each document in movie_reviews, get its words and category (positive/negative)
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
print(len(documents))

2000


In [29]:
# seed the generator so that a fresh Restart & Run All repeats the same
# shuffle, and with it the same train/test split taken from this list later
random.seed(42)
random.shuffle(documents)
# look at the first document - consists of a list of all the words in the review
# followed by the category
print(documents[0])

(['of', 'all', 'the', 'films', 'i', "'", 've', 'come', 'to', 'see', 'this', 'year', '(', '1997', ')', ',', 'i', 'think', 'only', 'chasing', 'amy', 'had', 'as', 'many', '"', 'laugh', '-', 'out', '-', 'loud', '"', 'moments', '.', 'i', "'", 'm', 'not', 'saying', 'that', 'i', 'was', 'hysterically', 'rolling', 'on', 'the', 'floor', 'while', 'watching', 'waiting', 'for', 'guffman', ',', 'but', 'my', 'mouth', 'burst', 'open', 'with', 'at', 'least', 'a', 'delighted', 'guffaw', 'at', 'least', 'every', 'couple', 'of', 'minutes', '.', 'and', 'i', 'think', 'i', "'", 'm', 'the', 'kind', 'of', 'person', 'that', 'laughs', 'least', 'at', 'movies', ',', 'even', 'when', 'i', 'find', 'them', 'funny', '.', 'usually', 'all', 'you', 'can', 'get', 'out', 'of', 'me', 'is', 'a', 'warm', 'smile', ',', 'so', 'to', 'get', 'me', 'laughing', 'out', 'loud', ',', 'and', 'so', 'frequently', ',', 'is', 'a', 'really', 'good', 'sign', '.', 'waiting', 'for', 'guffman', 'is', 'the', 'story', 'of', 'blaine', ',', 'missouri'

In [30]:
## use words from all documents to define the word vector for features
# get all words from all movie_reviews and put into a frequency distribution
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
print(len(all_words))

39768


In [33]:
# get the 2000 most frequently appearing keywords in the corpus
word_items = all_words.most_common(2000)
word_features = [word for (word, freq) in word_items]   # just the words

In [34]:
# look at the first 100 words
print(word_features[:100])

[',', 'the', '.', 'a', 'and', 'of', 'to', "'", 'is', 'in', 's', '"', 'it', 'that', '-', ')', '(', 'as', 'with', 'for', 'his', 'this', 'film', 'i', 'he', 'but', 'on', 'are', 't', 'by', 'be', 'one', 'movie', 'an', 'who', 'not', 'you', 'from', 'at', 'was', 'have', 'they', 'has', 'her', 'all', '?', 'there', 'like', 'so', 'out', 'about', 'up', 'more', 'what', 'when', 'which', 'or', 'she', 'their', ':', 'some', 'just', 'can', 'if', 'we', 'him', 'into', 'even', 'only', 'than', 'no', 'good', 'time', 'most', 'its', 'will', 'story', 'would', 'been', 'much', 'character', 'also', 'get', 'other', 'do', 'two', 'well', 'them', 'very', 'characters', ';', 'first', '--', 'after', 'see', '!', 'way', 'because', 'make', 'life']


In [35]:
# define features (keywords) of a document
# each feature is named 'V_<keyword>' and is True or False depending
# on whether that keyword is in the document
def document_features(document, word_features):
    """Map a document to boolean keyword-presence features.

    document: iterable of the words in the document.
    word_features: iterable of the keywords to test for.

    Returns a dict with one 'V_<keyword>' entry per keyword, True when the
    keyword occurs in the document and False otherwise.
    """
    # a set gives constant-time membership checks for every keyword
    vocabulary = set(document)
    return {'V_%s' % keyword: (keyword in vocabulary) for keyword in word_features}

In [36]:
# get features sets for a document, including keyword features and category feature
featuresets = [(document_features(d, word_features), c) for (d,c) in documents]

In [None]:
# the feature sets are 2000 words long - so this is optional
print(featuresets[0])

In [38]:
# training using a Naive Bayes classifier with a 95/5 split
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [39]:
# evaluate the accuracy of the classifier
print (nltk.classify.accuracy(classifier, test_set))
# the accuracy result may vary since we randomized the documents

0.78


In [40]:
# show which features of classifier are most informative
# the method prints the table itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line
classifier.show_most_informative_features(30)

Most Informative Features
                V_seagal = True              neg : pos    =     13.7 : 1.0
           V_outstanding = True              pos : neg    =     11.4 : 1.0
                 V_mulan = True              pos : neg    =      9.0 : 1.0
           V_wonderfully = True              pos : neg    =      6.8 : 1.0
                 V_damon = True              pos : neg    =      6.1 : 1.0
                  V_lame = True              neg : pos    =      6.0 : 1.0
                 V_flynt = True              pos : neg    =      5.7 : 1.0
                 V_awful = True              neg : pos    =      5.6 : 1.0
                   V_era = True              pos : neg    =      5.3 : 1.0
                 V_waste = True              neg : pos    =      5.3 : 1.0
                V_wasted = True              neg : pos    =      5.1 : 1.0
                V_poorly = True              neg : pos    =      5.0 : 1.0
            V_ridiculous = True              neg : pos    =      5.0 : 1.0

## Exercise 7.6.6

In [41]:
# get the top 5000  most frequently appearing keywords in the corpus
word_items = all_words.most_common(5000)
word_features = [word for (word, freq) in word_items]   # just the words

In [42]:
# define features (keywords) of a document
# each feature is named 'V_<keyword>' and is True or False depending
# on whether that keyword is in the document
def document_features(document, word_features):
    """Return {'V_<keyword>': bool} telling which keywords occur in a document.

    document: iterable of the words in the document.
    word_features: iterable of the keywords to test for.

    NOTE(review): this re-declares the function already defined earlier in
    the notebook with the same behaviour; only the keyword list passed in
    differs between the two experiments.
    """
    present = frozenset(document)
    return dict(('V_%s' % term, term in present) for term in word_features)

In [43]:
# get features sets for a document, including keyword features and category feature
featuresets = [(document_features(d, word_features), c) for (d,c) in documents]

In [44]:
# training using a Naive Bayes classifier with a 95/5 split
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [45]:
# evaluate the accuracy of the classifier
print (nltk.classify.accuracy(classifier, test_set))
# the accuracy result may vary since we randomized the documents

0.82
