In [1]:
# Lab Week 7 - Classification tasks with different types of features
# This file has small examples that are meant to be run individually
#   in the Python shell

import nltk

# define a feature extraction function for each name
def gender_features(word):
    """Build the feature dict used to guess the gender of a first name.

    The only feature is ``'last_letter'``: the final character of ``word``.

    Slicing (``word[-1:]``) is used rather than indexing (``word[-1]``) so an
    empty string produces an empty feature value instead of raising
    ``IndexError``; for any non-empty name the result is unchanged.
    """
    return {'last_letter': word[-1:]}
print(gender_features('Shrek'))




{'last_letter': 'k'}


In [2]:
# resource for male and female first names
from nltk.corpus import names
print(names.words('male.txt')[:20])
print(names.words('female.txt')[:20])

['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot', 'Abbott', 'Abby', 'Abdel', 'Abdul', 'Abdulkarim', 'Abdullah', 'Abe', 'Abel', 'Abelard', 'Abner', 'Abraham', 'Abram', 'Ace', 'Adair', 'Adam']
['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', 'Abby', 'Abigael', 'Abigail', 'Abigale', 'Abra', 'Acacia', 'Ada', 'Adah', 'Adaline', 'Adara', 'Addie', 'Addis', 'Adel', 'Adela']


In [3]:
# make list of male and female names paired with gender
# (all male names first, followed by all female names)
namesgender = ([(name, 'male') for name in names.words('male.txt')] +
          [(name, 'female') for name in names.words('female.txt')])
print(len(namesgender))
# peek at the first 20 entries ...
print(namesgender[:20])
# ... and the last 20; a negative slice is used instead of the hard-coded
# offset 7924 so this keeps showing the tail whatever the list length is
print(namesgender[-20:])

7944
[('Aamir', 'male'), ('Aaron', 'male'), ('Abbey', 'male'), ('Abbie', 'male'), ('Abbot', 'male'), ('Abbott', 'male'), ('Abby', 'male'), ('Abdel', 'male'), ('Abdul', 'male'), ('Abdulkarim', 'male'), ('Abdullah', 'male'), ('Abe', 'male'), ('Abel', 'male'), ('Abelard', 'male'), ('Abner', 'male'), ('Abraham', 'male'), ('Abram', 'male'), ('Ace', 'male'), ('Adair', 'male'), ('Adam', 'male')]
[('Zena', 'female'), ('Zenia', 'female'), ('Zia', 'female'), ('Zilvia', 'female'), ('Zita', 'female'), ('Zitella', 'female'), ('Zoe', 'female'), ('Zola', 'female'), ('Zonda', 'female'), ('Zondra', 'female'), ('Zonnya', 'female'), ('Zora', 'female'), ('Zorah', 'female'), ('Zorana', 'female'), ('Zorina', 'female'), ('Zorine', 'female'), ('Zsa Zsa', 'female'), ('Zsazsa', 'female'), ('Zulema', 'female'), ('Zuzana', 'female')]


In [4]:
# put the list into random order
import random
random.shuffle(namesgender)
print(namesgender[:20])

[('Carlita', 'female'), ('Rhodia', 'female'), ('Sigmund', 'male'), ('Elbertina', 'female'), ('Joellen', 'female'), ('Nevile', 'male'), ('Hedwig', 'female'), ('Veronica', 'female'), ('Cain', 'male'), ('Jonah', 'male'), ('Sissy', 'female'), ('Flipper', 'male'), ('Marius', 'male'), ('Haven', 'male'), ('Jada', 'female'), ('Cara', 'female'), ('Glori', 'female'), ('Gabriel', 'male'), ('Dotty', 'female'), ('Barny', 'male')]


In [5]:
# separate the names into training and test
train_names = namesgender[500:]
test_names = namesgender[:500]


In [6]:
# use our features to train a classifier and test on the development test set
train_set = [(gender_features(n), g) for (n, g) in train_names]
test_set = [(gender_features(n), g) for (n, g) in test_names]
print(train_set[:20])


[({'last_letter': 's'}, 'male'), ({'last_letter': 'o'}, 'male'), ({'last_letter': 'i'}, 'female'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'n'}, 'male'), ({'last_letter': 'y'}, 'female'), ({'last_letter': 'a'}, 'female'), ({'last_letter': 'n'}, 'male'), ({'last_letter': 'n'}, 'female'), ({'last_letter': 't'}, 'male'), ({'last_letter': 'n'}, 'male'), ({'last_letter': 'f'}, 'male'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'l'}, 'male'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'n'}, 'male'), ({'last_letter': 'e'}, 'female'), ({'last_letter': 'y'}, 'female'), ({'last_letter': 'l'}, 'male'), ({'last_letter': 'e'}, 'female')]


In [7]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

# classify new instances
print(classifier.classify(gender_features('Neo')))
print(classifier.classify(gender_features('Trinity')))

male
female


In [8]:
# the nltk.classify.accuracy function runs the classifier on the test set and
#   reports the fraction of predicted labels that match the actual/gold labels
print(nltk.classify.accuracy(classifier, test_set))

0.762


In [9]:
# this function is available for naive bayes classifiers
# it prints the table itself and returns None, so it is called directly:
#   wrapping it in print() would add a stray "None" line after the table
classifier.show_most_informative_features(20)


Most Informative Features
             last_letter = 'a'            female : male   =     37.4 : 1.0
             last_letter = 'k'              male : female =     31.9 : 1.0
             last_letter = 'f'              male : female =     16.4 : 1.0
             last_letter = 'p'              male : female =     12.4 : 1.0
             last_letter = 'v'              male : female =     10.4 : 1.0
             last_letter = 'd'              male : female =      9.4 : 1.0
             last_letter = 'm'              male : female =      9.3 : 1.0
             last_letter = 'o'              male : female =      8.8 : 1.0
             last_letter = 'r'              male : female =      6.7 : 1.0
             last_letter = 'w'              male : female =      6.1 : 1.0
             last_letter = 'g'              male : female =      5.2 : 1.0
             last_letter = 'u'              male : female =      4.3 : 1.0
             last_letter = 't'              male : female =      4.1 : 1.0

In [10]:
# define a function that will compare the classifier labels with the gold standard labels
def geterrors(test):
    """Collect the names in ``test`` that the classifier labels incorrectly.

    ``test`` is an iterable of ``(name, gold_label)`` pairs. Each name is
    turned into features with the notebook-level ``gender_features`` and
    labelled by the notebook-level ``classifier``.

    Returns a list of ``(gold_label, predicted_label, name)`` tuples, one per
    misclassified name, in the order the names appear in ``test``.
    """
    # lazily pair every name with its prediction; classification happens one
    # item at a time as the list comprehension below consumes the generator
    predictions = (
        (name, gold, classifier.classify(gender_features(name)))
        for (name, gold) in test
    )
    return [(gold, predicted, name)
            for (name, gold, predicted) in predictions
            if predicted != gold]

errors = geterrors(test_names)
print(len(errors))

119


In [11]:
# define a function to print the errors
def printerrors(errors):
    """Print one aligned line per misclassification, in sorted order.

    ``errors`` is an iterable of ``(gold_label, predicted_label, name)``
    string tuples. Sorting the tuples orders the lines by gold label first,
    then by guess, then by name.
    """
    row_template = 'correct={:<8s} guess={:<8s} name={:<30s}'
    for record in sorted(errors):
        gold, predicted, name = record
        print(row_template.format(gold, predicted, name))

printerrors(errors)


correct=female   guess=male     name=Angel                         
correct=female   guess=male     name=Beau                          
correct=female   guess=male     name=Bette-Ann                     
correct=female   guess=male     name=Britt                         
correct=female   guess=male     name=Calypso                       
correct=female   guess=male     name=Cameo                         
correct=female   guess=male     name=Cathryn                       
correct=female   guess=male     name=Charin                        
correct=female   guess=male     name=Cherilynn                     
correct=female   guess=male     name=Cherin                        
correct=female   guess=male     name=Cris                          
correct=female   guess=male     name=Deeann                        
correct=female   guess=male     name=Diamond                       
correct=female   guess=male     name=Dido                          
correct=female   guess=male     name=Doralyn    

In [12]:
## classify part of speech based on sentence context
from nltk.corpus import brown

# define features for the "i"th word in the sentence: three suffixes (the last
#     1, 2 and 3 characters of the word) and the previous word
# the pos features function takes the sentence of untagged words and the index of a word i
#   it creates features for word i, including the previous word i-1 ("<START>" when i is 0)
def pos_features(sentence, i):
    """Return the feature dict for the word at index ``i`` of ``sentence``.

    ``sentence`` is a sequence of untagged word strings. The features are the
    last one, two and three characters of the word (slices, so short words
    simply yield the whole word) plus the preceding word, or the marker
    ``"<START>"`` when the word is the first in the sentence.
    """
    word = sentence[i]
    # the first word has no predecessor, so a sentinel stands in for it
    previous = sentence[i - 1] if i != 0 else "<START>"
    return {
        "suffix(1)": word[-1:],
        "suffix(2)": word[-2:],
        "suffix(3)": word[-3:],
        "prev-word": previous,
    }



In [13]:
# look at features of a specific word in a specific sentence
# first sentence of brown corpus
sentence0 = brown.sents()[0]
print(sentence0)
# word 8 of sentence 0
print(sentence0[8])

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
investigation


In [14]:
# pos features of the word 8 
print(pos_features(sentence0, 8))

{'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}


In [15]:
# get the POS tagged sentences with categories of news
tagged_sents = brown.tagged_sents(categories='news')
tag_sent0 = tagged_sents[0]
tag_sent0

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN'),
 ("Atlanta's", 'NP$'),
 ('recent', 'JJ'),
 ('primary', 'NN'),
 ('election', 'NN'),
 ('produced', 'VBD'),
 ('``', '``'),
 ('no', 'AT'),
 ('evidence', 'NN'),
 ("''", "''"),
 ('that', 'CS'),
 ('any', 'DTI'),
 ('irregularities', 'NNS'),
 ('took', 'VBD'),
 ('place', 'NN'),
 ('.', '.')]

In [16]:
# the function nltk.tag.untag will take the tags off, leaving only the words
# the result is printed explicitly: a notebook cell only displays the value of
#   its last expression, so a bare call here would show nothing
print(nltk.tag.untag(tag_sent0))

# the python enumerate function generates an index number for each item in a list
for i,(word,tag) in enumerate(tag_sent0):
    print (i, word, tag)



0 The AT
1 Fulton NP-TL
2 County NN-TL
3 Grand JJ-TL
4 Jury NN-TL
5 said VBD
6 Friday NR
7 an AT
8 investigation NN
9 of IN
10 Atlanta's NP$
11 recent JJ
12 primary NN
13 election NN
14 produced VBD
15 `` ``
16 no AT
17 evidence NN
18 '' ''
19 that CS
20 any DTI
21 irregularities NNS
22 took VBD
23 place NN
24 . .


In [17]:
# get feature sets of words appearing in the corpus, from untagged sentences.
# and then get their tags from corresponding tagged sentence
# use the Python function enumerate to pair the index numbers with sentence words 
#   for the pos features function
# one (features, tag) pair per token, accumulated sentence by sentence
featuresets = []
for tagged_sent in tagged_sents:
    # pos_features works on plain words, so strip the tags from the sentence first
    untagged_sent = nltk.tag.untag(tagged_sent)
    featuresets.extend(
        (pos_features(untagged_sent, position), gold_tag)
        for position, (_, gold_tag) in enumerate(tagged_sent)
    )



In [18]:
# look at the feature sets of the first 10 words
for f in featuresets[:10]:
	print (f)
	

({'suffix(1)': 'e', 'suffix(2)': 'he', 'suffix(3)': 'The', 'prev-word': '<START>'}, 'AT')
({'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ton', 'prev-word': 'The'}, 'NP-TL')
({'suffix(1)': 'y', 'suffix(2)': 'ty', 'suffix(3)': 'nty', 'prev-word': 'Fulton'}, 'NN-TL')
({'suffix(1)': 'd', 'suffix(2)': 'nd', 'suffix(3)': 'and', 'prev-word': 'County'}, 'JJ-TL')
({'suffix(1)': 'y', 'suffix(2)': 'ry', 'suffix(3)': 'ury', 'prev-word': 'Grand'}, 'NN-TL')
({'suffix(1)': 'd', 'suffix(2)': 'id', 'suffix(3)': 'aid', 'prev-word': 'Jury'}, 'VBD')
({'suffix(1)': 'y', 'suffix(2)': 'ay', 'suffix(3)': 'day', 'prev-word': 'said'}, 'NR')
({'suffix(1)': 'n', 'suffix(2)': 'an', 'suffix(3)': 'an', 'prev-word': 'Friday'}, 'AT')
({'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}, 'NN')
({'suffix(1)': 'f', 'suffix(2)': 'of', 'suffix(3)': 'of', 'prev-word': 'investigation'}, 'IN')


In [19]:
# using naive Bayesian as classifier
# split data into a training set and a test set, using a 90%/10% split
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
print(len(train_set))
print(len(test_set))



90499
10055


In [20]:
# train classifier on the training set
classifier = nltk.NaiveBayesClassifier.train(train_set)

# evaluate the accuracy (this will take a little while)
print(nltk.classify.accuracy(classifier, test_set))
# the result is reasonable for features without the previous tag



0.7891596220785678


In [21]:

### classify documents based on keywords
from nltk.corpus import movie_reviews
import random

# movie reviews are labeled either positive or negative (by human annotators)
print(movie_reviews.categories())

# for each document in movie_reviews, get its words and category (positive/negative)
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
print(len(documents))

['neg', 'pos']
2000


In [22]:
random.shuffle(documents)
# look at the first document - consists of a list of all the words in the review
# followed by the category
print(documents[0])

(['in', ',', '"', 'the', 'muse', '"', 'albert', 'brooks', 'plays', 'steven', 'phillips', ',', 'a', 'hollywood', 'screenwriter', 'who', 'after', 'winning', 'a', 'humanitarian', 'award', 'for', 'his', 'work', 'is', 'dumped', 'by', 'his', 'studio', '.', 'they', 'claim', 'that', 'he', "'", 's', 'lost', 'his', 'edge', 'and', 'his', 'agent', 'is', 'quick', 'to', 'agree', 'with', 'them', '.', 'he', 'knows', 'that', 'he', 'needs', 'to', 'write', 'something', 'fresh', 'and', 'original', 'and', 'quick', 'or', 'else', 'his', 'career', 'will', 'be', 'over', '.', 'so', 'he', 'turns', 'to', 'his', '"', 'best', '"', 'friend', ',', 'jack', '(', 'jeff', 'bridges', ')', 'another', 'screenwriter', 'who', "'", 's', 'enjoyed', 'success', 'after', 'success', '.', 'on', 'the', 'way', 'over', 'to', 'jack', "'", 's', 'house', 'steven', 'sees', 'jack', 'helping', 'an', 'attractive', 'woman', '(', 'sharon', 'stone', ')', 'into', 'a', 'cab', 'and', 'begins', 'to', 'think', ':', 'is', 'he', 'having', 'an', 'affair

In [23]:

## use words from all documents to define the word vector for features
# get all words from all movie_reviews and put into a frequency distribution
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
print(len(all_words))



39768


In [24]:
# get the 2000 most frequently appearing keywords in the corpus
word_items = all_words.most_common(2000)
word_features = [word for (word, freq) in word_items]   # just the words

# look at the first 100 words
print(word_features[:100])

[',', 'the', '.', 'a', 'and', 'of', 'to', "'", 'is', 'in', 's', '"', 'it', 'that', '-', ')', '(', 'as', 'with', 'for', 'his', 'this', 'film', 'i', 'he', 'but', 'on', 'are', 't', 'by', 'be', 'one', 'movie', 'an', 'who', 'not', 'you', 'from', 'at', 'was', 'have', 'they', 'has', 'her', 'all', '?', 'there', 'like', 'so', 'out', 'about', 'up', 'more', 'what', 'when', 'which', 'or', 'she', 'their', ':', 'some', 'just', 'can', 'if', 'we', 'him', 'into', 'even', 'only', 'than', 'no', 'good', 'time', 'most', 'its', 'will', 'story', 'would', 'been', 'much', 'character', 'also', 'get', 'other', 'do', 'two', 'well', 'them', 'very', 'characters', ';', 'first', '--', 'after', 'see', '!', 'way', 'because', 'make', 'life']


In [25]:
# define features (keywords) of a document
# each feature is named 'V_<keyword>' and is True or False depending
# on whether that keyword is in the document
def document_features(document, word_features):
    """Map every keyword to a boolean "is it in this document" feature.

    ``document`` is an iterable of word tokens and ``word_features`` the
    keywords to test for. The result has one entry per keyword, keyed
    ``'V_<keyword>'``, in the order the keywords are given.
    """
    # a set makes each membership test constant-time
    present = set(document)
    return {'V_%s' % keyword: (keyword in present) for keyword in word_features}

In [26]:

# get features sets for a document, including keyword features and category feature
featuresets = [(document_features(d, word_features), c) for (d,c) in documents]

# the feature sets are 2000 words long - so this is optional
print(featuresets[0])

({'V_,': True, 'V_the': True, 'V_.': True, 'V_a': True, 'V_and': True, 'V_of': True, 'V_to': True, "V_'": True, 'V_is': True, 'V_in': True, 'V_s': True, 'V_"': True, 'V_it': True, 'V_that': True, 'V_-': True, 'V_)': True, 'V_(': True, 'V_as': True, 'V_with': True, 'V_for': True, 'V_his': True, 'V_this': True, 'V_film': True, 'V_i': True, 'V_he': True, 'V_but': True, 'V_on': True, 'V_are': True, 'V_t': True, 'V_by': True, 'V_be': True, 'V_one': True, 'V_movie': True, 'V_an': True, 'V_who': True, 'V_not': True, 'V_you': True, 'V_from': True, 'V_at': True, 'V_was': False, 'V_have': True, 'V_they': True, 'V_has': True, 'V_her': True, 'V_all': True, 'V_?': True, 'V_there': True, 'V_like': True, 'V_so': True, 'V_out': True, 'V_about': True, 'V_up': True, 'V_more': True, 'V_what': True, 'V_when': True, 'V_which': False, 'V_or': True, 'V_she': True, 'V_their': True, 'V_:': True, 'V_some': False, 'V_just': True, 'V_can': True, 'V_if': True, 'V_we': True, 'V_him': True, 'V_into': True, 'V_even':

In [27]:
# training using naive Bayes classifier with a 95/5 split
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# evaluate the accuracy of the classifier
print (nltk.classify.accuracy(classifier, test_set))
# the accuracy result may vary since we randomized the documents

0.84


In [28]:
# show which features of classifier are most informative
# the method prints the table itself and returns None, so it is called
#   directly: wrapping it in print() would add a stray "None" line
classifier.show_most_informative_features(30)



Most Informative Features
           V_outstanding = True              pos : neg    =     10.9 : 1.0
                 V_mulan = True              pos : neg    =      8.9 : 1.0
                V_seagal = True              neg : pos    =      8.3 : 1.0
                 V_damon = True              pos : neg    =      7.8 : 1.0
           V_wonderfully = True              pos : neg    =      6.7 : 1.0
                V_wasted = True              neg : pos    =      6.0 : 1.0
                 V_flynt = True              pos : neg    =      5.6 : 1.0
                 V_awful = True              neg : pos    =      5.4 : 1.0
            V_ridiculous = True              neg : pos    =      5.4 : 1.0
                 V_waste = True              neg : pos    =      5.4 : 1.0
                  V_lame = True              neg : pos    =      5.2 : 1.0
                   V_era = True              pos : neg    =      5.1 : 1.0
                V_poorly = True              neg : pos    =      4.8 : 1.0

In [29]:
def gender_features(word):
    """Build gender features from the last two characters of a name.

    Returns a dict with ``'last_letter'`` (the final character) and
    ``'last_but_one_letter'`` (the character before it).

    Slices are used instead of indexing so that names shorter than two
    characters give empty-string feature values rather than raising
    ``IndexError``; for longer names the result is identical to indexing.
    """
    return {'last_letter': word[-1:], 'last_but_one_letter': word[-2:-1]}
print(gender_features("kanishk"))

{'last_letter': 'k', 'last_but_one_letter': 'h'}


In [30]:
namesgender = ([(name, 'male') for name in names.words('male.txt')] +
          [(name, 'female') for name in names.words('female.txt')])

In [31]:
random.shuffle(namesgender)
namesgender[:20]

[('Ronalda', 'female'),
 ('Juana', 'female'),
 ('Noam', 'male'),
 ('Moselle', 'female'),
 ('Tobias', 'male'),
 ('Flossy', 'female'),
 ('Arvy', 'male'),
 ('Hanny', 'female'),
 ('Anson', 'male'),
 ('Ulrick', 'male'),
 ('Gerta', 'female'),
 ('Bethany', 'female'),
 ('Rosabelle', 'female'),
 ('Kenny', 'male'),
 ('Birgitta', 'female'),
 ('Martguerita', 'female'),
 ('Kimberli', 'female'),
 ('Susann', 'female'),
 ('Candra', 'female'),
 ('Brynna', 'female')]

In [32]:
train_names_new = namesgender[500:]
test_names_new = namesgender[:500]
train_set_new = [(gender_features(n), g) for (n,g) in train_names_new]
test_set_new = [(gender_features(n), g) for (n,g) in test_names_new]
print(train_set_new[:20])

[({'last_letter': 'a', 'last_but_one_letter': 'n'}, 'female'), ({'last_letter': 't', 'last_but_one_letter': 'r'}, 'male'), ({'last_letter': 'n', 'last_but_one_letter': 'u'}, 'female'), ({'last_letter': 'l', 'last_but_one_letter': 'i'}, 'female'), ({'last_letter': 'e', 'last_but_one_letter': 'i'}, 'female'), ({'last_letter': 'i', 'last_but_one_letter': 'm'}, 'female'), ({'last_letter': 'y', 'last_but_one_letter': 'l'}, 'male'), ({'last_letter': 'e', 'last_but_one_letter': 'a'}, 'female'), ({'last_letter': 'a', 'last_but_one_letter': 't'}, 'female'), ({'last_letter': 'r', 'last_but_one_letter': 'r'}, 'female'), ({'last_letter': 'a', 'last_but_one_letter': 'l'}, 'female'), ({'last_letter': 'a', 'last_but_one_letter': 't'}, 'female'), ({'last_letter': 'i', 'last_but_one_letter': 'u'}, 'female'), ({'last_letter': 'f', 'last_but_one_letter': 'f'}, 'male'), ({'last_letter': 'a', 'last_but_one_letter': 'p'}, 'female'), ({'last_letter': 's', 'last_but_one_letter': 's'}, 'female'), ({'last_lette

In [33]:
classifier = nltk.NaiveBayesClassifier.train(train_set_new)
classifier.classify(gender_features('Christina'))


'female'

In [34]:

print(nltk.classify.accuracy(classifier, test_set_new))

0.792


In [35]:
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
print(len(documents))

2000


In [36]:
random.shuffle(documents)

In [37]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_items = all_words.most_common(2500)
word_features = [word for (word, freq) in word_items]
def document_features(document, vocabulary=None):
    """Map every keyword to a boolean "is it in this document" feature.

    ``document`` is an iterable of word tokens. ``vocabulary`` is the list of
    keywords to test for; when omitted, the notebook-level ``word_features``
    list is used, which is how the one-argument calls in this notebook work.

    Accepting the keyword list as an optional second argument keeps the
    earlier two-argument call style, ``document_features(d, word_features)``,
    working after this redefinition, so re-running an earlier cell does not
    fail with a ``TypeError``.

    Returns a dict with one ``'V_<keyword>'`` entry per keyword, in order.
    """
    if vocabulary is None:
        # fall back to the keyword list defined at notebook level
        vocabulary = word_features
    document_words = set(document)
    return {'V_{}'.format(word): (word in document_words) for word in vocabulary}

In [38]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set=featuresets[100:]
test_set=featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print (nltk.classify.accuracy(classifier, test_set))

0.88


In [39]:
word_items = all_words.most_common(3200)
word_features = [word for (word, freq) in word_items]

In [40]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set=featuresets[100:]
test_set=featuresets[:100]

In [41]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [42]:
print (nltk.classify.accuracy(classifier, test_set))

0.89
