In [1]:
# Classification and Feature Sets in the NLTK, Part 1

# Name Gender Classifier

# preparing text data for classification and training and using classifiers
# male and female first names (in English) have distinctive characteristics
# names ending in a, e, and i are likely to be female
# names ending in k, o, r, s and t are likely to be male

In [2]:
from nltk.corpus import names

import nltk
import random

In [3]:
# a function that will extract or build the features for a single name
# generate a single feature which consists of the last letter of the name
# returns a dictionary with a single item.
def gender_features(word):
    return{'last_letter': word[-1]}

In [4]:
gender_features('Shrek')

{'last_letter': 'k'}

In [49]:
# construct the training data - a list of first names each of which will be labeled either male or female

names.words('male.txt')[:5]

['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot']

In [50]:
names.words('female.txt')[:5]

['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi']

In [7]:
# create one long list with (name, gender) pairs to create the labeled data
namesgender = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])
len(namesgender)

7944

In [51]:
namesgender[:5]

[('Forrester', 'male'),
 ('Nero', 'male'),
 ('Lion', 'male'),
 ('Garnette', 'female'),
 ('Annelise', 'female')]

In [53]:
namesgender[7937:]

[('Joelly', 'female'),
 ('Mandi', 'female'),
 ('Berke', 'male'),
 ('Cyrill', 'male'),
 ('Rani', 'female'),
 ('Caressa', 'female'),
 ('Winn', 'male')]

In [54]:
# create a random shuffle
random.shuffle(namesgender)
namesgender[:5]

[('Barb', 'female'),
 ('Tobit', 'male'),
 ('Nil', 'female'),
 ('Georg', 'male'),
 ('Vilma', 'female')]

In [55]:
# create the list of instances of the problem that consists only of the features and the gender label.
featuresets = [(gender_features(n), g) for (n,g) in namesgender]
featuresets[:5]

[({'last_letter': 'b'}, 'female'),
 ({'last_letter': 't'}, 'male'),
 ({'last_letter': 'l'}, 'female'),
 ({'last_letter': 'g'}, 'male'),
 ({'last_letter': 'a'}, 'female')]

In [12]:
# split this list into training and test sets
train_set, test_set = featuresets[500:], featuresets[:500]

In [13]:
# run the Naïve Bayes classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [14]:
# compute the accuracy of the classifier on the test set.
print (nltk.classify.accuracy(classifier, test_set))

0.78


In [59]:
# use a classifier to label totally new instances of the problem
# names that come from the future
classifier.classify(gender_features('Neo'))

'female'

In [60]:
classifier.classify(gender_features('Trinity'))

'female'

In [17]:
# shows the feature values that were most important in doing the classification
classifier.show_most_informative_features(20)

# last_letter = 'a' female : male = 35.4 : 1.0
# last_letter = ‘a’, it is 35.4 times more likely to be classified as female

Most Informative Features
             last_letter = 'a'            female : male   =     35.8 : 1.0
             last_letter = 'k'              male : female =     30.6 : 1.0
             last_letter = 'f'              male : female =     17.2 : 1.0
             last_letter = 'p'              male : female =     11.8 : 1.0
             last_letter = 'v'              male : female =     11.1 : 1.0
             last_letter = 'd'              male : female =      9.9 : 1.0
             last_letter = 'm'              male : female =      9.1 : 1.0
             last_letter = 'o'              male : female =      7.8 : 1.0
             last_letter = 'r'              male : female =      7.2 : 1.0
             last_letter = 'g'              male : female =      5.4 : 1.0
             last_letter = 'w'              male : female =      5.1 : 1.0
             last_letter = 'b'              male : female =      4.3 : 1.0
             last_letter = 's'              male : female =      4.2 : 1.0

In [18]:
# Choosing Good Features

# Selecting relevant features can usually be the most important part of training a classifier
# throw in as many features as possible and then try to figure out which ones were important
# throwing in too many features can cause 'overfitting'

In [19]:
# a feature extraction function that has the first letter, the last letter, a count of each letter, 
# and the individual letters of the name

def gender_features2(name):
    features = {}
    
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features

In [20]:
features = gender_features2('Shrek')

In [21]:
len(features)

54

In [46]:
features

{'firstletter': 's',
 'lastletter': 'k',
 'count(a)': 0,
 'has(a)': False,
 'count(b)': 0,
 'has(b)': False,
 'count(c)': 0,
 'has(c)': False,
 'count(d)': 0,
 'has(d)': False,
 'count(e)': 1,
 'has(e)': True,
 'count(f)': 0,
 'has(f)': False,
 'count(g)': 0,
 'has(g)': False,
 'count(h)': 1,
 'has(h)': True,
 'count(i)': 0,
 'has(i)': False,
 'count(j)': 0,
 'has(j)': False,
 'count(k)': 1,
 'has(k)': True,
 'count(l)': 0,
 'has(l)': False,
 'count(m)': 0,
 'has(m)': False,
 'count(n)': 0,
 'has(n)': False,
 'count(o)': 0,
 'has(o)': False,
 'count(p)': 0,
 'has(p)': False,
 'count(q)': 0,
 'has(q)': False,
 'count(r)': 1,
 'has(r)': True,
 'count(s)': 1,
 'has(s)': True,
 'count(t)': 0,
 'has(t)': False,
 'count(u)': 0,
 'has(u)': False,
 'count(v)': 0,
 'has(v)': False,
 'count(w)': 0,
 'has(w)': False,
 'count(x)': 0,
 'has(x)': False,
 'count(y)': 0,
 'has(y)': False,
 'count(z)': 0,
 'has(z)': False}

In [23]:
# new feature sets for all names in the namesgender list.
featuresets2 = [(gender_features2(n), g) for (n, g) in namesgender]

In [32]:
for (n, g) in namesgender[0:1]:
    print (n, gender_features2(n), '\n')

Forrester {'firstletter': 'f', 'lastletter': 'r', 'count(a)': 0, 'has(a)': False, 'count(b)': 0, 'has(b)': False, 'count(c)': 0, 'has(c)': False, 'count(d)': 0, 'has(d)': False, 'count(e)': 2, 'has(e)': True, 'count(f)': 1, 'has(f)': True, 'count(g)': 0, 'has(g)': False, 'count(h)': 0, 'has(h)': False, 'count(i)': 0, 'has(i)': False, 'count(j)': 0, 'has(j)': False, 'count(k)': 0, 'has(k)': False, 'count(l)': 0, 'has(l)': False, 'count(m)': 0, 'has(m)': False, 'count(n)': 0, 'has(n)': False, 'count(o)': 1, 'has(o)': True, 'count(p)': 0, 'has(p)': False, 'count(q)': 0, 'has(q)': False, 'count(r)': 3, 'has(r)': True, 'count(s)': 1, 'has(s)': True, 'count(t)': 1, 'has(t)': True, 'count(u)': 0, 'has(u)': False, 'count(v)': 0, 'has(v)': False, 'count(w)': 0, 'has(w)': False, 'count(x)': 0, 'has(x)': False, 'count(y)': 0, 'has(y)': False, 'count(z)': 0, 'has(z)': False} 



In [33]:
# accuracy check
train_set, test_set = featuresets2[500:], featuresets2[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print (nltk.classify.accuracy(classifier, test_set))

0.762


In [34]:
# development process -: error analysis of the test set to change our features and retrain the classifier if needed
# keep a separate test set that was not used in the error analysis for final evaluation
# divide labeled data into a training set, a development test set, and a test set

train_names = namesgender[1500:]
devtest_names = namesgender[500:1500]
test_names = namesgender[:500]

In [35]:
# generate the features for each name and train the classifier.

train_set = [(gender_features(n), g) for (n,g) in train_names]
devtest_set = [(gender_features(n), g) for (n,g) in devtest_names]
test_set = [(gender_features(n), g) for (n,g) in test_names]

classifier = nltk.NaiveBayesClassifier.train(train_set)

print (nltk.classify.accuracy(classifier, devtest_set))

0.766


In [36]:
# a function that will get a list of errors by running the classifier on the development test names 
# and comparing it with the original name gender labels

def geterrors(devtest):
    errors = []
    for (name, tag) in devtest:
        guess = classifier.classify(gender_features(name))
        if guess != tag:
            errors.append( (tag, guess, name) )
    return errors

In [37]:
errors = geterrors(devtest_names)
len(errors)

234

In [38]:
# a function to print all the errors, sorted by the correct labels

def printerrors(errors):
    for (tag, guess, name) in sorted(errors):
        print ('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))

In [39]:
printerrors(errors[:10])

correct=female   guess=male     name=Charlott                      
correct=female   guess=male     name=Charo                         
correct=female   guess=male     name=Karon                         
correct=female   guess=male     name=Lynnet                        
correct=female   guess=male     name=Madelin                       
correct=male     guess=female   name=Anthony                       
correct=male     guess=female   name=Charley                       
correct=male     guess=female   name=Ferdie                        
correct=male     guess=female   name=Tre                           
correct=male     guess=female   name=Zollie                        


In [None]:
# EXERCISE

In [40]:
def gender_features3(word):
    return {'suffix1': word[-1],
            'suffix2': word[-2]}

In [41]:
new_train_set = [(gender_features3(n), g) for (n,g) in train_names]
new_devtest_set = [(gender_features3(n), g) for (n,g) in devtest_names]
new_test_set = [(gender_features3(n), g) for (n,g) in test_names]
new_classifier = nltk.NaiveBayesClassifier.train(new_train_set)
print (nltk.classify.accuracy(new_classifier, new_devtest_set))

0.77


In [42]:
errors = geterrors(devtest_names)
len(errors)

234

In [44]:
printerrors(errors[:10])

correct=female   guess=male     name=Charlott                      
correct=female   guess=male     name=Charo                         
correct=female   guess=male     name=Karon                         
correct=female   guess=male     name=Lynnet                        
correct=female   guess=male     name=Madelin                       
correct=male     guess=female   name=Anthony                       
correct=male     guess=female   name=Charley                       
correct=male     guess=female   name=Ferdie                        
correct=male     guess=female   name=Tre                           
correct=male     guess=female   name=Zollie                        
