### Overview of Names Corpus

In [1]:
import nltk
from nltk.corpus import names

In [2]:
dir(names)

['_LazyCorpusLoader__args',
 '_LazyCorpusLoader__kwargs',
 '_LazyCorpusLoader__name',
 '_LazyCorpusLoader__reader_cls',
 '__class__',
 '__delattr__',
 '__dict__',
 '__doc__',
 '__format__',
 '__getattribute__',
 '__hash__',
 '__init__',
 '__module__',
 '__name__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_get_root',
 'abspath',
 'abspaths',
 'citation',
 'encoding',
 'ensure_loaded',
 'fileids',
 'license',
 'open',
 'raw',
 'readme',
 'root',
 'unicode_repr',
 'words']

**All of the names are available through `words()`**

In [3]:
names.words()[:20]

[u'Abagael',
 u'Abagail',
 u'Abbe',
 u'Abbey',
 u'Abbi',
 u'Abbie',
 u'Abby',
 u'Abigael',
 u'Abigail',
 u'Abigale',
 u'Abra',
 u'Acacia',
 u'Ada',
 u'Adah',
 u'Adaline',
 u'Adara',
 u'Addie',
 u'Addis',
 u'Adel',
 u'Adela']

In [4]:
len(names.words())

7944

**The corpus also has separate files for male and female names**

In [5]:
names.fileids()

[u'female.txt', u'male.txt']

In [6]:
# Load each gender's name list from its own corpus file.
male = names.words('male.txt')
female = names.words('female.txt')
# Format the counts explicitly: print(a, b) shows a tuple under Python 2 but
# space-separated values under Python 3, so the output depended on the kernel.
print('male: {}, female: {}'.format(len(male), len(female)))

(2943, 5001)


**Make labeled set**

In [7]:
# Tag every name with its gender: all male entries first, then all female ones.
labeled_names = [(name, 'male') for name in male]
labeled_names += [(name, 'female') for name in female]
labeled_names[:10]

[(u'Aamir', 'male'),
 (u'Aaron', 'male'),
 (u'Abbey', 'male'),
 (u'Abbie', 'male'),
 (u'Abbot', 'male'),
 (u'Abbott', 'male'),
 (u'Abby', 'male'),
 (u'Abdel', 'male'),
 (u'Abdul', 'male'),
 (u'Abdulkarim', 'male')]

### Examples from NLTK Ch. 6
*Just to better understand the example classifier(s) we are to improve on*

**Classify gender based on last letter of name**

In [8]:
def gender_features(word):
    """Return a one-entry feature dict holding the final character of ``word``.

    Args:
        word: The name to extract the feature from.

    Returns:
        ``{'last_letter': <last character>}``; the value is ``''`` when
        ``word`` is empty.
    """
    # Slice instead of index so an empty string yields '' rather than IndexError.
    return {'last_letter': word[-1:]}
gender_features('Shrek')

{'last_letter': 'k'}

**Divide into training and test sets**

In [9]:
import random

# Seed the generator so the shuffle, and therefore every split and accuracy
# figure derived from it, is the same on each Restart & Run All.
SEED = 42
random.seed(SEED)
# NOTE: shuffle works in place; re-running this cell without first rebuilding
# labeled_names reshuffles an already-shuffled list and gives a different order.
random.shuffle(labeled_names)
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
# First 500 examples are held out for testing; the rest are used for training.
train_set, test_set = featuresets[500:], featuresets[:500]
print(train_set[:5])
print(type(train_set))

[({'last_letter': u'n'}, 'male'), ({'last_letter': u'e'}, 'female'), ({'last_letter': u'o'}, 'male'), ({'last_letter': u'a'}, 'female'), ({'last_letter': u'd'}, 'male')]
<type 'list'>


In [10]:
# Build the same train/test split with apply_features instead of a list
# comprehension. The printed type below is a LazyMap rather than a list;
# per the NLTK docs it behaves like a list without holding every feature
# dict in memory at once.
from nltk.classify import apply_features
train_set = apply_features(gender_features, labeled_names[500:])
test_set = apply_features(gender_features, labeled_names[:500])
print(train_set[:5])
print(type(train_set))

[({'last_letter': u'n'}, 'male'), ({'last_letter': u'e'}, 'female'), ...]
<class 'nltk.util.LazyMap'>


**Naive Bayes Classifier**

In [11]:
# Train a Naive Bayes classifier on the training set, then classify two sample names.
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(classifier.classify(gender_features('Neo')))
print(classifier.classify(gender_features('Trinity')))

male
female


In [12]:
print(nltk.classify.accuracy(classifier, test_set))

0.722


In [13]:
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = u'a'           female : male   =     34.6 : 1.0
             last_letter = u'k'             male : female =     31.0 : 1.0
             last_letter = u'f'             male : female =     16.5 : 1.0
             last_letter = u'p'             male : female =     12.5 : 1.0
             last_letter = u'v'             male : female =     11.1 : 1.0


**Feature Extraction example**

In [14]:
# Laundry list of all features
def gender_features2(name):
    """Build a broad, case-insensitive feature dict for ``name``.

    Args:
        name: The name to extract features from.

    Returns:
        A dict with ``first_letter`` and ``last_letter`` (lower-cased, ``''``
        for an empty name) plus, for each letter a-z, ``count(<letter>)``
        (number of occurrences) and ``has(<letter>)`` (bool presence).
    """
    # Lower-case once; the original recomputed this twice per loop iteration.
    lowered = name.lower()
    features = {}
    # Slices rather than indexes so an empty name gives '' instead of IndexError.
    features["first_letter"] = name[:1].lower()
    features["last_letter"] = name[-1:].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = lowered.count(letter)
        features["has({})".format(letter)] = (letter in lowered)
    return features

In [15]:
gender_features2('John') 

{'count(a)': 0,
 'count(b)': 0,
 'count(c)': 0,
 'count(d)': 0,
 'count(e)': 0,
 'count(f)': 0,
 'count(g)': 0,
 'count(h)': 1,
 'count(i)': 0,
 'count(j)': 1,
 'count(k)': 0,
 'count(l)': 0,
 'count(m)': 0,
 'count(n)': 1,
 'count(o)': 1,
 'count(p)': 0,
 'count(q)': 0,
 'count(r)': 0,
 'count(s)': 0,
 'count(t)': 0,
 'count(u)': 0,
 'count(v)': 0,
 'count(w)': 0,
 'count(x)': 0,
 'count(y)': 0,
 'count(z)': 0,
 'first_letter': 'j',
 'has(a)': False,
 'has(b)': False,
 'has(c)': False,
 'has(d)': False,
 'has(e)': False,
 'has(f)': False,
 'has(g)': False,
 'has(h)': True,
 'has(i)': False,
 'has(j)': True,
 'has(k)': False,
 'has(l)': False,
 'has(m)': False,
 'has(n)': True,
 'has(o)': True,
 'has(p)': False,
 'has(q)': False,
 'has(r)': False,
 'has(s)': False,
 'has(t)': False,
 'has(u)': False,
 'has(v)': False,
 'has(w)': False,
 'has(x)': False,
 'has(y)': False,
 'has(z)': False,
 'last_letter': 'n'}

In [16]:
# Re-featurize every labeled name with gender_features2, hold out the first 500
# examples for testing, train on the remainder, and report test-set accuracy.
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.75


**Divide corpus into training, test, and dev-test sets**

In [17]:
# Three-way split of the labeled names: the first 500 are the test set,
# the next 1000 the dev-test set, and everything after that is training data.
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]

In [18]:
# Featurize each split with gender_features, train on the training split, and
# score on dev-test. test_set is built here but not scored in this cell.
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.78


**Error Analysis**

In [19]:
# Gather every dev-test name the classifier labels incorrectly,
# stored as (true label, predicted label, name).
errors = []
for name, tag in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess == tag:
        continue  # correct prediction, nothing to record
    errors.append((tag, guess, name))

In [20]:
# Print the misclassified names in sorted order (tuples sort by true label,
# then guess, then name), left-aligned in fixed-width columns.
for (tag, guess, name) in sorted(errors):
    print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))

correct=female   guess=male     name=Abigael                       
correct=female   guess=male     name=Addis                         
correct=female   guess=male     name=Alleen                        
correct=female   guess=male     name=Alys                          
correct=female   guess=male     name=Alyss                         
correct=female   guess=male     name=Amabel                        
correct=female   guess=male     name=Angel                         
correct=female   guess=male     name=Annabel                       
correct=female   guess=male     name=Arabel                        
correct=female   guess=male     name=Ayn                           
correct=female   guess=male     name=Bab                           
correct=female   guess=male     name=Bidget                        
correct=female   guess=male     name=Brett                         
correct=female   guess=male     name=Caitrin                       
correct=female   guess=male     name=Caro       

*Looking through this list of errors makes it clear that some suffixes that are more than one letter can be indicative of name genders. For example, names ending in yn appear to be predominantly female, despite the fact that names ending in n tend to be male; and names ending in ch are usually male, even though names that end in h tend to be female. We therefore adjust our feature extractor to include features for two-letter suffixes:*

In [21]:
def gender_features(word):
    """Return the one- and two-character suffixes of ``word`` as features.

    This redefinition replaces the earlier last-letter extractor of the same
    name for every cell run after it. Words shorter than the requested suffix
    contribute whatever characters they have.
    """
    last_one = word[-1:]
    last_two = word[-2:]
    return {'suffix1': last_one, 'suffix2': last_two}

In [22]:
# Rebuild the training and dev-test feature sets with the currently defined
# gender_features (the suffix version when cells are run in order), retrain,
# and report dev-test accuracy for comparison with the earlier score.
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.794
