# Name-Gender Classifier

## Team
* Daniel Dittenhafer
* Youqing Xiang

## Classifier Types

* Naive Bayes
* Decision Tree


### Load Dependencies

In [4]:
import nltk
from nltk.corpus import names
import random

### Naive Bayes Gender Classifier Wrapper Class

The following class defines a gender classifier using a Naive Bayes Classifier internally.

In [5]:
class genderClassifier():
    """Gender classifier for first names that wraps an NLTK Naive Bayes classifier.

    Build feature sets with ``get_Features``, fit with ``train`` and evaluate
    with ``dev_test`` / ``test`` / ``get_Errors``. The fitted model is kept in
    ``self.classifier``, which is ``None`` until ``train`` has been called.
    """

    def __init__(self):
        # Set by train(); None means "not trained yet".
        self.classifier = None

    def get_Features(self, name):
        """Return the feature dict describing ``name``.

        The features are the lower-cased prefixes of length 1-4, the suffixes
        of length 1-4 (case preserved) and a flag telling whether the last
        character is a lower-case vowel.

        Only slices are used, so an empty ``name`` produces empty-string
        features (and a False vowel flag) instead of raising IndexError.
        """
        features = {}

        features['prefix1'] = name[:1].lower()
        features['prefix2'] = name[0:2].lower()
        features['prefix3'] = name[0:3].lower()
        features['prefix4'] = name[0:4].lower()
        features['suffix1'] = name[-1:]
        features['suffix2'] = name[-2:]
        features['suffix3'] = name[-3:]
        features['suffix4'] = name[-4:]
        features['lastletter=vowel'] = name[-1:] in ('a', 'e', 'i', 'o', 'u')

        return features

    def train(self, train_set):
        """Fit a Naive Bayes classifier on ``train_set``, store it and return it.

        ``train_set`` is a list of ``(feature dict, label)`` pairs.
        """
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)
        return self.classifier

    def dev_test(self, classifier, devtest_set):
        """Return the accuracy of the stored classifier on ``devtest_set``.

        The ``classifier`` argument is kept so existing callers keep working,
        but it is not used: the accuracy is always computed for
        ``self.classifier``.
        """
        accuracy = nltk.classify.accuracy(self.classifier, devtest_set)
        return accuracy

    def test(self, classifier, test_set):
        """Return the accuracy of the stored classifier on ``test_set``.

        The ``classifier`` argument is kept so existing callers keep working,
        but it is not used: the accuracy is always computed for
        ``self.classifier``.
        """
        accuracy = nltk.classify.accuracy(self.classifier, test_set)
        return accuracy

    def get_Errors(self, devtest_names):
        """Return the misclassified entries of ``devtest_names``.

        ``devtest_names`` is an iterable of ``(name, tag)`` pairs. The result
        is a list of ``(tag, guess, name)`` tuples, in input order, for every
        name whose predicted label differs from its tag.
        """
        errors = []
        for (name, tag) in devtest_names:
            # Use the model owned by this instance, not whatever a global
            # named `classifier` happens to refer to at call time.
            guess = self.classifier.classify(self.get_Features(name))
            if guess != tag:
                errors.append((tag, guess, name))
        return errors

### Load NLTK Gender Data

In [6]:
# load data: label every name in the NLTK names corpus with its gender
nameGenders = ([(name, 'male') for name in names.words('male.txt')] +
         [(name, 'female') for name in names.words('female.txt')])

# Shuffle with a dedicated, seeded generator so the train/devtest/test split
# (and therefore every accuracy reported below) is the same on each
# Restart & Run All, without touching the global random state.
random.Random(42).shuffle(nameGenders)

# split the data into three disjoint sets: train, devtest, test
train_names = nameGenders[1000:]
devtest_names = nameGenders[500:1000]
test_names = nameGenders[:500]

In [7]:
# Instantiate the Naive Bayes wrapper; it holds no model until train() is called.
genderclassifier = genderClassifier()

In [8]:
# get features: turn each (name, gender) pair into a (feature dict, gender) pair
train_set = [
    (genderclassifier.get_Features(name), gender)
    for (name, gender) in train_names
]
devtest_set = [
    (genderclassifier.get_Features(name), gender)
    for (name, gender) in devtest_names
]
test_set = [
    (genderclassifier.get_Features(name), gender)
    for (name, gender) in test_names
]

In [9]:
# Show an example of the features: the first (feature dict, gender) pair of the training set
train_set[:1]

[({'lastletter=vowel': True,
   'prefix1': u'j',
   'prefix2': u'ju',
   'prefix3': u'jud',
   'prefix4': u'judi',
   'suffix1': u'e',
   'suffix2': u'ie',
   'suffix3': u'die',
   'suffix4': u'udie'},
  'female')]

In [10]:
# train the Naive Bayes model on the training feature sets; the returned model
# is also stored on the wrapper as genderclassifier.classifier
classifier = genderclassifier.train(train_set)

In [11]:
# dev test: accuracy of the trained Naive Bayes model on the held-out devtest set
dev_test_accuracy = genderclassifier.dev_test(classifier, devtest_set)
# The parenthesised single-argument form prints the same text on Python 2
# and also runs on Python 3.
print('Devtest Accuracy: %f' % dev_test_accuracy)

Devtest Accuracy: 0.848000


### Most Informative Features

In [12]:
# Read the model from the Naive Bayes wrapper rather than from the global
# `classifier`, which a later cell rebinds to the decision tree model; this
# keeps the cell correct when it is re-run out of order.
genderclassifier.classifier.show_most_informative_features()

Most Informative Features
                 suffix2 = u'na'          female : male   =     91.0 : 1.0
                 suffix1 = u'k'             male : female =     39.6 : 1.0
                 suffix1 = u'a'           female : male   =     37.1 : 1.0
                 suffix2 = u'ia'          female : male   =     36.0 : 1.0
                 suffix2 = u'sa'          female : male   =     33.8 : 1.0
                 suffix2 = u'ta'          female : male   =     30.9 : 1.0
                 suffix2 = u'us'            male : female =     27.2 : 1.0
                 suffix3 = u'ita'         female : male   =     26.6 : 1.0
                 suffix2 = u'ra'          female : male   =     25.2 : 1.0
                 suffix3 = u'tta'         female : male   =     24.9 : 1.0


In [13]:
# Collect the devtest names that are misclassified, as (correct tag, guess, name) tuples.
errors = genderclassifier.get_Errors(devtest_names)

In [14]:
# List every misclassified name, sorted by correct tag, then guess, then name.
# The parenthesised print works on both Python 2 and Python 3.
for (tag, guess, name) in sorted(errors):
    print('correct=%-8s guess=%-8s names=%-30s' % (tag, guess, name))

correct=female   guess=male     names=Aimil                         
correct=female   guess=male     names=Alison                        
correct=female   guess=male     names=Amberly                       
correct=female   guess=male     names=Anett                         
correct=female   guess=male     names=Bo                            
correct=female   guess=male     names=Britt                         
correct=female   guess=male     names=Brooke                        
correct=female   guess=male     names=Cam                           
correct=female   guess=male     names=Charo                         
correct=female   guess=male     names=Chicky                        
correct=female   guess=male     names=Christin                      
correct=female   guess=male     names=Debor                         
correct=female   guess=male     names=Demeter                       
correct=female   guess=male     names=Demetris                      
correct=female   guess=male     na

### Decision Tree Classifier

In [15]:
class genderClassifierDT(genderClassifier):
    """Variant of genderClassifier that is backed by an NLTK decision tree."""

    def train(self, train_set):
        """Fit a DecisionTreeClassifier on ``train_set`` instead of the
        NaiveBayesClassifier used by the base class, store it and return it."""
        tree = nltk.DecisionTreeClassifier.train(train_set)
        self.classifier = tree
        return tree


In [16]:
# Instantiate the decision tree wrapper; it holds no model until train() is called.
dtClassifier = genderClassifierDT()

In [17]:
# Train the decision tree on the same training feature sets.
# NOTE: this rebinds the global `classifier`, which until now referred to the
# Naive Bayes model; that model is still reachable as genderclassifier.classifier.
classifier = dtClassifier.train(train_set)

In [18]:
# dev test: accuracy of the trained decision tree on the same devtest set
dt_dev_test_accuracy = dtClassifier.dev_test(classifier, devtest_set)
# The parenthesised single-argument form prints the same text on Python 2
# and also runs on Python 3.
print('Devtest Accuracy: %f' % dt_dev_test_accuracy)

Devtest Accuracy: 0.676000


In [19]:
# Print the top levels of the learned tree. The model is read from the decision
# tree wrapper so the cell does not depend on what the global `classifier`
# currently refers to; the parenthesised print works on Python 2 and 3.
print(dtClassifier.classifier.pseudocode(depth=4))

if suffix4 == u' Ann': return 'female'
if suffix4 == u' Dee': return 'female'
if suffix4 == u'-Ann': return 'female'
if suffix4 == u'-Lou': return 'male'
if suffix4 == u'-Luc': return 'male'
if suffix4 == u'-Mar': return 'female'
if suffix4 == u'Abbe': return 'female'
if suffix4 == u'Abbi': return 'female'
if suffix4 == u'Abby': return 'male'
if suffix4 == u'Abra': return 'female'
if suffix4 == u'Ada': return 'female'
if suffix4 == u'Adah': return 'female'
if suffix4 == u'Adam': return 'male'
if suffix4 == u'Adel': return 'female'
if suffix4 == u'Adey': return 'female'
if suffix4 == u'Adi': return 'female'
if suffix4 == u'Ag': return 'female'
if suffix4 == u'Aggy': return 'female'
if suffix4 == u'Agna': return 'female'
if suffix4 == u'Aida': return 'female'
if suffix4 == u'Aili': return 'female'
if suffix4 == u'Aime': return 'female'
if suffix4 == u'Aina': return 'female'
if suffix4 == u'Ajay': return 'male'
if suffix4 == u'Al': return 'male'
if suffix4 == u'Alec': return 'male'
if suf

## Classifier Choice

Based on the dev_test accuracy, the Naive Bayes classifier performs better.

In [20]:
# Side-by-side devtest accuracies, listed alphabetically by model name.
results = [('NaiveBayes', dev_test_accuracy), ('DecisionTree', dt_dev_test_accuracy)]
for (model, accuracy) in sorted(results):
    # The parenthesised print works on both Python 2 and Python 3.
    print('%-15s %-30s' % (model, accuracy))

DecisionTree    0.676                         
NaiveBayes      0.848                         


Testing the NaiveBayes on the test_set: