# Name-Gender Classifier

## Team
* Daniel Dittenhafer
* Youqing Xiang

## Classifer Types

* Naive Bayes
* Decision Tree


### Load Dependencies

In [2]:
import nltk
from nltk.corpus import names
import random

### Naive Bayes Gender Classifier Wrapper Classs

The following class defines a gender classifier using a Naive Bayes Classifier internally.

In [41]:
class genderClassifier():
    def __init__(self):
        self.classifier = None
        pass
    
    def get_Features(self,name):
        features = {}
        
        features['prefix1'] = name[0].lower()
        features['prefix2'] = name[0:2].lower()
        features['prefix3'] = name[0:3].lower()
        features['prefix4'] = name[0:4].lower()
        #features['lastletter'] = name[-1].lower()
        #features['lenghtofletter'] = len(name)
        features['suffix1'] = name[-1:]
        features['suffix2'] = name[-2:]
        features['suffix3'] = name[-3:]
        features['suffix4'] = name[-4:]
        features['lastletter=vowel'] = name[-1] in ('a', 'e', 'i', 'o', 'u')

        return features
    
    def train(self,train_set):
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)
        return self.classifier
    
    def dev_test(self, devtest_set):
        accuracy = nltk.classify.accuracy(self.classifier, devtest_set)
        return accuracy
    
    def get_Errors(self, devtest_names):
        errors = []
        for (name, tag) in devtest_names:
            guess = classifier.classify(self.get_Features(name))
            if guess != tag:
                errors.append((tag, guess, name))
        return errors
    
    def test(self, test_set):
        accuracy = nltk.classify.accuracy(self.classifier, test_set)
        return accuracy
    
    def predict(self,raw_names):
        results = []
        for name in raw_names:
            predict = classifier.classify(self.get_Features(name))
            results.append((name, predict))
        return results

### Load NLTK Gender Data and Train the Model

In [42]:
# load data
nameGenders = ([(name, 'male') for name in names.words('male.txt')] +
         [(name, 'female') for name in names.words('female.txt')])
random.shuffle(nameGenders)

# split the data into three sets: train, devtest, test
train_names = nameGenders[1000:]
devtest_names = nameGenders[500:1000]
test_names = nameGenders[:500]

In [43]:
genderclassifier = genderClassifier()

In [44]:
# get furtures
train_set = [(genderclassifier.get_Features(n),g) for (n,g) in train_names]
devtest_set = [(genderclassifier.get_Features(n),g) for (n,g) in devtest_names]
test_set = [(genderclassifier.get_Features(n),g) for (n,g) in test_names]

In [45]:
# Show an example of the features
train_set[:1]

[({'lastletter=vowel': False,
   'prefix1': u'k',
   'prefix2': u'ki',
   'prefix3': u'kir',
   'prefix4': u'kirs',
   'suffix1': u'n',
   'suffix2': u'en',
   'suffix3': u'ten',
   'suffix4': u'sten'},
  'female')]

In [46]:
# train
classifier = genderclassifier.train(train_set)

In [47]:
# dev test
dev_test_accuracy = genderclassifier.dev_test(devtest_set)
print 'Devtest Accuracy: %f'% dev_test_accuracy

Devtest Accuracy: 0.842000


### Most Informative Features

In [48]:
classifier.show_most_informative_features()

Most Informative Features
                 suffix2 = u'na'          female : male   =     93.7 : 1.0
                 suffix2 = u'la'          female : male   =     71.2 : 1.0
                 suffix1 = u'k'             male : female =     40.6 : 1.0
                 suffix2 = u'ld'            male : female =     38.4 : 1.0
                 suffix2 = u'ia'          female : male   =     37.2 : 1.0
                 suffix1 = u'a'           female : male   =     35.9 : 1.0
                 suffix2 = u'ra'          female : male   =     33.1 : 1.0
                 suffix2 = u'sa'          female : male   =     31.3 : 1.0
                 suffix2 = u'ta'          female : male   =     30.9 : 1.0
                 suffix2 = u'rd'            male : female =     28.9 : 1.0


Show some of the errors:

In [49]:
errors = genderclassifier.get_Errors(devtest_names)

In [50]:
for (tag, guess, name) in sorted(errors[:10]): 
    print 'correct=%-8s guess=%-8s names=%-30s' % (tag, guess, name)

correct=female   guess=male     names=Abagael                       
correct=female   guess=male     names=Ajay                          
correct=female   guess=male     names=Judy                          
correct=female   guess=male     names=Kit                           
correct=female   guess=male     names=Lark                          
correct=female   guess=male     names=Marley                        
correct=female   guess=male     names=Nitin                         
correct=female   guess=male     names=Vicky                         
correct=male     guess=female   names=Cam                           
correct=male     guess=female   names=Niki                          


### Decision Tree Classifier

In [51]:
class genderClassifierDT(genderClassifier):

    def train(self,train_set):
        """Overrides the genderClassifier and specifies a DecisionTreeClassifier instead of the
        NaiveBayesClassifer."""
        self.classifier = nltk.DecisionTreeClassifier.train(train_set)
        return self.classifier


In [52]:
dtClassifier = genderClassifierDT()

In [53]:
classifier = dtClassifier.train(train_set)

In [54]:
# dev test
dt_dev_test_accuracy = dtClassifier.dev_test(devtest_set)
print 'Devtest Accuracy: %f'% dt_dev_test_accuracy

Devtest Accuracy: 0.684000


## Classifier Choice

Based on the dev_test accuracy, the Naive Bayes classifer performs better.

In [55]:
results = [('NaiveBayes', dev_test_accuracy), ('DecisionTree', dt_dev_test_accuracy)]
for (model, accuracy) in sorted(results): 
    print '%-15s %-30s' % (model, accuracy)

DecisionTree    0.684                         
NaiveBayes      0.842                         


Run the test_set through the selected classifer:

In [56]:
# test
test_accuracy = genderclassifier.dev_test(test_set)
print 'Test Accuracy: %f'% test_accuracy

Test Accuracy: 0.862000


In the current outcome of 0.86 test accuracy against the test_set, we are surprised that the classifier did better in the generalized case versus the dev_test set. This is likely due to coincidence in the random selection of name when split between sets.

## Do some test with our class names

For fun, we decided to use our class's names and run a cycle to see how well the trained classifier did. As you'll note below, the classifier did fairly well and achieved a 0.92 accuracy. 

In [57]:
classNames = ['Jason','Maxwell','Honey','Lara','Brian','Thomas',
        'Sandipayan','Neil','Justin','Burton','Frank','Jashan',
        'Stacey','Igor','Daina','Mohan','Bryant','John','Erik',
        'Robert','Youqing','Alexander','Daniel','Riguel','Alain']
classNamesGender =  [('Jason', 'male'),
                     ('Maxwell', 'male'),
                     ('Honey', 'female'),
                     ('Lara', 'female'),
                     ('Brian', 'male'),
                     ('Thomas', 'male'),
                     ('Sandipayan', 'male'),
                     ('Neil', 'male'),
                     ('Justin', 'male'),
                     ('Burton', 'male'),
                     ('Frank', 'male'),
                     ('Jashan', 'male'),
                     ('Stacey', 'female'),
                     ('Igor', 'male'),
                     ('Daina', 'female'),
                     ('Mohan', 'male'),
                     ('Bryant', 'male'),
                     ('John', 'male'),
                     ('Erik', 'male'),
                     ('Robert', 'male'),
                     ('Youqing','female'),
                     ('Alexander', 'male'),
                     ('Daniel','male'),
                     ('Riguel', 'male'),
                     ('Alain', 'male')]
class_set = [(genderclassifier.get_Features(n), g) for (n, g) in classNamesGender]

In [58]:
# DATA 620 Class
results = genderclassifier.predict(['Jason','Maxwell','Honey','Lara','Brian','Thomas',
                                    'Sandipayan','Neil','Justin','Burton','Frank','Jashan',
                                    'Stacey','Igor','Daina','Mohan','Bryant','John','Erik',
                                    'Robert','Youqing','Alexander','Daniel','Riguel','Alain'])
for result in results:
    print result

('Jason', 'male')
('Maxwell', 'male')
('Honey', 'female')
('Lara', 'female')
('Brian', 'male')
('Thomas', 'male')
('Sandipayan', 'male')
('Neil', 'male')
('Justin', 'male')
('Burton', 'male')
('Frank', 'male')
('Jashan', 'male')
('Stacey', 'female')
('Igor', 'male')
('Daina', 'female')
('Mohan', 'male')
('Bryant', 'male')
('John', 'male')
('Erik', 'male')
('Robert', 'male')
('Youqing', 'male')
('Alexander', 'male')
('Daniel', 'male')
('Riguel', 'male')
('Alain', 'male')


In [60]:
genderclassifier.dev_test(class_set)

0.92