In [4]:
import random
import csv
import string
import math
import time
import nltk
from nltk.corpus import names
from nltk.classify import apply_features
import numpy as np
import pickle


from data_util import *

In [48]:
NB_WEIGHTS = "weights/nb/naive_bayes_weights"

In [11]:
def _gender_features(name):
    features = {}
    features["last_letter"] = name[-1].lower()
    features["first_letter"] = name[0].lower()
    for letter in string.ascii_lowercase:
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    # names ending in -yn are mostly female, names ending in -ch ar mostly male, so add 2 more features
    features["suffix2"] = name[-2:]
    features["suffix3"] = name[-3:]
    features["suffix4"] = name[-4:]
    return features

In [44]:
def train(trainset=trainset, valset=valset, weight_file=NB_WEIGHTS):
    """trains classifier on name->gender
    
    Args:
        trainset: list of name->gender tuple pairs for training
        valset (opt): list of name->gender tuple pairs to validation
        weight_file: filename to save classifer weights

    """

    start = time.time()
    print("Training Naive Bayes Classifer on %d examples (%s)" % (len(trainset), time_since(start)))
    
    trainset = apply_features(_gender_features, trainset, labeled=True)
    classifier = nltk.NaiveBayesClassifier.train(trainset)

    # save weights
    with open(weight_file, 'wb') as f:
        pickle.dump(classifier, f)
        f.close()
    
    print("Training complete. (%s)" % (time_since(start)))
    
    # validation
    if valset is not None and len(valset) > 0: 
        valset = apply_features(_gender_features, valset, labeled=True)
        acc = nltk.classify.accuracy(classifier, valset)
        print("Validation accuracy is %.2f%% on %d examples (%s)" % (acc, len(valset), time_since(start)))

In [41]:
print("Validation accuracy is %.2f%% on %d examples (%s)" %
      (nltk.classify.accuracy(load_classifier(), apply_features(_gender_features, testset[:100])) * 100,
       len(valset), time_since(time.time())))

Validation accuracy is 86.00% on 0 examples (00h 00m 00s)


In [35]:
pct = nltk.classify.accuracy(load_classifier(), apply_features(_gender_features, testset[:100]))
print("%s" % (pct))

0.86


In [45]:
train_naive_bayes()

Training Naive Bayes Classifer on 101730 examples (00h 00m 00s)
Training complete. (00h 01m 09s)


In [92]:
def load_classifier(weight_file=NB_WEIGHTS, verbose=False):
    with open(weight_file, 'rb') as f:
        classifier = pickle.load(f)
        f.close()
    if verbose: print('Loaded weights from "%s"' % (weight_file))
    return classifier

In [77]:
def classify_name(name, weight_file=NB_WEIGHTS):
    name_ = _gender_features(clean_str(name))
    classifier = load_classifier(weight_file)
    guess = classifier.classify(name_)
    print("%s -> %s" % (name, guess))
    return guess

In [108]:
classify_name("alexandrio")

alexandrio -> male (100.00%)


'male'

In [79]:
def test(testset=testset, weight_file=NB_WEIGHTS):
    """trains classifier on name->gender
    
    Args:
        train: % of examples to train with (e.g., 0.8)
    """
    start = time.time()
    classifier = load_classifier(weight_file, verbose=True)
    
    print("Testing Naive Bayes Classifer on %d examples (%s)" % (len(testset), time_since(start)))
    testset = apply_features(_gender_features, testset, labeled=True)
    acc = nltk.classify.accuracy(classifier, testset)
    print("Testing accuracy is %.2f%% on %d examples (%s)" % (acc, len(testset), time_since(start)))
    return acc

In [59]:
test_classifier()

Loaded weights from "weights/nb/naive_bayes_weights"
Testing Naive Bayes Classifer on 33910 examples (00h 00m 00s)
Testing accuracy is 0.85% on 33910 examples (00h 00m 55s)


0.8483043350044235

In [74]:
def prob_classify_name(name, weight_file=NB_WEIGHTS):
    name_ = _gender_features(clean_str(name))
    classifier = load_classifier(weight_file)
    dist = classifier.prob_classify(name_)
    m, f = dist.prob("male"), dist.prob("female")
    d = {m: "male", f: "female"}
    first, last = max(m,f), min(m,f)
    print("%s:\n  (%.2f%%) %s\n  (%.2f%%) %s" % (name, first, d[first], last, d[last]))

In [110]:
prob_classify_name("sdf")

sdf:
  (0.99%) male
  (0.01%) female


In [100]:
def classify_name(name, weight_file=NB_WEIGHTS):
    name_ = _gender_features(clean_str(name))
    classifier = load_classifier(weight_file)
    dist = classifier.prob_classify(name_)
    m, f = dist.prob("male"), dist.prob("female")
    d = {m: "male", f: "female"}
    prob = max(m,f)
    print("%s -> %s (%.2f%%)" % (name, d[prob], prob * 100))
    return d[prob]

In [97]:
load_classifier().prob_classify(_gender_features('ellis')).prob('male')

0.8952970936462694