In [1]:
import random
import csv
import string
import math
import nltk
from nltk.corpus import names
from nltk.classify import apply_features
import numpy as np
# import torch
# import torch.nn as nn


In [2]:
def loadNames():
    filename = "nltk_mbejda_blk_mfran.csv"
    names = []
    genders = []
 
    with open(filename) as csvDataFile:
        csvReader = csv.reader(csvDataFile)
        for row in csvReader:
            names.append(row[0])
            genders.append(row[1])

    return names, genders

def loadDataset(shuffled=True):
    """Returns the name->gender dataset ready for processing
    
    Args:
        shuffled (Boolean, optional): set to False to return the dataset unshuffled
    Ret:
        namelist (list(String,String)): list of (name, gender) records
    """
    nms,gns = loadNames()
    namelist = list(zip(nms,gns))
    if shuffled:
        random.shuffle(namelist)
    return namelist

In [3]:
def _gender_features(name):
    features = {}
    features["last_letter"] = name[-1].lower()
    features["first_letter"] = name[0].lower()
    for letter in string.ascii_lowercase:
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    # names ending in -yn are mostly female, names ending in -ch ar mostly male, so add 2 more features
    features["suffix2"] = name[-2:]
    features["suffix3"] = name[-3:]
    features["suffix4"] = name[-4:]
    return features

In [4]:
def old_naive_bayes_gender_classifier(train):
    """trains classifier on name->gender
    
    Args:
        train: % of examples to train with (e.g., 0.8)
    """
    
    # get name lists
    male_names = names.words('male.txt')
    female_names = names.words('female.txt')
    labeled_names = ([(name, 'male') for name in male_names] + [(name, 'female') for name in female_names])
    random.shuffle(labeled_names)
    
    divider = int(len(labeled_names) * train)
    train_set = apply_features(_gender_features, labeled_names[divider:])
    test_set = apply_features(_gender_features, labeled_names[:divider])
    
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    acc = nltk.classify.accuracy(classifier, test_set)
    print("test: " + str(train) + ", acc: " + str(acc))
#     classifier.show_most_informative_features(5)
    return acc

In [5]:
from data_util import *

In [6]:
def naive_bayes_gender_classifier(train):
    """trains classifier on name->gender
    
    Args:
        train: % of examples to train with (e.g., 0.8)
    """
    
    start = time.time()
    # get name lists
    labeled_names = load_dataset()
    
    divider = int(len(labeled_names) * train)
    train_set = apply_features(_gender_features, labeled_names[divider:], labeled=True)
    test_set = apply_features(_gender_features, labeled_names[:divider], labeled=True)
    print("datasets loaded. beginning training on %d examples (%s)" % (len(train_set), time_since(start)))
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print("beginning testing on %d examples (%s)" % (len(test_set), time_since(start)))
    acc = nltk.classify.accuracy(classifier, test_set)
    print("test: " + str(train) + ", acc: " + str(acc))
#     classifier.show_most_informative_features(5)
    return acc

In [7]:
naive_bayes_gender_classifier(0.5)

datasets loaded. beginning training on 67820 examples (00h 00m 00s)
beginning testing on 67820 examples (00h 00m 06s)
test: 0.5, acc: 0.8493512238277794


0.8493512238277794

In [8]:
accuracies = {}
for i in range(60,79,5):
    cum = 0
    num = 5
    for j in range(num):
        cum += naive_bayes_gender_classifier(i * 0.01)
    accuracies[i] = cum / num

datasets loaded. beginning training on 54256 examples (00h 00m 00s)
beginning testing on 81384 examples (00h 00m 05s)
test: 0.6, acc: 0.8478570726432714
datasets loaded. beginning training on 54256 examples (00h 00m 00s)
beginning testing on 81384 examples (00h 00m 04s)
test: 0.6, acc: 0.848250270323405
datasets loaded. beginning training on 54256 examples (00h 00m 00s)
beginning testing on 81384 examples (00h 00m 04s)
test: 0.6, acc: 0.8489137914086307
datasets loaded. beginning training on 54256 examples (00h 00m 00s)
beginning testing on 81384 examples (00h 00m 04s)
test: 0.6, acc: 0.8474515875356335
datasets loaded. beginning training on 54256 examples (00h 00m 00s)
beginning testing on 81384 examples (00h 00m 05s)
test: 0.6, acc: 0.8475130246731545
datasets loaded. beginning training on 47474 examples (00h 00m 00s)
beginning testing on 88166 examples (00h 00m 04s)


KeyboardInterrupt: 

In [None]:
accuracies

In [None]:
old_accuracies

In [None]:
male_names = names.words('male.txt')
female_names = names.words('female.txt')
labeled_names = ([(name, 'male') for name in male_names] + [(name, 'female') for name in female_names])

In [None]:
len(labeled_names)

In [None]:
# classify a single person's name
male_names = names.words('male.txt')
female_names = names.words('female.txt')
labeled_names = ([(name, 'male') for name in male_names] + [(name, 'female') for name in female_names])
random.shuffle(labeled_names)

labeled_names = loadDataset()

divider = int(60)
train_set = apply_features(_gender_features, labeled_names[divider:])
test_set = apply_features(_gender_features, labeled_names[:divider])

classifier = nltk.NaiveBayesClassifier.train(train_set)


In [88]:
classifier.classify(_gender_features("elijah"))

'female'

In [55]:
def comparison_classifier(train):
    """trains classifier on name->gender
    
    Args:
        train: % of examples to train with (e.g., 0.8)
    """
    
    # get name lists
    male_names = names.words('male.txt')
    female_names = names.words('female.txt')
    labeled_names = ([(name, 'male') for name in male_names] + [(name, 'female') for name in female_names])
    random.shuffle(labeled_names)
    
    divider = int(len(labeled_names) * train)
    train_set = apply_features(_gender_features,labeled_names)
    test_set = apply_features(_gender_features, loadDataset())
    
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    acc = nltk.classify.accuracy(classifier, test_set)
    print("test: " + str(train) + ", acc: " + str(acc))
#     classifier.show_most_informative_features(5)
    return acc

In [56]:
def comparison_classifier2(train):
    """trains classifier on name->gender
    
    Args:
        train: % of examples to train with (e.g., 0.8)
    """
    
    # get name lists
    male_names = names.words('male.txt')
    female_names = names.words('female.txt')
    labeled_names = ([(name, 'male') for name in male_names] + [(name, 'female') for name in female_names])
    random.shuffle(labeled_names)
    
    divider = int(len(labeled_names) * train)
    train_set = apply_features(_gender_features,loadDataset())
    test_set = apply_features(_gender_features, labeled_names)
    
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    acc = nltk.classify.accuracy(classifier, test_set)
    print("test: " + str(train) + ", acc: " + str(acc))
#     classifier.show_most_informative_features(5)
    return acc

In [96]:
comparison_classifier(65)

test: 65, acc: 0.8111451551129836


0.8111451551129836

In [97]:
comparison_classifier2(65)

test: 65, acc: 0.8021148036253777


0.8021148036253777

In [80]:
def test_classifier(train):
    """trains classifier on name->gender
    
    Args:
        train: % of examples to train with (e.g., 0.8)
    """
    
    # get name lists
    male_names = names.words('male.txt')
    female_names = names.words('female.txt')
    labeled_names = ([(name, 'male') for name in male_names] + [(name, 'female') for name in female_names])
    random.shuffle(labeled_names)
    
    divider = int(len(labeled_names) * train)
    train_set = apply_features(_gender_features, loadDataset())
    test_set = apply_features(_gender_features, loadDataset())
    
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    acc = nltk.classify.accuracy(classifier, test_set)
    print("test: " + str(train) + ", acc: " + str(acc))
#     classifier.show_most_informative_features(5)
    return acc

In [98]:
test_classifier(65)

test: 65, acc: 0.8649789352738414


0.8649789352738414

In [82]:
test_classifier(65)

test: 65, acc: 0.8417081577939487


0.8417081577939487