In [27]:
import random
import csv
import string
import math
import nltk
from nltk.corpus import names
from nltk.classify import apply_features
import numpy as np
# import torch
# import torch.nn as nn


In [99]:
def loadNames():
    filename = "nltk_mbejda_blk_mfran.csv"
    names = []
    genders = []
 
    with open(filename) as csvDataFile:
        csvReader = csv.reader(csvDataFile)
        for row in csvReader:
            names.append(row[0])
            genders.append(row[1])

    return names, genders

def loadDataset(shuffled=True):
    """Returns the name->gender dataset ready for processing
    
    Args:
        shuffled (Boolean, optional): set to False to return the dataset unshuffled
    Ret:
        namelist (list(String,String)): list of (name, gender) records
    """
    nms,gns = loadNames()
    namelist = list(zip(nms,gns))
    if shuffled:
        random.shuffle(namelist)
    return namelist

In [89]:
def _gender_features(name):
    features = {}
    features["last_letter"] = name[-1].lower()
    features["first_letter"] = name[0].lower()
    for letter in string.ascii_lowercase:
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    # names ending in -yn are mostly female, names ending in -ch ar mostly male, so add 2 more features
    features["suffix2"] = name[-2:]
    features["suffix3"] = name[-3:]
    features["suffix4"] = name[-4:]
    return features

In [49]:
def old_naive_bayes_gender_classifier(train):
    """trains classifier on name->gender
    
    Args:
        train: % of examples to train with (e.g., 0.8)
    """
    
    # get name lists
    male_names = names.words('male.txt')
    female_names = names.words('female.txt')
    labeled_names = ([(name, 'male') for name in male_names] + [(name, 'female') for name in female_names])
    random.shuffle(labeled_names)
    
    divider = int(len(labeled_names) * train)
    train_set = apply_features(_gender_features, labeled_names[divider:])
    test_set = apply_features(_gender_features, labeled_names[:divider])
    
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    acc = nltk.classify.accuracy(classifier, test_set)
    print("test: " + str(train) + ", acc: " + str(acc))
#     classifier.show_most_informative_features(5)
    return acc

In [None]:
def naive_bayes_gender_classifier(train):
    """trains classifier on name->gender
    
    Args:
        train: % of examples to train with (e.g., 0.8)
    """
    
    # get name lists
    labeled_names = loadDataset()
    
    divider = int(len(labeled_names) * train)
    train_set = apply_features(_gender_features, labeled_names[divider:])
    test_set = apply_features(_gender_features, labeled_names[:divider])
    
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    acc = nltk.classify.accuracy(classifier, test_set)
    print("test: " + str(train) + ", acc: " + str(acc))
#     classifier.show_most_informative_features(5)
    return acc

In [100]:
naive_bayes_gender_classifier(0.5)

test: 0.5, acc: 0.8472279563550575


0.8472279563550575

In [94]:
accuracies = {}
for i in range(60,79,5):
    cum = 0
    num = 5
    for j in range(num):
        cum += naive_bayes_gender_classifier(i * 0.01)
    accuracies[i] = cum / num

test: 0.6, acc: 0.8490871951997957
test: 0.6, acc: 0.846355164049534
test: 0.6, acc: 0.8489084641899656
test: 0.6, acc: 0.8484871696667944
test: 0.6, acc: 0.8489084641899656
test: 0.65, acc: 0.8487808902035189
test: 0.65, acc: 0.8472842546872974
test: 0.65, acc: 0.8477202823573777
test: 0.65, acc: 0.8459761716770567
test: 0.65, acc: 0.8461647241830373
test: 0.7000000000000001, acc: 0.8460469442468677
test: 0.7000000000000001, acc: 0.8459594025277671
test: 0.7000000000000001, acc: 0.8467253925698965
test: 0.7000000000000001, acc: 0.8461673141106308
test: 0.7000000000000001, acc: 0.8461344859659682
test: 0.75, acc: 0.8446870659367596
test: 0.75, acc: 0.8447789852112101
test: 0.75, acc: 0.8455756189231146
test: 0.75, acc: 0.8455551924176812
test: 0.75, acc: 0.8457696707247324


In [95]:
accuracies

{60: 0.848349291459211,
 65: 0.8471852646216576,
 70: 0.8462067078842261,
 75: 0.8452733066426996}

In [92]:
old_accuracies

{60: 0.8368415677262862,
 65: 0.8370576381441719,
 70: 0.836644963615473,
 75: 0.8355707165618107,
 80: 0.8356970509383377,
 85: 0.8348409887624249}

In [3]:
male_names = names.words('male.txt')
female_names = names.words('female.txt')
labeled_names = ([(name, 'male') for name in male_names] + [(name, 'female') for name in female_names])

In [4]:
len(labeled_names)

7944

In [62]:
# classify a single person's name
male_names = names.words('male.txt')
female_names = names.words('female.txt')
labeled_names = ([(name, 'male') for name in male_names] + [(name, 'female') for name in female_names])
random.shuffle(labeled_names)

labeled_names = loadDataset()

divider = int(60)
train_set = apply_features(_gender_features, labeled_names[divider:])
test_set = apply_features(_gender_features, labeled_names[:divider])

classifier = nltk.NaiveBayesClassifier.train(train_set)


In [88]:
classifier.classify(_gender_features("elijah"))

'female'

In [55]:
def comparison_classifier(train):
    """trains classifier on name->gender
    
    Args:
        train: % of examples to train with (e.g., 0.8)
    """
    
    # get name lists
    male_names = names.words('male.txt')
    female_names = names.words('female.txt')
    labeled_names = ([(name, 'male') for name in male_names] + [(name, 'female') for name in female_names])
    random.shuffle(labeled_names)
    
    divider = int(len(labeled_names) * train)
    train_set = apply_features(_gender_features,labeled_names)
    test_set = apply_features(_gender_features, loadDataset())
    
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    acc = nltk.classify.accuracy(classifier, test_set)
    print("test: " + str(train) + ", acc: " + str(acc))
#     classifier.show_most_informative_features(5)
    return acc

In [56]:
def comparison_classifier2(train):
    """trains classifier on name->gender
    
    Args:
        train: % of examples to train with (e.g., 0.8)
    """
    
    # get name lists
    male_names = names.words('male.txt')
    female_names = names.words('female.txt')
    labeled_names = ([(name, 'male') for name in male_names] + [(name, 'female') for name in female_names])
    random.shuffle(labeled_names)
    
    divider = int(len(labeled_names) * train)
    train_set = apply_features(_gender_features,loadDataset())
    test_set = apply_features(_gender_features, labeled_names)
    
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    acc = nltk.classify.accuracy(classifier, test_set)
    print("test: " + str(train) + ", acc: " + str(acc))
#     classifier.show_most_informative_features(5)
    return acc

In [96]:
comparison_classifier(65)

test: 65, acc: 0.8111451551129836


0.8111451551129836

In [97]:
comparison_classifier2(65)

test: 65, acc: 0.8021148036253777


0.8021148036253777

In [80]:
def test_classifier(train):
    """trains classifier on name->gender
    
    Args:
        train: % of examples to train with (e.g., 0.8)
    """
    
    # get name lists
    male_names = names.words('male.txt')
    female_names = names.words('female.txt')
    labeled_names = ([(name, 'male') for name in male_names] + [(name, 'female') for name in female_names])
    random.shuffle(labeled_names)
    
    divider = int(len(labeled_names) * train)
    train_set = apply_features(_gender_features, loadDataset())
    test_set = apply_features(_gender_features, loadDataset())
    
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    acc = nltk.classify.accuracy(classifier, test_set)
    print("test: " + str(train) + ", acc: " + str(acc))
#     classifier.show_most_informative_features(5)
    return acc

In [98]:
test_classifier(65)

test: 65, acc: 0.8649789352738414


0.8649789352738414

In [82]:
test_classifier(65)

test: 65, acc: 0.8417081577939487


0.8417081577939487