In [27]:
import random
import csv
import string
import math
import nltk
from nltk.corpus import names
from nltk.classify import apply_features
import numpy as np
# import torch
# import torch.nn as nn


In [32]:
def loadNames():
    filename = "namedata.csv"
    names = []
    genders = []
 
    with open(filename) as csvDataFile:
        csvReader = csv.reader(csvDataFile)
        for row in csvReader:
            names.append(row[0])
            genders.append(row[1])

    return names, genders

def loadDataset(shuffled=True):
    """Returns the name->gender dataset ready for processing
    
    Args:
        shuffled (Boolean, optional): set to False to return the dataset unshuffled
    Ret:
        namelist (list(String,String)): list of (name, gender) records
    """
    nms,gns = loadNames()
    namelist = list(zip(nms,gns))
    if shuffled:
        random.shuffle(namelist)
    return namelist

In [15]:
def _gender_features(name):
    features = {}
    features["last_letter"] = name[-1].lower()
    features["first_letter"] = name[0].lower()
    for letter in string.ascii_lowercase:
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    # names ending in -yn are mostly female, names ending in -ch ar mostly male, so add 2 more features
    features["suffix2"] = name[-2:]
    features["suffix3"] = name[-3:]
    return features

In [16]:
def naive_bayes_gender_classifier(train):
    """trains classifier on name->gender
    
    Args:
        train: % of examples to train with (e.g., 0.8)
    """
    
    # get name lists
    labeled_names = namelist.copy()
    random.shuffle(labeled_names)
    
    divider = int(len(labeled_names) * train)
    train_set = apply_features(_gender_features, labeled_names[divider:])
    test_set = apply_features(_gender_features, labeled_names[:divider])
    
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    acc = nltk.classify.accuracy(classifier, test_set)
    print("test: " + str(train) + ", acc: " + str(acc))
#     classifier.show_most_informative_features(5)
    return acc

In [17]:
naive_bayes_gender_classifier(0.5)

test: 0.5, acc: 0.8066465256797583


0.8066465256797583

In [18]:
accuracies = {}
for i in range(50,85,5):
    cum = 0
    num = 10
    for j in range(num):
        cum += naive_bayes_gender_classifier(i * 0.01)
    accuracies[i] = cum / num

test: 0.5, acc: 0.8006042296072508
test: 0.5, acc: 0.7993454179254783
test: 0.5, acc: 0.8011077542799597
test: 0.5, acc: 0.8071500503524672
test: 0.5, acc: 0.8021148036253777
test: 0.5, acc: 0.8026183282980867
test: 0.5, acc: 0.8086606243705942
test: 0.5, acc: 0.8036253776435045
test: 0.5, acc: 0.7990936555891238
test: 0.5, acc: 0.8079053373615307
test: 0.55, acc: 0.7956054016937514
test: 0.55, acc: 0.79995422293431
test: 0.55, acc: 0.7965209430075532
test: 0.55, acc: 0.7983520256351568
test: 0.55, acc: 0.8024719615472649
test: 0.55, acc: 0.7946898603799496
test: 0.55, acc: 0.8006408789196613
test: 0.55, acc: 0.7930876630807965
test: 0.55, acc: 0.7985809109636073
test: 0.55, acc: 0.7910276951247425
test: 0.6, acc: 0.7958455728073857
test: 0.6, acc: 0.7933277381451951
test: 0.6, acc: 0.8006714225765842
test: 0.6, acc: 0.7983634074695761
test: 0.6, acc: 0.8006714225765842
test: 0.6, acc: 0.7968946705832983
test: 0.6, acc: 0.8017205203524969
test: 0.6, acc: 0.7979437683592111
test: 0.6, a

In [19]:
accuracies

{50: 0.8032225579053373,
 55: 0.7970931563286793,
 60: 0.7981955518254302,
 65: 0.8010265349602944,
 70: 0.7969424460431654,
 75: 0.7938905673044647,
 80: 0.7911880409126673}

In [3]:
male_names = names.words('male.txt')
female_names = names.words('female.txt')
labeled_names = ([(name, 'male') for name in male_names] + [(name, 'female') for name in female_names])

In [4]:
len(labeled_names)

7944

In [14]:
# classify a single person's name
male_names = names.words('male.txt')
female_names = names.words('female.txt')
labeled_names = ([(name, 'male') for name in male_names] + [(name, 'female') for name in female_names])
random.shuffle(labeled_names)

divider = int(60)
train_set = apply_features(_gender_features, labeled_names[divider:])
test_set = apply_features(_gender_features, labeled_names[:divider])

classifier = nltk.NaiveBayesClassifier.train(train_set)


classifier.classify(_gender_features("alex"))

'male'

In [23]:
type(labeled_names)

list

[('ginnifer', 'female'),
 ('veer', 'male'),
 ('agustya', 'male'),
 ('jaheam', 'male'),
 ('kesly', 'female'),
 ('sreehan', 'male'),
 ('ernisha', 'female'),
 ('mirjam', 'female'),
 ('perna', 'female'),
 ('kylematthew', 'male'),
 ('deshone', 'male'),
 ('ajisha', 'female'),
 ('dezhan', 'male'),
 ('sevryn', 'male'),
 ('foppe', 'male'),
 ('thinh', 'male'),
 ('saniyah', 'female'),
 ('brityn', 'female'),
 ('tekeira', 'female'),
 ('avry', 'female'),
 ('annunziato', 'male'),
 ('lizeht', 'female'),
 ('alcdia', 'female'),
 ('gurlie', 'female'),
 ('reeya', 'female'),
 ('vinay', 'male'),
 ('seoyoon', 'female'),
 ('evisa', 'female'),
 ('renado', 'male'),
 ('miyona', 'female'),
 ('vaira', 'female'),
 ('rupi', 'female'),
 ('aella', 'female'),
 ('dagbjrt', 'female'),
 ('mairtn', 'male'),
 ('konstantinos', 'male'),
 ('shadee', 'female'),
 ('augustis', 'male'),
 ('chanden', 'male'),
 ('gearald', 'male'),
 ('shaunea', 'female'),
 ('treton', 'male'),
 ('yishu', 'male'),
 ('darrlyn', 'male'),
 ('cashion', 'm

In [39]:
nl = namelist.copy()

In [40]:
random.shuffle(nl)
nl

[('delephine', 'female'),
 ('kaymi', 'female'),
 ('jayni', 'female'),
 ('shareema', 'female'),
 ('brandun', 'male'),
 ('coleon', 'male'),
 ('kemisha', 'female'),
 ('crestina', 'female'),
 ('kadir', 'male'),
 ('guisel', 'female'),
 ('meifeng', 'male'),
 ('wilder', 'male'),
 ('kazuyo', 'female'),
 ('danny', 'male'),
 ('devontra', 'male'),
 ('aaliah', 'female'),
 ('trevis', 'male'),
 ('taishi', 'male'),
 ('sigrn', 'female'),
 ('daemen', 'male'),
 ('huashan', 'male'),
 ('torence', 'male'),
 ('kenaya', 'female'),
 ('najaah', 'female'),
 ('katsumi', 'male'),
 ('jionny', 'male'),
 ('chyanne', 'female'),
 ('kiajah', 'female'),
 ('chasmin', 'female'),
 ('haney', 'male'),
 ('adesola', 'female'),
 ('mingyue', 'male'),
 ('shirlon', 'female'),
 ('zaharra', 'female'),
 ('serafine', 'female'),
 ('shiniya', 'female'),
 ('janiza', 'female'),
 ('markchristopher', 'male'),
 ('hanora', 'female'),
 ('edwynn', 'male'),
 ('sterie', 'male'),
 ('kiaralee', 'female'),
 ('suhas', 'male'),
 ('janella', 'female'),

In [41]:
namelist

[('deshard', 'male'),
 ('lavette', 'female'),
 ('kaifeng', 'male'),
 ('chidubem', 'male'),
 ('bladyn', 'male'),
 ('shixiong', 'male'),
 ('juliett', 'female'),
 ('prabhakar', 'male'),
 ('birglinde', 'female'),
 ('dagrunn', 'female'),
 ('danett', 'female'),
 ('autumn', 'female'),
 ('chongda', 'male'),
 ('inhong', 'male'),
 ('roiann', 'female'),
 ('jacyion', 'male'),
 ('benisha', 'female'),
 ('juliannah', 'female'),
 ('jurinus', 'male'),
 ('rondald', 'male'),
 ('chalia', 'female'),
 ('brune', 'female'),
 ('alara', 'female'),
 ('lasharon', 'female'),
 ('elliekate', 'female'),
 ('mik', 'male'),
 ('vigneswaran', 'male'),
 ('geneal', 'female'),
 ('ayden', 'male'),
 ('liwei', 'male'),
 ('elisjah', 'male'),
 ('aayana', 'female'),
 ('lourdez', 'female'),
 ('leartis', 'male'),
 ('mogens', 'male'),
 ('dalayah', 'female'),
 ('chantry', 'male'),
 ('finnborg', 'female'),
 ('nedaa', 'female'),
 ('johannes', 'male'),
 ('yuleni', 'female'),
 ('likang', 'male'),
 ('vivvian', 'female'),
 ('sung', 'male'),

In [47]:
loadDataset(False)

[('ginnifer', 'female'),
 ('veer', 'male'),
 ('agustya', 'male'),
 ('jaheam', 'male'),
 ('kesly', 'female'),
 ('sreehan', 'male'),
 ('ernisha', 'female'),
 ('mirjam', 'female'),
 ('perna', 'female'),
 ('kylematthew', 'male'),
 ('deshone', 'male'),
 ('ajisha', 'female'),
 ('dezhan', 'male'),
 ('sevryn', 'male'),
 ('foppe', 'male'),
 ('thinh', 'male'),
 ('saniyah', 'female'),
 ('brityn', 'female'),
 ('tekeira', 'female'),
 ('avry', 'female'),
 ('annunziato', 'male'),
 ('lizeht', 'female'),
 ('alcdia', 'female'),
 ('gurlie', 'female'),
 ('reeya', 'female'),
 ('vinay', 'male'),
 ('seoyoon', 'female'),
 ('evisa', 'female'),
 ('renado', 'male'),
 ('miyona', 'female'),
 ('vaira', 'female'),
 ('rupi', 'female'),
 ('aella', 'female'),
 ('dagbjrt', 'female'),
 ('mairtn', 'male'),
 ('konstantinos', 'male'),
 ('shadee', 'female'),
 ('augustis', 'male'),
 ('chanden', 'male'),
 ('gearald', 'male'),
 ('shaunea', 'female'),
 ('treton', 'male'),
 ('yishu', 'male'),
 ('darrlyn', 'male'),
 ('cashion', 'm