In [1]:
import nltk
from nltk.corpus import names
import random
from nltk.classify import apply_features
import pandas as pd

#get data
names = ([(name, 'male') for name in names.words('male.txt')] +
[(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)

In [2]:
def gender_features(name): #gabby
    features = {}
    features["firstletter"] = name[0].lower() #came with code; A
    features["lastletter"] = name[-1].lower() #came with code; B
    features["prefix"] = name[:3].lower() if len(name) > 4 else name[:2].lower() #gabby made; C
    features["suffix"] = name[-3:].lower() if len(name) > 4 else name[-2:].lower()#gabby made; D
    features["final_vowels"] = name[-2:].lower() if name[-1].lower() in ["a", "e", "i", "o", "u"] and name[-2].lower() in ["a", "e", "i", "o", "u"] else False #gabby made; F
    features["name_length"] = len(name)
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features

In [3]:
def gender_features2(name): #gabby
    features = {}
    features["firstletter"] = name[0].lower() #came with code; A
    features["lastletter"] = name[-1].lower() #came with code; B
    features["prefix"] = name[:3].lower() if len(name) > 4 else name[:2].lower() #gabby made; C
    features["suffix"] = name[-3:].lower() if len(name) > 4 else name[-2:].lower()#gabby made; D
    features["final_consonants"] = name[-2:].lower() if name[-1].lower() not in ["a", "e", "i", "o", "u"] and name[-2].lower() not in ["a", "e", "i", "o", "u"] else False #gabby made; E
    features["final_vowels"] = name[-2:].lower() if name[-1].lower() in ["a", "e", "i", "o", "u"] and name[-2].lower() in ["a", "e", "i", "o", "u"] else False #gabby made; F
    features["name_length"] = len(name)
    return features

In [18]:
def gender_features3(name): #gabby
    features = {}
    temp_name = name
    eng_cons_clusters = ["bl", "br", "ch", "cl", "cr", "dr", "fl", "fr", "gl", "gr", "pl", "pr", "sc", "sh", "sk", "sl", "sm", "sn", "sp", "st", "sw", "th", "tr", "tw", "wh", "wr", "sch", "scr", "shr", "sph", "spl", "spr", "squ", "str", "thr"]
    features["firstletter"] = name[0].lower() #came with code; A
    features["lastletter"] = name[-1].lower() #came with code; B
    features["prefix"] = name[:3].lower() if len(name) > 4 else name[:2].lower() #gabby made; C
    features["suffix"] = name[-3:].lower() if len(name) > 4 else name[-2:].lower()#gabby made; D
    clusters = []
    for cluster in eng_cons_clusters[::-1]:
        if cluster in temp_name:
            temp_name = temp_name.replace(cluster, "")
            clusters.append(cluster)
    features["english_consonant_clusters_1"] = cluster[0] if len(clusters) > 0 else None
    features["english_consonant_clusters_2"] = cluster[1] if len(clusters) > 1 else None
    features["english_consonant_clusters_3"] = cluster[2] if len(clusters) > 2 else None
    return features

In [5]:
def accuracy(number_of_runs, function_to_use=gender_features):
    acc_df = {
        "classifier": [],
        "train_set_accuracy": [],
        "test_set_accuracy": [],
        "devtest_set_accuracy": [],
        "devtest_errors": []
    }
    for i in range(number_of_runs):
        random.shuffle(names)
        acc_train_names = names[1500:]
        acc_devtest_names = names[500:1500]
        acc_test_names = names[:500]
        acc_train_set = [(function_to_use(n), g) for (n,g) in acc_train_names]
        acc_devtest_set = [(function_to_use(n), g) for (n,g) in acc_devtest_names]
        acc_test_set = [(function_to_use(n), g) for (n,g) in acc_test_names]
        acc_classifier = nltk.NaiveBayesClassifier.train(acc_train_set)
        acc_df["classifier"].append(acc_classifier)
        acc_df["train_set_accuracy"].append(nltk.classify.accuracy(acc_classifier, acc_train_set))
        acc_df["test_set_accuracy"].append(nltk.classify.accuracy(acc_classifier, acc_test_set))
        acc_df["devtest_set_accuracy"].append(nltk.classify.accuracy(acc_classifier, acc_devtest_set))
        acc_errors = []
        for (name, tag) in acc_devtest_names:
            acc_guess = acc_classifier.classify(function_to_use(name))
            if acc_guess != tag:
                acc_errors.append( (tag, acc_guess, name) )
        acc_df["devtest_errors"].append(acc_errors)
    return(acc_df)

In [6]:
df = accuracy(100)
df = pd.DataFrame.from_dict(df)
print("Gender Features 1")
df.describe()

Gender Features 1


Unnamed: 0,train_set_accuracy,test_set_accuracy,devtest_set_accuracy
count,100.0,100.0,100.0
mean,0.863825,0.82212,0.81914
std,0.002069,0.016713,0.010798
min,0.859094,0.784,0.798
25%,0.862469,0.81,0.81075
50%,0.86406,0.824,0.819
75%,0.865029,0.8345,0.826
max,0.869025,0.862,0.851


In [7]:
df2 = accuracy(100, gender_features2)
df2 = pd.DataFrame.from_dict(df2)
print("Gender Features 2")
df2.describe()

Gender Features 2


Unnamed: 0,train_set_accuracy,test_set_accuracy,devtest_set_accuracy
count,100.0,100.0,100.0
mean,0.8804,0.82822,0.82979
std,0.002139,0.017317,0.010793
min,0.874302,0.78,0.806
25%,0.878957,0.818,0.82175
50%,0.880354,0.828,0.83
75%,0.881828,0.84,0.837
max,0.885785,0.866,0.856


In [19]:
df3 = accuracy(100, gender_features3)
df3 = pd.DataFrame.from_dict(df3)
print("Gender Features 3")
df3.describe()

Gender Features 3


Unnamed: 0,train_set_accuracy,test_set_accuracy,devtest_set_accuracy
count,100.0,100.0,100.0
mean,0.885414,0.82692,0.83088
std,0.002037,0.017628,0.01151
min,0.881595,0.78,0.806
25%,0.883923,0.8135,0.823
50%,0.88532,0.828,0.83
75%,0.886561,0.8365,0.83825
max,0.891217,0.874,0.861


In [9]:
df["classifier"][0].classify(gender_features2("Abney"))

'male'

In [20]:
df3["classifier"][0].show_most_informative_features(50)

Most Informative Features
              lastletter = 'k'              male : female =     60.7 : 1.0
              lastletter = 'a'            female : male   =     30.1 : 1.0
                  suffix = 'ard'            male : female =     25.5 : 1.0
                  suffix = 'tta'          female : male   =     23.2 : 1.0
                  suffix = 'son'            male : female =     20.0 : 1.0
                  suffix = 'ene'          female : male   =     19.3 : 1.0
                  suffix = 'vin'            male : female =     16.9 : 1.0
                  prefix = 'ros'          female : male   =     15.9 : 1.0
                  prefix = 'rod'            male : female =     15.8 : 1.0
                  suffix = 'na'           female : male   =     15.4 : 1.0
                  suffix = 'old'            male : female =     14.9 : 1.0
                  suffix = 'nne'          female : male   =     14.7 : 1.0
                  prefix = 'dor'          female : male   =     14.1 : 1.0

In [17]:
composite_errors = []
for i in list(range(100)):
    for mistake in df3["devtest_errors"][i]:
        if mistake not in composite_errors:
            composite_errors.append(mistake)
composite_errors = list(set(composite_errors))
print("Total unique errors: %d" % (len(composite_errors)))
for (tag, guess, name) in sorted(composite_errors):
    print('correct=%-8s guess=%-8s name=%-20s length=%-8s' % (tag, guess, name, len(name)))

Total unique errors: 2265
correct=female   guess=male     name=Abagael              length=7       
correct=female   guess=male     name=Abagail              length=7       
correct=female   guess=male     name=Abbe                 length=4       
correct=female   guess=male     name=Abbey                length=5       
correct=female   guess=male     name=Abbie                length=5       
correct=female   guess=male     name=Abby                 length=4       
correct=female   guess=male     name=Abigael              length=7       
correct=female   guess=male     name=Abigail              length=7       
correct=female   guess=male     name=Abigale              length=7       
correct=female   guess=male     name=Addie                length=5       
correct=female   guess=male     name=Addis                length=5       
correct=female   guess=male     name=Adel                 length=4       
correct=female   guess=male     name=Adey                 length=4       
correct=fema

correct=male     guess=female   name=Rey                  length=3       
correct=male     guess=female   name=Reza                 length=4       
correct=male     guess=female   name=Rhett                length=5       
correct=male     guess=female   name=Rice                 length=4       
correct=male     guess=female   name=Rich                 length=4       
correct=male     guess=female   name=Richie               length=6       
correct=male     guess=female   name=Ricki                length=5       
correct=male     guess=female   name=Rik                  length=3       
correct=male     guess=female   name=Rikki                length=5       
correct=male     guess=female   name=Ritchie              length=7       
correct=male     guess=female   name=Robbie               length=6       
correct=male     guess=female   name=Robin                length=5       
correct=male     guess=female   name=Rockwell             length=8       
correct=male     guess=female   name=R

In [12]:
def syllables(word): # I was making this to try and find a better way to look at words without importing a new library... meh.
    to_return = []
    consonants = list("bcdfghjklmnpqrstvwxzy")
    vowels = list("aeiouy")
    current_word = word.lower()
    while current_word != "":
        if current_word[0] in vowels and current_word[0] != "y":
            if current_word[1:3] == "we":
                to_return.append(current_word[:3])
                current_word = current_word[3:]
            else:
                if current_word[1] in consonants:
                    to_return.append(current_word[:2])
                    current_word = current_word[2:]
                elif current_word[2] in consonants:
                    to_return.append(current_word[:3])
                    current_word = current_word[3:]
        else:
            vowel_index = None
            for vowel in vowels:
                if vowel in current_word[:5] and vowel_index == None:
                    vowel_index = current_word.index(vowel)
            if len(current_word)-1 != vowel_index:
                if current_word[vowel_index+1] in vowels and current_word[vowel_index+1] != "i":
                    vowel_index += 1
                if current_word[vowel_index+1] == "s" and current_word[vowel_index+2] == "t":
                    to_return.append(current_word[:vowel_index+3])
                    current_word = current_word[vowel_index+3:]
                elif current_word[vowel_index+1] == "t" and current_word[vowel_index+2] == "h":
                    to_return.append(current_word[:vowel_index+3])
                    current_word = current_word[vowel_index+3:]
                else:
                    to_return.append(current_word[:vowel_index+2])
                    current_word = current_word[vowel_index+2:]
            else:
                to_return.append(current_word[:vowel_index+2])
                current_word = current_word[vowel_index+2:]
    return(to_return)