In [2]:
import nltk
from nltk.corpus import names
import random
from nltk.classify import apply_features
import pandas as pd

# Get data: pair every name in the NLTK names corpus with its gender label
# (all male entries first, then all female entries), then shuffle in place.
# NOTE: this rebinds `names` from the corpus reader to the labelled list, so
# the import at the top of this cell must run again before this code does.
names = [
    (name, label)
    for label, filename in (('male', 'male.txt'), ('female', 'female.txt'))
    for name in names.words(filename)
]
random.shuffle(names)

In [3]:
def gender_features(word):
    """Return a one-entry feature dict holding the last character of ``word``.

    The character is kept exactly as written (no case folding).
    Raises IndexError when ``word`` is empty.
    """
    final_char = word[-1]
    return dict(last_letter=final_char)

In [4]:
def gender_features2(name):
    """Build the feature dictionary used to classify ``name`` by gender.

    Returned keys (all string values are lower-cased):
      * ``firstletter`` / ``lastletter`` -- first and last character
        (``""`` for an empty name).
      * ``prefix`` / ``suffix`` -- first/last three characters when the name
        is longer than four characters, otherwise the first/last two.
      * ``final_vowels`` -- the last two characters when both are one of
        a/e/i/o/u, otherwise ``False``.
      * ``name_length`` -- ``len(name)``.
      * ``count(x)`` / ``has(x)`` for every letter a-z -- number of
        occurrences of the letter and whether it occurs at all.

    Names shorter than two characters are handled without raising: a
    one-letter vowel name such as "A" previously hit ``name[-2]`` and raised
    IndexError, and ``""`` raised on ``name[0]``.
    """
    vowels = ("a", "e", "i", "o", "u")
    # Lower-case once instead of on every iteration of the letter loop below.
    lowered = name.lower()
    long_name = len(name) > 4
    last_two = name[-2:]

    features = {}
    # Slices (not indexes) so an empty name yields "" rather than IndexError.
    features["firstletter"] = name[:1].lower() #came with code; A
    features["lastletter"] = name[-1:].lower() #came with code; B
    features["prefix"] = name[:3].lower() if long_name else name[:2].lower() #gabby made; C
    features["suffix"] = name[-3:].lower() if long_name else name[-2:].lower() #gabby made; D
    #features["final_consonants"] = name[-2:].lower() if name[-1].lower() not in ["a", "e", "i", "o", "u"] and name[-2].lower() not in ["a", "e", "i", "o", "u"] else False #gabby made; E
    # Require two real characters before testing them; otherwise report False.
    ends_in_two_vowels = len(last_two) == 2 and all(ch.lower() in vowels for ch in last_two)
    features["final_vowels"] = last_two.lower() if ends_in_two_vowels else False #gabby made; F
    features["name_length"] = len(name)
    #features["femme_endings"] = name[-4:].lower() if len(name) > 4 and name[-4].lower() in ["a", "e", "i", "o", "u"] and name[-3].lower() in ["a", "e", "i", "o", "u"] and name[-2].lower() == name[-3].lower() and name[-1].lower() in ["a", "e", "i", "o", "u", "y"] else False
    #name_len = len(name)
    #total_vowels = name.lower().count("a") + name.lower().count("e") + name.lower().count("i") + name.lower().count("o") + name.lower().count("u")
    #features["perc_consonants"] = int((name_len-total_vowels)/name_len)
    #features["perc_vowels"] = int(total_vowels/name_len)
    ### with the seven above, 0.829
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        occurrences = lowered.count(letter)
        features["count(%s)" % letter] = occurrences
        features["has(%s)" % letter] = occurrences > 0
    return features

In [28]:
def accuracy(number_of_runs, function_to_use=gender_features2, test_size=500, devtest_size=1000):
    """Train and score a Naive Bayes gender classifier on repeated random splits.

    Each run shuffles a copy of the module-level ``names`` list of
    ``(name, gender)`` pairs, slices it into test / dev-test / training
    portions, trains ``nltk.NaiveBayesClassifier`` on the training portion and
    records its accuracy on all three portions.

    Parameters:
        number_of_runs: how many independent shuffle/train/evaluate rounds to do.
        function_to_use: feature extractor mapping a name to a feature dict.
            It is called once per name per run, so it should be deterministic.
        test_size: number of names held out as the test set (default 500).
        devtest_size: number of names held out as the dev-test set (default 1000).

    Returns:
        A dict of equal-length lists with one entry per run, keyed by
        ``"classifier"``, ``"train_set_accuracy"``, ``"test_set_accuracy"``,
        ``"devtest_set_accuracy"`` and ``"devtest_errors"``. Each
        ``"devtest_errors"`` entry is a list of ``(correct_tag, guess, name)``
        tuples for the misclassified dev-test names.

    Raises:
        ValueError: if a split size is negative or the held-out portions would
            leave no names to train on.
    """
    if test_size < 0 or devtest_size < 0:
        raise ValueError("test_size and devtest_size must be non-negative")
    devtest_end = test_size + devtest_size
    if len(names) <= devtest_end:
        raise ValueError(
            "need more than %d names to leave a training set, got %d"
            % (devtest_end, len(names))
        )

    acc_df = {
        "classifier": [],
        "train_set_accuracy": [],
        "test_set_accuracy": [],
        "devtest_set_accuracy": [],
        "devtest_errors": []
    }
    for _ in range(number_of_runs):
        # Shuffle a copy so calling this function never reorders the shared
        # module-level `names` list that other cells read.
        shuffled_names = list(names)
        random.shuffle(shuffled_names)
        acc_test_names = shuffled_names[:test_size]
        acc_devtest_names = shuffled_names[test_size:devtest_end]
        acc_train_names = shuffled_names[devtest_end:]

        acc_train_set = [(function_to_use(n), g) for (n, g) in acc_train_names]
        acc_devtest_set = [(function_to_use(n), g) for (n, g) in acc_devtest_names]
        acc_test_set = [(function_to_use(n), g) for (n, g) in acc_test_names]

        acc_classifier = nltk.NaiveBayesClassifier.train(acc_train_set)
        acc_df["classifier"].append(acc_classifier)
        acc_df["train_set_accuracy"].append(nltk.classify.accuracy(acc_classifier, acc_train_set))
        acc_df["test_set_accuracy"].append(nltk.classify.accuracy(acc_classifier, acc_test_set))
        acc_df["devtest_set_accuracy"].append(nltk.classify.accuracy(acc_classifier, acc_devtest_set))

        # Reuse the dev-test feature dicts built above instead of extracting
        # features a second time for every dev-test name.
        acc_errors = []
        for (name, tag), (features, _) in zip(acc_devtest_names, acc_devtest_set):
            acc_guess = acc_classifier.classify(features)
            if acc_guess != tag:
                acc_errors.append( (tag, acc_guess, name) )
        acc_df["devtest_errors"].append(acc_errors)
    return acc_df

In [29]:
# Run 100 shuffle/train/evaluate rounds and tabulate them, one row per run;
# describe() then summarises the three numeric accuracy columns.
df = pd.DataFrame.from_dict(accuracy(100))
df.describe()

Unnamed: 0,train_set_accuracy,test_set_accuracy,devtest_set_accuracy
count,100.0,100.0,100.0
mean,0.863881,0.81828,0.82043
std,0.002185,0.016035,0.011711
min,0.858628,0.786,0.796
25%,0.86243,0.808,0.813
50%,0.864137,0.819,0.8215
75%,0.865185,0.83,0.82825
max,0.868715,0.85,0.849


In [30]:
# Spot-check: ask the classifier trained in the first run (row 0) to label "Abney".
df.loc[0, "classifier"].classify(gender_features2("Abney"))

'female'

In [31]:
# List the 50 features the first run's classifier (row 0) found most informative.
df.loc[0, "classifier"].show_most_informative_features(50)

Most Informative Features
              lastletter = 'k'              male : female =     36.7 : 1.0
            final_vowels = 'ia'           female : male   =     32.2 : 1.0
              lastletter = 'a'            female : male   =     30.6 : 1.0
                  suffix = 'tta'          female : male   =     22.3 : 1.0
                  suffix = 'vin'            male : female =     17.3 : 1.0
                  suffix = 'na'           female : male   =     16.7 : 1.0
                  suffix = 'nne'          female : male   =     16.4 : 1.0
            final_vowels = 'io'             male : female =     15.5 : 1.0
                  prefix = 'ros'          female : male   =     15.1 : 1.0
                  suffix = 'old'            male : female =     14.2 : 1.0
                  prefix = 'rod'            male : female =     14.1 : 1.0
              lastletter = 'f'              male : female =     13.4 : 1.0
                  prefix = 'dor'          female : male   =     13.4 : 1.0

In [34]:
# Report every dev-test name the first run's classifier got wrong, sorted by
# (correct tag, guess, name), preceded by the total count.
first_run_errors = df["devtest_errors"][0]
print("Total errors: %d" % len(first_run_errors))
row_format = 'correct=%-8s guess=%-8s name=%-20s length=%-8s'
for (tag, guess, name) in sorted(first_run_errors):
    print(row_format % (tag, guess, name, len(name)))

Total errors: 182
correct=female   guess=male     name=Adorne               length=6       
correct=female   guess=male     name=Aryn                 length=4       
correct=female   guess=male     name=Astrid               length=6       
correct=female   guess=male     name=Avrit                length=5       
correct=female   guess=male     name=Bert                 length=4       
correct=female   guess=male     name=Berty                length=5       
correct=female   guess=male     name=Beulah               length=6       
correct=female   guess=male     name=Bidget               length=6       
correct=female   guess=male     name=Britt                length=5       
correct=female   guess=male     name=Bryn                 length=4       
correct=female   guess=male     name=Cam                  length=3       
correct=female   guess=male     name=Charlot              length=7       
correct=female   guess=male     name=Christan             length=8       
correct=female   gue

In [13]:
def syllables(word): # I was making this to try and find a better way to look at words without importing a new library... meh.
    """Split ``word`` into rough, syllable-like chunks using spelling heuristics.

    The word is lower-cased and consumed left to right. Joining the returned
    chunks always reproduces ``word.lower()``.

    Rules, applied to whatever is left of the word:
      * Remainder starts with a/e/i/o/u: take three characters when the vowel
        is followed by "we", two when it is followed by a consonant, and
        otherwise (up to) three.
      * Remainder starts with anything else (including "y"): locate the first
        a/e/i/o/u within the first five characters, falling back to "y" when
        there is none. If the next character is also a vowel other than "i",
        step onto it. Cut one character after that vowel, or two characters
        after it when it is followed by "st" or "th".
      * No vowel within the first five characters: the rest of the word becomes
        the final chunk.

    Fixes relative to the first draft, all by using slices and explicit
    fallbacks so every pass consumes at least one character:
      * no more infinite loop on a vowel followed by two non-consonants
        (e.g. "aeiou");
      * no more IndexError when a lookahead runs past the end of the word
        (e.g. "a", "bee", "Abney");
      * no more TypeError when no vowel is found (e.g. "s");
      * the vowel is chosen by position rather than alphabetical priority,
        so "lena" is split as ["len", "a"] instead of ["lena"].

    Returns a list of lower-case strings; ``[]`` for an empty word.
    """
    consonants = set("bcdfghjklmnpqrstvwxzy")  # "y" is deliberately in both sets
    vowels = set("aeiouy")
    strong_vowels = "aeiou"

    to_return = []
    current_word = word.lower()
    while current_word:
        if current_word[0] in strong_vowels:
            # Slices return "" past the end of the word instead of raising.
            if current_word[1:3] != "we" and current_word[1:2] in consonants:
                cut = 2
            else:
                # "we" follows, or the next character is not a consonant (or is
                # missing): take up to three characters so the loop advances.
                cut = 3
        else:
            window = current_word[:5]
            vowel_index = next(
                (i for i, ch in enumerate(window) if ch in strong_vowels), None
            )
            if vowel_index is None:
                # "y" only acts as the vowel when no a/e/i/o/u is available.
                y_position = window.find("y")
                vowel_index = y_position if y_position >= 0 else None
            if vowel_index is None:
                # Nothing vowel-like nearby: emit the remainder and stop.
                to_return.append(current_word)
                break
            following = current_word[vowel_index + 1:vowel_index + 2]
            if following in vowels and following != "i":
                # Keep a vowel pair together (but not when it ends in "i").
                vowel_index += 1
            if current_word[vowel_index + 1:vowel_index + 3] in ("st", "th"):
                cut = vowel_index + 3
            else:
                cut = vowel_index + 2
        to_return.append(current_word[:cut])
        current_word = current_word[cut:]
    return to_return