In [50]:
import pandas as pd
import nltk
from nltk.classify import 
from nltk.corpus import names
import pronouncing
import phonetics

import random

#
# Basic Feature Extraction

def gender_features(word):
    """Return the baseline feature dict for a name: only its final character."""
    final_char = word[-1]
    return dict(last_letter=final_char)


# Pair every name in the NLTK names corpus with its gender label, males first.
labeled_names = [(name, 'male') for name in names.words('male.txt')]
labeled_names.extend((name, 'female') for name in names.words('female.txt'))

# Fix the seed so the shuffle (and every split derived from it) is reproducible.
random.seed(102)
random.shuffle(labeled_names)

In [25]:
# 
#  Let's make sure the dictionary keys are globally unique by imposing
#  a namespace qualifier.
#  Use a prefix per feature type.
#  Each extractor function computes only one feature type,
#  so all features from one extractor function share the same prefix.
# -------------------------------------------------------------------------

#   generates the dictionary key name:  
#     var_type = 1  =>  Alphabetical
#     var_type = 2  =>  Integer
#     var_type = 3  =>  Boolean
# -----------------------------------------------
def make_key_name(prefix, var_type, key):
    """Build a namespaced feature key of the form ``<prefix>_<tag>_<key>``.

    The tag is "A" when ``var_type`` is 1, "N" when it is 2, and "B" for
    any other value.
    """
    type_tag = "A" if var_type == 1 else ("N" if var_type == 2 else "B")
    return "_".join((prefix, type_tag, key))


def extract_response(gender):
    """Wrap the gender label in a one-entry dict under the response ("R") prefix."""
    return {make_key_name("R", 1, "gender"): gender}


def count_consonants(string):
    """Count the consonant letters in ``string``.

    A consonant is any alphabetic character other than a, e, i, o, u,
    compared case-insensitively. Non-letters (hyphens, apostrophes, spaces,
    digits) are ignored instead of being miscounted as consonants.

    Returns:
        int: number of consonant letters; 0 for an empty string.
    """
    vowels = frozenset("aeiou")
    return sum(
        1 for char in string
        if char.isalpha() and char.lower() not in vowels
    )

def count_vowels(string):
    """Count the vowels (a, e, i, o, u) in ``string``, ignoring case.

    Uppercase vowels are counted too, so the result no longer depends on
    the caller lower-casing the input first.

    Returns:
        int: number of vowel characters; 0 for an empty string.
    """
    vowels = frozenset("aeiou")
    return sum(1 for char in string if char.lower() in vowels)

def extract_basic_features(name):
    """Build the spelling-based ("B"-prefixed) feature dict for a name.

    Features, in insertion order: the name itself, its first, second and last
    letters (lower-cased), its length, its consonant and vowel counts, and the
    primary Double Metaphone code together with that code's length.

    Letter lookups use slices, so a name too short to have a given letter
    yields "" for it instead of raising IndexError.
    """
    pre = "B"
    lowered = name.lower()  # computed once; both counters work on lower case
    features = {}

    features[make_key_name(pre, 1, "name")] = name
    # name[i:i+1] equals name[i] when that position exists and "" otherwise.
    features[make_key_name(pre, 1, "firstletter")] = name[:1].lower()
    features[make_key_name(pre, 1, "secondletter")] = name[1:2].lower()
    features[make_key_name(pre, 1, "lastletter")] = name[-1:].lower()
    features[make_key_name(pre, 2, "length")] = len(name)

    # Count the consonants and vowels.
    features[make_key_name(pre, 2, "numconsonants")] = count_consonants(lowered)
    features[make_key_name(pre, 2, "numvowels")] = count_vowels(lowered)

    # Double Metaphone is usually defined for all names; fall back to "" if the
    # result is empty.
    dmeta = phonetics.dmetaphone(name)
    primary_code = dmeta[0] if len(dmeta) > 0 else ""

    features[make_key_name(pre, 1, "dmetacode")] = primary_code
    features[make_key_name(pre, 2, "dmetalen")] = len(primary_code)

    return features

In [29]:
def extract_phonetic_features(name):
    """Build the pronunciation-based ("P"-prefixed) feature dict for a name.

    Only the first pronunciation returned by ``pronouncing.phones_for_word``
    is used. When none is returned, string features are "", numeric features
    are 0 and the ``phfound`` flag is False.
    """
    pre = "P"

    def slot_key(position):
        # Key for the phoneme at a given position, e.g. 3 -> "P_A_phx_03".
        return make_key_name(pre, 1, "phx_" + str(position).zfill(2))

    # Pre-fill twelve phoneme slots with "" so these keys exist for every name;
    # a pronunciation longer than twelve phonemes adds further slots below.
    features = {slot_key(position): "" for position in range(12)}

    pronunciations = pronouncing.phones_for_word(name)
    found = len(pronunciations) > 0

    if found:
        phcode = pronunciations[0]
        stressed_phonemes = phcode.split()

        # Strip digit characters (the stress markers, e.g. "IH1" -> "IH").
        bare_phonemes = [
            "".join(ch for ch in phoneme if not ch.isdigit())
            for phoneme in stressed_phonemes
        ]
        for position, phoneme in enumerate(bare_phonemes):
            features[slot_key(position)] = phoneme

        num_syllables = pronouncing.syllable_count(phcode)
        # 1-based index of the first '1' in the stress string; 0 when absent.
        stress_pos = pronouncing.stresses(phcode).find('1') + 1
        last_phoneme = bare_phonemes[-1] if bare_phonemes else ""
        first_phoneme = stressed_phonemes[0]
        phoneme_count = len(stressed_phonemes)
    else:
        phcode = ""
        num_syllables = 0
        stress_pos = 0
        last_phoneme = ""
        first_phoneme = ""
        phoneme_count = 0

    features[make_key_name(pre, 1, "phcode")] = phcode
    features[make_key_name(pre, 1, "phfirst")] = first_phoneme
    features[make_key_name(pre, 2, "phlen")] = phoneme_count
    features[make_key_name(pre, 2, "phsyllables")] = num_syllables
    features[make_key_name(pre, 3, "phfound")] = found
    features[make_key_name(pre, 2, "phx_stress")] = stress_pos
    features[make_key_name(pre, 1, "phx_last")] = last_phoneme
    return features


In [82]:
#  Helper function to extract basic and phonetic features. 
#
def extract_advanced_features(name):
    """Return the basic features for ``name`` merged with its phonetic features.

    Basic keys come first; a phonetic value would win on any shared key.
    """
    combined = dict(extract_basic_features(name))
    combined.update(extract_phonetic_features(name))
    return combined
    

In [83]:
# Build every feature table in one pass over the shuffled names, tallying how
# many names have a pronunciation available (overall and per gender).

# Coverage tallies: "isin" = pronunciation found, "notin" = not found.
total_isin = total_notin = 0
total_female_isin = total_female_notin = 0
total_male_isin = total_male_notin = 0

# One dict per name (all features plus the gender response); becomes df_features.
df_row_sets = []

# (features, gender) pairs built from the basic features only.
basic_feature_sets = []

# (features, gender) pairs built from basic + phonetic features, and the
# matching (name, gender) pairs; both hold only names with a pronunciation.
phonetic_feature_sets = []
phonetic_names = []

i = 0  # stays 0 when labeled_names is empty
for i, (n, g) in enumerate(labeled_names, start=1):
    gfeatures = extract_response(g)
    bfeatures = extract_basic_features(n)
    afeatures = extract_advanced_features(n)

    all_features = dict(afeatures)
    all_features.update(gfeatures)
    df_row_sets.append(all_features)
    basic_feature_sets.append((bfeatures, g))

    isin = all_features["P_B_phfound"]
    is_female = (g == "female")

    # Tabulate statistics on the available pronunciations.
    if not isin:
        total_notin += 1
        if is_female:
            total_female_notin += 1
        else:
            total_male_notin += 1
    else:
        total_isin += 1

        # The advanced model needs phonetic data, so keep the name, gender
        # and feature set only when a pronunciation was found.
        phonetic_names.append((n, g))
        phonetic_feature_sets.append((afeatures, g))

        if is_female:
            total_female_isin += 1
        else:
            total_male_isin += 1

    # For debugging: echo only the first nine names.
    if i >= 10:
        continue

    num_syllables = all_features["P_N_phsyllables"]
    pronunciation = all_features["P_A_phcode"]
    print(" Name: ", n, " isin: ", isin, "gender: ", g,
          "Phonetic: ", all_features["B_A_dmetacode"],
          "Pronounciation: ", pronunciation,
          " syllables: ", num_syllables)


df_features = pd.DataFrame.from_dict(df_row_sets, orient='columns')

 Name:  Bridie  isin:  True gender:  female Phonetic:  PRT Pronounciation:  B R IH1 D IY0  syllables:  2
 Name:  Edeline  isin:  True gender:  female Phonetic:  ATLN Pronounciation:  EH1 D IH0 L AY0 N  syllables:  3
 Name:  Molli  isin:  False gender:  female Phonetic:  ML Pronounciation:    syllables:  0
 Name:  Ingeberg  isin:  False gender:  female Phonetic:  ANJPRK Pronounciation:    syllables:  0
 Name:  Willmott  isin:  True gender:  male Phonetic:  ALMT Pronounciation:  W IH1 L M AH0 T  syllables:  2
 Name:  Gilda  isin:  True gender:  female Phonetic:  KLT Pronounciation:  G IH1 L D AH0  syllables:  2
 Name:  Albertina  isin:  True gender:  female Phonetic:  ALPRTN Pronounciation:  AA0 L B ER0 T IY1 N AH0  syllables:  4
 Name:  Nickolas  isin:  True gender:  male Phonetic:  NKLS Pronounciation:  N IH1 K AH0 L AH0 Z  syllables:  3
 Name:  Chandler  isin:  True gender:  male Phonetic:  XNTLR Pronounciation:  CH AE1 N D L ER0  syllables:  2


In [91]:

# Report how many names have a pronunciation, overall and by gender.
print("Total Isin: ", total_isin, " Total Not In: ", total_notin, " Sum: " , total_isin + total_notin)

print("Total Female ISIN:  ", total_female_isin , " Total Male ISIN:  ", total_male_isin )

print("Total Female NOTIN: ", total_female_notin, " Total Male Not In:", total_male_notin )


# Split boundaries: the first TEST_SIZE rows are the test set, the rows from
# TEST_SIZE up to DEVTEST_END are the devtest set, and the rest is training.
TEST_SIZE = 500
DEVTEST_END = 1000

# We will export the training and devtest data jointly.
df_train_large = df_features[TEST_SIZE:]
df_test = df_features[:TEST_SIZE]

basic_feature_sets_test = basic_feature_sets[:TEST_SIZE]
basic_feature_sets_devtest = basic_feature_sets[TEST_SIZE:DEVTEST_END]
basic_feature_sets_train = basic_feature_sets[DEVTEST_END:]

# NOTE: the phonetic lists contain only names with a pronunciation, so these
# slices do not cover the same names as the basic splits above.
phonetic_feature_sets_test = phonetic_feature_sets[:TEST_SIZE]
phonetic_feature_sets_devtest = phonetic_feature_sets[TEST_SIZE:DEVTEST_END]
phonetic_feature_sets_train = phonetic_feature_sets[DEVTEST_END:]

phonetic_names_test = phonetic_names[:TEST_SIZE]
phonetic_names_devtest = phonetic_names[TEST_SIZE:DEVTEST_END]
phonetic_names_train = phonetic_names[DEVTEST_END:]
# Alias for the original, misspelled variable name, kept so any cell that
# still refers to it continues to work.
phone_names_train = phonetic_names_train

df_features.to_csv("names_all.csv", index=False)
df_train_large.to_csv("names_train.csv", index=False)
df_test.to_csv("names_test.csv", index=False)

print("\nGenerated names with features in dataframe format for test-train split\n")

Total Isin:  5251  Total Not In:  2693  Sum:  7944
Total Female ISIN:   2820  Total Male ISIN:   2431
Total Female NOTIN:  2181  Total Male Not In: 512

Generated names with features in dataframe format for test-train split



Lastly, we check the sizes of the test, devtest and training feature sets for both the basic features and the phonetic features.
We can see below that the test and devtest sets are each the required size: 500.
The phonetic training set is smaller - roughly 4200 instead of 6900 - due to limitations in the CMU pronunciation dictionary.


In [92]:
# Sizes of the basic-feature splits, in (test, devtest, train) order.
tuple(map(len, (basic_feature_sets_test, basic_feature_sets_devtest, basic_feature_sets_train)))

(500, 500, 6944)

In [93]:
# Sizes of the phonetic-feature splits, in (test, devtest, train) order.
tuple(map(len, (phonetic_feature_sets_test, phonetic_feature_sets_devtest, phonetic_feature_sets_train)))

(500, 500, 4251)

## Run a Naive Bayes Model

Let's run the model using the features above.  Let's start with the Basic Feature Set and get the accuracy and the most informative features.
We'll then repeat these steps for the larger feature set with phonetic attributes.
Lastly, we'll consider the confusion matrix and the drivers of error.

In [94]:
# Fit Naive Bayes on the basic-feature training split, then score it on devtest.
classifier_nb_basic = nltk.NaiveBayesClassifier.train(basic_feature_sets_train)

accuracy_nb_basic = nltk.classify.accuracy(classifier_nb_basic, basic_feature_sets_devtest)
print(accuracy_nb_basic)


0.788


In [95]:
# List the 20 features the basic model finds most informative.
classifier_nb_basic.show_most_informative_features(n=20)

Most Informative Features
          B_A_lastletter = 'k'              male : female =     41.1 : 1.0
          B_A_lastletter = 'a'            female : male   =     32.1 : 1.0
          B_A_lastletter = 'v'              male : female =     16.4 : 1.0
          B_A_lastletter = 'f'              male : female =     13.9 : 1.0
           B_A_dmetacode = 'MRN'          female : male   =     13.3 : 1.0
           B_A_dmetacode = 'ALTN'           male : female =     12.8 : 1.0
           B_A_dmetacode = 'KRSTN'        female : male   =     10.7 : 1.0
          B_A_lastletter = 'd'              male : female =     10.2 : 1.0
          B_A_lastletter = 'p'              male : female =      9.9 : 1.0
           B_A_dmetacode = 'ANT'          female : male   =      9.8 : 1.0
        B_A_secondletter = 'z'              male : female =      9.6 : 1.0
           B_A_dmetacode = 'JLN'          female : male   =      9.4 : 1.0
          B_A_lastletter = 'm'              male : female =      8.9 : 1.0

Now let's consider the Naive Bayes Model on the Advanced Feature Set.

In [96]:
# Fit Naive Bayes on the basic + phonetic training split, then score it on devtest.
classifier_nb_phonetic = nltk.NaiveBayesClassifier.train(phonetic_feature_sets_train)

accuracy_nb_phonetic = nltk.classify.accuracy(classifier_nb_phonetic, phonetic_feature_sets_devtest)
print(accuracy_nb_phonetic)

0.766


In [97]:
# List the 20 features the phonetic model finds most informative.
classifier_nb_phonetic.show_most_informative_features(n=20)

Most Informative Features
          B_A_lastletter = 'a'            female : male   =     31.9 : 1.0
              P_A_phx_06 = 'OW'             male : female =     25.9 : 1.0
            P_A_phx_last = 'AH'           female : male   =     21.4 : 1.0
          B_A_lastletter = 'k'              male : female =     20.7 : 1.0
              P_A_phx_05 = 'Z'              male : female =     13.5 : 1.0
            P_A_phx_last = 'F'              male : female =     12.8 : 1.0
              P_A_phx_02 = 'W'              male : female =     11.2 : 1.0
            P_A_phx_last = 'AA'           female : male   =     10.2 : 1.0
           B_A_dmetacode = 'LL'           female : male   =      9.8 : 1.0
           B_A_dmetacode = 'MRN'          female : male   =      9.8 : 1.0
              P_A_phx_02 = 'NG'             male : female =      9.7 : 1.0
              P_A_phx_05 = 'OW'             male : female =      9.5 : 1.0
            P_A_phx_last = 'K'              male : female =      9.4 : 1.0

Now let's evaluate the errors in the phonetic model.

In [104]:
# True labels and model predictions for every devtest name, in matching order.
actual_nb_phonetic = [g for (_, g) in phonetic_names_devtest]
preds_nb_phonetic = [
    classifier_nb_phonetic.classify(extract_advanced_features(n))
    for (n, _) in phonetic_names_devtest
]

# Misclassified names, stored as (predicted, actual, name).
errors = [
    (predicted, g, n)
    for predicted, (n, g) in zip(preds_nb_phonetic, phonetic_names_devtest)
    if predicted != g
]

In [100]:
# Display every misclassified devtest name as (predicted, actual, name).
errors

[('male', 'female', 'Ann'),
 ('male', 'female', 'Jan'),
 ('female', 'male', 'Geri'),
 ('female', 'male', 'Cody'),
 ('female', 'male', 'Rey'),
 ('male', 'female', 'Eve'),
 ('male', 'female', 'Margret'),
 ('female', 'male', 'Quillan'),
 ('male', 'female', 'Gabriel'),
 ('female', 'male', 'Kerry'),
 ('male', 'female', 'Shea'),
 ('female', 'male', 'Marwin'),
 ('male', 'female', 'Gertrude'),
 ('male', 'female', 'Agnes'),
 ('female', 'male', 'Dannie'),
 ('female', 'male', 'Farley'),
 ('female', 'male', 'Myron'),
 ('female', 'male', 'Kyle'),
 ('male', 'female', 'Hatty'),
 ('male', 'female', 'Hattie'),
 ('female', 'male', 'Lennie'),
 ('male', 'female', 'Tory'),
 ('female', 'male', 'Daryl'),
 ('male', 'female', 'Faustine'),
 ('male', 'female', 'Orel'),
 ('male', 'female', 'Danny'),
 ('male', 'female', 'Glenn'),
 ('female', 'male', 'Timothy'),
 ('male', 'female', 'Sukey'),
 ('male', 'female', 'Tommy'),
 ('female', 'male', 'Rory'),
 ('female', 'male', 'Tracey'),
 ('male', 'female', 'Raye'),
 ('fem

In [105]:
from nltk.metrics import ConfusionMatrix

# First argument is the reference (true) labels, second the model's predictions.
confusion_nb_phonetic = ConfusionMatrix(actual_nb_phonetic, preds_nb_phonetic)
print(confusion_nb_phonetic)


       |   f     |
       |   e     |
       |   m   m |
       |   a   a |
       |   l   l |
       |   e   e |
-------+---------+
female |<201> 58 |
  male |  59<182>|
-------+---------+
(row = reference; col = test)

