## Gender Classification Of Names
### Using Machine Learning To Detect/Predict Gender of Individuals 
+ Sklearn
+ Pandas
+ Text Extraction

In [1]:
# EDA packages
import pandas as pd
import numpy as np


In [2]:
# ML Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer


In [3]:
# Load our data
df = pd.read_csv('NationalNames.csv')

In [4]:
df.head(-20)

Unnamed: 0,Id,Name,Year,Gender,Count
0,1,Mary,1880,F,7065
1,2,Anna,1880,F,2604
2,3,Emma,1880,F,2003
3,4,Elizabeth,1880,F,1939
4,5,Minnie,1880,F,1746
...,...,...,...,...,...
1825408,1825409,Zephen,2014,M,5
1825409,1825410,Zerick,2014,M,5
1825410,1825411,Zhaiden,2014,M,5
1825411,1825412,Zhalen,2014,M,5


In [5]:
df.size
df.count()

Id        1825433
Name      1825433
Year      1825433
Gender    1825433
Count     1825433
dtype: int64

In [6]:
# Data Cleaning
# Checking for column name consistency
df.columns

Index(['Id', 'Name', 'Year', 'Gender', 'Count'], dtype='object')

In [7]:
# Data Types
df.dtypes

Id         int64
Name      object
Year       int64
Gender    object
Count      int64
dtype: object

In [8]:
# Checking for Missing Values
df.isnull().sum()

Id        0
Name      0
Year      0
Gender    0
Count     0
dtype: int64

In [9]:
# Number of Female Names
df[df.Gender == 'F'].size
df[df.Gender == 'F'].count()


Id        1081683
Name      1081683
Year      1081683
Gender    1081683
Count     1081683
dtype: int64

In [10]:
# Number of Male Names
df[df.Gender == 'M'].size
df[df.Gender == 'M'].count()

Id        743750
Name      743750
Year      743750
Gender    743750
Count     743750
dtype: int64

In [11]:
df_names = df

In [12]:
# Replacing All F and M with 0 and 1 respectively
df_names.Gender.replace({'F':0,'M':1},inplace=True)


In [13]:
df_names.Gender.unique()

array([0, 1], dtype=int64)

In [14]:
df_names.dtypes

Id         int64
Name      object
Year       int64
Gender     int64
Count      int64
dtype: object

In [15]:
Xfeatures =df_names['Name']

In [16]:
# Feature Extraction 
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures)

In [17]:
cv.get_feature_names()

['aaban',
 'aabha',
 'aabid',
 'aabriella',
 'aadam',
 'aadan',
 'aadarsh',
 'aaden',
 'aadesh',
 'aadhav',
 'aadhavan',
 'aadhi',
 'aadhira',
 'aadhya',
 'aadhyan',
 'aadi',
 'aadian',
 'aadil',
 'aadin',
 'aadish',
 'aadison',
 'aadit',
 'aadith',
 'aaditri',
 'aaditya',
 'aadiv',
 'aadon',
 'aadrian',
 'aadrika',
 'aadrit',
 'aadvik',
 'aadvika',
 'aadya',
 'aadyn',
 'aafreen',
 'aagam',
 'aage',
 'aagot',
 'aahaan',
 'aahan',
 'aahana',
 'aahil',
 'aahliyah',
 'aahna',
 'aahron',
 'aaidan',
 'aaiden',
 'aaidyn',
 'aaila',
 'aailiyah',
 'aailyah',
 'aaima',
 'aaira',
 'aairah',
 'aaisha',
 'aaishah',
 'aaiyana',
 'aaiza',
 'aaja',
 'aajah',
 'aajaylah',
 'aajon',
 'aakanksha',
 'aakarsh',
 'aakash',
 'aakeem',
 'aakilah',
 'aakira',
 'aakiyah',
 'aakriti',
 'aala',
 'aalaiya',
 'aalaiyah',
 'aalana',
 'aalanah',
 'aalani',
 'aalap',
 'aalaya',
 'aalayah',
 'aalayiah',
 'aalayjah',
 'aalayna',
 'aalaysha',
 'aalaysia',
 'aalea',
 'aaleah',
 'aaleahya',
 'aaleeya',
 'aaleeyah',
 'aale

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Features 
X 
# Labels
y = df_names.Gender

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [21]:
# Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)


0.8950967225714774

In [22]:
# Accuracy of our Model
print("Accuracy of Model",clf.score(X_test,y_test)*100,"%")

Accuracy of Model 89.50967225714774 %


In [23]:
# Accuracy of our Model
print("Accuracy of Model",clf.score(X_train,y_train)*100,"%")

Accuracy of Model 90.54511708529566 %


### Sample Prediction

In [24]:
# Sample1 Prediction
sample_name = ["Mary"]
vect = cv.transform(sample_name).toarray()

In [25]:
vect

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [26]:
# Female is 0, Male is 1
clf.predict(vect)

array([0], dtype=int64)

In [27]:
# Sample2 Prediction
sample_name1 = ["Mark"]
vect1 = cv.transform(sample_name1).toarray()

In [28]:
clf.predict(vect1)

array([1], dtype=int64)

In [29]:
# Sample3 Prediction of Russian Names
sample_name2 = ["Natasha"]
vect2 = cv.transform(sample_name2).toarray()

In [30]:
clf.predict(vect2)

array([0], dtype=int64)

In [31]:
# Sample3 Prediction of Random Names
sample_name3 = ["Nefertiti","Nasha","Ama","Ayo","Xhavier","Ovetta","Tathiana","Xia","Joseph","Xianliang"]
vect3 = cv.transform(sample_name3).toarray()

In [32]:
clf.predict(vect3)

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0], dtype=int64)

In [33]:
# A function to do it
def genderpredictor(a):
    test_name = [a]
    vector = cv.transform(test_name).toarray()
    if clf.predict(vector) == 0:
        print("Female")
    else:
        print("Male")
    

In [34]:
genderpredictor("Martha")

Female


In [35]:
#Features fxn
#apply the fxn
#vectorizer
#fit
#transform
#classifier
#fit
#predict


In [36]:
namelist = ["Yaa","Yaw","Femi","Masha"]
for i in namelist:
    print(genderpredictor(i))

Female
None
Male
None
Male
None
Female
None


### Using a custom function for feature analysis

In [37]:


df.head(-20)


Unnamed: 0,Id,Name,Year,Gender,Count
0,1,Mary,1880,0,7065
1,2,Anna,1880,0,2604
2,3,Emma,1880,0,2003
3,4,Elizabeth,1880,0,1939
4,5,Minnie,1880,0,1746
...,...,...,...,...,...
1825408,1825409,Zephen,2014,1,5
1825409,1825410,Zerick,2014,1,5
1825410,1825411,Zhaiden,2014,1,5
1825411,1825412,Zhalen,2014,1,5


In [38]:
# By Analogy most female names ends in 'A' or 'E' or has the sound of 'A'
def features(name):
    name = name.lower()
    return {
        'first-letter'  : name[0], # First letter
        'first2-letters': name[0:2], # First 2 letters
        'first3-letters': name[0:3], # First 3 letters
        'last-letter'   : name[-1],
        'last2-letters' : name[-2:],
        'last3-letters' : name[-3:],
    }

In [39]:
df["features"] = df["Name"].apply(features)
df.head()

Unnamed: 0,Id,Name,Year,Gender,Count,features
0,1,Mary,1880,0,7065,"{'first-letter': 'm', 'first2-letters': 'ma', ..."
1,2,Anna,1880,0,2604,"{'first-letter': 'a', 'first2-letters': 'an', ..."
2,3,Emma,1880,0,2003,"{'first-letter': 'e', 'first2-letters': 'em', ..."
3,4,Elizabeth,1880,0,1939,"{'first-letter': 'e', 'first2-letters': 'el', ..."
4,5,Minnie,1880,0,1746,"{'first-letter': 'm', 'first2-letters': 'mi', ..."


In [40]:
def checkfeature(name):
    if name[-1] in "a,e":
        return "a"
    return "b"
df["Feature End"] = df["Name"].apply(checkfeature)
df.head(-20)

Unnamed: 0,Id,Name,Year,Gender,Count,features,Feature End
0,1,Mary,1880,0,7065,"{'first-letter': 'm', 'first2-letters': 'ma', ...",b
1,2,Anna,1880,0,2604,"{'first-letter': 'a', 'first2-letters': 'an', ...",a
2,3,Emma,1880,0,2003,"{'first-letter': 'e', 'first2-letters': 'em', ...",a
3,4,Elizabeth,1880,0,1939,"{'first-letter': 'e', 'first2-letters': 'el', ...",b
4,5,Minnie,1880,0,1746,"{'first-letter': 'm', 'first2-letters': 'mi', ...",a
...,...,...,...,...,...,...,...
1825408,1825409,Zephen,2014,1,5,"{'first-letter': 'z', 'first2-letters': 'ze', ...",b
1825409,1825410,Zerick,2014,1,5,"{'first-letter': 'z', 'first2-letters': 'ze', ...",b
1825410,1825411,Zhaiden,2014,1,5,"{'first-letter': 'z', 'first2-letters': 'zh', ...",b
1825411,1825412,Zhalen,2014,1,5,"{'first-letter': 'z', 'first2-letters': 'zh', ...",b


In [41]:
g=df.groupby(['Feature End','Gender'])
g.head()

Unnamed: 0,Id,Name,Year,Gender,Count,features,Feature End
0,1,Mary,1880,0,7065,"{'first-letter': 'm', 'first2-letters': 'ma', ...",b
1,2,Anna,1880,0,2604,"{'first-letter': 'a', 'first2-letters': 'an', ...",a
2,3,Emma,1880,0,2003,"{'first-letter': 'e', 'first2-letters': 'em', ...",a
3,4,Elizabeth,1880,0,1939,"{'first-letter': 'e', 'first2-letters': 'el', ...",b
4,5,Minnie,1880,0,1746,"{'first-letter': 'm', 'first2-letters': 'mi', ...",a
5,6,Margaret,1880,0,1578,"{'first-letter': 'm', 'first2-letters': 'ma', ...",b
6,7,Ida,1880,0,1472,"{'first-letter': 'i', 'first2-letters': 'id', ...",a
7,8,Alice,1880,0,1414,"{'first-letter': 'a', 'first2-letters': 'al', ...",a
9,10,Sarah,1880,0,1288,"{'first-letter': 's', 'first2-letters': 'sa', ...",b
21,22,Mabel,1880,0,808,"{'first-letter': 'm', 'first2-letters': 'ma', ...",b


In [42]:
names =(df.groupby(['Feature End','Gender'])).size()
#female_names =(df.groupby(['features','Gender Value'])).count()

#all_names = df.groupby(["Gender"])["Gender Value"].count()
print (names)

Feature End  Gender
a            0         689583
             1         120778
b            0         392100
             1         622972
dtype: int64


In [43]:
# Vectorize the features function
features = np.vectorize(features)
print(features(["Anna", "Hannah", "Peter","John","Vladmir","Mohammed"]))

[{'first-letter': 'a', 'first2-letters': 'an', 'first3-letters': 'ann', 'last-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'nna'}
 {'first-letter': 'h', 'first2-letters': 'ha', 'first3-letters': 'han', 'last-letter': 'h', 'last2-letters': 'ah', 'last3-letters': 'nah'}
 {'first-letter': 'p', 'first2-letters': 'pe', 'first3-letters': 'pet', 'last-letter': 'r', 'last2-letters': 'er', 'last3-letters': 'ter'}
 {'first-letter': 'j', 'first2-letters': 'jo', 'first3-letters': 'joh', 'last-letter': 'n', 'last2-letters': 'hn', 'last3-letters': 'ohn'}
 {'first-letter': 'v', 'first2-letters': 'vl', 'first3-letters': 'vla', 'last-letter': 'r', 'last2-letters': 'ir', 'last3-letters': 'mir'}
 {'first-letter': 'm', 'first2-letters': 'mo', 'first3-letters': 'moh', 'last-letter': 'd', 'last2-letters': 'ed', 'last3-letters': 'med'}]


In [44]:
# Extract the features for the dataset
df_X = features(df_names['Name'])

In [45]:
df_y = df_names['Gender']

In [46]:
from sklearn.feature_extraction import DictVectorizer
 
corpus = features(["Mike", "Julia"])
dv = DictVectorizer()
dv.fit(corpus)
transformed = dv.transform(corpus)
print(transformed)
 

  (0, 1)	1.0
  (0, 3)	1.0
  (0, 5)	1.0
  (0, 7)	1.0
  (0, 9)	1.0
  (0, 10)	1.0
  (1, 0)	1.0
  (1, 2)	1.0
  (1, 4)	1.0
  (1, 6)	1.0
  (1, 8)	1.0
  (1, 11)	1.0


In [47]:
dv.get_feature_names()

['first-letter=j',
 'first-letter=m',
 'first2-letters=ju',
 'first2-letters=mi',
 'first3-letters=jul',
 'first3-letters=mik',
 'last-letter=a',
 'last-letter=e',
 'last2-letters=ia',
 'last2-letters=ke',
 'last3-letters=ike',
 'last3-letters=lia']

In [48]:
# Train Test Split
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(df_X, df_y, test_size=0.33, random_state=42)

In [49]:
dfX_train

array([{'first-letter': 'w', 'first2-letters': 'wi', 'first3-letters': 'wil', 'last-letter': 'y', 'last2-letters': 'ly', 'last3-letters': 'lly'},
       {'first-letter': 'h', 'first2-letters': 'ha', 'first3-letters': 'har', 'last-letter': 'd', 'last2-letters': 'ld', 'last3-letters': 'old'},
       {'first-letter': 'd', 'first2-letters': 'da', 'first3-letters': 'dag', 'last-letter': 'n', 'last2-letters': 'an', 'last3-letters': 'gan'},
       ...,
       {'first-letter': 'b', 'first2-letters': 'bo', 'first3-letters': 'bon', 'last-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'nna'},
       {'first-letter': 'c', 'first2-letters': 'ch', 'first3-letters': 'che', 'last-letter': 'e', 'last2-letters': 'le', 'last3-letters': 'yle'},
       {'first-letter': 'r', 'first2-letters': 'ro', 'first3-letters': 'rox', 'last-letter': 'y', 'last2-letters': 'xy', 'last3-letters': 'oxy'}],
      dtype=object)

In [50]:

dv = DictVectorizer()
dv.fit_transform(dfX_train)


<1223040x8758 sparse matrix of type '<class 'numpy.float64'>'
	with 7338240 stored elements in Compressed Sparse Row format>

In [52]:
# Model building Using DecisionTree

from sklearn.tree import DecisionTreeClassifier
 
dclf = DecisionTreeClassifier()
my_xfeatures =dv.transform(dfX_train)
dclf.fit(my_xfeatures, dfy_train)
#dclf.score(dfX_test, dfy_test)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [74]:
# Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(my_xfeatures, dfy_train)
clf.score(dv.transform(dfX_test), dfy_test)

0.8222539106530122

In [53]:
# Build Features and Transform them
sample_name_eg = ["Alex"]
transform_dv =dv.transform(features(sample_name_eg))


In [54]:
vect3 = transform_dv.toarray()

In [55]:
# Predicting Gender of Name
# Male is 1,female = 0
dclf.predict(vect3)

array([1], dtype=int64)

In [56]:
if dclf.predict(vect3) == 0:
    print("Female")
else:
    print("Male")

Male


In [57]:
# Second Prediction With Nigerian Name
name_eg1 = ["Chioma"]
transform_dv =dv.transform(features(name_eg1))
vect4 = transform_dv.toarray()
if dclf.predict(vect4) == 0:
    print("Female")
else:
    print("Male")

Female


In [58]:
# A function to do it
def genderpredictor1(a):
    test_name1 = [a]
    transform_dv =dv.transform(features(test_name1))
    vector = transform_dv.toarray()
    if dclf.predict(vector) == 0:
        print("Female")
    else:
        print("Male")
    

In [59]:
random_name_list = ["Alex","Alice","Chioma","Vitalic","Clairese","Chan"]

In [60]:
for n in random_name_list:
    print(genderpredictor1(n))

Male
None
Female
None
Female
None
Male
None
Female
None
Male
None


In [61]:
## Accuracy of Models Decision Tree Classifier Works better than Naive Bayes
# Accuracy on training set
print(dclf.score(dv.transform(dfX_train), dfy_train)) 
 

0.9009525444793302


In [62]:
# Accuracy on test set
print(dclf.score(dv.transform(dfX_test), dfy_test))

0.8951797248640008


### Saving Our Model

In [63]:
from sklearn.externals import joblib



In [64]:
decisiontreModel = open("decisiontreemodel.pkl","wb")

In [65]:
joblib.dump(dclf,decisiontreModel)

In [66]:
decisiontreModel.close

<function BufferedWriter.close>

In [67]:
#Alternative to Model Saving
import pickle
dctreeModel = open("namesdetectormodel.pkl","wb")

In [68]:
pickle.dump(dclf,dctreeModel)

In [69]:
dctreeModel.close()

##### Save Multinomial NB Model

In [70]:
NaiveBayesModel = open("naivebayesgendermodel.pkl","wb")

In [71]:
joblib.dump(clf,NaiveBayesModel)

In [72]:
NaiveBayesModel.close()

In [73]:
# Thanks
# By Jesse JCharis
# Jesus Saves @ JCharisTech
# J-Secur1ty