## Gender Classification Of Names
### Using Machine Learning To Detect/Predict Gender of Individuals 
+ Sklearn
+ Pandas
+ Text Extraction

In [1]:
# EDA packages
import pandas as pd
import numpy as np


In [2]:
# ML Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer


In [3]:
# Load our data
df = pd.read_csv('BanglaNames.csv')

In [4]:
df.head()

Unnamed: 0,index,name,sex
0,0,fairuz,female
1,1,maliha,female
2,2,mehedi,male
3,3,sohan,male
4,4,shatil,male


In [5]:
df.count

<bound method DataFrame.count of       index     name     sex
0         0   fairuz  female
1         1   maliha  female
2         2   mehedi    male
3         3    sohan    male
4         4   shatil    male
...     ...      ...     ...
1364   1364   sadman    male
1365   1365      ovi    male
1366   1366   mehedi    male
1367   1367   kayser    male
1368   1368  pabitra    male

[1369 rows x 3 columns]>

In [6]:
# Data Cleaning
# Checking for column name consistency
df.columns

Index(['index', 'name', 'sex'], dtype='object')

In [7]:
# Data Types
df.dtypes

index     int64
name     object
sex      object
dtype: object

In [8]:
# Checking for Missing Values
df.isnull().isnull().sum()

index    0
name     0
sex      0
dtype: int64

In [9]:
# Number of Female Names
df[df.sex == 'female'].count

<bound method DataFrame.count of       index     name     sex
0         0   fairuz  female
1         1   maliha  female
6         6    samia  female
23       23  sadiqua  female
24       24    promi  female
...     ...      ...     ...
1356   1356     asha  female
1359   1359  shakira  female
1360   1360   riffat  female
1361   1361   nasrin  female
1362   1362     biva  female

[769 rows x 3 columns]>

In [10]:
# Number of Male Names
df[df.sex == 'male'].count

<bound method DataFrame.count of       index     name   sex
2         2   mehedi  male
3         3    sohan  male
4         4   shatil  male
5         5     akid  male
7         7   tanvir  male
...     ...      ...   ...
1364   1364   sadman  male
1365   1365      ovi  male
1366   1366   mehedi  male
1367   1367   kayser  male
1368   1368  pabitra  male

[600 rows x 3 columns]>

In [11]:
df_names = df

In [12]:
# Replacing All F and M with 0 and 1 respectively
df_names.sex.replace({'female':0,'male':1},inplace=True)

In [13]:
df_names.sex.unique()

array([0, 1], dtype=int64)

In [14]:
df_names.dtypes

index     int64
name     object
sex       int64
dtype: object

In [15]:
Xfeatures =df_names['name']

In [16]:
# Feature Extraction 
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures)

In [17]:
cv.get_feature_names()

['aahil',
 'aaron',
 'abdul',
 'abdullah',
 'abdus',
 'abed',
 'abhijit',
 'abid',
 'abir',
 'abrar',
 'abrity',
 'abttahi',
 'adan',
 'adhora',
 'adib',
 'adiba',
 'adip',
 'aditi',
 'adity',
 'adnan',
 'afiqur',
 'afnan',
 'afreen',
 'afrin',
 'afrose',
 'afroz',
 'afroza',
 'afroze',
 'afsana',
 'afsar',
 'ahad',
 'ahmed',
 'ahnaf',
 'ahona',
 'ahssan',
 'aishia',
 'aizan',
 'akanto',
 'akash',
 'akhand',
 'akhi',
 'akib',
 'akid',
 'akif',
 'akram',
 'aksir',
 'akter',
 'alam',
 'alavi',
 'alfarrah',
 'ali',
 'alisha',
 'alo',
 'alom',
 'alvee',
 'amin',
 'amit',
 'amman',
 'amrin',
 'amy',
 'anamul',
 'ananna',
 'ananta',
 'ananya',
 'anas',
 'anee',
 'anik',
 'anika',
 'aninda',
 'aniruddha',
 'anisha',
 'anista',
 'anita',
 'anjina',
 'anjum',
 'anjuman',
 'ankon',
 'annan',
 'annesha',
 'anni',
 'anonna',
 'anoy',
 'ansary',
 'antara',
 'anto',
 'antor',
 'antora',
 'antu',
 'anu',
 'anushka',
 'any',
 'anzir',
 'anzum',
 'apu',
 'aquib',
 'araddha',
 'araf',
 'arafat',
 'arghy

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Features 
X
# Labels
y = df_names.sex

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [21]:
# Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)


0.6946902654867256

In [22]:
# Accuracy of our Model
print("Accuracy of Model",clf.score(X_test,y_test)*100,"%")

Accuracy of Model 69.46902654867256 %


In [23]:
# Accuracy of our Model
print("Accuracy of Model",clf.score(X_train,y_train)*100,"%")

Accuracy of Model 98.90948745910578 %


### Sample Prediction

In [24]:
# Sample1 Prediction
sample_name = ["Mary"]
vect = cv.transform(sample_name).toarray()

In [25]:
vect

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [26]:
# Female is 0, Male is 1
clf.predict(vect)

array([0], dtype=int64)

In [27]:
# Sample2 Prediction
sample_name1 = ["Mark"]
vect1 = cv.transform(sample_name1).toarray()

In [28]:
clf.predict(vect1)

array([0], dtype=int64)

In [29]:
# Sample3 Prediction of Russian Names
sample_name2 = ["Natasha"]
vect2 = cv.transform(sample_name2).toarray()

In [30]:
clf.predict(vect2)

array([0], dtype=int64)

In [31]:
# Sample3 Prediction of Random Names
sample_name3 = ["Nefertiti","Nasha","Ama","Ayo","Xhavier","Ovetta","Tathiana","Xia","Joseph","Xianliang"]
vect3 = cv.transform(sample_name3).toarray()


In [32]:
clf.predict(vect3)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [33]:
# A function to do it
def genderpredictor(a):
    test_name = [a]
    vector = cv.transform(test_name).toarray()
    if clf.predict(vector) == 0:
        print("Female")
    else:
        print("Male")
    

In [34]:
genderpredictor("Martha")

Female


In [35]:
#Features fxn
#apply the fxn
#vectorizer
#fit
#transform
#classifier
#fit
#predict


In [36]:
namelist = ["Yaa","Yaw","Femi","Masha"]
for i in namelist:
    print(genderpredictor(i))

Female
None
Female
None
Female
None
Female
None


### Using a custom function for feature analysis

In [92]:
# By Analogy most female names ends in 'A' or 'E' or has the sound of 'A'
def features(name):
    name = name.lower()
    return {
        'first-letter': name[0], # First letter
        'first2-letters': name[0:2], # First 2 letters
        'first3-letters': name[0:3], # First 3 letters
        'last-letter': name[-1],
        'last2-letters': name[-2:],
        'last3-letters': name[-3:],
    }

In [93]:
df["features"] = df["name"].apply(features)
df.head()

Unnamed: 0,index,name,sex,Gender Value,features,Feature End
0,0,fairuz,0,1,"{'first-letter': 'f', 'first2-letters': 'fa', ...",b
1,1,maliha,0,1,"{'first-letter': 'm', 'first2-letters': 'ma', ...",a
2,2,mehedi,1,1,"{'first-letter': 'm', 'first2-letters': 'me', ...",b
3,3,sohan,1,1,"{'first-letter': 's', 'first2-letters': 'so', ...",b
4,4,shatil,1,1,"{'first-letter': 's', 'first2-letters': 'sh', ...",b


In [94]:
def checkfeature(name):
    if name[-1] in "a,e":
        return "a"
    return "b"
df["Feature End"] = df["name"].apply(checkfeature)
df.head(-20)

Unnamed: 0,index,name,sex,Gender Value,features,Feature End
0,0,fairuz,0,1,"{'first-letter': 'f', 'first2-letters': 'fa', ...",b
1,1,maliha,0,1,"{'first-letter': 'm', 'first2-letters': 'ma', ...",a
2,2,mehedi,1,1,"{'first-letter': 'm', 'first2-letters': 'me', ...",b
3,3,sohan,1,1,"{'first-letter': 's', 'first2-letters': 'so', ...",b
4,4,shatil,1,1,"{'first-letter': 's', 'first2-letters': 'sh', ...",b
...,...,...,...,...,...,...
1344,1344,meherun,0,1,"{'first-letter': 'm', 'first2-letters': 'me', ...",b
1345,1345,mitu,0,1,"{'first-letter': 'm', 'first2-letters': 'mi', ...",b
1346,1346,mum,0,1,"{'first-letter': 'm', 'first2-letters': 'mu', ...",b
1347,1347,tahena,0,1,"{'first-letter': 't', 'first2-letters': 'ta', ...",a


In [95]:

g=df.groupby(['Feature End','sex'])
g.head()

Unnamed: 0,index,name,sex,Gender Value,features,Feature End
0,0,fairuz,0,1,"{'first-letter': 'f', 'first2-letters': 'fa', ...",b
1,1,maliha,0,1,"{'first-letter': 'm', 'first2-letters': 'ma', ...",a
2,2,mehedi,1,1,"{'first-letter': 'm', 'first2-letters': 'me', ...",b
3,3,sohan,1,1,"{'first-letter': 's', 'first2-letters': 'so', ...",b
4,4,shatil,1,1,"{'first-letter': 's', 'first2-letters': 'sh', ...",b
5,5,akid,1,1,"{'first-letter': 'a', 'first2-letters': 'ak', ...",b
6,6,samia,0,1,"{'first-letter': 's', 'first2-letters': 'sa', ...",a
7,7,tanvir,1,1,"{'first-letter': 't', 'first2-letters': 'ta', ...",b
16,16,khaja,1,1,"{'first-letter': 'k', 'first2-letters': 'kh', ...",a
23,23,sadiqua,0,1,"{'first-letter': 's', 'first2-letters': 'sa', ...",a


In [90]:

female_names =(df.groupby(['Feature nd','sex'])).size()
#female_names =(df.groupby(['features','Gender Value'])).count()

#all_names = df.groupby(["Gender"])["Gender Value"].count()
print (female_names)

Feature End  sex
a            0      414
             1       28
b            0      355
             1      572
dtype: int64


In [43]:
# Vectorize the features function
features = np.vectorize(features)
print(features(["Anna", "Hannah", "Peter","John","Vladmir","Mohammed"]))

[{'first-letter': 'a', 'first2-letters': 'an', 'first3-letters': 'ann', 'last-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'nna'}
 {'first-letter': 'h', 'first2-letters': 'ha', 'first3-letters': 'han', 'last-letter': 'h', 'last2-letters': 'ah', 'last3-letters': 'nah'}
 {'first-letter': 'p', 'first2-letters': 'pe', 'first3-letters': 'pet', 'last-letter': 'r', 'last2-letters': 'er', 'last3-letters': 'ter'}
 {'first-letter': 'j', 'first2-letters': 'jo', 'first3-letters': 'joh', 'last-letter': 'n', 'last2-letters': 'hn', 'last3-letters': 'ohn'}
 {'first-letter': 'v', 'first2-letters': 'vl', 'first3-letters': 'vla', 'last-letter': 'r', 'last2-letters': 'ir', 'last3-letters': 'mir'}
 {'first-letter': 'm', 'first2-letters': 'mo', 'first3-letters': 'moh', 'last-letter': 'd', 'last2-letters': 'ed', 'last3-letters': 'med'}]


In [44]:
# Extract the features for the dataset
df_X = features(df_names['name'])

In [45]:
df_y = df_names['sex']

In [46]:
from sklearn.feature_extraction import DictVectorizer
corpus = features(["Mike", "Julia"])
dv = DictVectorizer()
dv.fit(corpus)
transformed = dv.transform(corpus)
print(transformed)
 

  (0, 1)	1.0
  (0, 3)	1.0
  (0, 5)	1.0
  (0, 7)	1.0
  (0, 9)	1.0
  (0, 10)	1.0
  (1, 0)	1.0
  (1, 2)	1.0
  (1, 4)	1.0
  (1, 6)	1.0
  (1, 8)	1.0
  (1, 11)	1.0


In [47]:
dv.get_feature_names()

['first-letter=j',
 'first-letter=m',
 'first2-letters=ju',
 'first2-letters=mi',
 'first3-letters=jul',
 'first3-letters=mik',
 'last-letter=a',
 'last-letter=e',
 'last2-letters=ia',
 'last2-letters=ke',
 'last3-letters=ike',
 'last3-letters=lia']

In [48]:
# Train Test Split
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(df_X, df_y, test_size=0.33, random_state=42)

In [49]:
dfX_train

array([{'first-letter': 'a', 'first2-letters': 'ah', 'first3-letters': 'ahm', 'last-letter': 'd', 'last2-letters': 'ed', 'last3-letters': 'med'},
       {'first-letter': 's', 'first2-letters': 'sa', 'first3-letters': 'san', 'last-letter': 'a', 'last2-letters': 'da', 'last3-letters': 'ida'},
       {'first-letter': 'm', 'first2-letters': 'ma', 'first3-letters': 'mah', 'last-letter': 'r', 'last2-letters': 'ur', 'last3-letters': 'zur'},
       {'first-letter': 'f', 'first2-letters': 'fa', 'first3-letters': 'far', 'last-letter': 'a', 'last2-letters': 'ia', 'last3-letters': 'ria'},
       {'first-letter': 's', 'first2-letters': 'sa', 'first3-letters': 'sai', 'last-letter': 'n', 'last2-letters': 'on', 'last3-letters': 'mon'},
       {'first-letter': 's', 'first2-letters': 'sa', 'first3-letters': 'sai', 'last-letter': 'a', 'last2-letters': 'ba', 'last3-letters': 'iba'},
       {'first-letter': 's', 'first2-letters': 'sh', 'first3-letters': 'shr', 'last-letter': 'a', 'last2-letters': 'ya', 'la

In [50]:

dv = DictVectorizer()
dv.fit_transform(dfX_train)


<917x1081 sparse matrix of type '<class 'numpy.float64'>'
	with 5502 stored elements in Compressed Sparse Row format>

In [51]:
# Model building Using DecisionTree

from sklearn.tree import DecisionTreeClassifier
 
dclf = DecisionTreeClassifier()
my_xfeatures =dv.transform(dfX_train)
dclf.fit(my_xfeatures, dfy_train)
#prediction = dclf.predict(dfX_test)
#accuracy_score(dfy_test, prediction)
## Accuracy of Models Decision Tree Classifier Works better than Naive Bayes
# Accuracy on training set
print(dclf.score(dv.transform(dfX_train), dfy_train)) 
# Accuracy on test set
print(dclf.score(dv.transform(dfX_test), dfy_test))


0.9890948745910578
0.8473451327433629


In [52]:
# Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(my_xfeatures, dfy_train)
clf.score(dv.transform(dfX_test), dfy_test)

0.8561946902654868

In [53]:
from sklearn.linear_model import LogisticRegression
logreg_clf = LogisticRegression()
logreg_clf.fit(my_xfeatures, dfy_train)
logreg_clf.score(dv.transform(dfX_test), dfy_test)



0.8716814159292036

In [54]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
KNN_model = KNeighborsClassifier(n_neighbors=5)
KNN_model.fit(my_xfeatures, dfy_train)
KNN_model.score(dv.transform(dfX_test), dfy_test)

0.8252212389380531

In [55]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
SVC_model = SVC()
SVC_model.fit(my_xfeatures, dfy_train)
SVC_model.score(dv.transform(dfX_test), dfy_test)



0.5508849557522124

In [56]:
#from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
#lda_clf = LinearDiscriminantAnalysis()
#lda_clf.fit(my_xfeatures, dfy_train)
#lda_clf.score(dv.transform(dfX_test), dfy_test)

In [57]:
# Build Features and Transform them
sample_name_eg = ["Alex"]
transform_dv =dv.transform(features(sample_name_eg))


In [58]:
vect3 = transform_dv.toarray()

In [59]:
# Predicting Gender of Name
# Male is 1,female = 0
dclf.predict(vect3)

array([1], dtype=int64)

In [60]:
if dclf.predict(vect3) == 0:
    print("Female")
else:
    print("Male")

Male


In [61]:
# Second Prediction With Nigerian Name
name_eg1 = ["Chioma"]
transform_dv =dv.transform(features(name_eg1))
vect4 = transform_dv.toarray()
if dclf.predict(vect4) == 0:
    print("Female")
else:
    print("Male")

Female


In [62]:
# A function to do it
def genderpredictor1(a):
    test_name1 = [a]
    transform_dv =dv.transform(features(test_name1))
    vector = transform_dv.toarray()
    if dclf.predict(vector) == 0:
        print("Female")
    else:
        print("Male")
    

In [63]:
random_name_list = ["Alex","Alice","Chioma","Vitalic","Clairese","Chan"]

In [64]:
for n in random_name_list:
    print(genderpredictor1(n))

Male
None
Female
None
Female
None
Male
None
Female
None
Male
None


### Saving Our Model

In [65]:
from sklearn.externals import joblib



In [66]:
decisiontreModel = open("decisiontreemodel.pkl","wb")

In [67]:
joblib.dump(dclf,decisiontreModel)

In [68]:
decisiontreModel.close

<function BufferedWriter.close>

In [69]:
#Alternative to Model Saving
import pickle
dctreeModel = open("namesdetectormodel.pkl","wb")

In [70]:
pickle.dump(dclf,dctreeModel)

In [71]:
dctreeModel.close()

##### Save Multinomial NB Model

In [72]:
NaiveBayesModel = open("naivebayesgendermodel.pkl","wb")

In [73]:
joblib.dump(clf,NaiveBayesModel)

In [74]:
NaiveBayesModel.close()

In [75]:
# Thanks
# By Jesse JCharis
# Jesus Saves @ JCharisTech
# J-Secur1ty