<h1> Gender Classification


## Data pre-processing
### Using Machine Learning to predict gender from an indivdual's name

In [34]:
import pandas as pd
import numpy as np

In [35]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## We load our data

In [36]:
data = pd.read_csv('data/names_dataset.csv')

In [37]:
# Preview first few rows
data.head()

Unnamed: 0,index,name,sex
0,0,Mary,F
1,1,Anna,F
2,2,Emma,F
3,3,Elizabeth,F
4,4,Minnie,F


In [38]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95025 entries, 0 to 95024
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   index   95025 non-null  int64 
 1   name    95025 non-null  object
 2   sex     95025 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.2+ MB


In [39]:
data.size

285075

## Data Cleaning - Check columns for name consistency

In [40]:
data.columns

Index(['index', 'name', 'sex'], dtype='object')

In [41]:
# Check for data types
# We'll probably have to convert the name and sex columns to value types
data.dtypes

index     int64
name     object
sex      object
dtype: object

In [42]:
# Check for missing values
data.isnull().sum()

index    0
name     0
sex      0
dtype: int64

In [43]:
# Check for duplicate values
data.duplicated().sum()

0

In [44]:
# Check to see if our dataset is balanced
# Number of Female names
data[data.sex == 'F'].size

181800

In [45]:
# Number of Male names
data[data.sex == 'M'].size

103275

In [46]:
# Copyt the data frame for the future we want modify
data_names = data

In [47]:
data_names.sex.replace({'F': 0, 'M': 1}, inplace=True)

In [48]:
data_names.sex.unique()

array([0, 1])

In [49]:
data_names.dtypes

index     int64
name     object
sex       int64
dtype: object

In [50]:
Xfeatures = data_names['name']

In [51]:
# Feature Extraction
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures)

In [52]:
# Save Vectorizer
from sklearn.externals import joblib



In [55]:
gender_vectorizer = open('gender_vectorier.pkl', 'wb')
joblib.dump(cv, gender_vectorizer)

In [56]:
gender_vectorizer.close()

In [58]:
cv.get_feature_names()

['aaban',
 'aabha',
 'aabid',
 'aabriella',
 'aada',
 'aadam',
 'aadan',
 'aadarsh',
 'aaden',
 'aadesh',
 'aadhav',
 'aadhavan',
 'aadhi',
 'aadhira',
 'aadhvik',
 'aadhya',
 'aadhyan',
 'aadi',
 'aadian',
 'aadil',
 'aadin',
 'aadish',
 'aadison',
 'aadit',
 'aadith',
 'aadithya',
 'aaditri',
 'aaditya',
 'aadiv',
 'aadon',
 'aadrian',
 'aadrika',
 'aadrit',
 'aadvik',
 'aadvika',
 'aadya',
 'aadyn',
 'aafia',
 'aafreen',
 'aagam',
 'aage',
 'aagot',
 'aahaan',
 'aahan',
 'aahana',
 'aahil',
 'aahir',
 'aahliyah',
 'aahna',
 'aahron',
 'aaidan',
 'aaiden',
 'aaidyn',
 'aaila',
 'aailiyah',
 'aailyah',
 'aaima',
 'aaira',
 'aairah',
 'aaisha',
 'aaishah',
 'aaiyana',
 'aaiza',
 'aaja',
 'aajah',
 'aajaylah',
 'aajon',
 'aakanksha',
 'aakarsh',
 'aakash',
 'aakeem',
 'aakilah',
 'aakira',
 'aakiyah',
 'aakriti',
 'aala',
 'aalaiya',
 'aalaiyah',
 'aalana',
 'aalanah',
 'aalani',
 'aalap',
 'aalaya',
 'aalayah',
 'aalayiah',
 'aalayjah',
 'aalayna',
 'aalaysha',
 'aalaysia',
 'aalea',
 

In [59]:
from sklearn.model_selection import train_test_split

In [60]:
# Features 
X
# Labels
y = data_names.sex

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [64]:
# Use Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.6398163206734908

In [65]:
# Model Accuracy
print(f'Model accuracy is {clf.score(X_test, y_test)*100} %')

Model accuracy is 63.98163206734908 %


## Sample Prediction


In [75]:
sample_name = ['Mike']
vect = cv.transform(sample_name).toarray()

In [76]:
vect

array([[0, 0, 0, ..., 0, 0, 0]])

In [77]:
# Female is 0, Male is 1
clf.predict(vect)

array([1])

In [79]:
# Sample prediction of random names
sample_names = ['Mark', 'Larry', 'Samantha', 'Olivia', 'Kia', 'Moussa']
vect_random = cv.transform(sample_names).toarray()

In [80]:
clf.predict(vect_random)

array([1, 1, 0, 0, 0, 1])

In [83]:
# Put it all in a function
def gender_predictor(name):
    test_name = [name]
    vector = cv.transform(test_name).toarray()
    if clf.predict(vector) == 0:
        print('Female')
    else:
        print('Male')

In [84]:
gender_predictor('Obama')

Male


## Saving our model


In [85]:
gender_model = open('gender_model.pkl', 'wb')
joblib.dump(clf, gender_model)
gender_model.close()