In [1]:
import pandas as pd
import numpy as np


# Load ML Pkgs
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,precision_score,classification_report
from sklearn.pipeline import Pipeline

# Load Data Viz Pkgs
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("../data/processed/name_gender_dataset.csv")
df.head()

Unnamed: 0,Name,Gender,Count,Probability
0,James,M,5304407,0.014517
1,John,M,5260831,0.014398
2,Robert,M,4970386,0.013603
3,Michael,M,4579950,0.012534
4,William,M,4226608,0.011567


In [3]:
df.shape

(147269, 4)

In [4]:
df.isna().sum()

Name           0
Gender         0
Count          0
Probability    0
dtype: int64

In [5]:
df.dtypes

Name            object
Gender          object
Count            int64
Probability    float64
dtype: object

In [6]:
df2 = df.copy()
df2['Gender'] = df2['Gender'].map({'M':1,'F':0})

In [7]:
df2[['Name','Gender']].to_csv("../data/processed/name_gender_dataset.csv",index=False)

In [8]:
Xfeatures = df2['Name']
ylabels = df2['Gender']

In [9]:
ylabels.unique()

array([1, 0])

In [10]:
x_train,x_test,y_train,y_test = train_test_split(Xfeatures,ylabels,test_size=0.3,random_state=7)

In [11]:
from sklearn.dummy import DummyClassifier

In [12]:
# Build Pipelines
pipe_base = Pipeline(steps=[('cv',CountVectorizer()),('dummy',DummyClassifier())])
pipe_lr = Pipeline(steps=[('cv',CountVectorizer()),('lr',LogisticRegression())])
pipe_dt = Pipeline(steps=[('cv',CountVectorizer()),('dt',DecisionTreeClassifier())])

In [13]:
pipe_base.fit(x_train,y_train)

In [14]:
# Base Accuracy
pipe_base.score(x_test,y_test)

0.609673841696657

In [15]:
pipe_lr.fit(x_train,y_train)


In [16]:
pipe_lr.score(x_test,y_test)


0.5628663905298658

In [17]:
y_pred = pipe_lr.predict(x_test)


In [18]:
print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.59      0.91      0.72     26936
           1       0.16      0.03      0.05     17245

    accuracy                           0.56     44181
   macro avg       0.38      0.47      0.38     44181
weighted avg       0.42      0.56      0.46     44181



In [19]:
import joblib
def save_model(model):
    model_file = str(type(model).__name__).lower()
    joblib.dump(model, f'../models/{model_file}.pkl')

In [22]:
def save_pipeline(pipe):
    estimator_name = str(type(pipe.steps[1][1]).__name__).lower()
    model_file = f'../models/gender_classifier_{pipe.steps[0][0]}_{estimator_name}.pkl'
    joblib.dump(pipe, model_file)

In [23]:
save_pipeline(pipe_lr)


In [24]:
pipe_nb = Pipeline(steps=[('cv',CountVectorizer()),('nb',MultinomialNB())])

In [25]:
pipe_nb.fit(x_train,y_train)


In [26]:
pipe_nb.score(x_test,y_test)


0.565718295194767

In [27]:
save_pipeline(pipe_nb)
