In [1]:
import numpy as np
import pandas as pd
import re
import sklearn as sk
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import joblib
from tensorflow.keras.models import load_model

# Custom Function

In [39]:
# this function matches all alphabets and filter out non alphabetical characters in each cell
regex = re.compile(r"[a-zA-Z]+")

def get_name(value):
    texts = (re.findall(regex, value))
    return ''.join(texts)

In [3]:
test = "What&&what"
get_name(test)

'Whatwhat'

In [4]:
#this function reverse the name of the cell. This is to test if feeding the name into the model backwards 
#would improve the performance
def reverse_name(value):
    return value[::-1]
    

In [40]:
#transform label to 1 (F), and 0 (M)
def transform_gender(value):
    if value == "F":
        return 1
    else:
        return 0

# import dataset

In [41]:
df = pd.read_csv("./name_gender.csv")

# 1) Explore data

### missing values (None) 

In [43]:
df.isna().sum()

name      0
gender    0
dtype: int64

### level 
* (Category is good, 63.4% females)
* Names seems to be unique

In [44]:
for i in df.columns:
    print(df[i].value_counts(normalize=True))

Kamion            0.000011
Jaytin            0.000011
Romalis           0.000011
Mustafe           0.000011
Adacia            0.000011
                    ...   
Taejah            0.000011
Merrikay          0.000011
Airianna          0.000011
Emilyelizabeth    0.000011
Arles             0.000011
Name: name, Length: 95025, dtype: float64
F    0.634612
M    0.365388
Name: gender, dtype: float64


# 2) data cleaning

### Name
* strip the white space
* lower case
* remove special character

### Gender
* label F as 1 and M as 0

In [45]:
df["name"] = df["name"].str.strip()
df["name"] = df["name"].str.lower()

In [46]:
df["name"] = df["name"].apply(get_name)

In [47]:
# #try reversing the name and see if the model performs better (it does not. thus commented out to skip this step)
# df["name"] = df["name"].apply(reverse_name)

In [13]:
df["gender"] = df["gender"].apply(transform_gender)

# 3) data preprocessing for NLP 

## shuffle data (current names are in alphabetical order)

In [48]:
df = df.sample(frac=1,random_state=1211)

## split name by spaces

In [49]:
name = list(df["name"])

In [50]:
for i in range(0,len(name)):
    name[i] = " ".join(name[i])

## tokenize the letters

In [51]:
# parameters
tokens = 27
padded_len = 20

In [52]:
#create fit token 
tokenizer = Tokenizer(num_words=tokens, oov_token="oov")
tokenizer.fit_on_texts(name)

In [53]:
#check the word index
word_index = tokenizer.word_index

In [54]:
#replace text as token index 
sequences = tokenizer.texts_to_sequences(name)

In [55]:
padd_sequences = pad_sequences(sequences,maxlen= padded_len)

In [56]:
padd_sequences

array([[ 0,  0,  0, ...,  5, 16,  3],
       [ 0,  0,  0, ...,  5, 17,  8],
       [ 0,  0,  0, ...,  5,  4,  2],
       ...,
       [ 0,  0,  0, ...,  4,  4,  2],
       [ 0,  0,  0, ...,  7, 10,  4],
       [ 0,  0,  0, ...,  5,  4,  3]])

In [57]:
padd_sequences.shape

(95025, 20)

In [58]:
df["name"]

12474        brice
23919    dimitrius
46174      kaylina
63053        moran
52041       larken
           ...    
28166      eshanti
65913      niclole
44174    kailianna
36569     jaedalyn
1343       adoline
Name: name, Length: 95025, dtype: object

# 4) train test split

In [59]:
split_point = int(round(len(df)*0.7))
split_point

66518

In [60]:
gender = list(df["gender"])

In [61]:
training_name = padd_sequences[0:split_point]
training_gender = gender[0:split_point]

test_name = padd_sequences[split_point:]
test_gender = gender[split_point:]

In [62]:
training_gender = np.array(training_gender)
test_gender = np.array(test_gender)

# 5) train model

In [29]:
#hyper parameter tuning: input dim

dim = [5,16,40,64]
for i in dim:
    print("verbose = ", str(i))
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(tokens, i, input_length=padded_len),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(24,activation="relu"),
        tf.keras.layers.Dense(1, activation = "sigmoid"),
#         tf.keras.layers.Dense(1, activation = "sigmoid") # addition of this layer result in drop in model performance actually
    ])

    model.compile(loss="binary_crossentropy", optimizer = "adam", metrics=["accuracy"])

    nup_epochs = 30
    history = model.fit(training_name, training_gender, epochs=nup_epochs,
                        validation_data=(test_name,test_gender),verbose=2)

verbose =  5
Epoch 1/30
2079/2079 - 1s - loss: 0.5732 - accuracy: 0.6970 - val_loss: 0.5395 - val_accuracy: 0.7222
Epoch 2/30
2079/2079 - 1s - loss: 0.5416 - accuracy: 0.7229 - val_loss: 0.5373 - val_accuracy: 0.7244
Epoch 3/30
2079/2079 - 1s - loss: 0.5409 - accuracy: 0.7238 - val_loss: 0.5376 - val_accuracy: 0.7256
Epoch 4/30
2079/2079 - 1s - loss: 0.5410 - accuracy: 0.7241 - val_loss: 0.5367 - val_accuracy: 0.7254
Epoch 5/30
2079/2079 - 1s - loss: 0.5405 - accuracy: 0.7246 - val_loss: 0.5370 - val_accuracy: 0.7244
Epoch 6/30
2079/2079 - 1s - loss: 0.5407 - accuracy: 0.7237 - val_loss: 0.5389 - val_accuracy: 0.7239
Epoch 7/30
2079/2079 - 1s - loss: 0.5406 - accuracy: 0.7233 - val_loss: 0.5377 - val_accuracy: 0.7221
Epoch 8/30
2079/2079 - 1s - loss: 0.5407 - accuracy: 0.7241 - val_loss: 0.5371 - val_accuracy: 0.7231
Epoch 9/30
2079/2079 - 1s - loss: 0.5407 - accuracy: 0.7238 - val_loss: 0.5366 - val_accuracy: 0.7235
Epoch 10/30
2079/2079 - 1s - loss: 0.5406 - accuracy: 0.7235 - val_lo

Epoch 19/30
2079/2079 - 1s - loss: 0.5204 - accuracy: 0.7392 - val_loss: 0.5188 - val_accuracy: 0.7409
Epoch 20/30
2079/2079 - 2s - loss: 0.5196 - accuracy: 0.7400 - val_loss: 0.5186 - val_accuracy: 0.7404
Epoch 21/30
2079/2079 - 1s - loss: 0.5192 - accuracy: 0.7408 - val_loss: 0.5183 - val_accuracy: 0.7404
Epoch 22/30
2079/2079 - 2s - loss: 0.5186 - accuracy: 0.7411 - val_loss: 0.5165 - val_accuracy: 0.7421
Epoch 23/30
2079/2079 - 1s - loss: 0.5179 - accuracy: 0.7412 - val_loss: 0.5167 - val_accuracy: 0.7423
Epoch 24/30
2079/2079 - 2s - loss: 0.5167 - accuracy: 0.7435 - val_loss: 0.5173 - val_accuracy: 0.7399
Epoch 25/30
2079/2079 - 1s - loss: 0.5160 - accuracy: 0.7426 - val_loss: 0.5146 - val_accuracy: 0.7441
Epoch 26/30
2079/2079 - 2s - loss: 0.5147 - accuracy: 0.7441 - val_loss: 0.5143 - val_accuracy: 0.7446
Epoch 27/30
2079/2079 - 2s - loss: 0.5139 - accuracy: 0.7446 - val_loss: 0.5198 - val_accuracy: 0.7392
Epoch 28/30
2079/2079 - 1s - loss: 0.5131 - accuracy: 0.7442 - val_loss: 

In [30]:
#dimension of 64 perform best with test accuracy of 75.22% on last epoch

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(tokens, 64, input_length=padded_len),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24,activation="relu"),
    tf.keras.layers.Dense(1, activation = "sigmoid")
])

model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 20, 64)            1728      
_________________________________________________________________
global_average_pooling1d_4 ( (None, 64)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 24)                1560      
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 25        
Total params: 3,313
Trainable params: 3,313
Non-trainable params: 0
_________________________________________________________________


In [31]:
model.compile(loss="binary_crossentropy", optimizer = "adam", metrics=["accuracy"])

nup_epochs = 30
model.fit(training_name, training_gender, epochs=nup_epochs,
                    validation_data=(test_name,test_gender),verbose=2)

Epoch 1/30
2079/2079 - 2s - loss: 0.5561 - accuracy: 0.7089 - val_loss: 0.5405 - val_accuracy: 0.7234
Epoch 2/30
2079/2079 - 2s - loss: 0.5417 - accuracy: 0.7225 - val_loss: 0.5361 - val_accuracy: 0.7251
Epoch 3/30
2079/2079 - 2s - loss: 0.5408 - accuracy: 0.7235 - val_loss: 0.5372 - val_accuracy: 0.7263
Epoch 4/30
2079/2079 - 2s - loss: 0.5384 - accuracy: 0.7239 - val_loss: 0.5335 - val_accuracy: 0.7264
Epoch 5/30
2079/2079 - 2s - loss: 0.5352 - accuracy: 0.7261 - val_loss: 0.5307 - val_accuracy: 0.7281
Epoch 6/30
2079/2079 - 2s - loss: 0.5324 - accuracy: 0.7298 - val_loss: 0.5271 - val_accuracy: 0.7314
Epoch 7/30
2079/2079 - 2s - loss: 0.5303 - accuracy: 0.7331 - val_loss: 0.5259 - val_accuracy: 0.7360
Epoch 8/30
2079/2079 - 2s - loss: 0.5285 - accuracy: 0.7344 - val_loss: 0.5245 - val_accuracy: 0.7369
Epoch 9/30
2079/2079 - 2s - loss: 0.5263 - accuracy: 0.7361 - val_loss: 0.5204 - val_accuracy: 0.7397
Epoch 10/30
2079/2079 - 2s - loss: 0.5236 - accuracy: 0.7380 - val_loss: 0.5191 - 

<tensorflow.python.keras.callbacks.History at 0x1fdcf7166a0>

# Export all objects

In [32]:
model.save("NLP_model.h5")

In [33]:
joblib.dump(tokenizer, "tokenizer.pkl")

['tokenizer.pkl']