# Gender name data prediction

In [88]:
# library imports
import pickle
import pandas as pd
import os.path as path

# Library imports
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn. model_selection import train_test_split
from tensorflow.keras.layers import Dense, Input, Conv1D, Flatten, MaxPool2D, BatchNormalization, Dropout, LSTM, Embedding, Masking
from tensorflow.keras.models import Model
from tensorflow.keras import activations

from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.preprocessing import OneHotEncoder

# Data import

In [89]:
# Importing data
file_name = "all_gender_name"

# SECURITY NOTE(review): pickle.load executes arbitrary code embedded in the
# file, so only open .obj files that come from a trusted source.
with open(f'{file_name}.obj', 'rb') as f:
    data = pickle.load(f)

In [90]:
# First element of the loaded object holds the names, second the gender labels.
X, y = data[0], data[1]

In [91]:
# Distinct lower-cased characters found in the names.
# Sorted so the order is the same on every run; plain set iteration order for
# strings can change between interpreter sessions (hash randomization).
chars = sorted({char.lower() for name in X for char in name})

### ML data prep

In [92]:
# Tokenization of characters.

# Collect the lower-cased alphabet used across all names. Sorting gives every
# character a stable position: plain set iteration order for strings can vary
# between interpreter runs (hash randomization), which would silently
# reshuffle the one-hot columns from one session to the next.
list_of_unique_char = sorted({char.lower() for name in X for char in name})
print(len(list_of_unique_char))

38


In [93]:
# Create a character dictionary: character -> its position in list_of_unique_char
char_dictionary = {
    char: position for position, char in enumerate(list_of_unique_char)
}
# Number of entries processed (kept because later cells reuse the `count` name).
count = len(list_of_unique_char)

In [94]:
# One-hot encode every name: each character becomes a row with a single 1 at
# the index assigned to its lower-cased form in char_dictionary.
vocab_size = len(list_of_unique_char)
X_token = []

for name in X:
    one_hot_rows = []
    for char in name:
        row = [0] * vocab_size
        row[char_dictionary[char.lower()]] = 1
        one_hot_rows.append(row)
    X_token.append(one_hot_rows)

In [95]:
# One-hot encode the gender labels: "M" -> [1, 0], anything else -> [0, 1].
# The labels are read from data[1] (the original source of `y`) rather than
# from `y` itself, so re-running this cell after `y` has been overwritten
# below does not silently turn every label into [0, 1].
y_encoded = [[1, 0] if gender == "M" else [0, 1] for gender in data[1]]
y = y_encoded

In [96]:
# Convert the labels to an array and bring every encoded name to a fixed
# length of 20 time steps; shorter names get all-zero rows appended at the end.
y = np.array(y)
X_token = pad_sequences(X_token, maxlen=20, padding='post', value=0)
X = X_token  # NOTE: from here on X is the padded array, not the raw names
print(y.shape)
print(X.shape)

(96756, 2)
(96756, 20, 38)


In [97]:
# Split into test and train
# A fixed random_state makes the 85/15 split (and therefore the reported
# metrics) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print("")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

Shape of X_train: (82242, 20, 38)
Shape of X_test: (14514, 20, 38)

Shape of y_train: (82242, 2)
Shape of y_test: (14514, 2)


# The machine learning model
An LSTM model, preceded by two Conv1D layers, is used.

In [98]:
# Conv1D and LSTM network:
# Archived experiment, not executed: the string literal below is a pasted
# final-epoch (15/15) training log, and the commented-out lines are presumably
# the single-Conv1D architecture that produced it (TODO confirm).
"""Epoch 15/15
643/643 [==============================] - 67s 104ms/step - loss: 0.2091 - accuracy: 0.9127 - val_loss: 0.2653 - val_accuracy: 0.8933"""

# input_layer = Input(shape=(20,38))
# conv_layer_1 = Conv1D(38,2,padding='same', use_bias=False)(input_layer)
# #conv_layer_2 = Conv1D(38,5,padding='same', use_bias=False)(conv_layer_1)
# masked_input = Masking(mask_value = 0)(conv_layer_1)
# print(masked_input.shape)
# print(masked_input._keras_mask.shape)
# lstm_layer_1= LSTM(38*4)(masked_input)
# print(lstm_layer_1.shape)
# output_layer = Dense(2, activation = 'softmax')(lstm_layer_1)



In [99]:
# Build the classifier graph with the Keras functional API.
# Input: one padded name per sample, 20 time steps x 38 features per step
# (matches the padded array shape printed earlier).
input_layer = Input(shape=(20,38))
# Two stacked 1-D convolutions with 38 filters each (kernel sizes 2 and 3).
# padding='same' keeps the sequence length at 20.
# NOTE(review): use_bias=False is presumably there so that all-zero padded
# windows stay zero for the Masking layer below - confirm.
conv_layer_1 = Conv1D(38,2,padding='same', use_bias=False)(input_layer)
conv_layer_2 = Conv1D(38,3,padding='same', use_bias=False)(conv_layer_1)
# Flag time steps whose features are all 0 so the LSTM can skip them.
masked_input = Masking(mask_value = 0)(conv_layer_2)
print(masked_input.shape)
# NOTE(review): _keras_mask is a private Keras attribute used here only for
# debugging; it may not exist in other Keras versions.
print(masked_input._keras_mask.shape)
# LSTM with 38*4 = 152 units; only the final state is returned.
lstm_layer_1= LSTM(38*4)(masked_input)
print(lstm_layer_1.shape)
# Classification head: 76-unit ReLU layer, then a 2-way softmax matching the
# two-column one-hot labels.
dense_layer_1 = Dense(38*2, activation='relu')(lstm_layer_1)
output_layer = Dense(2, activation = 'softmax')(dense_layer_1)

(None, 20, 38)
(None, 20)
(None, 152)


In [100]:
# Model setup: wrap the layer graph in a Model and configure training.
gender_name_model = Model(inputs=input_layer, outputs=output_layer)
# `metrics` is documented as a list of metrics; a bare string is not accepted
# by every Keras version, so pass a one-element list.
gender_name_model.compile(optimizer='adam',
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])

In [101]:
# Print the layer-by-layer overview (output shapes and parameter counts).
gender_name_model.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 20, 38)]          0         
                                                                 
 conv1d_11 (Conv1D)          (None, 20, 38)            2888      
                                                                 
 conv1d_12 (Conv1D)          (None, 20, 38)            4332      
                                                                 
 masking_6 (Masking)         (None, 20, 38)            0         
                                                                 
 lstm_6 (LSTM)               (None, 152)               116128    
                                                                 
 dense_7 (Dense)             (None, 76)                11628     
                                                                 
 dense_8 (Dense)             (None, 2)                 154 

In [102]:
# Model fit
# NOTE(review): the test split is also used as validation data here, so the
# reported val_* metrics are not an independent held-out estimate.
# validation_data is passed as a tuple (x_val, y_val), the form Keras documents;
# a list is not handled consistently across versions.
history_0=gender_name_model.fit(X_train, 
                                y_train, 
                                validation_data=(X_test, y_test), 
                                batch_size=128, 
                                epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15

KeyboardInterrupt: 

In [None]:
# Pick the first held-out sample. `X_val` is never defined in this notebook;
# the held-out split created above is `X_test`.
test_name = X_test[0]

In [None]:
# Add a leading batch axis so the single sample has the (1, steps, features)
# layout a Keras model expects. The dimensions are taken from the array itself
# (the old hard-coded 16 no longer matches the 20-step padding), and the result
# is stored because reshape returns a new array instead of modifying in place.
test_batch = test_name.reshape((1,) + test_name.shape)

In [None]:
# Display the shape of the selected sample.
test_name.shape

In [None]:
# Predict class probabilities for the held-out split. `X_val` is never defined
# in this notebook; the split created above is `X_test`.
answer = gender_name_model.predict(X_test)

In [None]:
# Print every prediction row, one per line.
for probabilities in answer:
    print(probabilities)

In [None]:
# Inspect the encoded form of the third held-out sample (`X_val` is never
# defined in this notebook; the held-out split is `X_test`).
X_test[2]

In [None]:
# Show the character -> index mapping used for the one-hot encoding.
print(char_dictionary)

In [None]:
# Decode each held-out sample back into its name and print it next to the true
# label and the model output. `X_val` / `y_val` are never defined in this
# notebook; the held-out split created above is `X_test` / `y_test`.

# Inverse lookup (index -> character), built once instead of scanning
# char_dictionary for every character of every name.
index_to_char = {index: char for char, index in char_dictionary.items()}

for n, entry in enumerate(X_test):
    # All-zero padding rows contain no 1, so they add nothing to the name.
    name = "".join(
        index_to_char[index]
        for line in entry
        for index, flag in enumerate(line)
        if flag == 1
    )
    print(f"The name is: {name}")
    print(f"The gender code is: {y_test[n]}")
    print(f"While the result is: {answer[n]}\n")


In [None]:
# Scratch check: built-in round() without ndigits gives the nearest integer.
dd = 0.87
nearest_int = round(dd)
print(nearest_int)

In [None]:
# Count held-out samples whose rounded "M" probability (column 0) disagrees
# with the first column of the one-hot label. `y_val` is never defined in this
# notebook; the held-out labels created above are `y_test`.
predicted_first = np.round(np.asarray(answer)[:, 0])
actual_first = np.asarray(y_test)[:, 0]
count = int(np.sum(predicted_first != actual_first))
print(count)

print(len(y_test))