# Gender name data prediction

In [1]:
# library imports
import pickle
import pandas as pd
import os.path as path

# Library imports
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn. model_selection import train_test_split
from tensorflow.keras.layers import Dense, Input, Conv1D, Flatten, MaxPool2D, BatchNormalization, Dropout, LSTM, Embedding, Masking
from tensorflow.keras.models import Model
from tensorflow.keras import activations

from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.preprocessing import OneHotEncoder

# Data import

In [5]:
# Importing data
# Joining the notebook file name with "../.." and normalising it with abspath
# yields the parent directory of the current working directory.
parent = path.abspath(path.join("03_gender_name_prediction.ipynb" ,"../.."))

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open(f'{parent}/processed/all_gender_name.obj', 'rb') as f:
	data = pickle.load(f)

In [6]:
# Unpack the loaded object: element 0 is iterated as names and element 1 as
# gender labels by the cells below.
X, y = data[0], data[1]

In [7]:
# Distinct lower-cased characters occurring in the names.
# sorted() is used instead of list(set(...)) because set iteration order for
# strings changes between kernel sessions (string hash randomisation), which
# would make the resulting list differ from run to run.
chars = sorted({char.lower() for name in X for char in name})

### ML data prep

In [8]:
# Tokenization of characters: build the vocabulary of distinct lower-cased
# characters found in the names.
#
# The vocabulary is sorted so that the character -> index mapping derived from
# it is identical on every run. With a plain list(set(...)) the order depends
# on per-process string hash randomisation, so the one-hot layout would change
# every time the kernel is restarted.
list_of_unique_char = sorted({char.lower() for name in X for char in name})
print(len(list_of_unique_char))

38


In [9]:
# Character dictionary: maps every vocabulary character to its position in
# list_of_unique_char (0, 1, 2, ...).
char_dictionary = {
    symbol: position for position, symbol in enumerate(list_of_unique_char)
}

In [10]:
# One-hot encode every name: each character becomes a vector of vocabulary
# length holding a single 1 at the index given by char_dictionary.
vocabulary_size = len(list_of_unique_char)

X_token = []
for name in X:
    encoded_name = []
    for char in name:
        one_hot = [0] * vocabulary_size
        one_hot[char_dictionary[char.lower()]] = 1
        encoded_name.append(one_hot)
    X_token.append(encoded_name)

In [11]:
# One-hot encode the gender labels: "M" -> [1, 0], any other value -> [0, 1].
#
# The labels are read from data[1] (the object `y` was originally assigned
# from) instead of from `y` itself. `y` is overwritten at the end of this
# cell, so reading it back would make a second execution of the cell encode
# already-encoded labels; reading the untouched source keeps the cell
# idempotent.
y_encoded = [[1, 0] if gender == "M" else [0, 1] for gender in data[1]]
y = y_encoded

In [12]:
# Bring every encoded name to exactly 20 time steps; shorter names are filled
# with zero vectors at the end ('post' padding).
X_token = pad_sequences(X_token, maxlen=20, padding='post', value=0)
X = X_token

# Labels as a NumPy array so they can be split and fed to the model.
y = np.array(y)

print(y.shape)
print(X.shape)

(96756, 2)
(96756, 20, 38)


In [13]:
# Split into test and train
# A fixed random_state makes the shuffle, and therefore the split and every
# downstream metric, reproducible across kernel restarts.
# NOTE(review): test_size=0.001 holds out only about 0.1% of the rows; confirm
# that such a small hold-out set is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.001, random_state=42
)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print("")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

Shape of X_train: (96659, 20, 38)
Shape of X_test: (97, 20, 38)

Shape of y_train: (96659, 2)
Shape of y_test: (97, 2)


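# The machine learning model
An LSTM model is used: a 1D convolution over the one-hot encoded characters feeds a masked LSTM layer, followed by a softmax output over the two gender classes.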

In [14]:
# Input: one name as 20 time steps with 38 one-hot character channels each.
input_layer = Input(shape=(20,38))
# 1D convolution with 38 filters and kernel size 2; 'same' padding keeps the
# sequence length at 20.
# NOTE(review): use_bias=False presumably keeps windows made only of padding
# zeros at exactly zero, so the Masking layer below can still recognise padded
# time steps after the convolution - confirm this is the intent.
conv_layer_1 = Conv1D(38,2,padding='same', use_bias=False)(input_layer)
# Mask time steps whose features are all equal to 0.
masked_input = Masking(mask_value = 0)(conv_layer_1)
print(masked_input.shape)
# Debug output: shape of the generated mask (_keras_mask is a private
# attribute and may not exist in every Keras version).
print(masked_input._keras_mask.shape)
# LSTM with 38*6 = 228 units; called without return_sequences, so it yields a
# single vector per name.
lstm_layer_1= LSTM(38*6)(masked_input)
print(lstm_layer_1.shape)
# Two-way softmax matching the [1, 0] / [0, 1] label encoding.
output_layer = Dense(2, activation = 'softmax')(lstm_layer_1)

(None, 20, 38)
(None, 20)
(None, 228)


In [15]:
# Model setup
gender_name_model = Model(inputs=input_layer, outputs=output_layer)

# categorical_crossentropy matches the one-hot [1, 0] / [0, 1] labels.
# `metrics` is passed as a list, which is the form documented by Keras; a bare
# string is rejected by newer Keras releases.
gender_name_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [16]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
gender_name_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 20, 38)]          0         
                                                                 
 conv1d (Conv1D)             (None, 20, 38)            2888      
                                                                 
 masking (Masking)           (None, 20, 38)            0         
                                                                 
 lstm (LSTM)                 (None, 228)               243504    
                                                                 
 dense (Dense)               (None, 2)                 458       
                                                                 
Total params: 246,850
Trainable params: 246,850
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Model fit
# validation_data is passed as an (x_val, y_val) tuple, the form documented by
# Keras; a list is not guaranteed to be unpacked the same way in every version.
# NOTE(review): the hold-out split from train_test_split doubles as the
# validation set here, so no data remains that is unseen during training.
history_0 = gender_name_model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=128,
    epochs=15,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


## Import name data

In [46]:
# Data import
# Joining the notebook file name with "../../.." and normalising it with
# abspath yields the directory two levels above the current working directory.
parent = path.abspath(path.join("03_gender_name_prediction.ipynb" ,"../../.."))
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open(f'{parent}/Data/Extracted/df_people.obj', 'rb') as file:
	people_data = pickle.load(file)

In [47]:
def first_name_extractor(name):
    """Return the first name, i.e. the part of ``name`` before the first space.

    Args:
        name: Full name as a string, e.g. ``"Fredrik Kloster Hansen"``.

    Returns:
        The substring up to (not including) the first space. A name without
        any space is returned unchanged, and an empty string yields an empty
        string. The previous character-by-character loop fell through without
        a ``return`` in those two cases and produced ``None``, which breaks
        the character-level encoding applied to the result.
    """
    # str.partition returns the whole string as the first element when the
    # separator is absent, which covers single-word names.
    return name.partition(" ")[0]


# Add a 'first_name' column by applying first_name_extractor to every value of
# the 'navn' column.
people_data['first_name'] = people_data['navn'].map(first_name_extractor)

In [48]:
# Plain Python list of the first names; it is encoded for the model below and
# reused when printing the predictions.
first_name_list = people_data['first_name'].tolist()

In [49]:
# Sanity check: display the first extracted name.
first_name_list[0]

'Fredrik'

In [50]:
# One-hot encode the first names with the same vocabulary and character
# dictionary that were used for the training names. A character that is
# missing from char_dictionary raises KeyError.
vocabulary_size = len(list_of_unique_char)

name_token = []
for name in first_name_list:
    encoded_name = []
    for char in name:
        one_hot = [0] * vocabulary_size
        one_hot[char_dictionary[char.lower()]] = 1
        encoded_name.append(one_hot)
    name_token.append(encoded_name)

In [51]:
# Bring the encoded first names to the 20-step length the model input expects,
# filling with zero vectors at the end.
name_token = pad_sequences(name_token, maxlen=20, value=0, padding='post')
print(name_token.shape)

(1853, 20, 38)


In [52]:
# Run the trained model on the encoded first names. Each output row comes from
# the 2-unit softmax layer; index 0 corresponds to the "M" label ([1, 0]) used
# in the label encoding earlier in the notebook.
predictions = gender_name_model.predict(name_token)



In [53]:
# Show every first name next to its predicted probability pair.
for first_name, probabilities in zip(first_name_list, predictions):
    print(f"The name is: {first_name} and the prediction is: {probabilities}")

The name is: Fredrik and the prediction is: [9.9996066e-01 3.9306058e-05]
The name is: Fredrik and the prediction is: [9.9996066e-01 3.9306058e-05]
The name is: Stian and the prediction is: [0.960796   0.03920402]
The name is: Even and the prediction is: [0.8739496  0.12605041]
The name is: Mohammad and the prediction is: [9.996014e-01 3.985437e-04]
The name is: Thomas and the prediction is: [0.07485382 0.9251462 ]
The name is: Lydia and the prediction is: [5.9567086e-05 9.9994040e-01]
The name is: Karl and the prediction is: [0.99283594 0.00716403]
The name is: Sven-Helge and the prediction is: [0.55874085 0.44125912]
The name is: Gyrid and the prediction is: [9.994228e-01 5.772059e-04]
The name is: Mustfa and the prediction is: [0.06921294 0.9307871 ]
The name is: Brian and the prediction is: [0.9727946  0.02720537]
The name is: Firat and the prediction is: [0.12715147 0.8728486 ]
The name is: Arnt and the prediction is: [9.9955970e-01 4.4023848e-04]
The name is: Anna and the predict

In [54]:
# Collapse each probability pair into a single flag: 1 when the first
# probability (the "M" position of the label encoding) is strictly larger than
# the second, otherwise 0.
sex = [1 if value[0] > value[1] else 0 for value in predictions]

In [55]:
# Attach the predicted flag as a new column; `sex` was built in the same order
# as the rows of people_data, so values line up by position.
people_data['sex'] = sex

In [56]:
# Keep only the columns that are exported below; this drops the helper column
# 'first_name' together with any other columns of the frame.
people_data = people_data[['navn', 'person_id', 'sex']]

In [57]:
# Display the reduced frame to check the result before exporting it.
people_data

Unnamed: 0,navn,person_id,sex
0,Fredrik Kloster Hansen,0,1
1,Fredrik Øren Refsnes,1,1
2,Stian Bua Hellestad,2,1
3,Even Bernhard Bergstrøm Hegbom,3,1
4,Mohammad Shahid,4,1
...,...,...,...
1848,Hassan Omar Ibrahim,1848,1
1849,Pawel Adrian Turala,1849,1
1850,Erlend Nornes,1850,1
1851,Fred Arne Bakken,1851,1


In [58]:
# Data export: write the frame with the predicted 'sex' column to a pickle file.
# Joining the notebook file name with "../../.." and normalising it with
# abspath yields the directory two levels above the current working directory.
parent = path.abspath(path.join("03_gender_name_prediction.ipynb" ,"../../.."))
with open(f'{parent}/Data/Extracted/df_people_gender.obj', 'wb') as file:
	pickle.dump(people_data, file)

In [59]:
# Display the frame that was just written to disk.
people_data

Unnamed: 0,navn,person_id,sex
0,Fredrik Kloster Hansen,0,1
1,Fredrik Øren Refsnes,1,1
2,Stian Bua Hellestad,2,1
3,Even Bernhard Bergstrøm Hegbom,3,1
4,Mohammad Shahid,4,1
...,...,...,...
1848,Hassan Omar Ibrahim,1848,1
1849,Pawel Adrian Turala,1849,1
1850,Erlend Nornes,1850,1
1851,Fred Arne Bakken,1851,1
