# Origin name data prediction

In [1]:
# library imports
import pickle
import pandas as pd
import os.path as path

# Library imports
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn. model_selection import train_test_split
from tensorflow.keras.layers import Dense, Input, Conv1D, Flatten, MaxPool2D, BatchNormalization, Dropout, LSTM, Embedding, Masking
from tensorflow.keras.models import Model
from tensorflow.keras import activations

from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.preprocessing import OneHotEncoder

# Data import

In [2]:
# Load the pickled dataset from "<file_name>.obj" in the working directory.
# NOTE(review): pickle.load can execute arbitrary code while deserialising,
# so only open files that come from a trusted source.
file_name = "origin_data"
pickle_path = f'{file_name}.obj'

with open(pickle_path, 'rb') as pickle_file:
    data = pickle.load(pickle_file)

In [3]:
# Unpack the loaded container: element 0 holds the names, elements 1-4 hold
# the four target lists.
X, y1, y2, y3, y4 = (data[position] for position in range(5))

In [4]:
# Collect every distinct lower-cased character that occurs in the names.
chars = list({char.lower() for name in X for char in name})

### ML data prep

In [5]:
# Tokenization of characters.
#
# Build the character vocabulary: every distinct lower-cased character found
# in the names. The vocabulary is sorted because iterating a set of strings
# follows hash order, which Python randomises per process; without sorting,
# the character -> index mapping would change between kernel sessions and a
# saved model could not be reused with freshly encoded names.
list_of_unique_char = sorted({char.lower() for name in X for char in name})
print(len(list_of_unique_char))

33


In [6]:
# Create a character dictionary
# Maps each vocabulary character to its position in list_of_unique_char;
# that position is the index set to 1 in the character's one-hot vector.
char_dictionary = {
    char: index for index, char in enumerate(list_of_unique_char)
}

In [7]:
# One-hot encode the names: every character becomes a vector with one slot
# per vocabulary entry and a single 1 at the index given by char_dictionary.
vocabulary_size = len(list_of_unique_char)
X_token = []

for name in X:
    encoded_name = []
    for char in name:
        one_hot = [0] * vocabulary_size
        one_hot[char_dictionary[char.lower()]] = 1
        encoded_name.append(one_hot)
    X_token.append(encoded_name)

In [8]:
# One hot encoding of the y lists
# y1 is turned into a single-column array, encoded, and replaced by its
# dense one-hot matrix. The fitted encoder stays available as class_encoder1.
y1_column = np.array(y1).reshape(-1, 1)
class_encoder1 = OneHotEncoder(sparse_output=False)
y1 = class_encoder1.fit_transform(y1_column)

In [9]:
# Inspect the one-hot vector of the first y1 target.
y1[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])

In [10]:
# One-hot encode y2 the same way as y1; the fitted encoder is kept as
# class_encoder2 and the first encoded row is displayed.
y2_column = np.array(y2).reshape(-1, 1)
class_encoder2 = OneHotEncoder(sparse_output=False)
y2 = class_encoder2.fit_transform(y2_column)
y2[0]

array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0.])

In [11]:
# One-hot encode y3; the fitted encoder is kept as class_encoder3 and the
# first encoded row is displayed.
y3_column = np.array(y3).reshape(-1, 1)
class_encoder3 = OneHotEncoder(sparse_output=False)
y3 = class_encoder3.fit_transform(y3_column)
y3[0]

array([0., 0., 0., 0., 1., 0.])

In [12]:
# One-hot encode y4 (the target used for the train/test split below); the
# fitted encoder is kept as class_encoder4 and the first row is displayed.
y4_column = np.array(y4).reshape(-1, 1)
class_encoder4 = OneHotEncoder(sparse_output=False)
y4 = class_encoder4.fit_transform(y4_column)
y4[0]

array([0., 0., 0., 1., 0.])

In [13]:
# Bring every encoded name to a fixed length of 20 timesteps, appending
# all-zero vectors after the name (padding='post'). Both X_token and X end up
# referring to the same padded array.
padded_names = pad_sequences(X_token, maxlen=20, padding='post', value=0)
X_token = padded_names
X = padded_names
print(X.shape)

(5528, 20, 33)


In [14]:
# Split into test and train
# 15% of the names are held out. random_state is fixed so the split, and
# every metric computed from it, is reproducible across kernel restarts.
# No separate validation split is created; only train and test exist.
X_train, X_test, y_train, y_test = train_test_split(
    X, y4, test_size=0.15, random_state=42
)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print("")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

Shape of X_train: (4698, 20, 33)
Shape of X_test: (830, 20, 33)

Shape of y_train: (4698, 5)
Shape of y_test: (830, 5)


# The machine learning model
An LSTM model is used, fed by a stack of 1D convolutions over the one-hot encoded characters.

In [15]:
# Layer sizes are derived from the prepared data rather than hard-coded, so
# the graph follows the padding length, the vocabulary size and whichever
# target was passed to train_test_split.
sequence_length, n_features = X_train.shape[1:]
n_classes = y_train.shape[1]

input_layer = Input(shape=(sequence_length, n_features))

# Four stacked convolutions with growing kernel sizes (2, 3, 5, 7). They have
# no bias and no activation, so a window that covers only zero padding yields
# an all-zero output.
# NOTE(review): presumably this is what lets the Masking layer below still
# recognise padded timesteps - confirm.
conv_layer_1 = Conv1D(n_features, 2, padding='same', use_bias=False)(input_layer)
conv_layer_2 = Conv1D(n_features, 3, padding='same', use_bias=False)(conv_layer_1)
conv_layer_3 = Conv1D(n_features, 5, padding='same', use_bias=False)(conv_layer_2)
conv_layer_4 = Conv1D(n_features, 7, padding='same', use_bias=False)(conv_layer_3)

# Timesteps whose features are all exactly 0 are masked for the LSTM.
masked_input = Masking(mask_value = 0)(conv_layer_4)
print(masked_input.shape)
# NOTE(review): _keras_mask is a private Keras attribute; this debug print
# may break on other Keras versions.
print(masked_input._keras_mask.shape)

lstm_layer_1= LSTM(n_features*4)(masked_input)
print(lstm_layer_1.shape)

# One softmax unit per class of the one-hot encoded target.
output_layer = Dense(n_classes, activation = 'softmax')(lstm_layer_1)

(None, 20, 33)
(None, 20)
(None, 132)


In [16]:
# Model setup
# Wrap the functional graph in a Model and configure it for multi-class
# classification against one-hot targets.
origin_name_model = Model(inputs=input_layer, outputs=output_layer)
origin_name_model.compile(optimizer='adam',
                    loss='categorical_crossentropy',
                    # metrics is documented as a list; newer Keras versions
                    # reject a bare string.
                    metrics=['accuracy'])

In [17]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
origin_name_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 20, 33)]          0         
                                                                 
 conv1d (Conv1D)             (None, 20, 33)            2178      
                                                                 
 conv1d_1 (Conv1D)           (None, 20, 33)            3267      
                                                                 
 conv1d_2 (Conv1D)           (None, 20, 33)            5445      
                                                                 
 conv1d_3 (Conv1D)           (None, 20, 33)            7623      
                                                                 
 masking (Masking)           (None, 20, 33)            0         
                                                                 
 lstm (LSTM)                 (None, 132)               87648 

In [18]:
# Model fit
# Train for 25 epochs with mini-batches of 128 names. validation_data is
# passed as the documented (x, y) tuple.
# NOTE(review): the test split doubles as validation data here, so the
# reported validation metrics are not an independent estimate if they are
# used to tune the model.
history_0=origin_name_model.fit(X_train,
                                y_train,
                                validation_data=(X_test, y_test),
                                batch_size=128,
                                epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [19]:
# NOTE(review): `stop` is not defined anywhere in this notebook, so this cell
# raises NameError. Presumably that is deliberate, to halt "Run All" before
# the exploratory cells below - confirm, and consider an explicit raise with
# a message instead.
stop

NameError: name 'stop' is not defined

In [None]:
# Pick the first held-out sample. The notebook only creates a train/test
# split (no X_val exists), so the sample is taken from X_test.
test_name = X_test[0]

In [None]:
# Add a leading batch axis to the single sample. The target shape is derived
# from the sample itself; a hard-coded (1, 16, 38) does not match the
# (20, 33) samples produced above and would raise ValueError.
# reshape returns a new array, so test_name itself is left unchanged.
test_name.reshape((1,) + test_name.shape);

In [None]:
# Display the shape of test_name.
test_name.shape

In [None]:
# Predict class probabilities for the held-out names with the model trained
# in this notebook. The names gender_name_model and X_val are not defined
# anywhere in it.
answer = origin_name_model.predict(X_test)

In [None]:
# Print one prediction per line.
for prediction in answer:
    print(prediction)

In [None]:
# Inspect the encoded form of the third held-out name. X_val is never
# created in this notebook, so the sample is taken from X_test.
X_test[2]

In [None]:
# Show the character -> index mapping used for the one-hot encoding.
print(char_dictionary)

In [None]:
# Decode every held-out name from its one-hot rows back to text and print it
# next to its true one-hot label and the model's predicted probabilities.
# The held-out data is X_test / y_test; no validation split exists.

# Inverse lookup built once, instead of scanning char_dictionary for every
# character of every name.
index_to_char = {index: char for char, index in char_dictionary.items()}

for n, entry in enumerate(X_test):
    # Padded timesteps are all-zero rows and therefore contribute nothing.
    name = "".join(
        index_to_char.get(index, "")
        for line in entry
        for index, flag in enumerate(line)
        if flag == 1
    )
    print(f"The name is: {name}")
    print(f"The true label is: {y_test[n]}")
    print(f"While the result is: {answer[n]}\n")


In [None]:
# Scratch check of what round() returns for a value just below 1.
dd = 0.87
rounded_dd = round(dd)
print(rounded_dd)

In [None]:
# Count misclassified held-out names. The model emits a softmax over several
# classes, so the predicted class is the argmax of each prediction row and
# the true class is the argmax of the matching one-hot row in y_test.
# Comparing only round(res[0]) would test the first class alone.
count = 0

for predicted, expected in zip(answer, y_test):
    if np.argmax(predicted) != np.argmax(expected):
        count += 1
print(count)

print(len(y_test))