## Import Library

In [1]:
import numpy as np
import pandas as pd
from keras.layers.core import Dense, Activation, Dropout
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
import re

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Method untuk One Hot Encoding

In [2]:
def one_hot_encode(i, size=28):
    """Return a one-hot vector with a single 1 at position ``i``.

    Parameters
    ----------
    i : int
        Index of the element to set to 1.
    size : int, optional
        Length of the returned vector. Defaults to 28, the length this
        notebook originally hard-coded; pass the vocabulary size when it
        differs.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``size``, all zeros except ``[i] == 1``.

    Raises
    ------
    IndexError
        If ``i`` is out of range for a vector of length ``size``.
    """
    encoded = np.zeros(size)
    encoded[i] = 1
    return encoded

## Preprocessing data

In [3]:
# Read the raw table from disk; later cells use its "Nama" and "gender" columns.
data = pd.read_csv(filepath_or_buffer='datanama.csv')
# Show the number of rows that were loaded.
data.shape[0]

15103

In [4]:
# Discard rows in which every column is missing (partially filled rows are kept).
data = data.dropna(axis=0, how="all")

In [5]:
# Normalise the names: lowercase, then replace every character outside a-z
# with a single space (digits, punctuation and existing whitespace alike).
# The vectorised .str accessor passes missing values through untouched,
# whereas calling re.sub on a missing (float NaN) entry raises TypeError.
data["Nama"] = (
    data["Nama"]
    .str.lower()
    .str.replace(r"[^a-z]", " ", regex=True)
)

In [6]:
# Random train/test split: each row lands in `train` with probability 0.8.
# A dedicated, seeded generator makes the split identical on every
# Restart & Run All without touching numpy's global random state.
split_rng = np.random.RandomState(42)
train_mask = split_rng.rand(len(data)) < 0.8
train = data[train_mask]
test = data[~train_mask]

## Membuat vocabulary index

In [7]:
# Symbol set: the padding marker "END" plus every character that occurs in
# the space-joined names (the join also contributes the space character).
vocabulary = {"END"}
vocabulary.update(' '.join(str(nama) for nama in data["Nama"]))

In [8]:
# Number of distinct symbols (characters plus the "END" padding marker);
# used as the feature dimension of the model input.
len_vocab = len(vocabulary)
# Fixed sequence length: names are truncated to, or padded up to, this many symbols.
maxlen = 32

In [9]:
# Map every vocabulary symbol to its one-hot position.
# Enumerating the sorted symbols (instead of the raw set) keeps the mapping
# stable across kernel restarts: iteration order of a set of strings can
# change between interpreter sessions, which would silently mismatch a
# model or weights saved in an earlier session.
set_index = {symbol: position for position, symbol in enumerate(sorted(vocabulary))}

## Proses data menjadi one hot encode

In [10]:
# Encode the training names as fixed-length sequences of one-hot vectors.
# Each name is cut to at most `maxlen` characters and right-padded with the
# "END" symbol until it is exactly `maxlen` steps long.
namanya = [str(nama)[:maxlen] for nama in train.Nama]
xtrain = []
for nama in namanya:
    encoded = [one_hot_encode(set_index[ch]) for ch in nama]
    encoded.extend(
        one_hot_encode(set_index["END"]) for _ in range(maxlen - len(nama))
    )
    xtrain.append(encoded)

# Two-class target: "m" -> [1, 0]; every other value -> [0, 1].
# NOTE(review): any label other than exactly "m" (including missing values)
# falls into the second class - confirm this matches the data.
ytrain = [[1, 0] if label == "m" else [0, 1] for label in train.gender]

In [11]:
# Sanity check: print the array shapes of the training inputs and targets.
for encoded_split in (xtrain, ytrain):
    print(np.asarray(encoded_split).shape)

(11991, 32, 28)
(11991, 2)


## Membuat model neural network

In [12]:
print('Build model...')
# Two stacked 512-unit LSTM layers, each followed by dropout (rate 0.2),
# then a 2-unit dense layer with softmax for the two gender classes.
# The first LSTM returns the full sequence so the second one can consume it.
model = Sequential([
    LSTM(512, return_sequences=True, input_shape=(maxlen, len_vocab)),
    Dropout(0.2),
    LSTM(512, return_sequences=False),
    Dropout(0.2),
    Dense(2),
    Activation('softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

Build model...
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [13]:
# Encode the held-out names exactly like the training names: cut to
# `maxlen` characters, one-hot encode each character, pad with "END".
namanya = [str(nama)[:maxlen] for nama in test.Nama]
xtest = []
for nama in namanya:
    padding_steps = maxlen - len(nama)
    encoded = [one_hot_encode(set_index[ch]) for ch in nama]
    encoded += [one_hot_encode(set_index["END"]) for _ in range(padding_steps)]
    xtest.append(encoded)

# Two-class target: "m" -> [1, 0]; every other value -> [0, 1].
# NOTE(review): any label other than exactly "m" (including missing values)
# falls into the second class - confirm this matches the data.
ytest = [[1, 0] if label == "m" else [0, 1] for label in test.gender]

In [14]:
# Sanity check: print the array shapes of the test inputs and targets.
for encoded_split in (xtest, ytest):
    print(np.asarray(encoded_split).shape)

(2927, 32, 28)
(2927, 2)


## Training Neural Network

In [15]:
# Train for 50 epochs in mini-batches of 1000 samples.
# The test split doubles as validation data, so its metrics are reported
# after every epoch.
batch_size = 1000
model.fit(
    x=np.array(xtrain),
    y=np.array(ytrain),
    batch_size=batch_size,
    epochs=50,
    validation_data=(np.array(xtest), np.array(ytest)),
)

Instructions for updating:
Use tf.cast instead.
Train on 11991 samples, validate on 2927 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x20de676b710>

## Evaluating score

In [16]:
# Evaluate on the test split; with metrics=['accuracy'] the result unpacks
# into the loss value followed by the accuracy.
test_metrics = model.evaluate(x=np.asarray(xtest), y=np.asarray(ytest))
score, acc = test_metrics
print('score =', score)
print('acc = ', acc)

score = 0.24336701552127465
acc =  0.9173214896408657


## Testing

In [19]:
# Encode a few hand-written names for a quick manual prediction check.
# The names go through the same preprocessing as the training data
# (lowercase, non a-z characters replaced by a space, truncation to
# `maxlen`), so mixed-case or over-long input still yields sequences of
# exactly `maxlen` steps.
# NOTE(review): a character that never occurs in the dataset still raises
# KeyError on the set_index lookup.
x = []
testing = ['wahyu nugroho','shinta kurniawati','yoshi setiawan']
for raw_name in testing:
    cleaned = re.sub(r"[^a-z]", " ", str(raw_name).lower())[:maxlen]
    encoded = [one_hot_encode(set_index[ch]) for ch in cleaned]
    # Right-pad with the "END" symbol up to the fixed sequence length.
    encoded.extend(
        one_hot_encode(set_index["END"]) for _ in range(maxlen - len(cleaned))
    )
    x.append(encoded)

In [20]:
# Class probabilities for the hand-written names; column 0 is the "m"
# class, matching the [1, 0] target encoding used for training.
model.predict(x=np.asarray(x))

array([[9.9814963e-01, 1.8504190e-03],
       [2.9698084e-04, 9.9970299e-01],
       [9.6287429e-01, 3.7125748e-02]], dtype=float32)

In [21]:
# Persist the trained model to disk under this path.
model.save(filepath='GenderClassificationLSTMModel')

In [24]:
# Store only the weights, replacing any existing file at this path.
model.save_weights(filepath='gender_model', overwrite=True)

In [26]:
# Predict class probabilities for the whole test split and keep the first
# column, i.e. the probability of the "m" class.
evals = model.predict(x=np.asarray(xtest))
prob_m = list(evals[:, 0])

In [27]:
# Single-column frame (column label 0) holding the predicted "m" probability per test row.
wew = pd.DataFrame(data=prob_m)

In [33]:
# Attach each test row's name and true gender next to its prediction.
# The index is reset to 0..n-1 first so the values align positionally with
# `wew` rather than by the original row labels of `data`.
wew['name'] = test['Nama'].reset_index(drop=True)
wew['gender'] = test['gender'].reset_index(drop=True)