# Gender Prediction Model

In [None]:
import re
import os
import pandas as pd
import numpy as np
import multiprocessing
import ipython_genutils
import pickle
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.models import model_from_json
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Dropout, MaxPooling1D, LSTM, GRU
from keras.layers import Flatten
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from sklearn.metrics import classification_report
from sklearn import model_selection
from sklearn import utils
from sklearn.utils import shuffle
from keras.preprocessing.text import Tokenizer
from nltk.tokenize import word_tokenize
import progressbar
import matplotlib.pyplot as plt

### read dataset

In [None]:
# Load the raw, header-less dataset (ISO-8859-1 encoded) and label its
# first column 'name'.
raw_frame = pd.read_csv('lib/dataset', header=None, encoding="ISO-8859-1")
df = raw_frame.rename(columns={0: 'name'})
tmp = df['name']
# Quick look at the frame's structure and its first rows.
df.info()
df.head()

# Preprocess

data tiap row di split karena nama dan gender masih dalam satu kolom

In [None]:
# Progress bar layout: percentage, bar, processed counter and ETA.
widgets = [
    progressbar.Percentage(),
    progressbar.Bar(),
    " Processed : ",
    progressbar.Counter(),
    "  ",
    progressbar.ETA(),
]
bar = progressbar.ProgressBar(widgets=widgets, max_value=len(df.index))
bar.start()

name = []
gender = []
for index, row in bar(df.iterrows()):
    # Each raw value is split on "@": field 0 goes to `name`, field 1 to `gender`.
    tmp = row['name'].split("@")
    name.append(tmp[0])
    gender.append(tmp[1])

In [None]:
# Build the dataframe from the two parsed lists.
columns = {'name': name, 'gender': gender}
df = pd.DataFrame(columns)
df.info()
df.head()

cek jumlah data tiap kelas

In [None]:
# Label encoding used throughout the notebook:
#   male   -> 0
#   female -> 1
gender_codes = {'m': 0, 'f': 1}
df['gender'] = df['gender'].map(gender_codes)
# Number of names per class.
df.groupby('gender')['name'].count()

karena tidak seimbang maka data di downsampling

In [None]:
# Shuffle the rows before taking the per-class subsets.
df = shuffle(df)

# Separate the two classes (0 = male, 1 = female).
dfm = df[df['gender'] == 0]
dff = df[df['gender'] == 1]

# Size of the smaller class.
minlen = min(len(dff), len(dfm))

# Keep only the first `minlen` rows of each class so both have equal size.
dfm = dfm.iloc[:minlen]
dff = dff.iloc[:minlen]

# Concatenate female rows first, then male rows, for features and labels alike.
data_training = pd.concat([dff['name'], dfm['name']])
target = pd.concat([dff['gender'], dfm['gender']])

nama di split menjadi list huruf, lalu cek nama yg terpanjang untuk nantinya digunakan dalam padding

In [None]:
# Turn every down-sampled name into a list of lower-cased characters and
# record the longest name (used later as the padding length).
widgets = [progressbar.Percentage(),progressbar.Bar()," Processed : ",progressbar.Counter(),"  ",progressbar.ETA()]
bar = progressbar.ProgressBar(widgets=widgets, max_value=len(data_training))
bar.start()

maxleng=0
tmp=[]
# Iterate over `data_training` (the balanced subset), not the full `df`:
# the features must keep the same length and row order as `target`,
# otherwise the later train/validation split pairs names with wrong labels.
for full_name in bar(data_training):
    a=list(full_name.lower())
    tmp.append(a)
    maxleng=max(maxleng, len(a))
print("max length name : ",maxleng)

menghapus spasi yang ada di depan dan di akhir nama. spasi di antara nama tidak dihapus karena spasi tersebut termasuk fitur

In [None]:
# Remove leading and trailing spaces from every character list.
# Spaces between name parts are kept.
widgets = [progressbar.Percentage(),progressbar.Bar()," Processed : ",progressbar.Counter(),"  ",progressbar.ETA()]
bar = progressbar.ProgressBar(widgets=widgets, max_value=len(tmp))
bar.start()

# Every element of a character list is a single character, so joining and
# re-splitting is lossless. str.strip(' ') trims only spaces at both ends and
# also copes with empty or all-space names, where indexing letter[0] on an
# emptied list would raise IndexError.
tmp2 = [list(''.join(letters).strip(' ')) for letters in bar(tmp)]
data_training=tmp2

In [None]:
# Show the character list of the first cleaned name as a sanity check.
data_training[0]

tokenizing. mengkonversi list huruf menjadi vector angka. menggunakan tokenizer yang sudah dibuat sebelumnya dengan semua dataset. jika belum ada tokenizer maka dibuat terlebih dahulu dengan semua data, tidak di downsampling.

In [None]:
# Save the tokenizer (one-off step, kept commented out).
# NOTE(review): this writes 'tokenizer_letter.pickle' in the working directory,
# while the loading cell reads 'lib/tokenizer_letter.pickle' - confirm the file
# is moved there after it is generated.
#tokenizer = Tokenizer(num_words=30)
#tokenizer.fit_on_texts(data_training)
#sequences = tokenizer.texts_to_sequences(data_training)
#with open('tokenizer_letter.pickle', 'wb') as handle:
#    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

load tokenizer file lalu konversi

In [None]:
# Vocabulary size passed to the Embedding layers of the models below.
nwords=30
# Load the previously fitted character tokenizer. No fresh Tokenizer is built
# here because it would be replaced by the unpickled one straight away.
# NOTE: pickle.load can execute arbitrary code - only load a trusted file.
with open('lib/tokenizer_letter.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)
# Convert every character list into a list of integer token ids.
sequences = tokenizer.texts_to_sequences(data_training)

cek data hasil konversi

In [None]:
# Show the integer sequence produced for the first name.
sequences[0]

In [None]:
# Largest token id over ALL sequences. The index used to pick a sequence was
# never advanced, so only the first sequence was ever inspected.
# default=0 keeps the result defined when there are no tokens at all.
mlet = max((token for seq in sequences for token in seq), default=0)
print(mlet)

padding, menyamakan length vector dengan maxlen = nama terpanjang + 3

In [None]:
# Pad every sequence to a common length: longest name + 3.
pad_len = maxleng + 3
input_seq = pad_sequences(sequences, maxlen=pad_len)

split dataset menjadi data training dan data validasi

In [None]:
# Hold out 20% of the samples for validation; random_state fixes the split.
split = model_selection.train_test_split(
    input_seq,
    target,
    test_size=0.2,
    random_state=0,
)
input_train, input_val, target_train, target_val = split
print('data train\t: ', len(input_train))
print('data validasi\t: ', len(input_val))

cek data yg sudah di preprocess dan ready utk training

In [None]:
# Show the first padded training sample.
input_train[0]

# Model

training 4 model
1. CNN
2. LSTM
3. CNN LSTM
4. CNN GRU


semua model menggunakan output fungsi aktivasi sigmoid, dg optimizer adam dan fungsi loss binary crossentropy.

inisiasi embed dimension, epoch dan batch size

In [None]:
# Hyper-parameters shared by every model below:
# embedding width, number of epochs, and mini-batch size.
embed_dim, epoc, batch_size = 128, 100, 32

build model cnn

In [None]:
# CNN: embedding -> 1D convolution -> global max pooling -> dense -> sigmoid.
model = Sequential([
    Embedding(nwords, embed_dim, input_length=input_train.shape[1], trainable=True),
    Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1),
    GlobalMaxPooling1D(),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

set filepath utk save weights dan set callback untuk mengevaluasi bahwa hanya jika acc naik maka weights akan di save.

In [None]:
# Save weights only when the monitored validation accuracy improves.
filepath = "weights_gender_letter_cnn.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]
history = model.fit(
    input_train,
    target_train,
    validation_data=(input_val, target_val),
    epochs=epoc,
    batch_size=batch_size,
    shuffle=False,
    callbacks=callbacks_list,
)

score acc

In [None]:
# Restore the checkpointed weights and report the second metric as a percentage.
model.load_weights('weights_gender_letter_cnn.hdf5')
scores = model.evaluate(input_val, target_val)
metric_name, metric_pct = model.metrics_names[1], scores[1]*100
print("\n%s: %.2f%%" % (metric_name, metric_pct))

In [None]:
from sklearn.metrics import classification_report

# Index 0 -> male, index 1 -> female (same as the label encoding).
target_names = ['male', 'female']
predict = model.predict_classes(input_val)
y_true, y_pred = target_val, predict
print(classification_report(y_true, y_pred, target_names=target_names))

plot acc dan loss

In [None]:
# Training vs. validation accuracy per epoch.
for curve in ('acc', 'val_acc'):
    plt.plot(history.history[curve])
plt.title('Model accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation loss per epoch.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

build model lstm

In [None]:
# LSTM: embedding followed by two stacked LSTM layers, with dropout after
# the embedding and after each LSTM, ending in a sigmoid output.
model = Sequential([
    Embedding(nwords, embed_dim, input_length=input_train.shape[1], trainable=True),
    Dropout(0.2),
    LSTM(100, return_sequences=True),
    Dropout(0.2),
    LSTM(100, return_sequences=False),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

set filepath utk save weights dan set callback untuk mengevaluasi bahwa hanya jika acc naik maka weights akan di save.

In [None]:
# Save weights only when the monitored validation accuracy improves.
filepath = "weights_gender_letter_lstm.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]
history = model.fit(
    input_train,
    target_train,
    validation_data=(input_val, target_val),
    epochs=epoc,
    batch_size=batch_size,
    shuffle=False,
    callbacks=callbacks_list,
)

score acc

In [None]:
# Restore the checkpointed weights and report the second metric as a percentage.
model.load_weights("weights_gender_letter_lstm.hdf5")
scores = model.evaluate(input_val, target_val)
metric_name, metric_pct = model.metrics_names[1], scores[1]*100
print("\n%s: %.2f%%" % (metric_name, metric_pct))

In [None]:
from sklearn.metrics import classification_report

# Index 0 -> male, index 1 -> female (same as the label encoding).
target_names = ['male', 'female']
predict = model.predict_classes(input_val)
y_true, y_pred = target_val, predict
print(classification_report(y_true, y_pred, target_names=target_names))

plot acc dan loss

In [None]:
# Training vs. validation accuracy per epoch.
for curve in ('acc', 'val_acc'):
    plt.plot(history.history[curve])
plt.title('Model accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation loss per epoch.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

build model cnn lstm

In [None]:
# CNN-LSTM: two conv/max-pool stages feed two stacked LSTM layers,
# ending in a sigmoid output.
model = Sequential([
    Embedding(nwords, embed_dim, input_length=input_train.shape[1], trainable=True),
    Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(100, return_sequences=True),
    Dropout(0.2),
    LSTM(100, return_sequences=False),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

set filepath utk save weights dan set callback untuk mengevaluasi bahwa hanya jika acc naik maka weights akan di save.

In [None]:
# Save weights only when the monitored validation accuracy improves.
filepath = "weights_gender_letter_cnn-lstm.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]
history = model.fit(
    input_train,
    target_train,
    validation_data=(input_val, target_val),
    epochs=epoc,
    batch_size=batch_size,
    shuffle=False,
    callbacks=callbacks_list,
)

score acc

In [None]:
# Restore the checkpointed weights and report the second metric as a percentage.
model.load_weights("weights_gender_letter_cnn-lstm.hdf5")
scores = model.evaluate(input_val, target_val)
metric_name, metric_pct = model.metrics_names[1], scores[1]*100
print("\n%s: %.2f%%" % (metric_name, metric_pct))

In [None]:
from sklearn.metrics import classification_report

# Index 0 -> male, index 1 -> female (same as the label encoding).
target_names = ['male', 'female']
predict = model.predict_classes(input_val)
y_true, y_pred = target_val, predict
print(classification_report(y_true, y_pred, target_names=target_names))

plot acc dan loss

In [None]:
# Training vs. validation accuracy per epoch.
for curve in ('acc', 'val_acc'):
    plt.plot(history.history[curve])
plt.title('Model accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation loss per epoch.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

build model cnn gru

In [None]:
# CNN-GRU: three conv/max-pool/dropout stages (32, 64, 128 filters), a GRU
# that returns full sequences, then flatten -> dense -> sigmoid.
model = Sequential([
    Embedding(nwords, embed_dim, input_length=input_train.shape[1], trainable=True),
    Conv1D(32, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(),
    Dropout(0.3),
    Conv1D(64, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(),
    Dropout(0.35),
    Conv1D(128, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(),
    Dropout(0.4),
    GRU(50, return_sequences=True),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.45),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

save model

In [None]:
# Serialise the architecture (not the weights) of the current model to JSON.
model_json = model.to_json()
with open("model_gender_letter.json", "w") as out:
    out.write(model_json)

set filepath utk save weights dan set callback untuk mengevaluasi bahwa hanya jika acc naik maka weights akan di save.

In [None]:
# Save weights only when the monitored validation accuracy improves.
filepath = "weights_gender_letter_cnn-lstm2.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]
history = model.fit(
    input_train,
    target_train,
    validation_data=(input_val, target_val),
    epochs=epoc,
    batch_size=batch_size,
    shuffle=False,
    callbacks=callbacks_list,
)

score acc

In [None]:
# Restore the checkpointed weights and report the second metric as a percentage.
model.load_weights("weights_gender_letter_cnn-lstm2.hdf5")
scores = model.evaluate(input_val, target_val)
metric_name, metric_pct = model.metrics_names[1], scores[1]*100
print("\n%s: %.2f%%" % (metric_name, metric_pct))

In [None]:
from sklearn.metrics import classification_report

# Index 0 -> male, index 1 -> female (same as the label encoding).
target_names = ['male', 'female']
predict = model.predict_classes(input_val)
y_true, y_pred = target_val, predict
print(classification_report(y_true, y_pred, target_names=target_names))

plot acc dan loss

In [None]:
# Training vs. validation accuracy per epoch.
for curve in ('acc', 'val_acc'):
    plt.plot(history.history[curve])
plt.title('Model accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation loss per epoch.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

# Predict

In [None]:
# Sample name to classify in the cells below.
nama="Choiril Kurniawan"

In [None]:
# Lower-case the name and break it into a list of characters.
nama = list(nama.lower())
# The same character list is placed in the batch twice.
tmp = [nama, nama]
sequences = tokenizer.texts_to_sequences(tmp)
# Pad to the same length that was used for the training inputs.
input_seq = pad_sequences(sequences, maxlen=maxleng+3)

In [None]:
# Predicted class encoding:
#   male   -> 0
#   female -> 1
result = model.predict_classes(input_seq)
first_prediction = result[0]
print(first_prediction)