In [1]:
#VALIDATION_SET: built in to model.fit
#ACTIVATION: sigmoid
#HYPERPARAMETERS: gridsearch
#LOSS: binary_crossentropy
#OPTIMIZER: adam
#VISUALIZATION: loss vs LSTM epoch, f1 score

In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from matplotlib import pyplot
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers.core import Dropout
from keras.layers.core import Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
import sklearn
import pandas as pd
import numpy as np
import re
from sklearn import metrics
from keras.preprocessing import sequence
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

Using TensorFlow backend.


In [2]:
females_names_path = '/Users/beh502/Downloads/names/female.txt'
male_names_path = '/Users/beh502/Downloads/names/male.txt'
internet_words_path = '/Users/beh502/Downloads/unigram_freq.csv'

In [3]:
with open(females_names_path) as f:
    female_lines = f.read().splitlines()
with open(male_names_path) as f:
    male_lines = f.read().splitlines()

In [4]:
names_list = list(set(male_lines + female_lines))
names_df = pd.DataFrame(np.array(names_list), columns = ['word'])
names_df['target'] = 1

In [5]:
# create DF out of internet words, drop count column add target column
#want 50/50 distribution in data, no bias
internet_df = pd.read_csv(internet_words_path, nrows=names_df.size)
internet_df['target'] = 0
internet_df = internet_df.drop(['count'], axis=1)
# use outer so names in internet_df that appear in names_df get target 1
merged_df = pd.merge(names_df, internet_df, how='outer', on=['word', 'target'])

In [6]:
merged_df = merged_df.dropna()
merged_df['word'] = merged_df['word'].str.lower()
merged_df['word'] = merged_df['word'].str.strip()
merged_df['word_length'] = merged_df['word'].apply(lambda x: len(x))
X = merged_df['word']
y = merged_df['target']

In [7]:
valid_chars = {x:idx+1 for idx, x in enumerate(set(''.join(X)))}

In [8]:
max_word_len = np.max([len(x) for x in X])
max_features = len(valid_chars) + 1
x_data_sequences = [[valid_chars[char] for char in word] for word in X]
x_data_sequences = sequence.pad_sequences(x_data_sequences, maxlen=max_word_len)

In [9]:
def create_model(shuffle=True, optimizer='Adam', dropout=0.5, embed_dimensions=64):
    embedding_layer = Embedding(max_features, output_dim=embed_dimensions, input_length=max_word_len)
    lstm_layer = LSTM(max_features)
    dropout_layer = Dropout(dropout)
    dense_layer = Dense(1)
    sigmoid_layer = Activation('sigmoid')
    model = Sequential([embedding_layer, lstm_layer, dropout_layer, dense_layer, sigmoid_layer])
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [10]:
X_train, X_test, y_train, y_test = train_test_split(x_data_sequences, y, test_size=0.2, random_state=0)

In [None]:
batch_size = 32
epochs = 5
hyperparams_dict = {
    'optimizer': ['SGD', 'Adam', 'RMSprop', 'Adagrad', 'Adadelta', 'Adamax'],
    'dropout': [0.1, 0.2, 0.4],
    'embed_dimensions': [64,128,256],
    'batch_size': [20, 32, 40],
    'epochs': [18]
}

model = KerasClassifier(build_fn=create_model, verbose=1)
grid = GridSearchCV(estimator=model, param_grid=hyperparams_dict, n_jobs=-1, verbose=1)
grid_result = grid.fit(X_train, y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 3 folds for each of 162 candidates, totalling 486 fits
Epoch 1/18
Epoch 1/18
Epoch 1/18
Epoch 1/18
Epoch 1/18
Epoch 1/18
Epoch 1/18
Epoch 1/18
  200/12125 [..............................] - ETA: 28s - loss: 0.6130 - acc: 0.7000Epoch 2/18
  820/12126 [=>............................] - ETA: 32s - loss: 0.6376 - acc: 0.6646Epoch 2/18
  660/12125 [>.............................] - ETA: 31s - loss: 0.4603 - acc: 0.7848Epoch 2/18
  540/12125 [>.............................] - ETA: 32s - loss: 0.4645 - acc: 0.7815Epoch 2/18
  240/12125 [..............................] - ETA: 32s - loss: 0.6361 - acc: 0.6625Epoch 3/18
Epoch 3/18
 1280/12125 [==>...........................] - ETA: 30s - loss: 0.4442 - acc: 0.7945Epoch 4/18
  300/12125 [..............................] - ETA: 30s - loss: 0.6116 - acc: 0.6800Epoch 6/18
  360/12125 [..............................] - ETA: 32s - loss: 0.6112 - acc: 0.6806Epoch 6/18
  920/12125 [=>............................] - ETA: 29s - loss: 0.6149 - acc: 

  360/12125 [..............................] - ETA: 32s - loss: 0.4123 - acc: 0.8194Epoch 9/18
Epoch 11/18
  200/12126 [..............................] - ETA: 28s - loss: 0.3910 - acc: 0.8100Epoch 11/18
  400/12126 [..............................] - ETA: 29s - loss: 0.3745 - acc: 0.8375Epoch 11/18
 2460/12125 [=====>........................] - ETA: 25s - loss: 0.4222 - acc: 0.8073Epoch 11/18
 1120/12126 [=>............................] - ETA: 29s - loss: 0.4103 - acc: 0.8036Epoch 13/18
 2780/12125 [=====>........................] - ETA: 25s - loss: 0.4284 - acc: 0.8040Epoch 14/18
  400/12126 [..............................] - ETA: 30s - loss: 0.3799 - acc: 0.8250Epoch 16/18
 1440/12126 [==>...........................] - ETA: 29s - loss: 0.4243 - acc: 0.8000Epoch 16/18
Epoch 17/18
Epoch 18/18
 2740/12125 [=====>........................] - ETA: 24s - loss: 0.3877 - acc: 0.8230Epoch 18/18
Epoch 1/18
Epoch 2/18
 2660/12125 [=====>........................] - ETA: 26s - loss: 0.6287 - acc: 0