In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from matplotlib import pyplot
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers.core import Dropout
from keras.layers.core import Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
import sklearn
import pandas as pd
import numpy as np
import re
from sklearn import metrics
from keras.preprocessing import sequence
from pandas import get_dummies
import operator

Using TensorFlow backend.


# Preprocessing

In [2]:
females_names_path = '/Users/beh502/Downloads/names/female.txt'
male_names_path = '/Users/beh502/Downloads/names/male.txt'
internet_words_path = '/Users/beh502/Downloads/unigram_freq.csv'

In [3]:
with open('/Users/beh502/Downloads/names/female.txt') as f:
    female_lines = f.read().splitlines()
with open('/Users/beh502/Downloads/names/male.txt') as f:
    male_lines = f.read().splitlines()

In [4]:
names_list = list(set(male_lines + female_lines))
female_list = list(set(female_lines))
male_list = list(set(male_lines))

In [5]:
# create dataframes, make equal representation

male_df = pd.DataFrame({'word': male_list, 'target': 'male'})
female_df = pd.DataFrame({'word': female_list, 'target': 'female'})

internet_df = pd.read_csv(internet_words_path) #want 50/50 distribution in data, no bias
internet_df = internet_df.drop(['count'], axis=1)
internet_df['target'] = 'internet'

min_size = min(male_df.shape[0], female_df.shape[0], internet_df.shape[0])
male_df = male_df.head(min_size)
female_df = female_df.head(min_size)
internet_df = internet_df.head(min_size)

In [6]:
# merge them, clean up, apply get_dummies
name_frames = [male_df, female_df]
merged_names_df = pd.concat(name_frames)
merged_names_df = merged_names_df.drop_duplicates(subset='word', keep=False)

frames = [merged_names_df, internet_df]
merged_df = pd.concat(frames)
merged_df = merged_df.drop_duplicates(subset='word', keep='first')
merged_df = pd.get_dummies(merged_df, columns=['target'])
merged_df = merged_df.dropna()

In [7]:
merged_df['word'] = merged_df['word'].str.lower()
merged_df['word'] = merged_df['word'].str.strip()
X = merged_df['word']
y = merged_df[['target_male', 'target_female', 'target_internet']].values

In [8]:
valid_chars = {x:idx+1 for idx, x in enumerate(set(''.join(X)))}

In [9]:
max_word_len = np.max([len(x) for x in X])
max_features = len(valid_chars) + 1
print("Max length of word: ", str(max_word_len))

Max length of word:  15


In [10]:
x_data_sequences = [[valid_chars[char] for char in word] for word in X]
x_data_sequences = sequence.pad_sequences(x_data_sequences, maxlen=max_word_len)

# Modelling

In [11]:
batch_size = 32
epochs = 18
embedding_layer = Embedding(max_features, 1256, input_length=max_word_len)
lstm_layer = LSTM(max_features)
dropout_layer = Dropout(0.2)
dense_layer = Dense(3)
softmax_layer = Activation('softmax')

model = Sequential([embedding_layer, lstm_layer, dropout_layer, dense_layer, softmax_layer])
model.compile(loss='binary_crossentropy', optimizer='adam')

In [12]:
X_train, X_test, y_train, y_test = train_test_split(x_data_sequences, y, test_size=0.2, random_state=0)

In [13]:
history = model.fit(X_train, y_train, epochs = 10, validation_split=0.33, verbose=0)

# Plug n Chug

In [14]:
def get_max_index(my_list):
    index, value = max(enumerate(my_list), key=operator.itemgetter(1))
    return index

In [15]:
def names_to_softmaxes(listed_names):
    name_vectors = [[valid_chars[char] for char in word] for word in listed_names]
    name_vectors = sequence.pad_sequences(name_vectors, maxlen=max_word_len)
    return model.predict(name_vectors)

In [16]:
inverted_valid_chars = {v: k for k, v in valid_chars.items()}

def sequence_to_string(sequence):
    constructed = ''
    inverted_keys = list(inverted_valid_chars.keys())
    for num in sequence:
        if(num != 0):
            constructed += inverted_valid_chars[num]
    return constructed

In [24]:
training_names = [sequence_to_string(x) for x in X_train]
difficult_names = ['taylor','tylor','charli','charlie','charlie','alex','alexandra','alexander']
in_training = set(training_names).intersection(set(difficult_names))
print(list(in_training))

['charlie', 'taylor', 'alexander']


In [35]:
# feel free to try some names yourself. Just put them in the names array.

names = ['tylor','charli','charly','alex','alexandra']
softmaxes = names_to_softmaxes(names)
header = "%s \t \t %s \t \t \t \t %s" % ("word", "y_pred", "flat y_pred")
print(header)
for i in range(0, len(names)):
    domain = names[i]
    softmax = softmaxes[i]
    rounded_y_pred = get_max_index(softmax)
    #this is for pretty printing
    if (len(domain) <= 6):
        result = "%s \t \t %s \t \t %s" % (domain, softmax, rounded_y_pred)
    else:
        result = "%s \t %s \t \t %s" % (domain, softmax, rounded_y_pred)
    print(result)
    

word 	 	 y_pred 	 	 	 	 flat y_pred
tylor 	 	 [ 0.68323016  0.20387457  0.11289527] 	 	 0
charli 	 	 [ 0.24883558  0.74952954  0.00163483] 	 	 1
charly 	 	 [ 0.4942224   0.43937388  0.06640373] 	 	 0
alex 	 	 [ 0.37422207  0.33759364  0.28818434] 	 	 0
alexandra 	 [ 0.01046914  0.98052543  0.00900534] 	 	 1
