In [1]:
%matplotlib inline
import os
import urllib
from functools import reduce
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pandas as pd
import sklearn.utils.class_weight
from scipy.spatial.distance import cdist

from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters


In [2]:
origin_names = pd.read_csv(os.path.join('data','text.csv'))
origin_names = np.asarray(origin_names[:]['text']) # Parse the CSV and get the column called text
for i in range(len(origin_names)):
    # Do our best to clean up weird characters
    origin_names[i] = origin_names[i].encode('ascii','replace').decode('ascii').replace(u'\x02','')
print (origin_names[2])

Babbelute


In [10]:
num_of_names = len(origin_names)
names = origin_names[np.random.permutation(num_of_names)]
names = u'\n'.join(names.tolist())

# Find the unique characters in all the names
names_chars = sorted(list(set(names)))


chars_to_int = {}
for i in range(len(names_chars)):
    chars_to_int[names_chars[i]] = i
    
int_to_chars = {b:a for a,b in chars_to_int.items()}
print(names[:100])
print("From chars to indexes:",chars_to_int)
print("From indexes to chars:", int_to_chars)
print("Total num of unique char types:", len(chars_to_int))

Snow White
Mandolin
Ash Winder
Skippy
Fedora
Pardon My Dust
Zippo's Sensation
The Secrets Out
Painte
From chars to indexes: {'p': 62, 'j': 56, 'f': 52, 's': 65, 'V': 41, '\n': 0, 'd': 50, ',': 8, '9': 17, 'D': 23, 'e': 51, 't': 66, '(': 6, '-': 9, '/': 11, "'": 5, '0': 12, '!': 2, '7': 15, 'w': 69, '"': 3, 'X': 43, 'Y': 44, 'A': 20, 'F': 25, '8': 16, 'C': 22, 'O': 34, 'r': 64, '2': 13, 'I': 28, 'E': 24, 'S': 38, 'g': 53, 'W': 42, 'H': 27, 'J': 29, 'T': 39, 'o': 61, 'b': 48, 'z': 72, 'K': 30, 'L': 31, 'q': 63, '?': 19, '6': 14, '&': 4, 'i': 55, 'P': 35, 'v': 68, 'B': 21, 'N': 33, 'y': 71, ')': 7, 'h': 54, 'Q': 36, 'm': 59, 'R': 37, 'a': 47, '`': 46, 'Z': 45, 'u': 67, ' ': 1, 'l': 58, 'n': 60, 'c': 49, '.': 10, 'x': 70, 'U': 40, '=': 18, 'M': 32, 'k': 57, 'G': 26}
From indexes to chars: {0: '\n', 1: ' ', 2: '!', 3: '"', 4: '&', 5: "'", 6: '(', 7: ')', 8: ',', 9: '-', 10: '.', 11: '/', 12: '0', 13: '2', 14: '6', 15: '7', 16: '8', 17: '9', 18: '=', 19: '?', 20: 'A', 21: 'B', 22: 'C', 23: '

In [11]:
# Encode all characters to their ints
encoded = []
for i in names:
    encoded.append(chars_to_int[i])


l = ""
for i in encoded[:50]:
    l += int_to_chars[i]

print (l)
print ("Example encoded: ",encoded[:50])
print ("Doc length in chars: ",len(encoded), "in ints: ", len(names), " should match.")

Snow White
Mandolin
Ash Winder
Skippy
Fedora
Pardo
Example encoded:  [38, 60, 61, 69, 1, 42, 54, 55, 66, 51, 0, 32, 47, 60, 50, 61, 58, 55, 60, 0, 20, 65, 54, 1, 42, 55, 60, 50, 51, 64, 0, 38, 57, 55, 62, 62, 71, 0, 25, 51, 50, 61, 64, 47, 0, 35, 47, 64, 50, 61]
Doc length in chars:  47910 in ints:  47910  should match.


In [12]:
window = 40 # How many characters to consider when building prediction

huge_list_x = []
huge_list_y = []
for i in range(len(encoded) - window - 1):
    huge_list_x.append(encoded[i:window+i])
    huge_list_y.append([encoded[window+i]])
    
split = int(0.75 * len(huge_list_x))
x_train = np.asarray(huge_list_x[:split])
x_test = np.asarray(huge_list_x[split:])
y_train = np.asarray(huge_list_y[:split])
y_test = np.asarray(huge_list_y[split:])

print ("Check that each set shifts by one number per step:")
print(huge_list_x[0], huge_list_y[0])
print(huge_list_x[1], huge_list_y[1])
print(huge_list_x[2], huge_list_y[2])
print ("Length of input and output samples should match (but number of entries in each sample don't):",
       len(huge_list_x), len(huge_list_y))

Check that each set shifts by one number per step:
[38, 60, 61, 69, 1, 42, 54, 55, 66, 51, 0, 32, 47, 60, 50, 61, 58, 55, 60, 0, 20, 65, 54, 1, 42, 55, 60, 50, 51, 64, 0, 38, 57, 55, 62, 62, 71, 0, 25, 51] [50]
[60, 61, 69, 1, 42, 54, 55, 66, 51, 0, 32, 47, 60, 50, 61, 58, 55, 60, 0, 20, 65, 54, 1, 42, 55, 60, 50, 51, 64, 0, 38, 57, 55, 62, 62, 71, 0, 25, 51, 50] [61]
[61, 69, 1, 42, 54, 55, 66, 51, 0, 32, 47, 60, 50, 61, 58, 55, 60, 0, 20, 65, 54, 1, 42, 55, 60, 50, 51, 64, 0, 38, 57, 55, 62, 62, 71, 0, 25, 51, 50, 61] [64]
Length of input and output samples should match (but number of entries in each sample don't): 47869 47869


In [18]:
# From https://github.com/minimaxir/char-embeddings
# Make a matrix that maps chars to the precomputed array of vectors
# So a => index(54) => [0.123, -0.234, ... 287 more ..., 0.001], b = [-.999, 0.00, ... 287 more ..., 0.001], and so on.
# Any chars not in minimaxir's precomputed list is set a value of [0.00, ... 0.000]
char_vecs = {}
glove_char_path = os.path.join('data','glove.840B.300d-char.txt')
glove_char_url = 'http://raw.githubusercontent.com/minimaxir/char-embeddings/master/glove.840B.300d-char.txt'

if not os.path.isfile(glove_char_path):
    print('Downloading: ' + glove_char_url)
    try:
        urllib.request.urlretrieve(glove_char_url, glove_char_path)
    except Exception as inst:
        print(inst)
        print('  Encountered unknown error. Continuing.')

with open(glove_char_path) as f:
    for line in f:
        cleaned = line.strip().split(" ")
        char = cleaned[0]
        vec = np.array(cleaned[1:], dtype=float)
        char_vecs[char] = vec

embedding_matrix = np.zeros((len(chars_to_int), 300))
for char, i in chars_to_int.items():
    embedding_vector = char_vecs.get(char)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
embedding_matrix, len(embedding_matrix)

(array([[ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
          0.      ],
        [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
          0.      ],
        [ 0.02714 , -0.202417,  0.215536, ...,  0.202695, -0.032797,
         -0.039701],
        ...,
        [ 0.246788, -0.299682,  0.091904, ...,  0.153139, -0.092827,
         -0.131775],
        [ 0.239696, -0.272422,  0.09878 , ...,  0.197519, -0.075066,
         -0.113583],
        [ 0.242073, -0.274988,  0.109062, ...,  0.201191, -0.06692 ,
         -0.102592]]), 73)

In [19]:
chars_len = len(chars_to_int)

main_input = tf.keras.Input(shape=(window,))

embedding_layer = tf.keras.layers.Embedding(
    chars_len, 300, input_length=window, weights=[embedding_matrix],
    trainable=False)# This means we don't want to make minimaxirs vocab more like our dataset (to save time)
                    # this might be a bad assumption if your dataset isn't like MagicTheGathering cards.  
embedded = embedding_layer(main_input)

# RNN Layer
rnn = tf.keras.layers.LSTM(256, implementation=2, return_sequences=True)(embedded)
rnn2 = tf.keras.layers.LSTM(256, implementation=2, return_sequences=False)(rnn)

aux_output = tf.keras.layers.Dense(chars_len)(rnn2)
aux_output = tf.keras.layers.Activation('softmax', name='aux_out')(aux_output)

# Hidden Layers (cargo culting a little from minimaxir, but it works)
hidden_1 = tf.keras.layers.Dense(512, use_bias=False)(rnn2)
hidden_1 = tf.keras.layers.BatchNormalization()(hidden_1)
hidden_1 = tf.keras.layers.LeakyReLU()(hidden_1) # I think leakies are always just better than standard ReLUs.

hidden_2 = tf.keras.layers.Dense(256, use_bias=False)(hidden_1)
hidden_2 = tf.keras.layers.BatchNormalization()(hidden_2)
hidden_2 = tf.keras.layers.LeakyReLU()(hidden_2)

# Traditional end to a classification problem, end with number of nodes equal to possible catagories and softmax it
main_output = tf.keras.layers.Dense(chars_len)(hidden_2)
main_output = tf.keras.layers.Activation('softmax', name='main_out')(main_output)

# Two outputs because we want to make sure the model doesn't get too far off into the weeds
# by having it have to show immediate progress after the RNN layers, which acts to regularize 
# (keeps the model from overfitting) the model. So we're checking the model against both its
# guess after just the RNN layers, and then after the whole model finishes.
# Think of it as a partial credit reward system.
model = tf.keras.Model(inputs=main_input, outputs=[main_output, aux_output])

optimizer = tf.keras.optimizers.Adam()
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer, loss_weights=[1., 0.2]) 
                                    # Penalize it for bad initial guesses after the RNN layers but not too much (0.2)
                                    # vs being wrong at the end (1.0 penalty).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 40, 300)      21900       input_2[0][0]                    
__________________________________________________________________________________________________
lstm_3 (LSTM)                   (None, None, 256)    570368      embedding_2[0][0]                
__________________________________________________________________________________________________
lstm_4 (LSTM)                   (None, 256)          525312      lstm_3[0][0]                     
__________________________________________________________________________________________________
dense_6 (D

In [24]:
# Everything likes to be one hot encoded:
def quick_one_hot_encoding(items):
    one_hotted_items = []
    for i in items:
        hot = np.zeros(chars_len)
        hot[i[0]] = 1 # i is not one hot encoded as array "hot"
        one_hotted_items.append(hot) # Add hot back to our encoded list
    return np.asarray(one_hotted_items)

y_train = np.asarray(huge_list_y[:split])
y_train_hot = quick_one_hot_encoding(y_train)

y_test = np.asarray(huge_list_y[split:])
y_test_hot = quick_one_hot_encoding(y_test)
print ("One hot encoded example:", y_train[0],y_train_hot[0])    

# If your tet has an uneven number of characters (hint: it almost certainly does)
# create a list of class weights to balance things out. The idea is to keep signals
# for white-space and 'e's and 's'es in English from overpowering all the other signals
y_labels = []
for i in range(len(y_train)):
    y_labels.append(y_train[i][0])
weights = sklearn.utils.class_weight.compute_class_weight('balanced', np.unique(y_labels), y_labels)

weights[chars_to_int['\n']], weights[chars_to_int[' ']], weights[chars_to_int['s']],\
weights[chars_to_int['e']], weights[chars_to_int['z']], weights # Unusual letters get more weight

One hot encoded example: [50] [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]


(0.1379507771522988,
 0.256811760077256,
 0.37484338456398264,
 0.1620944365682087,
 3.112623547771805,
 array([1.37950777e-01, 2.56811760e-01, 4.91794521e+02, 2.45897260e+02,
        9.83589041e+01, 4.86925268e+00, 2.04914384e+01, 2.04914384e+01,
        7.02563601e+01, 1.00366229e+01, 1.53685788e+01, 4.91794521e+02,
        2.45897260e+02, 4.91794521e+02, 4.91794521e+02, 4.91794521e+02,
        2.45897260e+02, 4.91794521e+02, 1.63931507e+02, 7.02563601e+01,
        1.94385186e+00, 1.19367602e+00, 1.21131655e+00, 1.45501337e+00,
        3.90313112e+00, 2.28741637e+00, 2.49641889e+00, 3.03576865e+00,
        5.12285959e+00, 3.48790440e+00, 3.69770316e+00, 2.38735204e+00,
        1.11014564e+00, 4.03110263e+00, 6.22524710e+00, 1.71956126e+00,
        1.32917438e+01, 1.94385186e+00, 7.16901633e-01, 1.52258365e+00,
        1.14370819e+01, 8.33550035e+00, 2.94487737e+00, 3.78303477e+01,
        1.53685788e+01, 7.34021672e+00, 4.91794521e+02, 1.88571519e-01,
        1.49028643e+00, 7.045766

In [26]:
data_checkpoint = os.path.join('data','text.checkpoint.h5')
if os.path.isfile(data_checkpoint):
    model.load_weights(data_checkpoint)
checkpoint = tf.keras.callbacks.ModelCheckpoint(data_checkpoint, save_weights_only=True, 
                                                monitor='val_loss', mode='min', verbose=1, save_best_only=True)
callbacks_list = [checkpoint]
model.optimizer.lr = 0.001
model.fit(x_train, [y_train_hot,y_train_hot], # Note the two expected outputs are the same
          validation_data=(x_test, [y_test_hot, y_test_hot]), 
          epochs=10, batch_size=128, callbacks=callbacks_list, class_weight=[weights, weights]) # Class weights per output
model.save_weights('text.model_weights.h5')

Train on 35901 samples, validate on 11968 samples
Epoch 1/1
Epoch 00001: val_loss improved from inf to 1.39255, saving model to data\text.checkpoint.h5



In [27]:
# From minimaxir from keras
# adds some randomness (controlled by the temperature value 0 -> 1 log)
# to the predicted values to keep the model from looping.
# Taken from Boltzmann sampling  
def textgenrnn_sample(preds, temperature):
    '''
    Samples predicted probabilities of the next character to allow
    for the network to show "creativity."
    '''

    preds = np.asarray(preds).astype('float64')

    if temperature is None or temperature == 0.0:
        return np.argmax(preds)

    preds = np.log(preds + 1e-12) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    index = -1

    probas = np.random.multinomial(1, preds, 1)
    index = np.argmax(probas)
    return index

In [28]:
seed = x_train[102:103]
model.predict(seed)
text = ""
i = 0
while True:
    # Generate the next character, pop the first char off the seed, tack on the new char to the end of the seed list, and rerun
    # also called "moving the window"
    new_char = textgenrnn_sample(model.predict(seed)[0][0], 0.95) # 0->no randomness in sampling, 1->lots of randomness
    seed = seed[0][1:].tolist()
    seed.append(new_char)
    text += int_to_chars[new_char]
    seed = np.asarray([seed])
    i+=1
    if i>5000 and int_to_chars[new_char]=="\n": # how many characters we want to generate, 
        break                                    # and give it a chance to finish its thought
        
    
print(text)

ass
Hazel
Rebel Prades
Hango
Laddie
Arthur
Tody
Mysty
Santos
Baront
Pheance
Riuanne
Cappie Piquilla
Foxy Lady
Pilot
Pass N Stip
Suzie
Chamber
Shesa
Watson
Dakota Grace
Pebasco
Black Legend
Apple Buise
Painters
Mig
Burager
Footsittaz Dancer
Ragelle
Nikita
Grace Novelle
Dambit
Yokini
Rainy
Mander Story
All Natural Elem
Fox Fire
Lisi Prince
Pride
Moejo the Mistmire
Aureland Stars
Picass Serapher
Felicitles Advocation
Maple Stars)
Endles)
Cappie
Gem
Pass Rose
Pavana
Cherokee Prize of the Wind
Andrejan
Divasco
Gino
Eagle Expectations Fiasce
Red Sponer
Sako
Camuna
Daze
Senshi (alibue
Star Glory
Desert Majesty
Cowboys Delight
Trapper
Rusty
Shasta
Acro Be Style
My Friend
Shanti (means Mist
Jumper
Murphy
On the Rose
Reds
Clover Boy
Herris
Suble Angel
Tempo
Just Between Rebellion
Ima Perdy
Piestian Glory
Unity
Topaz
Mountain
Dimite
Finnigan
Red Stepper
Trad
Deputy
Mica
Mungle Commet
Shermarkman
Weston
Gotta-Go (G-G)
Lime Tate
Court Bug
Oscada
Robbie
Fizz
Botan Heathern Wonder
Pippy Gly Day
Woods

In [29]:
results = []
new_names = text.split("\n")
originals = names.split('\n')
for i in new_names: # Remove duplicates (you'll get a lot)
    if i not in originals:
        results.append(i)
print (len(results), len(new_names), len(np.unique(originals)))
results

285 491 4740


['ass',
 'Rebel Prades',
 'Hango',
 'Baront',
 'Pheance',
 'Riuanne',
 'Cappie Piquilla',
 'Pass N Stip',
 'Suzie',
 'Pebasco',
 'Black Legend',
 'Apple Buise',
 'Painters',
 'Burager',
 'Footsittaz Dancer',
 'Ragelle',
 'Grace Novelle',
 'Dambit',
 'Yokini',
 'Mander Story',
 'All Natural Elem',
 'Lisi Prince',
 'Moejo the Mistmire',
 'Aureland Stars',
 'Picass Serapher',
 'Felicitles Advocation',
 'Maple Stars)',
 'Endles)',
 'Pass Rose',
 'Pavana',
 'Cherokee Prize of the Wind',
 'Andrejan',
 'Divasco',
 'Eagle Expectations Fiasce',
 'Red Sponer',
 'Camuna',
 'Senshi (alibue',
 'Star Glory',
 'Desert Majesty',
 'Acro Be Style',
 'Shanti (means Mist',
 'On the Rose',
 'Clover Boy',
 'Herris',
 'Suble Angel',
 'Just Between Rebellion',
 'Ima Perdy',
 'Piestian Glory',
 'Dimite',
 'Red Stepper',
 'Trad',
 'Mungle Commet',
 'Shermarkman',
 'Lime Tate',
 'Court Bug',
 'Oscada',
 'Botan Heathern Wonder',
 'Pippy Gly Day',
 'Cuddello Yeview',
 'Wonder Rain',
 'El Percy',
 'Jay Trader',
 'F