In [1]:
import time
import sys
import re
import unicodedata

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.contrib import layers

from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout
from keras.layers import LSTM, GRU
from keras.layers import Activation
from keras.optimizers import RMSprop

from keras.preprocessing import sequence

from __future__ import print_function


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# determine dictionary of characters

infile = 'fundnames.txt'
data = open(infile, 'r').read()
chars = sorted(list(set(data)))

# add start/end chars (pointless if it has \n separators)
STARTCHAR ='{'
ENDCHAR = '}'
chars.append(STARTCHAR)
chars.append(ENDCHAR)
print(chars)
DATA_SIZE, VOCAB_SIZE = len(data), len(chars)
print('data has %d characters, %d unique.' % (DATA_SIZE, VOCAB_SIZE))

char_to_ix = {ch: i for i, ch in enumerate(chars)}
ix_to_char = {i: ch for i, ch in enumerate(chars)}

def mapstring(str):
    return list([char_to_ix[c] for c in str])


['\n', ' ', '!', '"', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', '{', '}']
data has 600287 characters, 59 unique.


In [3]:
#for each string
#0-length string predicts first letter
#each substring predicts following letter
#full string predicts end of line char

def mapsubstrings(str):
    substrings = []
    nextchars = []
    
    str = STARTCHAR + str
    for i in range(len(str)):
        substrings.append(str[:i])
        nextchars.append(str[i])

    substrings.append(str)
    nextchars.append(ENDCHAR)
        
    return(zip(substrings,nextchars)[1:])
    
mapsubstrings('aloha')


[('{', 'a'),
 ('{a', 'l'),
 ('{al', 'o'),
 ('{alo', 'h'),
 ('{aloh', 'a'),
 ('{aloha', '}')]

In [4]:
# cut the file into strings and pad them
with open(infile) as f:
    content = f.readlines()
    
source_strings = []
target_strings = []
for str in content:
    str = str.strip()
    for source, target in mapsubstrings(str):
        source_strings.append(source)
        target_strings.append(target)
        
print(source_strings[:1017])
print(target_strings[:1017])

['{', '{1', '{10', '{10-', '{10-1', '{10-15', '{10-15 ', '{10-15 a', '{10-15 as', '{10-15 ass', '{10-15 asso', '{10-15 assoc', '{10-15 associ', '{10-15 associa', '{10-15 associat', '{10-15 associate', '{10-15 associates', '{10-15 associates ', '{10-15 associates i', '{10-15 associates in', '{10-15 associates inc', '{', '{1', '{10', '{10k', '{10k ', '{10k c', '{10k ca', '{10k cap', '{10k capi', '{10k capit', '{10k capita', '{10k capital', '{10k capital ', '{10k capital l', '{10k capital ll', '{10k capital llc', '{', '{1', '{10', '{10x', '{10x ', '{10x i', '{10x in', '{10x inv', '{10x inve', '{10x inves', '{10x invest', '{10x investm', '{10x investme', '{10x investmen', '{10x investment', '{10x investments', '{', '{1', '{10', '{10x', '{10x ', '{10x i', '{10x in', '{10x inv', '{10x inve', '{10x inves', '{10x invest', '{10x investm', '{10x investme', '{10x investmen', '{10x investment', '{10x investments', '{10x investments ', '{10x investments (', '{10x investments (p', '{10x investments 

In [5]:
# map to 1-hot vectors
MAXLEN=max(list([len(str) for str in source_strings]))
print("longest string: %d chars" % MAXLEN)

X = np.zeros((len(source_strings), MAXLEN, len(chars)), dtype=np.bool)
y = np.zeros((len(target_strings), len(chars)), dtype=np.bool)

for i, source_string in enumerate(source_strings):
    for t, char in enumerate(source_string):
        X[i, t, char_to_ix[char]] = True

for i, target_string in enumerate(target_strings):
    y[i, char_to_ix[target_string[0]]] = True

print(source_strings[0])
print(X.shape)
print(X[0][0])

print(target_strings[0])
print(y.shape)
print(y[0])

# select about 1/3 of the rows at random to speed things up
sample_indexes = np.random.choice(X.shape[0], size=X.shape[0]/3, replace=False)
X_sample = X[sample_indexes, :]
y_sample = y[sample_indexes, :]

longest string: 111 chars
{
(600287, 111, 59)
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False  True False]
1
(600287, 59)
[False False False False False False False False False False False False
 False False False  True False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False]


In [6]:
def vec_to_char(v):
    char_index = np.argmax(v)
    return ix_to_char[char_index]

def vec_to_string(x):
    retstr=''
    for i in range(1, MAXLEN):
        one_hot_char=vec_to_char(x[i])
        if one_hot_char == '\n':
            break
        else:
            retstr += (one_hot_char)
    return retstr



In [10]:
bestmodel = 'fund_name_0.723500.h5'
model = load_model(bestmodel)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_1 (GRU)                  (None, 111, 128)          72192     
_________________________________________________________________
gru_2 (GRU)                  (None, 64)                37056     
_________________________________________________________________
dense_1 (Dense)              (None, 59)                3835      
_________________________________________________________________
activation_1 (Activation)    (None, 59)                0         
Total params: 113,083
Trainable params: 113,083
Non-trainable params: 0
_________________________________________________________________


In [None]:

#model.add(GRU(512, input_shape=(MAXLEN, len(chars)), return_sequences=True))
#model.add(GRU(512, input_shape=(MAXLEN, len(chars))))
#odel.add(LSTM(512, input_shape=(MAXLEN, len(chars)), return_sequences=True))
model = Sequential()
model.add(GRU(128, input_shape=(MAXLEN, len(chars)), return_sequences=True))
model.add(GRU(64, input_shape=(MAXLEN, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

#optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())


In [8]:
# shuffle training data and hold out 10000 for xval
sample_size = X.shape[0]
xval_size = 10000

sample_indexes = np.random.choice(sample_size, size=sample_size, replace=False)

X_xval = X[sample_indexes[:xval_size], :]
y_xval = y[sample_indexes[:xval_size], :]

X_sample = X[sample_indexes[xval_size:], :]
y_sample = y[sample_indexes[xval_size:], :]

print(vec_to_string(X_sample[15]))
print(vec_to_char(y_sample[15]))
print(vec_to_string(X_xval[15]))
print(vec_to_char(y_xval[15]))

th
e
wcas fraser sullivan inve
s


In [None]:
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

y_xval_char=[vec_to_char(xv_y) for xv_y in y_xval]

# keep training until xval accuracy maxes out
for i in range(100):
    print ("Starting %s" % time.strftime("%H:%M:%S"))
    fit = model.fit(X_sample, y_sample,
                    batch_size=1024,
                    epochs=1)
    accuracy = fit.history['acc'][-1]
    print ("Finished %s" % time.strftime("%H:%M:%S"))
    y_pred_xval = model.predict(X_xval)
    y_pred_xval_char=[vec_to_char(pred) for pred in y_pred_xval]
    y_eq = [ypc == ytc for ypc, ytc in zip (y_pred_xval_char,y_xval_char)]
    xval_acc = sum(y_eq)*1.0/xval_size
    print('Xval Accuracy: %.6f' % (xval_acc))
    print ("Saving fund_name_%f.h5" % xval_acc)
    model.save('fund_name_%f.h5' % xval_acc)


Starting 06:41:16
Epoch 1/1
Finished 06:43:36
Xval Accuracy: 0.720700
Saving fund_name_0.720700.h5
Starting 06:43:45
Epoch 1/1
Finished 06:46:04
Xval Accuracy: 0.720700
Saving fund_name_0.720700.h5
Starting 06:46:12
Epoch 1/1
Finished 06:48:31
Xval Accuracy: 0.715900
Saving fund_name_0.715900.h5
Starting 06:48:40
Epoch 1/1
Finished 06:50:58
Xval Accuracy: 0.708400
Saving fund_name_0.708400.h5
Starting 06:51:07
Epoch 1/1
Finished 06:53:25
Xval Accuracy: 0.717800
Saving fund_name_0.717800.h5
Starting 06:53:34
Epoch 1/1
Finished 06:55:52
Xval Accuracy: 0.716300
Saving fund_name_0.716300.h5
Starting 06:56:01
Epoch 1/1
Finished 06:58:20
Xval Accuracy: 0.718700
Saving fund_name_0.718700.h5
Starting 06:58:28
Epoch 1/1
Finished 07:00:47
Xval Accuracy: 0.720100
Saving fund_name_0.720100.h5
Starting 07:00:56
Epoch 1/1
Finished 07:03:15
Xval Accuracy: 0.712200
Saving fund_name_0.712200.h5
Starting 07:03:23
Epoch 1/1
Finished 07:05:41
Xval Accuracy: 0.715400
Saving fund_name_0.715400.h5
Starting 0

In [None]:
# def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


def random_fund_name(first_letters='', temperature=0.667):
    x = np.zeros((1, MAXLEN, len(chars)))
    x[0] = X[0]
    i = 1
    for c in first_letters:
        try:
            x[0][i][char_to_ix[c]] = 1
            i += 1
        except:
            pass

#        print(vec_to_string(x[0]))
        
    for j in range(len(first_letters), MAXLEN-1):
#        print(vec_to_string(x[0]))
        preds = model.predict(x, verbose=0)
        probs = preds[0]
        #next_index = np.argmax(np.random.multinomial(1, probs, 1))
        next_index=sample(probs, temperature=temperature)
        if ix_to_char[next_index] == ENDCHAR:
            break
        x[0][j+1][next_index]=1

    return vec_to_string(x[0])

In [None]:
for _ in range(10):
    #print(random_fund_name())    
    print(random_fund_name(first_letters='st', temperature=1.0))

In [None]:
x = np.zeros((1, MAXLEN, len(chars)))
x[0] = X[0]
print(vec_to_string(x[0]))
preds = model.predict(x, verbose=0)
probs = preds[0]
next_index = np.random.multinomial(1, probs, 1)
next_index

In [None]:
preds = model.predict(x, verbose=1)
probs = preds[0]
next_index = sample(probs)
x[0][4][next_index]=1
vec_to_string(x)

In [None]:
next_index = sample(probs)
ix_to_char[next_index]
x[0][1][next_index]=1vec_to_string(x)

In [None]:
preds = model.predict(x, verbose=1)

In [None]:
probs=preds[0]
probs

In [None]:
sys.path

In [None]:
!cd

In [None]:
!pwd
import time

In [None]:
print('%s Saving...' % time.strftime("%H:%M:%S"))               
modelname = "generator_gru" 
model.save("%s.h5" % modelname)
model.save_weights("%s_weights.h5" % modelname)
with open("%s.json" % modelname, "wb") as fjson:
     fjson.write(model.to_json()) 
print('%s Saved.' % time.strftime("%H:%M:%S"))               




In [None]:
vec_to_string(X[2])

In [None]:
bestmodel = 'fund_name_0.801384.h5'
model = load_model(bestmodel)
model.summary()

In [None]:
X[0].shape

In [None]:
sum(X[0])