# Imports

In [None]:
import numpy as np
import sys
import re
import random as rd
from collections import Counter
import glob
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import h5py
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.models import load_model
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping

# inspired by https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/

# Open a TensorFlow 1.x session with device-placement logging switched on.
# `sess` is not referenced again in the visible code.
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

# Function definitions

In [None]:
def preproc(text):
    """Normalize a large raw text string before it is split into words.

    Steps, in order: drop the trailing dot of the titles " Mr.", " Mrs."
    and " Messrs.", lowercase everything, remove single-line "[...]"
    spans, turn newlines into spaces, remove "_figure_" markers, strip
    commas, underscores and semicolons, and remove runs of digits.

    Args:
        text: the raw text.

    Returns:
        The cleaned, lowercased text.
    """
    # The dot is escaped on purpose: an unescaped ' Mr.' also matches ' Mrs'
    # (the dot eats the 's') and would rewrite "Mrs." into "Mr.".
    text = re.sub(r' Mr\.', ' Mr', text)
    text = re.sub(r' Mrs\.', ' Mrs', text)
    text = re.sub(r' Messrs\.', ' Messrs', text)
    text = text.lower()
    # '.' does not match newlines, so only brackets closed on the same line go.
    text = re.sub(r'\[(.*?)\]', '', text)
    text = re.sub('\n\n', ' ', text)
    text = re.sub('\n', ' ', text)
    # Has to run before underscores are stripped, otherwise the marker can
    # never match and the bare word "figure" is left behind.
    text = re.sub('_figure_', '', text)
    text = re.sub('[,_;]', '', text)
    text = re.sub(r'\d+', '', text)
    return text

# get all words in target text file, plus all files in a directory for optional pre-training
def get_data(targetname, pretrains=None):
    """Load and tokenize the target text, optionally preceded by pretraining files.

    Every file is passed through ``preproc`` and then split into word and
    punctuation tokens.

    Args:
        targetname: path of the target text file.
        pretrains: optional glob pattern (directory plus file ending) that
            selects the pretraining files; skipped when falsy.

    Returns:
        A list of tokens: all pretraining tokens first, then the target tokens.
    """
    token_pattern = r"[\w']+|[.,!?;]"
    rawwords = []
    if pretrains:  # optional
        filenames = glob.glob(pretrains)
        # 1-based counter so the first file is reported as "1 / N".
        for count, path in enumerate(filenames, start=1):
            # Context manager closes the handle even if reading fails.
            with open(path) as handle:
                text = preproc(handle.read())
            rawwords.extend(re.findall(token_pattern, text))
            print("loading file...", count, "/", len(filenames), " :: ", path)
    print('Words in pretraining set: ', len(rawwords))
    with open(targetname) as handle:  # now do the same for the target file
        text = preproc(handle.read())
    wds = re.findall(token_pattern, text)
    print('Words in target set: ', len(wds))
    rawwords.extend(wds)
    return rawwords

def predict_words(n_pred, words, model, int_to_word, word_to_int, seq_length):
    """Generate ``n_pred`` words with ``model`` from a random seed sequence.

    A window of ``seq_length`` consecutive words is drawn from ``words`` and
    printed as the seed. For each step the window is encoded, fed to the
    model, a word is sampled from the returned probabilities and written to
    stdout, and the window is shifted forward by one word.

    Args:
        n_pred: number of words to generate.
        words: list of words to draw the seed from; must be longer than
            ``seq_length``.
        model: object whose ``predict(x, verbose=0)`` returns one probability
            per vocabulary entry for an input of shape (1, seq_length, 1).
        int_to_word: mapping from vocabulary index to word.
        word_to_int: mapping from word to vocabulary index.
        seq_length: number of words in the model's input window.

    Returns:
        The final window as a list of vocabulary indices.
    """
    # Inputs are scaled by the vocabulary size, the same divisor the training
    # data uses (n_uwords), rather than by the total word count.
    vocab_size = len(word_to_int)
    rare_index = word_to_int.get('RARE')

    start = np.random.randint(0, len(words) - seq_length)  # pick a random seed
    seed_words = words[start:start + seq_length]
    print("Seed: ", " ".join(seed_words))
    pattern = [word_to_int[word] for word in seed_words]
    print("Prediction: ")
    for _ in range(n_pred):
        x = np.reshape(pattern, (1, len(pattern), 1)) / float(vocab_size)
        prediction = model.predict(x, verbose=0)
        # float64 plus renormalisation keeps np.random.multinomial from
        # rejecting float32 outputs whose sum drifts slightly above 1.
        probs = np.asarray(prediction, dtype=np.float64).ravel()
        if rare_index is not None:
            # Never emit the RARE placeholder: zeroing its mass is equivalent
            # to resampling until another word comes up, but cannot loop
            # forever. If RARE holds all the mass, fall back to emitting it.
            masked = probs.copy()
            masked[rare_index] = 0.0
            if masked.sum() > 0:
                probs = masked
        probs = probs / probs.sum()
        choice = int(np.argmax(np.random.multinomial(1, probs)))
        sys.stdout.write(int_to_word[choice])
        sys.stdout.write(" ")
        # Slide the window: drop the oldest word, append the new one.
        pattern = pattern[1:] + [choice]
    return pattern

# Load (pre-)training data

In [None]:
# Tokenize the target text together with every pretraining file.
rawwords = get_data('gwtext.txt', 'pretrain/*.txt')

# Count token frequencies; any token seen 3 times or fewer is replaced by
# the shared 'RARE' placeholder.
word_counts = Counter(rawwords)
words = ['RARE' if word_counts[word] <= 3 else word for word in rawwords]

# Sorted vocabulary and the lookup tables in both directions.
unique_words = sorted(set(words))
word_to_int = {word: index for index, word in enumerate(unique_words)}
int_to_word = dict(enumerate(unique_words))

# Corpus size and vocabulary size.
n_words = len(words)
n_uwords = len(unique_words)
print('Total Words (without rare words): ', n_words)
print('Unique Words (without rare words): ', n_uwords)

# Make training batch files

In [None]:
# Window length in words, and number of training samples per saved batch file.
seq_length = 20
batchsize = 150
# Directory the batch files are written to.
savedir = "traindat"
# Number of complete batches that fit into the corpus (rounded down).
batches = int((n_words - seq_length) / batchsize)
batches

In [None]:
import os

# Create the output directory up front; h5py cannot create a file inside a
# directory that does not exist.
os.makedirs(savedir, exist_ok=True)

count = 0  # index of the first word of the current batch
for bat in range(0, batches):
    # For each batch, roll a seq_length window over the words to create the
    # individual training samples.
    print("Batch ",bat+1," out of ",batches)
    X = np.zeros([batchsize, seq_length, 1])
    dataY = np.zeros([batchsize, 1])
    for batcount, i in enumerate(range(count, count + batchsize)):
        seq_in = words[i:i + seq_length]  # seq_length consecutive words
        seq_out = words[i + seq_length]  # the word right after them (training signal)
        X[batcount, :, 0] = [word_to_int[word] for word in seq_in]
        dataY[batcount, 0] = word_to_int[seq_out]
    X = X / np.float32(n_uwords)  # normalize indices by the vocabulary size
    y = np_utils.to_categorical(dataY, num_classes=n_uwords)  # one-hot targets
    f = savedir+"/"+"train"+str(bat)+".h5"
    # The context manager closes the file even if a write fails.
    with h5py.File(f, 'w') as h5f:
        h5f.create_dataset('X', data=X)
        h5f.create_dataset('y', data=y)
    count += batchsize  # move one batch forward and repeat

In [None]:
# Summarize how many batch files were produced and with which sizes.
print("Done!", batches,"batches of size", batchsize, "(sequences of",seq_length,")")

# Define LSTM model

In [None]:
# Three stacked 256-unit LSTM layers (the first two return full sequences and
# apply input and recurrent dropout) followed by a softmax layer with one
# output per vocabulary entry.
model = Sequential([
    LSTM(256, input_shape=(seq_length, 1), return_sequences=True, recurrent_dropout=0.1, dropout=0.2),
    LSTM(256, input_shape=(seq_length, 1), return_sequences=True, recurrent_dropout=0.1, dropout=0.2),
    LSTM(256),
    Dense(n_uwords, activation='softmax'),
])
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

# (Pre-)train model

In [None]:
import os

episodes=1 # number of passes over the saved batch files
fromdir = "traindat"
# model.save() below writes into this directory, so make sure it exists.
os.makedirs('weights', exist_ok=True)
eploss = []  # mean batch loss of each episode
for ep in range(0,episodes):
    print("Episode ",ep+1," out of ",episodes, "...")
    # Visit the batch files in a fresh random order every episode.
    batarr = list(range(0,batches))
    rd.shuffle(batarr)
    batloss = []
    # 1-based progress counter, matching the other progress messages.
    for count, bat in enumerate(batarr, start=1):
        f = fromdir+"/"+"train"+str(bat)+".h5"
        # The context manager closes the file even if reading fails.
        with h5py.File(f, 'r') as h5f:
            X = h5f['X'][:]
            y = h5f['y'][:]
        loss = model.train_on_batch(X, y)
        batloss.append(loss)
        print("Batch ",count," out of ",batches,"(",bat,")"," :: loss: ",loss)
    eploss.append(np.mean(batloss))
    # Save the full model after every episode.
    modn='weights/model'+str(ep+1)+'.h5'
    model.save(modn)
    print("")

# Predictions after (pre-)training

In [None]:
# Print 40 generated words from a random seed; keep the final index window.
pattern = predict_words(40, words, model, int_to_word, word_to_int,seq_length)


# Get main training set

In [None]:
# Load only the target text (no pretraining files). get_data takes the target
# path as its first argument; the pretraining glob is optional and left out.
rawwords = get_data('gwtext.txt')
# Count word frequencies; words seen 3 times or fewer share the 'RARE' class.
word_counts = Counter(rawwords)
words = [word if word_counts[word] > 3 else 'RARE' for word in rawwords]
# Sorted vocabulary and the lookup tables in both directions.
unique_words = sorted(set(words))
word_to_int = {word: index for index, word in enumerate(unique_words)}
int_to_word = dict(enumerate(unique_words))

# Corpus size and vocabulary size.
n_words = len(words)
n_uwords = len(unique_words)
print('Total Words (without rare words): ', n_words)
print('Unique Words (without rare words): ', n_uwords)

In [None]:
# Slide a seq_length-word window over the corpus: each window, encoded as
# vocabulary indices, is one input sample, and the word that follows it is
# the training signal.
dataX = [
    [word_to_int[word] for word in words[start:start + seq_length]]
    for start in range(n_words - seq_length)
]
dataY = [word_to_int[words[start + seq_length]] for start in range(n_words - seq_length)]
n_patterns = len(dataX)
# Shape [individual samples, length of sequence, features], normalized by the vocabulary size.
X = np.reshape(dataX, (n_patterns, seq_length, 1)) / np.float32(n_uwords)
# One-hot encode the target words.
y = np_utils.to_categorical(dataY, num_classes=n_uwords)


In [None]:
# Remove the last layer and add a fresh softmax layer sized to the current
# vocabulary (n_uwords), then recompile so the model can be retrained on the
# target file only.
model.pop()
model.add(Dense(n_uwords, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [None]:
# Checkpoint on the training loss, keeping only improvements; the epoch
# number and loss value are written into the file name.
filepath = "washington-weight-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

# Train on the target-text samples.
model.fit(X, y, batch_size=200, epochs=5, callbacks=callbacks_list)

In [None]:
# Reload a saved checkpoint, replacing the in-memory model.
# NOTE(review): the file name hard-codes an epoch (03) and loss (6.0754) that
# presumably come from one particular run; a new run will most likely write
# different names, so confirm or update this path before executing.
model = load_model('washington-weight-03-6.0754.hdf5')

In [None]:
# Print 100 generated words from a random seed using the reloaded model.
predict_words(100, words, model, int_to_word, word_to_int,seq_length)