# Imports

In [3]:
import numpy as np
import re
import h5py
from collections import Counter
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.models import load_model
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import random
from keras import optimizers
import pickle
import sys

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def preproc(text):
    'preprocessing of large text string'
    text = re.sub(' Mr.', ' Mr', text)
    text = re.sub(' Mrs.', ' Mrs', text)
    text = re.sub(' Messrs.', ' Messrs', text)
    text = text.lower()
    text = re.sub('\[(.*?)\]', '', text)
    text = re.sub('\n\n', ' ', text)
    text = re.sub('\n', ' ', text)
    text = re.sub(',', '', text)
    text = re.sub(' {2,}', '', text)
    text = re.sub('_', '', text)
    text = re.sub(';', '', text)
    text = re.sub('_figure_', '', text)
    text = re.sub('\d+','',text)
    return text

def get_dicts(rawwords):
        # get word counts and dictionaries and make a category for rare words
        word_counts = Counter(word for word in rawwords)
        words = [ word if word_counts[word]>5 else 'RARE' for word in rawwords ]
        unique_words = sorted(list(set(words)))
        word_to_int = dict((c, i) for i, c in enumerate(unique_words))
        int_to_word = dict((i, c) for i, c in enumerate(unique_words))
        n_words = len(words)
        n_uwords = len(unique_words)
        print('Total Words (without rare words): ', n_words)
        print('Unique Words (without rare words): ', n_uwords)
        return words,n_words,n_uwords,word_to_int,int_to_word

def load_data(name):
        with open (name, 'rb') as fp:
            text = pickle.load(fp)
        print('using saved data... (',len(text),')')
        return text
    
# get all words in target text file, plus all files in a directory for optional pre-training
def get_data(targetname,pretrains = None):
    'load text from file, optional pretraining'
    rawwords=[]
    text = open(targetname).read() # now do the same for target file
    text = preproc(text)
    wds = re.findall(r"[\w']+|[.,!?;]", text)
    print('Words in target set: ', len(wds))
    for wd in wds:
        rawwords.append(wd)     
    with open('text_main.pkl', 'wb') as fp:
        pickle.dump(rawwords, fp)
    
    if pretrains: # optional
        filename = glob.glob(pretrains) # get directory and file ending for pretraining files
        count=0
        for i in filename:
            text = open(i).read()
            text = preproc(text) # preprocess file text
            wds = re.findall(r"[\w']+|[.,!?;]", text) # split text into individual words
            for wd in wds:
                rawwords.append(wd) # append individual words to our raw training array
            count +=1
            print('loading file...', count+1, '/',len(filename),' :: ',i)
        print('Words in pretraining set: ', len(rawwords))
        with open('text_pretrain.pkl', 'wb') as fp:
            pickle.dump(rawwords, fp)
    return rawwords
    
def predict_words(npred,div):
        start = np.random.randint(0, len(words)-seq_length) # pick a random seed
        pattern = words[start:start+seq_length] # get a full sequence
        print('Seed: "' + ' '.join(pattern) + '"')
        for i in range(npred):
                x_pred = np.zeros((1, seq_length, len(sorted(list(set(words))))))
                for t, wd in enumerate(pattern):
                        x_pred[0, t, word_to_int[wd]] = 1.
                preds = model.predict(x_pred, verbose=0)[0]
                preds = np.asarray(preds).astype('float64')
                preds = np.log(preds)/div
                exp_preds = np.exp(preds)
                preds = exp_preds / np.sum(exp_preds)
                res = int_to_word[np.argmax(np.random.multinomial(1, preds, 1))]
                while res=='RARE': # or res==pattern[-1]: # if model predicts rare, sample again until it finds a more frequent word
                        res = int_to_word[np.argmax(np.random.multinomial(1, preds,1))]
                pattern.append(res)
                pattern = pattern[1:]
                sys.stdout.write(res+' ')
                sys.stdout.flush()


In [4]:
# get text & words
raw_pretrain = get_data('gwtext.txt')
rawwords = load_data('text_main.pkl')
words,n_words,n_uwords,word_to_int,int_to_word = get_dicts(rawwords)

batch_size = 100 # how many sequences to train concurrently per weight update
seq_length = 30 # number of words per training sequence

n_examples = len(words)-seq_length # total number of available example sequences
n_batches = n_examples/batch_size # how many batches from full set of examples

model = load_model('modelE4.h5')

Words in target set:  228311
using saved data... ( 228311 )
Total Words (without rare words):  228311
Unique Words (without rare words):  2897


In [5]:
predict_words(100,1.0)

Seed: "to fear but for the general service and no hopes but the advantages it will RARE from the success of our operations therefore cannot be supposed to have any private"
have cannot out of all exclusive money more pleasure by least other attempting up captain ship and the madam of matter seek my could a court of drafts . little esteem done this is that an acquainted seven the assistance were a go between this wishes that it been though it . last put that mr blank flat down to town that to be orders of an considerable opinion hand to them to general a cause way of the many . i shall myself to bring want and building that they from it however my getting how sent if i 