In [2]:
import os
from pickle import dump, load

import matplotlib.pyplot as plt
import numpy as np
from tensorflow import keras
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.layers import LSTM, SimpleRNN
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model

Using TensorFlow backend.


## Task 1

In [3]:
def get_train_data(textfile, window_size, stride, output_file='train_data.txt'):
    """Cut a text file into overlapping character windows and save them.

    The text is first collapsed to a single line: every run of whitespace
    (including newlines) becomes one space. It is then sliced into windows of
    ``window_size + 1`` characters, i.e. ``window_size`` input characters
    followed by the one character to predict. The windows are written to
    ``output_file``, one per line, without a trailing newline.

    Args:
        textfile: Path of the raw text file to read.
        window_size: Number of input characters in each window (>= 1).
        stride: Number of characters to advance between windows (>= 1).
        output_file: Path the windows are written to. Defaults to
            'train_data.txt', the file the rest of the notebook reads.

    Raises:
        ValueError: If ``window_size`` or ``stride`` is smaller than 1.

    Prints the number of windows produced; returns nothing.
    """
    if window_size < 1 or stride < 1:
        raise ValueError('window_size and stride must both be >= 1')

    # Context managers guarantee the handles are closed even if reading or
    # writing fails part-way.
    with open(textfile, 'r') as source:
        text = source.read()

    # Drop the newline characters so that we have one long sequence of
    # characters separated only by single spaces.
    data = ' '.join(text.split())

    # `end` is the index of the target character; the slice therefore holds
    # the window_size characters before it plus the target itself.
    sequences = [data[end - window_size: end + 1]
                 for end in range(window_size, len(data), stride)]
    print('Total Sequences: %d' % len(sequences))

    # Save the windows, one per line.
    with open(output_file, 'w') as target:
        target.write('\n'.join(sequences))

In [3]:
# Build train_data.txt from beatles.txt: each window holds 10 input characters
# plus the character to predict, and consecutive windows start 1 character apart.
# NOTE(review): the recorded outputs disagree - this cell reports 166732
# sequences while the preprocessing cell reports 33347 rows, which would match
# stride=5. Confirm the intended stride and re-run the notebook top to bottom.
get_train_data(textfile='beatles.txt', window_size=10, stride=1)

Total Sequences: 166732


## Task 2

In [27]:
def preprocess_data(train_textfile):
    """Turn a file of character windows into one-hot encoded training arrays.

    Each non-empty line of ``train_textfile`` is one window: all characters
    but the last are the model input, the last character is the target.
    Characters are mapped to integers (sorted order of the characters found
    in the file), the mapping is pickled to 'mapping.pkl', and inputs and
    targets are one-hot encoded.

    Args:
        train_textfile: Path of the window file (one window per line).

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape
        (n_windows, window_size, vocab_size) and ``y`` has shape
        (n_windows, vocab_size).

    Raises:
        ValueError: If the file contains no windows.
        KeyError: Not expected; every character of every line is in the mapping.

    Prints the vocabulary size and the shapes of ``X`` and ``y``.
    """
    with open(train_textfile, 'r') as source:
        text = source.read()

    # One window per line. Blank lines (e.g. caused by a trailing newline)
    # are skipped; they would otherwise make the encoded array ragged.
    lines = [line for line in text.split('\n') if line]
    if not lines:
        raise ValueError('no sequences found in %s' % train_textfile)

    # Vocabulary = every distinct character in the file. Because it is built
    # from the raw text, the '\n' line separator is part of it whenever the
    # file has more than one line. This is kept unchanged so that indices stay
    # compatible with mappings produced by earlier runs.
    chars = sorted(set(text))
    char_to_i_mapping = {c: i for i, c in enumerate(chars)}
    # Persist the mapping; the context manager flushes and closes the file.
    with open('mapping.pkl', 'wb') as mapping_file:
        dump(char_to_i_mapping, mapping_file)
    vocab_size = len(char_to_i_mapping)
    print('Vocabulary size: %d' % vocab_size)

    # Integer-encode every window, then split into inputs and target.
    sequences = np.array([[char_to_i_mapping[char] for char in line]
                          for line in lines])
    X = sequences[:, :-1]
    y = sequences[:, -1]

    # One-hot encode: each character becomes a vector of length vocab_size
    # with a 1 at the character's index and 0s elsewhere.
    X = np.array([to_categorical(x, num_classes=vocab_size) for x in X])
    y = to_categorical(y, num_classes=vocab_size)
    print('X shape: %s and y shape: %s' %(X.shape, y.shape))

    return (X, y)

In [28]:
# One-hot encode the saved windows: X holds the input characters of every
# window, y the character that follows each of them.
X, y = preprocess_data(train_textfile='train_data.txt')

Vocabulary size: 48
X shape: (33347, 10, 48) and y shape: (33347, 48)


## Task 3

In [29]:
def predict_characters(model, mapping, window_size, init_chars, n_chars):
    """Generate text by repeatedly predicting the next character.

    Starting from ``init_chars``, the last ``window_size`` characters of the
    text generated so far are one-hot encoded and fed to ``model``; the most
    probable character is appended, and the process repeats.

    Args:
        model: Trained Keras model whose ``predict`` returns one score per
            vocabulary entry for each input window.
        mapping: Dict mapping each character to its integer index.
        window_size: Number of characters the model takes as input.
        init_chars: Seed text. If shorter than ``window_size`` it is
            left-padded with index 0 by ``pad_sequences``.
        n_chars: Number of characters to generate.

    Returns:
        ``init_chars`` followed by the generated characters.

    Raises:
        KeyError: If the text contains a character missing from ``mapping``.
    """
    # Invert the mapping once instead of scanning it on every step.
    index_to_char = {index: char for char, index in mapping.items()}
    num_classes = len(mapping)

    text = init_chars
    for _ in range(n_chars):
        # integer encode the characters
        encoded = [mapping[chara] for chara in text]
        # keep only the last window_size characters (truncate from the front)
        encoded = pad_sequences([encoded], maxlen=window_size, truncating='pre')
        # one hot encode
        encoded = to_categorical(encoded, num_classes=num_classes)
        # Sequential.predict_classes no longer exists in current TensorFlow/
        # Keras; taking the argmax of predict() is the equivalent that works
        # on both old and new versions.
        scores = model.predict(encoded, verbose=0)
        pred_index = int(np.argmax(scores, axis=-1)[0])
        # An index without a character contributes nothing, as before.
        text += index_to_char.get(pred_index, '')
    return text

## Task 4

In [41]:
class predict_during_training(keras.callbacks.Callback):
    """Keras callback that prints sample generations while training.

    At the end of every epoch for which ``epoch % 5 == 0`` it extends each
    seed sequence by 10 predicted characters and prints the result, followed
    by a blank line.
    """

    def __init__(self, model, sequences):
        """Store the model and the seed sequences.

        Args:
            model: The model being trained; used to generate the samples.
            sequences: Seed strings. The length of the first one is used as
                the window size for all of them.
        """
        # Let the base class set up its own state before adding ours.
        super().__init__()
        self.model = model
        self.sequences = sequences

    def on_epoch_end(self, epoch, logs=None):
        """Print generated text for every seed on every fifth epoch."""
        # Nothing to do on other epochs or without seeds, so skip the file
        # read entirely.
        if epoch % 5 != 0 or not self.sequences:
            return

        # Character -> index mapping written by the preprocessing step.
        with open('mapping.pkl', 'rb') as mapping_file:
            mapping = load(mapping_file)
        window_size = len(self.sequences[0])

        for seed_text in self.sequences:
            text = predict_characters(self.model, mapping, window_size=window_size,
                                      init_chars=seed_text, n_chars=10)
            print(text)
        print()

def train_model(model, X, y, n_epochs, model_name):
    """Compile, train, save and plot a character-level model.

    Three windows are drawn at random from 'train_data.txt' and handed to the
    ``predict_during_training`` callback so that sample text is printed while
    training. After training, the model is saved to 'models/<model_name>.h5',
    the loss history is pickled to 'train_history/<model_name>.pkl', and the
    loss curve is plotted via ``epoch_loss_plot``.

    Args:
        model: Uncompiled Keras model.
        X: One-hot encoded inputs.
        y: One-hot encoded targets.
        n_epochs: Number of epochs to train for.
        model_name: Base name used for the saved model, history and plot.

    Returns:
        The Keras ``History`` object returned by ``model.fit``.

    Raises:
        ValueError: If 'train_data.txt' contains no windows.

    NOTE(review): the seed windows come from 'train_data.txt', not from ``X``;
    this assumes the file was generated with the same window size as ``X``.
    """
    # Read the training windows, one per line, ignoring blank lines.
    with open('train_data.txt', 'r') as source:
        lines = [line for line in source.read().split('\n') if line]
    if not lines:
        raise ValueError('train_data.txt contains no sequences')

    # Pick 3 random windows to use as generation seeds during training. Every
    # line is a complete window, so any line index is valid. Dropping the last
    # character leaves just the input part of the window.
    random_indexes = np.random.randint(low=0, high=len(lines), size=3)
    random_sequences = [lines[index][:-1] for index in random_indexes]

    # Make sure the output directories exist BEFORE the (long) training run,
    # so a missing folder cannot cost us the trained model at save time.
    model_path = 'models/%s.h5' %(model_name)
    history_path = 'train_history/%s.pkl' %(model_name)
    for path in (model_path, history_path):
        os.makedirs(os.path.dirname(path), exist_ok=True)

    # compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    # fit model
    history = model.fit(X, y, epochs=n_epochs, verbose=1,
                        callbacks=[predict_during_training(model, random_sequences)])
    # save model for later use
    model.save(model_path)
    # save model history
    with open(history_path, 'wb') as file:
        dump(history.history, file)

    # Plot training loss values vs epochs
    epoch_loss_plot(history.history, n_epochs, model_name)

    return history

In [37]:
def epoch_loss_plot(history_dict, n_epochs, model_name):
    """Plot the training loss per epoch, save it as a PNG and show it.

    The curve is drawn on the current matplotlib axes and saved to
    'plots/epoch_loss_<model_name>.png'.

    Args:
        history_dict: Training history; its 'loss' entry holds one value per
            epoch.
        n_epochs: Number of training epochs. Currently unused; kept so
            existing callers keep working.
        model_name: Name used in the output file name.
    """
    plot_path = 'plots/epoch_loss_%s.png' %(model_name)
    # savefig does not create missing folders, so make sure the target exists.
    os.makedirs(os.path.dirname(plot_path), exist_ok=True)

    # Draw on the current axes (same target plt.plot would use).
    ax = plt.gca()
    ax.plot(history_dict['loss'])
    ax.set_title('Loss vs. Epochs')
    ax.set_ylabel('Loss')
    ax.set_xlabel('Epoch')
    ax.legend(['Train'], loc='upper right')
    ax.figure.savefig(plot_path)
    plt.show()

## Task 6

In [51]:
# Load the character -> index mapping; the context manager closes the file.
with open('mapping.pkl', 'rb') as mapping_file:
    mapping = load(mapping_file)
# load model
model = load_model('models/rnn_75_4_2.h5')
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

# generate 20 characters from a 4-character seed
print(predict_characters(model, mapping, window_size=4, init_chars='good', n_chars=20))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_1 (SimpleRNN)     (None, 75)                9300      
_________________________________________________________________
dense_1 (Dense)              (None, 48)                3648      
Total params: 12,948
Trainable params: 12,948
Non-trainable params: 0
_________________________________________________________________
None
good like you see it be 


In [50]:
# load model (uses `mapping` loaded in an earlier cell)
model = load_model('models/rnn_100_4_2.h5')
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

# generate 20 characters from a 4-character seed
print(predict_characters(model, mapping, window_size=4, init_chars='good', n_chars=20))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_1 (SimpleRNN)     (None, 100)               14900     
_________________________________________________________________
dense_1 (Dense)              (None, 48)                4848      
Total params: 19,748
Trainable params: 19,748
Non-trainable params: 0
_________________________________________________________________
None
good do you say the way 


In [55]:
# load model (uses `mapping` loaded in an earlier cell)
model = load_model('models/rnn_75_10_5.h5')
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

# generate 20 characters from a 10-character seed
print(predict_characters(model, mapping, window_size=10, init_chars='sergeant p', n_chars=20))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_1 (SimpleRNN)     (None, 75)                9300      
_________________________________________________________________
dense_1 (Dense)              (None, 48)                3648      
Total params: 12,948
Trainable params: 12,948
Non-trainable params: 0
_________________________________________________________________
None
sergeant pe mo look don't be l


In [54]:
# load model (uses `mapping` loaded in an earlier cell)
model = load_model('models/rnn_100_10_5.h5')
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

# generate 20 characters from a 10-character seed
print(predict_characters(model, mapping, window_size=10, init_chars='sergeant p', n_chars=20))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_1 (SimpleRNN)     (None, 100)               14900     
_________________________________________________________________
dense_1 (Dense)              (None, 48)                4848      
Total params: 19,748
Trainable params: 19,748
Non-trainable params: 0
_________________________________________________________________
None
sergeant pepper’s lonely heart


In [56]:
# load the stacked (two-layer) SimpleRNN model (uses `mapping` loaded in an earlier cell)
model = load_model('models/rnn_multi_100_10_5.h5')
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

# generate 20 characters from a 10-character seed
print(predict_characters(model, mapping, window_size=10, init_chars='sergeant p', n_chars=20))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_1 (SimpleRNN)     (None, None, 100)         14900     
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 100)         0         
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 100)               20100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 48)                4848      
Total params: 39,848
Trainable params: 39,848
Non-trainable params: 0
_________________________________________________________________
None
sergeant please me down the lo


In [57]:
# load model (uses `mapping` loaded in an earlier cell)
model = load_model('models/lstm_75_4_2.h5')
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

# generate 20 characters from a 4-character seed
print(predict_characters(model, mapping, window_size=4, init_chars='good', n_chars=20))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 75)                37200     
_________________________________________________________________
dense_1 (Dense)              (None, 48)                3648      
Total params: 40,848
Trainable params: 40,848
Non-trainable params: 0
_________________________________________________________________
None
good day i love you know


In [58]:
# load model (uses `mapping` loaded in an earlier cell)
model = load_model('models/lstm_100_4_2.h5')
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

# generate 20 characters from a 4-character seed
print(predict_characters(model, mapping, window_size=4, init_chars='good', n_chars=20))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 100)               59600     
_________________________________________________________________
dense_1 (Dense)              (None, 48)                4848      
Total params: 64,448
Trainable params: 64,448
Non-trainable params: 0
_________________________________________________________________
None
good love you know the s


In [60]:
# load model (uses `mapping` loaded in an earlier cell)
model = load_model('models/lstm_75_10_5.h5')
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

# generate 20 characters from a 10-character seed
print(predict_characters(model, mapping, window_size=10, init_chars='sergeant p', n_chars=20))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 75)                37200     
_________________________________________________________________
dense_1 (Dense)              (None, 48)                3648      
Total params: 40,848
Trainable params: 40,848
Non-trainable params: 0
_________________________________________________________________
None
sergeant peppare the rouly fru


In [61]:
# load model (uses `mapping` loaded in an earlier cell)
model = load_model('models/lstm_100_10_5.h5')
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

# generate 20 characters from a 10-character seed
print(predict_characters(model, mapping, window_size=10, init_chars='sergeant p', n_chars=20))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 100)               59600     
_________________________________________________________________
dense_1 (Dense)              (None, 48)                4848      
Total params: 64,448
Trainable params: 64,448
Non-trainable params: 0
_________________________________________________________________
None
sergeant pepper’s lonely heart


In [62]:
# load the stacked (two-layer) LSTM model (uses `mapping` loaded in an earlier cell)
model = load_model('models/lstm_multi_100_10_5.h5')
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

# generate 20 characters from a 10-character seed
print(predict_characters(model, mapping, window_size=10, init_chars='sergeant p', n_chars=20))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 10, 100)           59600     
_________________________________________________________________
dropout_1 (Dropout)          (None, 10, 100)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 48)                4848      
Total params: 144,848
Trainable params: 144,848
Non-trainable params: 0
_________________________________________________________________
None
sergeant pepper’s lonely wanna
