In [None]:
# Chad Lohrli A14218619 | COGS 181 Final Project

# Description: This code allows users to build, train, and test 3 different LSTM Recurrent Neural Networks
# as well as generate sample texts from the learned models

# Disclaimer:
# This code is inspired by Andrej Karpathy's Char-RNN https://gist.github.com/karpathy/d4dee566867f8291f086
# and fchollet's implementation in Keras https://github.com/fchollet/keras/blob/master/examples/lstm_text_generation.py

In [7]:
#Libraries
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import numpy as np
import random
import sys
import matplotlib.pyplot as plt
%matplotlib inline


def parseText(text,maxlen,step):
    """Read a corpus file and one-hot encode it for next-character prediction.

    The corpus is cut into overlapping windows of ``maxlen`` characters, one
    window every ``step`` characters, and each window is paired with the single
    character that follows it.

    Args:
        text: Path of the text file to read.
        maxlen: Number of characters in each input window.
        step: Offset between the starts of consecutive windows.

    Returns:
        A 7-tuple ``(corpus, vocab_size, corpus_length, char2idx, idx2char, x, y)``:
        the file contents, the number of distinct characters, the number of
        characters read, the char->index and index->char dicts, a boolean array
        ``x`` of shape ``(n_windows, maxlen, vocab_size)`` and a boolean array
        ``y`` of shape ``(n_windows, vocab_size)``.
    """
    # Context manager so the file handle is released even if read() fails.
    with open(text) as corpusFile:
        text = corpusFile.read()
    print('corpus length:', len(text))

    chars = sorted(set(text))
    print('total chars:', len(chars))
    char2idx = dict((c, i) for i, c in enumerate(chars))
    idx2char = dict((i, c) for i, c in enumerate(chars))

    # cut the text in semi-redundant sequences of maxlen characters
    sentences = []
    next_chars = []
    for i in range(0, len(text) - maxlen, step):
        sentences.append(text[i: i + maxlen])
        next_chars.append(text[i + maxlen])
    print('nb sequences:', len(sentences))

    print('Vectorization...')
    # Builtin bool instead of np.bool: that alias was removed in NumPy 1.24.
    x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
    y = np.zeros((len(sentences), len(chars)), dtype=bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            x[i, t, char2idx[char]] = True
        y[i, char2idx[next_chars[i]]] = True

    return text,len(chars),len(text),char2idx,idx2char,x,y


def buildModel(choice,vocab,text,weightFile='',dropout=False,seqLen=None):
    """Build and compile one of the three stacked-LSTM character models.

    Architectures (each followed by ``Dense(vocab)`` and a softmax activation):
        1 (Model A): one LSTM layer of 256 units.
        2 (Model B): two LSTM layers of 256 units, with an optional
            ``Dropout(0.2)`` after the first one.
        3 (Model C): three LSTM layers of 512 units.

    Args:
        choice: Architecture index, 1, 2 or 3.
        vocab: Number of distinct characters (input feature size and output size).
        text: Not used by this function; kept so existing calls keep working.
        weightFile: Path of a weights file to load before compiling; an empty
            value skips loading.
        dropout: Only honoured by Model B; ignored for the other two.
        seqLen: Length of the input sequences. When omitted, the notebook-level
            ``maxlen`` is used, which is what this function always read before.

    Returns:
        The compiled model, or ``None`` (after printing a hint) when ``choice``
        is not 1, 2 or 3.
    """
    print('Build model')

    # LSTM layer widths, first layer to last, for each architecture.
    lstmUnits = {1: [256], 2: [256, 256], 3: [512, 512, 512]}

    # Validate up front so an invalid choice does not build a throwaway model.
    if choice not in lstmUnits:
        print("please choose index from 1,2,3")
        return None

    if seqLen is None:
        seqLen = maxlen  # fall back to the notebook-level setting

    model = Sequential()
    widths = lstmUnits[choice]
    lastLayer = len(widths) - 1
    for depth, units in enumerate(widths):
        layerArgs = {}
        if depth == 0:
            layerArgs['input_shape'] = (seqLen, vocab)
        if depth < lastLayer:
            # every LSTM except the last feeds a full sequence to the next LSTM
            layerArgs['return_sequences'] = True
        model.add(LSTM(units, **layerArgs))

        # Dropout exists only in Model B, between its two LSTM layers.
        if choice == 2 and dropout and depth == 0:
            model.add(Dropout(0.2))

    model.add(Dense(vocab))
    model.add(Activation('softmax'))

    if weightFile:
        model.load_weights(weightFile)

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model
 
def trainModel(choice,x,y,model,text,batch_size,epochs):
    """Fit ``model`` on ``(x, y)`` and checkpoint whenever the training loss improves.

    Args:
        choice: Architecture index (1, 2 or 3); only used to label checkpoint files.
        x: Training inputs handed to ``model.fit``.
        y: Training targets handed to ``model.fit``.
        model: Object exposing ``fit``; the compiled model to train.
        text: String used as the prefix of checkpoint file names.
        batch_size: Batch size forwarded to ``model.fit``.
        epochs: Number of epochs forwarded to ``model.fit``.

    Returns:
        Whatever ``model.fit`` returns.
    """
    # Label for the checkpoint name; stays empty when choice is not 1, 2 or 3.
    labels = {1: "Model_A", 2: "Model_B", 3: "Model_C"}
    modelName = labels.get(choice, "")

    # The {epoch}/{loss} placeholders are left for the checkpoint callback to fill in.
    saveFile = text + "-" + modelName + "-{epoch:02d}-{loss:.4f}.hdf5"
    bestLossSaver = ModelCheckpoint(saveFile, monitor='loss', verbose=1, save_best_only=True, mode='min')

    return model.fit(x, y, batch_size=batch_size, epochs=epochs, callbacks=[bestLossSaver])

def plotLoss(history):
    """Plot the per-epoch loss curves recorded in a training history.

    The training loss is always drawn. A validation curve is added only when
    the history contains a ``'val_loss'`` entry, so the legend lists exactly
    the curves that are on the figure (the previous version always listed
    'train' and 'test' while drawing a single curve).

    Args:
        history: Object whose ``history`` attribute is a dict mapping metric
            names to per-epoch value lists; must contain ``'loss'``.
    """
    print(history.history.keys())

    legendLabels = ['train']
    plt.plot(history.history['loss'])
    if 'val_loss' in history.history:
        plt.plot(history.history['val_loss'])
        legendLabels.append('test')

    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(legendLabels, loc='upper left')
    plt.show()

def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalised, then a single index is drawn from the result.

    Args:
        preds: Sequence of probabilities, one per class.
        temperature: Scaling factor. Values below 1 sharpen the distribution,
            values above 1 flatten it, and exactly 0 returns the most likely
            index without any random draw.

    Returns:
        The sampled index (the result of ``np.argmax``).
    """
    preds = np.asarray(preds).astype('float64')

    # A zero temperature is the greedy limit; dividing by it would yield NaNs.
    if temperature == 0:
        return np.argmax(preds)

    # log(0) = -inf is intended for zero-probability classes, so silence the warning.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Shift by the maximum so the largest term is exp(0) = 1. Without this a
    # small temperature underflows every term to 0 and normalising gives 0/0.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)

    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


def generateText(dataset,corpus,vocab,maxlen,sampleLength,char2idx,idx2char,temp,net=None):
    """Print text generated one character at a time from a random seed window.

    A window of ``maxlen`` characters is picked at a random position in
    ``dataset`` and printed as the seed. Then, ``sampleLength`` times, the
    window is one-hot encoded, the network predicts the next-character
    distribution, a character is drawn with ``sample`` and written to stdout,
    and the window slides forward by one character.

    Args:
        dataset: The corpus string to take the seed from.
        corpus: Length of the corpus; bounds the random seed position.
        vocab: Number of distinct characters (size of the one-hot axis).
        maxlen: Number of characters in the input window.
        sampleLength: Number of characters to generate.
        char2idx: Dict mapping characters to one-hot indices.
        idx2char: Dict mapping one-hot indices back to characters.
        temp: Temperature forwarded to ``sample``.
        net: Object exposing ``predict(x, verbose=0)``. When omitted, the
            notebook-level ``model`` is used, which is what this function
            always read before.

    Returns:
        None; the seed and the generated characters are written to stdout.
    """
    if net is None:
        net = model  # fall back to the notebook-level model

    # random seed window
    start_index = random.randint(0, corpus - maxlen - 1)
    sentence = dataset[start_index: start_index + maxlen]

    print("--Seed--")
    print('"' + sentence + '"')
    print()

    for _ in range(sampleLength):
        # one-hot encode the current window as a batch of one
        x_pred = np.zeros((1, maxlen, vocab))
        for t, char in enumerate(sentence):
            x_pred[0, t, char2idx[char]] = 1.

        preds = net.predict(x_pred, verbose=0)[0]
        next_char = idx2char[sample(preds, temp)]

        # slide the window: drop the oldest character, append the new one
        sentence = sentence[1:] + next_char

        sys.stdout.write(next_char)
        sys.stdout.flush()
    print()

# Run configuration: notebook-level names used by the cells below
text = "shuffled_tweets_10k.txt" # path of the training corpus passed to parseText
maxlen = 95 # number of characters in each input sequence
step = 3 # offset (in characters) between the starts of consecutive sequences
choice = 3 # architecture to build: 1 = Model A, 2 = Model B, 3 = Model C
# Weights file loaded by buildModel; an empty string skips loading.
# NOTE(review): this name starts with "shuffled_tweets-10k" (hyphen) while `text`
# is "shuffled_tweets_10k" (underscore), so it does not match the checkpoint names
# trainModel derives from `text` -- confirm the file exists under this name.
weightFile = "shuffled_tweets-10k.txt-Model_C-11-0.7177.hdf5"

In [2]:
# Parse the corpus: build the character mappings and the one-hot training arrays.
# dataset = file contents | vocab = len(chars) | corpus = len(text)

dataset,vocab,corpus,char2idx,idx2char,x,y = parseText(text,maxlen,step)

corpus length: 1035027
total chars: 176
nb sequences: 344978
Vectorization...


In [8]:
# Build the model (batch_size and epochs are consumed by the training cell)

batch_size = 256
epochs = 10

model = buildModel(choice,vocab,text,weightFile)

Build model


In [None]:
# Train the model; checkpoints are written by the ModelCheckpoint callback set up in trainModel
history = trainModel(choice,x,y,model,text,batch_size,epochs)

In [2]:
# Generate text from a random seed taken from the corpus

sampleLength = 400 # number of characters to generate after the seed
temp = 0.4 # sampling temperature passed to sample(); lower values make the draw more deterministic

generateText(dataset,corpus,vocab,maxlen,sampleLength,char2idx,idx2char,temp)


