In [1]:
####
# Original network adapted from karpathy
# minesh.mathew@gmail.com
# modified version of text generation example in keras;
# trained in a many-to-many fashion using a time distributed dense layer

####
from __future__ import print_function

import gc
from matplotlib import pyplot as plt
from matplotlib import pyplot
import matplotlib as mptl
import pylab
from collections import defaultdict
import keras
from keras.models import load_model
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
# from keras.layers import LSTM, TimeDistributedDense, SimpleRNN  #DEPRECATED TimeDistributedDense
from keras.layers import LSTM, TimeDistributed, SimpleRNN, Embedding, GRU, Bidirectional, CuDNNLSTM
from keras.layers.convolutional import Conv1D, MaxPooling1D
from sklearn.feature_extraction.text import CountVectorizer
from keras.callbacks import ModelCheckpoint
from keras.utils.data_utils import get_file
from keras.utils import to_categorical
import numpy as np
from time import sleep
import random
import sys
import os
import pickle
import gzip
from sklearn.metrics import confusion_matrix
import itertools
#import h5py

Using TensorFlow backend.


In [2]:
# ---------------------- GENERIC UTILITIES --------------------------
def plot_confusion_matrix(cm, class_labels, title='Confusion matrix',
                          filename = 'Confusion_Matrix.png', cmap=plt.cm.Blues):
    """
    Plot a confusion matrix
    :param cm: a confusion matrix generated by sklearn.metrics.confusion_matrix
    :param class_labels: set of text labels
    :param title: title of figure
    :param cmap: color map for the confusion matrix
    :return:
    """
    class_count = len(class_labels)
    fig = plt.figure(title)
    fig.set_size_inches(10, 8)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(class_count+1)
    plt.xticks(tick_marks, class_labels, rotation=45)
    plt.yticks(tick_marks, class_labels)
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j], horizontalalignment="center", color="white" if cm[i, j] > thresh else "black")
    #plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig(filename, dpi=100)
    np.set_printoptions(precision=2)
    plt.show(block=False)

In [3]:
#-------------------- TEXT FILE PREPROCESSING -----------------

def preprocess_text_file(filename, classID, maxlen=40 ):
    """load a file and split the text it contains into sequences of length = maxlen
    returns text, chars, char_indices, indices_char, sentences, next_chars
    text: the raw text (turned into lowercase)
    chars: a list of unique characters in the text
    char_indices: a dictionary of the character-to-index conversion
    indices_char: a dictionary of the index-to-character conversion
    sequences: a list of the sequences of max length extracted from the file (stride specified by the step variable below)
    next_chars: a list of the corresponding sequences of max_length next-characters following each of the sequence character members
      in other words, each member of next_chars is contains the last maxlen-1 chars of the correspondiong sequence ...
      and the next character from the text after the last character in that sequence """
# #     print('loading: ', filename)
    text = open(filename).read().lower()
    print('corpus length:', len(text))
    chars = sorted(list(set(text)))  #returns unique characters from the text
#     print('total chars:', len(chars))
    char_indices = dict((c, i) for i, c in enumerate(chars))
#     print('chars_indices:', char_indices)
    indices_char = dict((i, c) for i, c in enumerate(chars))
    step = 1
    length = 400
    sequences = []
    classIDs = []
    
    #first generate sentences of characters
    for i in range(0, len(text) - length + 1, length):
        sequences.append(text[i: i + length])  # input seq is from i to i  + maxlen

    # Get the train, val, and test sets
    
    smoltrainX = []
    smolvalX = []
    smoltestX = []
    
    bigTrainX = []
    bigValX = []
    bigTestX = []
     
    trainY = []
    valY = []
    testY = []
    
    # Parse the sentences into the training, val, and test sets
    for i in range(len(sequences)):
        smoltrainX.append( sequences[i][ 0 : 159 ] )
        smolvalX.append( sequences[i][ 160 : 319 ] )
        smoltestX.append( sequences[i][ 320 : ] )
    
    
    # Create the 40 char length for each set
    for sentence in smoltrainX:
        for i in range( 0, len(sentence ) - maxlen + 1, step):
            bigTrainX.append( sentence[ i : i + maxlen ] )
            trainY.append( classID )
         
    for sentence in smolvalX:
        for i in range( 0, len(sentence ) - maxlen + 1, step):
            bigValX.append( sentence[ i : i + maxlen ] )
            valY.append( classID )
        
    for sentence in smoltestX:
        for i in range( 0, len(sentence ) - maxlen + 1, step):
            bigTestX.append( sentence[ i : i + maxlen ] )
            testY.append( classID )
    
    return text, chars, char_indices, indices_char, sequences, bigTrainX, trainY, bigValX, valY, bigTestX, testY



def save_processed_text(filename, text, chars, char_indices, indices_char, sentences, next_chars ):
    """helper method for saving processed text in a compressed file so that you dont have to preprocess it each time"""
    my_text = [text, chars, char_indices, indices_char, sentences, next_chars] #dont save the one-hot vectors
    with gzip.open(filename, 'wb') as f:  # Python 3: open(..., 'wb')
        pickle.dump(my_text, f)
    # np.savez_compressed(filename,
    #                     text = text,
    #                     chars = chars,
    #                     char_indices=char_indices,
    #                     indices_char=indices_char,
    #                     sentences=sentences,
    #                     next_chars=next_chars)
    #                     # allow_pickle=True, protocol=4)


def load_processed_text(filename):
    """helper method for saving processed text in a compressed file"""
    with gzip.open(filename, 'rb') as f:  # Python 3: open(..., 'rb')
        text, chars, char_indices, indices_char, sentences, next_chars = pickle.load(f)
    # loaded = np.load(filename)
    # text = loaded['text']
    # chars = loaded['chars']
    # char_indices= loaded['char_indices']
    # indices_char= loaded['indices_char']
    # sentences= loaded['sentences']
    # next_chars= loaded['next_chars']
    return text, chars, char_indices, indices_char, sentences, next_chars


def vectorize_text(chars, char_indices, sentences, label, maxlen=40):  #UNUSED: "next_chars"
    """Accepts a list of sentences to convert to indices.  Used characters, their corresponding indeces to produce a set of sequences
    of X and corresponding labels y"""
    # now generate dummy variables (1-hot vectors) for the sequences of characters
    print('Vectorization processing... this could take a while...')
    print(len(sentences))
    print(len(chars))
    X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
#     y = np.zeros((len(sentences), maxlen, len(chars)),
#                  dtype=np.bool)  # y is also a sequence , or  a seq of 1 hot vectors
    joblength = len(sentences)
    tenpercent = joblength/10
    nextpercent = tenpercent
    print(" part 1 of 2")
    for i, sentence in enumerate(sentences):
        if i>nextpercent:
            print(i, " of ", joblength, " completed")
            nextpercent += tenpercent
        for t, char in enumerate(sentence):
            X[i, t, char_indices[char]] = 1  # X has dimension [sentence_count, sentence_length, char_count]
    print(" part 2 of 2")
    nextpercent = tenpercent

#     y = to_categorical( label )
    
    print('vetorization completed')
    return X


def generate_text(model, char_indices, indices_char, seed_string="brutus:", generate_character_count=320):
    """Generates text using a model"""
    print("seed string --> ", seed_string)
    print('The generated text is: ')
    sys.stdout.write(seed_string),
    # x=np.zeros((1, len(seed_string), len(chars)))
    for i in range(generate_character_count):
        x = np.zeros((1, len(seed_string), len(chars)))
        for t, char in enumerate(seed_string):
            x[0, t, char_indices[char]] = 1.
        preds = model.predict(x, verbose=0)[0]
        # print (np.argmax(preds[7]))
        next_index = np.argmax(preds[len(seed_string) - 1])

        # next_index=np.argmax(preds[len(seed_string)-11])
        # print (preds.shape)
        # print (preds)
        # next_index = sample(preds, 1) #diversity is 1
        next_char = indices_char[next_index]
        seed_string = seed_string + next_char

        # print (seed_string)
        # print ('##############')
        # if i==40:
        #    print ('####')
        sys.stdout.write(next_char)
    sys.stdout.flush()

    

In [4]:
# ---------------- MODEL FILE I/O ---------------------------
def save_model(model, save_dir=os.path.join(os.getcwd(), 'saved_models'),
               model_file_name='keras_cifar10_trained_model.h5'):
    """
    Save model and current weights
    :param model: Keras model
    :param save_dir: path name to save directory
    :param model_file_name: filename for saved model
    :return: nothing
    """
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    model_path = os.path.join(save_dir, model_file_name)
    model.save(model_path)
    print('Saved trained model at %s ' % model_path)


def load_model(save_dir, model_file_name):
    # Load model and weights
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    model_path = os.path.join(save_dir, model_file_name)
    model = keras.models.load_model(model_path)
    print('Loaded trained model from %s ' % model_path)
    return model


# ---------------- MODEL ARCHITECTURE ---------------------------
def build_model(characters):
    # build the model: 2 stacked LSTM
    print('Build model...')
    model = Sequential()
    # model.add(LSTM(512, return_sequences=True, input_shape=(maxlen, len(chars))))  # original one
    #model.add(LSTM(512, input_dim=len(characters), return_sequences=True))  # minesh witout specifying the input_length
    model.add(LSTM(512, return_sequences=True, input_shape=(None,len(characters)) ))  # minesh witout specifying the input_length
    model.add(LSTM(512, return_sequences=True))  # - original
    model.add(Dropout(0.2))
    # model.add(TimeDistributedDense(len(chars)))   #Deprecated TimeDistributedDense
    model.add(TimeDistributed(Dense(len(characters))))  # BJB:  is this really working??
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    print('model is made')
    # train the model, output generated text after each iteration
    print(model.summary())
    return model


# PROBLEMS TO FIX...
# C:\Users\bborghetti\Anaconda3\lib\site-packages\ipykernel_launcher.py:34: 
#             UserWarning: The `input_dim` and `input_length` arguments in recurrent layers are deprecated. Use `input_shape` instead.
# C:\Users\bborghetti\Anaconda3\lib\site-packages\ipykernel_launcher.py:34: 
#             UserWarning: Update your `LSTM` call to the Keras 2 API: `LSTM(512, return_sequences=True, input_shape=(None, 45))`



# ---------------- MODEL TRAINING ---------------------------
# def train_net(model, x, y, training_iterations=6, maxlen=40, save_all_model_iterations=True):
#     chkRNN = ModelCheckpoint('best_modelRNN.h5', monitor='val_acc', save_best_only=True, mode='max', verbose=1)
#     for training_iteration in range(1, training_iterations+1):
#         print()
#         print('-' * 50)
#         print('Training Iteration (epoch) #:', training_iteration)
#         history = model.fit(x, 
#                             y, 
#                             batch_size=128, 
#                             epochs=1, 
#                             verbose=1)    #train 1 epoch at a time using previous weights
#         sleep(0.1)  # https://github.com/fchollet/keras/issues/2110

#         # saving models at the following iterations -- uncomment it if you want tos save weights and load it later
#         # if training_iteration==1 or training_iteration==3 or training_iteration==5 or training_iteration==10 or training_iteration==20 or training_iteration==30 or training_iteration==50 or training_iteration==60 :

#         # # save every training_iteration of weights
#         # model.save_weights('Karpathy_LSTM_weights_' + str(training_iteration) + '.h5', overwrite=True)
#         # start_index = random.randint(0, len(text) - maxlen - 1)
        
#         save_dir = os.path.join(os.getcwd(), 'saved_models')
#         current_model_file_name = 'LSTM_model_' + str(training_iteration) + '.h5'
#         if save_all_model_iterations:
#             save_model(model=model, save_dir=save_dir, model_file_name=current_model_file_name)
#         sys.stdout.flush()
#         print('Training:\n\tLoss: {}\n\tAccuracy: {}'.format( history.history['loss'][0], history.history['acc'][0]) )
#         print()
#         print('Validation:\n\tLoss: {}\n\tAccuracy: {}'.format( history.history['val_loss'][0], history.history['val_acc'][0]) )
#         print()
#     return model


# ---------------- MODEL EVALUATION ---------------------------
# def test_model(model, observations, targets):
#     '''
#     STUDENT SHOULD WRITE THIS CODE
#     :param model: a trained RNN model which accepts a sequence and outputs a target class (0;1;2;3)
#     :param observations: a list of 40-character sequences to classify
#     :param targets: a list of the true classes of the 40-character sequences
#     :return: a sklearn confusion matrix
#     '''
#     #< put student code here to test the model >
#     predicted_class_IDs = model.predict_classes( observations )
#     actual_class_IDs = targets

#     # generate & print confusion matrix to screen
#     cm = confusion_matrix(actual_class_IDs, predicted_class_IDs)
#     return cm

In [5]:
################################################################
#--------------------- Main Code -----------------------------


# pick the filename you want to use, and comment out the rest
# make sure you have this directory structure
# QUICK CHECK w/SMALL TEXT
# raw_text_filename='./textdatasets/tinytesttext.txt'
# CLASS DATASETS BELOW
# raw_text_filename='./textdatasets/0_bible.txt'
raw_text_filename='./textdatasets/1_nietzsche.txt'
# raw_text_filename='./textdatasets/2_shakespeare.txt'
# raw_text_filename='./textdatasets/3_warpeace.txt'



#raw_text_filename='./textdatasets/trumptweets.txt'

processed_filename = raw_text_filename+'.pklz'  # save process will append a .pklz on the filename

#set the boolean below to true to double check the save file is working
TEST_SAVE_LOAD_EQUAL = False
if TEST_SAVE_LOAD_EQUAL:
    text, chars, char_indices, indices_char, sentences, next_chars \
        = preprocess_text_file(raw_text_filename)
    save_processed_text(processed_filename, text, chars, char_indices, indices_char, sentences, next_chars )
    _text, _chars, _char_indices, _indices_char, _sentences, _next_chars = load_processed_text(processed_filename)
    print("Testing equivalences of preprocessed and  saved-preprocessed text (6 checks)")
    print("  text equal: ",np.array_equal(_text, text))
    print("  chars equal: ",np.array_equal(_chars, chars))
    print("  char_indices equal: ",np.array_equal(_char_indices, char_indices))
    print("  indices_chars equal: ",np.array_equal(_indices_char, indices_char))
    print("  sentences equal: ",np.array_equal(_sentences, sentences))
    print("  next_chars equal: ",np.array_equal(_next_chars, next_chars))
    print("Testing vectorization")
    X, y = vectorize_text(_chars, _char_indices, _sentences, _next_chars)
    print("Vectorization test complete")

    
PROCESS_RAW_TEXT = False  #set to True to process a previously unseen textfile - otherwise load a preprocessed file
if PROCESS_RAW_TEXT:
    text, chars, char_indices, indices_char, sentences, next_chars \
        = preprocess_text_file(raw_text_filename, classID=0 ) 
    save_processed_text(processed_filename, text, chars, char_indices, indices_char, sentences, next_chars)
else:  # instead, load previously processed text
    text, chars, char_indices, indices_char, sentences, next_chars = load_processed_text(processed_filename)


#vectorized form takes too much space to save... so process in real time
X, y = vectorize_text(chars, char_indices, sentences, next_chars)


TRAIN_MODE = False   #SET TO FALSE BEFORE SUBMITTING YOUR ASSIGNMENT!!!
if TRAIN_MODE:
    model = build_model(characters=chars)
    model_epoch_training_iterations = 10    #the bigger your text corpus, the smaller you can make this
    model = train_net(model=model, x=X, y=y,
                      training_iterations=model_epoch_training_iterations,
                      save_all_model_iterations=True)
else:  # load a model from a file
    # decide which iteration of the trained model you want to explore
    model_training_iteration = 8
    save_dir = os.path.join(os.getcwd(), 'saved_models')
    current_model_file_name = 'LSTM_model_' + str(model_training_iteration) + '.h5'
    model = load_model(save_dir=save_dir, model_file_name=current_model_file_name)

GENERATE_TEXT_MODE = False
if GENERATE_TEXT_MODE:   # generate text mode
    #decide which saved model to load
    #make up a string of characters to start with
    seed_string = "certainly"
    # decide how many text characters you want to generate:
    gen_char_count = 200
    generate_text(model, char_indices, indices_char, seed_string, generate_character_count=gen_char_count)

FileNotFoundError: [Errno 2] No such file or directory: './textdatasets/1_nietzsche.txt.pklz'

# Step 1

Student Architecture Design: Your goal in this step is to design and document your proposed model.

   a.Design the model:  Your model has a different ML task than Karpathy’s – you need to determine from which text a sequence of characters most likely came from.  Your input observation will be a string of text (40 characters) drawn from the training text.  Your output will be a class ID (0_bible.txt; 1_nietzsche.text; 2_shakespeare.txt; 3_warpeace.txt;).  You can provide a class ID for each character in the input sequence, but you must provide an overall class ID (0, 1, 2, or 3) for the whole input sequence.  You could use one-hot encoding here.

   b.Document your design:  Examine the layers of your model in the code, and develop a diagram (e.g. using Keras functions and/or Powerpoint) of the layers like you’ve seen in class or on the internet.  In text, describe the architecture and its parameterization.  Discuss: How many parameters are there in your whole model?  In each layer?  How did you decide on overall size/capacity? What other features are present in your model (like dropout) and why did you use them?  How does your model differ from the Karpathy-like model? 

   c.Implement your design in code 


In [7]:
def buildMyModel():
    
    model = Sequential()

    model.add(CuDNNLSTM( 10, input_shape=( None, len(chars))))
#     model.add( Bidirectional( CuDNNLSTM( 25, input_shape=( None, len(chars) ) ) ) )
    
    model.add(Dropout( 0.2 ) )
    
    # Because we are using one hot encoding to represent the class, we need to ouput 4 values, one for each class
    model.add( Dense( 4, activation='softmax' ) ) 
    
    # Use categorical cross entropy for a multiclass classification problem
    model.compile( loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return model

In [8]:
gallaher = buildMyModel()

NameError: name 'chars' is not defined

In [52]:
gallaher.summary()

ValueError: This model has not yet been built. Build the model first by calling build() or calling fit() with some data. Or specify input_shape or batch_input_shape in the first layer for automatic build. 

# 2.	Design customized code to generate training  / validation / test data:  
Modify the existing training code to match your assigned task (inputs are character sequences and outputs are class IDs).  You will need to figure out how to read and vectorize each of the four text files and use them all to train your network.  You will also need to split the data into train (40%) / validation (40%) / test (20%) samples.   Do this is a way such that

   •	The order of any characters in text is preserved, 
   
   •	That none of the characters in any n-character sequence are used in more than one partition of the train/val/test split 
   
   •	The text for each partition is interleaved so that all portions of the text are represented in each of the partitions (in other words, DON’T just cut the text by location in the file into beginning=training, middle = validation, end=test).  

   •	The same amount of data from each class for training / validation.  Note that the text files for each class are different sizes, so be careful how you implement this.  You will need to use less than the full amount of text from each file.

Assuming that the sequence length n=40, one way to do this split might be to take each chunk the text file into a sequence of 400-character segments (ignoring the final segment of less than 400 characters in the file) and then partition these segments into train, val, and test such that the first 160 chars are used for training, the second 160 chars are used for validation, and the last 80 chars are used for test.  Then, within each group, use vectorize_text to parse the text into sub-strings, and recombine the substrings into full sets for train; val; test sets.  Also, make sure that you do this segmentation with respect for the classes where the text came from.  A final caution:  don’t use off-the-shelf train-val-test splitters available in other packages unless you first ensure they meet all of the requirements expressed in this step… and document how they achieve this.  If you build your own code for this part, fully document how you did it in your report.


In [9]:
# Get the train, val, and test sets
trainXHolder = []
valXHolder = []
testXHolder = []

trainYHolder = []
valYHolder = []
testYHolder = []

In [10]:
PROCESS = True

if PROCESS:
    text0, chars0, char_indices0, indices_char0, sequences0, trX, trY, vX, vY, teX, teY = preprocess_text_file('./textdatasets/0_bible.txt', 0)
    trainXHolder += ( trX ) 
    valXHolder   += ( vX )
    testXHolder  += ( teX )
    trainYHolder += ( trY )
    valYHolder   += ( vY )
    testYHolder  += ( teY )

    text1, chars1, char_indices1, indices_char1, sequences1, trX, trY, vX, vY, teX, teY = preprocess_text_file('./textdatasets/1_nietzsche.txt', 1)
    trainXHolder += ( trX )
    valXHolder   += ( vX )
    testXHolder  += ( teX )
    trainYHolder += ( trY )
    valYHolder   += ( vY )
    testYHolder  += ( teY )

    text2, chars2, char_indices2, indices_char2, sequences2, trX, trY, vX, vY, teX, teY = preprocess_text_file('./textdatasets/2_shakespeare.txt', 2)
    trainXHolder += ( trX )
    valXHolder   += ( vX )
    testXHolder  += ( teX )
    trainYHolder += ( trY )
    valYHolder   += ( vY )
    testYHolder  += ( teY )

    text3, chars3, char_indices3, indices_char3, sequences3, trX, trY, vX, vY, teX, teY = preprocess_text_file('./textdatasets/3_warpeace.txt', 3)
    trainXHolder += ( trX )
    valXHolder   += ( vX )
    testXHolder  += ( teX )
    trainYHolder += ( trY )
    valYHolder   += ( vY )
    testYHolder  += ( teY )

    # X, y = vectorize_text( chars, char_indices, trX, trY )

    # create a dict that holds all chars and indices and free unneeded memory
    text = text0 + text1 + text2 + text3
    del text0, text1, text2, text3
    chars = set( chars0 + chars1 + chars2 + chars3 )
    del chars0, chars1, chars2, chars3
    char_indices = {**char_indices0, **char_indices1, **char_indices2, **char_indices3}
    del char_indices0, char_indices1, char_indices2, char_indices3
    indices_char = {**indices_char0, **indices_char1, **indices_char2, **indices_char3}
    del indices_char0, indices_char1, indices_char2, indices_char3
        

corpus length: 4332559
corpus length: 600901
corpus length: 4573338
corpus length: 3196232


In [11]:
len(trainXHolder)

3810720

In [12]:
# Vectorize all the sets
VECTORIZE = True

if VECTORIZE:
    gc.collect()
    
    trainX = vectorize_text( chars, char_indices, trainXHolder, trainYHolder )
    trainY = to_categorical( trainYHolder )

    gc.collect()
    
    valX = vectorize_text( chars, char_indices, valXHolder, valYHolder )
    valY = to_categorical( valYHolder )

    gc.collect()
    
#     testX = vectorize_text( chars, char_indices, testXHolder, testYHolder )
#     testY = to_categorical( testYHolder )
    
    del trainXHolder, trainYHolder
    del valXHolder, valYHolder
#     del testXHolder, testYHolder
    
    gc.collect()

Vectorization processing... this could take a while...
3810720
68
 part 1 of 2
381073  of  3810720  completed
762145  of  3810720  completed
1143217  of  3810720  completed
1524289  of  3810720  completed
1905361  of  3810720  completed
2286433  of  3810720  completed
2667505  of  3810720  completed
3048577  of  3810720  completed
3429649  of  3810720  completed
 part 2 of 2
vetorization completed
Vectorization processing... this could take a while...
3810720
68
 part 1 of 2
381073  of  3810720  completed
762145  of  3810720  completed
1143217  of  3810720  completed
1524289  of  3810720  completed
1905361  of  3810720  completed
2286433  of  3810720  completed
2667505  of  3810720  completed
3048577  of  3810720  completed
3429649  of  3810720  completed
 part 2 of 2
vetorization completed


In [13]:
gc.collect()

20

# 3.	Design Validation-based Training Code:  

Modify the provided training code to enable examining additional information per epoch such as training loss and accuracy and validation loss and accuracy.  You may want to implement early stopping based on the validation set, however since you will be training 1 epoch at a time, you could also just take performance measurements after each training epoch.  If you are making decisions using the validation data (i.e. early stopping), remember to not use the same data to also evaluate the model’s performance.  Include code to capture the validation performance so that it can be plotted (in a later HW step).  


In [15]:
def train_net(model, trainx, trainy, valx, valy, training_iterations=6, maxlen=40, save_all_model_iterations=True):
    chkRNN = ModelCheckpoint('best_modelRNN.h5', monitor='val_acc', save_best_only=True, mode='max', verbose=1)
    for training_iteration in range(1, training_iterations+1):

        print()
        print('-' * 50)
        print('Training Iteration (epoch) #:', training_iteration)
        history = model.fit(trainx, 
                            trainy, 
                            batch_size=128, 
                            epochs=1, 
                            verbose=1,
                            validation_data=( valx, valy ) )    #train 1 epoch at a time using previous weights
        sleep(0.1)  # https://github.com/fchollet/keras/issues/2110

        # saving models at the following iterations -- uncomment it if you want tos save weights and load it later
        # if training_iteration==1 or training_iteration==3 or training_iteration==5 or training_iteration==10 or training_iteration==20 or training_iteration==30 or training_iteration==50 or training_iteration==60 :

        # # save every training_iteration of weights
        # model.save_weights('Karpathy_LSTM_weights_' + str(training_iteration) + '.h5', overwrite=True)
        # start_index = random.randint(0, len(text) - maxlen - 1)
        
        save_dir = os.path.join(os.getcwd(), 'saved_models')
        current_model_file_name = 'LSTM_model_' + str(training_iteration) + '.h5'
        if save_all_model_iterations:
            save_model(model=model, save_dir=save_dir, model_file_name=current_model_file_name)
            # save the history too
            history_name = 'model_history_' + str(training_iteration)
            with open( history_name, 'wb' ) as handle:
                pickle.dump( history.history, handle )
                
        sys.stdout.flush()
        print('Training:\n\tLoss: {}\n\tAccuracy: {}'.format( history.history['loss'][0], history.history['acc'][0]) )
        print()
        print('Validation:\n\tLoss: {}\n\tAccuracy: {}'.format( history.history['val_loss'][0], history.history['val_acc'][0]) )
        print()
    return model, history

# 4.	Design Testing code: 
The testing code provided in the HW is designed for the original Karpathy task.  Your testing code should have the signature test_model(model, observations, targets) and it should return a sklearn confusion matrix (cm).  observations is an arbitrary-sized list of 40-character sequences to classify, and targets is a list of the correct classes of those sequences.   A code shell for this function has been provided – but you will need to populate it with working code.

In [16]:
def test_model(model, observations, targets):
    '''
    STUDENT SHOULD WRITE THIS CODE
    :param model: a trained RNN model which accepts a sequence and outputs a target class (0;1;2;3)
    :param observations: a list of 40-character sequences to classify
    :param targets: a list of the true classes of the 40-character sequences
    :return: a sklearn confusion matrix
    '''
    #< put student code here to test the model >
    predicted_class_IDs = model.predict_classes( observations )
    actual_class_IDs = targets

    # generate & print confusion matrix to screen
    cm = confusion_matrix(actual_class_IDs, predicted_class_IDs)
    return cm

# 5.	New Architecture Training Check: 
Note that this classification task may not take as many training epochs as past HW assignments have taken – but each epoch may take longer to train than in previous assignments.  You may experience that overfitting begins earlier than in previous assignments.  Train your new Keras model for 2 epochs.  Confirm that training works (training loss lower in the second epoch) and track how long training takes.   You may need to adjust your model architecture based on the expected training time for 20 epochs (don’t build something that you don’t have time to train).  Report the estimated time for training 20 epochs.

In [17]:
gc.collect()

44

In [21]:
# model_training_iteration = 1
#     save_dir = os.path.join(os.getcwd(), 'saved_models')
#     current_model_file_name = 'LSTM_model_' + str(model_training_iteration) + '.h5'
#     gallaher = load_model(save_dir=save_dir, model_file_name=current_model_file_name)

TRAIN_MODE = True   #SET TO FALSE BEFORE SUBMITTING YOUR ASSIGNMENT!!!
if TRAIN_MODE:
    model_epoch_training_iterations = 2    #the bigger your text corpus, the smaller you can make this
    gallaher, history = train_net(model=gallaher, trainx=trainX, trainy=trainY, valx=valX, valy=valY,
                      training_iterations=model_epoch_training_iterations)


--------------------------------------------------
Training Iteration (epoch) #: 1


ValueError: You are passing a target array of shape (1299720, 1) while using as loss `categorical_crossentropy`. `categorical_crossentropy` expects targets to be binary matrices (1s and 0s) of shape (samples, classes). If your targets are integer classes, you can convert them to the expected format via:
```
from keras.utils import to_categorical
y_binary = to_categorical(y_int)
```

Alternatively, you can use the loss function `sparse_categorical_crossentropy` instead, which does expect integer targets.

In [45]:
def graphHistory( history, name ):

    # Plotting the Accuracy vs Epoch Graph
    plt.plot(history['acc'])
    plt.plot(history['val_acc'])
    plt.title(name + ' Model Accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()

    # Plotting the Loss vs Epoch Graphs
    plt.plot(history['loss'])
    plt.plot(history['val_loss'])
    plt.title( name + ' Model Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()

In [47]:
graphHistory( gallaher.history, 'yee')

TypeError: 'History' object is not subscriptable

# 6.	New Architecture Training:  
Train your network incrementally by epoch, monitoring both training and validation loss and accuracy.  Your goal is to design a network which could achieve a training accuracy of 100% and as high validation accuracy as possible.  Since you are saving each epoch’s model, you can backtrack to the model that achieves the highest validation accuracy for use in test.   In this way, you sidestep the danger of early stopping too early.  Plot training and validation curves as you go and use them to help you decide whether to stop training, keep training, or go back and redesign your network.

   a. If your network training accuracy has not yet plateaued that means you should train for at least another epoch – maybe several more.

   b.	If your network training accuracy plateaus for a few epochs and it is far below 100% accuracy and the loss is still improving, it may mean you just need to train it for more epochs (be patient).  
    
   c.	If your training loss plateaus then you may need to adjust the optimizer parameters such as learning rate or decay, AND/OR design a different network.  

   d.	Capacity check:  Once your training gets close to 100% with loss close to zero, then this means your network has sufficient capacity to learn the task and you are on your way to overfitting.  You are unlikely to be improving validation metrics if you keep training.  Note that dropout may disturb this type of capacity-check – dropout may prevent training performance from ever getting much better than validation performance.

Your network still may not perform well in generalization (validation accuracy and loss) even if it has good training performance.  Don’t get too hung up on trying to get great validation accuracy – if you are above 50% accuracy for a 4 class classification problem, you are doing decently better than chance.  Provide plots of training and validation loss and accuracy and describe what happened in your decisionmaking process (especially if you decided to re-design your network).  Consult chapter 11 in the deep learning book for additional guidance.
