# Model Training
> To facilitate a more automated training procedure, the model training is moved to a standalone python script.  
This keeps Keras much happier in terms of required restarts and memory usage.

In [1]:
%load_ext autoreload
%autoreload 2
from importlib import reload

import numpy as np
import time # !
import json
from matplotlib import pyplot as plt

from keras.utils import to_categorical

import glove_helper
from loadutils import conll2003Data, saveProcessedData, retrieve_model
from common import vocabulary, utils

Using TensorFlow backend.


In [2]:
# AZ UPDATE 4/21/2020: the columns were switched back in load_utils.py and the
# data files were re-generated, so results may differ from before.

# Language to train on.
LANGUAGE = "es"
# LANGUAGE = "ca"

# How much of the training data to use (percent): full set, half, and a tenth.
TRAIN_AMOUNT_100 = "100"
TRAIN_AMOUNT_50 = "50"
TRAIN_AMOUNT_10 = "10"

DIRECTORY = "../data/pos_tagging/" + LANGUAGE + "/"


def _train_file(amount):
    """Return the path of the training file holding `amount` percent of the data."""
    return DIRECTORY + "train_" + amount + ".txt"


# The training file depends on the (low-)resource setting: 100%, 50% or 10%.
TRAIN_FILE_100 = _train_file(TRAIN_AMOUNT_100)
TRAIN_FILE_50 = _train_file(TRAIN_AMOUNT_50)
TRAIN_FILE_10 = _train_file(TRAIN_AMOUNT_10)

# dev, test, and vectors are the same for every training amount
DEV_FILE = DIRECTORY + "dev.txt"
TEST_FILE = DIRECTORY + "test.txt"
VECTORS = "data/" + LANGUAGE + "/wiki." + LANGUAGE + ".zip"

# File used to hand the hyper-parameters to the training script (IPC).
HYPER_PARAM_FILE = "hyper_params.json"

VOCAB_SIZE = 20000

## Local helper utils

In [3]:
# local utils

# timeit decorator
def timeit(method):
    """Decorator that measures the wall-clock duration of each call to `method`.

    When the call includes a `log_time` keyword argument (a dict), the elapsed
    time in whole milliseconds is stored in that dict under `log_name` (if that
    keyword is given) or under the upper-cased function name. Otherwise the
    timing is printed. Both keywords are forwarded to `method` unchanged, so
    `method` must accept them whenever they are used.

    Returns:
        A wrapper with the same name/docstring as `method` that returns whatever
        `method` returns.
    """
    from functools import wraps  # local import keeps this cell self-contained

    @wraps(method)  # preserve __name__ / __doc__ of the decorated function
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int((te - ts) * 1000)
        else:
            print('%r  %2.2f ms' % (method.__name__, (te - ts) * 1000))
        return result
    return timed

In [4]:
def construct_embedding_matrix(embed_dim, vocab_size, SIZE):
    """
    Construct the embedding matrix for one training-set vocabulary from the
    pretrained word vectors stored in VECTORS.

    Reuses the glove_helper code from w266. Note that every call reloads
    glove_helper and re-reads the vector archive.

    Args:
        embed_dim: dimensionality of the word vectors (passed to Hands as ndim).
        vocab_size: number of rows of the returned matrix.
        SIZE: training amount whose vocabulary is used: 100 -> vocabData_100,
            50 -> vocabData_50, any other value -> vocabData_10.

    Returns: a (vocab_size, embed_dim) array whose row i is the vector of
        vocabulary id i (rows without a vocabulary entry stay zero); it can be
        plugged directly into keras.layers.Embedding(weights=[embedding_matrix])
    """
    reload(glove_helper)
    hands = glove_helper.Hands(vector_zip=VECTORS, ndim=embed_dim)
    embedding_matrix = np.zeros((vocab_size, embed_dim))

    # Pick the vocabulary (a notebook-level global) matching the training amount.
    if SIZE == 100:
        vocabData = vocabData_100
    elif SIZE == 50:
        vocabData = vocabData_50
    else:
        vocabData = vocabData_10

    for i in range(vocabData.vocab.size):
        word = vocabData.vocab.ids_to_words([i])[0]
        try:
            embedding_vector = hands.get_vector(word)
        except Exception:
            # Word has no pretrained vector: fall back to the "<unk>" vector.
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupts
            # still propagate during this long loop.
            embedding_vector = hands.get_vector("<unk>")
        embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [5]:
def plot_history( history):
    """Plot the training curves recorded in `history.history`.

    Two figures are shown one after the other: accuracy ('acc' vs 'val_acc')
    and then loss ('loss' vs 'val_loss'), each against the epoch index.
    """
    series = history.history
    # (key of the training series, label used for the y axis and the title)
    for key, label in (('acc', 'accuracy'), ('loss', 'loss')):
        plt.plot(series[key])
        plt.plot(series['val_' + key])
        plt.title('model ' + label)
        plt.ylabel(label)
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()

## Load the Data

In [6]:
# Vocabulary cap and context-window length shared by every split below.
global_max_features = 20000
windowLength = 11


def _format_windows(vocab_data, sentences):
    """Format `sentences` into windowed model inputs using `vocab_data`.

    Thin wrapper that fixes the notebook-wide windowLength and silences the
    verbose output; returns whatever formatWindowedData returns (unpacked by the
    callers below as X, X_pos, X_capitals, Y).
    """
    return vocab_data.formatWindowedData(sentences,
                                         windowLength=windowLength,
                                         verbose=False)


# --- Vocabularies: one per training amount, built from that training file ---
vocabData_100 = conll2003Data(TRAIN_FILE_100)
vocabData_100.buildVocab(vocabSize=global_max_features)

vocabData_50 = conll2003Data(TRAIN_FILE_50)
vocabData_50.buildVocab(vocabSize=global_max_features)

vocabData_10 = conll2003Data(TRAIN_FILE_10)
vocabData_10.buildVocab(vocabSize=global_max_features)

# --- Training data ---
(trainX_100, trainX_pos_100,
 trainX_capitals_100, trainY_100) = _format_windows(vocabData_100,
                                                    vocabData_100.train_sentences)

(trainX_50, trainX_pos_50,
 trainX_capitals_50, trainY_50) = _format_windows(vocabData_50,
                                                  vocabData_50.train_sentences)

(trainX_10, trainX_pos_10,
 trainX_capitals_10, trainY_10) = _format_windows(vocabData_10,
                                                  vocabData_10.train_sentences)

# --- Dev data: the same file, read and formatted with each vocabulary ---
devSents_100 = vocabData_100.readFile(DEV_FILE)
(devX_100, devX_pos_100,
 devX_capitals_100, devY_100) = _format_windows(vocabData_100, devSents_100)

devSents_50 = vocabData_50.readFile(DEV_FILE)
(devX_50, devX_pos_50,
 devX_capitals_50, devY_50) = _format_windows(vocabData_50, devSents_50)

devSents_10 = vocabData_10.readFile(DEV_FILE)
(devX_10, devX_pos_10,
 devX_capitals_10, devY_10) = _format_windows(vocabData_10, devSents_10)

# --- Test data: the same file, read and formatted with each vocabulary ---
testSents_100 = vocabData_100.readFile(TEST_FILE)
(testX_100, testX_pos_100,
 testX_capitals_100, testY_100) = _format_windows(vocabData_100, testSents_100)

testSents_50 = vocabData_50.readFile(TEST_FILE)
(testX_50, testX_pos_50,
 testX_capitals_50, testY_50) = _format_windows(vocabData_50, testSents_50)

testSents_10 = vocabData_10.readFile(TEST_FILE)
(testX_10, testX_pos_10,
 testX_capitals_10, testY_10) = _format_windows(vocabData_10, testSents_10)

----------------------------------------------------
reading file from path ../data/pos_tagging/es/train_100.txt
'readFile'  4790.20 ms
----------------------------------------------------
building vocabulary from TRAINING data...
'buildVocab'  3923.46 ms
----------------------------------------------------
reading file from path ../data/pos_tagging/es/train_50.txt
'readFile'  1336.91 ms
----------------------------------------------------
building vocabulary from TRAINING data...
'buildVocab'  1155.26 ms
----------------------------------------------------
reading file from path ../data/pos_tagging/es/train_10.txt
'readFile'  209.61 ms
----------------------------------------------------
building vocabulary from TRAINING data...
'buildVocab'  420.40 ms
----------------------------------------------------
formatting sentences into input windows...
'formatWindowedData'  8808.05 ms
----------------------------------------------------
formatting sentences into input windows...
'formatWind

In [7]:
# Build the pretrained embedding matrix for each training-amount vocabulary.

# The embedding dimension is a notebook-level global instead of an entry in the
# hyper_param dictionaries because it has to be decided before the data for the
# decoder output is loaded.
global_embed_dim = 300  # an earlier setup used 50

# Arguments shared by the three calls below.
_embedding_args = dict(embed_dim=global_embed_dim, vocab_size=global_max_features)

embedding_matrix_100 = construct_embedding_matrix(SIZE=100, **_embedding_args)
embedding_matrix_50 = construct_embedding_matrix(SIZE=50, **_embedding_args)
embedding_matrix_10 = construct_embedding_matrix(SIZE=10, **_embedding_args)




Loading vectors from data/es/wiki.es.zip
Parsing file: data/es/wiki.es.zip:wiki.es.vec
Found 985,668 words.
Parsing vectors... Done! (W.shape = (985671, 300))
Loading vectors from data/es/wiki.es.zip
Parsing file: data/es/wiki.es.zip:wiki.es.vec
Found 985,668 words.
Parsing vectors... Done! (W.shape = (985671, 300))
Loading vectors from data/es/wiki.es.zip
Parsing file: data/es/wiki.es.zip:wiki.es.vec
Found 985,668 words.
Parsing vectors... Done! (W.shape = (985671, 300))


In [14]:
# Get Y: one-hot encode the target labels for every training amount.


def _one_hot_labels(train_ids, dev_ids, test_ids):
    """One-hot encode train/dev/test label ids against the training label set.

    The number of classes is taken from the training encoding so that dev and
    test share its width. The first three columns of every encoding are then
    dropped (presumably reserved/special ids -- TODO confirm against the
    vocabulary code).

    Returns:
        (train, dev, test) tuple of float64 arrays.
    """
    train_cat = to_categorical(train_ids.astype('float32'))
    num_classes = train_cat.shape[1]
    dev_cat = to_categorical(dev_ids.astype('float32'), num_classes=num_classes)
    test_cat = to_categorical(test_ids.astype('float32'), num_classes=num_classes)
    # Vectorised column slice instead of a per-row Python map; builtin `float`
    # replaces the deprecated `np.float` alias (same float64 result).
    return tuple(cat[:, 3:].astype(float) for cat in (train_cat, dev_cat, test_cat))


trainY_cat_100, devY_cat_100, testY_cat_100 = _one_hot_labels(trainY_100, devY_100, testY_100)
trainY_cat_50, devY_cat_50, testY_cat_50 = _one_hot_labels(trainY_50, devY_50, testY_50)
trainY_cat_10, devY_cat_10, testY_cat_10 = _one_hot_labels(trainY_10, devY_10, testY_10)


In [15]:
# Get decoder Y -- the embedding vector (global_embed_dim wide) of each window's
# center word.
#
# The center column is derived from windowLength instead of being hard-coded:
# the previous fixed column 4 is not the middle of an 11-token window.
# NOTE(review): assumes windows are symmetric around the target token -- confirm
# against formatWindowedData.
center_idx = windowLength // 2

train_decoderY_100 = embedding_matrix_100[trainX_100[:, center_idx]]
dev_decoderY_100 = embedding_matrix_100[devX_100[:, center_idx]]
test_decoderY_100 = embedding_matrix_100[testX_100[:, center_idx]]

train_decoderY_50 = embedding_matrix_50[trainX_50[:, center_idx]]
dev_decoderY_50 = embedding_matrix_50[devX_50[:, center_idx]]
test_decoderY_50 = embedding_matrix_50[testX_50[:, center_idx]]

train_decoderY_10 = embedding_matrix_10[trainX_10[:, center_idx]]
dev_decoderY_10 = embedding_matrix_10[devX_10[:, center_idx]]
test_decoderY_10 = embedding_matrix_10[testX_10[:, center_idx]]

In [21]:
"""
AZ:  trying to get pos tags crashes the kernel on my laptop, try on another device or on HPC
"""

# Get X pos tags: one-hot encode the POS-tag windows for every training amount.
#
# NOTE(review): the last recorded run of this cell ended with
# "ValueError: setting an array element with a sequence."; the root cause is not
# visible from this notebook and is not addressed here.


def _one_hot_pos(train_ids, dev_ids, test_ids):
    """One-hot encode train/dev/test POS-tag windows against the training tag set.

    The number of classes is taken from the last axis of the training encoding
    so that dev and test share its width. The first three classes are then
    dropped from the last axis (presumably reserved/special ids -- TODO confirm).

    Returns:
        (train, dev, test) tuple of float64 arrays.
    """
    train_cat = to_categorical(train_ids.astype('float32'))
    num_classes = train_cat.shape[2]
    dev_cat = to_categorical(dev_ids.astype('float32'), num_classes=num_classes)
    test_cat = to_categorical(test_ids.astype('float32'), num_classes=num_classes)
    # One vectorised slice per array instead of building a Python list of
    # per-window arrays; builtin `float` replaces the deprecated `np.float`.
    return tuple(cat[:, :, 3:].astype(float) for cat in (train_cat, dev_cat, test_cat))


trainX_pos_cat_100, devX_pos_cat_100, testX_pos_cat_100 = _one_hot_pos(
    trainX_pos_100, devX_pos_100, testX_pos_100)
trainX_pos_cat_50, devX_pos_cat_50, testX_pos_cat_50 = _one_hot_pos(
    trainX_pos_50, devX_pos_50, testX_pos_50)
trainX_pos_cat_10, devX_pos_cat_10, testX_pos_cat_10 = _one_hot_pos(
    trainX_pos_10, devX_pos_10, testX_pos_10)

ValueError: setting an array element with a sequence.

In [18]:
# Get X capitalization: one-hot encode the capitalization windows
# ("allCaps", "upperInitial", "lowercase", "mixedCaps", "noinfo")
# for every training amount.


def _one_hot_capitals(train_ids, dev_ids, test_ids):
    """One-hot encode train/dev/test capitalization windows.

    The number of classes is taken from the last axis of the training encoding
    so that dev and test share its width. The first three classes are then
    dropped from the last axis (presumably reserved/special ids -- TODO confirm).

    Returns:
        (train, dev, test) tuple of float64 arrays.
    """
    train_cat = to_categorical(train_ids.astype('float32'))
    num_classes = train_cat.shape[2]
    dev_cat = to_categorical(dev_ids.astype('float32'), num_classes=num_classes)
    test_cat = to_categorical(test_ids.astype('float32'), num_classes=num_classes)
    # One vectorised slice per array instead of a per-window Python map;
    # builtin `float` replaces the deprecated `np.float` alias.
    return tuple(cat[:, :, 3:].astype(float) for cat in (train_cat, dev_cat, test_cat))


trainX_capitals_cat_100, devX_capitals_cat_100, testX_capitals_cat_100 = _one_hot_capitals(
    trainX_capitals_100, devX_capitals_100, testX_capitals_100)
trainX_capitals_cat_50, devX_capitals_cat_50, testX_capitals_cat_50 = _one_hot_capitals(
    trainX_capitals_50, devX_capitals_50, testX_capitals_50)
trainX_capitals_cat_10, devX_capitals_cat_10, testX_capitals_cat_10 = _one_hot_capitals(
    trainX_capitals_10, devX_capitals_10, testX_capitals_10)

## Set up model parameters

In [19]:
# define hyper parameters for model
# CAPSNET


def _capsnet_hyper_params(trainX, trainX_pos_cat, trainX_capitals_cat, trainY_cat):
    """Return a fresh CapsNet hyper-parameter dict for one training split.

    Only the data-dependent sizes differ between splits; they are read from the
    shapes of the arrays passed in. Every other value is shared, so the three
    configurations below cannot drift apart.

    Args:
        trainX: windowed training inputs; axis 1 is the window length.
        trainX_pos_cat: one-hot POS-tag features; axis 2 is the number of classes.
        trainX_capitals_cat: one-hot capitalization features; axis 2 is the
            number of classes.
        trainY_cat: one-hot targets; axis 1 is the number of classes.
    """
    return {

        'max_features' : global_max_features,
        'maxlen' : trainX.shape[1],  # window size
        'poslen' : trainX_pos_cat.shape[2],  # pos classes
        # TODO: turn this into feature types, or only use that for capsule dimension?
        'capitallen' : trainX_capitals_cat.shape[2],  # capitalization classes
        'ner_classes' : trainY_cat.shape[1],  # target classes (POS coarse tags)
        'embed_dim' : global_embed_dim,  # word embedding size
        'num_routing' : 3,

        'use_glove' : True,
        'allow_glove_retrain' : False,
        'use_pos_tags' : True,
        'use_capitalization_info' : True,

        'conv1_filters' : 256,
        'conv1_kernel_size' : 3,
        'conv1_strides' : 1,
        'conv1_padding' : 'valid',

        'use_2D_primarycaps' : False,
        'primarycaps_dim_capsule' : 8,  # alternative considered: 22 (number of feature types?)
        'primarycaps_n_channels' : 32,
        'primarycaps_kernel_size' : 3,
        'primarycaps_strides' : 1,
        'primarycaps_padding' : 'valid',

        'ner_capsule_dim' : 16,  # alternative considered: 18 (number of coarse POS tags?)

        'num_dynamic_routing_passes' : 3,

        # decoder is still work in progress
        'use_decoder' : False,
        'decoder_feed_forward_1' : 100,
        'decoder_feed_forward_2' : 100,
        'decoder_dropout' : 0.3,

        'save_dir' : './result',
        'batch_size' : 100,
        'debug' : 2,
        'epochs' : 5,
        'stopping_patience' : 3,
        'dropout_p' : 0.25,
        'embed_dropout' : 0.25,
        'lam_recon' : 0.0005,

        'optimizer' : 'Adam', #or 'SGD'
        'loss_function' : 'custom_cosine', # mean_squared_error
    }


hyper_param_caps_100 = _capsnet_hyper_params(
    trainX_100, trainX_pos_cat_100, trainX_capitals_cat_100, trainY_cat_100)

hyper_param_caps_50 = _capsnet_hyper_params(
    trainX_50, trainX_pos_cat_50, trainX_capitals_cat_50, trainY_cat_50)

hyper_param_caps_10 = _capsnet_hyper_params(
    trainX_10, trainX_pos_cat_10, trainX_capitals_cat_10, trainY_cat_10)





NameError: name 'trainX_pos_cat_100' is not defined

In [20]:
# define hyper parameters for model
# CNN


def _cnn_hyper_params(trainX, trainX_pos_cat, trainX_capitals_cat, trainY_cat):
    """Return a fresh CNN hyper-parameter dict for one training split.

    Only the data-dependent sizes differ between splits; they are read from the
    shapes of the arrays passed in. Every other value is shared, so the three
    configurations below cannot drift apart.

    Args:
        trainX: windowed training inputs; axis 1 is the window length.
        trainX_pos_cat: one-hot POS-tag features; axis 2 is the number of classes.
        trainX_capitals_cat: one-hot capitalization features; axis 2 is the
            number of classes.
        trainY_cat: one-hot targets; axis 1 is the number of classes.
    """
    return {

        'max_features' : global_max_features,
        'maxlen' : trainX.shape[1],  # window size
        'poslen' : trainX_pos_cat.shape[2],  # pos classes
        'capitallen' : trainX_capitals_cat.shape[2],  # capitalization classes
        'ner_classes' : trainY_cat.shape[1],  # target classes
        'embed_dim' : global_embed_dim,  # word embedding size
        'num_routing' : 3,

        'use_glove' : True,
        'allow_glove_retrain' : False,
        'use_pos_tags' : True,
        'use_capitalization_info' : True,

        'conv1_filters' : 256,
        'conv1_kernel_size' : 3,
        'conv1_strides' : 1,
        'conv1_padding' : 'valid',

        'conv2_filters' : 256,
        'conv2_kernel_size' : 3,
        'conv2_strides' : 1,
        'conv2_padding' : 'valid',

        'conv3_filters' : 128,
        'conv3_kernel_size' : 3,
        'conv3_strides' : 1,
        'conv3_padding' : 'valid',

        'max_pooling_size' : 3,
        'max_pooling_strides' : 1,
        'max_pooling_padding' : 'valid',
        'maxpool_dropout' : 0.3,

        'feed_forward_1' : 328,
        'ff1_dropout' : 0.3,
        'feed_forward_2' : 192,
        'ff2_dropout' : 0.3,

        'save_dir' : './result',
        'batch_size' : 100,
        'debug' : 2,
        'epochs' : 5,
        'stopping_patience' : 5, # default to same as epochs, ie don't use
        'dropout_p' : 0.25,
        'embed_dropout' : 0.25,  # set to 0 to disable dropout
        'lam_recon' : 0.0005,

        'optimizer' : 'Adam', #or 'SGD'
    }


hyper_param_cnn_100 = _cnn_hyper_params(
    trainX_100, trainX_pos_cat_100, trainX_capitals_cat_100, trainY_cat_100)

hyper_param_cnn_50 = _cnn_hyper_params(
    trainX_50, trainX_pos_cat_50, trainX_capitals_cat_50, trainY_cat_50)

hyper_param_cnn_10 = _cnn_hyper_params(
    trainX_10, trainX_pos_cat_10, trainX_capitals_cat_10, trainY_cat_10)




NameError: name 'trainX_pos_cat_100' is not defined

## Save All Data to Disk

In [None]:
# Save all loaded data for use by the training process, one call per training
# amount (100%, 50%, 10%).
# NOTE(review): the calls carry no per-amount destination, so later calls may
# overwrite what earlier ones wrote -- confirm against saveProcessedData.

saveProcessedData(
    trainX_100, trainX_capitals_cat_100, trainX_pos_cat_100,
    devX_100, devX_capitals_cat_100, devX_pos_cat_100,
    trainY_cat_100, devY_cat_100,
    embedding_matrix_100, train_decoderY_100, dev_decoderY_100,
)

saveProcessedData(
    trainX_50, trainX_capitals_cat_50, trainX_pos_cat_50,
    devX_50, devX_capitals_cat_50, devX_pos_cat_50,
    trainY_cat_50, devY_cat_50,
    embedding_matrix_50, train_decoderY_50, dev_decoderY_50,
)

saveProcessedData(
    trainX_10, trainX_capitals_cat_10, trainX_pos_cat_10,
    devX_10, devX_capitals_cat_10, devX_pos_cat_10,
    trainY_cat_10, devY_cat_10,
    embedding_matrix_10, train_decoderY_10, dev_decoderY_10,
)


## Model Training Functions

In [None]:
@timeit 
def trainModelSP( testFunc, modelName, hyper_params, embed_matrix=None, verbose=False):
    """
    Train one model in a separate python process.

    The hyper parameters are written to HYPER_PARAM_FILE as JSON, then
    testFunc is launched with the model name and that file as its two
    command line arguments. The child's output is echoed as it arrives.

    testFunc - the name of the python file to run
    modelName - the internal name (ID) of the model to train
    hyper_params - a dict of hyper parameters (must be JSON serializable)
    embed_matrix - currently unused by this function
    verbose - currently unused by this function
    """
    # local imports so this cell does not depend on edits to the import cell
    import subprocess
    import sys

    # save the hyperparams
    with open(HYPER_PARAM_FILE, mode='w') as fp:
        json.dump( hyper_params, fp)

    # call the train function
    # An argument list (no shell) keeps names containing spaces or shell
    # metacharacters intact, and sys.executable guarantees the child runs in
    # the same interpreter/environment as this kernel.
    # "-u" turns off the child's output buffering so progress shows up live.
    cmd = [sys.executable, "-u", str(testFunc), str(modelName), str(HYPER_PARAM_FILE)]
    proc = subprocess.Popen(cmd,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT,
                            universal_newlines=True)
    # the context manager closes the pipe and waits for the child to exit
    with proc:
        for line in proc.stdout:
            print(line, end='', flush=True)

    # a failed run used to go unnoticed; report it but keep going so that the
    # remaining models of a batch are still trained
    if proc.returncode != 0:
        print('WARNING: %s exited with status %d for model %r' %
              (testFunc, proc.returncode, modelName))



In [None]:
@timeit 
def testFeatures( testFunc, modelName, hyper_params, TRAIN_AMOUNT):
    """
    Trains two models for the configuration in hyper_params:
      * "<modelName>_<LANGUAGE>_<TRAIN_AMOUNT>_base"     - hyper_params as passed in
      * "<modelName>_<LANGUAGE>_<TRAIN_AMOUNT>_features" - same settings, but with
        use_pos_tags = True and use_capitalization_info = False
    The capitalization-only ("_caps") and pos + caps ("_pos_caps") runs are
    currently disabled.

    testFunc - the name of the python file to run
    modelName - the model name to use for labeling
    hyper_params - a dict of hyper parameters; copied, the caller's dict is not modified
    TRAIN_AMOUNT - string naming the share of training data used (e.g. "100"),
                   only used to build the model names
    """
    # every model trained here shares this name prefix
    run_prefix = "_".join((modelName, LANGUAGE, TRAIN_AMOUNT))
    params = dict(hyper_params)

    # base: feature flags exactly as supplied by the caller
    trainModelSP( testFunc, run_prefix + "_base", params )

    # pos tags on, capitalization info off
    params.update({'use_pos_tags': True, 'use_capitalization_info': False})
    trainModelSP( testFunc, run_prefix + "_features", params )

##  Training
> The output isn't pretty, but we don't really need it, since everything is stored in the history log; it is only there to show a sign of life.  
> * The code below is just an example of how to set hyper parameters and train multiple models.

### train capsnet

In [None]:
# capsnet training function
testFunc = "trainCapsModel.py"

# Settings applied on top of each base configuration for this quick run.
# testFeatures trains a "base" model with these flags and a "features" model
# with pos tags switched on.
run_overrides = {
    'epochs': 1,
    'stopping_patience': 3,
    'use_pos_tags': False,
    'use_capitalization_info': False,
    # use glove, no learn
    'use_glove': True,
    'allow_glove_retrain': False,
    'embed_dropout': 0.0,
}

# Other embedding variants tried earlier (currently disabled). To re-enable one,
# change the three embedding keys above and the model name passed to testFeatures:
#   model name               use_glove  allow_glove_retrain  embed_dropout
#   "learn"                  False      -                    0.0
#   "learn_dropout"          False      -                    0.25
#   "glove_nolearn_dropout"  True       False                0.25
#   "glove_learn"            True       True                 0.0
#   "glove_learn_dropout"    True       True                 0.25

# one pass per training-set size: 100%, 50% and 10% of the training data
for base_params, train_amount in (
    (hyper_param_caps_100, "100"),
    (hyper_param_caps_50, "50"),
    (hyper_param_caps_10, "10"),
):
    # work on a copy so the base configuration stays untouched
    hypers = base_params.copy()
    hypers.update(run_overrides)

    print("\n\nGlove Embeddings")
    testFeatures( testFunc, "glove_nolearn", hypers, train_amount)


### train CNN

In [None]:
# CNN training function
testFunc = "trainCNNModel.py"

# Settings applied on top of each base configuration for this quick run.
# testFeatures trains a "base" model with these flags and a "features" model
# with pos tags switched on.
run_overrides = {
    'epochs': 1,
    'stopping_patience': 3,
    'use_pos_tags': False,
    'use_capitalization_info': False,
    # use glove, no learn
    'use_glove': True,
    'allow_glove_retrain': False,
    'embed_dropout': 0.0,
}

# Other embedding variants tried earlier (currently disabled). To re-enable one,
# change the three embedding keys above and the model name passed to testFeatures.
# The old names had no "_cnn" suffix; add one so they cannot collide with the
# capsnet model names.
#   model name               use_glove  allow_glove_retrain  embed_dropout
#   "learn"                  False      -                    0.0
#   "learn_dropout"          False      -                    0.25
#   "glove_nolearn_dropout"  True       False                0.25
#   "glove_learn"            True       True                 0.0
#   "glove_learn_dropout"    True       True                 0.25

# one pass per training-set size: 100%, 50% and 10% of the training data
for base_params, train_amount in (
    (hyper_param_cnn_100, "100"),
    (hyper_param_cnn_50, "50"),
    (hyper_param_cnn_10, "10"),
):
    # work on a copy so the base configuration stays untouched
    hypers = base_params.copy()
    hypers.update(run_overrides)

    print("\n\nGlove Embeddings")
    testFeatures( testFunc, "glove_nolearn_cnn", hypers, train_amount)


In [None]:
# # SGD testing
# # capsnet training function
# testFunc = "trainCapsModel.py"

# hypers = hyper_param_caps.copy()
# hypers['optimizer'] = "SGD"
# print("Training with SGD - Nesterov Momentum Optimizer")

# hypers['epochs'] = 1
# hypers['stopping_patience'] = 3
# hypers['use_pos_tags'] = False
# hypers['use_capitalization_info'] = False

# # # try different embeddings
# # # learn embeddings
# # print("\n\nLearn Embeddings")
# # hypers['use_glove'] = False
# # hypers['embed_dropout'] = 0.0
# # testFeatures( testFunc, "SGD_primcaps_learn", hypers)

# # # learn embeddings + Dropout
# # print("\n\nLearn Embeddings and Dropout")
# # hypers['use_glove'] = False
# # hypers['embed_dropout'] = 0.25
# # testFeatures( testFunc, "SGD_primcaps_learn_dropout", hypers)

# # use glove, no learn
# print("\n\nGlove Embeddings")
# hypers['use_glove'] = True
# hypers['allow_glove_retrain'] = False
# hypers['embed_dropout'] = 0.0
# testFeatures( testFunc, "SGD_primcaps_glove_nolearn", hypers)

# # # use glove, no learn + Dropout
# # print("\n\nGlove Embeddings and Dropout")
# # hypers['use_glove'] = True
# # hypers['allow_glove_retrain'] = False
# # hypers['embed_dropout'] = 0.25
# # testFeatures( testFunc, "SGD_primcaps_glove_nolearn_dropout", hypers)

# # # use glove, learn
# # print("\n\nGlove Embeddings with Learning")
# # hypers['use_glove'] = True
# # hypers['allow_glove_retrain'] = True
# # hypers['embed_dropout'] = 0.0
# # testFeatures( testFunc, "SGD_primcaps_glove_learn", hypers)

# # # # use glove, learn + Dropout
# # print("\n\nGlove Embeddings with Learning and Dropout")
# # hypers['use_glove'] = True
# # hypers['allow_glove_retrain'] = True
# # hypers['embed_dropout'] = 0.25
# # testFeatures( testFunc, "SGD_primcaps_glove_learn_dropout", hypers)