## Init

### Jupyter configurations

In [1]:
# expand cell widths to 100% for better output viewing
# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:100% !important; }</style>"))

### Imports

In [2]:
import tensorflow as tf
import keras
import keras.backend as K
import numpy as np
import pandas as pd
import sys, inspect, argparse, importlib, traceback, re 
from copy import deepcopy

Using TensorFlow backend.


### Function: Accuracy Metric

In [3]:
def exact_match_accuracy(y_true, y_pred):
    """Custom accuracy metric: exact match of predicted against true sequences.

    The one-hot (last) axis of both tensors is collapsed with arg-max, the resulting
    indices are compared element-wise, and a row only counts as correct when every
    position along the last remaining axis matches. For inputs shaped
    (batch, width, onehot) that means: a sample scores 1 if all characters match
    the ground truth and 0 otherwise.

    # Arguments
        y_true: A Tensor holding the true y values for the batch
        y_pred: A Tensor holding the predicted y values for the batch

    # Returns
        A Tensor of rank 0 containing the fraction of exact matches in the batch
    """

    # onehot -> index for truth and prediction, then compare position by position
    char_matches = tf.math.equal(
        tf.math.argmax(y_true, axis=-1),
        tf.math.argmax(y_pred, axis=-1),
    )

    # every position along the last axis has to match
    sequence_matches = tf.math.reduce_all(char_matches, axis=-1)

    # bool -> float so that the mean is the fraction of exact matches
    return tf.reduce_mean(tf.cast(sequence_matches, tf.float32))

### Function: Log function

In [4]:
verbose = False
def log(*l, **d):
    """Forward everything to print(), but only while the module-level 'verbose' flag is truthy.

    # Arguments
        *l, **d: positional and keyword arguments passed unchanged to print()
    """
    # silent mode: drop the message
    if not verbose:
        return
    print(*l, **d)
        
        
        
training_history = []
def training_log(x, y, a, b, e, l, m):
    """Append one record describing a training run to the module-level 'training_history' list.

    # Arguments
        x: stored under 'x'
        y: stored under 'y'
        a: stored under 'architecture'
        b: stored under 'batch size'
        e: stored under 'epochs'
        l: stored under 'loss'
        m: stored under 'accuracy'
    """
    field_names = ('x', 'y', 'architecture', 'batch size', 'epochs', 'loss', 'accuracy')
    record = dict(zip(field_names, (x, y, a, b, e, l, m)))
    training_history.append(record)

## Preprocess Data

### Function: Vectorize Strings

In [5]:
def vectorize_data(data):
    """Vectorise multidimensional arrays of strings.
    Converts a rank n structure of strings into a rank n+1 structure of ints by
    replacing every string with the list of its character codes.

    Strings are upper-cased before lookup in the module-level 'char_to_int' map.
    Characters missing from the map are encoded as the '<UnknownChar>' token instead of
    raising; previously the resulting KeyError was swallowed by a blanket except, which
    silently replaced the whole enclosing list with [].

    # Arguments
        data: a string, or an arbitrarily nested iterable (lists / arrays) of strings

    # Returns
        nested lists mirroring 'data' with each string replaced by a list of ints;
        a non-string, non-iterable leaf is replaced by an empty list
    """

    # base case, return vectorised string (characters replaced with numbers)
    if isinstance(data, str):
        unknown = char_to_int['<UnknownChar>']
        return [char_to_int.get(char, unknown) for char in data.upper()]

    # anything that cannot be iterated is kept as an empty entry, as before
    # (presumably this is how missing CSV cells arrive here; verify against the caller)
    try:
        items = iter(data)
    except TypeError:
        return []

    # recursive case: vectorise each item (rank n-1) along the top level axis
    return [vectorize_data(item) for item in items]


def devectorise_data(data):
    """'De-vectorise' data: turn arrays of character codes back into strings.
    All axes after the first are flattened, so the 'words' of a sample are concatenated
    into a single string. Codes are looked up in the module-level 'int_to_char' map.

    # Arguments
        data: multidimensional numpy array of ints, first axis is the sample axis

    # Returns
        1-D numpy object array of strings, one string per sample
    """

    sample_count = data.shape[0]

    # (samples, ...) -> (samples, characters)
    flat_codes = data.reshape(sample_count, -1)

    # object array so each entry can hold a string of any length
    decoded = np.full((sample_count,), '', dtype=object)
    for row_index, row in enumerate(flat_codes):
        decoded[row_index] = ''.join(int_to_char[int(code)] for code in row)

    return decoded

### Function: Add Padding Tokens

In [6]:
# find size of largest array across each dimension to compute shape of bounding ndarray
def size(data):
    """Determine the bounding shape (largest size along each axis) of a list based
    multidimensional array whose sub-lists may have different lengths.

    Sub-lists of different nesting depth (e.g. an empty list next to nested lists) are
    supported: shallower shapes are padded with zeros before taking the per-axis maximum.
    The previous numpy based reduction could not handle such mixed depths.

    # Arguments
        data: list based multidimensional array with plain Python ints as leaves

    # Returns
        a tuple of ints holding the size of each dimension of the bounding array
    """

    # base case, this item is a value, return empty shape
    if type(data) == int:
        return ()

    # number of items in the top level list
    this_size = len(data)

    # shapes of the items one level down (empty when there are no items)
    inner_shapes = [size(item) for item in data]

    # deepest nesting found among the items
    inner_rank = max((len(shape) for shape in inner_shapes), default=0)

    # pad shallower shapes with zeros so every shape has the same rank
    padded = [tuple(shape) + (0,) * (inner_rank - len(shape)) for shape in inner_shapes]

    # per-axis maximum over all items
    inner_sizes = tuple(max(axis_sizes) for axis_sizes in zip(*padded))

    # this shape is the size of the top level list followed by the max inner shape
    return (this_size,) + inner_sizes
    
    
def insert_vector(matrix, data, indices=()):
    """Copy the values of a list based multidimensional array into a numpy ndarray, in place.

    # Arguments
        matrix: the ndarray that receives the values (modified in place)
        data: the list based multidimensional array of vectors, or a single int
        indices: tuple with the position inside 'matrix' at which 'data' is placed;
            extended by one index per recursion level, callers normally leave it empty
    """

    # recursive case: anything that is not a plain int is walked item by item,
    # each item is inserted one level deeper at the position given by its index
    if type(data) != int:
        for position in range(len(data)):
            insert_vector(matrix, data[position], indices + (position,))
        return

    # base case: write the single value at the accumulated position
    matrix[indices] = data
    

def pad_vector_data(data, pad_token, pad_shape=None):
    """Create a uniformly shaped numpy ndarray filled with padding and insert the data into it.

    # Arguments
        data: list based multidimensional array
        pad_token: padding token (int) used for every cell not covered by 'data'
        pad_shape (optional): minimum shape of the ndarray (tuple, list or ndarray);
            the result is the element-wise maximum of this and the shape needed to fit
            the data. When omitted the array fits the data exactly.

    # Returns
        numpy int32 ndarray containing the padded data
    """

    # determine the shape needed to fit the data
    shape = size(data)

    # identity check: '!= None' would compare element-wise (and then fail in the 'if')
    # when pad_shape is given as an ndarray
    if pad_shape is not None:
        shape = tuple(np.maximum(pad_shape, shape))

    # empty matrix filled with the padding token
    matrix = np.full(shape, pad_token, np.int32)

    # insert data into the ndarray
    insert_vector(matrix, data, ())

    return matrix

### Function: Split dataset

In [7]:
def split(data, sizes):
    """Cut 'data' into consecutive subsets of the given sizes.

    # Arguments
        data: array of samples (anything that supports slicing)
        sizes: iterable of subset sizes, e.g. (5, 10, 5) -> data[0:5], data[5:15], data[15:20]

    # Returns
        list with one slice of 'data' per entry in 'sizes'
    """

    # running totals mark where each subset ends, (5, 10, 5) -> (5, 15, 20)
    stops = []
    for step in sizes:
        stops.append(step if not stops else step + stops[-1])

    # every subset starts where the previous one ended, the first one at 0
    starts = [0] + stops[:-1]

    return [data[start:stop] for start, stop in zip(starts, stops)]

### Function: Shuffle dataset

In [8]:
def shuffle(*data):
    """Shuffle several datasets together, applying the same random order to each.
    The order is drawn from numpy's global random state.

    # Arguments
        *data: datasets supporting index-array selection; the permutation length is
            taken from the first one

    # Returns
        list of shuffled datasets, in the order they were passed
    """

    # start from the identity order and randomise it in place
    permutation = np.arange(len(data[0]))
    np.random.shuffle(permutation)

    # apply the one permutation to every dataset
    shuffled = []
    for dataset in data:
        shuffled.append(dataset[permutation])
    return shuffled

### Function: Extract relevant data

In [9]:
def extract(*keys, **cuts):
    """Look up the requested datasets in the module-level 'onehot_data' dictionary,
    e.g. extract('FileName', 'LineName'), optionally slicing them first.

    A cut is a list of slice-argument tuples. The cuts are aligned to the axes directly
    in front of the last axis; the last axis is always kept whole and axes without a cut
    are kept whole as well. E.g. for a 3-D dataset 'a', a=[(3, 9)] yields a[:, 3:9, :].

    # Arguments
        *keys: names of the datasets
        **cuts: cuts to apply to the dataset of the corresponding name/key

    # Returns
        list of datasets
        list of corresponding shapes: the last three entries of (None, *dataset.shape[1:])
    """

    onehots, shapes = [], []

    for key in keys:
        onehot = onehot_data[key]
        rank = len(onehot.shape)

        # requested slices; without a cut a single full slice is used
        requested = [slice(*bounds) for bounds in cuts.get(key, [[None]])]

        # full slices in front, a full slice for the last axis behind, then keep
        # exactly one slice per axis counted from the end
        candidates = [slice(None)] * rank + requested + [slice(None)]
        onehot = onehot[tuple(candidates[-rank:])]

        # sample axis reported as None, then keep the last three entries
        shape = ((None,) + tuple(onehot.shape[1:]))[-3:]

        onehots.append(onehot)
        shapes.append(shape)

    return onehots, shapes
    
    
def split_and_shuffle(*onehots, sizes=None, shuffle_before=False, shuffle_after=True):
    """Optionally shuffle the datasets, split each into subsets, optionally shuffle the subsets.

    All datasets are shuffled with the same order, so corresponding samples stay aligned.

    # Arguments
        *onehots: list of datasets
        sizes: list of subset sizes; when omitted each dataset is kept as one subset
        shuffle_before: whether to shuffle before the split
        shuffle_after: whether to shuffle each subset after the split

    # Returns
        numpy object array of shape (datasets, subsets); cell [k, s] holds subset s of dataset k
    """

    # sizes of subsets
    sizes = sizes or [None]
    key_count = len(onehots)
    subset_count = len(sizes)

    # shuffle sets before splitting them
    if shuffle_before:
        onehots = shuffle(*onehots)

    # split datasets; cells are filled one at a time because assigning nested lists of
    # ndarrays in bulk makes numpy try to broadcast the arrays' own axes into the grid
    onehots_subsets = np.full((key_count, subset_count), None)
    for key_index, onehot in enumerate(onehots):
        for subset_index, subset in enumerate(split(onehot, sizes)):
            onehots_subsets[key_index, subset_index] = subset

    # shuffle subsets: one common order per subset column
    if shuffle_after:
        for subset_index in range(subset_count):
            shuffled = shuffle(*onehots_subsets[:, subset_index])
            for key_index in range(key_count):
                onehots_subsets[key_index, subset_index] = shuffled[key_index]

    return onehots_subsets

### Tokens and Characters

In [10]:
# special tokens for non character entities; they take the first integer codes
tokens = ['<Padding>', '<Go>', '<EndOfString>', '<UnknownChar>']

# static preset list of characters to be used (upper case only);
# the commented-out variant additionally contains lower case letters
#available_chars = list(" ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890-_().,\\/\"':&")
available_chars = list(" ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890-_().,\\/\"':&")

# character -> int map: tokens first, then the characters, numbered in list order
char_to_int = dict(zip(tokens + available_chars, range(len(tokens) + len(available_chars))))

# int -> character map, the inverse of the above
int_to_char = dict(zip(char_to_int.values(), char_to_int.keys()))

# number of distinct symbols (tokens + characters)
char_count = len(char_to_int)

### Read raw data

In [11]:
# CSV file holding the raw training data
raw_source_file = 'SHUP 2D Files Training Data.csv'

# load every column as string
data_df = pd.read_csv(raw_source_file, dtype=str)

# dictionary of columns: column name -> array of that column's values
data = dict(
    (column, data_df[column].values)
    for column in data_df.columns.values
)

### Perform preprocessing

In [12]:
# split strings into words
delimiters = r'( |_|-|\.|\,|/|\\|\(|\)|&|:|\'|")' # any of '-_().,\\/\"':& '
replacement = r'\0\g<1>\0' # surround delimiter with splitting token

# split strings into words according to the RegEx for some datasets; the result is stored
# next to the original under '<name>_Words'. Delimiters are kept as words of their own.
# dtype=object is required because samples contain different numbers of words (ragged),
# which numpy only accepts for object arrays.
for feature in ('LookupDOSFilePath', 'FileName', 'LineName', 'SurveyName'):
    data[feature + '_Words'] = np.array(
        [re.sub(delimiters, replacement, s).split('\0') for s in data[feature]],
        dtype=object,
    )

# vectorise string in each dataset
vectorized_data = {f: vectorize_data(data[f])    for f in data}

# move datasets into padded ndarrays
padded_data =     {f: pad_vector_data(vectorized_data[f], char_to_int['<Padding>'])    for f in vectorized_data}

# convert int to one hot encodings
onehot_data =     {f: keras.utils.to_categorical(padded_data[f], char_count)    for f in padded_data}

# list the shape of every one-hot encoded dataset
for f in onehot_data: print(f"'{f}':".ljust(30), onehot_data[f].shape)

'Unique Record ID':            (23903, 6, 53)
'FileName':                    (23903, 87, 53)
'Original_FileName':           (23903, 71, 53)
'SurveyNum':                   (23903, 5, 53)
'SurveyName':                  (23903, 39, 53)
'LineName':                    (23903, 23, 53)
'SurveyType':                  (23903, 6, 53)
'PrimaryDataType':             (23903, 14, 53)
'SecondaryDataType':           (23903, 36, 53)
'TertiaryDataType':            (23903, 17, 53)
'Quaternary':                  (23903, 8, 53)
'File_Range':                  (23903, 13, 53)
'First_SP_CDP':                (23903, 8, 53)
'Last_SP_CDP':                 (23903, 7, 53)
'CompletionYear':              (23903, 4, 53)
'TenureType':                  (23903, 3, 53)
'Operator Name':               (23903, 47, 53)
'GSQBarcode':                  (23903, 17, 53)
'EnergySource':                (23903, 29, 53)
'LookupDOSFilePath':           (23903, 181, 53)
'Source Of Data':              (23903, 8, 53)
'LookupDOSFilePath_Wo

## Function: Test and show sample output

In [13]:
def test(model, x_test=None, y_test=None, x_preview=None, y_preview=None):
    """Test a model, calculate accuracy from test data, compute and show previews.
    
    # Arguments
         model: model to test
         x_test: accuracy test inputs
         y_test: accuracy test true outputs
         x_preview: inputs to print and show
         y_preview: true outputs to print and show
         
    # Returns
        loss computed from test set
        accuracy of predictions on test set
    """
    
    # if preview data is provided, run predictions on preview data and decode them to strings
    if (x_preview is not None) and (y_preview is not None):
        t_size = len(x_preview)
        
        # run predictions and decode
        p_one_hot = model.predict(x_preview)
        p_vector = np.argmax(p_one_hot, -1)
        p_vector = p_vector.reshape((t_size, -1))
        p_strings = devectorise_data(p_vector)

        # decode ground truth
        y_vector = np.argmax(y_preview, -1)
        y_vector = y_vector.reshape((t_size, -1))
        y_strings = devectorise_data(y_vector)

        # decode inputs
        x_vector = np.argmax(x_preview, -1)
        x_vector = x_vector.reshape((t_size, -1))
        x_strings = devectorise_data(x_vector)

        # numbering for samples
        n_strings = [f'{i}. ' for i in range(t_size)]
        
        # replace '<Padding>' with ' ' and remove spaces at start and end
        x_strings = [re.sub('(<Padding>)+', ' ', s).strip() for s in x_strings]
        y_strings = [re.sub('(<Padding>)+', ' ', s).strip() for s in y_strings]
        p_strings = [re.sub('(<Padding>)+', ' ', s).strip() for s in p_strings]
        
        # compute minimum width of each column
        n_w = max([len(s) for s in n_strings])
        x_w = max([len(s) for s in x_strings])
        y_w = max([len(s) for s in y_strings])
        p_w = max([len(s) for s in p_strings])
        
        # create equal width rows with number(n), input (x), true output (y), predicted output (p) and if it matches
        y_p_strings = ["'  '".join([n.ljust(n_w), x.ljust(x_w), y.ljust(y_w), p.ljust(p_w), str(y==p)]) for n, x, y, p in zip(n_strings, x_strings, y_strings, p_strings)]

        print(*y_p_strings, sep='\n', end='\n\n')

        
    # if test data is provided, run predictions and measure accuracies
    if (x_test is not None) and (y_test is not None):
        
        # metric names
        metrics = [model.loss] + model.metrics
        
        # accuracy on entire training set
        accuracies = model.evaluate(x_test, y_test)
        print(*list(zip(metrics, accuracies)), sep='\n', end='\n\n') # evaluate and list loss and each metric

        return accuracies[0], accuracies[-1]

## Training Parameters

In [14]:
# registry: architecture name -> compiled model object
models = {}

# registry: embedding name -> embedding.
# 'Character-Embedding' holds a list [encoder layer weights, decoder layer weights];
# the '<dataset>_Embedding' entries hold WordEmbedding instances.
embeddings = {}

In [15]:
#architecture = ''

# dimensionality of embedded characters (read by the character model and by WordEmbedding)
character_embedding_size = 10

# compile parameters for models
# (the alternatives tried so far are kept in the trailing comments)
metrics = ['mean_absolute_error', 'categorical_accuracy', exact_match_accuracy] # binary_accuracy
loss = 'categorical_crossentropy' # poisson mean_squared_logarithmic_error categorical_crossentropy

# compile parameters for embedding models (character model and WordEmbedding models)
embed_loss='categorical_crossentropy'
embed_metrics=['accuracy', 'mean_absolute_error', 'categorical_accuracy', exact_match_accuracy]

## Character Embedding

### Auto Encoder Character Data

In [16]:
# create offset input and output sequences for training a predictive embedding model.

# extract() returns ([dataset], [shape]); the shape triple is unpacked into word, character
# and one-hot counts (the word count is None for a dataset that is not split into words)
(x_char_onehot,), ((embed_word_count, embed_char_count, embed_ones_count),) = extract('LookupDOSFilePath')
(x_char_onehot,) = shuffle(x_char_onehot)
shape = x_char_onehot.shape
x_embed_size = len(x_char_onehot)

# create a column of one-hot encoded padding tokens, one per sample
padding = np.full((*shape[:-2], 1), char_to_int['<Padding>'])
#padding = np.array([[char_to_int['<Padding>']]] * x_embed_size)
padding = keras.utils.to_categorical(padding, char_count)
padding = padding.reshape(*shape[:-2], 1, shape[-1])

# 'abcd' -> x = 'abcd_' (padding appended), y = '_abcd' (padding prepended)
# NOTE(review): the earlier comment gave the pair as ('_abcd', 'abcd_'), the reverse of
# what the code builds -- confirm which offset direction is intended.
# NOTE: the "test" arrays are built from exactly the same data as the training arrays.
x_embed_train = np.concatenate((x_char_onehot, padding), axis=-2)
y_embed_train = np.concatenate((padding, x_char_onehot), axis=-2)
x_embed_test = np.concatenate((x_char_onehot, padding), axis=-2)
y_embed_test = np.concatenate((padding, x_char_onehot), axis=-2)

# account for the padding column added along the character axis
embed_char_count += 1

### Auto Encoder: Input, Hidden, Output

In [17]:
# name under which the model is registered in 'models'
architecture = 'Character-Embedding'

# two Dense layers: 'lh' (no activation given) reduces the one-hot axis to
# character_embedding_size values, 'lo' expands it back to char_count values
model_E_D_NN = keras.Sequential()
model_E_D_NN.add(keras.layers.Dense(character_embedding_size, name='lh', input_shape=(embed_char_count, char_count,)))
# NOTE(review): a sigmoid output is combined with the categorical_crossentropy loss
# (embed_loss); softmax is the more common pairing -- confirm this is intended
model_E_D_NN.add(keras.layers.Dense(char_count, activation='sigmoid', name='lo'))
#model_E_D_NN.add(keras.layers.Dropout(0.001))
model_E_D_NN.compile(optimizer='adam', loss=embed_loss, metrics=embed_metrics)
models[architecture] = model_E_D_NN
# the trailing "None" in the recorded output is the printed return value of summary()
print(model_E_D_NN.summary())

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lh (Dense)                   (None, 182, 10)           540       
_________________________________________________________________
lo (Dense)                   (None, 182, 53)           583       
Total params: 1,123
Trainable params: 1,123
Non-trainable params: 0
_________________________________________________________________
None


### Train Encoder and Decoder

In [18]:
# x -> y   predictive: fit the character model to map each sequence onto its offset copy
epochs = 2
batch_size = 16
models['Character-Embedding'].fit(x_embed_train, y_embed_train, batch_size=batch_size, epochs=epochs)
# preview only (no test arrays are passed): print input, target and prediction of the first sample
test(models['Character-Embedding'], None, None, x_embed_train[:1], y_embed_train[:1])

Instructions for updating:
Use tf.cast instead.
Epoch 1/2
Epoch 2/2
0. '  '\SHUP\2D_SURVEYS\PROCESSED_AND_SUPPORT_DATA\1980\MAJOR_MOROCCO\SEGY\MAJOR_MOROCCO_HSB-821_PROCESSED_SDU07104TA_203293.SGY'  '\SHUP\2D_SURVEYS\PROCESSED_AND_SUPPORT_DATA\1980\MAJOR_MOROCCO\SEGY\MAJOR_MOROCCO_HSB-821_PROCESSED_SDU07104TA_203293.SGY'  'A\SSUA\ED\SORSG\AUOROS\\SEDTAED\SUUROADETATA\198A_T_ROD_ROROORA\SSGA_T_ROD_ROROORDS\\09\\DUOROS\\SED\ES80\80ATD\89\190\SG'  'False



In [19]:
# x -> x   direct: continue fitting the same model object, now with the input as its own target
epochs = 5
batch_size = 16
models['Character-Embedding'].fit(x_embed_train, x_embed_train, batch_size=batch_size, epochs=epochs)
# preview only (no test arrays are passed): print input, target and prediction of the first sample
test(models['Character-Embedding'], None, None, x_embed_train[:1], x_embed_train[:1])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0. '  '\SHUP\2D_SURVEYS\PROCESSED_AND_SUPPORT_DATA\1980\MAJOR_MOROCCO\SEGY\MAJOR_MOROCCO_HSB-821_PROCESSED_SDU07104TA_203293.SGY'  '\SHUP\2D_SURVEYS\PROCESSED_AND_SUPPORT_DATA\1980\MAJOR_MOROCCO\SEGY\MAJOR_MOROCCO_HSB-821_PROCESSED_SDU07104TA_203293.SGY'  '\SHUP\2D_SURVEYS\PROCESSED_AND_SUPPORT_DATA\1980\MAJOR_MOROCCO\SEGY\MAJOR_MOROCCO_HSB-821_PROCESSED_SDU07104TA_203293.SGY'  'True



In [20]:
# NOTE(review): disabled experiment (decoder built from the pseudo-inverse of the encoder
# weights). It refers to names that are not defined in the visible part of the notebook
# (model, voc_size, embed_test_x) -- update or remove before re-enabling.
# w1 = model.layers[0].get_weights()
# w2 = model.layers[1].get_weights()

# w = [np.copy(w1[0]), np.zeros(w1[1].shape)]
# wi = [np.linalg.pinv(w1[0]), np.zeros(w2[1].shape)]

# m = keras.Sequential()
# m.add(keras.layers.Dense(character_embedding_size, activation='linear', name='lh', input_shape=(embed_char_count, voc_size,)))
# m.add(keras.layers.Dense(voc_size, activation='sigmoid', name='lo'))
# m.compile(optimizer='adam', loss=embed_loss, metrics=embed_metrics)

# m.layers[0].set_weights(w)
# m.layers[1].set_weights(wi)

# encode_weights, decode_weights = w, wi

# accuracy = model.evaluate(embed_test_x, embed_test_x)
# metric_names = [embed_loss] + embed_metrics
# dict(zip(metric_names, accuracy))

# store the trained weights as [weights of layer 0 ('lh'), weights of layer 1 ('lo')];
# WordEmbedding copies them into its character encode / decode layers
embeddings['Character-Embedding'] = [models['Character-Embedding'].layers[0].get_weights(), models['Character-Embedding'].layers[1].get_weights()]

In [21]:
# test: evaluate the reconstruction (x -> x) on the full training array and preview the first 5 samples
test(models['Character-Embedding'], x_embed_train, x_embed_train, x_embed_train[:5], x_embed_train[:5])

0. '  '\SHUP\2D_SURVEYS\PROCESSED_AND_SUPPORT_DATA\1980\MAJOR_MOROCCO\SEGY\MAJOR_MOROCCO_HSB-821_PROCESSED_SDU07104TA_203293.SGY                                      '  '\SHUP\2D_SURVEYS\PROCESSED_AND_SUPPORT_DATA\1980\MAJOR_MOROCCO\SEGY\MAJOR_MOROCCO_HSB-821_PROCESSED_SDU07104TA_203293.SGY                                      '  '\SHUP\2D_SURVEYS\PROCESSED_AND_SUPPORT_DATA\1980\MAJOR_MOROCCO\SEGY\MAJOR_MOROCCO_HSB-821_PROCESSED_SDU07104TA_203293.SGY                                      '  'True
1. '  '\SHUP\2D_SURVEYS\PROCESSED_AND_SUPPORT_DATA\1980\ROCKY\SEGY\ROCKY_PR85-66_FINAL_QR017432_174230.SGY                                                            '  '\SHUP\2D_SURVEYS\PROCESSED_AND_SUPPORT_DATA\1980\ROCKY\SEGY\ROCKY_PR85-66_FINAL_QR017432_174230.SGY                                                            '  '\SHUP\2D_SURVEYS\PROCESSED_AND_SUPPORT_DATA\1980\ROCKY\SEGY\ROCKY_PR85-66_FINAL_QR017432_174230.SGY                                                            '  'Tru

(0.00018033243587115954, 1.0)

## Word Embedding

### Auto Encoder Folder Data

In [22]:
class WordEmbedding():
    """Defines, holds, trains and provides word and character embedding models.
    Allows other models to apply its embeddings.

    The model is an auto-encoder: characters are embedded, the character embeddings of a
    word are concatenated and compressed into a word embedding, and the process is then
    reversed back to one-hot characters.

    Relies on module-level names: extract, shuffle, test, character_embedding_size,
    char_count, embed_loss, embed_metrics and embeddings['Character-Embedding'].
    """

    def __init__(self, x_name, compression=0.1, weight_constraint=10.0, clipnorm=1):
        """Creates a new word embedding model and copies the pre-trained character embedding into it.

        # Arguments
            x_name: name of the dataset for which the embedding is built
            compression: ratio of the word embedding size to the size of a word's
                concatenated character embeddings (characters per word * character embedding size)
            weight_constraint: max_norm value used as kernel_constraint of every Dense layer
            clipnorm: clipnorm value of the RMSprop optimizer
        """

        # extract and shuffle data
        (x_word_onehot,), ((embed_word_count, embed_char_count, embed_ones_count),) = extract(x_name)
        (x_word_onehot,) = shuffle(x_word_onehot)

        # add small amount to 0's to avoid 0 values
        # e = 0.01
        # x_word_onehot = x_word_onehot * (1-e) + e

        # if data is not split into words, create extra word dimension
        # (identity check instead of '== None')
        if embed_word_count is None:
            embed_word_count = 1
            x_word_onehot = np.expand_dims(x_word_onehot, 1)

        self.x_word_onehot = x_word_onehot

        # define embedding sizes
        self.character_embedding_size = character_embedding_size
        self.word_embedding_size = int(embed_char_count * self.character_embedding_size * compression) + 1
        c_size, w_size = self.character_embedding_size, self.word_embedding_size


        # define model

        # character embedding
        self.l_encode_character = keras.layers.Dense(
            c_size,
            name='char_encode',
            input_shape=(embed_word_count, embed_char_count, embed_ones_count),
            kernel_constraint=keras.constraints.max_norm(weight_constraint)
        )
        # concatenate the character embeddings of each word into one vector
        self.l_char_to_word = keras.layers.Reshape(
            (embed_word_count, embed_char_count * c_size,)
        )

        # word auto encoder
        # NOTE(review): this layer is not the first layer of the Sequential model and the
        # recorded summaries show it receiving the reshaped (words, chars * c_size) tensor,
        # so this input_shape appears to be unused -- consider removing it
        self.l_encode_word = keras.layers.Dense(
            w_size, name='lh',
            input_shape=(embed_char_count, char_count,),
            kernel_constraint=keras.constraints.max_norm(weight_constraint)
        )
        self.l_decode_word = keras.layers.Dense(
            embed_char_count * c_size,
            activation='sigmoid', name='lo',
            kernel_constraint=keras.constraints.max_norm(weight_constraint)
        )

        # character de embedding
        # split each decoded word vector back into per character embeddings
        self.l_word_to_char = keras.layers.Reshape(
            (embed_word_count, embed_char_count, c_size)
        )
        self.l_decode_character = keras.layers.Dense(
            embed_ones_count,
            activation='sigmoid',
            name='char_decode',
            kernel_constraint=keras.constraints.max_norm(weight_constraint)
        )

        self.model = keras.Sequential([
            self.l_encode_character,
            #keras.layers.Dropout(0.1),
            self.l_char_to_word,
            #keras.layers.Dropout(0.1),
            self.l_encode_word,
            #keras.layers.Dropout(0.1),
            self.l_decode_word,
            #keras.layers.Dropout(0.1),
            self.l_word_to_char,
            #keras.layers.Dropout(0.1),
            self.l_decode_character,
        ])

        RMSprop = keras.optimizers.RMSprop(clipnorm=clipnorm)
        self.model.compile(optimizer=RMSprop, loss=embed_loss, metrics=embed_metrics)


        # set pre-trained character embedding
        # (copies, so training this model does not alter the stored weights)
        self.l_encode_character.set_weights(deepcopy(embeddings['Character-Embedding'][0]))
        self.l_decode_character.set_weights(deepcopy(embeddings['Character-Embedding'][1]))

        print(self.model.summary())


    def train(self, epochs=5, batch_size=32):
        """Train the auto-encoder on its own dataset (input is also the target), then
        evaluate on the same data and print a preview of the first 3 samples.

        # Arguments
            epochs: number of epochs to train for
            batch_size: batch size
        """

        # perform training
        self.model.fit(self.x_word_onehot, self.x_word_onehot, batch_size=batch_size, epochs=epochs)
        test(self.model, self.x_word_onehot, self.x_word_onehot, self.x_word_onehot[:3], self.x_word_onehot[:3])


    def apply_encode(self, new_encode_character, new_encode_word):
        """Copy the character and word encoding weights into layers of other models

        # Arguments
            new_encode_character: layer to receive the character encoding weights
            new_encode_word: layer to receive the word encoding weights
        """

        new_encode_character.set_weights(deepcopy(self.l_encode_character.get_weights()))
        new_encode_word.set_weights(deepcopy(self.l_encode_word.get_weights()))


    def apply_decode(self, new_decode_word, new_decode_character):
        """Copy the word and character decoding weights into layers of other models

        # Arguments
            new_decode_word: layer to receive the word decoding weights
            new_decode_character: layer to receive the character decoding weights
        """

        new_decode_word.set_weights(deepcopy(self.l_decode_word.get_weights()))
        new_decode_character.set_weights(deepcopy(self.l_decode_character.get_weights()))
        

### Train word embedding

#### LineName

In [23]:
# word embedding for 'LineName' with compression=0.15 (other constructor arguments at their defaults)
embeddings['LineName_Embedding'] = WordEmbedding('LineName', 0.15)
# train for 15 epochs with batch size 16
embeddings['LineName_Embedding'].train(15, 16)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
char_encode (Dense)          (None, 1, 23, 10)         540       
_________________________________________________________________
reshape_1 (Reshape)          (None, 1, 230)            0         
_________________________________________________________________
lh (Dense)                   (None, 1, 35)             8085      
_________________________________________________________________
lo (Dense)                   (None, 1, 230)            8280      
_________________________________________________________________
reshape_2 (Reshape)          (None, 1, 23, 10)         0         
_________________________________________________________________
char_decode (Dense)          (None, 1, 23, 53)         583       
Total params: 17,488
Trainable params: 17,488
Non-trainable params: 0
_________________________________________________________________
None
E

#### FileName_Words

In [24]:
# word embedding for the word-split 'FileName_Words' dataset with compression=0.2
embeddings['FileName_Words_Embedding'] = WordEmbedding('FileName_Words', 0.2)
# train for 10 epochs with batch size 16
embeddings['FileName_Words_Embedding'].train(10, 16)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
char_encode (Dense)          (None, 29, 13, 10)        540       
_________________________________________________________________
reshape_3 (Reshape)          (None, 29, 130)           0         
_________________________________________________________________
lh (Dense)                   (None, 29, 27)            3537      
_________________________________________________________________
lo (Dense)                   (None, 29, 130)           3640      
_________________________________________________________________
reshape_4 (Reshape)          (None, 29, 13, 10)        0         
_________________________________________________________________
char_decode (Dense)          (None, 29, 13, 53)        583       
Total params: 8,300
Trainable params: 8,300
Non-trainable params: 0
_________________________________________________________________
None
Epo

#### SurveyName

In [25]:
# word embedding for 'SurveyName': compression=0.15, weight_constraint=8, clipnorm=0.01
embeddings['SurveyName_Embedding'] = WordEmbedding('SurveyName', 0.15, 8, 0.01)
# train for 15 epochs with batch size 16
embeddings['SurveyName_Embedding'].train(15, 16)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
char_encode (Dense)          (None, 1, 39, 10)         540       
_________________________________________________________________
reshape_5 (Reshape)          (None, 1, 390)            0         
_________________________________________________________________
lh (Dense)                   (None, 1, 59)             23069     
_________________________________________________________________
lo (Dense)                   (None, 1, 390)            23400     
_________________________________________________________________
reshape_6 (Reshape)          (None, 1, 39, 10)         0         
_________________________________________________________________
char_decode (Dense)          (None, 1, 39, 53)         583       
Total params: 47,592
Trainable params: 47,592
Non-trainable params: 0
_________________________________________________________________
None
E

#### GSQBarcode

In [26]:
# word embedding for 'GSQBarcode': compression=0.15, weight_constraint=8, clipnorm=0.01
embeddings['GSQBarcode_Embedding'] = WordEmbedding('GSQBarcode', 0.15, 8, 0.01)
# train for 15 epochs with batch size 16
embeddings['GSQBarcode_Embedding'].train(15, 16)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
char_encode (Dense)          (None, 1, 17, 10)         540       
_________________________________________________________________
reshape_7 (Reshape)          (None, 1, 170)            0         
_________________________________________________________________
lh (Dense)                   (None, 1, 26)             4446      
_________________________________________________________________
lo (Dense)                   (None, 1, 170)            4590      
_________________________________________________________________
reshape_8 (Reshape)          (None, 1, 17, 10)         0         
_________________________________________________________________
char_decode (Dense)          (None, 1, 17, 53)         583       
Total params: 10,159
Trainable params: 10,159
Non-trainable params: 0
_________________________________________________________________
None
E

#### LookupDOSFilePath_Words

In [27]:
# word embedding for the word-split 'LookupDOSFilePath_Words' dataset:
# compression=0.15, weight_constraint=8, clipnorm=0.01
embeddings['LookupDOSFilePath_Words_Embedding'] = WordEmbedding('LookupDOSFilePath_Words', 0.15, 8, 0.01)
# train for 15 epochs with batch size 16
embeddings['LookupDOSFilePath_Words_Embedding'].train(15, 16)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
char_encode (Dense)          (None, 63, 13, 10)        540       
_________________________________________________________________
reshape_9 (Reshape)          (None, 63, 130)           0         
_________________________________________________________________
lh (Dense)                   (None, 63, 20)            2620      
_________________________________________________________________
lo (Dense)                   (None, 63, 130)           2730      
_________________________________________________________________
reshape_10 (Reshape)         (None, 63, 13, 10)        0         
_________________________________________________________________
char_decode (Dense)          (None, 63, 13, 53)        583       
Total params: 6,473
Trainable params: 6,473
Non-trainable params: 0
_________________________________________________________________
None
Epo

## Models

### LSTM-RNN3: Input, Embed Characters, Embed Words, (LSTM), De-embed Words, De-embed Characters, Output

In [32]:
# Builds, compiles and registers the 'LSTM-RNN3' model:
#   one-hot chars -> char embedding -> word embedding -> LSTM
#   -> word de-embedding -> char de-embedding -> per-character scores.
# The names assigned here (architecture, x_name, y_name, x_onehot, y_onehot)
# are read as globals by the train() cell further down, so keep them stable.
architecture = 'LSTM-RNN3'

# selected datasets
x_name, y_name = 'LookupDOSFilePath_Words', 'LineName'
cuts = {} # {x_name:[[None, 6]] , y_name:[[None, 6]]}
(
    (x_onehot, y_onehot),
    (
        (x_word_count, x_char_count, x_ones_count),
        (y_word_count, y_char_count, y_ones_count),
    ),
) = extract(x_name, y_name, **cuts)

# corresponding pretrained embeddings (registered as '<column>_Embedding')
x_embed = embeddings[x_name+'_Embedding']
y_embed = embeddings[y_name+'_Embedding']


# recurrent state is three times as wide as the target word embedding
lstm_hidden_size = int(y_embed.word_embedding_size * 3)


# character embedding: applied to the last (one-hot) axis of
# (words, chars, onehot) inputs
l_encode_character = keras.layers.Dense(
    x_embed.character_embedding_size,
    input_shape=(x_word_count, x_char_count, x_ones_count),
    kernel_constraint=keras.constraints.max_norm(10.)
)

# word embedding: flatten each word's character embeddings into one vector,
# then project it down to the word embedding size
l_char_to_word = keras.layers.Reshape(
    (x_word_count, x_char_count * x_embed.character_embedding_size,)
)
l_encode_word = keras.layers.Dense(
    x_embed.word_embedding_size,
    kernel_constraint=keras.constraints.max_norm(10.)
)

# lstm processing: consumes the word sequence; return_sequences is left at its
# default, so only the final state is passed on (the word axis disappears)
l_lstm = keras.layers.LSTM(
    lstm_hidden_size,
    activation='sigmoid',
    implementation=2,
    unroll=True,
    kernel_constraint=keras.constraints.max_norm(10.)
)
l_decode = keras.layers.Dense(
    y_embed.word_embedding_size,
    activation='sigmoid',
    kernel_constraint=keras.constraints.max_norm(10.)
)

# word de-embedding: expand the word vector back to one embedding per character
l_decode_word = keras.layers.Dense(
    y_char_count * y_embed.character_embedding_size,
    activation='sigmoid',
    kernel_constraint=keras.constraints.max_norm(10.)
)
# Use the target embedding's own character size (as l_decode_word above does)
# rather than a bare global `character_embedding_size`, which only exists as
# leftover kernel state and breaks a fresh Restart & Run All.
l_word_to_char = keras.layers.Reshape(
    (y_char_count, y_embed.character_embedding_size)
)

# character de-embedding: one score per one-hot slot for every output character
l_decode_character = keras.layers.Dense(
    y_ones_count,
    activation='sigmoid',
    kernel_constraint=keras.constraints.max_norm(10.)
)


# model definition
models[architecture] = keras.Sequential([
    l_encode_character,
    l_char_to_word,
    l_encode_word,
    l_lstm,
    l_decode,
    l_decode_word,
    l_word_to_char,
    l_decode_character,
])

# compile model (`loss` and `metrics` are defined in an earlier cell)
models[architecture].compile(optimizer='adam', loss=loss, metrics=metrics)
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would emit a stray "None" line after the table)
models[architecture].summary()

# set pretrained embedding weights
x_embed.apply_encode(l_encode_character, l_encode_word)
y_embed.apply_decode(l_decode_word, l_decode_character)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 63, 13, 10)        540       
_________________________________________________________________
reshape_11 (Reshape)         (None, 63, 130)           0         
_________________________________________________________________
dense_2 (Dense)              (None, 63, 20)            2620      
_________________________________________________________________
lstm_1 (LSTM)                (None, 105)               52920     
_________________________________________________________________
dense_3 (Dense)              (None, 35)                3710      
_________________________________________________________________
dense_4 (Dense)              (None, 230)               8280      
_________________________________________________________________
reshape_12 (Reshape)         (None, 23, 10)            0         
__________

### GRU

In [35]:
# architecture = 'GRU-RNN3'

# x_name, y_name = 'FileName_Words', 'SurveyName'
# cuts = {} # {x_name:[[None, 6]] , y_name:[[None, 6]]}
# (x_onehot, y_onehot), ((x_word_count, x_char_count, x_ones_count),(y_word_count, y_char_count, y_ones_count)) = extract(x_name, y_name, **cuts)

# x_embed = embeddings[x_name+'_Embedding']
# y_embed = embeddings[y_name+'_Embedding']

# gru_hidden_size = int(y_embed.word_embedding_size * 3)


# # character embedding
# l_encode_character = keras.layers.Dense(   x_embed.character_embedding_size, input_shape=(x_word_count, x_char_count, x_ones_count))

# # word embedding
# l_char_to_word =     keras.layers.Reshape( (x_word_count, x_char_count * x_embed.character_embedding_size,))
# l_encode_word =      keras.layers.Dense(   x_embed.word_embedding_size)

# # gru processing
# l_lstm =             keras.layers.GRU(     gru_hidden_size,   activation='sigmoid',   implementation=2,   unroll=True)
# l_decode =           keras.layers.Dense(   y_embed.word_embedding_size,   activation='sigmoid')

# l_decode_word =      keras.layers.Dense(   y_char_count * y_embed.character_embedding_size,   activation='sigmoid')
# l_word_to_char =     keras.layers.Reshape( (y_char_count, character_embedding_size))

# l_decode_character = keras.layers.Dense(   y_ones_count,   activation='sigmoid')


# models[architecture] = keras.Sequential([
#     l_encode_character,
#     l_char_to_word,
#     l_encode_word,
#     l_lstm,
#     l_decode,
#     l_decode_word,
#     l_word_to_char,
#     l_decode_character,
# ])

# models[architecture].compile(optimizer='adam', loss=loss, metrics=metrics)
# print(models[architecture].summary())

# # set pretrained embedding weights
# x_embed.apply_encode(l_encode_character, l_encode_word)
# y_embed.apply_decode(l_decode_word, l_decode_character)

## Save/Restore weights

In [36]:
#DE = models['E-D-NN'].get_weights()
#model_GRU_1 = model
#model_GRU_2 = model
#model_GRU_3 = model

In [37]:
#model.set_weights(GRU)
#model = model_GRU_3
#models['E-D-NN'].set_weights(DE)

## Run and Evaluate

In [38]:
def train(model, epochs, batch_size, sizes=(20000, 1000, 15)):
    """Perform training on a model, evaluate it and log the result.

    Besides its arguments, this function reads the notebook globals
    `x_onehot`, `y_onehot`, `x_name`, `y_name`, `architecture` and
    `training_history`. They must describe the same dataset/model as `model`,
    i.e. the matching model-definition cell has to be the last one executed.

    # Arguments
        model: the model to train
        epochs: number of epochs to train for
        batch_size: batch size for training
        sizes: sample counts forwarded to `split_and_shuffle`; the three
            entries line up with the (train, test, preview) splits unpacked
            below. Defaults to the previously hard-coded (20000, 1000, 15).
    """

    # split and shuffle data
    (
        (x_train, x_test, x_preview),
        (y_train, y_test, y_preview),
    ) = split_and_shuffle(x_onehot, y_onehot, sizes=sizes)

    # train
    model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs)

    # test (presumably returns loss and accuracy — confirm against `test`)
    test_loss, test_accuracy = test(model, x_test, y_test, x_preview, y_preview)

    # record this run, then show the training history, latest entry first
    training_log(x_name, y_name, architecture, batch_size, epochs, test_loss, test_accuracy)
    print(*training_history[::-1], sep='\n')

In [39]:
# Train the LSTM-RNN3 model.
# NOTE(review): the recorded output below reports "Epoch n/200" and ends in a
# KeyboardInterrupt, so it comes from an earlier, interrupted run with a
# different epoch count than this call — re-run to refresh it.
train(models['LSTM-RNN3'], epochs=100, batch_size=64)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200


Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
 4160/20000 [=====>........................] - ETA: 22s - loss: 0.0533 - mean_absolute_error: 0.0095 - categorical_accuracy: 0.9920 - exact_match_accuracy: 0.8748

KeyboardInterrupt: 