## Init

### Jupyter configurations

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

### Imports

In [2]:
import tensorflow as tf
import keras
import keras.backend as K
import numpy as np
import pandas as pd
import preprocessing as pp
import sys, inspect, argparse, importlib, traceback, re

Using TensorFlow backend.


### Function: Accuracy Metric

In [3]:
# percentage of samples that exactly match
def exact_match_accuracy(y_true, y_pred):
    argmax_true = tf.math.argmax(y_true, axis=-1)            # onehot to index               (batch, width, onehot:int) -> (batch, width:int)
    argmax_pred = tf.math.argmax(y_pred, axis=-1)            # onehot to index               (batch, width, onehot:int) -> (batch, width:int)
    match_char = tf.math.equal(argmax_true, argmax_pred)     # match characters              (batch, width:int) -> (batch, width:bool)
    match_word = tf.math.reduce_all(match_char, axis=-1)     # require all character in sample to match      (batch, width:bool) -> (batch:bool)
    match_int = tf.cast(match_word, tf.float32)              # bool to int                                   (batch:bool) -> (batch:float)
    return tf.reduce_mean(match_int)                         # percentage of samples that are an exact match (batch:float) -> float

### Function: Log function

In [4]:
verbose = False
def log(*l, **d): 
    if verbose: print(*l, **d)
        
training_history = []

def training_log(x, y, a, b, e, l, m):
    training_history.append({'x':x, 'y':y, 'architecture':a, 'batch size':b, 'epochs':e, 'loss':l, 'accuracy':m})

## Preprocess Data

### Tokens and Characters

In [5]:
# tokens used to communicate non character entities
# tokens = ['<Padding>', '<Go>', '<EndOfString>', '<UnknownChar>', '<SurveyNum>', '<SurveyName>', '<LineName>', '<SurveyType>', '<PrimaryDataType>', '<SecondaryDataType>', '<TertiaryDataType>', '<Quaternary>', '<File_Range>', '<First_SP_CDP>', '<Last_SP_CDP>', '<CompletionYear>', '<TenureType>', '<Operator Name>', '<GSQBarcode>', '<EnergySource>', '<LookupDOSFilePath>', '<Source Of Data>']
tokens = ['<Padding>', '<Go>', '<EndOfString>', '<UnknownChar>']

# get set of characters to be used, use static preset list of characters
#available_chars = list(" ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890-_().,\\/\"':&")
available_chars = list(" ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890-_().,\\/\"':&")

# generate character to int and int to character maps
char_to_int = {c: i for i, c in enumerate(tokens + available_chars)}
int_to_char = {i: c for c, i in char_to_int.items()}
char_count = len(char_to_int) # number of character available

### Read raw data

In [6]:
raw_source_file = 'SHUP 2D Files Training Data.csv'

# read raw training data
data_df = pd.read_csv(raw_source_file, dtype=str)
data = {feature:data_df[feature].values for feature in data_df.columns.values}

data['LookupDOSFilePath_Words'] = np.array([s.split('\\')[2:] for s in data['LookupDOSFilePath']])
data['FileName_Words'] = np.array([s.split('_') for s in data['FileName']])
data['LineName_Words'] = np.array([s.split('-') for s in data['LineName']])

print(data.keys())

dict_keys(['Unique Record ID', 'FileName', 'Original_FileName', 'SurveyNum', 'SurveyName', 'LineName', 'SurveyType', 'PrimaryDataType', 'SecondaryDataType', 'TertiaryDataType', 'Quaternary', 'File_Range', 'First_SP_CDP', 'Last_SP_CDP', 'CompletionYear', 'TenureType', 'Operator Name', 'GSQBarcode', 'EnergySource', 'LookupDOSFilePath', 'Source Of Data', 'LookupDOSFilePath_Words', 'FileName_Words', 'LineName_Words'])


### Function: Vectorize Strings

In [7]:
def vectorize_data(data):
    if type(data) == str:
        return [char_to_int[char] for char in data.upper()]
    
    else:
        try:
            return [vectorize_data(d) for d in data]
        except Exception as e:
            #traceback.print_exc()
            #print(data, type(data))
            return []


def devectorise_data(data):
        
    length = data.shape[0]
    
    data = data.reshape(length, -1)
    strings = np.full((length,), '', dtype=object)
    
    for i in range(length):
        strings[i] = ''.join([int_to_char[int(i)] for i in data[i]])
    
    return strings
    
    
#     ndim = data.ndim
#     if data.dtype != object:
#         data = data.astype(object)
    
#     # decode vector into string
#     if ndim == 1:
#         return ''.join([int_to_char[int(i)] for i in data])
    
#     # go to next level
#     else:
#         for i in range(len(data)):
#             data[i] = devectorise_data(data[i])
            
#         return data

### Function: Add Padding Tokens

In [8]:
# find size of largest array across each dimension to computer shape of bounding ndarray
def size(data):
    
    if type(data) == int:
        return ()
    
    this_size = len(data)
    
    if this_size > 0:
        inner_sizes = np.array([size(d) for d in data])
        inner_sizes = tuple(np.amax(inner_sizes, axis=0))
    else:
        inner_sizes = ()
    
    return (this_size,) + inner_sizes
    
    
def insert_vector(matrix, data, indices):
    if type(data) == int:
        matrix[indices] = data
    else:
        for i in range(len(data)):
            insert_vector(matrix, data[i], indices + (i,))
    

def pad_vector_data(data, pad_token, pad_shape=None):
    
    shape = size(data)
    if pad_shape != None:
        shape = tuple(np.maximum(pad_shape, shape))

    # empty matrix
    matrix = np.full(shape, pad_token, np.int32)

    insert_vector(matrix, data, ())

    return matrix

### Function: Split dataset

In [9]:
def split(data, sizes):
    sizes = list(sizes)
    
    for i in range(1, len(sizes)):
        sizes[i] += sizes[i-1]
    
    slices = [slice(i,j) for i, j in zip([0]+sizes, sizes)]
    
    return [data[s] for s in slices]

### Function: Shuffle dataset

In [10]:
def shuffle(*data):
    order = np.arange(len(data[0]))         # default order of elements
    np.random.shuffle(order)                # randomise order
    return [d[order] for d in data]         # new array with items in the randimised order

### Function: Extract relevant data

In [11]:
def extract(*keys, **cuts):
    
    onehots, shapes = [], []
        
    for key in keys:
        
        # get data from dictionary
        onehot = onehot_data[key]

        # apply cuts
        cut = cuts.get(key, [[None]])
        cut = [slice(*c) for c in cut]
        cut = len(onehot.shape)*[slice(None)] + cut + [slice(None)]
        cut = tuple(cut[-len(onehot.shape):])
        onehot = onehot[cut]
        
        # calculate shape
        shape = (None, *onehot.shape[1:])[-3:]

        onehots.append(onehot)
        shapes.append(shape)

    return onehots, shapes
    
def split_and_shuffle(*onehots, sizes=None, shuffle_before=False, shuffle_after=True):
    sizes = sizes or [None]
    key_count = len(onehots)
    subset_count = len(sizes)
    
    if shuffle_before:
        onehots = shuffle(*onehots)

    onehots_subsets = np.full((key_count, subset_count), None)
    onehots_subsets[:,:] = [split(onehot, sizes) for onehot in onehots]

    if shuffle_after:
        for i in range(subset_count):
            onehots_subsets[:,i] = shuffle(*onehots_subsets[:,i])

    return onehots_subsets

### Perform preprocessing

In [12]:
# extract LookupDOSFilePath for speccial processing

vectorized_data = {f: vectorize_data(data[f])    for f in data}
padded_data =     {f: pad_vector_data(vectorized_data[f], char_to_int['<Padding>'])    for f in vectorized_data}
onehot_data =     {f: keras.utils.to_categorical(padded_data[f], char_count)    for f in padded_data}

for f in onehot_data: print(f"'{f}':".ljust(30), onehot_data[f].shape)

'Unique Record ID':            (23903, 6, 53)
'FileName':                    (23903, 87, 53)
'Original_FileName':           (23903, 71, 53)
'SurveyNum':                   (23903, 5, 53)
'SurveyName':                  (23903, 39, 53)
'LineName':                    (23903, 23, 53)
'SurveyType':                  (23903, 6, 53)
'PrimaryDataType':             (23903, 14, 53)
'SecondaryDataType':           (23903, 36, 53)
'TertiaryDataType':            (23903, 17, 53)
'Quaternary':                  (23903, 8, 53)
'File_Range':                  (23903, 13, 53)
'First_SP_CDP':                (23903, 8, 53)
'Last_SP_CDP':                 (23903, 7, 53)
'CompletionYear':              (23903, 4, 53)
'TenureType':                  (23903, 3, 53)
'Operator Name':               (23903, 47, 53)
'GSQBarcode':                  (23903, 17, 53)
'EnergySource':                (23903, 29, 53)
'LookupDOSFilePath':           (23903, 181, 53)
'Source Of Data':              (23903, 8, 53)
'LookupDOSFilePath_Wo

## Function: Test and show samlpe output

In [13]:
def test(model, x_test=None, y_test=None, x_preview=None, y_preview=None):
    
    if (x_preview is not None) and (y_preview is not None):
        t_size = len(x_preview)
        
        p_one_hot = model.predict(x_preview)
        p_vector = np.argmax(p_one_hot, -1)
        p_vector = p_vector.reshape((t_size, -1))
        p_strings = devectorise_data(p_vector)

        y_vector = np.argmax(y_preview, -1)
        y_vector = y_vector.reshape((t_size, -1))
        y_strings = devectorise_data(y_vector)

        x_vector = np.argmax(x_preview, -1)
        x_vector = x_vector.reshape((t_size, -1))
        x_strings = devectorise_data(x_vector)

        n_strings = [f'{i}. ' for i in range(t_size)]
        x_strings = [re.sub('(<Padding>)+', ' ', s).strip() for s in x_strings]
        y_strings = [re.sub('(<Padding>)+', ' ', s).strip() for s in y_strings]
        p_strings = [re.sub('(<Padding>)+', ' ', s).strip() for s in p_strings]
        n_w, x_w, y_w, p_w = max([len(s) for s in n_strings]), max([len(s) for s in x_strings]), max([len(s) for s in y_strings]), max([len(s) for s in p_strings])
        y_p_strings = ["'  '".join([n.ljust(n_w), x.ljust(x_w), y.ljust(y_w), p.ljust(p_w), str(y==p)]) for n, x, y, p in zip(n_strings, x_strings, y_strings, p_strings)]

        print(*y_p_strings, sep='\n', end='\n\n')

    if (x_test is not None) and (y_test is not None):
        
        # metric names
        metrics = [model.loss] + model.metrics
        
        # accuracy on entire training set
        accuracies = model.evaluate(x_test, y_test)
        print(*list(zip(metrics, accuracies)), sep='\n', end='\n\n') # evaluate and list loss and each metric

        return accuracies[0], accuracies[-1]

## Training Parameters

In [14]:

character_embedding_size = 10
word_embedding_size = 40
architecture = ''

metrics = ['mean_absolute_error', 'categorical_accuracy', exact_match_accuracy] # binary_accuracy
loss = 'categorical_crossentropy' # poisson mean_squared_logarithmic_error categorical_crossentropy

embed_loss='categorical_crossentropy'
embed_metrics=['accuracy', 'mean_absolute_error', 'categorical_accuracy', exact_match_accuracy]

models = {}

## Character Embedding

### Auto Encoder Character Data

In [15]:
# create offset input and output sequences to training a preditive embedding model.

(x_char_onehot,), ((embed_word_count, embed_char_count, embed_ones_count),) = extract('LookupDOSFilePath')
(x_char_onehot,) = shuffle(x_char_onehot)
shape = x_char_onehot.shape
x_embed_size = len(x_char_onehot)

# create columns of padding tokens
padding = np.full((*shape[:-2], 1), char_to_int['<Padding>'])
#padding = np.array([[char_to_int['<Padding>']]] * x_embed_size)
padding = keras.utils.to_categorical(padding, char_count)
padding = padding.reshape(*shape[:-2], 1, shape[-1])

# 'abcd' -> ('_abcd', 'abcd_')
x_embed_train = np.concatenate((x_char_onehot, padding), axis=-2)
y_embed_train = np.concatenate((padding, x_char_onehot), axis=-2)
x_embed_test = np.concatenate((x_char_onehot, padding), axis=-2)
y_embed_test = np.concatenate((padding, x_char_onehot), axis=-2)

embed_char_count += 1

### Auto Encoder: Input, Hidden, Output

In [16]:
architecture = 'Character-Embedding'

model_E_D_NN = keras.Sequential()
model_E_D_NN.add(keras.layers.Dense(character_embedding_size, name='lh', input_shape=(embed_char_count, char_count,)))
model_E_D_NN.add(keras.layers.Dense(char_count, activation='sigmoid', name='lo'))
#model_E_D_NN.add(keras.layers.Dropout(0.001))
model_E_D_NN.compile(optimizer='adam', loss=embed_loss, metrics=embed_metrics)
models[architecture] = model_E_D_NN
print(model_E_D_NN.summary())

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lh (Dense)                   (None, 182, 10)           540       
_________________________________________________________________
lo (Dense)                   (None, 182, 53)           583       
Total params: 1,123
Trainable params: 1,123
Non-trainable params: 0
_________________________________________________________________
None


### Train Encoder and Decoder

In [17]:
# x -> y   predictive
epochs = 2
batch_size = 16
models['Character-Embedding'].fit(x_embed_train, y_embed_train, batch_size=batch_size, epochs=epochs)
test(models['Character-Embedding'], None, None, x_embed_train[:1], y_embed_train[:1])

Instructions for updating:
Use tf.cast instead.
Epoch 1/2
Epoch 2/2
0. '  '\SHUP\2D_SURVEYS\PROCESSED_AND_SUPPORT_DATA\1990\GREGORY_2D_JACKSON_AND_BOLAN_3D\SEGY\GREGORY_2D_JACKSON_AND_BOLAN_3D_93-EPL_FILTERED_SCALED_QR014370_138618.SGY'  '\SHUP\2D_SURVEYS\PROCESSED_AND_SUPPORT_DATA\1990\GREGORY_2D_JACKSON_AND_BOLAN_3D\SEGY\GREGORY_2D_JACKSON_AND_BOLAN_3D_93-EPL_FILTERED_SCALED_QR014370_138618.SGY'  'A_SSUA\ED_SORSG_AUOROS__SEDTAED_SUUROADETATA\118AEOSEROGD\ED_TOC_RADTAED\RATAD1EA_SEGAEOSEROGD\ED_TOC_RADTAED\RATAD1ED111SUAD_FAASOSED_OTASED_O8\1118D\191\91_EG'  'False



In [18]:
# x -> x   direct
epochs = 3
batch_size = 16
models['Character-Embedding'].fit(x_embed_train, x_embed_train, batch_size=batch_size, epochs=epochs)
test(models['Character-Embedding'], None, None, x_embed_train[:1], x_embed_train[:1])

Epoch 1/3
Epoch 2/3
Epoch 3/3
0. '  '\SHUP\2D_SURVEYS\PROCESSED_AND_SUPPORT_DATA\1990\GREGORY_2D_JACKSON_AND_BOLAN_3D\SEGY\GREGORY_2D_JACKSON_AND_BOLAN_3D_93-EPL_FILTERED_SCALED_QR014370_138618.SGY'  '\SHUP\2D_SURVEYS\PROCESSED_AND_SUPPORT_DATA\1990\GREGORY_2D_JACKSON_AND_BOLAN_3D\SEGY\GREGORY_2D_JACKSON_AND_BOLAN_3D_93-EPL_FILTERED_SCALED_QR014370_138618.SGY'  '\SHUP\2D_SURVEYS\PROCESSED_AND_SUPPORT_DATA\1990\GREGORY_2D_JACKSON_AND_BOLAN_3D\SEGY\GREGORY_2D_JACKSON_AND_BOLAN_3D_93-EPL_FILTERED_SCALED_QR014370_138618.SGY'  'True



In [19]:
# w1 = model.layers[0].get_weights()
# w2 = model.layers[1].get_weights()

# w = [np.copy(w1[0]), np.zeros(w1[1].shape)]
# wi = [np.linalg.pinv(w1[0]), np.zeros(w2[1].shape)]

# m = keras.Sequential()
# m.add(keras.layers.Dense(character_embedding_size, activation='linear', name='lh', input_shape=(embed_char_count, voc_size,)))
# m.add(keras.layers.Dense(voc_size, activation='sigmoid', name='lo'))
# m.compile(optimizer='adam', loss=embed_loss, metrics=embed_metrics)

# m.layers[0].set_weights(w)
# m.layers[1].set_weights(wi)

# encode_weights, decode_weights = w, wi

# accuracy = model.evaluate(embed_test_x, embed_test_x)
# metric_names = [embed_loss] + embed_metrics
# dict(zip(metric_names, accuracy))
char_encode_weights, char_decode_weights = models['Character-Embedding'].layers[0].get_weights(), models['Character-Embedding'].layers[1].get_weights()

In [20]:
# test
test(models['Character-Embedding'], x_embed_train, x_embed_train, x_embed_train[:5], x_embed_train[:5])

0. '  '\SHUP\2D_SURVEYS\PROCESSED_AND_SUPPORT_DATA\1990\GREGORY_2D_JACKSON_AND_BOLAN_3D\SEGY\GREGORY_2D_JACKSON_AND_BOLAN_3D_93-EPL_FILTERED_SCALED_QR014370_138618.SGY'  '\SHUP\2D_SURVEYS\PROCESSED_AND_SUPPORT_DATA\1990\GREGORY_2D_JACKSON_AND_BOLAN_3D\SEGY\GREGORY_2D_JACKSON_AND_BOLAN_3D_93-EPL_FILTERED_SCALED_QR014370_138618.SGY'  '\SHUP\2D_SURVEYS\PROCESSED_AND_SUPPORT_DATA\1990\GREGORY_2D_JACKSON_AND_BOLAN_3D\SEGY\GREGORY_2D_JACKSON_AND_BOLAN_3D_93-EPL_FILTERED_SCALED_QR014370_138618.SGY'  'True
1. '  '\SHUP\2D_SURVEYS\PROCESSED_AND_SUPPORT_DATA\1980\JUANDAH\SEGY\JUANDAH_CSJ88-25X_FINAL_MIGRATED_SDU10912TA_130518.SGY                                            '  '\SHUP\2D_SURVEYS\PROCESSED_AND_SUPPORT_DATA\1980\JUANDAH\SEGY\JUANDAH_CSJ88-25X_FINAL_MIGRATED_SDU10912TA_130518.SGY                                            '  '\SHUP\2D_SURVEYS\PROCESSED_AND_SUPPORT_DATA\1980\JUANDAH\SEGY\JUANDAH_CSJ88-25X_FINAL_MIGRATED_SDU10912TA_130518.SGY                                            '

(0.0011421034636629125, 0.9995398067188219)

## Word Embedding

### Auto Encoder Folder Data

In [21]:
(x_word_onehot,), ((embed_word_count, embed_char_count, embed_ones_count),) = extract('FileName_Words')
(x_word_onehot,) = shuffle(x_word_onehot)
embed_word_count, embed_char_count, embed_ones_count

(13, 23, 53)

### Auto Encoder Model: Input, Hidden, Output

In [22]:
architecture = 'Word-Embedding'

model_E_W_NN = keras.Sequential()
# character embedding
model_E_W_NN.add(keras.layers.Dense(character_embedding_size, name='char_encode', trainable=True, input_shape=(embed_word_count, embed_char_count, embed_ones_count)))
model_E_W_NN.add(keras.layers.Reshape((embed_word_count, embed_char_count * character_embedding_size,)))

# word auto encoder
model_E_W_NN.add(keras.layers.Dense(word_embedding_size, name='lh', input_shape=(embed_char_count, char_count,)))
model_E_W_NN.add(keras.layers.Dense(embed_char_count * character_embedding_size, activation='sigmoid', name='lo'))

# character de embedding
model_E_W_NN.add(keras.layers.Reshape((embed_word_count, embed_char_count, character_embedding_size)))
model_E_W_NN.add(keras.layers.Dense(embed_ones_count, activation='sigmoid', trainable=True, name='char_decode')) 

model_E_W_NN.compile(optimizer='adam', loss=embed_loss, metrics=embed_metrics)
models[architecture] = model_E_W_NN
print(model_E_W_NN.summary())

# set pretrained embedding weights
model_E_W_NN.layers[0].set_weights(char_encode_weights)
model_E_W_NN.layers[-1].set_weights(char_decode_weights)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
char_encode (Dense)          (None, 13, 23, 10)        540       
_________________________________________________________________
reshape_1 (Reshape)          (None, 13, 230)           0         
_________________________________________________________________
lh (Dense)                   (None, 13, 40)            9240      
_________________________________________________________________
lo (Dense)                   (None, 13, 230)           9430      
_________________________________________________________________
reshape_2 (Reshape)          (None, 13, 23, 10)        0         
_________________________________________________________________
char_decode (Dense)          (None, 13, 23, 53)        583       
Total params: 19,793
Trainable params: 19,793
Non-trainable params: 0
_________________________________________________________________
None


### Train Encoder and Decoder

In [23]:
# x -> x   direct
epochs = 6
batch_size = 16
models['Word-Embedding'].fit(x_word_onehot, x_word_onehot, batch_size=batch_size, epochs=epochs)
test(models['Word-Embedding'], x_word_onehot, x_word_onehot, x_word_onehot[:3], x_word_onehot[:3])

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
0. '  'EAST BALLYMENA NOMBY BRENTWOOD M97-NO03 FINAL MIGRATED SDU02060TA 238515.SGY'  'EAST BALLYMENA NOMBY BRENTWOOD M97-NO03 FINAL MIGRATED SDU02060TA 238515.SGY'  'EAST BALLYMENA NOMBY BRENTWOOD M97-NO03 FINAL MIGRATED SDU02060TA 238515.SGY'  'True
1. '  'OXLEY 90-GKB FINAL STACK SDU10912TA 211694.SGY                              '  'OXLEY 90-GKB FINAL STACK SDU10912TA 211694.SGY                              '  'OXLEY 90-GKB FINAL STACK SDU10912TA 211694.SGY                              '  'True
2. '  'CATHOO EHC85-104 UNFILTERED FINAL SDU06351TA 198149.SGY                     '  'CATHOO EHC85-104 UNFILTERED FINAL SDU06351TA 198149.SGY                     '  'CATHOO EHC85-104 UNFILTERED FINAL SDU06351TA 198149.SGY                     '  'True

('categorical_crossentropy', 0.003113500618446009)
('accuracy', 0.9998554733215164)
('mean_absolute_error', 0.010085613616532459)
('categorical_accuracy', 0.9998554733215164)
(<functi

(0.003113500618446009, 0.9976217906400159)

In [24]:
word_encode_weights, word_decode_weights = models['Word-Embedding'].layers[2].get_weights(), models['Word-Embedding'].layers[3].get_weights()

## Models

### P-NN: Input, Embedding, Output

In [1]:
architecture = 'P-NN'

model_P_NN = keras.Sequential()
model_P_NN.add(keras.layers.Embedding(y_shape_ones, embedding_size, name='le', input_length=x_shape_char))   # embed characters into dense embedded space
model_P_NN.add(keras.layers.Flatten())                                                                       # flatten to 1D per sample
model_P_NN.add(keras.layers.Dense(y_shape_char*y_shape_ones, activation='exponential', name='lo'))           # dense layer
model_P_NN.add(keras.layers.Dropout(0.001))                                                                  # dropout to prevent overfitting
model_P_NN.add(keras.layers.Reshape((y_shape_char, y_shape_ones)))                                           # un flatten
model_P_NN.compile(optimizer='adam', loss=loss, metrics=metrics)
models[architecture] = model_P_NN
print(model_P_NN.summary())

NameError: name 'keras' is not defined

### FF-NN: Input, Embedding, Hidden, Output

In [37]:
architecture = 'FF-NN'
hidden_size = (y_shape_ones*embedding_size + y_shape_char*y_shape_ones) // 2

model_FF_NN = keras.Sequential()
model_FF_NN.add(keras.layers.Embedding(y_shape_ones, embedding_size, name='le', input_length=x_shape_char))   # embed characters into dense embedded space
model_FF_NN.add(keras.layers.Flatten())                                                                       # flatten to 1D per sample
model_FF_NN.add(keras.layers.Dense(hidden_size, activation='exponential', name='lh'))                         # dense layer
model_FF_NN.add(keras.layers.Dropout(0.2))                                                                    # dropout to prevent overfitting
model_FF_NN.add(keras.layers.Dense(y_shape_char*y_shape_ones, activation='exponential', name='lo'))           # dense layer
model_FF_NN.add(keras.layers.Reshape((y_shape_char, y_shape_ones)))                                           # un flatten
model_FF_NN.compile(optimizer='adam', loss=loss, metrics=metrics)
models[architecture] = model_FF_NN
print(model_FF_NN.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
le (Embedding)               (None, 23, 20)            1580      
_________________________________________________________________
flatten_5 (Flatten)          (None, 460)               0         
_________________________________________________________________
lh (Dense)                   (None, 1264)              582704    
_________________________________________________________________
dropout_6 (Dropout)          (None, 1264)              0         
_________________________________________________________________
lo (Dense)                   (None, 948)               1199220   
_________________________________________________________________
reshape_7 (Reshape)          (None, 12, 79)            0         
Total params: 1,783,504
Trainable params: 1,783,504
Non-trainable params: 0
_________________________________________________________________


### LSTM-RNN1: Input, Embedding, (LSTM), Output

In [83]:
architecture = 'LSTM-RNN1'
lstm_hidden_size = embedding_size * 15

model_LSTM_RNN1 = keras.Sequential()
model_LSTM_RNN1.add(keras.layers.Embedding(y_shape_ones, embedding_size, name='le', input_length=x_shape_char))   # embed characters into dense embedded space
#model_LSTM_RNN1.add(keras.layers.Dropout(0.2))                                                                    # dropout to prevent overfitting
model_LSTM_RNN1.add(keras.activation.exponential())
model_LSTM_RNN1.add(keras.layers.LSTM(y_shape_char * y_shape_ones, activation='exponential', implementation=2, unroll=True))                # lstm recurrent cell
model_LSTM_RNN1.add(keras.layers.Reshape((y_shape_char, y_shape_ones)))                                           # un flatten
model_LSTM_RNN1.compile(optimizer='adam', loss=loss, metrics=metrics)
models[architecture] = model_LSTM_RNN1
print(model_LSTM_RNN1.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
le (Embedding)               (None, 23, 20)            1580      
_________________________________________________________________
lstm_7 (LSTM)                (None, 948)               3674448   
_________________________________________________________________
reshape_15 (Reshape)         (None, 12, 79)            0         
Total params: 3,676,028
Trainable params: 3,676,028
Non-trainable params: 0
_________________________________________________________________
None


### LSTM-RNN2: Input, Embedding, (LSTM), Output

In [261]:
architecture = 'LSTM-RNN2'

x_name, y_name = 'LineName', 'LineName'
cuts = {x_name:[[None, 6]] , y_name:[[None, 6]]}
(x_onehot, y_onehot), ((x_word_count, x_char_count, x_ones_count),(y_word_count, y_char_count, y_ones_count)) = extract(x_name, y_name, **cuts)

lstm_hidden_size = y_char_count * character_embedding_size * 2

model_LSTM_RNN2 = keras.Sequential()
model_LSTM_RNN2.add(keras.layers.Dense(character_embedding_size, name='char_encode', trainable=False, input_shape=(x_char_count, x_ones_count)))                 # embed characters into dense embedded space
#model_LSTM_RNN2.add(keras.layers.Dropout(0.2))                                                                            # dropout to prevent overfitting
model_LSTM_RNN2.add(keras.layers.LSTM(lstm_hidden_size, activation='sigmoid', implementation=2, unroll=True, name='lstm1'))     # lstm recurrent cell
#model_LSTM_RNN2.add(keras.layers.Dropout(0.2))                                                                            # dropout to prevent overfitting
model_LSTM_RNN2.add(keras.layers.Dense(y_char_count * character_embedding_size, activation='sigmoid', name='decode'))  # dense layer, decode/de-embed
model_LSTM_RNN2.add(keras.layers.Reshape((y_char_count, character_embedding_size)))                                        # un flatten
model_LSTM_RNN2.add(keras.layers.Dense(y_ones_count, activation='sigmoid', trainable=False, name='char_decode'))                             # dense layer, decode/de-embed
model_LSTM_RNN2.compile(optimizer='adam', loss=loss, metrics=metrics)
models[architecture] = model_LSTM_RNN2
print(model_LSTM_RNN2.summary())

# set pretrained embedding weights
model_LSTM_RNN2.layers[0].set_weights(char_encode_weights)
model_LSTM_RNN2.layers[-1].set_weights(char_decode_weights)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
char_encode (Dense)          (None, 6, 20)             1080      
_________________________________________________________________
lstm1 (LSTM)                 (None, 240)               250560    
_________________________________________________________________
decode (Dense)               (None, 120)               28920     
_________________________________________________________________
reshape_21 (Reshape)         (None, 6, 20)             0         
_________________________________________________________________
char_decode (Dense)          (None, 6, 53)             1113      
Total params: 281,673
Trainable params: 279,480
Non-trainable params: 2,193
_________________________________________________________________
None


### LSTM-RNN3: Input, Embed Characters, Embed Words, (LSTM), De-embed Words, De-embed Characters, Output

In [33]:
# WARNING the maximum word length of FileName_words == 23 == LineName length
# this is only a conincidance and other data will differe and cause errors
architecture = 'LSTM-RNN3'

x_name, y_name = 'FileName_Words', 'LineName'
cuts = {} # {x_name:[[None, 6]] , y_name:[[None, 6]]}
(x_onehot, y_onehot), ((x_word_count, x_char_count, x_ones_count),(y_word_count, y_char_count, y_ones_count)) = extract(x_name, y_name, **cuts)

lstm_hidden_size = 100
embed_trainable = True

model_LSTM_RNN3 = keras.Sequential()

# character embedding
model_LSTM_RNN3.add(keras.layers.Dense(character_embedding_size, name='char_encode', trainable=embed_trainable, input_shape=(x_word_count, x_char_count, x_ones_count)))

# word embedding
model_LSTM_RNN3.add(keras.layers.Reshape((x_word_count, x_char_count * character_embedding_size,)))
model_LSTM_RNN3.add(keras.layers.Dense(word_embedding_size, trainable=embed_trainable, name='word_encode'))

# lstm processing
model_LSTM_RNN3.add(keras.layers.LSTM(lstm_hidden_size, activation='sigmoid', implementation=2, unroll=True, name='lstm1'))
model_LSTM_RNN3.add(keras.layers.Dense(word_embedding_size, activation='sigmoid', name='lstm_decode'))

# word de embedding
model_LSTM_RNN3.add(keras.layers.Dense(y_char_count * character_embedding_size, activation='sigmoid', trainable=embed_trainable, name='word_decode'))
model_LSTM_RNN3.add(keras.layers.Reshape((y_char_count, character_embedding_size)))

# character de embedding
model_LSTM_RNN3.add(keras.layers.Dense(y_ones_count, activation='sigmoid', trainable=embed_trainable, name='char_decode')) 


model_LSTM_RNN3.compile(optimizer='adam', loss=loss, metrics=metrics)
models[architecture] = model_LSTM_RNN3
print(model_LSTM_RNN3.summary())

# set pretrained embedding weights
model_LSTM_RNN3.layers[0].set_weights(char_encode_weights)
model_LSTM_RNN3.layers[2].set_weights(word_encode_weights)
model_LSTM_RNN3.layers[-3].set_weights(word_decode_weights)
model_LSTM_RNN3.layers[-1].set_weights(char_decode_weights)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
char_encode (Dense)          (None, 13, 23, 10)        540       
_________________________________________________________________
reshape_7 (Reshape)          (None, 13, 230)           0         
_________________________________________________________________
word_encode (Dense)          (None, 13, 40)            9240      
_________________________________________________________________
lstm1 (LSTM)                 (None, 100)               56400     
_________________________________________________________________
lstm_decode (Dense)          (None, 40)                4040      
_________________________________________________________________
word_decode (Dense)          (None, 230)               9430      
_________________________________________________________________
reshape_8 (Reshape)          (None, 23, 10)            0         
__________

### GRU-RNN1: Input, Embedding, (GRU), Output

In [32]:
architecture = 'GRU-RNN1'
lstm_hidden_size = voc_size * 15

model_GRU_RNN1 = keras.Sequential()
model_GRU_RNN1.add(keras.layers.Embedding(y_shape_ones, embedding_size, name='le', input_length=x_shape_char))   # embed characters into dense embedded space
#model_GRU_RNN1.add(keras.layers.Dropout(0.2))                                                                    # dropout to prevent overfitting
model_GRU_RNN1.add(keras.layers.GRU(y_shape_char * y_shape_ones, activation='relu', implementation=2, unroll=True))                # lstm recurrent cell
model_GRU_RNN1.add(keras.layers.Reshape((y_shape_char, y_shape_ones)))                                           # un flatten
model_GRU_RNN1.compile(optimizer='adam', loss=loss, metrics=metrics)
models[architecture] = model_GRU_RNN1
print(model_GRU_RNN1.summary())

NameError: name 'voc_size' is not defined

### GRU-RNN2: Imput Embedding, (GRU), Decoder, Output

In [188]:
architecture = 'GRU-RNN2'

x_name, y_name = 'LineName', 'LineName'
cuts = {x_name:[[None, 12]] , y_name:[[None, 12]]}
(x_onehot, y_onehot), ((x_word_count, x_char_count, x_ones_count),(y_word_count, y_char_count, y_ones_count)) = extract(x_name, y_name, **cuts)

lstm_hidden_size = y_char_count * character_embedding_size + 40

model_GRU_RNN2 = keras.Sequential()
model_GRU_RNN2.add(keras.layers.Dense(character_embedding_size, name='char_encode', trainable=False, input_shape=(x_char_count, x_ones_count)))                 # embed characters into dense embedded space
model_GRU_RNN2.add(keras.layers.Dropout(0.2))                                                                            # dropout to prevent overfitting
model_GRU_RNN2.add(keras.layers.GRU(lstm_hidden_size, activation='sigmoid', implementation=2, unroll=True, name='gru1'))     # gru recurrent cell
model_GRU_RNN2.add(keras.layers.Dropout(0.2))                                                                            # dropout to prevent overfitting
model_GRU_RNN2.add(keras.layers.Dense(y_char_count * character_embedding_size, activation='sigmoid', name='decode'))  # dense layer, decode/de-embed
model_GRU_RNN2.add(keras.layers.Reshape((y_char_count, character_embedding_size)))                                        # un flatten
model_GRU_RNN2.add(keras.layers.Dense(y_ones_count, activation='sigmoid', trainable=False, name='char_decode'))                             # dense layer, decode/de-embed
model_GRU_RNN2.compile(optimizer='adam', loss=loss, metrics=metrics)
models[architecture] = model_GRU_RNN2
print(model_GRU_RNN2.summary())

# set pretrained embedding weights
model_GRU_RNN2.layers[0].set_weights(char_encode_weights)
model_GRU_RNN2.layers[-1].set_weights(char_decode_weights)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
char_encode (Dense)          (None, 12, 10)            540       
_________________________________________________________________
dropout_1 (Dropout)          (None, 12, 10)            0         
_________________________________________________________________
gru1 (GRU)                   (None, 160)               82080     
_________________________________________________________________
dropout_2 (Dropout)          (None, 160)               0         
_________________________________________________________________
decode (Dense)               (None, 120)               19320     
_________________________________________________________________
reshape_4 (Reshape)          (None, 12, 10)            0         
___________________________

## Save/Restore weights

In [184]:
#DE = models['E-D-NN'].get_weights()
#model_GRU_1 = model
#model_GRU_2 = model
#model_GRU_3 = model

In [133]:
#model.set_weights(GRU)
#model = model_GRU_3
#models['E-D-NN'].set_weights(DE)

## Run and Evaluate

In [35]:
epochs = 70
batch_size = 64

model = models['LSTM-RNN3']
((x_train, x_test, x_preview), (y_train, y_test, y_preview)) = split_and_shuffle(x_onehot, y_onehot, sizes=(20000, 1000, 15))

model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs)
l, a = test(model, x_test, y_test, x_preview, y_preview)
training_log(x_name, y_name, architecture, batch_size, epochs, l, a)
print(*training_history[::-1], sep='\n')

Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70


Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70
0.  '  'AR91 AR91-31 FINAL FILTERED SDU02007TA 238981.SGY                              '  'AR91-31 '  'AR91-31 '  'True
1.  '  'AR91 AR91-32 FINAL FILTERED SDU02007TA 238980.SGY                              '  'AR91-32 '  'AR91-32 '  'True
2.  '  'WALLABELLA AND EXTENSION 84-W36 FINAL FILTERED MIGRATED SDU02007TA 238970.SGY  '  '84-W36  '  '84-W36  '  'True
3.  '  'WOODDUCK 87-WD15 FINAL FILTERED SDU02007TA 238983.SGY                          '  '87-WD15 '  '87-WD15 '  'True
4.  '  'AR91 AR91-31 FINAL FILTERED MIGRATED SDU02007TA 238977.SGY                     '  'AR91-31 '  'AR91-31 '  'True
5.  '  'AR91 AR91-30 FINAL FILTERED SDU02007TA 23898

In [262]:
print(*training_history[::-1], sep='\n')

{'x': 'FileName_Words', 'y': 'LineName', 'architecture': 'LSTM-RNN3', 'batch size': 16, 'epochs': 1, 'loss': 3.552898067474365, 'accuracy': 0.0}
{'x': 'FileName_Words', 'y': 'LineName', 'architecture': 'LSTM-RNN3', 'batch size': 16, 'epochs': 1, 'loss': 3.563540615081787, 'accuracy': 0.0}
{'x': 'FileName_Words', 'y': 'LineName', 'architecture': 'LSTM-RNN3', 'batch size': 16, 'epochs': 1, 'loss': 3.5646687889099122, 'accuracy': 0.0}
{'x': 'FileName_Words', 'y': 'LineName', 'architecture': 'LSTM-RNN3', 'batch size': 16, 'epochs': 1, 'loss': 3.4647589626312256, 'accuracy': 0.0}
{'x': 'FileName_Words', 'y': 'LineName', 'architecture': 'LSTM-RNN3', 'batch size': 128, 'epochs': 1, 'loss': 3.4894262046813966, 'accuracy': 0.0}
{'x': 'FileName_Words', 'y': 'LineName', 'architecture': 'LSTM-RNN3', 'batch size': 64, 'epochs': 0, 'loss': 3.9734037208557127, 'accuracy': 0.0}
