## Jupyter configurations

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Imports

In [1]:
import tensorflow as tf
import keras
import keras.backend as K
import numpy as np
# import pandas as pd
import preprocessing as pp
import sys, inspect, argparse, importlib

importlib.reload(pp)

Using TensorFlow backend.


<module 'preprocessing' from 'H:\\gsq-metadata-extraction\\filepath-metadata-extraction\\preprocessing.py'>

## Accuracy Metric

In [2]:
# percentage of samples that exactly match
def exact_match_accuracy(y_true, y_pred):
    argmax_true = tf.math.argmax(y_true, axis=2)            # onehot to index               (batch, width, onehot:int) -> (batch, width:int)
    argmax_pred = tf.math.argmax(y_pred, axis=2)            # onehot to index               (batch, width, onehot:int) -> (batch, width:int)
    match_char = tf.math.equal(argmax_true, argmax_pred)    # match characters              (batch, width:int) -> (batch, width:bool)
    match_word = tf.math.reduce_all(match_char, axis=1)     # require all character in sample to match      (batch, width:bool) -> (batch:bool)
    match_int = tf.cast(match_word, tf.float32)             # bool to int                                   (batch:bool) -> (batch:int)
    return tf.reduce_mean(match_int)                        # percentage of samples that are an exact match (batch:int) -> int

## Log function

In [3]:
verbose = False
def log(*l, **d): 
    if verbose: print(*l, **d)
        
training_history = []

def training_log(x, y, a, b, e, l, m):
    training_history.append({'x':x, 'y':y, 'architecture':a, 'batch size':b, 'epochs':e, 'loss':l, 'accuracy':m})

## Load Data

In [5]:
data = pp.load('training_data.p')
voc_size = pp.char_count

## Preprocess Data

In [6]:
# load data
# Unique Record ID	FileName	Original_FileName	SurveyNum	SurveyName	LineName	SurveyType	PrimaryDataType	SecondaryDataType	TertiaryDataType	Quaternary	File_Range
# First_SP_CDP	Last_SP_CDP	CompletionYear	TenureType	Operator Name	GSQBarcode	EnergySource	LookupDOSFilePath

train_x, train_y, test_x, test_y, showcase_x, showcase_y, x_shape_char, x_shape_ones, y_shape_char, y_shape_ones, x_name, y_name\
    = pp.preprocess(data, x_name='FileName', y_name='LineName', x_categorical=True, y_categorical= True, epoch_size=5000, x_cut_s=None, x_cut_e=None, y_cut_s=None, y_cut_e=12)

train_x	(4000, 87)	train_y	(4000, 23)
train_x	(4000, 87, 79)	train_y	(4000, 12, 79)	test_x	(1000, 87, 79)	test_y	(1000, 12, 79)	showcase_x	(5, 87, 79)	showcase_y	(5, 12, 79)


## Test and show samlpe output

In [7]:
def test(model, test_x, test_y, showcase_x, showcase_y):
    p_one_hot = model.predict(showcase_x)
    p_vector = np.argmax(p_one_hot, 2)
    p_strings = pp.decode_data(p_vector)

    y_vector = np.argmax(showcase_y, 2)
    y_strings = pp.decode_data(y_vector)

    x_vector = np.argmax(showcase_x, 2)
    x_strings = pp.decode_data(x_vector)

    x_strings = [s.replace('<Padding>', '') for s in x_strings]
    y_strings = [s.replace('<Padding>', '') for s in y_strings]
    p_strings = [s.replace('<Padding>', '') for s in p_strings]
    x_w, y_w, p_w = max([len(s) for s in x_strings]), max([len(s) for s in y_strings]), max([len(s) for s in p_strings])
    y_p_strings = ['  '.join([x.ljust(x_w), y.ljust(y_w), p.ljust(p_w), str(y==p)]) for x, y, p in zip(x_strings, y_strings, p_strings)]

    print(*y_p_strings, sep='\n', end='\n\n')

    # accuracy on entire training set
    accuracies = model.evaluate(test_x, test_y)
    print(*list(zip([loss]+metrics, accuracies)), sep='\n', end='\n\n') # evaluate and list loss and each metric
    
    return accuracies[0], accuracies[-1]

## Training Parameters

In [64]:
embedding_size = 15
character_embedding_size = 10
architecture = ''

metrics = ['mean_absolute_error', 'categorical_accuracy', 'binary_accuracy', exact_match_accuracy]
loss = 'mean_squared_logarithmic_error' # poisson mean_squared_logarithmic_error categorical_crossentropy

models = {}

## Embedding

### Auto Encoder: Input, Hidden, Output

In [65]:
architecture = 'Character-Embedding'

model_E_D_NN = keras.Sequential()
model_E_D_NN.add(keras.layers.Dense(character_embedding_size, activation='linear', name='lh', input_shape=(voc_size,)))
model_E_D_NN.add(keras.layers.Dense(voc_size, activation='sigmoid', name='lo'))
#model_E_D_NN.add(keras.layers.Dropout(0.001))
model_E_D_NN.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'mean_absolute_error', 'categorical_accuracy', 'binary_accuracy'])
models[architecture] = model_E_D_NN
print(model_E_D_NN.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lh (Dense)                   (None, 10)                800       
_________________________________________________________________
lo (Dense)                   (None, 79)                869       
Total params: 1,669
Trainable params: 1,669
Non-trainable params: 0
_________________________________________________________________
None


### Train Encoder and Decoder

In [66]:
epochs = 5
batch_size = 64
model = models['Character-Embedding']

model.fit(train_x.reshape(-1, voc_size), train_x.reshape(-1, voc_size), batch_size=batch_size, epochs=epochs)
model.evaluate(test_x.reshape(-1, voc_size), test_x.reshape(-1, voc_size))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[4.15132975557487e-05,
 0.999995635460163,
 5.570285267031094e-06,
 0.9996551724137931,
 0.999995635460163]

In [129]:
# w1 = model.layers[0].get_weights()
# w2 = model.layers[1].get_weights()

# w = [np.copy(w1[0]), np.zeros(w1[1].shape)]
# wi = [np.linalg.pinv(w1[0]), np.zeros(w2[1].shape)]

# m = keras.Sequential()
# m.add(keras.layers.Dense(character_embedding_size, activation='linear', name='lh', input_shape=(voc_size,)))
# m.add(keras.layers.Dense(voc_size, activation='sigmoid', name='lo'))
# m.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'mean_absolute_error', 'categorical_accuracy', 'binary_accuracy'])

# m.layers[0].set_weights(w)
# m.layers[1].set_weights(wi)

# model.evaluate(test_x.reshape(-1, voc_size), test_x.reshape(-1, voc_size))
model.layers[0].get_weights()[1]

array([-1.3858225 ,  1.0831605 , -1.2977322 , -0.1188067 ,  0.54794115,
       -1.2280831 ,  0.78454804, -1.4279423 , -1.0548017 , -1.165022  ],
      dtype=float32)

### Save Encoder and Decoder

In [33]:
#models['Character-Embedding'].get_weights()
models['Character-Embedding'].layers

[<keras.layers.core.Dense at 0x1e48a308438>,
 <keras.layers.core.Dense at 0x1e48a308d68>]

## Models

### P-NN: Input, Embedding, Output

In [1]:
architecture = 'P-NN'

model_P_NN = keras.Sequential()
model_P_NN.add(keras.layers.Embedding(y_shape_ones, embedding_size, name='le', input_length=x_shape_char))   # embed characters into dense embedded space
model_P_NN.add(keras.layers.Flatten())                                                                       # flatten to 1D per sample
model_P_NN.add(keras.layers.Dense(y_shape_char*y_shape_ones, activation='exponential', name='lo'))           # dense layer
model_P_NN.add(keras.layers.Dropout(0.001))                                                                  # dropout to prevent overfitting
model_P_NN.add(keras.layers.Reshape((y_shape_char, y_shape_ones)))                                           # un flatten
model_P_NN.compile(optimizer='adam', loss=loss, metrics=metrics)
models[architecture] = model_P_NN
print(model_P_NN.summary())

NameError: name 'keras' is not defined

### FF-NN: Input, Embedding, Hidden, Output

In [37]:
architecture = 'FF-NN'
hidden_size = (y_shape_ones*embedding_size + y_shape_char*y_shape_ones) // 2

model_FF_NN = keras.Sequential()
model_FF_NN.add(keras.layers.Embedding(y_shape_ones, embedding_size, name='le', input_length=x_shape_char))   # embed characters into dense embedded space
model_FF_NN.add(keras.layers.Flatten())                                                                       # flatten to 1D per sample
model_FF_NN.add(keras.layers.Dense(hidden_size, activation='exponential', name='lh'))                         # dense layer
model_FF_NN.add(keras.layers.Dropout(0.2))                                                                    # dropout to prevent overfitting
model_FF_NN.add(keras.layers.Dense(y_shape_char*y_shape_ones, activation='exponential', name='lo'))           # dense layer
model_FF_NN.add(keras.layers.Reshape((y_shape_char, y_shape_ones)))                                           # un flatten
model_FF_NN.compile(optimizer='adam', loss=loss, metrics=metrics)
models[architecture] = model_FF_NN
print(model_FF_NN.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
le (Embedding)               (None, 23, 20)            1580      
_________________________________________________________________
flatten_5 (Flatten)          (None, 460)               0         
_________________________________________________________________
lh (Dense)                   (None, 1264)              582704    
_________________________________________________________________
dropout_6 (Dropout)          (None, 1264)              0         
_________________________________________________________________
lo (Dense)                   (None, 948)               1199220   
_________________________________________________________________
reshape_7 (Reshape)          (None, 12, 79)            0         
Total params: 1,783,504
Trainable params: 1,783,504
Non-trainable params: 0
_________________________________________________________________


### LSTM-RNN1: Input, Embedding, (LSTM), Output

In [83]:
architecture = 'LSTM-RNN1'
lstm_hidden_size = embedding_size * 15

model_LSTM_RNN1 = keras.Sequential()
model_LSTM_RNN1.add(keras.layers.Embedding(y_shape_ones, embedding_size, name='le', input_length=x_shape_char))   # embed characters into dense embedded space
#model_LSTM_RNN1.add(keras.layers.Dropout(0.2))                                                                    # dropout to prevent overfitting
model_LSTM_RNN1.add(keras.activation.exponential())
model_LSTM_RNN1.add(keras.layers.LSTM(y_shape_char * y_shape_ones, activation='exponential', implementation=2, unroll=True))                # lstm recurrent cell
model_LSTM_RNN1.add(keras.layers.Reshape((y_shape_char, y_shape_ones)))                                           # un flatten
model_LSTM_RNN1.compile(optimizer='adam', loss=loss, metrics=metrics)
models[architecture] = model_LSTM_RNN1
print(model_LSTM_RNN1.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
le (Embedding)               (None, 23, 20)            1580      
_________________________________________________________________
lstm_7 (LSTM)                (None, 948)               3674448   
_________________________________________________________________
reshape_15 (Reshape)         (None, 12, 79)            0         
Total params: 3,676,028
Trainable params: 3,676,028
Non-trainable params: 0
_________________________________________________________________
None


### LSTM-RNN2: Input, Embedding, (LSTM), Output

In [18]:
architecture = 'LSTM-RNN2'
lstm_hidden_size = embedding_size * 15

model_LSTM_RNN2 = keras.Sequential()
model_LSTM_RNN2.add(keras.layers.Embedding(y_shape_ones, embedding_size, name='le', input_length=x_shape_char))   # embed characters into dense embedded space
model_LSTM_RNN2.add(keras.layers.Dropout(0.2))                                                                    # dropout to prevent overfitting
model_LSTM_RNN2.add(keras.layers.LSTM(lstm_hidden_size, return_sequences=True, return_state=True))                # lstm recurrent cell
model_LSTM_RNN2.add(keras.layers.Dropout(0.2))                                                                    # dropout to prevent overfitting
model_LSTM_RNN2.add(keras.layers.Dense(y_shape_char * y_shape_ones))                                              # dense combine time series into single output
model_LSTM_RNN2.add(keras.layers.Reshape((y_shape_char, y_shape_ones)))                                           # un flatten
model_LSTM_RNN2.compile(optimizer='adam', loss=loss, metrics=metrics)
models[architecture] = model_LSTM_RNN2
print(model_LSTM_RNN2.summary())

TypeError: All layers in a Sequential model should have a single output tensor. For multi-output layers, use the functional API.

### GRU-RNN1: Input, Embedding, (GRU), Output

In [152]:
architecture = 'GRU-RNN1'
lstm_hidden_size = voc_size * 15

model_GRU_RNN1 = keras.Sequential()
model_GRU_RNN1.add(keras.layers.Embedding(y_shape_ones, embedding_size, name='le', input_length=x_shape_char))   # embed characters into dense embedded space
#model_GRU_RNN1.add(keras.layers.Dropout(0.2))                                                                    # dropout to prevent overfitting
model_GRU_RNN1.add(keras.layers.GRU(y_shape_char * y_shape_ones, activation='relu', implementation=2, unroll=True))                # lstm recurrent cell
model_GRU_RNN1.add(keras.layers.Reshape((y_shape_char, y_shape_ones)))                                           # un flatten
model_GRU_RNN1.compile(optimizer='adam', loss=loss, metrics=metrics)
models[architecture] = model_GRU_RNN1
print(model_GRU_RNN1.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
le (Embedding)               (None, 23, 20)            1580      
_________________________________________________________________
gru_8 (GRU)                  (None, 948)               2755836   
_________________________________________________________________
reshape_24 (Reshape)         (None, 12, 79)            0         
Total params: 2,757,416
Trainable params: 2,757,416
Non-trainable params: 0
_________________________________________________________________
None


### GRU-RNN2: Imput Embedding, (GRU), Decoder, Output

In [74]:
architecture = 'GRU-RNN2'
lstm_hidden_size = y_shape_char * character_embedding_size + 40

model_GRU_RNN2 = keras.Sequential()
model_GRU_RNN2.add(keras.layers.Dense(character_embedding_size, activation='linear', name='char_encode', trainable=False, input_shape=(x_shape_char, x_shape_ones)))                 # embed characters into dense embedded space
#model_GRU_RNN2.add(keras.layers.Dropout(0.2))                                                                            # dropout to prevent overfitting
model_GRU_RNN2.add(keras.layers.GRU(lstm_hidden_size, activation='sigmoid', implementation=2, unroll=True, name='gru1'))     # gru recurrent cell
#model_GRU_RNN2.add(keras.layers.Dropout(0.2))                                                                            # dropout to prevent overfitting
model_GRU_RNN2.add(keras.layers.Dense(y_shape_char * character_embedding_size, activation='sigmoid', name='decode'))  # dense layer, decode/de-embed
model_GRU_RNN2.add(keras.layers.Reshape((y_shape_char, character_embedding_size)))                                        # un flatten
model_GRU_RNN2.add(keras.layers.Dense(y_shape_ones, activation='sigmoid', trainable=False, name='char_decode'))                             # dense layer, decode/de-embed
model_GRU_RNN2.compile(optimizer='adam', loss=loss, metrics=metrics)
models[architecture] = model_GRU_RNN2
print(model_GRU_RNN2.summary())

# set pretrained embedding weights
model_GRU_RNN2.layers[0].set_weights( models['Character-Embedding'].layers[0].get_weights())
model_GRU_RNN2.layers[-1].set_weights(models['Character-Embedding'].layers[-1].get_weights())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
char_encode (Dense)          (None, 87, 10)            800       
_________________________________________________________________
gru1 (GRU)                   (None, 160)               82080     
_________________________________________________________________
decode (Dense)               (None, 120)               19320     
_________________________________________________________________
reshape_11 (Reshape)         (None, 12, 10)            0         
_________________________________________________________________
char_decode (Dense)          (None, 12, 79)            869       
Total params: 103,069
Trainable params: 101,400
Non-trainable params: 1,669
_________________________________________________________________
None


## Save/Restore weights

In [184]:
#DE = models['E-D-NN'].get_weights()
#model_GRU_1 = model
#model_GRU_2 = model
#model_GRU_3 = model

In [133]:
#model.set_weights(GRU)
#model = model_GRU_3
#models['E-D-NN'].set_weights(DE)

## Run and Evaluate

In [None]:
epochs = 1
batch_size = 128
model = models['GRU-RNN2']

model.fit(train_x, train_y, batch_size=batch_size, epochs=epochs)
l, a = test(model, test_x, test_y, showcase_x, showcase_y)
training_log(x_name, y_name, architecture, batch_size, epochs, l, a)
print(*training_history[::-1], sep='\n')

Epoch 1/1
 768/4000 [====>.........................] - ETA: 5:33 - loss: 0.0748 - mean_absolute_error: 0.2342 - categorical_accuracy: 0.0016 - binary_accuracy: 0.8382 - exact_match_accuracy: 0.0000e+00