In [None]:
## How might one reformat a columnar table?
##  - This notebook only explores one possible, simple method and shows how an end-to-end solution might look.

In [1]:
import numpy as np

In [2]:
## raw columnar table used as the model input: labels run down the left column and
## wrap over several lines, while the value and unit sit in the right column.
## leading/trailing blank lines and the column spacing are part of the data, since
## newlines and single spaces are encoded as tokens further down.
input_text = """

  right       10
  forward     m/s
  velocity
  left        50
  forward     m/s
  velocity
  speed       .5
  ratio

"""

## hand-written target for the same table: one flattened line per entry,
## label words first, then the value and (where present) the unit.
output_text = """

right forward velocity 10 m/s
left forward velocity 50 m/s
speed ratio .5

"""

In [3]:
from nltk.tokenize import word_tokenize

In [4]:
## vocabulary = the three special tokens (newline, single space, '<>' padding marker)
## followed by every token nltk extracts from the input, sorted.
## NOTE(review): repeated words are kept, so n_vocab counts duplicates and
## word_to_index ends up holding the *last* position of each repeated word.
vocab = ['\n', ' ', '<>']
vocab.extend(word_tokenize(input_text))
vocab.sort()

n_vocab = len(vocab)

## lookup tables in both directions,
word_to_index = { word: position for position, word in enumerate(vocab) }
index_to_word = dict(enumerate(vocab))

print(vocab)

['\n', ' ', '.5', '10', '50', '<>', 'forward', 'forward', 'left', 'm/s', 'm/s', 'ratio', 'right', 'speed', 'velocity', 'velocity']


In [5]:
import re

In [6]:
def translate(text: str, word_to_index: dict):
    """Encode `text` as a list of vocabulary indices.

    The text is tokenised with a regular expression whose alternatives are
    tried in this order at every position:

    1. a newline,
    2. a single space,
    3. a ``word/word`` unit such as ``m/s``,
    4. a number made of digits and dots containing at least one digit,
       such as ``10`` or ``.5``,
    5. a run of lowercase letters.

    Characters that match none of the alternatives (for example the ``<>``
    padding marker or uppercase letters) are skipped silently.

    Parameters
    ----------
    text : str
        Text to encode; whitespace is significant because every newline and
        every single space becomes its own token.
    word_to_index : dict
        Mapping from token string to integer index.

    Returns
    -------
    list
        The value of ``word_to_index[token]`` for each token, in order of
        appearance.  Empty when `text` contains no recognised token.

    Raises
    ------
    KeyError
        If a recognised token is not a key of `word_to_index`.
    """
    ## raw string: in a plain literal the backslash-w, backslash-slash and backslash-d
    ## sequences are invalid string escapes (a warning today, an error in a future Python).
    ## the pattern matches exactly the same text as before.
    matches = re.findall(r'(\n|[ ]|\w+/\w+|[\d.]*\d[\d.]*|[a-z]+)', text)
    return [ word_to_index[key] for key in matches ]

def translate_output(text: str, word_to_index: dict, size: int):
    """Encode `text` with `translate` and right-pad the result to `size` entries.

    Padding uses the index stored under the '<>' key of `word_to_index`.
    When the encoded text already has `size` entries or more it is returned
    unchanged (it is never truncated), and the '<>' key is not looked up.
    """
    encoded = translate(text, word_to_index)

    ## how many padding entries are still needed,
    shortfall = size - len(encoded)
    if shortfall <= 0:
        return encoded

    return encoded + [ word_to_index['<>'] ] * shortfall

In [7]:
## encode the columnar input as a 1-D array of vocabulary indices,
## one entry per token (newlines and single spaces included)
input_vector = np.asarray(translate(input_text, word_to_index))
input_vector

array([ 0,  0,  1,  1, 12,  1,  1,  1,  1,  1,  1,  1,  3,  0,  1,  1,  7,
        1,  1,  1,  1,  1, 10,  0,  1,  1, 15,  0,  1,  1,  8,  1,  1,  1,
        1,  1,  1,  1,  1,  4,  0,  1,  1,  7,  1,  1,  1,  1,  1, 10,  0,
        1,  1, 15,  0,  1,  1, 13,  1,  1,  1,  1,  1,  1,  1,  2,  0,  1,
        1, 11,  0,  0])

In [8]:
## encode the target text and pad it with the '<>' index up to the input length
output_vector = np.asarray(
    translate_output(output_text, word_to_index, size=len(input_vector))
)
output_vector

array([ 0,  0, 12,  1,  7,  1, 15,  1,  3,  1, 10,  0,  8,  1,  7,  1, 15,
        1,  4,  1, 10,  0, 13,  1, 11,  1,  2,  0,  0,  5,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  5,  5])

In [9]:
## padding only ever lengthens the target, so this fails if the target encodes to more tokens than the input
assert len(output_vector) == len(input_vector)

In [10]:
## despite the names, both values are sequence lengths (entries per encoded text), not vocabulary sizes
num_encoder_tokens, num_decoder_tokens = len(input_vector), len(output_vector)

In [11]:
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Input, LSTM, TimeDistributed, Dense, Activation, RepeatVector

Using TensorFlow backend.


In [12]:
## encoder-decoder built from a plain Sequential stack:
##  - the first LSTM reads the one-hot input sequence and returns a single 100-unit vector,
##  - RepeatVector copies that vector once per output time step,
##  - the second LSTM (return_sequences=True) emits one vector per step,
##  - the time-distributed softmax turns each step into a distribution over the vocabulary.
model = Sequential()

## sequence lengths are taken from the encoded data instead of a hard-coded 72, so a
## different input_text / output_text pair does not silently mismatch the layer shapes.
model.add(LSTM(100, input_shape=(num_encoder_tokens, n_vocab)))
model.add(RepeatVector(num_decoder_tokens))

model.add(LSTM(100, return_sequences=True))

model.add(TimeDistributed(Dense(n_vocab, activation='softmax')))

In [13]:
## one-hot encode both index sequences and add a leading batch axis of size 1,
## giving arrays shaped (1, sequence length, n_vocab)
X = np.expand_dims(to_categorical(input_vector, n_vocab), axis=0)
y = np.expand_dims(to_categorical(output_vector, n_vocab), axis=0)

## the target at each step must be a probability distribution (one-hot) to pair with the softmax output
print(y[0, 5])

[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [14]:
## categorical cross-entropy matches the per-step softmax output and one-hot targets
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

## deliberately over-train on the single example,
model.fit(x=X, y=y, epochs=800)

Epoch 1/800
Epoch 2/800
Epoch 3/800
Epoch 4/800
Epoch 5/800
Epoch 6/800
Epoch 7/800
Epoch 8/800
Epoch 9/800
Epoch 10/800
Epoch 11/800
Epoch 12/800
Epoch 13/800
Epoch 14/800
Epoch 15/800
Epoch 16/800
Epoch 17/800
Epoch 18/800
Epoch 19/800
Epoch 20/800
Epoch 21/800
Epoch 22/800
Epoch 23/800
Epoch 24/800
Epoch 25/800
Epoch 26/800
Epoch 27/800
Epoch 28/800
Epoch 29/800
Epoch 30/800
Epoch 31/800
Epoch 32/800
Epoch 33/800
Epoch 34/800
Epoch 35/800
Epoch 36/800
Epoch 37/800
Epoch 38/800
Epoch 39/800
Epoch 40/800
Epoch 41/800
Epoch 42/800
Epoch 43/800
Epoch 44/800
Epoch 45/800
Epoch 46/800
Epoch 47/800
Epoch 48/800
Epoch 49/800
Epoch 50/800
Epoch 51/800
Epoch 52/800
Epoch 53/800
Epoch 54/800
Epoch 55/800
Epoch 56/800
Epoch 57/800
Epoch 58/800
Epoch 59/800
Epoch 60/800
Epoch 61/800
Epoch 62/800
Epoch 63/800
Epoch 64/800
Epoch 65/800
Epoch 66/800
Epoch 67/800
Epoch 68/800
Epoch 69/800
Epoch 70/800
Epoch 71/800
Epoch 72/800
Epoch 73/800
Epoch 74/800
Epoch 75/800
Epoch 76/800
Epoch 77/800
Epoch 78

<keras.callbacks.callbacks.History at 0x14fb86c18>

In [15]:
## most likely vocabulary index at every output step of the single training example
prediction = list(np.argmax(model.predict(X)[0], axis=-1))

In [16]:
## show the original columnar input again for comparison with the prediction below
print(input_text)



  right       10
  forward     m/s
  velocity
  left        50
  forward     m/s
  velocity
  speed       .5
  ratio




In [19]:
## map every predicted index back to its token, glue the tokens together
## and drop the '<>' padding markers
predicted_output_text = ''.join(index_to_word[i] for i in prediction).replace('<>', '')
predicted_output_text

'\n\nright forward velocity 10 m/s\nleft forward velocity 50 m/s\nspeed ratio .5\n\n'

In [20]:
## flattened columns: the decoded prediction, printed so the newlines render as line breaks
print(predicted_output_text)



right forward velocity 10 m/s
left forward velocity 50 m/s
speed ratio .5


