In [1]:
## how might one format a columnar table?
##  - this notebook is only meant to explore one possible, simple method and how an end to end solution may look.

In [2]:
import numpy as np

In [3]:
input_text = """

  right       10
  forward     m/s
  velocity
  left        50
  forward     m/s
  velocity
  speed       .5
  ratio

"""

output_text = """

right forward velocity 10 m/s
left forward velocity 50 m/s
speed ratio .5

"""

In [4]:
from nltk.tokenize import word_tokenize

In [5]:
vocab = sorted([ '\n', ' ', '<>' ] + word_tokenize(input_text))

n_vocab = len(vocab)

word_to_index = dict([ (w, i) for i, w in enumerate(vocab) ])
index_to_word = dict([ (i, w) for i, w in enumerate(vocab) ])

print(vocab)

['\n', ' ', '.5', '10', '50', '<>', 'forward', 'forward', 'left', 'm/s', 'm/s', 'ratio', 'right', 'speed', 'velocity', 'velocity']


In [6]:
import re

In [7]:
def translate(text: str, word_to_index: dict):
    matches = re.findall('(\n|[ ]|\w+\/\w+|[\d.]*\d[\d.]*|[a-z]+)', text)
    return [ word_to_index[key] for key in matches ]

def translate_output(text: str, word_to_index: dict, size: int):
    a = translate(text, word_to_index)
    
    ## pad,
    n = len(a)
    if n < size:
        p = size - n
        a = a + [ word_to_index['<>'] for _ in range(p) ]
    
    return a

In [8]:
input_vector = np.array(translate(input_text, word_to_index))
input_vector

array([ 0,  0,  1,  1, 12,  1,  1,  1,  1,  1,  1,  1,  3,  0,  1,  1,  7,
        1,  1,  1,  1,  1, 10,  0,  1,  1, 15,  0,  1,  1,  8,  1,  1,  1,
        1,  1,  1,  1,  1,  4,  0,  1,  1,  7,  1,  1,  1,  1,  1, 10,  0,
        1,  1, 15,  0,  1,  1, 13,  1,  1,  1,  1,  1,  1,  1,  2,  0,  1,
        1, 11,  0,  0])

In [9]:
output_vector = np.array(translate_output(output_text, word_to_index, len(input_vector)))
output_vector

array([ 0,  0, 12,  1,  7,  1, 15,  1,  3,  1, 10,  0,  8,  1,  7,  1, 15,
        1,  4,  1, 10,  0, 13,  1, 11,  1,  2,  0,  0,  5,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  5,  5])

In [10]:
assert len(input_vector) == len(output_vector)

In [11]:
num_encoder_tokens = len(input_vector)
num_decoder_tokens = len(output_vector)

In [12]:
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Input, LSTM, TimeDistributed, Dense, Bidirectional, Dropout

Using TensorFlow backend.


In [13]:
### https://github.com/keras-team/keras/blob/master/examples/addition_rnn.py

model = Sequential()

model.add(Bidirectional(LSTM(100, input_shape=(72, n_vocab), return_sequences=True)))
model.add(Dropout(0.2))

model.add(LSTM(100, return_sequences=True))
model.add(Dropout(0.2))

## apply dense network to each t
model.add(TimeDistributed(Dense(n_vocab, activation='softmax')))

In [14]:
X = np.array([to_categorical(input_vector.tolist(), n_vocab)])
y = np.array([to_categorical(output_vector, n_vocab)])

## output should be probability distribution, (softmax, one-hot)
print(y[0][5])

[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [15]:
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics = ['accuracy']
)

## fire away!!!!, over-train!!!
model.fit(X, y, epochs=500)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.callbacks.callbacks.History at 0x146cd93c8>

In [16]:
prediction = [
    np.argmax(x)
    for x 
    in model.predict(X)[0]
]

In [17]:
print(input_text)



  right       10
  forward     m/s
  velocity
  left        50
  forward     m/s
  velocity
  speed       .5
  ratio




In [18]:
predicted_output_text = re.sub(r'<>', '', ''.join([ index_to_word[i] for i in prediction ]))
predicted_output_text

'\n\nright forward velocity 10 m/s\nleft forward velocity 50 m/s\nspeed ratio .5\n\n'

In [19]:
## flattened columns,
print(predicted_output_text)



right forward velocity 10 m/s
left forward velocity 50 m/s
speed ratio .5


