In [1]:
# LSTM with Variable Length Input Sequences to One Character Output
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.utils import np_utils
from keras.preprocessing.sequence import pad_sequences


Using Theano backend.


In [2]:

# define the raw dataset
alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
# create mapping of characters to integers (0-25) and the reverse
char_to_int = dict((c, i) for i, c in enumerate(alphabet))
int_to_char = dict((i, c) for i, c in enumerate(alphabet))
# prepare the dataset of input to output pairs encoded as integers
num_inputs = 1000
max_len = 5
dataX = []
dataY = []
for i in range(num_inputs):
    start = numpy.random.randint(len(alphabet)-2)
    end = numpy.random.randint(start, min(start+max_len,len(alphabet)-1))
    sequence_in = alphabet[start:end+1]
    sequence_out = alphabet[end + 1]
    dataX.append([char_to_int[char] for char in sequence_in])
    dataY.append(char_to_int[sequence_out])
    print sequence_in, '->', sequence_out
# convert list of lists to array and pad sequences if needed
X = pad_sequences(dataX, maxlen=max_len, dtype='float32')
# reshape X to be [samples, time steps, features]
X = numpy.reshape(X, (X.shape[0], max_len, 1))
# normalize
X = X / float(len(alphabet))
# one hot encode the output variable
y = np_utils.to_categorical(dataY)

XY -> Z
STU -> V
QRSTU -> V
V -> W
GHIJ -> K
NOP -> Q
OPQR -> S
FGH -> I
QR -> S
FGHIJ -> K
TUVWX -> Y
STUV -> W
NOPQ -> R
QRST -> U
XY -> Z
LMNO -> P
PQRST -> U
KLMNO -> P
NOPQ -> R
AB -> C
BCD -> E
L -> M
MNOP -> Q
NO -> P
Q -> R
WXY -> Z
XY -> Z
TUVWX -> Y
ST -> U
OPQRS -> T
VW -> X
MNOPQ -> R
TUVWX -> Y
ABC -> D
EFG -> H
P -> Q
W -> X
STUV -> W
RST -> U
W -> X
UVWX -> Y
OPQR -> S
STU -> V
MNOP -> Q
WX -> Y
I -> J
D -> E
EFG -> H
WXY -> Z
W -> X
HI -> J
RS -> T
STUV -> W
HIJK -> L
QR -> S
DEFGH -> I
A -> B
VWXY -> Z
OPQ -> R
OPQR -> S
STU -> V
C -> D
STUVW -> X
IJKL -> M
UVWXY -> Z
EFG -> H
TUVWX -> Y
JKL -> M
VW -> X
TUV -> W
NO -> P
O -> P
RSTUV -> W
CDEFG -> H
PQ -> R
X -> Y
NOPQ -> R
PQ -> R
LM -> N
UVW -> X
DEFGH -> I
XY -> Z
HIJ -> K
DEFG -> H
TUVWX -> Y
NOPQR -> S
MNOPQ -> R
IJKLM -> N
JKLM -> N
LMNOP -> Q
GH -> I
LM -> N
VWXY -> Z
RSTUV -> W
B -> C
P -> Q
G -> H
HIJK -> L
IJ -> K
P -> Q
KL -> M
FG -> H
JKLMN -> O
QRS -> T
U -> V
Q -> R
RSTU -> V
XY -> Z
FG -> H
BCDE -> F
WXY

In [3]:
# create and fit the model
batch_size = 1
model = Sequential()
model.add(LSTM(32, input_shape=(X.shape[1], 1)))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, nb_epoch=500, batch_size=batch_size, verbose=2)
# summarize performance of the model
scores = model.evaluate(X, y, verbose=0)
print("Model Accuracy: %.2f%%" % (scores[1]*100))


Epoch 1/500
421s - loss: 3.0852 - acc: 0.0720
Epoch 2/500
428s - loss: 2.7835 - acc: 0.1320
Epoch 3/500
472s - loss: 2.4160 - acc: 0.1970
Epoch 4/500
607s - loss: 2.1670 - acc: 0.2370
Epoch 5/500
615s - loss: 2.0016 - acc: 0.3100
Epoch 6/500
452s - loss: 1.8741 - acc: 0.3350
Epoch 7/500
420s - loss: 1.7705 - acc: 0.3900
Epoch 8/500
419s - loss: 1.6743 - acc: 0.4170
Epoch 9/500
419s - loss: 1.5906 - acc: 0.4430
Epoch 10/500
418s - loss: 1.5197 - acc: 0.5010
Epoch 11/500
420s - loss: 1.4414 - acc: 0.5190
Epoch 12/500
423s - loss: 1.3772 - acc: 0.5470
Epoch 13/500
423s - loss: 1.3206 - acc: 0.5870
Epoch 14/500
426s - loss: 1.2664 - acc: 0.6210
Epoch 15/500
461s - loss: 1.2140 - acc: 0.6370
Epoch 16/500
446s - loss: 1.1618 - acc: 0.6460
Epoch 17/500
450s - loss: 1.1204 - acc: 0.6920
Epoch 18/500
450s - loss: 1.0895 - acc: 0.6830
Epoch 19/500
451s - loss: 1.0441 - acc: 0.7050
Epoch 20/500
448s - loss: 1.0109 - acc: 0.7070
Epoch 21/500
445s - loss: 0.9543 - acc: 0.7390
Epoch 22/500
450s - lo

In [4]:
# demonstrate some model predictions
for i in range(20):
    pattern_index = numpy.random.randint(len(dataX))
    pattern = dataX[pattern_index]
    x = pad_sequences([pattern], maxlen=max_len, dtype='float32')
    x = numpy.reshape(x, (1, max_len, 1))
    x = x / float(len(alphabet))
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = int_to_char[index]
    seq_in = [int_to_char[value] for value in pattern]
    print seq_in, "->", result

['H', 'I', 'J'] -> K
['F', 'G', 'H'] -> I
['V', 'W'] -> X
['L', 'M'] -> N
['K', 'L', 'M', 'N'] -> O
['W', 'X'] -> Y
['X', 'Y'] -> Z
['R', 'S', 'T', 'U'] -> V
['V', 'W'] -> X
['B', 'C'] -> D
['E', 'F', 'G'] -> H
['H', 'I', 'J', 'K'] -> L
['I', 'J', 'K'] -> L
['S', 'T', 'U', 'V', 'W'] -> X
['S', 'T', 'U', 'V', 'W'] -> X
['M', 'N', 'O', 'P'] -> Q
['K', 'L', 'M'] -> N
['L', 'M', 'N'] -> O
['M', 'N'] -> O
['C', 'D', 'E', 'F'] -> G
