# predicting next char in sequence

In [1]:
# load all necessary libraries
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.utils import np_utils
from keras.utils import print_summary
from keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# fix random seed for reproducibility
numpy.random.seed(7)

In [3]:
# define the raw dataset
alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
print(enumerate(alphabet))

<enumerate object at 0x7f456735a480>


In [4]:
# create a mapping of character to number and reverse
char_to_int = dict((c, i) for i, c in enumerate(alphabet))
int_to_char = dict((i, c) for i, c in enumerate(alphabet))

In [5]:
# create a dataset with input output pair of max 5 char in sequece
# encoded as integer

num_inputs = 1000
max_len = 5
dataX = []
dataY = []

for i in range(num_inputs):
    start = numpy.random.randint(len(alphabet)-2)
    end = numpy.random.randint(start, min(start+max_len, len(alphabet)-1))
    seq_in = alphabet[start:end+1]
    seq_out = alphabet[end+1]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append([char_to_int[seq_out]])
    print(seq_in, '-->', seq_out)
    print(dataX[i], '==>', dataY[i])

PQRST --> U
[15, 16, 17, 18, 19] ==> [20]
W --> X
[22] ==> [23]
O --> P
[14] ==> [15]
OPQ --> R
[14, 15, 16] ==> [17]
IJKLM --> N
[8, 9, 10, 11, 12] ==> [13]
QRSTU --> V
[16, 17, 18, 19, 20] ==> [21]
ABCD --> E
[0, 1, 2, 3] ==> [4]
X --> Y
[23] ==> [24]
GHIJ --> K
[6, 7, 8, 9] ==> [10]
M --> N
[12] ==> [13]
XY --> Z
[23, 24] ==> [25]
QRST --> U
[16, 17, 18, 19] ==> [20]
ABC --> D
[0, 1, 2] ==> [3]
JKLMN --> O
[9, 10, 11, 12, 13] ==> [14]
OP --> Q
[14, 15] ==> [16]
XY --> Z
[23, 24] ==> [25]
D --> E
[3] ==> [4]
T --> U
[19] ==> [20]
B --> C
[1] ==> [2]
QRSTU --> V
[16, 17, 18, 19, 20] ==> [21]
HIJ --> K
[7, 8, 9] ==> [10]
JKLM --> N
[9, 10, 11, 12] ==> [13]
ABCDE --> F
[0, 1, 2, 3, 4] ==> [5]
X --> Y
[23] ==> [24]
V --> W
[21] ==> [22]
DE --> F
[3, 4] ==> [5]
DEFG --> H
[3, 4, 5, 6] ==> [7]
BCDE --> F
[1, 2, 3, 4] ==> [5]
EFGH --> I
[4, 5, 6, 7] ==> [8]
BCDE --> F
[1, 2, 3, 4] ==> [5]
FG --> H
[5, 6] ==> [7]
RST --> U
[17, 18, 19] ==> [20]
TUV --> W
[19, 20, 21] ==> [22]
STUV --> W
[18,

[13, 14] ==> [15]
KLM --> N
[10, 11, 12] ==> [13]
TUVWX --> Y
[19, 20, 21, 22, 23] ==> [24]
U --> V
[20] ==> [21]
CDEFG --> H
[2, 3, 4, 5, 6] ==> [7]
FGHI --> J
[5, 6, 7, 8] ==> [9]
STUVW --> X
[18, 19, 20, 21, 22] ==> [23]
JKLM --> N
[9, 10, 11, 12] ==> [13]
ABC --> D
[0, 1, 2] ==> [3]
JKLMN --> O
[9, 10, 11, 12, 13] ==> [14]
TUVWX --> Y
[19, 20, 21, 22, 23] ==> [24]
D --> E
[3] ==> [4]
EFGH --> I
[4, 5, 6, 7] ==> [8]
IJ --> K
[8, 9] ==> [10]
UVW --> X
[20, 21, 22] ==> [23]
OPQR --> S
[14, 15, 16, 17] ==> [18]
N --> O
[13] ==> [14]
VWXY --> Z
[21, 22, 23, 24] ==> [25]
ABC --> D
[0, 1, 2] ==> [3]
J --> K
[9] ==> [10]
RS --> T
[17, 18] ==> [19]
LMNOP --> Q
[11, 12, 13, 14, 15] ==> [16]
BC --> D
[1, 2] ==> [3]
OPQ --> R
[14, 15, 16] ==> [17]
JKLM --> N
[9, 10, 11, 12] ==> [13]
WX --> Y
[22, 23] ==> [24]
BCD --> E
[1, 2, 3] ==> [4]
RSTU --> V
[17, 18, 19, 20] ==> [21]
GHI --> J
[6, 7, 8] ==> [9]
O --> P
[14] ==> [15]
R --> S
[17] ==> [18]
QR --> S
[16, 17] ==> [18]
HIJKL --> M
[7, 8, 9, 1

DE --> F
[3, 4] ==> [5]
K --> L
[10] ==> [11]
ABC --> D
[0, 1, 2] ==> [3]
E --> F
[4] ==> [5]
STU --> V
[18, 19, 20] ==> [21]
TU --> V
[19, 20] ==> [21]
G --> H
[6] ==> [7]
AB --> C
[0, 1] ==> [2]
J --> K
[9] ==> [10]
FGH --> I
[5, 6, 7] ==> [8]
MNOP --> Q
[12, 13, 14, 15] ==> [16]
VW --> X
[21, 22] ==> [23]
CD --> E
[2, 3] ==> [4]
TUVWX --> Y
[19, 20, 21, 22, 23] ==> [24]
F --> G
[5] ==> [6]
VWX --> Y
[21, 22, 23] ==> [24]
LMNO --> P
[11, 12, 13, 14] ==> [15]
GHIJ --> K
[6, 7, 8, 9] ==> [10]
TUVWX --> Y
[19, 20, 21, 22, 23] ==> [24]
JKL --> M
[9, 10, 11] ==> [12]
LM --> N
[11, 12] ==> [13]
EFGHI --> J
[4, 5, 6, 7, 8] ==> [9]
MNO --> P
[12, 13, 14] ==> [15]
H --> I
[7] ==> [8]
M --> N
[12] ==> [13]
S --> T
[18] ==> [19]
STU --> V
[18, 19, 20] ==> [21]
QRST --> U
[16, 17, 18, 19] ==> [20]
PQR --> S
[15, 16, 17] ==> [18]
RSTUV --> W
[17, 18, 19, 20, 21] ==> [22]
ST --> U
[18, 19] ==> [20]
RSTUV --> W
[17, 18, 19, 20, 21] ==> [22]
JKLM --> N
[9, 10, 11, 12] ==> [13]
T --> U
[19] ==> [20]


In [6]:
# pad sequences with 0
X = pad_sequences(dataX, maxlen=max_len, dtype='float32')
X[0]

array([15., 16., 17., 18., 19.], dtype=float32)

In [7]:
X.shape

(1000, 5)

In [8]:
# reshape X to be [samples, time steps, features]
X = numpy.reshape(X, (X.shape[0], max_len, 1))
# normalize
X = X / float(len(alphabet))
# one hot encode the output variable
Y = np_utils.to_categorical(dataY)

In [9]:
print(X.shape[1])
model = Sequential()

# 32 cells(units) in lstm
# number of units is expiremental and can be changed according to your data
# num_units is the size of the LSTM’s hidden state (which is also the size of the output). 
model.add(LSTM(32, input_shape=(X.shape[1], 1)))
print("y shape:", Y.shape)
model.add(Dense(Y.shape[1], activation="softmax"))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

5
y shape: (1000, 26)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 32)                4352      
_________________________________________________________________
dense_1 (Dense)              (None, 26)                858       
Total params: 5,210
Trainable params: 5,210
Non-trainable params: 0
_________________________________________________________________


In [None]:
# verbose: Verbosity mode. 0 = silent, 1 = progress bar, 2 = one line per epoch
# epochs is num of iterations

model.fit(X, Y, epochs=500, batch_size=1, verbose=2)

Epoch 1/500
 - 7s - loss: 3.0782 - acc: 0.0640
Epoch 2/500
 - 7s - loss: 2.7668 - acc: 0.1280
Epoch 3/500
 - 7s - loss: 2.4411 - acc: 0.1960
Epoch 4/500
 - 7s - loss: 2.2164 - acc: 0.2610
Epoch 5/500
 - 7s - loss: 2.0685 - acc: 0.3070
Epoch 6/500
 - 8s - loss: 1.9450 - acc: 0.3270
Epoch 7/500
 - 7s - loss: 1.8435 - acc: 0.3510
Epoch 8/500
 - 7s - loss: 1.7590 - acc: 0.3740
Epoch 9/500
 - 7s - loss: 1.6790 - acc: 0.4160
Epoch 10/500
 - 7s - loss: 1.6035 - acc: 0.4500
Epoch 11/500
 - 6s - loss: 1.5349 - acc: 0.4700
Epoch 12/500
 - 7s - loss: 1.4742 - acc: 0.4940
Epoch 13/500
 - 6s - loss: 1.4192 - acc: 0.5120
Epoch 14/500
 - 6s - loss: 1.3624 - acc: 0.5550
Epoch 15/500
 - 6s - loss: 1.3167 - acc: 0.5570
Epoch 16/500
 - 7s - loss: 1.2693 - acc: 0.5890
Epoch 17/500
 - 7s - loss: 1.2233 - acc: 0.5770
Epoch 18/500
 - 7s - loss: 1.1910 - acc: 0.6210
Epoch 19/500
 - 7s - loss: 1.1417 - acc: 0.6290
Epoch 20/500
 - 8s - loss: 1.1169 - acc: 0.6450
Epoch 21/500
 - 7s - loss: 1.0771 - acc: 0.6630
E

In [None]:
print("Model Accuracy:", model.evaluate(X, Y, verbose=0))

In [None]:
# check result for first 5 seq
for i in range(20):
    pattern_index = numpy.random.randint(len(dataX))
    pattern = dataX[pattern_index]
    x = pad_sequences([pattern], maxlen=max_len, dtype='float32')
    x = numpy.reshape(x, (1, max_len, 1))
    x = x / float(len(alphabet))
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = int_to_char[index]
    seq_in = [int_to_char[value] for value in pattern]
    print(seq_in, "->", result)