# predicting next char in sequence

In [17]:
# load all necessary libraries
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.utils import np_utils
from keras.utils import print_summary
from keras.preprocessing.sequence import pad_sequences

In [2]:
# fix random seed for reproducibility
numpy.random.seed(7)

In [3]:
# define the raw dataset
alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
print(enumerate(alphabet))

<enumerate object at 0x7f2cd4937360>


In [4]:
# create a mapping of character to number and reverse
char_to_int = dict((c, i) for i, c in enumerate(alphabet))
int_to_char = dict((i, c) for i, c in enumerate(alphabet))

In [5]:
# create a dataset with input output pair of max 5 char in sequece
# encoded as integer

num_inputs = 1000
max_len = 5
dataX = []
dataY = []

for i in range(num_inputs):
    start = numpy.random.randint(len(alphabet)-2)
    end = numpy.random.randint(start, min(start+max_len, len(alphabet)-1))
    seq_in = alphabet[start:end+1]
    seq_out = alphabet[end+1]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append([char_to_int[seq_out]])
    print(seq_in, '-->', seq_out)
    print(dataX[i], '==>', dataY[i])

PQRST --> U
[15, 16, 17, 18, 19] ==> [20]
W --> X
[22] ==> [23]
O --> P
[14] ==> [15]
OPQ --> R
[14, 15, 16] ==> [17]
IJKLM --> N
[8, 9, 10, 11, 12] ==> [13]
QRSTU --> V
[16, 17, 18, 19, 20] ==> [21]
ABCD --> E
[0, 1, 2, 3] ==> [4]
X --> Y
[23] ==> [24]
GHIJ --> K
[6, 7, 8, 9] ==> [10]
M --> N
[12] ==> [13]
XY --> Z
[23, 24] ==> [25]
QRST --> U
[16, 17, 18, 19] ==> [20]
ABC --> D
[0, 1, 2] ==> [3]
JKLMN --> O
[9, 10, 11, 12, 13] ==> [14]
OP --> Q
[14, 15] ==> [16]
XY --> Z
[23, 24] ==> [25]
D --> E
[3] ==> [4]
T --> U
[19] ==> [20]
B --> C
[1] ==> [2]
QRSTU --> V
[16, 17, 18, 19, 20] ==> [21]
HIJ --> K
[7, 8, 9] ==> [10]
JKLM --> N
[9, 10, 11, 12] ==> [13]
ABCDE --> F
[0, 1, 2, 3, 4] ==> [5]
X --> Y
[23] ==> [24]
V --> W
[21] ==> [22]
DE --> F
[3, 4] ==> [5]
DEFG --> H
[3, 4, 5, 6] ==> [7]
BCDE --> F
[1, 2, 3, 4] ==> [5]
EFGH --> I
[4, 5, 6, 7] ==> [8]
BCDE --> F
[1, 2, 3, 4] ==> [5]
FG --> H
[5, 6] ==> [7]
RST --> U
[17, 18, 19] ==> [20]
TUV --> W
[19, 20, 21] ==> [22]
STUV --> W
[18,

[13, 14] ==> [15]
KLM --> N
[10, 11, 12] ==> [13]
TUVWX --> Y
[19, 20, 21, 22, 23] ==> [24]
U --> V
[20] ==> [21]
CDEFG --> H
[2, 3, 4, 5, 6] ==> [7]
FGHI --> J
[5, 6, 7, 8] ==> [9]
STUVW --> X
[18, 19, 20, 21, 22] ==> [23]
JKLM --> N
[9, 10, 11, 12] ==> [13]
ABC --> D
[0, 1, 2] ==> [3]
JKLMN --> O
[9, 10, 11, 12, 13] ==> [14]
TUVWX --> Y
[19, 20, 21, 22, 23] ==> [24]
D --> E
[3] ==> [4]
EFGH --> I
[4, 5, 6, 7] ==> [8]
IJ --> K
[8, 9] ==> [10]
UVW --> X
[20, 21, 22] ==> [23]
OPQR --> S
[14, 15, 16, 17] ==> [18]
N --> O
[13] ==> [14]
VWXY --> Z
[21, 22, 23, 24] ==> [25]
ABC --> D
[0, 1, 2] ==> [3]
J --> K
[9] ==> [10]
RS --> T
[17, 18] ==> [19]
LMNOP --> Q
[11, 12, 13, 14, 15] ==> [16]
BC --> D
[1, 2] ==> [3]
OPQ --> R
[14, 15, 16] ==> [17]
JKLM --> N
[9, 10, 11, 12] ==> [13]
WX --> Y
[22, 23] ==> [24]
BCD --> E
[1, 2, 3] ==> [4]
RSTU --> V
[17, 18, 19, 20] ==> [21]
GHI --> J
[6, 7, 8] ==> [9]
O --> P
[14] ==> [15]
R --> S
[17] ==> [18]
QR --> S
[16, 17] ==> [18]
HIJKL --> M
[7, 8, 9, 1

QRS --> T
[16, 17, 18] ==> [19]
QRSTU --> V
[16, 17, 18, 19, 20] ==> [21]
DEF --> G
[3, 4, 5] ==> [6]
UV --> W
[20, 21] ==> [22]
D --> E
[3] ==> [4]
BC --> D
[1, 2] ==> [3]
OPQRS --> T
[14, 15, 16, 17, 18] ==> [19]
EFGH --> I
[4, 5, 6, 7] ==> [8]
QRST --> U
[16, 17, 18, 19] ==> [20]
EF --> G
[4, 5] ==> [6]
RST --> U
[17, 18, 19] ==> [20]
JKL --> M
[9, 10, 11] ==> [12]
STU --> V
[18, 19, 20] ==> [21]
UVWX --> Y
[20, 21, 22, 23] ==> [24]
EFGHI --> J
[4, 5, 6, 7, 8] ==> [9]
JKLMN --> O
[9, 10, 11, 12, 13] ==> [14]
P --> Q
[15] ==> [16]
BCD --> E
[1, 2, 3] ==> [4]
TU --> V
[19, 20] ==> [21]
O --> P
[14] ==> [15]
RST --> U
[17, 18, 19] ==> [20]
D --> E
[3] ==> [4]
VWXY --> Z
[21, 22, 23, 24] ==> [25]
R --> S
[17] ==> [18]
P --> Q
[15] ==> [16]
CDE --> F
[2, 3, 4] ==> [5]
X --> Y
[23] ==> [24]
UVWXY --> Z
[20, 21, 22, 23, 24] ==> [25]
DEFGH --> I
[3, 4, 5, 6, 7] ==> [8]
NOP --> Q
[13, 14, 15] ==> [16]
ABCD --> E
[0, 1, 2, 3] ==> [4]
B --> C
[1] ==> [2]
BC --> D
[1, 2] ==> [3]
VW --> X
[21, 2

In [6]:
# pad sequences with 0
X = pad_sequences(dataX, maxlen=max_len, dtype='float32')
X[0]

array([ 15.,  16.,  17.,  18.,  19.], dtype=float32)

In [7]:
X.shape

(1000, 5)

In [8]:
# reshape X to be [samples, time steps, features]
X = numpy.reshape(X, (X.shape[0], max_len, 1))
# normalize
X = X / float(len(alphabet))
# one hot encode the output variable
Y = np_utils.to_categorical(dataY)

In [9]:
print(X.shape[1])
model = Sequential()

# 32 cells(units) in lstm
# number of units is expiremental and can be changed according to your data
# num_units is the size of the LSTM’s hidden state (which is also the size of the output). 
model.add(LSTM(32, input_shape=(X.shape[1], 1)))
# Y.shape[1] --> dimensionality of the output spa
model.add(Dense(Y.shape[1], activation="softmax"))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.
()

5
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 32)                4352      
_________________________________________________________________
dense_1 (Dense)              (None, 26)                858       
Total params: 5,210
Trainable params: 5,210
Non-trainable params: 0
_________________________________________________________________


In [10]:
# verbose: Verbosity mode. 0 = silent, 1 = progress bar, 2 = one line per epoch
# epochs is num of iterations

model.fit(X, Y, epochs=500, batch_size=1, verbose=2)

Epoch 1/500
 - 7s - loss: 3.0782 - acc: 0.0630
Epoch 2/500
 - 6s - loss: 2.7666 - acc: 0.1280
Epoch 3/500
 - 6s - loss: 2.4404 - acc: 0.1960
Epoch 4/500
 - 6s - loss: 2.2153 - acc: 0.2600
Epoch 5/500
 - 6s - loss: 2.0671 - acc: 0.3070
Epoch 6/500
 - 6s - loss: 1.9430 - acc: 0.3220
Epoch 7/500
 - 6s - loss: 1.8417 - acc: 0.3440
Epoch 8/500
 - 6s - loss: 1.7573 - acc: 0.3720
Epoch 9/500
 - 6s - loss: 1.6774 - acc: 0.4230
Epoch 10/500
 - 6s - loss: 1.6017 - acc: 0.4470
Epoch 11/500
 - 6s - loss: 1.5326 - acc: 0.4740
Epoch 12/500
 - 6s - loss: 1.4722 - acc: 0.4970
Epoch 13/500
 - 6s - loss: 1.4183 - acc: 0.5080
Epoch 14/500
 - 6s - loss: 1.3618 - acc: 0.5530
Epoch 15/500
 - 6s - loss: 1.3166 - acc: 0.5620
Epoch 16/500
 - 6s - loss: 1.2699 - acc: 0.5930
Epoch 17/500
 - 6s - loss: 1.2246 - acc: 0.5830
Epoch 18/500
 - 6s - loss: 1.1915 - acc: 0.6210
Epoch 19/500
 - 6s - loss: 1.1416 - acc: 0.6310
Epoch 20/500
 - 6s - loss: 1.1194 - acc: 0.6480
Epoch 21/500
 - 6s - loss: 1.0773 - acc: 0.6600
E

Epoch 171/500
 - 6s - loss: 0.2514 - acc: 0.9240
Epoch 172/500
 - 6s - loss: 0.2539 - acc: 0.9300
Epoch 173/500
 - 7s - loss: 0.2885 - acc: 0.9100
Epoch 174/500
 - 6s - loss: 0.3426 - acc: 0.9050
Epoch 175/500
 - 7s - loss: 0.2434 - acc: 0.9330
Epoch 176/500
 - 6s - loss: 0.2454 - acc: 0.9250
Epoch 177/500
 - 7s - loss: 0.2440 - acc: 0.9310
Epoch 178/500
 - 6s - loss: 0.2475 - acc: 0.9260
Epoch 179/500
 - 6s - loss: 0.2507 - acc: 0.9210
Epoch 180/500
 - 6s - loss: 0.2676 - acc: 0.9070
Epoch 181/500
 - 6s - loss: 0.2985 - acc: 0.9130
Epoch 182/500
 - 7s - loss: 0.2329 - acc: 0.9330
Epoch 183/500
 - 7s - loss: 0.2396 - acc: 0.9350
Epoch 184/500
 - 6s - loss: 0.2410 - acc: 0.9300
Epoch 185/500
 - 6s - loss: 0.3403 - acc: 0.8930
Epoch 186/500
 - 6s - loss: 0.2424 - acc: 0.9290
Epoch 187/500
 - 6s - loss: 0.2304 - acc: 0.9400
Epoch 188/500
 - 7s - loss: 0.2300 - acc: 0.9290
Epoch 189/500
 - 7s - loss: 0.2358 - acc: 0.9360
Epoch 190/500
 - 7s - loss: 0.2363 - acc: 0.9320
Epoch 191/500
 - 7s 

 - 8s - loss: 0.2091 - acc: 0.9380
Epoch 339/500
 - 8s - loss: 0.1312 - acc: 0.9700
Epoch 340/500
 - 7s - loss: 0.1325 - acc: 0.9690
Epoch 341/500
 - 7s - loss: 0.1346 - acc: 0.9660
Epoch 342/500
 - 7s - loss: 0.1363 - acc: 0.9660
Epoch 343/500
 - 7s - loss: 0.1367 - acc: 0.9650
Epoch 344/500
 - 7s - loss: 0.1376 - acc: 0.9620
Epoch 345/500
 - 7s - loss: 0.2333 - acc: 0.9490
Epoch 346/500
 - 7s - loss: 0.2151 - acc: 0.9450
Epoch 347/500
 - 7s - loss: 0.1298 - acc: 0.9740
Epoch 348/500
 - 8s - loss: 0.1342 - acc: 0.9660
Epoch 349/500
 - 7s - loss: 0.1325 - acc: 0.9700
Epoch 350/500
 - 8s - loss: 0.1362 - acc: 0.9610
Epoch 351/500
 - 8s - loss: 0.1360 - acc: 0.9630
Epoch 352/500
 - 8s - loss: 0.3209 - acc: 0.9310
Epoch 353/500
 - 8s - loss: 0.1290 - acc: 0.9670
Epoch 354/500
 - 9s - loss: 0.1288 - acc: 0.9690
Epoch 355/500
 - 8s - loss: 0.1314 - acc: 0.9600
Epoch 356/500
 - 7s - loss: 0.1327 - acc: 0.9650
Epoch 357/500
 - 8s - loss: 0.1332 - acc: 0.9660
Epoch 358/500
 - 8s - loss: 0.1649

<keras.callbacks.History at 0x7f2d0d4fe1d0>

In [12]:
print("Model Accuracy:", model.evaluate(X, Y, verbose=0))

Model Accuracy: [0.48155676662921904, 0.93000000000000005]


In [14]:
# check result for first 5 seq
for i in range(20):
    pattern_index = numpy.random.randint(len(dataX))
    pattern = dataX[pattern_index]
    x = pad_sequences([pattern], maxlen=max_len, dtype='float32')
    x = numpy.reshape(x, (1, max_len, 1))
    x = x / float(len(alphabet))
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = int_to_char[index]
    seq_in = [int_to_char[value] for value in pattern]
    print(seq_in, "->", result)

['V', 'W', 'X', 'Y'] -> Z
['A', 'B', 'C', 'D'] -> E
['C'] -> D
['K', 'L', 'M', 'N'] -> O
['B'] -> C
['C', 'D', 'E', 'F', 'G'] -> H
['Q', 'R'] -> S
['T', 'U', 'V', 'W', 'X'] -> W
['D', 'E', 'F', 'G', 'H'] -> I
['B', 'C', 'D', 'E', 'F'] -> G
['C', 'D', 'E', 'F'] -> G
['C'] -> D
['K', 'L', 'M'] -> N
['B', 'C', 'D', 'E'] -> F
['N', 'O'] -> P
['P'] -> Q
['W'] -> X
['V', 'W', 'X'] -> Y
['C'] -> D
['O', 'P', 'Q', 'R', 'S'] -> T
