In [20]:
import numpy as np
from keras import Sequential
from keras.datasets import imdb
from keras.layers import Dense, Embedding, Flatten, Input, SimpleRNN
from keras.utils import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [21]:
# Toy corpus: four short sentences of different lengths, used to demonstrate
# tokenisation, padding and the Embedding layer.
docs = [
    'Aoa',
    'good muslim',
    'nice work keep it up',
    'very good work',
]

In [22]:
# Build the word -> integer-id vocabulary from the toy corpus.
# Words not seen during fitting are mapped to the 'nothing' placeholder token.
tokenizer = Tokenizer(oov_token='nothing')
tokenizer.fit_on_texts(texts=docs)

In [24]:
# Number of entries in the fitted vocabulary: the distinct words in `docs`
# plus the OOV placeholder (the padding id 0 is not part of word_index).
len(tokenizer.word_index)

10

In [25]:
# Replace every word of every sentence with its integer id from the
# fitted vocabulary; the result is a list of variable-length id lists.
sequences = tokenizer.texts_to_sequences(texts=docs)
sequences

[[4], [2, 5], [6, 3, 7, 8, 9], [10, 2, 3]]

In [26]:
# Right-pad ('post') each id sequence with zeros so that all rows have the
# same length as the longest sentence (the 3rd sentence of docs, 5 words).
sequences = pad_sequences(sequences, padding='post')
sequences

array([[ 4,  0,  0,  0,  0],
       [ 2,  5,  0,  0,  0],
       [ 6,  3,  7,  8,  9],
       [10,  2,  3,  0,  0]], dtype=int32)

In [36]:
# Number of rows in the embedding table: every id in tokenizer.word_index
# (the distinct words of docs plus the OOV placeholder) and one extra row
# for the padding id 0 -> 11 for this toy corpus.
vocab_size = len(tokenizer.word_index) + 1

model = Sequential()
# Declare the input with an explicit Input layer. Passing `input_shape`
# directly to Embedding is deprecated in Keras 3 and emits a UserWarning.
# The input length is the width of the padded sequences.
model.add(Input(shape=(sequences.shape[1],)))
# Every token id is converted to a 2-number dense vector; the best
# output_dim is normally found by tuning.
model.add(Embedding(vocab_size, output_dim=2))

model.summary()

  super().__init__(**kwargs)


In [37]:
# Run the (untrained) embedding on the padded sentences. Each token id
# becomes a 2-number vector, so every sentence yields 5 such pairs and
# identical ids (e.g. the padding zeros) map to identical vectors.
pred = model.predict(sequences)
print(pred)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[[[-0.04939089 -0.01359441]
  [ 0.03086294 -0.01960527]
  [ 0.03086294 -0.01960527]
  [ 0.03086294 -0.01960527]
  [ 0.03086294 -0.01960527]]

 [[-0.00052194  0.02268868]
  [ 0.01793129  0.00354175]
  [ 0.03086294 -0.01960527]
  [ 0.03086294 -0.01960527]
  [ 0.03086294 -0.01960527]]

 [[ 0.02690903  0.0347026 ]
  [-0.03458896  0.01581844]
  [-0.00793725 -0.02022902]
  [-0.00548432 -0.03425213]
  [-0.00734059  0.03195846]]

 [[ 0.02559089  0.01872673]
  [-0.00052194  0.02268868]
  [-0.03458896  0.01581844]
  [ 0.03086294 -0.01960527]
  [ 0.03086294 -0.01960527]]]


In [49]:
# Load the IMDB reviews, keeping only the 10000 most frequent words to keep
# the model small (the original line was missing the opening parenthesis of
# the first tuple and did not parse).
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)

In [50]:
# Bring every review to exactly 50 token ids: shorter reviews are zero-padded
# on the right ('post'); longer ones are cut down to 50 using the default
# truncation mode of pad_sequences.
X_train = pad_sequences(X_train, maxlen=50, padding='post')
X_test = pad_sequences(X_test, maxlen=50, padding='post')

In [51]:
# Sanity check: after padding, every review should be a row of 50 token ids.
X_train.shape

(25000, 50)

In [58]:
# Sentiment classifier: embedding -> simple recurrent layer -> sigmoid output.
# (`Input` is imported with the other Keras layers in the import cell at the
# top of the notebook instead of mid-notebook.)
model = Sequential([
    # Explicit input shape. It must be a tuple; the batch dimension (None)
    # is added by Keras automatically.
    Input(shape=(X_train.shape[1],)),
    # 10000 matches num_words used when loading the data, so every token id
    # has a row in the table; each id becomes a 2-number vector.
    Embedding(10000, 2),
    # Only the final hidden state is passed on (return_sequences=False).
    SimpleRNN(32, return_sequences=False),
    # Single sigmoid unit for the binary sentiment label.
    Dense(1, activation='sigmoid'),
])
model.summary()

In [59]:
# Batch size is left at the default of 32, so one epoch is
# ceil(25000 / 32) = 782 steps.
# NOTE: the test split is also used as validation data here, so the val_*
# metrics are not an independent hold-out estimate.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=5,
)

Epoch 1/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 16ms/step - acc: 0.5258 - loss: 0.6839 - val_acc: 0.7763 - val_loss: 0.4799
Epoch 2/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 17ms/step - acc: 0.8152 - loss: 0.4187 - val_acc: 0.8031 - val_loss: 0.4274
Epoch 3/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 18ms/step - acc: 0.8648 - loss: 0.3298 - val_acc: 0.8112 - val_loss: 0.4234
Epoch 4/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 16ms/step - acc: 0.8848 - loss: 0.2932 - val_acc: 0.8050 - val_loss: 0.4336
Epoch 5/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 19ms/step - acc: 0.9056 - loss: 0.2559 - val_acc: 0.8001 - val_loss: 0.4533
