In [50]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Embedding,Flatten,Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from numpy import array

### define the documents and their class labels

In [51]:
# Ten short feedback phrases: the first five are praise, the last five are criticism.
docs = [
    'Well done!',
    'Good work',
    'Great effort',
    'nice work',
    'Excellent!',
    'Weak',
    'Poor effort!',
    'not good',
    'poor work',
    'Could have done better.',
]

# One binary label per document, aligned by position (1 for the praise
# phrases, 0 for the critical ones).
labels = array([1] * 5 + [0] * 5)

In [52]:
# Display the label vector: ten entries, five 1s followed by five 0s.
labels

array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

In [53]:
# Display the ten raw documents defined above.
docs

['Well done!',
 'Good work',
 'Great effort',
 'nice work',
 'Excellent!',
 'Weak',
 'Poor effort!',
 'not good',
 'poor work',
 'Could have done better.']

In [54]:
## Size of the integer index space: passed to one_hot when encoding the
## documents and used as the Embedding layer's vocabulary size.
voc_size = 10_000

## one hot representation

In [55]:
# Encode every document as a list of integer word indices bounded by voc_size.
# NOTE(review): one_hot appears to be hash-based, so distinct words could share
# an index and the indices may differ between kernel sessions; confirm against
# the Keras documentation.
onehot_rep = [one_hot(text, n=voc_size) for text in docs]
onehot_rep

[[8287, 5675],
 [8863, 1576],
 [8406, 1172],
 [5279, 1576],
 [6920],
 [6417],
 [429, 1172],
 [7130, 8863],
 [429, 1576],
 [5029, 6979, 5675, 9982]]

In [56]:
## Left-pad every encoded document with zeros to a common length of 8 so that
## the batch forms a rectangular integer matrix (the longest document has 4 words).
docs_length = 8
embedded_docs = pad_sequences(onehot_rep, maxlen=docs_length, padding='pre')
print(embedded_docs)

[[   0    0    0    0    0    0 8287 5675]
 [   0    0    0    0    0    0 8863 1576]
 [   0    0    0    0    0    0 8406 1172]
 [   0    0    0    0    0    0 5279 1576]
 [   0    0    0    0    0    0    0 6920]
 [   0    0    0    0    0    0    0 6417]
 [   0    0    0    0    0    0  429 1172]
 [   0    0    0    0    0    0 7130 8863]
 [   0    0    0    0    0    0  429 1576]
 [   0    0    0    0 5029 6979 5675 9982]]


In [57]:
### We are now ready to define the Embedding layer as the first layer of the network.
### The Embedding has a vocabulary of `voc_size` (10000) entries and an input length of
### `docs_length` (8). We choose a small embedding space of `dims` = 8 dimensions.
#### The model is a simple binary classifier. The Embedding layer outputs 8 vectors
#### (one per padded position) of 8 dimensions each; Flatten turns them into a single
#### 64-element vector that is passed on to the sigmoid Dense output unit.

dims = 8  # length of each learned word vector

## define the model
model = Sequential()
# Use `dims` rather than a repeated literal so the embedding width is set in one place.
model.add(Embedding(input_dim=voc_size, output_dim=dims, input_length=docs_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

## compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

## summarize the model
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 8, 8)              80000     
_________________________________________________________________
flatten_4 (Flatten)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 80,065
Trainable params: 80,065
Non-trainable params: 0
_________________________________________________________________


In [58]:
### Finally, train the classification model on the padded documents.

## fit the model: 50 epochs over the ten documents, with progress output silenced
model.fit(x=embedded_docs, y=labels, epochs=50, verbose=0)


<tensorflow.python.keras.callbacks.History at 0x128d4ed4f40>

In [59]:
# Sigmoid output of the trained model for each of the ten training documents.
predictions = model.predict(embedded_docs)
print(predictions)

[[0.52463216]
 [0.53970015]
 [0.5172006 ]
 [0.5480965 ]
 [0.5328357 ]
 [0.47571224]
 [0.48362216]
 [0.48075306]
 [0.51677233]
 [0.41849992]]


In [60]:
## Evaluate the model. This uses the same ten documents the model was trained
## on, so the numbers describe how well the training data was fit rather than
## performance on unseen text.
loss, accuracy = model.evaluate(x=embedded_docs, y=labels, verbose=0)
loss

0.6383388638496399

In [61]:
# Padded encoding of the first document ('Well done!'): six zeros, then its two word indices.
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0, 8287, 5675])

In [62]:
# Model output for the first document only, taken from the full batch prediction.
first_prediction = model.predict(embedded_docs)[0]
print(first_prediction)

[0.52463216]


In [63]:
# Accuracy returned by model.evaluate on the ten training documents (a fraction, not a percentage).
accuracy

0.8999999761581421

In [64]:
# Report the training-set accuracy scaled from a fraction to a percentage.
accuracy_percent = accuracy * 100
print('Accuracy: %f' % accuracy_percent)

Accuracy: 89.999998
