In [10]:
import numpy as np
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding

In [11]:
# Toy corpus: the first five reviews are positive, the last five negative.
reviews = [
    'nice food',
    'amazing restaurant',
    'too good',
    'just love it!',
    'will do again',
    'horrible food',
    'never go there',
    'poor service',
    'poor quality',
    'needs improvement',
]

# Labels aligned with `reviews` by position: 1 = positive, 0 = negative.
sentiment = np.array([1] * 5 + [0] * 5)

In [16]:
# Next, turn text into numbers with Keras' one_hot. Each word is mapped to an
# integer index bounded by the second argument (30 here).
# Note: the indices are NOT guaranteed to be unique per word. In the encoded
# reviews shown further down, several different words share the same index.
one_hot('amazing restaurant', 30)

[15, 4]

In [18]:
# Apply the same encoding to every review, using one shared index range.
vocab_size = 30
encoded_reviews = [one_hot(text, vocab_size) for text in reviews]

# print() gives the compact one-line form; the trailing bare expression gives
# the notebook's formatted, one-review-per-line display.
print(encoded_reviews)
encoded_reviews

[[4, 8], [15, 4], [13, 1], [15, 11, 5], [21, 11, 29], [5, 8], [8, 29, 13], [7, 4], [7, 19], [8, 14]]


[[4, 8],
 [15, 4],
 [13, 1],
 [15, 11, 5],
 [21, 11, 29],
 [5, 8],
 [8, 29, 13],
 [7, 4],
 [7, 19],
 [8, 14]]

In [21]:
# The encoded reviews have different lengths (two or three words here), but the
# model needs every input to have the same length, so shorter sequences are
# padded with zeros. padding='post' appends the zeros at the end.
#
# max_length is derived from the data rather than hard-coded: pad_sequences
# truncates any sequence longer than maxlen, so a fixed value would silently
# drop words if a longer review were added. For the current corpus this is 3.
max_length = max(len(seq) for seq in encoded_reviews)
padded_reviews = pad_sequences(encoded_reviews, maxlen=max_length, padding='post')
padded_reviews

array([[ 4,  8,  0],
       [15,  4,  0],
       [13,  1,  0],
       [15, 11,  5],
       [21, 11, 29],
       [ 5,  8,  0],
       [ 8, 29, 13],
       [ 7,  4,  0],
       [ 7, 19,  0],
       [ 8, 14,  0]])

In [29]:
# Number of dimensions in the vector learned for each word index.
embedded_vector_size = 4

# Embedding -> Flatten -> one sigmoid unit for the binary sentiment label.
# The embedding layer is named so its weights can be fetched by name later.
model = Sequential([
    Embedding(vocab_size, embedded_vector_size, input_length=max_length, name='embedding'),
    Flatten(),
    Dense(1, activation='sigmoid'),
])


In [30]:
# Features are the padded index sequences; targets are the 0/1 sentiment labels.
X, y = padded_reviews, sentiment


In [31]:
# Configure training: Adam optimizer, binary cross-entropy loss for the single
# sigmoid output, and accuracy as the reported metric. Then print the layer
# and parameter summary.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 4)              120       
                                                                 
 flatten_2 (Flatten)         (None, 12)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 13        
                                                                 
Total params: 133
Trainable params: 133
Non-trainable params: 0
_________________________________________________________________


In [32]:
# Train for 50 epochs; verbose=0 suppresses the per-epoch progress output.
# The returned History object is the cell's displayed value.
model.fit(x=X, y=y, epochs=50, verbose=0)

<keras.callbacks.History at 0x230e65bbaf0>

In [33]:
# Evaluate the model. X and y are the same arrays that were passed to fit(),
# so these are training-set figures, not a measure of performance on unseen text.
loss, accuracy = model.evaluate(x=X, y=y)



In [36]:
# Fetch the learned embedding matrix from the layer named 'embedding'.
# get_weights() returns a list; element 0 is the matrix with one row per word index.
weights = model.get_layer('embedding').get_weights()[0]

# Number of rows in the matrix (one per index in the vocab_size range).
# A bare `weights` expression used to sit above this line; it displayed nothing,
# because a cell only shows its final expression, so it was removed.
len(weights)

30

In [37]:
# Inspect the learned embedding vector for the positive word 'nice', to compare
# it with the vector for 'amazing' in the next cell. This is a look at the
# learned weights, not a model prediction.
#
# The row index is looked up with one_hot instead of being hard-coded: Keras
# documents one_hot's default hash as not stable across runs, so the index seen
# in one session (4 in the recorded output) can differ after a kernel restart.
# Caveat: indices can collide. In the recorded output 'restaurant' is also
# encoded as 4, so this row is shared by both words.
weights[one_hot('nice', vocab_size)[0]]

array([-0.02453337,  0.02097336,  0.06079375, -0.02149011], dtype=float32)

In [38]:
# Embedding vector for the positive word 'amazing'. The row index is looked up
# with one_hot rather than hard-coded (it was 15 in the recorded session),
# because Keras documents one_hot's default hash as not stable across runs.
# Caveat: in the recorded output 'just' is also encoded as 15, so this row is
# shared by both words.
weights[one_hot('amazing', vocab_size)[0]]

array([-0.01397455,  0.02096418,  0.07142939, -0.01550776], dtype=float32)