In [None]:
########################## Word Embeddings ##########################

In [None]:
# Importing required libraries

import numpy as np
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding

In [None]:
# Toy food-review corpus: the first five reviews are positive, the last five negative.

reviews = [
    'nice food',
    'amazing restaurant',
    'too good',
    'just loved it!',
    'will go again',
    'horrible food',
    'never go there',
    'poor service',
    'poor quality',
    'needs improvement',
]  # features


# One label per review, in the same order (1 = positive, 0 = negative).
sentiment = np.array([1] * 5 + [0] * 5)  # labels

In [None]:
# Demo: one_hot maps every word of a review to an integer index below 30.
sample_review = "nice food"
one_hot(sample_review, 30)

[17, 9]

In [None]:
# Size of the index space the words are hashed into.
# NOTE: distinct words may end up sharing an index when this is small.
vocab_size = 30

# One list of word indices per review.
encoded_reviews = [
    one_hot(review, vocab_size)
    for review in reviews
]
print(encoded_reviews)

[[17, 9], [4, 11], [18, 19], [25, 28, 10], [8, 28, 19], [26, 9], [7, 28, 16], [7, 11], [7, 5], [13, 15]]


In [None]:
# Reviews contain different numbers of words, so pad each encoded review with
# trailing zeros until every row has exactly max_length entries.
max_length = 4
padded_revs = pad_sequences(
    encoded_reviews,
    maxlen=max_length,
    padding='post',
)
print(padded_revs)

[[17  9  0  0]
 [ 4 11  0  0]
 [18 19  0  0]
 [25 28 10  0]
 [ 8 28 19  0]
 [26  9  0  0]
 [ 7 28 16  0]
 [ 7 11  0  0]
 [ 7  5  0  0]
 [13 15  0  0]]


In [None]:
embeded_vector_size = 5  # length of the vector learned for each word index

# Pipeline: embedding lookup -> flatten the per-review vectors -> one sigmoid unit.
model = Sequential([
    Embedding(
        vocab_size,
        embeded_vector_size,
        input_length=max_length,
        name="embedding",  # named so the layer can be fetched by name later
    ),
    Flatten(),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Training inputs are the padded index sequences; targets are the sentiment flags.

features, labels = padded_revs, sentiment

In [None]:
# Compile with binary cross-entropy because each label is either 1 or 0,
# and track accuracy while training.

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line after the table.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 5)              150       
                                                                 
 flatten_1 (Flatten)         (None, 20)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 21        
                                                                 
Total params: 171
Trainable params: 171
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Train silently (verbose=0) for 50 epochs on the padded reviews.
model.fit(
    x=features,
    y=labels,
    epochs=50,
    verbose=0,
)

<keras.callbacks.History at 0x7e701a2f9150>

In [None]:
# Score the model on the same features/labels it was trained on, then show the accuracy.
loss, accuracy = model.evaluate(x=features, y=labels)
accuracy



1.0

In [None]:
# Fetch the layer named "embedding"; its first weight array is the matrix of
# embedded vectors, one row per word index.
embedding_layer = model.get_layer('embedding')
weights = embedding_layer.get_weights()[0]
len(weights)

30

In [None]:
# Embedding vector learned for the word 'nice'.
# The row index is looked up rather than hard-coded: one_hot assigns indices by
# hashing, so the number seen in one session is not guaranteed to be the same
# after a kernel restart.
weights[one_hot('nice', vocab_size)[0]]

array([-0.03723816, -0.04700213, -0.04930208, -0.0984641 , -0.08253745],
      dtype=float32)

In [None]:
# Embedding vector learned for the word 'amazing'.
# The row index is looked up rather than hard-coded: one_hot assigns indices by
# hashing, so the number seen in one session is not guaranteed to be the same
# after a kernel restart.
weights[one_hot('amazing', vocab_size)[0]]

array([-0.09370555, -0.05406364, -0.02081859, -0.04902738, -0.09160612],
      dtype=float32)

In [None]:
# The learned vectors for 'nice' and 'amazing' are quite different even though the two words have similar meanings, because the dataset is far too small for the model to learn that similarity.