In [None]:
import numpy as np
import pandas as pd
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, SpatialDropout1D
from keras.layers import LSTM
from keras.datasets import imdb
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Shared settings used by the cells below.
max_words = 20_000  # passed as num_words to imdb.load_data and as the Embedding input size
line_length = 80    # maxlen handed to pad_sequences for both train and test rows
batch_size = 32     # batch size handed to model.fit

In [None]:
# Dataset source: http://ai.stanford.edu/~amaas/data/sentiment/
# Load the IMDB reviews as integer-encoded sequences, split into train and test;
# num_words=max_words caps the vocabulary size used for the encoding.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_words)

In [None]:
# number of training reviews (shape of x_train before padding)
x_train.shape

In [None]:
# token count of each of the first ten training reviews
for i, tokens in enumerate(x_train[:10]):
    print(i, len(tokens))

In [None]:
# distinct label values in y_train (sentiment, positive or negative)
np.unique(y_train)

In [None]:
# Lookup tables: word -> id as provided by the dataset, and the inverse id -> word.
word_to_id = imdb.get_word_index()
id_to_word = dict(zip(word_to_id.values(), word_to_id.keys()))

In [None]:
# id stored for the word 'this' in the word index
word_to_id['this']

In [None]:
# reverse lookup: the word stored under id 11
id_to_word[11]

In [None]:
# Give id 0 a printable placeholder in the reverse lookup table.
# NOTE(review): presumably id 0 is otherwise absent from the word index — confirm.
id_to_word[0] = '<START>'

In [None]:
def review(index):
    """Decode training review ``index`` back into a space-separated string.

    ``imdb.load_data`` (called with its defaults) shifts every word rank by 3
    and reserves the low ids as markers: 0 = padding, 1 = start of review,
    2 = out-of-vocabulary word. Those markers are rendered as ``<PAD>``,
    ``<START>`` and ``<UNK>`` instead of all collapsing onto one token, and
    the function no longer depends on ``id_to_word[0]`` having been set.

    Parameters
    ----------
    index : int
        Row of ``x_train`` to decode.

    Returns
    -------
    str
        The decoded words joined by single spaces.
    """
    markers = {0: '<PAD>', 1: '<START>', 2: '<UNK>'}
    words = []
    for idx in x_train[index]:
        idx = int(idx)  # plain int so numpy scalars behave as dict keys
        rank = idx - 3  # undo the index_from offset applied by load_data
        if rank > 0:
            words.append(id_to_word.get(rank, '<UNK>'))
        else:
            # ids 0..2 are markers; id 3 is unused by the encoding
            words.append(markers.get(idx, '<UNK>'))
    return ' '.join(words)

In [None]:
# decode the first training review back into text
review(0)

In [None]:
# label of the first training review
y_train[0]

In [None]:
# Bring every review to exactly line_length tokens in both splits.
x_train, x_test = (
    sequence.pad_sequences(split, maxlen=line_length)
    for split in (x_train, x_test)
)

In [None]:
# shape of x_train after pad_sequences
x_train.shape

In [None]:
# first two rows of x_train
x_train[:2]

In [None]:
# labels of the first two training reviews
y_train[:2]

In [None]:
# Embedding -> LSTM -> single sigmoid unit for binary sentiment.
model = Sequential([
    Embedding(max_words, 128),
    LSTM(128, dropout=0.5, recurrent_dropout=0.5),
    Dense(1, activation='sigmoid'),
])
model.summary()

In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# ------------------------------------------------
# Change `epochs` below to get higher accuracy.
# ------------------------------------------------
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=2,
    batch_size=batch_size,
    validation_data=(x_test, y_test),
)

In [None]:
# Training curves: loss on the left axis, train/validation accuracy on one
# shared right axis.
# Older Keras logs accuracy under 'acc'/'val_acc'; newer releases use
# 'accuracy'/'val_accuracy'. Pick whichever key this run produced.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

fig, ax1 = plt.subplots(1, 1, figsize=(12, 7))
ax1.plot(history.epoch, history.history['loss'], marker='^', color='purple', label='loss')
ax1.set_xlabel('epochs')
ax1.set_ylabel('loss', color='purple')
ax1.tick_params('y', colors='purple')

# Draw both accuracy curves explicitly on a single twin axis instead of relying
# on pyplot's "current axes" and stacking two overlapping twin axes.
ax2 = ax1.twinx()
ax2.plot(history.epoch, history.history[acc_key], marker='+', color='green', label='train')
ax2.plot(history.epoch, history.history['val_' + acc_key], marker='*', color='red', label='validation')
ax2.set_ylim(0, 1)
ax2.set_ylabel('accuracy')

fig.suptitle('imdb sentiment reviews');
fig.legend();

In [None]:
# Find the embedding layer by type rather than by its auto-generated name:
# the name ('embedding_1', 'embedding_2', 'embedding', ...) changes whenever the
# model cell is re-run in the same kernel and differs between Keras versions.
embedding_layer = next(layer for layer in model.layers if isinstance(layer, Embedding))
embedding = embedding_layer.get_weights()
embedding[0].shape

```

"Word embeddings" are a family of natural language processing techniques aiming at mapping semantic meaning into a geometric space. This is done by associating a numeric vector to every word in a dictionary, such that the distance (e.g. L2 distance or more commonly cosine distance) between any two vectors would capture part of the semantic relationship between the two associated words. The geometric space formed by these vectors is called an embedding space.

For instance, "coconut" and "polar bear" are words that are semantically quite different, so a reasonable embedding space would represent them as vectors that would be very far apart. But "kitchen" and "dinner" are related words, so they should be embedded close to each other.

The most common application of this layer is for text processing. Let's see a simple example. Our training set consists only of two phrases:

Hope to see you soon
Nice to see you again

So we can encode these phrases by assigning each word a unique integer number (by order of appearance in our training dataset for example). Then our phrases could be rewritten as:

[0, 1, 2, 3, 4]
[5, 1, 2, 3, 6]

Now imagine we want to train a network whose first layer is an embedding layer. In this case, we should initialize it as follows:

Embedding(7, 2, input_length=5)

The first argument (7) is the number of distinct words in the training set. The second argument (2) indicates the size of the embedding vectors. The input_length argument, of course, determines the size of each input sequence.

Once the network has been trained, we can get the weights of the embedding layer, which in this case will be of size (7, 2) and can be thought of as the table used to map integers to embedding vectors:

+------------+------------+
|   index    |  Embedding |
+------------+------------+
|     0      | [1.2, 3.1] |
|     1      | [0.1, 4.2] |
|     2      | [1.0, 3.1] |
|     3      | [0.3, 2.1] |
|     4      | [2.2, 1.4] |
|     5      | [0.7, 1.7] |
|     6      | [4.1, 2.0] |
+------------+------------+

So according to these embeddings, our second training phrase will be represented as:

[[0.7, 1.7], [0.1, 4.2], [1.0, 3.1], [0.3, 2.1], [4.1, 2.0]]

- Word embeddings provide a dense representation of words and their relative meanings.
- They are an improvement over the sparse representations used in simpler bag-of-words models.
- Word embeddings can be learned from text data and reused among projects. They can also be learned as part of fitting a neural network on text data.

```

In [None]:
# predict sentiment from reviews
def encode_text(text):
    """Encode raw text the same way imdb.load_data encoded the training data.

    With load_data's defaults each sequence starts with the marker 1, every
    word rank from the word index is shifted by 3, and words that are unknown
    or fall outside the max_words vocabulary become the marker 2.

    Parameters
    ----------
    text : str
        Whitespace-separated words; matching against the word index is done
        in lower case.

    Returns
    -------
    list of int
        Token ids ready to be passed to pad_sequences.
    """
    token_ids = [1]  # start-of-review marker
    for word in text.lower().split():
        rank = word_to_id.get(word)
        token = 2 if rank is None else rank + 3
        if token >= max_words:
            token = 2  # outside the vocabulary the model was trained on
        token_ids.append(token)
    return token_ids


bad = "this movie was terrible and bad"
good = "i really liked the movie and had fun"
# Loop variable is named `text` so it does not overwrite the review() function.
for text in [good, bad]:
    padded = sequence.pad_sequences([encode_text(text)], maxlen=line_length)
    print("{}. Sentiment: {:.2f}".format(text, model.predict(padded)[0][0]))