# User rating based on the review - RNN

In [1]:
import pickle

import numpy as np

from sklearn.model_selection import train_test_split

import tensorflow as tf

# Only show TensorFlow log messages at ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)

# Seed NumPy as well as TensorFlow. Standalone Keras' TensorFlow backend draws
# op-level seeds from NumPy's global RNG when a layer is given no explicit seed,
# so seeding TensorFlow alone leaves weight initialisation and dropout masks
# different on every run. This is best-effort: multi-threaded or GPU kernels can
# still introduce run-to-run variation.
np.random.seed(47)
tf.random.set_random_seed(47)

from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

Using TensorFlow backend.


In [2]:
# --- Text representation ---
# Size of the vocabulary: passed to the tokenizer as `num_words` and to the
# embedding layer as its input dimension.
max_features = 20_000
# Every review is padded or truncated to exactly this many word indices.
maxlen = 100

# --- Training schedule ---
epochs = 4
batch_size = 256

We already preprocessed the text data when training the TF-IDF-based classifier, so here we can simply load it and skip the preprocessing step.

In [3]:
# NOTE: pickle can execute arbitrary code while loading; only open files that
# were produced by this project.

# Preprocessed review texts (one string per review).
with open("auxiliary/reviews_dataset_preprocessed.pkl", "rb") as texts_file:
    X = pickle.load(texts_file)

# The original dataset pickle holds a pair; only its second element (the
# ratings) is needed here, the first is discarded.
with open("auxiliary/reviews_dataset.pkl", "rb") as dataset_file:
    _, y = pickle.load(dataset_file)

# Peek at the first five reviews and their ratings.
X[:5], y[:5]

(['потрясающе красивый график космос игра полюбить наличие интересный осмысленный сюжет удобный управление',
  'заметный труд область теоретический антропология этнография язык изложение доступный качество перевод достойный замечательный серия этнографический библиотека',
  'отличный продолжение отличный игра',
  'графика неплохой остальное скучно тупо проехать определенный участок хороший управление вызывать никакой приятный ощущение игра график',
  'таки удерживаться написать топ скоро стереть прочитывать ужасный игра star wars empire at war dvd поискать нормальный сюжет стратегия вовсе провал убеждаться браться план игра отстой хороший стратегия эпизод star wars star wars galactic battlegrounds играть понимать'],
 [5, 5, 5, 3, 1])

In [4]:
# Hold out 20% of the reviews. Stratifying on the rating keeps the rating
# distribution the same in both parts; the fixed random_state makes the split
# repeatable.
split_options = dict(test_size=0.2, random_state=37, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Keep the raw text of one training review: X_train is later replaced by
# integer sequences, and this sample is used to illustrate the encoding.
example_idx = 8
example_review = X_train[example_idx]

In [5]:
# Build the vocabulary from the training split only, so no information from
# the held-out reviews leaks into the word ranking.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

# word_index maps each word to its integer index. It is not limited to
# `max_features` entries (the count printed below is far larger); the
# `num_words` cap takes effect when texts are converted to sequences.
word_index = tokenizer.word_index
print(f"Found {len(word_index)} unique tokens")

Found 203040 unique tokens


In [6]:
# Encode each review as a list of word indices and bring every list to exactly
# `maxlen` entries, yielding one 2-D integer array per split.
# NOTE: this rebinds X_train / X_test from raw text to integer arrays, so the
# cell cannot be re-run on its own; re-run the split cell first.
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=maxlen)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=maxlen)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

X_train shape: (615400, 100)
X_test shape: (153851, 100)


In [7]:
# Ratings are 1..5, while to_categorical() expects class ids in
# 0..num_classes-1, hence the shift by one.
# num_classes is given explicitly so that both splits always get the same
# width as the 5-unit softmax output layer; otherwise the width is inferred
# from the largest label present in each array.
# NOTE: this rebinds y_train / y_test, so re-running the cell on its own would
# shift and re-encode the already one-hot arrays; re-run the split cell first.
num_classes = 5
y_train = to_categorical(np.array(y_train) - 1, num_classes=num_classes)
y_test = to_categorical(np.array(y_test) - 1, num_classes=num_classes)

print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

y_train shape: (615400, 5)
y_test shape: (153851, 5)


Here's the final representation of the data, in the form in which it will be fed to our network:

In [8]:
# Encoded form of the sample review: a fixed-length row of `maxlen` word indices.
X_train[example_idx]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  6, 46, 13],
      dtype=int32)

Everything before the last 4 numbers is zero padding. The meaningful indices at the end correspond to the words of the following review:

In [9]:
# Raw text of the same review, captured before X_train was replaced by sequences.
example_review

'купить отличный вещь удобный'

Let's make sure it corresponds to the representation above by looking up word indices.

In [10]:
# Print each word of the sample review next to its index in the tokenizer
# vocabulary, for comparison with the encoded row shown earlier.
# Splitting on whitespace matches the tokenizer's output for this sample;
# a word absent from word_index would raise KeyError here.
for word in example_review.split():
    print(f"{word} : {word_index[word]}")

купить : 4
отличный : 6
вещь : 46
удобный : 13


In [11]:
# Network: word indices -> 128-d embeddings -> a single LSTM (128 units) ->
# softmax over the 5 rating classes.
# A disabled alternative from the original cell, kept for reference:
# Bidirectional(LSTM(64)) followed by Dropout(0.5) in place of the LSTM layer.
model = Sequential(
    [
        Embedding(max_features, 128, input_length=maxlen),
        # dropout applies to the layer inputs, recurrent_dropout to the
        # recurrent state.
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(5, activation="softmax"),
    ]
)

# One-hot targets -> categorical cross-entropy; accuracy is tracked as a metric.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 128)          2560000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 645       
Total params: 2,692,229
Trainable params: 2,692,229
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Write the weights to disk every time validation accuracy reaches a new
# maximum, so the best epoch survives even if later epochs overfit.
# NOTE(review): the monitored key must match what this Keras version logs;
# newer releases report "val_accuracy" instead of "val_acc" when the model is
# compiled with metrics=["accuracy"]. Confirm against history.history.keys().
checkpoint = ModelCheckpoint(
    "auxiliary/lstm_best_weights.h5",
    monitor="val_acc",
    verbose=1,
    save_best_only=True,
    mode="max",
)

# The checkpoint callback has to be handed to fit(); without `callbacks=` it is
# never invoked and no weights file is produced.
# NOTE(review): the held-out test split doubles as validation data here, so the
# checkpointed "best" epoch is selected on the test set.
history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
    callbacks=[checkpoint],
)

Train on 615400 samples, validate on 153851 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
