<a href="https://colab.research.google.com/github/didi64/Colab_Test/blob/main/Classify_Reviews2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os, sys
from google.colab import drive
# Root folder of the project on the mounted Google Drive.
ROOT = '/content/drive/My Drive/CAS_Gregi/'
drive.mount('/content/drive', force_remount=True)

# Expose the project's module folder under a short, space-free path.
nb_path = '/content/notebooks'
# os.symlink raises FileExistsError if the link is already there, which
# happens whenever this cell is run a second time in the same runtime.
if not os.path.lexists(nb_path):
    os.symlink(ROOT + 'CO_modules', nb_path)
# Avoid stacking duplicate entries on sys.path when the cell is re-run.
if nb_path not in sys.path:
    sys.path.insert(0, nb_path)

Mounted at /content/drive


In [2]:
import pickle
# Load the pre-cleaned train/test split. The pickle holds a pair of objects;
# later cells treat each as a dict {score: list_of_reviews}.
# NOTE: pickle.load can execute arbitrary code -- only use it on trusted files.
reviews_pickle = ROOT + 'data/hotelreviews_cleaned_train_test_spell.pkl'
with open(reviews_pickle, 'rb') as pkl_file:
    data_train, data_test = pickle.load(pkl_file)

In [3]:
import random
import numpy as np
import tensorflow as tf
from functools import reduce

In [4]:
def fix_shape(text):
    '''Wrap a single text in a tensor of shape (1,).

    This is the same per-element shape that get_dataset gives each review,
    so the result can be passed to the model directly.
    '''
    single_item_shape = (1,)
    return tf.constant(text, shape=single_item_shape)

def get_dataset(d, seed=None):
    '''Build a shuffled tf.data.Dataset from a dict {score: list_of_reviews, ...}.

    Args:
        d: dict mapping a numeric score to a list of review strings.
        seed: optional seed for a reproducible shuffle. With the default
            (None) the module-level random generator is used, as before.

    Returns:
        An unbatched tf.data.Dataset of (review, label) pairs, where review
        is a string tensor of shape (1,) and label is score - 1.
    '''
    # list of tuples [(review, score), ...]
    all_reviews = [(review, score) for score, texts in d.items() for review in texts]

    if seed is None:
        random.shuffle(all_reviews)
    else:
        # A private generator keeps the global random state untouched.
        random.Random(seed).shuffle(all_reviews)

    # Explicit shape so that every dataset element is a (1,)-shaped string tensor.
    reviews = tf.constant([review for review, _ in all_reviews],
                          shape=(len(all_reviews), 1))
    # Shift the scores down by one to obtain the training labels.
    labels = tf.constant([score - 1 for _, score in all_reviews])

    return tf.data.Dataset.from_tensor_slices((reviews, labels))

In [5]:
# Build the (unbatched) train and test datasets, in that order.
ds_train, ds_test = (get_dataset(split) for split in (data_train, data_test))

In [6]:
# All (review, score) tuples from both the train and the test split.
all_reviews = [
    (review, score)
    for d in (data_train, data_test)
    for score, texts in d.items()
    for review in texts
]
# Set of distinct whitespace-separated tokens over all reviews.
# A set comprehension also yields an empty set when there are no reviews.
WORDS = {word for review, _ in all_reviews for word in review.split()}

In [7]:
# Number of distinct whitespace-separated tokens across the train and test reviews.
len(WORDS)

9033

In [8]:
# `standardize` takes one of the entries of `standardizations`, or a callable
# that maps strings to strings.
standardizations = [None, "lower_and_strip_punctuation", "lower", "strip_punctuation"]
max_tokens = 10_000
ngrams = None
sequence_length = 200

# Collect the vectorizer settings in one place before building the layer.
tv_options = dict(
    max_tokens=max_tokens,
    standardize=standardizations[0],  # index 0 is None
    ngrams=ngrams,
    output_mode='int',
    output_sequence_length=sequence_length,
    pad_to_max_tokens=True,
)
tv_layer = tf.keras.layers.TextVectorization(**tv_options)

In [9]:
# Fit the vectorizer's vocabulary on the text of every review (train and test).
review_texts = [text for text, _ in all_reviews]
tv_layer.adapt(review_texts)
vocab = tv_layer.get_vocabulary()
# Vocabulary size actually learned; it is capped by max_tokens, not equal to it.
len(vocab)

9035

In [10]:
# Vectorize one training review (entry 30 of the list stored under key 3) to
# inspect the integer token ids; the result is zero-padded to sequence_length.
tv_layer(data_train[3][30])

<tf.Tensor: shape=(200,), dtype=int64, numpy=
array([  16,    8,  436,   82,  163,    5,  365,  222,   37,   21,    2,
        377,  259,   36,   20,    5,  276,   10,  104, 2640,   21,    2,
        377,  466,   16,   20,  528,    2,  207,   55,    5,  118,  275,
          3,   44,  105,  210,  192, 1695,    6,  316,    8,    2,  115,
          7,   13,    2,   94,   25,   65,  264,   42,  367,    3,  586,
          5,  664,    4,  392,    8,    2,    7,   36,   14,   40, 1155,
        107,  316,   17,  429,   29,  296,    5,  531,    3,    5,  357,
        113,   28,   89,   28,  291,  398,    6, 4099,    5, 1109, 3744,
          2,    7,    4,   52,    3,  722,    3,    2,   41,    4,   98,
         30,  106,  544,    8,    2,   67,    3,  738,  585, 1656,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
     

In [11]:
# Size of each learned token embedding vector.
output_dim = 16
# The embedding table has max_tokens + 1 rows, i.e. one more than the
# vectorizer's token cap.
em_layer = tf.keras.layers.Embedding(max_tokens + 1, output_dim)
# Quick check: vectorize a short text and embed it.
em_layer(tv_layer('what a hotel'))

<tf.Tensor: shape=(200, 16), dtype=float32, numpy=
array([[ 2.3678217e-02, -2.1151638e-02, -2.3410155e-02, ...,
         4.5185115e-02,  4.9323205e-02,  2.0979334e-02],
       [-2.6473487e-02, -9.7542033e-03,  4.6490718e-02, ...,
         2.7178898e-03,  2.2125531e-02, -1.7344214e-02],
       [-1.5140474e-02, -3.6553599e-02,  5.3130090e-05, ...,
        -7.5159669e-03,  2.5050528e-03,  2.0912077e-02],
       ...,
       [-4.3067422e-02, -3.4635913e-02,  2.3401227e-02, ...,
         4.9287785e-02, -1.7889142e-02, -1.5301332e-03],
       [-4.3067422e-02, -3.4635913e-02,  2.3401227e-02, ...,
         4.9287785e-02, -1.7889142e-02, -1.5301332e-03],
       [-4.3067422e-02, -3.4635913e-02,  2.3401227e-02, ...,
         4.9287785e-02, -1.7889142e-02, -1.5301332e-03]], dtype=float32)>

In [12]:
# Text -> token ids -> embeddings -> LSTM -> single regression output.
model = tf.keras.Sequential()
model.add(tv_layer)
model.add(em_layer)
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.LSTM(10))
model.add(tf.keras.layers.Dropout(0.2))
# NOTE(review): the LSTM output is already (None, 10), so this Flatten looks
# like a no-op that only separates two consecutive Dropout layers -- confirm.
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dropout(0.2))
# NOTE(review): relu clamps predictions to >= 0; consider a linear output
# for this regression target if training stalls.
model.add(tf.keras.layers.Dense(1, activation = 'relu'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 200)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 200, 16)           160016    
                                                                 
 dropout (Dropout)           (None, 200, 16)           0         
                                                                 
 lstm (LSTM)                 (None, 10)                1080      
                                                                 
 dropout_1 (Dropout)         (None, 10)                0         
                                                                 
 flatten (Flatten)           (None, 10)                0         
                                                        

In [13]:
# Regression setup: optimize mean squared error, report mean absolute error.
loss = tf.keras.losses.MeanSquaredError()
metric = tf.keras.metrics.MeanAbsoluteError(name='mean_absolute_error', dtype=None)
model.compile(
    loss=loss,
    metrics=[metric],
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
)

In [14]:
# Batch the train and validation data if this has not already happened.
# An unbatched element's review tensor has rank 1 (shape (1,), see
# get_dataset); after .batch() it has rank 2. Checking the static
# element_spec keeps this cell safe to re-run, works on an empty dataset and
# does not mistake a batch holding a single review for unbatched data.
BATCH_SIZE = 32

if ds_train.element_spec[0].shape.rank == 1:
    ds_train = ds_train.batch(BATCH_SIZE)

if ds_test.element_spec[0].shape.rank == 1:
    ds_test = ds_test.batch(BATCH_SIZE)

In [15]:
# Run every training batch through the model once, printing the batch count
# at every tenth batch.
i = 0
for i, (review, _) in enumerate(iter(ds_train), start=1):
    if not i % 10:
        print(i, end = ', ')
    model(review)

10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 

In [None]:
# Keep only the weights of the best epoch seen so far during training.
checkpoint_filepath = ROOT + '/My_Models/lstm10/checkpoints'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    # The model is compiled with an MSE loss and a 'mean_absolute_error'
    # metric only; no accuracy metric exists, so monitoring 'val_accuracy'
    # would never save a checkpoint. Lower MAE is better, hence mode='min'.
    monitor='val_mean_absolute_error',
    mode='min',
    save_best_only=True)

In [None]:
# The model weights (that are considered the best) are loaded into the
# model.
# model.load_weights(checkpoint_filepath)

In [None]:
# Train with per-epoch validation on the test split, then store the weights
# the model holds when training ends.
final_weights_path = ROOT + 'data/models/lstm10'
fit_options = dict(
    epochs = 500,
    validation_data = ds_test,
    callbacks=[model_checkpoint_callback],
)
model.fit(ds_train, **fit_options)

model.save_weights(final_weights_path)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

In [None]:
# Peek at the first element of ds_train: a (reviews, labels) pair.
next(iter(ds_train))

In [18]:
# Evaluate on the test split; returns the loss followed by the compiled
# metric (mean squared error, then mean absolute error).
model.evaluate(ds_test)



[2.0017638206481934, 1.2105352878570557]

In [None]:
# save the model
#model.save(ROOT + 'data/models/lstm10.sav')

In [17]:
# Score a single hand-written review. The prediction is on the label scale
# used for training (score - 1, see get_dataset).
text = fix_shape('the best hotel i ever stayed at')
model.predict(text)



array([[1.9284965]], dtype=float32)

In [None]:
# Hand-written review that is positive throughout; prediction is on the
# training label scale (score - 1).
text = '''\
the breakfast was excellent and the room very clean and spacious.
the staff was very friendly.
I can really recommend this place'''
model.predict(fix_shape(text))

In [None]:
# Same positive review, but with "relly" in place of "really" -- presumably
# to see how a misspelled token changes the prediction.
text = '''\
the breakfast was excellent and the room very clean and spacious.
the staff was very friendly.
I can relly recommend this place'''
model.predict(fix_shape(text))

In [None]:
# Hand-written review with mixed sentiment (average breakfast, friendly
# staff, unsuitable room); prediction is on the training label scale (score - 1).
text = '''\
the breakfast was ok, but nothing special.
the staff was very friendly.
But the room we small and not suitable for working'''
model.predict(fix_shape(text))

In [None]:
# Variant of the mixed review in which the staff is described as rude
# instead of friendly; prediction is on the training label scale (score - 1).
text = '''\
the breakfast was ok, but nothing special.
the staff was quite rude.
And the room we small and not suitable for working'''
model.predict(fix_shape(text))

In [None]:
# Hand-written review that is negative throughout; prediction is on the
# training label scale (score - 1).
text = '''\
the breakfast was bad, only bread and coffe.
Checkin took forever.
No roomservice.
The room we small and not suitable for working'''
model.predict(fix_shape(text))