<a href="https://colab.research.google.com/github/didi64/Colab_Test/blob/main/Classify_Reviews2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Load the cleaned reviews and train an LSTM-model

In [28]:
import os, sys
from google.colab import drive
ROOT = '/content/drive/MyDrive/CAS_Gregi/'
IMPORT_PATH_SYMLINK = {'link': '/content/modules', 'target': ROOT + 'CO_modules'}

drive.mount('/content/drive', force_remount=False)
if not os.path.exists(IMPORT_PATH_SYMLINK['link']):
    os.symlink(IMPORT_PATH_SYMLINK['target'], IMPORT_PATH_SYMLINK['link'])
if IMPORT_PATH_SYMLINK['link'] not in sys.path:
    sys.path.insert(0, IMPORT_PATH_SYMLINK['link'])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!ls -l {IMPORT_PATH_SYMLINK['link']}
!ls /content $ROOT /content/drive {IMPORT_PATH_SYMLINK['link']}

In [38]:
import pickle
with open(ROOT + 'data/hotelreviews_cleaned_train_test_spell.pkl','rb') as f:
    data_train, data_test = pickle.load(f)

In [39]:
import random
import numpy as np
import tensorflow as tf
from functools import reduce

In [40]:
def fix_shape(text):
    return tf.constant(text, shape = (1,))

def get_dataset(d):
    ''' get dataset from dict {score: list_of_revies,...}'''
    # list of tuples [(score, review),...]
    all_reviews = reduce(lambda x,y: x + y, [[(r, k) for k,v in d.items() for r in v]])
    random.shuffle(all_reviews)
    reviews = tf.constant([r for r,_ in all_reviews], shape=(len(all_reviews), 1))
    gpt_scores  = tf.constant([s - 1 for _, s in all_reviews])

    n = len(reviews)
    ds = tf.data.Dataset.from_tensor_slices((reviews[:n], gpt_scores[:n]))
    return ds

def batch(ds, batch_size = 32):
    xs, ys =  next(iter(ds))
    if xs.shape == () or xs.shape[0] == 1:
        ds = ds.batch(batch_size)
    return ds

def test_model(model, ds):
    i = 0
    for xs, _ in iter(ds):
        i += 1
        if (i %10) ==0:
            print(i, end = ', ')
        model(xs)

In [41]:
ds_train = batch(get_dataset(data_train))
ds_test  = batch(get_dataset(data_test))

In [42]:
all_reviews = reduce(lambda x,y: x + y, [[(r, k) for d in (data_train, data_test) for k,v in d.items() for r in v]])
WORDS = reduce(lambda x,y:x|y, [set(x[0].split()) for x in all_reviews])

In [65]:
# Vocab Size, number of review, fraction of reviews with more than 200 words
len(WORDS), len(all_reviews), len([nwords for x in all_reviews if (nwords := len(x[0].split())) > 200])/len(all_reviews)

(9033, 6000, 0.029)

In [77]:
# standardization: one element of  standardizations or a callable mapping strings to strings
standardizations = [None, "lower_and_strip_punctuation", "lower", "strip_punctuation"]
max_tokens = len(WORDS) + 2 # number of words plus the tokens '' and '[UNK]'
ngrams = None
sequence_length = 200

tv_layer = tf.keras.layers.TextVectorization(
    max_tokens = max_tokens,
    standardize = standardizations[0],
    ngrams=ngrams,
    output_mode='int',
    output_sequence_length = sequence_length,
    pad_to_max_tokens=True,
)

In [78]:
tv_layer.adapt([r[0] for r in all_reviews])
vocab = tv_layer.get_vocabulary()
len(vocab),  vocab[:5]# max_tokens

(9035, ['', '[UNK]', 'the', 'and', 'was'])

In [79]:
tv_layer(data_train[3][30])

<tf.Tensor: shape=(200,), dtype=int64, numpy=
array([  16,    8,  436,   82,  163,    5,  365,  222,   37,   21,    2,
        377,  259,   36,   20,    5,  276,   10,  104, 2640,   21,    2,
        377,  466,   16,   20,  528,    2,  207,   55,    5,  118,  275,
          3,   44,  105,  210,  192, 1695,    6,  316,    8,    2,  115,
          7,   13,    2,   94,   25,   65,  264,   42,  367,    3,  586,
          5,  664,    4,  392,    8,    2,    7,   36,   14,   40, 1155,
        107,  316,   17,  429,   29,  296,    5,  531,    3,    5,  357,
        113,   28,   89,   28,  291,  398,    6, 4099,    5, 1109, 3744,
          2,    7,    4,   52,    3,  722,    3,    2,   41,    4,   98,
         30,  106,  544,    8,    2,   67,    3,  738,  585, 1656,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
     

In [80]:
vocab[16], vocab[8], vocab[436], data_train[3][30].split()[:3]

('it', 'in', 'located', ['it', 'in', 'located'])

In [81]:
output_dim =  16
em_layer = tf.keras.layers.Embedding(input_dim = max_tokens + 1, output_dim= output_dim)
em_layer(tv_layer('what a hotel'))

<tf.Tensor: shape=(200, 16), dtype=float32, numpy=
array([[ 0.0476017 , -0.03642512, -0.02449933, ..., -0.02889631,
        -0.01723612, -0.01314093],
       [ 0.02439922,  0.0304555 , -0.01999265, ..., -0.04715417,
        -0.03328749, -0.02525299],
       [ 0.00140228,  0.02744489, -0.042287  , ..., -0.03759339,
        -0.00454023,  0.0016163 ],
       ...,
       [-0.00029685,  0.02801936, -0.02321785, ...,  0.00977889,
        -0.02967455,  0.0495114 ],
       [-0.00029685,  0.02801936, -0.02321785, ...,  0.00977889,
        -0.02967455,  0.0495114 ],
       [-0.00029685,  0.02801936, -0.02321785, ...,  0.00977889,
        -0.02967455,  0.0495114 ]], dtype=float32)>

In [82]:
model = tf.keras.Sequential([
    tv_layer,
    em_layer,
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(10),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation = 'relu')], name = 'lstm10_1')
model.summary()

Model: "lstm10_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_3 (TextV  (None, 200)              0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, 200, 16)           144576    
                                                                 
 dropout_6 (Dropout)         (None, 200, 16)           0         
                                                                 
 lstm_2 (LSTM)               (None, 10)                1080      
                                                                 
 dropout_7 (Dropout)         (None, 10)                0         
                                                                 
 flatten_2 (Flatten)         (None, 10)                0         
                                                          

In [83]:
metric = tf.keras.metrics.MeanAbsoluteError(name='mean_absolute_error', dtype=None)
loss =  tf.keras.losses.MeanSquaredError()
model.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss=loss,
    metrics=[metric],
)

In [85]:
# test if reviews pass through model, show sample output
for ds in (ds_train, ds_test):  test_model(model, ds)

10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, tf.Tensor(
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]], shape=(8, 1), dtype=float32)


In [96]:
model_path = ROOT + 'My_Models/' + model.name +'/'
if not os.path.exists(ROOT + model_path):
    os.makedirs(ROOT + model_path, exist_ok = False)
    print('Created directory ' + ROOT + model_path)

model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=model_path + 'checkpoints',
    save_weights_only=True,
    monitor='mean_absolute_error',
    mode='min',
    save_best_only=True)

In [105]:
# load from checkpoint
# model.load_weights(model_path + 'checkpoints')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7ca14bdeb7f0>

In [None]:
if os.path.exists(model_path + 'current_weights.index'):
    model.load_weights(model_path + 'current_weights')
    print('Found previous weights')
else:
    print('No previous weights found!')

model.fit(
    ds_train,
    epochs = 1000,
    validation_data = ds_test,
    callbacks=[model_checkpoint_callback]
)
model.save_weights(model_path + 'current_weights', save_format='tf')
print('Current weights saved to ' + model_path + 'current_weights')

In [54]:

# next(iter(ds_train))

In [106]:
model.evaluate(ds_test)



[1.4831992387771606, 1.03293776512146]

In [None]:
model.load_weights(ROOT + 'data/models/lstm10')


In [None]:
# save the model
#model.save(ROOT + 'data/models/lstm10.sav')

In [17]:
text = fix_shape('the best hotel i ever stayed at')
model.predict(text)



array([[1.9284965]], dtype=float32)

In [22]:
text = '''\
the breakfast was excellent and the room very clean and spacious.
the staff was very friendly.
I can really recommend this place'''
model.predict(fix_shape(text))



array([[3.2970164]], dtype=float32)

In [None]:
text = '''\
the breakfast was excellent and the room very clean and spacious.
the staff was very friendly.
I can relly recommend this place'''
model.predict(fix_shape(text))

In [None]:
text = '''\
the breakfast was ok, but nothing special.
the staff was very friendly.
But the room we small and not suitable for working'''

model.predict(fix_shape(text))

In [23]:
text = '''\
the breakfast was ok, but nothing special.
the staff was quite rude.
And the room we small and not suitable for working'''
model.predict(fix_shape(text))



array([[3.2970164]], dtype=float32)

In [1]:
text = '''\
the breakfast was bad, only bread and coffe.
Checkin took forever.
No roomservice.
The room we small and not suitable for working'''
model.predict(fix_shape(text))

NameError: ignored