<a href="https://colab.research.google.com/github/didi64/Colab_Test/blob/main/Classify_Reviews2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Load the cleaned reviews and train an LSTM model

In [1]:
import os, sys
from google.colab import drive
# Google Drive locations used throughout the notebook; the two sub-paths are relative to ROOT.
ROOT = '/content/drive/MyDrive/CAS_Gregi/'
MODULE_PATH = 'CO_modules/'
MODEL_PATH  = 'My_Models/'

# Symlink that exposes the module folder on Drive under a short local path.
IMPORT_PATH_SYMLINK = {'link': '/content/modules', 'target': ROOT + MODULE_PATH}

# Mount Drive before touching the link: os.path.exists() follows symlinks, so the
# check below only reports True once the target on Drive is reachable.
drive.mount('/content/drive', force_remount=False)
if not os.path.exists(IMPORT_PATH_SYMLINK['link']):
    os.symlink(IMPORT_PATH_SYMLINK['target'], IMPORT_PATH_SYMLINK['link'])
# Put the link at the front of sys.path (only once) so it is searched first on import.
if IMPORT_PATH_SYMLINK['link'] not in sys.path:
    sys.path.insert(0, IMPORT_PATH_SYMLINK['link'])

Mounted at /content/drive


In [None]:
# Sanity check of the mount and the module symlink (shell escapes; {..} and $ROOT are filled in from Python variables).
!ls -l {IMPORT_PATH_SYMLINK['link']}
!ls /content $ROOT /content/drive {IMPORT_PATH_SYMLINK['link']}

In [3]:
import pickle
# Load the pre-split review data from Drive.
# NOTE: pickle.load can execute arbitrary code from the file, so only use it on files you created yourself.
# The pickle holds a pair that is unpacked into (data_train, data_test); presumably each one is a
# dict {score: [review, ...]} as expected by get_dataset — TODO confirm.
with open(ROOT + 'data/hotelreviews_cleaned_train_test_spell.pkl','rb') as f:
    data_train, data_test = pickle.load(f)

In [4]:
import random
import numpy as np
import tensorflow as tf
from functools import reduce

In [5]:
def fix_shape(text):
    '''Wrap `text` in a constant tensor of shape (1,).'''
    single_item_shape = (1,)
    return tf.constant(text, shape = single_item_shape)

def get_dataset(d):
    '''Build an unbatched tf.data.Dataset from a dict {score: list_of_reviews, ...}.

    Each dataset element is a pair (review, label): `review` is a tensor of
    shape (1,) holding the review text and `label` is the dict key minus one
    (score - 1).

    The (review, score) pairs are shuffled once with the global `random`
    state before the tensors are built, so the element order differs between
    calls unless `random.seed` was set beforehand.
    '''
    # list of tuples [(review, score), ...]
    all_reviews = [(review, score) for score, reviews in d.items() for review in reviews]
    random.shuffle(all_reviews)

    # One review per row: shape (n_reviews, 1).
    reviews = tf.constant([review for review, _ in all_reviews], shape=(len(all_reviews), 1))
    labels = tf.constant([score - 1 for _, score in all_reviews])

    return tf.data.Dataset.from_tensor_slices((reviews, labels))

def batch(ds, batch_size = 32):
    '''Return `ds` batched with `batch_size`, unless it already looks batched.

    The first element of `ds` is inspected: if its input part is a scalar
    (shape ()) or has a leading dimension of 1, the dataset is treated as
    unbatched and `ds.batch(batch_size)` is returned; otherwise `ds` is
    returned unchanged.
    '''
    first_input, _ = next(iter(ds))
    looks_batched = not (first_input.shape == () or first_input.shape[0] == 1)
    if looks_batched:
        return ds
    return ds.batch(batch_size)

def test_model(model, ds):
    i = 0
    for xs, _ in iter(ds):
        i += 1
        if (i %10) ==0:
            print(i, end = ', ')
        model(xs)

In [6]:
# Build the train/test datasets: get_dataset shuffles the reviews, batch groups them with its default batch size.
ds_train = batch(get_dataset(data_train))
ds_test  = batch(get_dataset(data_test))

In [7]:
# All (review, score) pairs from both splits, training split first.
all_reviews = [(review, score)
               for split in (data_train, data_test)
               for score, reviews in split.items()
               for review in reviews]
# Vocabulary: every distinct whitespace-separated token occurring in any review.
# Built in a single pass instead of folding set unions pairwise.
WORDS = {word for review, _ in all_reviews for word in review.split()}

In [8]:
# Vocabulary size, number of reviews, fraction of reviews with more than 200 words
n_long_reviews = sum(1 for review, _ in all_reviews if len(review.split()) > 200)
len(WORDS), len(all_reviews), n_long_reviews / len(all_reviews)

(9033, 6000, 0.029)

In [9]:
# standardization: one element of `standardizations`, or a callable mapping strings to strings
standardizations = [None, "lower_and_strip_punctuation", "lower", "strip_punctuation"]
max_tokens = len(WORDS) + 2 # number of words plus the tokens '' and '[UNK]'
ngrams = None
# Length of the id sequence produced per review by the vectorization layer.
sequence_length = 200

# Maps a raw review string to a fixed-length sequence of integer token ids.
# standardizations[0] is None, i.e. no standardization option is selected here.
# NOTE(review): per the Keras docs `pad_to_max_tokens` applies to the 'multi_hot',
# 'count' and 'tf_idf' output modes — confirm it is needed with output_mode='int'.
tv_layer = tf.keras.layers.TextVectorization(
    max_tokens = max_tokens,
    standardize = standardizations[0],
    ngrams=ngrams,
    output_mode='int',
    output_sequence_length = sequence_length,
    pad_to_max_tokens=True,
)

In [10]:
# Learn the vocabulary from the text of every review in all_reviews.
review_texts = [review for review, _ in all_reviews]
tv_layer.adapt(review_texts)
vocab = tv_layer.get_vocabulary()
# Vocabulary size (expected to match max_tokens) and its first five entries.
len(vocab), vocab[:5]

(9035, ['', '[UNK]', 'the', 'and', 'was'])

In [11]:
# Token ids of one training review (entry 30 under key 3); short reviews are padded with 0 up to sequence_length.
tv_layer(data_train[3][30])

<tf.Tensor: shape=(200,), dtype=int64, numpy=
array([  16,    8,  436,   82,  163,    5,  365,  222,   37,   21,    2,
        377,  259,   36,   20,    5,  276,   10,  104, 2640,   21,    2,
        377,  466,   16,   20,  528,    2,  207,   55,    5,  118,  275,
          3,   44,  105,  210,  192, 1695,    6,  316,    8,    2,  115,
          7,   13,    2,   94,   25,   65,  264,   42,  367,    3,  586,
          5,  664,    4,  392,    8,    2,    7,   36,   14,   40, 1155,
        107,  316,   17,  429,   29,  296,    5,  531,    3,    5,  357,
        113,   28,   89,   28,  291,  398,    6, 4099,    5, 1109, 3744,
          2,    7,    4,   52,    3,  722,    3,    2,   41,    4,   98,
         30,  106,  544,    8,    2,   67,    3,  738,  585, 1656,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
     

In [80]:
# Cross-check: the first three ids of the vectorized review (16, 8, 436) map back to its first three words.
vocab[16], vocab[8], vocab[436], data_train[3][30].split()[:3]

('it', 'in', 'located', ['it', 'in', 'located'])

In [12]:
# Length of each embedding vector.
output_dim =  16
# NOTE(review): the vocabulary has max_tokens entries, so input_dim = max_tokens looks sufficient;
# previously saved weights depend on the current table size — confirm before changing the + 1.
em_layer = tf.keras.layers.Embedding(input_dim = max_tokens + 1, output_dim= output_dim)
# Quick check on a single string: the result has shape (sequence_length, output_dim).
em_layer(tv_layer('what a hotel'))

<tf.Tensor: shape=(200, 16), dtype=float32, numpy=
array([[ 0.04427708, -0.01064781,  0.02105146, ...,  0.0004003 ,
         0.03167543,  0.04943534],
       [ 0.01172709, -0.02232736,  0.04722979, ..., -0.04705388,
        -0.04199237,  0.04385627],
       [-0.04036009,  0.0199624 , -0.04517343, ..., -0.02099259,
         0.02872104, -0.03966552],
       ...,
       [-0.02293332,  0.00828331, -0.03109035, ...,  0.00870107,
        -0.04472695,  0.01328453],
       [-0.02293332,  0.00828331, -0.03109035, ...,  0.00870107,
        -0.04472695,  0.01328453],
       [-0.02293332,  0.00828331, -0.03109035, ...,  0.00870107,
        -0.04472695,  0.01328453]], dtype=float32)>

In [13]:
# LSTM with 10 units, run in both directions; 'concat' joins the two outputs into 20 features.
lstm = tf.keras.layers.LSTM(10)
lstm_bi = tf.keras.layers.Bidirectional(lstm, merge_mode='concat')

# Pipeline: raw string -> token ids -> embeddings -> bidirectional LSTM -> one output value.
# NOTE(review): the bidirectional output is already (None, 20), so Flatten does not change the shape.
# NOTE(review): the ReLU on the single output unit restricts predictions to >= 0; a linear output
# is the more common choice for regression — confirm this is intended.
model = tf.keras.Sequential([
    tv_layer,
    em_layer,
    tf.keras.layers.Dropout(0.2),
    lstm_bi,
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation = 'relu')], name = 'lstm_bi10')
model.summary()

Model: "lstm_bi10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 200)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 200, 16)           144576    
                                                                 
 dropout (Dropout)           (None, 200, 16)           0         
                                                                 
 bidirectional (Bidirectiona  (None, 20)               2160      
 l)                                                              
                                                                 
 dropout_1 (Dropout)         (None, 20)                0         
                                                                 
 flatten (Flatten)           (None, 20)                0 

In [14]:
# Regression setup: train on mean squared error, report mean absolute error.
# The metric name 'mean_absolute_error' is the quantity the ModelCheckpoint callback monitors.
metric = tf.keras.metrics.MeanAbsoluteError(name='mean_absolute_error', dtype=None)
loss =  tf.keras.losses.MeanSquaredError()
model.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss=loss,
    metrics=[metric],
)

In [15]:
# Smoke test: every batch of both datasets must pass through the model; a progress counter is printed.
for ds in (ds_train, ds_test):
    test_model(model, ds)

10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 10, 20, 30, 

In [16]:
# Directory on Drive that holds everything saved for this model.
# model_path already starts with ROOT, so it must not be prefixed with ROOT a
# second time when the directory is checked and created.
model_path = ROOT + MODEL_PATH + model.name +'/'
if not os.path.exists(model_path):
    os.makedirs(model_path, exist_ok = True)
    print('Created directory ' + model_path)

# Save weights only, and only when the monitored metric reaches a new minimum.
# NOTE(review): 'mean_absolute_error' is the training metric; fit() is also given
# validation data, so 'val_mean_absolute_error' may be the intended quantity — confirm.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=model_path + 'checkpoints',
    save_weights_only=True,
    monitor='mean_absolute_error',
    mode='min',
    save_best_only=True)

Created directory /content/drive/MyDrive/CAS_Gregi//content/drive/MyDrive/CAS_Gregi/My_Models/lstm_bi10/


In [22]:
def save_model(model):
    '''Save `model` in TensorFlow SavedModel format below ROOT + MODEL_PATH.

    The model is written to <ROOT><MODEL_PATH><model.name>/<model.name>_save.
    The per-model directory is created first if it does not exist yet, and
    a message is printed when that happens.
    '''
    model_path = ROOT + MODEL_PATH + model.name +'/'
    # model_path already contains ROOT; prefixing it again would create a
    # bogus nested directory instead of the one the model is saved into.
    if not os.path.exists(model_path):
        os.makedirs(model_path, exist_ok = True)
        print('Created directory ' + model_path)

    model.save(model_path + model.name + '_save', save_format='tf')

def load_model(model):
    '''Load the saved model stored for `model.name` and return it.

    Only `model.name` is read from the argument; it determines the location
    <ROOT><MODEL_PATH><model.name>/<model.name>_save that is passed to
    tf.keras.models.load_model.
    '''
    saved_location = ROOT + MODEL_PATH + model.name + '/' + model.name + '_save'
    return tf.keras.models.load_model(saved_location)

In [105]:
# load from checkpoint
# model.load_weights(model_path + 'checkpoints')

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7ca14bdeb7f0>

In [29]:
# Resume from the weights written at the end of an earlier run of this cell, if present
# (the 'current_weights.index' file is used as the marker that such a save exists).
if os.path.exists(model_path + 'current_weights.index'):
    model.load_weights(model_path + 'current_weights')
    print('Found previous weights')
else:
    print('No previous weights found!')

# Train with the test set as validation data; the checkpoint callback runs during training.
history =  model.fit(
    ds_train,
    epochs = 300,
    validation_data = ds_test,
    callbacks=[model_checkpoint_callback]
)
# Store the final weights so that the next run of this cell continues from here.
model.save_weights(model_path + 'current_weights', save_format='tf')
print('Current weights saved to ' + model_path + 'current_weights')

Found previous weights
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/30

In [28]:
# model.save_weights(model_path + 'weights_after_1000_epochs', save_format='tf')

In [54]:

# next(iter(ds_train))

In [23]:
# Save the complete model (via model.save inside save_model), not just its weights.
save_model(model)

Created directory /content/drive/MyDrive/CAS_Gregi//content/drive/MyDrive/CAS_Gregi/My_Modelslstm_bi10/




In [None]:
# load the model
# model = load_model(model)

In [30]:
# Loss and metric on the test set: mean squared error and mean absolute error, as set in compile().
model.evaluate(ds_test)



[0.400241494178772, 0.4034956097602844]

In [25]:
def accuracy(ds):
    '''Distribution of the absolute error of the rounded predictions on `ds`.

    Uses the global `model`. Predictions are rounded to whole numbers and
    compared with the labels taken from `ds`. Returns a list of five
    fractions; entry i is the share of samples whose rounded prediction is
    off by exactly i (i = 0, ..., 4).
    '''
    rounded_preds = np.round(model.predict(ds).reshape((-1,)))
    labels = np.concatenate([label for _, label in ds], axis=0)
    abs_errors = np.abs(labels - rounded_preds)
    n_samples = len(labels)

    shares = []
    for off_by in range(5):
        shares.append(np.count_nonzero(abs_errors == off_by) / n_samples)
    return shares

def np_accuracy(ds):
    '''Coarse accuracy after reducing labels and predictions to three classes.

    Uses the global `model`. Rounded predictions and labels are both shifted
    by -2 and replaced by their sign (-1, 0 or 1). Returns a list of three
    fractions; entry i is the share of samples whose two signs differ by
    exactly i (i = 0, 1, 2).
    '''
    def sign_of(value):
        # -1 for negative, 1 for positive, 0 otherwise.
        if value < 0:
            return -1
        if value > 0:
            return 1
        return 0

    shifted_preds = np.round(model.predict(ds).reshape((-1,))) - 2
    pred_signs = np.array([sign_of(p) for p in shifted_preds])
    shifted_labels = np.concatenate([label for _, label in ds], axis=0) - 2
    label_signs = np.array([sign_of(t) for t in shifted_labels])

    abs_errors = np.abs(label_signs - pred_signs)
    n_samples = len(label_signs)
    return [np.count_nonzero(abs_errors == gap) / n_samples for gap in (0, 1, 2)]

In [26]:
# Share of samples whose rounded prediction is off by 0, 1, 2, 3, 4 — training set first, then test set.
accuracy(ds_train), accuracy(ds_test)



([1.0, 0.0, 0.0, 0.0, 0.0], [0.652, 0.314, 0.031, 0.003, 0.0])

In [31]:
# Same check run again; judging by the execution counts this was after another training run — TODO confirm.
accuracy(ds_train), accuracy(ds_test)



([0.999, 0.001, 0.0, 0.0, 0.0], [0.656, 0.306, 0.034, 0.004, 0.0])

In [33]:
# Single-review prediction. The training labels are score - 1 (see get_dataset),
# so the model output is on that shifted scale.
text = fix_shape('the best hotel i ever stayed at')
model.predict(text)



array([[2.2713227]], dtype=float32)

In [35]:
# Positive example; the text is lower-cased and its newlines are replaced by spaces before prediction.
text = '''
the breakfast was excellent and the room very clean and spacious
the staff was very friendly
I can really recommend this place'''.lower().replace('\n', ' ')
model.predict(fix_shape(text))



array([[3.428855]], dtype=float32)

In [39]:
# Mixed example.
# NOTE(review): the vectorizer uses standardize=None, so punctuation stays attached to
# words ('ok,', 'friendl.'); such tokens are presumably out of vocabulary — confirm.
text = '''\
the breakfast was ok, but nothing special
the staff was very friendl.
But the room we small and not suitable for working'''.lower().replace('\n', ' ')

model.predict(fix_shape(text))



array([[2.32862]], dtype=float32)

In [40]:
# Mostly negative example: same text as the previous cell except for the staff sentence and 'And' for 'But'.
text = '''\
the breakfast was ok, but nothing special
the staff was quite rude
And the room we small and not suitable for working'''.lower().replace('\n', ' ')
model.predict(fix_shape(text))



array([[1.7753965]], dtype=float32)

In [41]:
# Negative example with four complaints.
# NOTE(review): the recorded prediction is higher than for the previous, milder review — worth a closer look.
text = '''\
the breakfast was bad, only bread and coffe
Checkin took forever
No roomservice
The room we small and not suitable for working'''.lower().replace('\n', ' ')
model.predict(fix_shape(text))



array([[2.0958402]], dtype=float32)