In [1]:
import utils as utils
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
DATA_FILE_PATH = 'data.json'

In [3]:
train_tups, test_tups = utils.split_data(utils.generate_tuples_from_file(DATA_FILE_PATH, num_samples=10000), test_size=0.2)

print('Training set size: ', len(train_tups[0]))
print('Test set size: ', len(test_tups[0]))
print('Sample tokenized review: ', train_tups[0][0])
print('Sample rating: ', train_tups[1][0])

Training set size:  8000
Test set size:  2000
Sample tokenized review:  ['This', 'is', 'an', 'incredible', 'spot', 'in', 'the', 'funk', 'zone', 'of', 'Santa', 'Barbara', '.', 'Their', 'burgers', 'are', 'overpriced', '(', '$', '18/', '$', '14', ')', 'which', 'is', 'why', 'they', 'lost', 'a', 'star', ',', 'otherwise', 'great', 'beer', 'selection', 'and', 'the', 'lobster', 'Mac', 'and', 'cheese', 'was', 'delicious', '.', 'Highly', 'recommend', 'stopping', 'by', '.', 'Beer', 'cooler', 'had', 'a', 'great', 'selection', 'as', 'well', '.']
Sample rating:  4


In [4]:
# implement FFNN with doc2vec embeddings as input

tagged_docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(train_tups[0])]
d2v_model = Doc2Vec(tagged_docs, vector_size=100, window=2, min_count=1, workers=4)

vectors = [d2v_model.dv[i] for i in range(len(tagged_docs))]

In [5]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD
from keras.metrics import Recall, Precision, F1Score

input_dim = 100

model = Sequential()
model.add(Dense(100, activation='relu', input_shape=(input_dim,)))  
model.add(Dropout(0.5))
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.5))


# put in an output layer
model.add(Dense(5, activation='softmax'))

model.summary()
sgd = SGD(learning_rate=.01)
recall = Recall()
precision = Precision()
f1_score = F1Score()

# call compile here
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy', recall, precision, f1_score])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 100)               10100     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 50)                5050      
                                                                 
 dropout_1 (Dropout)         (None, 50)                0         
                                                                 
 dense_2 (Dense)             (None, 5)                 255       
                                                                 
Total params: 15405 (60.18 KB)
Trainable params: 15405 (60.18 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________




In [6]:
import numpy as np

# Convert input data to NumPy arrays
vectors_np = np.array(vectors)
train_tups_np = np.array(train_tups[1])
train_tups_np = utils.get_one_hot_encodings(train_tups_np)

print('Shape of input vectors: ', vectors_np.shape)
print('Shape of output vectors: ', train_tups_np.shape)

# Fit the model
model.fit(vectors_np, train_tups_np, epochs=5)

Shape of input vectors:  (8000, 100)
Shape of output vectors:  (8000, 5)
Epoch 1/5


ValueError: in user code:

    File "/opt/homebrew/lib/python3.11/site-packages/keras/src/engine/training.py", line 1377, in train_function  *
        return step_function(self, iterator)
    File "/opt/homebrew/lib/python3.11/site-packages/keras/src/engine/training.py", line 1360, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/homebrew/lib/python3.11/site-packages/keras/src/engine/training.py", line 1349, in run_step  **
        outputs = model.train_step(data)
    File "/opt/homebrew/lib/python3.11/site-packages/keras/src/engine/training.py", line 1127, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/opt/homebrew/lib/python3.11/site-packages/keras/src/engine/training.py", line 1185, in compute_loss
        return self.compiled_loss(
    File "/opt/homebrew/lib/python3.11/site-packages/keras/src/engine/compile_utils.py", line 277, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/opt/homebrew/lib/python3.11/site-packages/keras/src/losses.py", line 143, in __call__
        losses = call_fn(y_true, y_pred)
    File "/opt/homebrew/lib/python3.11/site-packages/keras/src/losses.py", line 270, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/opt/homebrew/lib/python3.11/site-packages/keras/src/losses.py", line 2454, in sparse_categorical_crossentropy
        return backend.sparse_categorical_crossentropy(
    File "/opt/homebrew/lib/python3.11/site-packages/keras/src/backend.py", line 5777, in sparse_categorical_crossentropy
        res = tf.nn.sparse_softmax_cross_entropy_with_logits(

    ValueError: `labels.shape` must equal `logits.shape` except for the last dimension. Received: labels.shape=(160,) and logits.shape=(32, 5)


In [None]:
review_to_predict = "The food was awful."
vectors_np = np.array([d2v_model.infer_vector(review_to_predict.split(' '))])
prob_distribution = model.predict(vectors_np)

print('Predicted rating: ', np.argmax(prob_distribution) + 1)

