In [1]:
import sklearn as sk
import numpy as np
import pandas as pd
import tensorflow as tf
import json
import csv
import sklearn.feature_extraction.text as sk_text
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn import metrics
from matplotlib.pyplot import figure, show
import random

In [2]:
# Make Reviews tsv file with business_id, stars, and text
outfile=open('review_stars.tsv', 'w')
sfile=csv.writer(outfile, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
sfile.writerow(['business_id','stars','text']) #column titles

with open('yelp_dataset/yelp_academic_dataset_review.json', encoding="utf-8") as f:
    for line in f:
        row=json.loads(line)
        sfile.writerow([row['business_id'],row['stars'],(row['text']).encode('utf-8')])
outfile.close()

In [3]:
# Make business tsv file
outfile=open('business_stars.tsv', 'w')
sfile=csv.writer(outfile,delimiter='\t',quoting=csv.QUOTE_MINIMAL)
sfile.writerow(['business_id','categories', 'name'])

with open('yelp_dataset/yelp_academic_dataset_business.json', encoding='utf-8') as f:
    for line in f:
        row = json.loads(line)
        if(row['review_count']>30):
            sfile.writerow([row['business_id'], row['categories'], (row['name']).encode('utf-8')])
outfile.close()

In [5]:
df_reviews=pd.read_csv('review_stars.tsv', sep='\t')
df_business=pd.read_csv('business_stars.tsv', sep='\t')
df_merged=pd.merge(df_reviews, df_business, on='business_id')
df_merged.to_csv('merged_df.tsv',sep='\t')
del df_reviews
del df_business
del df_merged

# Data Processing

In [6]:
df_merged=pd.read_csv('merged_df.tsv', sep='\t')

In [7]:
# Limiting number of reviews due to size
numEntries = 100000

In [8]:
vectorizer = sk_text.TfidfVectorizer(
                                    max_features=1000,
                                    max_df=2000,
                                    min_df=1)
matrix=vectorizer.fit_transform(df_merged.text[0:numEntries])
tfidf_data=matrix.toarray()
del matrix

x = tfidf_data
y = df_merged.stars[0:numEntries]

## Finding Best Options

In [9]:
model = Sequential()

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25, random_state=32)

model.add(Dense(100, input_dim=x.shape[1], activation='relu')) 
model.add(Dense(50, activation='relu'))
model.add(Dense(1))

model.compile(loss='mean_squared_error', optimizer='adam')

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=2, verbose=2, mode='auto')  

checkpointer = ModelCheckpoint(filepath="dnn/best_weights.hdf5", verbose=0, save_best_only=True) # save best model

model.fit(x_train, y_train, validation_data=(x_test,y_test), callbacks=[monitor,checkpointer], verbose=2, epochs=100)

model.load_weights('dnn/best_weights.hdf5') # load weights from best model

2021-09-29 11:43:51.864897: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2021-09-29 11:43:51.867200: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (alec-xps13): /proc/driver/nvidia/version does not exist
2021-09-29 11:43:51.873129: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-09-29 11:43:53.077623: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 300000000 exceeds 10% of free system memory.
2021-09-29 11:43:53.562114: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (regi

Epoch 1/100


2021-09-29 11:43:55.013707: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 100000000 exceeds 10% of free system memory.


2344/2344 - 2s - loss: 1.4829 - val_loss: 1.2194
Epoch 2/100
2344/2344 - 1s - loss: 1.1804 - val_loss: 1.1638
Epoch 3/100
2344/2344 - 1s - loss: 1.1318 - val_loss: 1.1390
Epoch 4/100
2344/2344 - 2s - loss: 1.0627 - val_loss: 1.1522
Epoch 5/100
2344/2344 - 2s - loss: 1.0012 - val_loss: 1.1703
Epoch 00005: early stopping


In [16]:
pred=model.predict(x_test)

2021-09-29 11:46:44.305427: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 100000000 exceeds 10% of free system memory.


In [11]:
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print("Final RMSE score: {:.4}".format(score))

Final RMSE score: 1.067


# Single Business Predictions

In [12]:
df_unique_ids = df_merged.drop_duplicates(subset=['business_id']).reset_index()

In [13]:
def predict_single_business(id_num):
    df_single_business = df_merged[df_merged.business_id == id_num]
    vectorizer = sk_text.TfidfVectorizer(
                                    max_features=1000,
                                    max_df=2000,
                                    min_df=1)
    matrix=vectorizer.fit_transform(df_single_business.text)
    tfidf_data=matrix.toarray()
    if(tfidf_data.shape[1] >= 500):
        return df_single_business.name[:1].to_string(index=False)[1:].strip("'").strip('"'), model.predict(tfidf_data).mean(), df_single_business.stars.mean()

In [14]:
def single_business_compare(numBusinesses):
    for i in range(numBusinesses):
        index = random.randint(0, df_unique_ids.shape[0])
        name, predicted, real = predict_single_business(df_unique_ids.business_id[index])
        print("{:30}: predicted score: {:3.1f}, real score: {:3.1f}".format(name, predicted, real))

In [15]:
single_business_compare(5)

Bamboo Luau's Chinatown       : predicted score: 3.4, real score: 3.1
The Fresh Market              : predicted score: 3.2, real score: 3.7
Caffe Antico                  : predicted score: 3.6, real score: 3.3


ValueError: in user code:

    /home/alec/.local/lib/python3.9/site-packages/keras/engine/training.py:1586 predict_function  *
        return step_function(self, iterator)
    /home/alec/.local/lib/python3.9/site-packages/keras/engine/training.py:1576 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /home/alec/.local/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:1286 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/alec/.local/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:2849 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/alec/.local/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:3632 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/alec/.local/lib/python3.9/site-packages/keras/engine/training.py:1569 run_step  **
        outputs = model.predict_step(data)
    /home/alec/.local/lib/python3.9/site-packages/keras/engine/training.py:1537 predict_step
        return self(x, training=False)
    /home/alec/.local/lib/python3.9/site-packages/keras/engine/base_layer.py:1020 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    /home/alec/.local/lib/python3.9/site-packages/keras/engine/input_spec.py:250 assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer sequential is incompatible with the layer: expected axis -1 of input shape to have value 1000 but received input with shape (None, 916)
