In [2]:
import sklearn as sk
import numpy as np
import pandas as pd
import tensorflow as tf
import json
import csv
import sklearn.feature_extraction.text as sk_text
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn import metrics
from matplotlib.pyplot import figure, show

In [2]:
# Make Reviews tsv file with business_id, stars, and text
outfile=open('review_stars.tsv', 'w')
sfile=csv.writer(outfile, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
sfile.writerow(['business_id','stars','text']) #column titles

with open('yelp_dataset/yelp_academic_dataset_review.json', encoding="utf-8") as f:
    for line in f:
        row=json.loads(line)
        sfile.writerow([row['business_id'],row['stars'],(row['text']).encode('utf-8')])
outfile.close()

In [3]:
# Make business tsv file
outfile=open('business_stars.tsv', 'w')
sfile=csv.writer(outfile,delimiter='\t',quoting=csv.QUOTE_MINIMAL)
sfile.writerow(['business_id','categories', 'name'])

with open('yelp_dataset/yelp_academic_dataset_business.json', encoding='utf-8') as f:
    for line in f:
        row = json.loads(line)
        if(row['review_count']>=20):
            sfile.writerow([row['business_id'], row['categories'], (row['name']).encode('utf-8')])
outfile.close()

In [4]:
df_reviews=pd.read_csv('review_stars.tsv', sep='\t')
df_business=pd.read_csv('business_stars.tsv', sep='\t')
df_merged=pd.merge(df_reviews, df_business, on='business_id')
df_merged.to_csv('merged_df.tsv',sep='\t')
del df_reviews
del df_business

# Data Processing

In [3]:
df_merged=pd.read_csv('merged_df.tsv', sep='\t')

In [4]:
# Limiting number of reviews due to size
numEntries = 10000

In [15]:
vectorizer = sk_text.TfidfVectorizer(
                                    max_features=1000,
                                    max_df=1000,
                                    min_df=1)
matrix=vectorizer.fit_transform(df_merged.text[0:numEntries])
tfidf_data=matrix.toarray()

In [20]:
x = tfidf_data
y = df_merged.stars[0:numEntries]

In [17]:
model = Sequential()

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25, random_state=32)

model.add(Dense(100, input_dim=x.shape[1], activation='relu')) 
model.add(Dense(10, activation='relu'))
model.add(Dense(1))

model.compile(loss='mean_squared_error', optimizer='adam')

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=2, verbose=1, mode='auto')  

#model.fit(x_train, y_train, validation_data=(x_test,y_test), callbacks=[monitor], verbose=2, epochs=1000)

checkpointer = ModelCheckpoint(filepath="dnn/best_weights.hdf5", verbose=0, save_best_only=True) # save best model

model.fit(x_train, y_train,validation_data=(x_test,y_test),callbacks=[monitor,checkpointer],verbose=2,epochs=100)

model.load_weights('dnn/best_weights.hdf5') # load weights from best model

Epoch 1/10
235/235 - 0s - loss: 3.7936 - val_loss: 1.0326
Epoch 2/10
235/235 - 0s - loss: 0.8299 - val_loss: 0.8410
Epoch 3/10
235/235 - 0s - loss: 0.7051 - val_loss: 0.8111
Epoch 4/10
235/235 - 0s - loss: 0.6607 - val_loss: 0.7930
Epoch 5/10
235/235 - 0s - loss: 0.6307 - val_loss: 0.7898
Epoch 6/10
235/235 - 0s - loss: 0.5910 - val_loss: 0.7893
Epoch 7/10
235/235 - 0s - loss: 0.5302 - val_loss: 0.7825
Epoch 8/10
235/235 - 0s - loss: 0.4258 - val_loss: 0.7887
Epoch 9/10
235/235 - 0s - loss: 0.2926 - val_loss: 0.8067
Epoch 00009: early stopping


In [1]:
pred=model.predict(x_test)

NameError: name 'model' is not defined

In [22]:
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print("Final RMSE score: {}".format(score))

Final RMSE score: 0.8845655210421726


# Random Business Testing

In [25]:
vectorizer = sk_text.TfidfVectorizer(
                                    max_features=1000,
                                    max_df=1000,
                                    min_df=1)
matrix=vectorizer.fit_transform(df_merged.groupby(text[:])
tfidf_data=matrix.toarray()

KeyboardInterrupt: 

In [23]:
x = tfidf_data
y = df_merged.stars[0:numEntries]

In [None]:
model.predict(x)