In [1]:
import json
import csv
import pandas as pd
import numpy as np
import scipy as sci
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping, ModelCheckpoint
import io
import requests
import tensorflow as tf
from scipy import sparse
from sklearn.model_selection import train_test_split
import sklearn.feature_extraction.text as sk_text
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, f1_score
import collections
from sklearn import preprocessing
import matplotlib.pyplot as plt
import shutil
import os

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
#Open business.json file, create tsv file with business_id, business name, categories, and review count to be used as features 
#and stars as label

outfile = open("business.tsv", 'w')
sfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL)
sfile.writerow(['business_id','categories', 'stars', 'review_count', 'postal code'])
with open('yelp_academic_dataset_business.json', encoding="utf8") as f:
    for line in f:
        row = json.loads(line)
        sfile.writerow([row['business_id'], row['categories'], row['stars'], row['review_count'], row['postal_code']])

outfile.close()

business_df= pd.read_csv('business.tsv', delimiter ="\t")

In [3]:
#Open review.json file, create tsv file with business_id,text to be used as features 
#and stars as label

outfile = open("review_stars.tsv", 'w')
sfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL)
sfile.writerow(['business_id','stars', 'text'])
with open('yelp_academic_dataset_review.json', encoding="utf8") as f:
    for line in f:
        row = json.loads(line)
        # some special char must be encoded in 'utf-8'
        sfile.writerow([row['business_id'], row['stars'], (row['text']).encode('utf-8')])

outfile.close()

review_df= pd.read_csv('review_stars.tsv', delimiter ="\t")

In [4]:
# Group all reviews by business_id
review_agg_df = review_df.groupby('business_id')['text'].sum()
review_df_ready_for_sklearn = pd.DataFrame({'business_id': review_agg_df.index, 'all_reviews': review_agg_df.values})

In [5]:
#Merge the resulting review aggregate dataframe with business dataframe
merge_df = pd.merge(business_df, review_df_ready_for_sklearn, on='business_id')

In [6]:
#Normalization of review count field so it becomes comparable and remove bias

from scipy.stats import zscore

merge_df.insert(3,'normalized_count',((merge_df['review_count'] - merge_df['review_count'].min()) / (merge_df['review_count'].max() - merge_df['review_count'].min())).astype(float))
merge_df['review_count'] = zscore(merge_df['review_count'])


In [8]:
# removing NaN categories

merge_df = merge_df[merge_df['categories'].notnull()]

In [9]:
# Extracting categories

from sklearn.preprocessing import MultiLabelBinarizer
encoded_categories = MultiLabelBinarizer()
category_matrix = encoded_categories.fit_transform(merge_df['categories'].str.split(','))

In [10]:
#TF-IDF calculation

tfidf = sk_text.TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,3))

In [11]:
# Fit the reviews column with TFIDFvectorizer
matrix = tfidf.fit_transform(merge_df['all_reviews'])
matrix = matrix.toarray()

In [12]:
# We are adding the normalized count to the original matrix with TFIDFvectorizer
x_matrix_minmax = np.column_stack((matrix, merge_df['normalized_count']))

# Zscore
x_matrix_zscore = np.column_stack((matrix, merge_df['review_count']))

In [13]:
#train test data for linear regression

x_train, x_test, y_train, y_test = train_test_split(x_matrix_minmax, merge_df['stars'] , test_size=0.2, random_state=42)

In [14]:
# linear regression

lin_reg_model = LinearRegression()

lin_reg_model.fit(x_train, y_train)

y_pred_linear = lin_reg_model.predict(x_test)

In [15]:
# list  the business with the stars and prediction

for i in range(0,10):
    idx=y_test.index[i]
    print("business id - %s actual stars  - %f predicted - %f" 
          %(merge_df['business_id'][idx], y_test[idx], y_pred_linear[i]))

business id - EmezZdxbvjydG5FkN6Mecw actual stars  - 2.000000 predicted - 2.663625
business id - cmaPrML-0zCJOs8_1VYmaw actual stars  - 4.500000 predicted - 4.174117
business id - E6U8zl527AsspbTf5nZCdw actual stars  - 3.000000 predicted - 3.135507
business id - zXRf_6Bs1yX9an_QKpzbHQ actual stars  - 2.000000 predicted - 2.148644
business id - vllzSssD2HXGlzGUcITxhw actual stars  - 4.000000 predicted - 3.575649
business id - AcGRSWCpb7YB95MTsHlGEw actual stars  - 2.000000 predicted - 2.569646
business id - zfEcOCrgUKe8xYOdqNVmmA actual stars  - 3.500000 predicted - 3.318817
business id - z-q6Wu-L-iDCftYVfoElPw actual stars  - 5.000000 predicted - 5.078186
business id - K7c5wAhxd6CqtmBmY47c7g actual stars  - 3.500000 predicted - 2.295289
business id - b30HREePgMGPZMPaExTZSA actual stars  - 4.000000 predicted - 4.860883


In [16]:
# RMS value

score_lin_classic = np.sqrt(mean_squared_error(y_test, y_pred_linear))
print("Root Mean Squared Error: %.2f" % score_lin_classic)
print('Variance score: %.2f' % r2_score(y_test, y_pred_linear))

Root Mean Squared Error: 0.31
Variance score: 0.69


In [17]:
# label encoding data for logistic regression

label_encoder = preprocessing.LabelEncoder()

merge_df['encoded_stars'] = label_encoder.fit_transform(merge_df['stars'])

In [18]:
# train test data afor other models

x_train_lr, x_test_lr, y_train_lr, y_test_lr = train_test_split(x_matrix_minmax, merge_df['encoded_stars'] , test_size=0.2, random_state=42)

In [19]:
# logistic Regression

Log_reg_model = LogisticRegression()

Log_reg_model.fit(x_train_lr, y_train_lr)

y_pred_logistic = Log_reg_model.predict(x_test_lr)


In [20]:
# RMs for logistic

score_log_classic = np.sqrt(mean_squared_error(y_test_lr, y_pred_logistic))
print("Root Mean Squared Error: %.2f" % score_log_classic)
print('Variance score: %.2f' % r2_score(y_test_lr, y_pred_logistic))

Coefficients: 
 [[ 0.54123407  0.4596704   0.24957758 ... -0.09934215 -0.1649154
  -0.50320092]
 [ 0.8270038   0.3860916   0.11110198 ... -0.04999578 -0.27721436
  -0.34176423]
 [ 1.15831347 -0.04428173  0.10653449 ... -0.18289116 -0.43772759
  -0.87256034]
 ...
 [-0.24115021  0.12496914 -0.48373097 ...  0.65980756  0.92526212
   1.99547613]
 [-0.64472009 -0.40898215 -0.27480355 ...  0.50743112  0.40815838
   0.75420204]
 [-1.37678973 -1.06791059 -0.30742935 ... -0.28820785  0.1929923
  -1.0200172 ]]
Mean squared error: 1.94
Variance score: 0.51


# Tensorflow Model for Regression

** Training without early stopping and Model Checkpoint and RELU **

In [21]:
#Tensor flow works well with 32 bit
y_stars_regression = merge_df['stars'].values.astype(np.float32)

In [22]:
#train test data
x_train_reg, x_test_reg, y_train_reg, y_test_reg = train_test_split(x_matrix_minmax, y_stars_regression , test_size=0.2)


In [23]:
# Tensorflow model for Regression with RELU

model_reg_relu = Sequential()

model_reg_relu.add(Dense(60, input_dim=x_train_reg.shape[1], activation='relu')) 
model_reg_relu.add(Dense(30, activation='relu')) # Hidden 2
model_reg_relu.add(Dense(1)) # Output

In [24]:
#Model training with Optimizer = adam

model_reg_relu.compile(loss='mean_squared_error', optimizer='adam')
model_reg_relu.fit(x_train_reg,y_train_reg,verbose=2,epochs=100) 


Epoch 1/100
 - 1s - loss: 1.7844
Epoch 2/100
 - 0s - loss: 0.3280
Epoch 3/100
 - 1s - loss: 0.2671
Epoch 4/100
 - 1s - loss: 0.2474
Epoch 5/100
 - 1s - loss: 0.2377
Epoch 6/100
 - 0s - loss: 0.2264
Epoch 7/100
 - 1s - loss: 0.2039
Epoch 8/100
 - 1s - loss: 0.1809
Epoch 9/100
 - 1s - loss: 0.1569
Epoch 10/100
 - 1s - loss: 0.1371
Epoch 11/100
 - 0s - loss: 0.1156
Epoch 12/100
 - 0s - loss: 0.0998
Epoch 13/100
 - 1s - loss: 0.0857
Epoch 14/100
 - 1s - loss: 0.0760
Epoch 15/100
 - 0s - loss: 0.0664
Epoch 16/100
 - 0s - loss: 0.0600
Epoch 17/100
 - 0s - loss: 0.0537
Epoch 18/100
 - 1s - loss: 0.0476
Epoch 19/100
 - 1s - loss: 0.0421
Epoch 20/100
 - 1s - loss: 0.0394
Epoch 21/100
 - 1s - loss: 0.0365
Epoch 22/100
 - 1s - loss: 0.0335
Epoch 23/100
 - 0s - loss: 0.0308
Epoch 24/100
 - 1s - loss: 0.0282
Epoch 25/100
 - 0s - loss: 0.0264
Epoch 26/100
 - 0s - loss: 0.0245
Epoch 27/100
 - 0s - loss: 0.0235
Epoch 28/100
 - 0s - loss: 0.0236
Epoch 29/100
 - 0s - loss: 0.0223
Epoch 30/100
 - 1s - lo

<keras.callbacks.History at 0x203b44b77b8>

In [25]:
# Predict stars
pred_reg_simple = model_reg_relu.predict(x_test_reg)
print("Shape: {}".format(pred_reg_simple.shape))


Shape: (1996, 1)


In [26]:
#Display 10 business

for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],y_test_reg[i],pred_reg_simple[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 4.0, predicted Stars: [3.932604]
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 5.0, predicted Stars: [4.406552]
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 5.0, predicted Stars: [5.098876]
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 3.0, predicted Stars: [2.5978472]
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 5.0, predicted Stars: [5.0414157]
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 2.0, predicted Stars: [1.7984514]
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 2.5, predicted Stars: [2.4023979]
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 4.0, predicted Stars: [4.049566]
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 5.0, predicted Stars: [4.8751554]
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 5.0, predicted Stars: [4.797533]


In [27]:
# Measure RMSE error.  RMSE is common for regression.
score_nn_relu = np.sqrt(mean_squared_error(y_test_reg,pred_reg_simple))
print("Final score (RMSE): {}".format(score_nn_relu))
print('R2 score: %.2f' % r2_score(y_test_reg, pred_reg_simple))

Final score (RMSE): 0.571011483669281
R2 score: 0.67


** Training with early stopping and Model Checkpoint ReLU **

In [28]:
# setup early stopping monitor

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=3, verbose=1, mode='auto')

In [143]:
# set up checkpointer
checkpointer_relu = ModelCheckpoint(filepath="./best_weights_relu.hdf5", verbose=1, save_best_only=True)

In [30]:
for i in range(10):
    model_reg_relu.compile(loss='mean_squared_error', optimizer='adam')
    model_reg_relu.fit(x_train_reg,y_train_reg,validation_data=(x_test_reg,y_test_reg),callbacks=[monitor,checkpointer_relu],verbose=2,epochs=100) 

model_reg_relu.load_weights('./best_weights_relu.hdf5')

Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 1s - loss: 0.0053 - mean_absolute_error: 0.0520 - val_loss: 0.3349 - val_mean_absolute_error: 0.4353

Epoch 00001: val_loss improved from inf to 0.33489, saving model to ./best_weights_relu.hdf5
Epoch 2/100
 - 1s - loss: 0.0049 - mean_absolute_error: 0.0513 - val_loss: 0.3279 - val_mean_absolute_error: 0.4311

Epoch 00002: val_loss improved from 0.33489 to 0.32794, saving model to ./best_weights_relu.hdf5
Epoch 3/100
 - 1s - loss: 0.0059 - mean_absolute_error: 0.0563 - val_loss: 0.3301 - val_mean_absolute_error: 0.4322

Epoch 00003: val_loss did not improve from 0.32794
Epoch 4/100
 - 1s - loss: 0.0057 - mean_absolute_error: 0.0548 - val_loss: 0.3282 - val_mean_absolute_error: 0.4305

Epoch 00004: val_loss did not improve from 0.32794
Epoch 5/100
 - 1s - loss: 0.0051 - mean_absolute_error: 0.0519 - val_loss: 0.3348 - val_mean_absolute_error: 0.4354

Epoch 00005: val_loss did not improve from 0.32794
Epoch 00005: early stopp

 - 1s - loss: 0.0038 - mean_absolute_error: 0.0454 - val_loss: 0.3248 - val_mean_absolute_error: 0.4276

Epoch 00003: val_loss did not improve from 0.32390
Epoch 4/100
 - 1s - loss: 0.0036 - mean_absolute_error: 0.0437 - val_loss: 0.3238 - val_mean_absolute_error: 0.4265

Epoch 00004: val_loss improved from 0.32390 to 0.32380, saving model to ./best_weights_relu.hdf5
Epoch 5/100
 - 1s - loss: 0.0038 - mean_absolute_error: 0.0456 - val_loss: 0.3264 - val_mean_absolute_error: 0.4290

Epoch 00005: val_loss did not improve from 0.32380
Epoch 6/100
 - 1s - loss: 0.0038 - mean_absolute_error: 0.0447 - val_loss: 0.3246 - val_mean_absolute_error: 0.4271

Epoch 00006: val_loss did not improve from 0.32380
Epoch 7/100
 - 1s - loss: 0.0036 - mean_absolute_error: 0.0441 - val_loss: 0.3249 - val_mean_absolute_error: 0.4280

Epoch 00007: val_loss did not improve from 0.32380
Epoch 00007: early stopping
Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 2s - loss: 0.0033 - mean_absolute_e

In [31]:
# Predict stars
pred_reg_stopping = model_reg_relu.predict(x_test_reg)
print("Shape: {}".format(pred_reg_stopping.shape))

Shape: (1996, 1)


In [32]:
#Display 10 business

for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],y_test_reg[i],pred_reg_stopping[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 4.0, predicted Stars: [4.173663]
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 5.0, predicted Stars: [4.563887]
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 5.0, predicted Stars: [5.1475673]
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 3.0, predicted Stars: [2.724129]
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 5.0, predicted Stars: [5.0381064]
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 2.0, predicted Stars: [1.8037164]
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 2.5, predicted Stars: [2.463274]
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 4.0, predicted Stars: [4.127126]
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 5.0, predicted Stars: [5.0190687]
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 5.0, predicted Stars: [4.821728]


In [33]:
# Measure RMSE error.  RMSE is common for regression.
score_nn_relu_stopping = np.sqrt(mean_squared_error(y_test_reg,pred_reg_stopping))
print("Final score (RMSE): {}".format(score_nn_relu_stopping))
print('R2 score: %.2f' % r2_score(y_test_reg, pred_reg_stopping))

Final score (RMSE): 0.5663977265357971
R2 score: 0.68


** Training without early stopping and Model Checkpoint and Sigmoid **

In [34]:
# Tensorflow model for Regression with sigmoid
model_reg_sig = Sequential()

model_reg_sig.add(Dense(25, input_dim=x_train_reg.shape[1], activation='sigmoid'))  
model_reg_sig.add(Dense(10, activation='sigmoid')) # Hidden 2
model_reg_sig.add(Dense(1)) # Output


In [35]:
# Model training with sigmoid 
model_reg_sig.compile(loss='mean_squared_error', optimizer='adam')
model_reg_sig.fit(x_train_reg,y_train_reg,verbose=2,epochs=100)

Epoch 1/100
 - 1s - loss: 4.7493
Epoch 2/100
 - 0s - loss: 1.1528
Epoch 3/100
 - 0s - loss: 1.0442
Epoch 4/100
 - 0s - loss: 1.0178
Epoch 5/100
 - 0s - loss: 0.9361
Epoch 6/100
 - 0s - loss: 0.6990
Epoch 7/100
 - 0s - loss: 0.4578
Epoch 8/100
 - 0s - loss: 0.3319
Epoch 9/100
 - 0s - loss: 0.2782
Epoch 10/100
 - 0s - loss: 0.2521
Epoch 11/100
 - 0s - loss: 0.2368
Epoch 12/100
 - 1s - loss: 0.2270
Epoch 13/100
 - 0s - loss: 0.2200
Epoch 14/100
 - 0s - loss: 0.2145
Epoch 15/100
 - 0s - loss: 0.2104
Epoch 16/100
 - 0s - loss: 0.2066
Epoch 17/100
 - 0s - loss: 0.2032
Epoch 18/100
 - 0s - loss: 0.2003
Epoch 19/100
 - 0s - loss: 0.1977
Epoch 20/100
 - 0s - loss: 0.1953
Epoch 21/100
 - 0s - loss: 0.1934
Epoch 22/100
 - 1s - loss: 0.1907
Epoch 23/100
 - 1s - loss: 0.1892
Epoch 24/100
 - 0s - loss: 0.1878
Epoch 25/100
 - 0s - loss: 0.1856
Epoch 26/100
 - 0s - loss: 0.1848
Epoch 27/100
 - 0s - loss: 0.1838
Epoch 28/100
 - 0s - loss: 0.1821
Epoch 29/100
 - 0s - loss: 0.1805
Epoch 30/100
 - 0s - lo

<keras.callbacks.History at 0x20395591518>

In [36]:
# Predict stars
pred_reg_sig_simple = model_reg_sig.predict(x_test_reg)
print("Shape: {}".format(pred_reg_sig_simple.shape))

Shape: (1996, 1)


In [37]:
#Display 10 business

for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],y_test_reg[i],pred_reg_sig_simple[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 4.0, predicted Stars: [4.3263874]
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 5.0, predicted Stars: [4.8745313]
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 5.0, predicted Stars: [4.9969363]
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 3.0, predicted Stars: [2.9743967]
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 5.0, predicted Stars: [5.036073]
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 2.0, predicted Stars: [2.214703]
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 2.5, predicted Stars: [2.3479881]
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 4.0, predicted Stars: [4.137304]
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 5.0, predicted Stars: [4.7220745]
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 5.0, predicted Stars: [4.7905393]


In [38]:
# Measure RMSE error.  RMSE is common for regression.
score_nn_sig = np.sqrt(mean_squared_error(y_test_reg,pred_reg_sig_simple))
print("Final score (RMSE): {}".format(score_nn_sig))
print('R2 score: %.2f' % r2_score(y_test_reg, pred_reg_sig_simple))

Final score (RMSE): 0.49363577365875244
R2 score: 0.76


** Training with early stopping and Model Checkpoint and Sigmoid **

In [39]:
# setup early stopping monitor

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=3, verbose=1, mode='auto')

In [40]:
# set up checkpointer
checkpointer_sigmoid = ModelCheckpoint(filepath="./best_weights_sigmoid.hdf5", verbose=1, save_best_only=True)

In [41]:
for i in range(10):
    model_reg_sig.compile(loss='mean_squared_error', optimizer='adam')
    model_reg_sig.fit(x_train_reg,y_train_reg,validation_data=(x_test_reg,y_test_reg),callbacks=[monitor,checkpointer_sigmoid],verbose=2,epochs=100) 

model_reg_sig.load_weights('./best_weights_sigmoid.hdf5')

Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 1s - loss: 0.1337 - mean_absolute_error: 0.2757 - val_loss: 0.2471 - val_mean_absolute_error: 0.3644

Epoch 00001: val_loss improved from inf to 0.24711, saving model to ./best_weights_sigmoid.hdf5
Epoch 2/100
 - 1s - loss: 0.1332 - mean_absolute_error: 0.2748 - val_loss: 0.2433 - val_mean_absolute_error: 0.3608

Epoch 00002: val_loss improved from 0.24711 to 0.24332, saving model to ./best_weights_sigmoid.hdf5
Epoch 3/100
 - 0s - loss: 0.1326 - mean_absolute_error: 0.2745 - val_loss: 0.2444 - val_mean_absolute_error: 0.3623

Epoch 00003: val_loss did not improve from 0.24332
Epoch 4/100
 - 0s - loss: 0.1317 - mean_absolute_error: 0.2739 - val_loss: 0.2448 - val_mean_absolute_error: 0.3611

Epoch 00004: val_loss did not improve from 0.24332
Epoch 5/100
 - 0s - loss: 0.1316 - mean_absolute_error: 0.2734 - val_loss: 0.2453 - val_mean_absolute_error: 0.3613

Epoch 00005: val_loss did not improve from 0.24332
Epoch 00005: early

 - 1s - loss: 0.1037 - mean_absolute_error: 0.2445 - val_loss: 0.2678 - val_mean_absolute_error: 0.3770

Epoch 00005: val_loss did not improve from 0.24332
Epoch 00005: early stopping


In [42]:
# Predict stars
pred_reg_sig_stopping = model_reg_sig.predict(x_test_reg)
print("Shape: {}".format(pred_reg_sig_stopping.shape))

Shape: (1996, 1)


In [43]:
#Display 10 business

for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],y_test_reg[i],pred_reg_sig_stopping[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 4.0, predicted Stars: [4.3110037]
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 5.0, predicted Stars: [4.848142]
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 5.0, predicted Stars: [4.971846]
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 3.0, predicted Stars: [2.980784]
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 5.0, predicted Stars: [5.0126925]
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 2.0, predicted Stars: [2.21557]
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 2.5, predicted Stars: [2.3348935]
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 4.0, predicted Stars: [4.1198955]
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 5.0, predicted Stars: [4.702338]
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 5.0, predicted Stars: [4.772306]


In [44]:
# Measure RMSE error.  RMSE is common for regression.
score_nn_sig_stopping = np.sqrt(mean_squared_error(y_test_reg,pred_reg_sig_stopping))
print("Final score (RMSE): {}".format(score_nn_sig_stopping))
print('R2 score: %.2f' % r2_score(y_test_reg, pred_reg_sig_stopping))

Final score (RMSE): 0.4932701885700226
R2 score: 0.76


** Training without early stopping and Model Checkpoint and Tanh **

In [45]:
#train test data
x_train_reg_tanh, x_test_reg_tanh, y_train_reg_tanh, y_test_reg_tanh = train_test_split(x_matrix_zscore, y_stars_regression , test_size=0.2)


In [46]:
# Tensorflow model for Regression with sigmoid
model_reg_tanh = Sequential()

model_reg_tanh.add(Dense(25, input_dim=x_train_reg_tanh.shape[1], activation='tanh'))  
model_reg_tanh.add(Dense(10, activation='tanh')) # Hidden 2
model_reg_tanh.add(Dense(1)) # Output


In [47]:
# Model training with sigmoid 
model_reg_tanh.compile(loss='mean_squared_error', optimizer='adam')
model_reg_tanh.fit(x_train_reg_tanh,y_train_reg_tanh,verbose=2,epochs=100)

Epoch 1/100
 - 1s - loss: 1.8583
Epoch 2/100
 - 1s - loss: 0.3537
Epoch 3/100
 - 1s - loss: 0.2513
Epoch 4/100
 - 0s - loss: 0.2288
Epoch 5/100
 - 0s - loss: 0.2163
Epoch 6/100
 - 0s - loss: 0.2090
Epoch 7/100
 - 0s - loss: 0.2025
Epoch 8/100
 - 0s - loss: 0.1961
Epoch 9/100
 - 0s - loss: 0.1911
Epoch 10/100
 - 0s - loss: 0.1857
Epoch 11/100
 - 0s - loss: 0.1812
Epoch 12/100
 - 0s - loss: 0.1784
Epoch 13/100
 - 1s - loss: 0.1735
Epoch 14/100
 - 1s - loss: 0.1692
Epoch 15/100
 - 0s - loss: 0.1651
Epoch 16/100
 - 0s - loss: 0.1625
Epoch 17/100
 - 0s - loss: 0.1602
Epoch 18/100
 - 0s - loss: 0.1569
Epoch 19/100
 - 0s - loss: 0.1555
Epoch 20/100
 - 0s - loss: 0.1529
Epoch 21/100
 - 0s - loss: 0.1496
Epoch 22/100
 - 0s - loss: 0.1488
Epoch 23/100
 - 1s - loss: 0.1454
Epoch 24/100
 - 1s - loss: 0.1433
Epoch 25/100
 - 1s - loss: 0.1416
Epoch 26/100
 - 1s - loss: 0.1399
Epoch 27/100
 - 1s - loss: 0.1379
Epoch 28/100
 - 0s - loss: 0.1354
Epoch 29/100
 - 1s - loss: 0.1348
Epoch 30/100
 - 0s - lo

<keras.callbacks.History at 0x201cbdb04a8>

In [48]:
# Predict stars
pred_reg_tanh_simple = model_reg_tanh.predict(x_test_reg_tanh)
print("Shape: {}".format(pred_reg_tanh_simple.shape))

Shape: (1996, 1)


In [49]:
#Display 10 business

for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],y_test_reg_tanh[i],pred_reg_tanh_simple[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 4.0, predicted Stars: [4.5639753]
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 4.0, predicted Stars: [3.845708]
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 3.5, predicted Stars: [3.7652197]
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 4.5, predicted Stars: [5.173068]
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 3.5, predicted Stars: [3.076366]
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 3.0, predicted Stars: [3.0328262]
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 4.0, predicted Stars: [3.4399014]
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 4.0, predicted Stars: [4.311018]
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 4.5, predicted Stars: [3.6813293]
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 4.5, predicted Stars: [4.638321]


In [50]:
# Measure RMSE error.  RMSE is common for regression.
score_nn_tanh = np.sqrt(mean_squared_error(y_test_reg_tanh,pred_reg_tanh_simple))
print("Final score (RMSE): {}".format(score_nn_tanh))
print('R2 score: %.2f' % r2_score(y_test_reg_tanh, pred_reg_tanh_simple))

Final score (RMSE): 0.7128996849060059
R2 score: 0.54


** Training with early stopping and Model Checkpoint and Tanh **

In [51]:
# setup early stopping monitor

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=3, verbose=1, mode='auto')

In [52]:
# set up checkpointer
checkpointer_tanh = ModelCheckpoint(filepath="./best_weights_tanh.hdf5", verbose=1, save_best_only=True)

In [53]:
for i in range(10):
    model_reg_tanh.compile(loss='mean_squared_error', optimizer='adam')
    model_reg_tanh.fit(x_train_reg_tanh,y_train_reg_tanh,validation_data=(x_test_reg_tanh,y_test_reg_tanh),callbacks=[monitor,checkpointer_tanh],verbose=2,epochs=100) 

model_reg_tanh.load_weights('./best_weights_tanh.hdf5')

Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 1s - loss: 0.0294 - mean_absolute_error: 0.1266 - val_loss: 0.4955 - val_mean_absolute_error: 0.5021

Epoch 00001: val_loss improved from inf to 0.49546, saving model to ./best_weights_tanh.hdf5
Epoch 2/100
 - 1s - loss: 0.0278 - mean_absolute_error: 0.1225 - val_loss: 0.5096 - val_mean_absolute_error: 0.5077

Epoch 00002: val_loss did not improve from 0.49546
Epoch 3/100
 - 0s - loss: 0.0273 - mean_absolute_error: 0.1215 - val_loss: 0.5030 - val_mean_absolute_error: 0.5041

Epoch 00003: val_loss did not improve from 0.49546
Epoch 4/100
 - 0s - loss: 0.0271 - mean_absolute_error: 0.1210 - val_loss: 0.5135 - val_mean_absolute_error: 0.5110

Epoch 00004: val_loss did not improve from 0.49546
Epoch 00004: early stopping
Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 1s - loss: 0.0267 - mean_absolute_error: 0.1199 - val_loss: 0.5035 - val_mean_absolute_error: 0.5069

Epoch 00001: val_loss did not improve from 0.

In [54]:
# Predict stars
pred_reg_tanh_stopping = model_reg_tanh.predict(x_test_reg_tanh)
print("Shape: {}".format(pred_reg_tanh_stopping.shape))

Shape: (1996, 1)


In [55]:
#Display 10 business

for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],y_test_reg_tanh[i],pred_reg_tanh_stopping[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 4.0, predicted Stars: [4.504297]
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 4.0, predicted Stars: [3.8482068]
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 3.5, predicted Stars: [3.6773396]
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 4.5, predicted Stars: [5.1430335]
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 3.5, predicted Stars: [3.172525]
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 3.0, predicted Stars: [3.00534]
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 4.0, predicted Stars: [3.407479]
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 4.0, predicted Stars: [4.2344093]
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 4.5, predicted Stars: [3.590742]
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 4.5, predicted Stars: [4.5824337]


In [56]:
# Measure RMSE error.  RMSE is common for regression.
score_nn_tanh_stopping = np.sqrt(mean_squared_error(y_test_reg_tanh,pred_reg_tanh_stopping))
print("Final score (RMSE): {}".format(score_nn_tanh_stopping))
print('R2 score: %.2f' % r2_score(y_test_reg_tanh, pred_reg_tanh_stopping))

Final score (RMSE): 0.7038869261741638
R2 score: 0.55


# Relu with Postal Code and Categories

In [69]:
# one-hot cooding of postal codes 

postal_hotcoded_df = pd.get_dummies(merge_df['postal code'], sparse = 'true')

x_matrix_postal = np.column_stack((x_matrix_minmax, postal_hotcoded_df))
x_matrix_final = np.column_stack((x_matrix_postal, category_matrix))



In [70]:
print(y_stars_regression.shape)


(9977,)


In [71]:
#train test data
x_train_reg, x_test_reg, y_train_reg, y_test_reg = train_test_split(x_matrix_final, y_stars_regression , test_size=0.2)

In [143]:
# set up checkpointer
checkpointer_relu = ModelCheckpoint(filepath="./best_weights_relu_postal.hdf5", verbose=1, save_best_only=True)

In [72]:
#Tensorflow model for regression
model_reg_relu = Sequential()

model_reg_relu.add(Dense(60, input_dim=x_train_reg.shape[1], activation='relu')) 
model_reg_relu.add(Dense(30, activation='relu')) # Hidden 2
model_reg_relu.add(Dense(1)) # Output

In [74]:
for i in range(10):
    model_reg_relu.compile(loss='mean_squared_error', optimizer='adam')
    model_reg_relu.fit(x_train_reg,y_train_reg,validation_data=(x_test_reg,y_test_reg),callbacks=[monitor,checkpointer_relu],verbose=2,epochs=100) 

model_reg_relu.load_weights('./best_weights_relu_postal.hdf5')

Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 5s - loss: 0.0035 - mean_absolute_error: 0.0448 - val_loss: 0.3192 - val_mean_absolute_error: 0.4348

Epoch 00001: val_loss did not improve from 0.31693
Epoch 2/100
 - 3s - loss: 0.0027 - mean_absolute_error: 0.0399 - val_loss: 0.3195 - val_mean_absolute_error: 0.4340

Epoch 00002: val_loss did not improve from 0.31693
Epoch 3/100
 - 3s - loss: 0.0031 - mean_absolute_error: 0.0423 - val_loss: 0.3221 - val_mean_absolute_error: 0.4359

Epoch 00003: val_loss did not improve from 0.31693
Epoch 4/100
 - 3s - loss: 0.0034 - mean_absolute_error: 0.0440 - val_loss: 0.3211 - val_mean_absolute_error: 0.4359

Epoch 00004: val_loss did not improve from 0.31693
Epoch 00004: early stopping
Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 7s - loss: 0.0035 - mean_absolute_error: 0.0448 - val_loss: 0.3212 - val_mean_absolute_error: 0.4365

Epoch 00001: val_loss did not improve from 0.31693
Epoch 2/100
 - 3s - loss: 0.0027 - m

 - 3s - loss: 0.0024 - mean_absolute_error: 0.0371 - val_loss: 0.3146 - val_mean_absolute_error: 0.4310

Epoch 00004: val_loss improved from 0.31491 to 0.31459, saving model to ./best_weights_relu.hdf5
Epoch 00004: early stopping


In [75]:
# Predict stars
pred_reg_stopping = model_reg_relu.predict(x_test_reg)
print("Shape: {}".format(pred_reg_stopping.shape))

Shape: (1996, 1)


In [76]:
#Display 10 business

for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],y_test_reg[i],pred_reg_stopping[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 5.0, predicted Stars: [4.993835]
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 3.5, predicted Stars: [3.5794392]
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 4.5, predicted Stars: [3.8313274]
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 2.5, predicted Stars: [2.8150606]
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 4.5, predicted Stars: [4.0937324]
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 3.5, predicted Stars: [3.6672797]
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 3.0, predicted Stars: [3.667501]
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 3.0, predicted Stars: [3.0535188]
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 3.5, predicted Stars: [3.469771]
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 4.0, predicted Stars: [4.739553]


In [77]:
# Measure RMSE error.  RMSE is common for regression.
score_relu_postal = np.sqrt(mean_squared_error(y_test_reg,pred_reg_stopping))
print("Final score (RMSE): {}".format(score_relu_postal))
print('R2 score: %.2f' % r2_score(y_test_reg, pred_reg_stopping))

Final score (RMSE): 0.5608825087547302
R2 score: 0.69


# Experimenting with different optimizers for ReLU

In [171]:
# set up checkpointer
checkpointer_relu = ModelCheckpoint(filepath="./best_weights_relu_sgd.hdf5", verbose=1, save_best_only=True)

In [172]:
#Model training with Stochastic gradient descent optimizer(SGD).
for i in range(10):
    model_reg_relu.compile(loss='mean_squared_error', optimizer='sgd')
    model_reg_relu.fit(x_train_reg,y_train_reg,validation_data=(x_test_reg,y_test_reg),callbacks=[monitor,checkpointer_relu],verbose=2,epochs=100) 

model_reg_relu.load_weights('./best_weights_relu_sgd.hdf5')

Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 11s - loss: 0.0018 - mean_absolute_error: 0.0320 - val_loss: 0.3161 - val_mean_absolute_error: 0.4370

Epoch 00001: val_loss improved from inf to 0.31609, saving model to ./best_weights_relu_sgd.hdf5
Epoch 2/100
 - 4s - loss: 0.0019 - mean_absolute_error: 0.0322 - val_loss: 0.3112 - val_mean_absolute_error: 0.4337

Epoch 00002: val_loss improved from 0.31609 to 0.31120, saving model to ./best_weights_relu_sgd.hdf5
Epoch 3/100
 - 5s - loss: 0.0018 - mean_absolute_error: 0.0314 - val_loss: 0.3111 - val_mean_absolute_error: 0.4337

Epoch 00003: val_loss improved from 0.31120 to 0.31107, saving model to ./best_weights_relu_sgd.hdf5
Epoch 4/100
 - 4s - loss: 0.0018 - mean_absolute_error: 0.0313 - val_loss: 0.3105 - val_mean_absolute_error: 0.4334

Epoch 00004: val_loss improved from 0.31107 to 0.31047, saving model to ./best_weights_relu_sgd.hdf5
Epoch 5/100
 - 4s - loss: 0.0017 - mean_absolute_error: 0.0304 - val_loss: 0.3100 -

 - 4s - loss: 0.0012 - mean_absolute_error: 0.0258 - val_loss: 0.3111 - val_mean_absolute_error: 0.4336

Epoch 00004: val_loss did not improve from 0.30994
Epoch 00004: early stopping
Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 13s - loss: 0.0013 - mean_absolute_error: 0.0266 - val_loss: 0.3099 - val_mean_absolute_error: 0.4332

Epoch 00001: val_loss improved from 0.30994 to 0.30989, saving model to ./best_weights_relu_sgd.hdf5
Epoch 2/100
 - 4s - loss: 0.0012 - mean_absolute_error: 0.0256 - val_loss: 0.3106 - val_mean_absolute_error: 0.4334

Epoch 00002: val_loss did not improve from 0.30989
Epoch 3/100
 - 4s - loss: 0.0012 - mean_absolute_error: 0.0257 - val_loss: 0.3109 - val_mean_absolute_error: 0.4335

Epoch 00003: val_loss did not improve from 0.30989
Epoch 4/100
 - 4s - loss: 0.0012 - mean_absolute_error: 0.0259 - val_loss: 0.3105 - val_mean_absolute_error: 0.4333

Epoch 00004: val_loss did not improve from 0.30989
Epoch 00004: early stopping


In [173]:
# Predict stars
pred_reg_sgd = model_reg_relu.predict(x_test_reg)

In [174]:
#Display 10 business

for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],y_test_reg[i],pred_reg_sgd[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 5.0, predicted Stars: [4.9187746]
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 3.5, predicted Stars: [3.6321445]
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 4.5, predicted Stars: [4.071268]
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 2.5, predicted Stars: [2.6122594]
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 4.5, predicted Stars: [3.9623418]
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 3.5, predicted Stars: [3.6999474]
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 3.0, predicted Stars: [3.9084706]
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 3.0, predicted Stars: [2.7844324]
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 3.5, predicted Stars: [3.488697]
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 4.0, predicted Stars: [4.9579816]


In [175]:
# Measure RMSE error.  RMSE is common for regression.
score_relu_sgd = np.sqrt(mean_squared_error(y_test_reg,pred_reg_sgd))
print("Final score (RMSE): {}".format(score_relu_sgd))
print('R2 score: %.2f' % r2_score(y_test_reg, pred_reg_sgd))

Final score (RMSE): 0.5566757917404175
R2 score: 0.70


In [176]:
# set up checkpointer
checkpointer_relu = ModelCheckpoint(filepath="./best_weights_relu_rmsprop.hdf5", verbose=1, save_best_only=True)

In [177]:
#Model training with RMSProp optimizer.
for i in range(10):
    model_reg_relu.compile(loss='mean_squared_error', optimizer='rmsprop')
    model_reg_relu.fit(x_train_reg,y_train_reg,validation_data=(x_test_reg,y_test_reg),callbacks=[monitor,checkpointer_relu],verbose=2,epochs=100) 

model_reg_relu.load_weights('./best_weights_relu_rmsprop.hdf5')

Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 13s - loss: 0.0035 - mean_absolute_error: 0.0445 - val_loss: 0.3135 - val_mean_absolute_error: 0.4350

Epoch 00001: val_loss improved from inf to 0.31351, saving model to ./best_weights_relu_rmsprop.hdf5
Epoch 2/100
 - 5s - loss: 0.0030 - mean_absolute_error: 0.0431 - val_loss: 0.3188 - val_mean_absolute_error: 0.4389

Epoch 00002: val_loss did not improve from 0.31351
Epoch 3/100
 - 5s - loss: 0.0030 - mean_absolute_error: 0.0431 - val_loss: 0.3119 - val_mean_absolute_error: 0.4352

Epoch 00003: val_loss improved from 0.31351 to 0.31194, saving model to ./best_weights_relu_rmsprop.hdf5
Epoch 4/100
 - 5s - loss: 0.0029 - mean_absolute_error: 0.0423 - val_loss: 0.3115 - val_mean_absolute_error: 0.4344

Epoch 00004: val_loss improved from 0.31194 to 0.31149, saving model to ./best_weights_relu_rmsprop.hdf5
Epoch 5/100
 - 6s - loss: 0.0029 - mean_absolute_error: 0.0422 - val_loss: 0.3169 - val_mean_absolute_error: 0.4371

Epoc

 - 12s - loss: 0.0025 - mean_absolute_error: 0.0387 - val_loss: 0.3144 - val_mean_absolute_error: 0.4361

Epoch 00001: val_loss did not improve from 0.30891
Epoch 2/100
 - 5s - loss: 0.0023 - mean_absolute_error: 0.0382 - val_loss: 0.3130 - val_mean_absolute_error: 0.4368

Epoch 00002: val_loss did not improve from 0.30891
Epoch 3/100
 - 4s - loss: 0.0024 - mean_absolute_error: 0.0384 - val_loss: 0.3109 - val_mean_absolute_error: 0.4339

Epoch 00003: val_loss did not improve from 0.30891
Epoch 4/100
 - 5s - loss: 0.0022 - mean_absolute_error: 0.0371 - val_loss: 0.3104 - val_mean_absolute_error: 0.4341

Epoch 00004: val_loss did not improve from 0.30891
Epoch 5/100
 - 5s - loss: 0.0023 - mean_absolute_error: 0.0382 - val_loss: 0.3124 - val_mean_absolute_error: 0.4362

Epoch 00005: val_loss did not improve from 0.30891
Epoch 6/100
 - 5s - loss: 0.0023 - mean_absolute_error: 0.0371 - val_loss: 0.3107 - val_mean_absolute_error: 0.4340

Epoch 00006: val_loss did not improve from 0.30891
Epo

In [178]:
# Predict stars
pred_reg_rmsprop = model_reg_relu.predict(x_test_reg)

In [179]:
#Display 10 business

for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],y_test_reg[i],pred_reg_rmsprop[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 5.0, predicted Stars: [4.9493213]
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 3.5, predicted Stars: [3.58076]
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 4.5, predicted Stars: [4.0033565]
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 2.5, predicted Stars: [2.6006212]
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 4.5, predicted Stars: [3.9112868]
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 3.5, predicted Stars: [3.6256251]
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 3.0, predicted Stars: [3.8134456]
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 3.0, predicted Stars: [2.7809153]
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 3.5, predicted Stars: [3.480331]
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 4.0, predicted Stars: [4.8759866]


In [180]:
# Measure RMSE error.  RMSE is common for regression.
score_relu_rmsprop = np.sqrt(mean_squared_error(y_test_reg,pred_reg_rmsprop))
print("Final score (RMSE): {}".format(score_relu_rmsprop))
print('R2 score: %.2f' % r2_score(y_test_reg, pred_reg_rmsprop))

Final score (RMSE): 0.5557924509048462
R2 score: 0.70


In [181]:
# set up checkpointer
checkpointer_relu = ModelCheckpoint(filepath="./best_weights_relu_adagrad.hdf5", verbose=1, save_best_only=True)

In [182]:
#Model training with Adagrad.
for i in range(10):
    model_reg_relu.compile(loss='mean_squared_error', optimizer='adagrad')
    model_reg_relu.fit(x_train_reg,y_train_reg,validation_data=(x_test_reg,y_test_reg),callbacks=[monitor,checkpointer_relu],verbose=2,epochs=100) 

model_reg_relu.load_weights('./best_weights_relu_adagrad.hdf5')

Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 13s - loss: 0.0042 - val_loss: 0.3104

Epoch 00001: val_loss improved from inf to 0.31041, saving model to ./best_weights_relu_adagrad.hdf5
Epoch 2/100
 - 5s - loss: 8.3106e-04 - val_loss: 0.3103

Epoch 00002: val_loss improved from 0.31041 to 0.31034, saving model to ./best_weights_relu_adagrad.hdf5
Epoch 3/100
 - 5s - loss: 2.2155e-04 - val_loss: 0.3114

Epoch 00003: val_loss did not improve from 0.31034
Epoch 4/100
 - 5s - loss: 8.7464e-05 - val_loss: 0.3106

Epoch 00004: val_loss did not improve from 0.31034
Epoch 00004: early stopping
Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 14s - loss: 0.0027 - val_loss: 0.3105

Epoch 00001: val_loss did not improve from 0.31034
Epoch 2/100
 - 5s - loss: 0.0012 - val_loss: 0.3109

Epoch 00002: val_loss did not improve from 0.31034
Epoch 3/100
 - 5s - loss: 3.1699e-04 - val_loss: 0.3108

Epoch 00003: val_loss did not improve from 0.31034
Epoch 4/100
 - 5s - loss: 

In [183]:
# Predict stars
pred_reg_adagrad = model_reg_relu.predict(x_test_reg)

In [184]:
#Display 10 business

for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],y_test_reg[i],pred_reg_adagrad[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 5.0, predicted Stars: [5.0349708]
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 3.5, predicted Stars: [3.6384451]
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 4.5, predicted Stars: [4.023394]
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 2.5, predicted Stars: [2.6242118]
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 4.5, predicted Stars: [3.9797232]
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 3.5, predicted Stars: [3.7541223]
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 3.0, predicted Stars: [3.8522465]
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 3.0, predicted Stars: [2.8507853]
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 3.5, predicted Stars: [3.4768906]
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 4.0, predicted Stars: [4.9576845]


In [185]:
# Measure RMSE error.  RMSE is common for regression.
score_relu_adagrad = np.sqrt(mean_squared_error(y_test_reg,pred_reg_adagrad))
print("Final score (RMSE): {}".format(score_relu_adagrad))
print('R2 score: %.2f' % r2_score(y_test_reg, pred_reg_adagrad))

Final score (RMSE): 0.556630551815033
R2 score: 0.70


In [186]:
# set up checkpointer
checkpointer_relu = ModelCheckpoint(filepath="./best_weights_relu_adadelta.hdf5", verbose=1, save_best_only=True)

In [187]:
#Model training with ADadelta.
for i in range(10):
    model_reg_relu.compile(loss='mean_squared_error', optimizer='adadelta')
    model_reg_relu.fit(x_train_reg,y_train_reg,validation_data=(x_test_reg,y_test_reg),callbacks=[monitor,checkpointer_relu],verbose=2,epochs=100) 

model_reg_relu.load_weights('./best_weights_relu_adadelta.hdf5')

Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 13s - loss: 0.0031 - val_loss: 0.3109

Epoch 00001: val_loss improved from inf to 0.31085, saving model to ./best_weights_relu_adadelta.hdf5
Epoch 2/100
 - 6s - loss: 0.0033 - val_loss: 0.3095

Epoch 00002: val_loss improved from 0.31085 to 0.30953, saving model to ./best_weights_relu_adadelta.hdf5
Epoch 3/100
 - 6s - loss: 0.0025 - val_loss: 0.3096

Epoch 00003: val_loss did not improve from 0.30953
Epoch 4/100
 - 6s - loss: 0.0034 - val_loss: 0.3131

Epoch 00004: val_loss did not improve from 0.30953
Epoch 5/100
 - 5s - loss: 0.0029 - val_loss: 0.3097

Epoch 00005: val_loss did not improve from 0.30953
Epoch 00005: early stopping
Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 13s - loss: 0.0032 - val_loss: 0.3098

Epoch 00001: val_loss did not improve from 0.30953
Epoch 2/100
 - 5s - loss: 0.0021 - val_loss: 0.3102

Epoch 00002: val_loss did not improve from 0.30953
Epoch 3/100
 - 5s - loss: 0.0027 - val_l

In [188]:
# Predict stars
pred_reg_adadelta = model_reg_relu.predict(x_test_reg)

In [189]:
#Display 10 business

for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],y_test_reg[i],pred_reg_adadelta[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 5.0, predicted Stars: [5.014692]
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 3.5, predicted Stars: [3.629376]
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 4.5, predicted Stars: [4.0018387]
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 2.5, predicted Stars: [2.617671]
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 4.5, predicted Stars: [3.9684253]
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 3.5, predicted Stars: [3.7200139]
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 3.0, predicted Stars: [3.8755937]
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 3.0, predicted Stars: [2.8827367]
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 3.5, predicted Stars: [3.450332]
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 4.0, predicted Stars: [4.9467916]


In [190]:
# Measure RMSE error.  RMSE is common for regression.
score_relu_adadelta = np.sqrt(mean_squared_error(y_test_reg,pred_reg_adadelta))
print("Final score (RMSE): {}".format(score_relu_adadelta))
print('R2 score: %.2f' % r2_score(y_test_reg, pred_reg_adadelta))

Final score (RMSE): 0.556352972984314
R2 score: 0.70


In [191]:
# set up checkpointer
checkpointer_relu = ModelCheckpoint(filepath="./best_weights_relu_adamax.hdf5", verbose=1, save_best_only=True)

In [192]:
#Model training with Stochastic gradient descent optimizer(SGD).
for i in range(10):
    model_reg_relu.compile(loss='mean_squared_error', optimizer='adamax')
    model_reg_relu.fit(x_train_reg,y_train_reg,validation_data=(x_test_reg,y_test_reg),callbacks=[monitor,checkpointer_relu],verbose=2,epochs=100) 

model_reg_relu.load_weights('./best_weights_relu_adamax.hdf5')

Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 13s - loss: 9.0234e-04 - val_loss: 0.3114

Epoch 00001: val_loss improved from inf to 0.31136, saving model to ./best_weights_relu_adamax.hdf5
Epoch 2/100
 - 5s - loss: 3.5951e-04 - val_loss: 0.3108

Epoch 00002: val_loss improved from 0.31136 to 0.31083, saving model to ./best_weights_relu_adamax.hdf5
Epoch 3/100
 - 5s - loss: 1.8484e-04 - val_loss: 0.3105

Epoch 00003: val_loss improved from 0.31083 to 0.31051, saving model to ./best_weights_relu_adamax.hdf5
Epoch 4/100
 - 5s - loss: 1.0698e-04 - val_loss: 0.3105

Epoch 00004: val_loss did not improve from 0.31051
Epoch 00004: early stopping
Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 13s - loss: 3.2482e-04 - val_loss: 0.3108

Epoch 00001: val_loss did not improve from 0.31051
Epoch 2/100
 - 6s - loss: 1.1103e-04 - val_loss: 0.3107

Epoch 00002: val_loss did not improve from 0.31051
Epoch 3/100
 - 5s - loss: 5.4349e-05 - val_loss: 0.3105

Epoch 00003: v

In [193]:
# Predict stars
pred_reg_adamax = model_reg_relu.predict(x_test_reg)

In [194]:
#Display 10 business

for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],y_test_reg[i],pred_reg_adamax[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 5.0, predicted Stars: [5.0025473]
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 3.5, predicted Stars: [3.614849]
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 4.5, predicted Stars: [4.0129156]
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 2.5, predicted Stars: [2.6334472]
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 4.5, predicted Stars: [3.9461608]
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 3.5, predicted Stars: [3.7136426]
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 3.0, predicted Stars: [3.8818626]
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 3.0, predicted Stars: [2.888313]
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 3.5, predicted Stars: [3.4684021]
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 4.0, predicted Stars: [4.972292]


In [195]:
# Measure RMSE error.  RMSE is common for regression.
score_relu_adamax = np.sqrt(mean_squared_error(y_test_reg,pred_reg_adamax))
print("Final score (RMSE): {}".format(score_relu_adamax))
print('R2 score: %.2f' % r2_score(y_test_reg, pred_reg_adamax))

Final score (RMSE): 0.5568938851356506
R2 score: 0.70


In [196]:
# set up checkpointer
checkpointer_relu = ModelCheckpoint(filepath="./best_weights_relu_nadam.hdf5", verbose=1, save_best_only=True)

In [197]:
#Model training with Stochastic gradient descent optimizer(SGD).
for i in range(10):
    model_reg_relu.compile(loss='mean_squared_error', optimizer='nadam')
    model_reg_relu.fit(x_train_reg,y_train_reg,validation_data=(x_test_reg,y_test_reg),callbacks=[monitor,checkpointer_relu],verbose=2,epochs=100) 

model_reg_relu.load_weights('./best_weights_relu_nadam.hdf5')

Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 15s - loss: 0.0040 - mean_absolute_error: 0.0468 - val_loss: 0.3084 - val_mean_absolute_error: 0.4325

Epoch 00001: val_loss improved from inf to 0.30843, saving model to ./best_weights_relu_nadam.hdf5
Epoch 2/100
 - 6s - loss: 0.0107 - mean_absolute_error: 0.0793 - val_loss: 0.3127 - val_mean_absolute_error: 0.4322

Epoch 00002: val_loss did not improve from 0.30843
Epoch 3/100
 - 6s - loss: 0.0088 - mean_absolute_error: 0.0701 - val_loss: 0.3100 - val_mean_absolute_error: 0.4347

Epoch 00003: val_loss did not improve from 0.30843
Epoch 4/100
 - 7s - loss: 0.0067 - mean_absolute_error: 0.0615 - val_loss: 0.3092 - val_mean_absolute_error: 0.4309

Epoch 00004: val_loss did not improve from 0.30843
Epoch 00004: early stopping
Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 14s - loss: 0.0073 - mean_absolute_error: 0.0639 - val_loss: 0.3100 - val_mean_absolute_error: 0.4344

Epoch 00001: val_loss did not improve

 - 6s - loss: 0.0034 - mean_absolute_error: 0.0432 - val_loss: 0.3042 - val_mean_absolute_error: 0.4277

Epoch 00012: val_loss did not improve from 0.29938
Epoch 13/100
 - 6s - loss: 0.0031 - mean_absolute_error: 0.0410 - val_loss: 0.3061 - val_mean_absolute_error: 0.4278

Epoch 00013: val_loss did not improve from 0.29938
Epoch 00013: early stopping
Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 15s - loss: 0.0038 - mean_absolute_error: 0.0463 - val_loss: 0.3041 - val_mean_absolute_error: 0.4262

Epoch 00001: val_loss did not improve from 0.29938
Epoch 2/100
 - 6s - loss: 0.0040 - mean_absolute_error: 0.0481 - val_loss: 0.3066 - val_mean_absolute_error: 0.4269

Epoch 00002: val_loss did not improve from 0.29938
Epoch 3/100
 - 6s - loss: 0.0043 - mean_absolute_error: 0.0493 - val_loss: 0.3023 - val_mean_absolute_error: 0.4261

Epoch 00003: val_loss did not improve from 0.29938
Epoch 4/100
 - 6s - loss: 0.0042 - mean_absolute_error: 0.0484 - val_loss: 0.3007 - val_mean_a

In [198]:
# Predict stars
pred_reg_nadam = model_reg_relu.predict(x_test_reg)

In [199]:
#Display 10 business

for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],y_test_reg[i],pred_reg_nadam[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 5.0, predicted Stars: [4.909003]
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 3.5, predicted Stars: [3.6149395]
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 4.5, predicted Stars: [3.999168]
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 2.5, predicted Stars: [2.7483773]
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 4.5, predicted Stars: [3.9290817]
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 3.5, predicted Stars: [3.7371616]
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 3.0, predicted Stars: [3.9110315]
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 3.0, predicted Stars: [2.754163]
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 3.5, predicted Stars: [3.287781]
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 4.0, predicted Stars: [5.002371]


In [200]:
# Measure RMSE error.  RMSE is common for regression.
score_relu_nadam = np.sqrt(mean_squared_error(y_test_reg,pred_reg_nadam))
print("Final score (RMSE): {}".format(score_relu_nadam))
print('R2 score: %.2f' % r2_score(y_test_reg, pred_reg_nadam))

Final score (RMSE): 0.5471599102020264
R2 score: 0.71


** Experiments with Hidden nodes selection in hidden layer **

In [201]:
# set up checkpointer
checkpointer_relu = ModelCheckpoint(filepath="./best_weights_relu_2l.hdf5", verbose=1, save_best_only=True)

In [202]:
# Tensorflow model for Regression with sigmoid and forward approach
model_reg_relu = Sequential()

model_reg_relu.add(Dense(60, input_dim=x_train_reg.shape[1], activation='relu'))  
model_reg_relu.add(Dense(30, activation='relu')) # Hidden 2
model_reg_relu.add(Dense(1)) # Output


In [203]:
for i in range(10):
    model_reg_relu.compile(loss='mean_squared_error', optimizer='adam')
    model_reg_relu.fit(x_train_reg,y_train_reg,validation_data=(x_test_reg,y_test_reg),callbacks=[monitor,checkpointer_relu],verbose=2,epochs=100) 

model_reg_relu.load_weights('./best_weights_relu_2l.hdf5')


Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 13s - loss: 1.7546 - mean_absolute_error: 0.9536 - val_loss: 0.4214 - val_mean_absolute_error: 0.5134

Epoch 00001: val_loss improved from inf to 0.42140, saving model to ./best_weights_relu_2l.hdf5
Epoch 2/100
 - 5s - loss: 0.3120 - mean_absolute_error: 0.4340 - val_loss: 0.3650 - val_mean_absolute_error: 0.4751

Epoch 00002: val_loss improved from 0.42140 to 0.36504, saving model to ./best_weights_relu_2l.hdf5
Epoch 3/100
 - 5s - loss: 0.2118 - mean_absolute_error: 0.3532 - val_loss: 0.3479 - val_mean_absolute_error: 0.4601

Epoch 00003: val_loss improved from 0.36504 to 0.34788, saving model to ./best_weights_relu_2l.hdf5
Epoch 4/100
 - 5s - loss: 0.1716 - mean_absolute_error: 0.3142 - val_loss: 0.3500 - val_mean_absolute_error: 0.4592

Epoch 00004: val_loss did not improve from 0.34788
Epoch 5/100
 - 5s - loss: 0.1422 - mean_absolute_error: 0.2837 - val_loss: 0.3608 - val_mean_absolute_error: 0.4699

Epoch 00005: val_lo


Epoch 00007: val_loss did not improve from 0.34788
Epoch 00007: early stopping
Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 14s - loss: 0.0070 - mean_absolute_error: 0.0641 - val_loss: 0.3482 - val_mean_absolute_error: 0.4570

Epoch 00001: val_loss did not improve from 0.34788
Epoch 2/100
 - 5s - loss: 0.0079 - mean_absolute_error: 0.0683 - val_loss: 0.3506 - val_mean_absolute_error: 0.4582

Epoch 00002: val_loss did not improve from 0.34788
Epoch 3/100
 - 5s - loss: 0.0080 - mean_absolute_error: 0.0689 - val_loss: 0.3495 - val_mean_absolute_error: 0.4552

Epoch 00003: val_loss did not improve from 0.34788
Epoch 4/100
 - 5s - loss: 0.0072 - mean_absolute_error: 0.0649 - val_loss: 0.3484 - val_mean_absolute_error: 0.4572

Epoch 00004: val_loss did not improve from 0.34788
Epoch 00004: early stopping
Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 14s - loss: 0.0070 - mean_absolute_error: 0.0633 - val_loss: 0.3541 - val_mean_absolute_error: 0.4585

Epoch

In [204]:
# Predict stars
pred_reg_hl = model_reg_relu.predict(x_test_reg)

In [205]:
#Display 10 business

for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],y_test_reg[i],pred_reg_hl[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 5.0, predicted Stars: [5.2980175]
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 3.5, predicted Stars: [3.3337855]
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 4.5, predicted Stars: [3.955951]
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 2.5, predicted Stars: [2.738479]
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 4.5, predicted Stars: [4.1582956]
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 3.5, predicted Stars: [3.6227615]
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 3.0, predicted Stars: [3.6236231]
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 3.0, predicted Stars: [2.9315176]
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 3.5, predicted Stars: [3.6149912]
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 4.0, predicted Stars: [4.893476]


In [206]:
# Measure RMSE error.  RMSE is common for regression.
score_2l = np.sqrt(mean_squared_error(y_test_reg,pred_reg_hl))
print("Final score (RMSE): {}".format(score_2l))
print('R2 score: %.2f' % r2_score(y_test_reg, pred_reg_hl))

Final score (RMSE): 0.5873870253562927
R2 score: 0.66


In [207]:
# set up checkpointer
checkpointer_relu = ModelCheckpoint(filepath="./best_weights_relu_3l.hdf5", verbose=1, save_best_only=True)

In [208]:
# Tensorflow model for Regression with relu and forward approach with 3 hidden layers
model_reg_relu = Sequential()

model_reg_relu.add(Dense(60, input_dim=x_train_reg.shape[1], activation='relu'))  
model_reg_relu.add(Dense(30, activation='relu')) # Hidden 2
model_reg_relu.add(Dense(10, activation='relu')) # Hidden 3
model_reg_relu.add(Dense(1)) # Output


In [209]:
for i in range(10):
    model_reg_relu.compile(loss='mean_squared_error', optimizer='adam')
    model_reg_relu.fit(x_train_reg,y_train_reg,validation_data=(x_test_reg,y_test_reg),callbacks=[monitor,checkpointer_relu],verbose=2,epochs=100) 

model_reg_relu.load_weights('./best_weights_relu_3l.hdf5')


Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 14s - loss: 2.0383 - mean_absolute_error: 1.0240 - val_loss: 0.4214 - val_mean_absolute_error: 0.5161

Epoch 00001: val_loss improved from inf to 0.42140, saving model to ./best_weights_relu_3l.hdf5
Epoch 2/100
 - 5s - loss: 0.3085 - mean_absolute_error: 0.4299 - val_loss: 0.3570 - val_mean_absolute_error: 0.4703

Epoch 00002: val_loss improved from 0.42140 to 0.35701, saving model to ./best_weights_relu_3l.hdf5
Epoch 3/100
 - 5s - loss: 0.2127 - mean_absolute_error: 0.3539 - val_loss: 0.3450 - val_mean_absolute_error: 0.4583

Epoch 00003: val_loss improved from 0.35701 to 0.34503, saving model to ./best_weights_relu_3l.hdf5
Epoch 4/100
 - 5s - loss: 0.1721 - mean_absolute_error: 0.3152 - val_loss: 0.3581 - val_mean_absolute_error: 0.4639

Epoch 00004: val_loss did not improve from 0.34503
Epoch 5/100
 - 5s - loss: 0.1401 - mean_absolute_error: 0.2845 - val_loss: 0.3612 - val_mean_absolute_error: 0.4692

Epoch 00005: val_lo


Epoch 00005: val_loss did not improve from 0.32738
Epoch 6/100
 - 8s - loss: 0.0068 - mean_absolute_error: 0.0630 - val_loss: 0.3289 - val_mean_absolute_error: 0.4461

Epoch 00006: val_loss did not improve from 0.32738
Epoch 00006: early stopping
Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 17s - loss: 0.0069 - mean_absolute_error: 0.0627 - val_loss: 0.3268 - val_mean_absolute_error: 0.4441

Epoch 00001: val_loss improved from 0.32738 to 0.32682, saving model to ./best_weights_relu_3l.hdf5
Epoch 2/100
 - 5s - loss: 0.0065 - mean_absolute_error: 0.0623 - val_loss: 0.3288 - val_mean_absolute_error: 0.4447

Epoch 00002: val_loss did not improve from 0.32682
Epoch 3/100
 - 5s - loss: 0.0074 - mean_absolute_error: 0.0658 - val_loss: 0.3249 - val_mean_absolute_error: 0.4416

Epoch 00003: val_loss improved from 0.32682 to 0.32490, saving model to ./best_weights_relu_3l.hdf5
Epoch 4/100
 - 6s - loss: 0.0082 - mean_absolute_error: 0.0697 - val_loss: 0.3232 - val_mean_absolute

In [210]:
# Predict stars
pred_reg_hl_3 = model_reg_relu.predict(x_test_reg)

In [211]:
#Display 10 business

for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],y_test_reg[i],pred_reg_hl_3[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 5.0, predicted Stars: [5.205835]
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 3.5, predicted Stars: [3.612089]
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 4.5, predicted Stars: [4.167994]
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 2.5, predicted Stars: [2.5880194]
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 4.5, predicted Stars: [4.0089335]
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 3.5, predicted Stars: [3.809264]
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 3.0, predicted Stars: [3.5154214]
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 3.0, predicted Stars: [3.4075544]
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 3.5, predicted Stars: [3.7099614]
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 4.0, predicted Stars: [4.588487]


In [212]:
# Measure RMSE error.  RMSE is common for regression.
score_3l = np.sqrt(mean_squared_error(y_test_reg,pred_reg_hl_3))
print("Final score (RMSE): {}".format(score_3l))
print('R2 score: %.2f' % r2_score(y_test_reg, pred_reg_hl_3))

Final score (RMSE): 0.5597916841506958
R2 score: 0.70


In [213]:
# set up checkpointer
checkpointer_relu = ModelCheckpoint(filepath="./best_weights_relu_4l.hdf5", verbose=1, save_best_only=True)

In [214]:
# Tensorflow model for Regression with sigmoid and forward approach with 4 hidden layers
model_reg_relu = Sequential()

model_reg_relu.add(Dense(80, input_dim=x_train_reg.shape[1], activation='relu'))  
model_reg_relu.add(Dense(60, activation='relu')) # Hidden 2
model_reg_relu.add(Dense(20, activation='relu')) # Hidden 3
model_reg_relu.add(Dense(10, activation='relu')) # Hidden 4
model_reg_relu.add(Dense(1)) # Output


In [215]:
for i in range(10):
    model_reg_relu.compile(loss='mean_squared_error', optimizer='adam')
    model_reg_relu.fit(x_train_reg,y_train_reg,validation_data=(x_test_reg,y_test_reg),callbacks=[monitor,checkpointer_relu],verbose=2,epochs=100) 

model_reg_relu.load_weights('./best_weights_relu_4l.hdf5')


Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 20s - loss: 1.5814 - mean_absolute_error: 0.8688 - val_loss: 0.3751 - val_mean_absolute_error: 0.4818

Epoch 00001: val_loss improved from inf to 0.37512, saving model to ./best_weights_relu_4l.hdf5
Epoch 2/100
 - 7s - loss: 0.2693 - mean_absolute_error: 0.4017 - val_loss: 0.3414 - val_mean_absolute_error: 0.4545

Epoch 00002: val_loss improved from 0.37512 to 0.34138, saving model to ./best_weights_relu_4l.hdf5
Epoch 3/100
 - 7s - loss: 0.1912 - mean_absolute_error: 0.3339 - val_loss: 0.3490 - val_mean_absolute_error: 0.4589

Epoch 00003: val_loss did not improve from 0.34138
Epoch 4/100
 - 7s - loss: 0.1484 - mean_absolute_error: 0.2911 - val_loss: 0.3491 - val_mean_absolute_error: 0.4596

Epoch 00004: val_loss did not improve from 0.34138
Epoch 5/100
 - 7s - loss: 0.1166 - mean_absolute_error: 0.2600 - val_loss: 0.3547 - val_mean_absolute_error: 0.4658

Epoch 00005: val_loss did not improve from 0.34138
Epoch 00005: earl

Epoch 3/100
 - 7s - loss: 0.0077 - mean_absolute_error: 0.0675 - val_loss: 0.3108 - val_mean_absolute_error: 0.4313

Epoch 00003: val_loss did not improve from 0.30886
Epoch 4/100
 - 7s - loss: 0.0066 - mean_absolute_error: 0.0622 - val_loss: 0.3085 - val_mean_absolute_error: 0.4306

Epoch 00004: val_loss improved from 0.30886 to 0.30847, saving model to ./best_weights_relu_4l.hdf5
Epoch 5/100
 - 8s - loss: 0.0064 - mean_absolute_error: 0.0612 - val_loss: 0.3091 - val_mean_absolute_error: 0.4306

Epoch 00005: val_loss did not improve from 0.30847
Epoch 00005: early stopping
Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 22s - loss: 0.0067 - mean_absolute_error: 0.0623 - val_loss: 0.3096 - val_mean_absolute_error: 0.4306

Epoch 00001: val_loss did not improve from 0.30847
Epoch 2/100
 - 6s - loss: 0.0063 - mean_absolute_error: 0.0611 - val_loss: 0.3140 - val_mean_absolute_error: 0.4353

Epoch 00002: val_loss did not improve from 0.30847
Epoch 3/100
 - 6s - loss: 0.0073 -

In [216]:
# Predict stars
pred_reg_hl_4 = model_reg_relu.predict(x_test_reg)

In [217]:
#Display 10 business

for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],y_test_reg[i],pred_reg_hl_4[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 5.0, predicted Stars: [5.336504]
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 3.5, predicted Stars: [3.6343367]
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 4.5, predicted Stars: [3.9705586]
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 2.5, predicted Stars: [2.8183677]
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 4.5, predicted Stars: [4.2699523]
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 3.5, predicted Stars: [3.806445]
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 3.0, predicted Stars: [3.8777049]
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 3.0, predicted Stars: [3.33152]
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 3.5, predicted Stars: [3.7277024]
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 4.0, predicted Stars: [4.5496783]


In [218]:
# Measure RMSE error.  RMSE is common for regression.
score_4l = np.sqrt(mean_squared_error(y_test_reg,pred_reg_hl_4))
print("Final score (RMSE): {}".format(score_4l))
print('R2 score: %.2f' % r2_score(y_test_reg, pred_reg_hl_4))

Final score (RMSE): 0.5487036108970642
R2 score: 0.71


In [219]:
# set up checkpointer
checkpointer_relu = ModelCheckpoint(filepath="./best_weights_relu_5l.hdf5", verbose=1, save_best_only=True)

In [220]:
# Tensorflow model for Regression with sigmoid and forward approach with 5 hidden layers
model_reg_relu = Sequential()

model_reg_relu.add(Dense(80, input_dim=x_train_reg.shape[1], activation='relu'))  
model_reg_relu.add(Dense(60, activation='relu')) # Hidden 2
model_reg_relu.add(Dense(40, activation='relu')) # Hidden 2
model_reg_relu.add(Dense(20, activation='relu')) # Hidden 2
model_reg_relu.add(Dense(10, activation='relu')) # Hidden 2
model_reg_relu.add(Dense(1)) # Output


In [221]:
for i in range(10):
    model_reg_relu.compile(loss='mean_squared_error', optimizer='adam')
    model_reg_relu.fit(x_train_reg,y_train_reg,validation_data=(x_test_reg,y_test_reg),callbacks=[monitor,checkpointer_relu],verbose=2,epochs=100) 

model_reg_relu.load_weights('./best_weights_relu_5l.hdf5')


Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 19s - loss: 1.5056 - mean_absolute_error: 0.8583 - val_loss: 0.4115 - val_mean_absolute_error: 0.5049

Epoch 00001: val_loss improved from inf to 0.41153, saving model to ./best_weights_relu_5l.hdf5
Epoch 2/100
 - 7s - loss: 0.2810 - mean_absolute_error: 0.4141 - val_loss: 0.3467 - val_mean_absolute_error: 0.4621

Epoch 00002: val_loss improved from 0.41153 to 0.34671, saving model to ./best_weights_relu_5l.hdf5
Epoch 3/100
 - 7s - loss: 0.1910 - mean_absolute_error: 0.3346 - val_loss: 0.3379 - val_mean_absolute_error: 0.4516

Epoch 00003: val_loss improved from 0.34671 to 0.33792, saving model to ./best_weights_relu_5l.hdf5
Epoch 4/100
 - 6s - loss: 0.1459 - mean_absolute_error: 0.2903 - val_loss: 0.3441 - val_mean_absolute_error: 0.4567

Epoch 00004: val_loss did not improve from 0.33792
Epoch 5/100
 - 7s - loss: 0.1144 - mean_absolute_error: 0.2577 - val_loss: 0.3523 - val_mean_absolute_error: 0.4644

Epoch 00005: val_lo


Epoch 00001: val_loss did not improve from 0.31455
Epoch 2/100
 - 7s - loss: 0.0069 - mean_absolute_error: 0.0636 - val_loss: 0.3161 - val_mean_absolute_error: 0.4329

Epoch 00002: val_loss did not improve from 0.31455
Epoch 3/100
 - 7s - loss: 0.0076 - mean_absolute_error: 0.0663 - val_loss: 0.3188 - val_mean_absolute_error: 0.4333

Epoch 00003: val_loss did not improve from 0.31455
Epoch 4/100
 - 7s - loss: 0.0078 - mean_absolute_error: 0.0668 - val_loss: 0.3174 - val_mean_absolute_error: 0.4329

Epoch 00004: val_loss did not improve from 0.31455
Epoch 5/100
 - 8s - loss: 0.0073 - mean_absolute_error: 0.0643 - val_loss: 0.3176 - val_mean_absolute_error: 0.4338

Epoch 00005: val_loss did not improve from 0.31455
Epoch 00005: early stopping
Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 18s - loss: 0.0064 - mean_absolute_error: 0.0605 - val_loss: 0.3172 - val_mean_absolute_error: 0.4315

Epoch 00001: val_loss did not improve from 0.31455
Epoch 2/100
 - 7s - loss: 0.006

In [222]:
# Predict stars
pred_reg_hl_5 = model_reg_relu.predict(x_test_reg)

In [223]:
#Display 10 business

for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],y_test_reg[i],pred_reg_hl_4[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 5.0, predicted Stars: [5.336504]
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 3.5, predicted Stars: [3.6343367]
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 4.5, predicted Stars: [3.9705586]
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 2.5, predicted Stars: [2.8183677]
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 4.5, predicted Stars: [4.2699523]
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 3.5, predicted Stars: [3.806445]
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 3.0, predicted Stars: [3.8777049]
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 3.0, predicted Stars: [3.33152]
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 3.5, predicted Stars: [3.7277024]
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 4.0, predicted Stars: [4.5496783]


In [224]:
# Measure RMSE error.  RMSE is common for regression.
score_5l = np.sqrt(mean_squared_error(y_test_reg,pred_reg_hl_5))
print("Final score (RMSE): {}".format(score_5l))
print('R2 score: %.2f' % r2_score(y_test_reg, pred_reg_hl_5))

Final score (RMSE): 0.5536472797393799
R2 score: 0.70


# TensorFlow Model for Classification

In [117]:
#train test data for linear regression

x_train_lr, x_test_lr, y_train_lr, y_test_lr = train_test_split(x_matrix_minmax, merge_df['encoded_stars'] , test_size=0.2, random_state=42)


In [122]:
#implementing Nearest Neighbor

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=1)

knn.fit(x_train_lr, y_train_lr) 

y_pred_knn = knn.predict(x_test_lr)

In [127]:
# list  the business with the stars and prediction

for i in range(0,10):
    idx=y_test_lr.index[i]
    print("business id - %s actual stars label - %d predicted - %d" 
          %(merge_df['business_id'][idx], y_test_lr[idx], y_pred_knn[i]))

business id - EmezZdxbvjydG5FkN6Mecw actual stars label - 2 predicted - 4
business id - cmaPrML-0zCJOs8_1VYmaw actual stars label - 7 predicted - 6
business id - E6U8zl527AsspbTf5nZCdw actual stars label - 4 predicted - 5
business id - zXRf_6Bs1yX9an_QKpzbHQ actual stars label - 2 predicted - 2
business id - vllzSssD2HXGlzGUcITxhw actual stars label - 6 predicted - 7
business id - AcGRSWCpb7YB95MTsHlGEw actual stars label - 2 predicted - 2
business id - zfEcOCrgUKe8xYOdqNVmmA actual stars label - 5 predicted - 7
business id - z-q6Wu-L-iDCftYVfoElPw actual stars label - 8 predicted - 7
business id - K7c5wAhxd6CqtmBmY47c7g actual stars label - 5 predicted - 7
business id - b30HREePgMGPZMPaExTZSA actual stars label - 6 predicted - 8


In [128]:
# Metrics

score_acc_knn = accuracy_score(true_stars, predict_stars)
score_f1_knn =
score_precision_knn = 




Final score (RMSE): 1.8179962102383167
R2 score: 0.17


In [131]:
# SVM

svm_model = SVC(kernel="linear")

svm_model.fit(x_train_lr, y_train_lr)

y_pred_svm = svm_model.predict(x_test_lr)

In [132]:
# list  the business with the stars and prediction

for i in range(0,10):
    idx=y_test_lr.index[i]
    print("business id - %s actual stars label - %d predicted - %d" 
          %(merge_df['business_id'][idx], y_test_lr[idx], y_pred_svm[i]))

business id - EmezZdxbvjydG5FkN6Mecw actual stars label - 2 predicted - 6
business id - cmaPrML-0zCJOs8_1VYmaw actual stars label - 7 predicted - 6
business id - E6U8zl527AsspbTf5nZCdw actual stars label - 4 predicted - 6
business id - zXRf_6Bs1yX9an_QKpzbHQ actual stars label - 2 predicted - 6
business id - vllzSssD2HXGlzGUcITxhw actual stars label - 6 predicted - 6
business id - AcGRSWCpb7YB95MTsHlGEw actual stars label - 2 predicted - 6
business id - zfEcOCrgUKe8xYOdqNVmmA actual stars label - 5 predicted - 6
business id - z-q6Wu-L-iDCftYVfoElPw actual stars label - 8 predicted - 6
business id - K7c5wAhxd6CqtmBmY47c7g actual stars label - 5 predicted - 6
business id - b30HREePgMGPZMPaExTZSA actual stars label - 6 predicted - 6


In [133]:
# Measure RMSE error.  RMSE is common for regression.
score_svm = np.sqrt(mean_squared_error(y_test_lr,y_pred_svm))
print("Final score (RMSE): {}".format(score_svm))
print('R2 score: %.2f' % r2_score(y_test_lr, y_pred_svm))

Final score (RMSE): 2.14247062073706
R2 score: -0.16


In [135]:
mnb_model = MultinomialNB()

mnb_model.fit(x_train_lr, y_train_lr)

y_pred_mnb = mnb_model.predict(x_test_lr)

In [136]:
# list  the business with the stars and prediction

for i in range(0,10):
    idx=y_test_lr.index[i]
    print("business id - %s actual stars label - %d predicted - %d" 
          %(merge_df['business_id'][idx], y_test_lr[idx], y_pred_mnb[i]))

business id - EmezZdxbvjydG5FkN6Mecw actual stars label - 2 predicted - 5
business id - cmaPrML-0zCJOs8_1VYmaw actual stars label - 7 predicted - 6
business id - E6U8zl527AsspbTf5nZCdw actual stars label - 4 predicted - 5
business id - zXRf_6Bs1yX9an_QKpzbHQ actual stars label - 2 predicted - 3
business id - vllzSssD2HXGlzGUcITxhw actual stars label - 6 predicted - 5
business id - AcGRSWCpb7YB95MTsHlGEw actual stars label - 2 predicted - 5
business id - zfEcOCrgUKe8xYOdqNVmmA actual stars label - 5 predicted - 8
business id - z-q6Wu-L-iDCftYVfoElPw actual stars label - 8 predicted - 8
business id - K7c5wAhxd6CqtmBmY47c7g actual stars label - 5 predicted - 5
business id - b30HREePgMGPZMPaExTZSA actual stars label - 6 predicted - 7


In [137]:
# Measure RMSE error.  RMSE is common for regression.
score_mnb = np.sqrt(mean_squared_error(y_test_lr,y_pred_mnb))
print("Final score (RMSE): {}".format(score_mnb))
print('R2 score: %.2f' % r2_score(y_test_lr, y_pred_mnb))

Final score (RMSE): 1.8771113696817183
R2 score: 0.11


In [149]:
# one-hot cooding of postal codes 

hotcoded_stars_df = pd.get_dummies(merge_df['encoded_stars'], sparse = 'true')

In [150]:
y_stars_encoded = hotcoded_stars_df.values.astype(np.float32)

In [151]:
#train test data for linear regression

x_train_lr, x_test_lr, y_train_lr, y_test_lr = train_test_split(x_matrix_minmax, y_stars_encoded, test_size=0.2, random_state=42)


In [152]:
#Tensorflow classification

model_class = Sequential()
model_class.add(Dense(50, input_dim=x_train_lr.shape[1], activation='relu')) # Hidden 1
model_class.add(Dense(25, activation='relu')) # Hidden 2
model_class.add(Dense(y_train_lr.shape[1], activation='softmax')) # Output

model_class.compile(loss='categorical_crossentropy', optimizer='adam')

model_class.fit(x_train_lr,y_train_lr,verbose=2,epochs=50)

Epoch 1/50
 - 7s - loss: 1.8531
Epoch 2/50
 - 2s - loss: 1.3560
Epoch 3/50
 - 3s - loss: 1.1827
Epoch 4/50
 - 2s - loss: 1.1068
Epoch 5/50
 - 2s - loss: 1.0512
Epoch 6/50
 - 3s - loss: 1.0012
Epoch 7/50
 - 3s - loss: 0.9498
Epoch 8/50
 - 3s - loss: 0.9047
Epoch 9/50
 - 3s - loss: 0.8601
Epoch 10/50
 - 3s - loss: 0.8171
Epoch 11/50
 - 3s - loss: 0.7747
Epoch 12/50
 - 3s - loss: 0.7327
Epoch 13/50
 - 2s - loss: 0.6969
Epoch 14/50
 - 2s - loss: 0.6568
Epoch 15/50
 - 3s - loss: 0.6279
Epoch 16/50
 - 3s - loss: 0.5913
Epoch 17/50
 - 2s - loss: 0.5602
Epoch 18/50
 - 2s - loss: 0.5291
Epoch 19/50
 - 2s - loss: 0.5024
Epoch 20/50
 - 2s - loss: 0.4717
Epoch 21/50
 - 3s - loss: 0.4463
Epoch 22/50
 - 3s - loss: 0.4158
Epoch 23/50
 - 4s - loss: 0.3908
Epoch 24/50
 - 3s - loss: 0.3637
Epoch 25/50
 - 3s - loss: 0.3429
Epoch 26/50
 - 3s - loss: 0.3217
Epoch 27/50
 - 3s - loss: 0.2984
Epoch 28/50
 - 3s - loss: 0.2778
Epoch 29/50
 - 3s - loss: 0.2547
Epoch 30/50
 - 3s - loss: 0.2369
Epoch 31/50
 - 4s -

<keras.callbacks.History at 0x203d94f1908>

In [153]:
pred_class = model_class.predict(x_test_lr)
print("Shape: {}".format(pred_class.shape))
print(pred_class)

Shape: (1996, 9)
[[4.8480987e-21 8.7688440e-11 3.6708113e-07 ... 4.0000780e-11
  2.0035622e-11 1.7034164e-19]
 [0.0000000e+00 4.3768849e-33 1.0997931e-20 ... 9.6302813e-01
  3.6949817e-02 2.2013457e-05]
 [7.5696492e-22 1.4770561e-10 3.0735318e-05 ... 7.0913511e-06
  2.4185019e-06 3.8029577e-14]
 ...
 [0.0000000e+00 1.8693442e-18 2.8503936e-14 ... 4.6418074e-09
  1.4032820e-11 7.6944250e-20]
 [8.6572583e-28 7.0216779e-12 2.6254143e-06 ... 1.1362669e-02
  5.3011975e-03 1.7852780e-11]
 [0.0000000e+00 6.6005226e-31 1.1835784e-19 ... 9.8131645e-01
  1.8683216e-02 3.7645924e-07]]


In [154]:
predict_stars = np.argmax(pred_class,axis=1)

true_stars = np.argmax(y_test_lr,axis=1)

In [155]:
for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],true_stars[i],predict_stars[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 2, predicted Stars: 3
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 7, predicted Stars: 6
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 4, predicted Stars: 4
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 2, predicted Stars: 1
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 6, predicted Stars: 6
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 2, predicted Stars: 3
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 5, predicted Stars: 8
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 8, predicted Stars: 7
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 5, predicted Stars: 2
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 6, predicted Stars: 8


In [156]:
#accuracy  

correct = accuracy_score(true_stars, predict_stars)
print("Accuracy: {}".format(correct))

Accuracy: 0.4308617234468938


In [157]:
#train test data for linear regression

x_train_tf, x_test_tf, y_train_tf, y_test_tf = train_test_split(x_matrix_final, y_stars_encoded, test_size=0.2, random_state=42)


In [158]:
#Tensorflow classification

model_class = Sequential()
model_class.add(Dense(50, input_dim=x_train_tf.shape[1], activation='relu')) # Hidden 1
model_class.add(Dense(25, activation='relu')) # Hidden 2
model_class.add(Dense(y_train_lr.shape[1], activation='softmax')) # Output

model_class.compile(loss='categorical_crossentropy', optimizer='adam')

model_class.fit(x_train_tf,y_train_tf,verbose=2,epochs=50)

Epoch 1/50
 - 9s - loss: 1.7639
Epoch 2/50
 - 3s - loss: 1.2489
Epoch 3/50
 - 3s - loss: 1.0005
Epoch 4/50
 - 4s - loss: 0.8068
Epoch 5/50
 - 4s - loss: 0.6362
Epoch 6/50
 - 4s - loss: 0.4994
Epoch 7/50
 - 5s - loss: 0.3911
Epoch 8/50
 - 4s - loss: 0.3061
Epoch 9/50
 - 5s - loss: 0.2414
Epoch 10/50
 - 4s - loss: 0.1890
Epoch 11/50
 - 3s - loss: 0.1474
Epoch 12/50
 - 3s - loss: 0.1160
Epoch 13/50
 - 4s - loss: 0.0884
Epoch 14/50
 - 3s - loss: 0.0667
Epoch 15/50
 - 4s - loss: 0.0512
Epoch 16/50
 - 4s - loss: 0.0377
Epoch 17/50
 - 4s - loss: 0.0297
Epoch 18/50
 - 3s - loss: 0.0219
Epoch 19/50
 - 4s - loss: 0.0166
Epoch 20/50
 - 3s - loss: 0.0124
Epoch 21/50
 - 3s - loss: 0.0098
Epoch 22/50
 - 3s - loss: 0.0077
Epoch 23/50
 - 4s - loss: 0.0063
Epoch 24/50
 - 3s - loss: 0.0053
Epoch 25/50
 - 4s - loss: 0.0044
Epoch 26/50
 - 4s - loss: 0.0035
Epoch 27/50
 - 3s - loss: 0.0030
Epoch 28/50
 - 3s - loss: 0.0024
Epoch 29/50
 - 4s - loss: 0.0020
Epoch 30/50
 - 3s - loss: 0.0017
Epoch 31/50
 - 3s -

<keras.callbacks.History at 0x20517897dd8>

In [159]:
pred_class = model_class.predict(x_test_tf)
print("Shape: {}".format(pred_class.shape))
print(pred_class)

Shape: (1996, 9)
[[2.5405296e-12 1.2393942e-07 5.0602633e-10 ... 1.8107455e-22
  2.6426383e-32 5.0162187e-34]
 [0.0000000e+00 1.0104799e-32 1.3840434e-30 ... 8.7040478e-01
  1.2959516e-01 4.5909987e-09]
 [4.0285604e-26 2.4749689e-16 2.5602374e-13 ... 2.4799876e-12
  2.1567378e-17 1.5830178e-22]
 ...
 [1.1329312e-22 5.1598427e-15 4.2198586e-10 ... 5.1856097e-07
  6.5324351e-16 8.3942530e-22]
 [7.1796776e-26 1.9210520e-17 5.3053065e-25 ... 1.0000000e+00
  3.6923738e-17 1.1553852e-19]
 [0.0000000e+00 1.3095930e-35 1.7397419e-33 ... 8.5242641e-07
  9.9914658e-01 8.5237162e-04]]


In [160]:
predict_stars = np.argmax(pred_class,axis=1)

true_stars = np.argmax(y_test_tf,axis=1)

In [161]:
for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],true_stars[i],predict_stars[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 2, predicted Stars: 3
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 7, predicted Stars: 6
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 4, predicted Stars: 5
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 2, predicted Stars: 2
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 6, predicted Stars: 6
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 2, predicted Stars: 2
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 5, predicted Stars: 6
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 8, predicted Stars: 8
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 5, predicted Stars: 3
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 6, predicted Stars: 8


In [162]:
#accuracy  

correct = accuracy_score(true_stars, predict_stars)
print("Accuracy: {}".format(correct))

Accuracy: 0.3787575150300601


# Additional Features

In [225]:
%matplotlib inline
from matplotlib.pyplot import figure, show
from sklearn import metrics
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Dropout
from keras import regularizers
from keras.models import Sequential

In [226]:
#train test data
x_train_reg, x_test_reg, y_train_reg, y_test_reg = train_test_split(x_matrix_minmax, y_stars_regression , test_size=0.2)

In [227]:
#USe linear regression for regularization
model_regularization = Sequential()
model_regularization.add(Dense(50, input_dim=x_train_reg.shape[1], activation='relu'))
model_regularization.add(Dense(25, activation='relu'))
model_regularization.add(Dense(10, 
                kernel_regularizer=regularizers.l1(0.01),
                activity_regularizer=regularizers.l2(0.01), activation='relu'))
model_regularization.add(Dense(1)) 
model_regularization.compile(loss='mean_squared_error', optimizer='adam')

In [228]:
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')

model_regularization.fit(x_train_reg,y_train_reg,validation_data=(x_test_reg,y_test_reg),callbacks=[monitor],verbose=2,epochs=100)

pred_regularization = model_regularization.predict(x_test_reg)

Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 20s - loss: 4.3367 - val_loss: 2.2657
Epoch 2/100
 - 6s - loss: 1.7097 - val_loss: 1.3561
Epoch 3/100
 - 6s - loss: 1.1760 - val_loss: 1.0891
Epoch 4/100
 - 5s - loss: 0.9575 - val_loss: 0.9274
Epoch 5/100
 - 5s - loss: 0.8107 - val_loss: 0.8123
Epoch 6/100
 - 5s - loss: 0.6907 - val_loss: 0.7237
Epoch 7/100
 - 5s - loss: 0.5859 - val_loss: 0.6551
Epoch 8/100
 - 6s - loss: 0.4999 - val_loss: 0.6038
Epoch 9/100
 - 6s - loss: 0.4322 - val_loss: 0.5663
Epoch 10/100
 - 6s - loss: 0.3755 - val_loss: 0.5377
Epoch 11/100
 - 5s - loss: 0.3289 - val_loss: 0.5132
Epoch 12/100
 - 6s - loss: 0.2891 - val_loss: 0.4969
Epoch 13/100
 - 5s - loss: 0.2566 - val_loss: 0.4748
Epoch 14/100
 - 6s - loss: 0.2272 - val_loss: 0.4554
Epoch 15/100
 - 6s - loss: 0.2044 - val_loss: 0.4384
Epoch 16/100
 - 6s - loss: 0.1839 - val_loss: 0.4344
Epoch 17/100
 - 6s - loss: 0.1670 - val_loss: 0.4177
Epoch 18/100
 - 6s - loss: 0.1513 - val_loss: 0.4044
Epoch 

In [229]:
# Measure RMSE error.  RMSE is common for regression.
score_regularization = np.sqrt(metrics.mean_squared_error(pred_regularization,y_test_reg))
print("Final score (RMSE): {}".format(score_regularization))

Final score (RMSE): 0.5542775392532349


In [230]:
# Dropout

model_dropout = Sequential()
model_dropout.add(Dense(50, input_dim=x_train_reg.shape[1]))
model_dropout.add(Dropout(0.1))

model_dropout.add(Dense(25, activation='relu'))
model_dropout.add(Dense(10, activation='relu'))
model_dropout.add(Dense(1))

model_dropout.compile(loss='mean_squared_error', optimizer='adam')

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
model_dropout.fit(x_train_reg,y_train_reg,validation_data=(x_test_reg,y_test_reg),callbacks=[monitor],verbose=0,epochs=1000)

pred_dropout = model_dropout.predict(x_test_reg)

# Measure RMSE error.  RMSE is common for regression.
score_dropout = np.sqrt(metrics.mean_squared_error(pred_dropout,y_test_reg))
print("Final score (RMSE): {}".format(score_dropout))

Epoch 00017: early stopping
Final score (RMSE): 0.49788129329681396


In [231]:
model_dropout.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_61 (Dense)             (None, 50)                50100     
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_62 (Dense)             (None, 25)                1275      
_________________________________________________________________
dense_63 (Dense)             (None, 10)                260       
_________________________________________________________________
dense_64 (Dense)             (None, 1)                 11        
Total params: 51,646
Trainable params: 51,646
Non-trainable params: 0
_________________________________________________________________


In [232]:
#Checking to see if regularization helps with over fitting when using postal code and categories and one hot coded values
#train test data
x_train_reg_ad, x_test_reg_ad, y_train_reg_ad, y_test_reg_ad = train_test_split(x_matrix_final, y_stars_regression , test_size=0.2)


In [238]:
#USe linear regression for regularization
model_regularization_ad = Sequential()
model_regularization_ad.add(Dense(50, input_dim=x_train_reg_ad.shape[1], activation='relu'))
model_regularization_ad.add(Dense(25, activation='relu'))
model_regularization_ad.add(Dense(10, 
                kernel_regularizer=regularizers.l1(0.01),
                activity_regularizer=regularizers.l2(0.01), activation='relu'))
model_regularization_ad.add(Dense(1)) 
model_regularization_ad.compile(loss='mean_squared_error', optimizer='adam')

In [239]:
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')

model_regularization_ad.fit(x_train_reg_ad,y_train_reg_ad,validation_data=(x_test_reg_ad,y_test_reg_ad),callbacks=[monitor],verbose=2,epochs=100)

pred_regularization_ad = model_regularization_ad.predict(x_test_reg_ad)

Train on 7981 samples, validate on 1996 samples
Epoch 1/100
 - 18s - loss: 4.4570 - val_loss: 2.4116
Epoch 2/100
 - 6s - loss: 1.8795 - val_loss: 1.6409
Epoch 3/100
 - 5s - loss: 1.3024 - val_loss: 1.3105
Epoch 4/100
 - 6s - loss: 0.9978 - val_loss: 1.0954
Epoch 5/100
 - 5s - loss: 0.7967 - val_loss: 0.9675
Epoch 6/100
 - 6s - loss: 0.6554 - val_loss: 0.8644
Epoch 7/100
 - 5s - loss: 0.5439 - val_loss: 0.7992
Epoch 8/100
 - 6s - loss: 0.4576 - val_loss: 0.7301
Epoch 9/100
 - 5s - loss: 0.3906 - val_loss: 0.6778
Epoch 10/100
 - 6s - loss: 0.3365 - val_loss: 0.6339
Epoch 11/100
 - 5s - loss: 0.2923 - val_loss: 0.6078
Epoch 12/100
 - 6s - loss: 0.2562 - val_loss: 0.5776
Epoch 13/100
 - 6s - loss: 0.2272 - val_loss: 0.5579
Epoch 14/100
 - 6s - loss: 0.2018 - val_loss: 0.5334
Epoch 15/100
 - 5s - loss: 0.1801 - val_loss: 0.5166
Epoch 16/100
 - 6s - loss: 0.1625 - val_loss: 0.5018
Epoch 17/100
 - 5s - loss: 0.1473 - val_loss: 0.4803
Epoch 18/100
 - 6s - loss: 0.1350 - val_loss: 0.4686
Epoch 

In [240]:
# Measure RMSE error.  RMSE is common for regression.
score_regularization_ad = np.sqrt(metrics.mean_squared_error(pred_regularization_ad,y_test_reg_ad))
print("Final score (RMSE): {}".format(score_regularization_ad))

Final score (RMSE): 0.5967134833335876


In [244]:
# Dropout

model_dropout_ad = Sequential()
model_dropout_ad.add(Dense(50, input_dim=x_train_reg_ad.shape[1]))
model_dropout_ad.add(Dropout(0.1))

model_dropout_ad.add(Dense(25, activation='relu'))
model_dropout_ad.add(Dense(10, activation='relu'))
model_dropout_ad.add(Dense(1))

model_dropout_ad.compile(loss='mean_squared_error', optimizer='adam')

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
model_dropout_ad.fit(x_train_reg_ad,y_train_reg_ad,validation_data=(x_test_reg_ad,y_test_reg_ad),callbacks=[monitor],verbose=0,epochs=1000)

pred_dropout_ad = model_dropout_ad.predict(x_test_reg_ad)

# Measure RMSE error.  RMSE is common for regression.
score_dropout_ad = np.sqrt(metrics.mean_squared_error(pred_dropout_ad,y_test_reg_ad))
print("Final score (RMSE): {}".format(score_dropout_ad))

Epoch 00009: early stopping
Final score (RMSE): 0.6171209812164307


In [245]:
model_dropout_ad.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_85 (Dense)             (None, 50)                275400    
_________________________________________________________________
dropout_5 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_86 (Dense)             (None, 25)                1275      
_________________________________________________________________
dense_87 (Dense)             (None, 10)                260       
_________________________________________________________________
dense_88 (Dense)             (None, 1)                 11        
Total params: 276,946
Trainable params: 276,946
Non-trainable params: 0
_________________________________________________________________
