In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# import all libraries

import json
import csv
import pandas as pd
import numpy as np
import scipy as sci
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping, ModelCheckpoint
import io
import requests
import tensorflow as tf
from scipy import sparse
from sklearn.model_selection import train_test_split
import sklearn.feature_extraction.text as sk_text
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score
import collections
from sklearn import preprocessing
import matplotlib.pyplot as plt
import shutil
import os

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [3]:
# functions given in the lab

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)



# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, collections.Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low

In [4]:
# Create dataframes and creating merge_df file

business_df= pd.read_csv('business_postal.tsv', delimiter ="\t")
review_df= pd.read_csv('review_stars.tsv', delimiter ="\t")
review_agg_df = review_df.groupby('business_id')['text'].sum()
df_ready_for_sklearn = pd.DataFrame({'business_id': review_agg_df.index, 'all_reviews': review_agg_df.values})
merge_df = pd.merge(business_df, df_ready_for_sklearn, on='business_id')
print(merge_df.dtypes)
print(merge_df.shape)
#merge_df = merge_df[0:10000]
print(merge_df.dtypes)
print(merge_df.shape)

business_id      object
categories       object
stars           float64
review_count      int64
postal code      object
all_reviews      object
dtype: object
(188593, 6)
business_id      object
categories       object
stars           float64
review_count      int64
postal code      object
all_reviews      object
dtype: object
(188593, 6)


In [5]:
# removing NaN categories

merge_df = merge_df[merge_df['categories'].notnull()]

In [6]:
merge_df.shape

(188052, 6)

In [7]:
merge_df.to_csv('merge_df.tsv', sep='\t', encoding='utf-8')

In [8]:
merge_df.head()

Unnamed: 0,business_id,categories,stars,review_count,postal code,all_reviews
0,Apn5Q_b6Nz61Tq4XzPdf9A,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",4.0,24,T2E 6L6,b'Great place. Major flaw is how early it clos...
1,AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",4.5,3,89002,"b""CK's BBQ is off the charts best BBQ I have E..."
2,O8S5hYJ1SMc8fA4QBtVujA,"Breakfast & Brunch, Restaurants, French, Sandw...",4.0,5,H2G 1K7,"b""La nourriture est excellente, le service Imp..."
3,bFzdJJ3wp3PZssNEsyU23g,"Insurance, Financial Services",1.5,8,85003,b'GEICO for auto is great! But they are really...
4,8USyCYqpScwiNEb58Bt6CA,"Home & Garden, Nurseries & Gardening, Shopping...",2.0,4,T2H 0N5,"b""This listing was originally under gardening ..."


In [9]:
#Normalization of review count field so it becomes comparable and remove bias
merge_df.insert(3,'normalized_count',((merge_df['review_count'] - merge_df['review_count'].min()) / (merge_df['review_count'].max() - merge_df['review_count'].min())).astype(float))
merge_df.drop('review_count', axis=1, inplace=True)

In [10]:
merge_df.head()

Unnamed: 0,business_id,categories,stars,normalized_count,postal code,all_reviews
0,Apn5Q_b6Nz61Tq4XzPdf9A,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",4.0,0.002637,T2E 6L6,b'Great place. Major flaw is how early it clos...
1,AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",4.5,0.0,89002,"b""CK's BBQ is off the charts best BBQ I have E..."
2,O8S5hYJ1SMc8fA4QBtVujA,"Breakfast & Brunch, Restaurants, French, Sandw...",4.0,0.000251,H2G 1K7,"b""La nourriture est excellente, le service Imp..."
3,bFzdJJ3wp3PZssNEsyU23g,"Insurance, Financial Services",1.5,0.000628,85003,b'GEICO for auto is great! But they are really...
4,8USyCYqpScwiNEb58Bt6CA,"Home & Garden, Nurseries & Gardening, Shopping...",2.0,0.000126,T2H 0N5,"b""This listing was originally under gardening ..."


In [11]:
#TF-IDF calculation

tfidf = sk_text.TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))

# Fit the reviews column with TFIDFvectorizer
tfidf_matrix = tfidf.fit_transform(merge_df['all_reviews'])
matrix = tfidf_matrix.toarray()

# We are adding the normalized count to the original matrix with TFIDFvectorizer
x_matrix=np.column_stack((matrix, merge_df['normalized_count']))

print(x_matrix.dtype)

float64


In [12]:
x_matrix.shape

(188052, 1001)

In [13]:
y_stars = merge_df['stars'].values.astype(np.float32)

In [14]:
y_stars.shape

(188052,)

In [15]:
#stars need to be encoded for all the models we are going to train
le = preprocessing.LabelEncoder()
merge_df['encoded_stars'] = le.fit_transform(merge_df['stars'])

# one-hot cooding of postal codes 

hotcoded_stars_df = pd.get_dummies(merge_df['encoded_stars'], sparse = 'true')

In [16]:
print(hotcoded_stars_df.head())
print(hotcoded_stars_df.shape)

   0  1  2  3  4  5  6  7  8
0  0  0  0  0  0  0  1  0  0
1  0  0  0  0  0  0  0  1  0
2  0  0  0  0  0  0  1  0  0
3  0  1  0  0  0  0  0  0  0
4  0  0  1  0  0  0  0  0  0
(188052, 9)


In [17]:
y_stars_encoded = hotcoded_stars_df.values.astype(np.float32)

In [18]:
y_stars_encoded.shape[1]

9

In [19]:
merge_df.head()

Unnamed: 0,business_id,categories,stars,normalized_count,postal code,all_reviews,encoded_stars
0,Apn5Q_b6Nz61Tq4XzPdf9A,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",4.0,0.002637,T2E 6L6,b'Great place. Major flaw is how early it clos...,6
1,AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",4.5,0.0,89002,"b""CK's BBQ is off the charts best BBQ I have E...",7
2,O8S5hYJ1SMc8fA4QBtVujA,"Breakfast & Brunch, Restaurants, French, Sandw...",4.0,0.000251,H2G 1K7,"b""La nourriture est excellente, le service Imp...",6
3,bFzdJJ3wp3PZssNEsyU23g,"Insurance, Financial Services",1.5,0.000628,85003,b'GEICO for auto is great! But they are really...,1
4,8USyCYqpScwiNEb58Bt6CA,"Home & Garden, Nurseries & Gardening, Shopping...",2.0,0.000126,T2H 0N5,"b""This listing was originally under gardening ...",2


In [20]:
#train test data for linear regression

x_cont_train, x_cont_test, y_cont_train, y_cont_test = train_test_split(x_matrix, y_stars , test_size=0.2)

# train test dat afor other models

x_enc_train, x_enc_test, y_enc_train, y_enc_test = train_test_split(x_matrix, y_stars_encoded , test_size=0.2)

In [21]:
print("X Train size ")
print(x_cont_train.shape)
print("X Test size ")
print(x_cont_test.shape)
print("y Train size ")
print(y_cont_train.shape)
print("y Test size ")
print(y_cont_test.shape)
print("X Train size ")
print(x_enc_train.shape)
print("X Test size ")
print(x_enc_test.shape)
print("y Train size ")
print(y_enc_train.shape)
print("y Test size ")
print(y_enc_test.shape)

X Train size 
(150441, 1001)
X Test size 
(37611, 1001)
y Train size 
(150441,)
y Test size 
(37611,)
X Train size 
(150441, 1001)
X Test size 
(37611, 1001)
y Train size 
(150441, 9)
y Test size 
(37611, 9)


In [22]:
# setup early stopping monitor

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')

# # Tensorflow model for Regression, Activation -ReLu

In [23]:
# Tensorflow model for Regression

model_reg_relu = Sequential()

model_reg_relu.add(Dense(25, input_dim=x_cont_train.shape[1], activation='relu')) # Hidden 1     #  why input_dim=x.shape[1]?  
model_reg_relu.add(Dense(10, activation='relu')) # Hidden 2
model_reg_relu.add(Dense(1)) # Output

# set up checkpointer
checkpointer = ModelCheckpoint(filepath="./best_weights_relu.hdf5", verbose=1, save_best_only=True)

for i in range(10):
    model_reg_relu.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
    model_reg_relu.fit(x_cont_train,y_cont_train,validation_data=(x_cont_test,y_cont_test),callbacks=[monitor,checkpointer],verbose=1,epochs=100) 

model_reg_relu.load_weights('./best_weights_relu.hdf5')

Train on 150441 samples, validate on 37611 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.21450, saving model to ./best_weights_relu.hdf5
Epoch 2/100

Epoch 00002: val_loss improved from 0.21450 to 0.20743, saving model to ./best_weights_relu.hdf5
Epoch 3/100

Epoch 00003: val_loss improved from 0.20743 to 0.20701, saving model to ./best_weights_relu.hdf5
Epoch 4/100

Epoch 00004: val_loss did not improve from 0.20701
Epoch 5/100

Epoch 00005: val_loss improved from 0.20701 to 0.20688, saving model to ./best_weights_relu.hdf5
Epoch 6/100

Epoch 00006: val_loss did not improve from 0.20688
Epoch 7/100

Epoch 00007: val_loss did not improve from 0.20688
Epoch 00007: early stopping
Train on 150441 samples, validate on 37611 samples
Epoch 1/100

Epoch 00001: val_loss did not improve from 0.20688
Epoch 2/100

Epoch 00002: val_loss did not improve from 0.20688
Epoch 3/100

Epoch 00003: val_loss did not improve from 0.20688
Epoch 4/100

Epoch 00004: val_loss did not improve


Epoch 00003: val_loss did not improve from 0.20688
Epoch 4/100

Epoch 00004: val_loss did not improve from 0.20688
Epoch 5/100

Epoch 00005: val_loss did not improve from 0.20688
Epoch 6/100

Epoch 00006: val_loss did not improve from 0.20688
Epoch 00006: early stopping
Train on 150441 samples, validate on 37611 samples
Epoch 1/100

Epoch 00001: val_loss did not improve from 0.20688
Epoch 2/100

Epoch 00002: val_loss did not improve from 0.20688
Epoch 3/100

Epoch 00003: val_loss did not improve from 0.20688
Epoch 4/100

Epoch 00004: val_loss did not improve from 0.20688
Epoch 5/100

Epoch 00005: val_loss did not improve from 0.20688
Epoch 6/100

Epoch 00006: val_loss did not improve from 0.20688
Epoch 00006: early stopping
Train on 150441 samples, validate on 37611 samples
Epoch 1/100

Epoch 00001: val_loss did not improve from 0.20688
Epoch 2/100

Epoch 00002: val_loss did not improve from 0.20688
Epoch 3/100

Epoch 00003: val_loss did not improve from 0.20688
Epoch 4/100

Epoch 000

In [24]:
pred_reg_relu = model_reg_relu.predict(x_cont_test)
print("Shape: {}".format(pred_reg_relu.shape))
print(pred_reg_relu)

Shape: (37611, 1)
[[3.92646  ]
 [3.3934653]
 [4.483278 ]
 ...
 [3.9256923]
 [4.0380645]
 [4.014315 ]]


In [25]:
for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],y_cont_test[i],pred_reg_relu[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 4.5, predicted Stars: [3.92646]
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 3.5, predicted Stars: [3.3934653]
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 4.5, predicted Stars: [4.483278]
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 4.5, predicted Stars: [4.816037]
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 3.0, predicted Stars: [2.6938899]
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 3.0, predicted Stars: [2.9509478]
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 4.5, predicted Stars: [3.6861687]
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 3.5, predicted Stars: [3.7589426]
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 3.5, predicted Stars: [3.0025063]
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 2.5, predicted Stars: [2.6642344]


In [26]:
# Measure RMSE error.  RMSE is common for regression.
score = np.sqrt(mean_squared_error(y_cont_test,pred_reg_relu))
print("Final score (RMSE): {}".format(score))
print('R2 score: %.2f' % r2_score(y_cont_test, pred_reg_relu))

Final score (RMSE): 0.45483526587486267
R2 score: 0.80


# # Tensorflow model for Regression, Activation -  Sigmoid

In [27]:


model_reg_sig = Sequential()

model_reg_sig.add(Dense(25, input_dim=x_cont_train.shape[1], activation='sigmoid')) # Hidden 1     #  why input_dim=x.shape[1]?  
model_reg_sig.add(Dense(10, activation='sigmoid')) # Hidden 2
model_reg_sig.add(Dense(1)) # Output

model_reg_sig.compile(loss='mean_squared_error', optimizer='adam')

model_reg_sig.fit(x_cont_train,y_cont_train,verbose=2,epochs=100)    # Verbosity mode. 0 = silent, 1 = progress bar, 2 = one line per epoch.

Epoch 1/100
 - 5s - loss: 0.4835
Epoch 2/100
 - 4s - loss: 0.2190
Epoch 3/100
 - 4s - loss: 0.2095
Epoch 4/100
 - 4s - loss: 0.2043
Epoch 5/100
 - 4s - loss: 0.2008
Epoch 6/100
 - 4s - loss: 0.1982
Epoch 7/100
 - 4s - loss: 0.1964
Epoch 8/100
 - 4s - loss: 0.1949
Epoch 9/100
 - 4s - loss: 0.1938
Epoch 10/100
 - 4s - loss: 0.1928
Epoch 11/100
 - 4s - loss: 0.1920
Epoch 12/100
 - 4s - loss: 0.1914
Epoch 13/100
 - 4s - loss: 0.1908
Epoch 14/100
 - 4s - loss: 0.1901
Epoch 15/100
 - 4s - loss: 0.1895
Epoch 16/100
 - 4s - loss: 0.1888
Epoch 17/100
 - 4s - loss: 0.1881
Epoch 18/100
 - 4s - loss: 0.1872
Epoch 19/100
 - 4s - loss: 0.1865
Epoch 20/100
 - 4s - loss: 0.1856
Epoch 21/100
 - 4s - loss: 0.1848
Epoch 22/100
 - 4s - loss: 0.1841
Epoch 23/100
 - 4s - loss: 0.1832
Epoch 24/100
 - 4s - loss: 0.1824
Epoch 25/100
 - 4s - loss: 0.1816
Epoch 26/100
 - 4s - loss: 0.1809
Epoch 27/100
 - 4s - loss: 0.1800
Epoch 28/100
 - 4s - loss: 0.1795
Epoch 29/100
 - 4s - loss: 0.1787
Epoch 30/100
 - 4s - lo

<keras.callbacks.History at 0x1a9b67cc18>

In [28]:
pred_reg_sig = model_reg_sig.predict(x_cont_test)
print("Shape: {}".format(pred_reg_sig.shape))
print(pred_reg_sig)

Shape: (37611, 1)
[[4.0453157]
 [3.4327214]
 [4.647158 ]
 ...
 [3.5709999]
 [3.8430572]
 [3.138085 ]]


In [29]:
for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],y_cont_test[i],pred_reg_sig[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 4.5, predicted Stars: [4.0453157]
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 3.5, predicted Stars: [3.4327214]
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 4.5, predicted Stars: [4.647158]
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 4.5, predicted Stars: [4.883079]
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 3.0, predicted Stars: [2.6510696]
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 3.0, predicted Stars: [2.634763]
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 4.5, predicted Stars: [4.069232]
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 3.5, predicted Stars: [3.528416]
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 3.5, predicted Stars: [2.6707253]
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 2.5, predicted Stars: [3.1336753]


In [30]:
# Measure RMSE error.  RMSE is common for regression.
score = np.sqrt(mean_squared_error(y_cont_test,pred_reg_sig))
print("Final score (RMSE): {}".format(score))
print('R2 score: %.2f' % r2_score(y_cont_test, pred_reg_sig))

Final score (RMSE): 0.4749263823032379
R2 score: 0.78


# #  Tensorflow model for Regression, Activation -  Tanh

In [31]:


model_reg_tanh = Sequential()

model_reg_tanh.add(Dense(25, input_dim=x_cont_train.shape[1], activation='tanh')) # Hidden 1     #  why input_dim=x.shape[1]?  
model_reg_tanh.add(Dense(10, activation='tanh')) # Hidden 2
model_reg_tanh.add(Dense(1)) # Output

model_reg_tanh.compile(loss='mean_squared_error', optimizer='adam')

model_reg_tanh.fit(x_cont_train,y_cont_train,verbose=2,epochs=100)    # Verbosity mode. 0 = silent, 1 = progress bar, 2 = one line per epoch.

Epoch 1/100
 - 5s - loss: 0.3254
Epoch 2/100
 - 4s - loss: 0.2046
Epoch 3/100
 - 4s - loss: 0.1991
Epoch 4/100
 - 4s - loss: 0.1960
Epoch 5/100
 - 4s - loss: 0.1938
Epoch 6/100
 - 4s - loss: 0.1919
Epoch 7/100
 - 4s - loss: 0.1904
Epoch 8/100
 - 4s - loss: 0.1891
Epoch 9/100
 - 4s - loss: 0.1878
Epoch 10/100
 - 4s - loss: 0.1865
Epoch 11/100
 - 4s - loss: 0.1854
Epoch 12/100
 - 4s - loss: 0.1840
Epoch 13/100
 - 4s - loss: 0.1827
Epoch 14/100
 - 4s - loss: 0.1815
Epoch 15/100
 - 4s - loss: 0.1798
Epoch 16/100
 - 4s - loss: 0.1784
Epoch 17/100
 - 4s - loss: 0.1771
Epoch 18/100
 - 4s - loss: 0.1755
Epoch 19/100
 - 4s - loss: 0.1741
Epoch 20/100
 - 4s - loss: 0.1726
Epoch 21/100
 - 4s - loss: 0.1711
Epoch 22/100
 - 4s - loss: 0.1696
Epoch 23/100
 - 4s - loss: 0.1681
Epoch 24/100
 - 4s - loss: 0.1666
Epoch 25/100
 - 4s - loss: 0.1650
Epoch 26/100
 - 4s - loss: 0.1635
Epoch 27/100
 - 4s - loss: 0.1622
Epoch 28/100
 - 4s - loss: 0.1609
Epoch 29/100
 - 4s - loss: 0.1592
Epoch 30/100
 - 4s - lo

<keras.callbacks.History at 0x1a38b2b908>

In [32]:
pred_reg_tanh = model_reg_tanh.predict(x_cont_test)
print("Shape: {}".format(pred_reg_tanh.shape))
print(pred_reg_tanh)

Shape: (37611, 1)
[[4.4487305]
 [3.3607907]
 [4.5103235]
 ...
 [3.6535573]
 [3.7582762]
 [3.7692714]]


In [33]:
for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],y_cont_test[i],pred_reg_tanh[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 4.5, predicted Stars: [4.4487305]
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 3.5, predicted Stars: [3.3607907]
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 4.5, predicted Stars: [4.5103235]
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 4.5, predicted Stars: [4.8939233]
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 3.0, predicted Stars: [2.620066]
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 3.0, predicted Stars: [2.890732]
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 4.5, predicted Stars: [3.1466634]
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 3.5, predicted Stars: [3.691975]
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 3.5, predicted Stars: [2.8360512]
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 2.5, predicted Stars: [2.6974099]


In [34]:
# Measure RMSE error.  RMSE is common for regression.
score = np.sqrt(mean_squared_error(y_cont_test,pred_reg_tanh))
print("Final score (RMSE): {}".format(score))
print('R2 score: %.2f' % r2_score(y_cont_test, pred_reg_tanh))

Final score (RMSE): 0.5123431086540222
R2 score: 0.74


# tensorflow model for classfication 

In [35]:


model_class = Sequential()
model_class.add(Dense(50, input_dim=x_enc_train.shape[1], activation='relu')) # Hidden 1
model_class.add(Dense(25, activation='relu')) # Hidden 2
model_class.add(Dense(y_enc_train.shape[1], activation='softmax')) # Output

model_class.compile(loss='categorical_crossentropy', optimizer='adam')

model_class.fit(x_enc_train,y_enc_train,verbose=2,epochs=50)

Epoch 1/50
 - 8s - loss: 1.2157
Epoch 2/50
 - 6s - loss: 1.1173
Epoch 3/50
 - 6s - loss: 1.0888
Epoch 4/50
 - 6s - loss: 1.0660
Epoch 5/50
 - 6s - loss: 1.0463
Epoch 6/50
 - 6s - loss: 1.0275
Epoch 7/50
 - 6s - loss: 1.0109
Epoch 8/50
 - 6s - loss: 0.9951
Epoch 9/50
 - 6s - loss: 0.9806
Epoch 10/50
 - 6s - loss: 0.9670
Epoch 11/50
 - 6s - loss: 0.9553
Epoch 12/50
 - 6s - loss: 0.9435
Epoch 13/50
 - 6s - loss: 0.9333
Epoch 14/50
 - 6s - loss: 0.9226
Epoch 15/50
 - 6s - loss: 0.9130
Epoch 16/50
 - 6s - loss: 0.9042
Epoch 17/50
 - 6s - loss: 0.8964
Epoch 18/50
 - 6s - loss: 0.8879
Epoch 19/50
 - 6s - loss: 0.8810
Epoch 20/50
 - 6s - loss: 0.8739
Epoch 21/50
 - 6s - loss: 0.8674
Epoch 22/50
 - 6s - loss: 0.8611
Epoch 23/50
 - 6s - loss: 0.8551
Epoch 24/50
 - 6s - loss: 0.8495
Epoch 25/50
 - 6s - loss: 0.8441
Epoch 26/50
 - 6s - loss: 0.8387
Epoch 27/50
 - 6s - loss: 0.8345
Epoch 28/50
 - 6s - loss: 0.8293
Epoch 29/50
 - 6s - loss: 0.8255
Epoch 30/50
 - 6s - loss: 0.8210
Epoch 31/50
 - 6s -

<keras.callbacks.History at 0x1a9637f518>

In [36]:
pred_class = model_class.predict(x_enc_test)
print("Shape: {}".format(pred_class.shape))
print(pred_class)

Shape: (37611, 9)
[[2.3819180e-02 7.2993851e-01 2.1101871e-01 ... 4.1254225e-07
  6.7608347e-10 2.5387077e-08]
 [3.3448377e-21 9.2503116e-09 1.9095463e-04 ... 4.1523003e-03
  2.2881670e-02 9.6084440e-01]
 [2.1823317e-25 4.6439821e-14 1.7447835e-08 ... 1.0430585e-01
  8.8899524e-04 4.8610818e-06]
 ...
 [1.2070042e-06 2.0124778e-04 7.2015047e-02 ... 7.3818028e-02
  8.9805136e-03 2.1195053e-03]
 [9.8673350e-01 1.1884475e-02 1.3730186e-03 ... 1.5880167e-12
  1.4802675e-16 2.3539783e-13]
 [0.0000000e+00 1.4064186e-34 2.8391910e-19 ... 3.4787111e-02
  8.4511524e-01 1.2008743e-01]]


In [37]:
predict_stars = np.argmax(pred_class,axis=1)

true_stars = np.argmax(y_enc_test,axis=1)

In [38]:
for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],true_stars[i],predict_stars[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 3, predicted Stars: 1
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 7, predicted Stars: 8
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 4, predicted Stars: 5
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 6, predicted Stars: 6
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 3, predicted Stars: 4
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 8, predicted Stars: 7
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 8, predicted Stars: 8
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 6, predicted Stars: 6
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 6, predicted Stars: 7
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 6, predicted Stars: 7


In [39]:
#accuracy  

correct = accuracy_score(true_stars, predict_stars)
print("Accuracy: {}".format(correct))

Accuracy: 0.49466911275956504


# Additional Features

** Categories **

In [40]:
x_matrix.shape

(188052, 1001)

In [41]:
# Extracting categories

from sklearn.preprocessing import MultiLabelBinarizer
encoded_categories = MultiLabelBinarizer()
category_matrix = encoded_categories.fit_transform(merge_df['categories'].str.split(','))

In [None]:
category_matrix.head()

In [43]:
x_matrix_cat=np.column_stack((x_matrix, category_matrix))
print(x_matrix_cat.shape)

(188052, 3465)


In [44]:
x_cont_train1, x_cont_test1, y_cont_train1, y_cont_test1 = train_test_split(x_matrix_cat, y_stars , test_size=0.2)

In [45]:
# Tensorflow model for Regression

model_reg_relu1 = Sequential()

model_reg_relu1.add(Dense(25, input_dim=x_cont_train1.shape[1], activation='relu')) # Hidden 1     #  why input_dim=x.shape[1]?  
model_reg_relu1.add(Dense(10, activation='relu')) # Hidden 2
model_reg_relu1.add(Dense(1)) # Output

# set up checkpointer
checkpointer1 = ModelCheckpoint(filepath="./best_weights_relu1.hdf5", verbose=0, save_best_only=True)

for i in range(10):
    model_reg_relu1.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
    model_reg_relu1.fit(x_cont_train1,y_cont_train1,validation_data=(x_cont_test1,y_cont_test1),callbacks=[monitor,checkpointer1],verbose=0,epochs=100) 

model_reg_relu1.load_weights('./best_weights_relu1.hdf5')

Epoch 00012: early stopping
Epoch 00006: early stopping
Epoch 00006: early stopping
Epoch 00006: early stopping
Epoch 00006: early stopping
Epoch 00007: early stopping
Epoch 00006: early stopping
Epoch 00006: early stopping
Epoch 00009: early stopping
Epoch 00007: early stopping


In [46]:
pred_reg_relu1 = model_reg_relu1.predict(x_cont_test1)
print("Shape: {}".format(pred_reg_relu1.shape))
print(pred_reg_relu1)

Shape: (37611, 1)
[[3.810142 ]
 [4.3475647]
 [1.2961597]
 ...
 [3.3238962]
 [3.4066894]
 [4.4033914]]


In [47]:
for i in range(10):
    print("{}. Business ID: {}, Actual Stars: {}, predicted Stars: {}".format(i+1,merge_df['business_id'][2000+i],y_cont_test1[i],pred_reg_relu1[i]))

1. Business ID: diaiQrxYFU1V5qxrFnW9fg, Actual Stars: 4.0, predicted Stars: [3.810142]
2. Business ID: TDTASGFy_aGp6vy0i23mDA, Actual Stars: 3.5, predicted Stars: [4.3475647]
3. Business ID: VuKJ2s_JP8weQ54NfsXJXQ, Actual Stars: 1.0, predicted Stars: [1.2961597]
4. Business ID: aGiBg2WKOpXS5-1DRnBiAQ, Actual Stars: 4.5, predicted Stars: [4.4692106]
5. Business ID: ZMmgFw2P4LWsFXNn1ZGc1g, Actual Stars: 5.0, predicted Stars: [4.91114]
6. Business ID: sEKFq5u8P_s0-2mAZnx0JQ, Actual Stars: 4.5, predicted Stars: [4.169408]
7. Business ID: rYziPPEILDXJ_F5uKR--YQ, Actual Stars: 4.5, predicted Stars: [4.3694396]
8. Business ID: Swm_uMOWNcJDZz5lXWyzKA, Actual Stars: 3.5, predicted Stars: [3.837064]
9. Business ID: 6nGnVP7M4qQRiclXxeqXSQ, Actual Stars: 3.5, predicted Stars: [3.799996]
10. Business ID: Tc24GX9-ZPr4_SHU0nJZZA, Actual Stars: 3.5, predicted Stars: [3.6079605]


In [48]:
# Measure RMSE error.  RMSE is common for regression.
score = np.sqrt(mean_squared_error(y_cont_test1,pred_reg_relu1))
print("Final score (RMSE): {}".format(score))
print('R2 score: %.2f' % r2_score(y_cont_test1, pred_reg_relu1))

Final score (RMSE): 0.4577174484729767
R2 score: 0.80
