In [374]:
from collections.abc import Sequence
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low


# Import Data

In [375]:
import pandas as pd
import numpy as np

# Convert Data from JSON to Pandas
reviews = pd.read_json('yelp_academic_dataset_review.json', lines=True, nrows=10000)
reviews = reviews.rename(columns={'stars': 'review_rating'})


businesses = pd.read_json('yelp_academic_dataset_business.json', lines=True, nrows=10000)
businesses = businesses.rename(columns={'stars': 'business_rating'})
businesses['location'] = businesses['city'] + ", " + businesses['state']



# Keep all business with more than 20 lines & merge the reviews

In [376]:

# Remove all business with reivew count < 20
businesses = businesses.dropna(subset=['business_id', 'review_count'])
businesses =  businesses[businesses.review_count > 20]

all_reviews = pd.merge(businesses,reviews, on='business_id') 

all_reviews.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,business_rating,review_count,...,hours,location,review_id,user_id,review_rating,useful,funny,cool,text,date
0,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,...,"{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ...","Tucson, AZ",IOmiYoBPtQsY_fh5uA4mXg,P-NTOAMFVSDFGkhcj4GaIQ,4,1,0,0,We are fans of Target. They seem to have a li...,2017-02-19 15:11:22
1,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,"{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...","Philadelphia, PA",BXQcBN0iAi1lAUxibGLFzA,6_SpY41LIHZuIaiDs5FMKA,4,0,0,1,This is nice little Chinese bakery in the hear...,2014-05-26 01:09:53
2,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,...,"{'Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...","Largo, FL",oTTuahWNWzX_018P6O6_2g,z1Dfj8kz3KCArkXaIyaBIA,1,6,1,0,The worst Chicken Parm. Sandwich I've ever eat...,2014-05-25 21:52:30
3,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,...,"{'Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'...","Largo, FL",R7DC4sHDcklrk1s1K93FDA,HvgKiuV36e9SzNqeA5zOfA,4,0,0,0,"Zio's, previously known as Cesarina's is a lar...",2018-07-26 16:25:04
4,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,PA,19106,39.953949,-75.143226,4.0,245,...,"{'Tuesday': '13:30-22:0', 'Wednesday': '13:30-...","Philadelphia, PA",XYaDbPKyJAu4k2aUOIth5g,Qsk0aTclam9W_DIK6bx42A,5,0,0,0,Stopped in to check out this new spot around t...,2017-12-16 00:13:06


# Get all reviews for one business

In [377]:
# print("Individual Reviews (after join): ", all_reviews.shape)

all_reviews = all_reviews[['business_id', 'text', 'business_rating', 'location', 'review_count']]
all_reviews = all_reviews.drop_duplicates(subset=['business_id', 'business_rating', 'location', 'review_count'])


reviews_agg = all_reviews.groupby('business_id')['text'].sum()
rating_avg = all_reviews.groupby('business_id')['business_rating'].mean()

print(all_reviews.shape, reviews_agg.shape, rating_avg.shape)

df_business_reviews = pd.DataFrame({
  'business_id': reviews_agg.index, 
  'business_rating' : rating_avg.values,
  'location' : all_reviews['location'],
  'review_count' : all_reviews['review_count'],
  'text': reviews_agg.values,
})

# print(df_agg['business_rating'].describe())
# print(df_reviews_agg)


(2077, 5) (2077,) (2077,)


In [378]:
# df_reviews_agg[df_reviews_agg.isnull().any(axis=1)].shape
df_business_reviews.head()


Unnamed: 0,business_id,business_rating,location,review_count,text
0,--ZVrH2X2QXBFdCilbirsw,4.5,"Tucson, AZ",22,This place is sadly perm closed. I was hoping ...
1,-02xFuruu85XmDn2xiynJw,4.5,"Philadelphia, PA",80,Dr. Curtis Dechant has an excellent chair-side...
2,-1ueCbvIpUPi8KT95ETTKw,4.0,"Largo, FL",100,I was very happy here.there is a shower with a...
4,-2Axhv9AZ_n7qjQefECpVw,3.5,"Philadelphia, PA",245,"The service was excellent, extremely friendly...."
8,-3AooxIkg38UyUdlz5oXdw,3.0,"Philadelphia, PA",205,Wow. Wow. We've been trying to go here for a w...


# TF_IDF Vectorizer

In [379]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english',max_features=300,min_df=1)
# Fit and transform the aggregated reviews into a TF-IDF matrix
tfidf_wm = vectorizer.fit_transform(df_business_reviews['text'])

# Retrieve the feature names (i.e., the vocabulary)
tfidf_tokens = vectorizer.get_feature_names_out()

# Create a DataFrame from the TF-IDF matrix for easier viewing
df_tfidfvect = pd.DataFrame(data=tfidf_wm.toarray(), columns=tfidf_tokens)

df_tfidfvect


Unnamed: 0,10,15,20,30,able,absolutely,actually,ago,amazing,area,...,wonderful,work,worst,worth,wouldn,wrong,year,years,yelp,yes
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.275982,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.195028,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2072,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2073,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.143289,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2074,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2075,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Create one table with all information

In [380]:
# business_id, rating concatonated onto the df_idfvect
print(df_business_reviews.shape, df_tfidfvect.shape)
df_business = pd.concat([df_business_reviews[['business_id', 'business_rating', 'location', 'review_count']], df_tfidfvect], axis=1)


(2077, 5) (2077, 300)


# Test/Train Split

In [381]:
encode_text_dummy(df_business, 'business_id')
encode_text_dummy(df_business, 'location')
missing_median(df_business, 'business_rating')
encode_numeric_zscore(df_business, 'review_count')

x, y = to_xy(df_business, 'business_rating')

print(df_business.shape, x.shape, y.shape)

  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy_name] = dummies[x]
  df[dummy

(3415, 2659) (3415, 2658) (3415,)


In [383]:
# Model #1 : (decide # of neurons in hiddens layers, relu, adam)
%matplotlib inline
from matplotlib.pyplot import figure, show
from sklearn.model_selection import train_test_split
from sklearn import metrics

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import optimizers
from tensorflow.keras.optimizers import Adam, SGD

parameters = {
  'test_size' : 0.20, 
  'learning_rate' : 0.005, 
  'batch_size' : 32, 
  'epochs' : 100
}


# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=parameters['test_size'], random_state=42)

print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

# Create the model
model = Sequential()
model.add(Dense(100, input_dim=x.shape[1], activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(25, activation='relu'))
model.add(Dropout(0.05))  # Add dropout here
model.add(Dense(1))
adam = optimizers.Adam(learning_rate=parameters['learning_rate'], beta_1=0.9, beta_2=0.999)
sgd = optimizers.SGD(learning_rate=parameters['learning_rate'], momentum=0.9, nesterov=True)   
# model.compile(loss='mean_squared_error', optimizer=sgd)
model.compile(loss='mean_squared_error', optimizer=adam)



monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
checkpointer = ModelCheckpoint(filepath="dnn/best_weights.hdf5", verbose=0, save_best_only=True) # save best model

# batch_size: Integer or None. Number of samples per gradient update. If unspecified, batch_size will default to 32.
model.fit(x_train,y_train,validation_data=(x_test,y_test), batch_size= 16, callbacks=[monitor, checkpointer],verbose=2,epochs=parameters['epochs'])
# model.load_weights('dnn/best_weights.hdf5') # load weights from best model

# Predict and measure RMSE
pred = model.predict(x_test)
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print("Score (RMSE): {}".format(score))

# Plot the chart
chart_regression(pred.flatten(),y_test)


(2732, 2658) (2732,) (683, 2658) (683,)
Epoch 1/100
171/171 - 2s - loss: nan - val_loss: nan - 2s/epoch - 14ms/step
Epoch 2/100
171/171 - 1s - loss: nan - val_loss: nan - 636ms/epoch - 4ms/step
Epoch 3/100
171/171 - 1s - loss: nan - val_loss: nan - 624ms/epoch - 4ms/step
Epoch 4/100
171/171 - 1s - loss: nan - val_loss: nan - 663ms/epoch - 4ms/step
Epoch 5/100
171/171 - 1s - loss: nan - val_loss: nan - 643ms/epoch - 4ms/step
Epoch 5: early stopping


ValueError: Input contains NaN.