## Setup

In [22]:
import os, sys, re, json, time, unittest, datetime, shutil
import itertools, collections
from importlib import reload
from IPython.display import display, HTML

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

**Note:** The data below is only on the 10k datasets for now. This will be updated to leverage the full datasets. 

# Loading Data

In [2]:
users_df = pd.read_json("../dataset/user_10k.json", lines=True)


In [3]:
users_df.head()

Unnamed: 0,average_stars,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,compliment_plain,...,cool,elite,fans,friends,funny,name,review_count,useful,user_id,yelping_since
0,4.67,0,0,0,0,0,0,0,0,1,...,0,[],0,"[cvVMmlU1ouS3I5fhutaryQ, nj6UZ8tdGo8YJ9lUMTVWN...",0,Johnny,8,0,oMy_rEb0UBEmMlu-zcxnoQ,2014-11-03
1,3.7,0,0,0,0,0,0,0,0,0,...,0,[],0,"[0njfJmB-7n84DlIgUByCNw, rFn3Xe3RqHxRSxWOU19Gp...",0,Chris,10,0,JJ-aSuM4pCFPdkfoZ34q0Q,2013-09-24
2,2.0,0,0,0,0,0,0,0,0,0,...,0,[],0,[],0,Tiffy,1,0,uUzsFQn_6cXDh6rPNGbIFA,2017-03-02
3,4.67,0,0,0,0,0,0,0,0,0,...,0,[],0,[],0,Mark,6,0,mBneaEEH5EMyxaVyqS-72A,2015-03-13
4,4.67,0,0,0,0,0,0,0,0,0,...,0,[],0,[],0,Evelyn,3,0,W5mJGs-dcDWRGEhAzUYtoA,2016-09-08


In [4]:
# All types of reviews - 10K dataset
# reviews_df = pd.read_json("../dataset/review_10k.json", lines=True)

# Just restaurant reviews - 10K dataset
reviews_df = pd.read_json("../dataset/restaurant_reviews_10k.json", lines=True)

# All types of reviews
# reviews_df = pd.read_json("../../../final_project/full_dataset/review.json", lines=True)

In [5]:
reviews_df.head()

Unnamed: 0,business_id,cool,date,funny,stars,text,useful,user_id
0,--6MefnULPED_I942VcFNA,0,2017-08-17,0,4,This is one of my top 3 places to get BBQ pork...,2,FEg8v92qx3kK4Hu4TF28Fg
1,--6MefnULPED_I942VcFNA,0,2017-05-31,0,3,This restaurant is famous for their BBQ dishes...,0,HPtjvIrhzAUkKsiVkeT4MA
2,--6MefnULPED_I942VcFNA,0,2016-10-23,0,2,Roasted pork is one of my favorite things... A...,1,MpvqV7lQcl15rflTBEUhXA
3,--6MefnULPED_I942VcFNA,0,2017-07-30,0,2,I walked by the restaurant more than 5 years a...,1,x-Gbs8sVid3yhJIoHD6Gfw
4,--6MefnULPED_I942VcFNA,0,2017-02-07,1,2,I came here to order a roast duck over rice to...,0,7Dykd1HolQx8mKPYhYDYSg


In [6]:
reviews_df.shape

(10000, 8)

In [7]:
business_df = pd.read_json("../dataset/business_10k.json", lines=True)

In [8]:
business_df.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
0,"4855 E Warner Rd, Ste B9","{'AcceptsInsurance': True, 'ByAppointmentOnly'...",FYWN1wneV18bWNgQjJ2GNg,"[Dentists, General Dentistry, Health & Medical...",Ahwatukee,"{'Friday': '7:30-17:00', 'Tuesday': '7:30-17:0...",1,33.33069,-111.978599,Dental by Design,,85044,22,4.0,AZ
1,3101 Washington Rd,"{'BusinessParking': {'garage': False, 'street'...",He-G7vWjzVUysIKrfNbPUQ,"[Hair Stylists, Hair Salons, Men's Hair Salons...",McMurray,"{'Monday': '9:00-20:00', 'Tuesday': '9:00-20:0...",1,40.291685,-80.1049,Stephen Szabo Salon,,15317,11,3.0,PA
2,"6025 N 27th Ave, Ste 1",{},KQPW8lFf1y5BT2MxiSZ3QA,"[Departments of Motor Vehicles, Public Service...",Phoenix,{},1,33.524903,-112.11531,Western Motor Vehicle,,85017,18,1.5,AZ
3,"5000 Arizona Mills Cr, Ste 435","{'BusinessAcceptsCreditCards': True, 'Restaura...",8DShNS-LuFqpEWIp0HxijA,"[Sporting Goods, Shopping]",Tempe,"{'Monday': '10:00-21:00', 'Tuesday': '10:00-21...",0,33.383147,-111.964725,Sports Authority,,85282,9,3.0,AZ
4,581 Howe Ave,"{'Alcohol': 'full_bar', 'HasTV': True, 'NoiseL...",PfOCPjBrlQAnz__NXj9h_w,"[American (New), Nightlife, Bars, Sandwiches, ...",Cuyahoga Falls,"{'Monday': '11:00-1:00', 'Tuesday': '11:00-1:0...",1,41.119535,-81.47569,Brick House Tavern + Tap,,44221,116,3.5,OH


In [9]:
checkin_df = pd.read_json("../dataset/checkin_10k.json", lines=True)

In [10]:
checkin_df.head()

Unnamed: 0,business_id,time
0,7KPBkxAOEtb3QeIL9PEErg,"{'Thursday': {'21:00': 4, '1:00': 1, '4:00': 1..."
1,kREVIrSBbtqBhIYkTccQUg,"{'Monday': {'13:00': 1}, 'Thursday': {'20:00':..."
2,tJRDll5yqpZwehenzE2cSg,"{'Monday': {'12:00': 1, '1:00': 1}, 'Friday': ..."
3,r1p7RAMzCV_6NPF0dNoR3g,"{'Thursday': {'23:00': 1}, 'Saturday': {'21:00..."
4,mDdqgfrvROGAumcQdZ3HIg,"{'Monday': {'12:00': 1, '21:00': 1}, 'Wednesda..."


In [11]:
photos_df = pd.read_json("../dataset/photos_10k.json", lines=True)

In [12]:
photos_df.head()

Unnamed: 0,business_id,caption,label,photo_id
0,OnAzbTDn79W6CFZIriqLrA,,inside,soK1szeyan202jnsGhUDmA
1,OnAzbTDn79W6CFZIriqLrA,,inside,dU7AyRB_fHOZkflodEyN5A
2,OnAzbTDn79W6CFZIriqLrA,,outside,6T1qlbBdKkXA1cDNqMjg2g
3,OnAzbTDn79W6CFZIriqLrA,Bakery area,inside,lHhMNhCA7rAZmi-MMfF3ZA
4,XaeCGHZzsMwvFcHYq3q9sA,,food,oHSCeyoK9oLIGaCZq-wRJw


In [13]:
tip_df = pd.read_json("../dataset/tip_10k.json", lines=True)

In [14]:
tip_df.head()

Unnamed: 0,business_id,date,likes,text,user_id
0,tJRDll5yqpZwehenzE2cSg,2012-07-15,0,Get here early enough to have dinner.,zcTZk7OG8ovAmh_fenH21g
1,jH19V2I9fIslnNhDzPmdkA,2015-08-12,0,Great breakfast large portions and friendly wa...,ZcLKXikTHYOnYt5VYRO5sg
2,dAa0hB2yrnHzVmsCkN4YvQ,2014-06-20,0,Nice place. Great staff. A fixture in the tow...,oaYhjqBbh18ZhU0bpyzSuw
3,dAa0hB2yrnHzVmsCkN4YvQ,2016-10-12,0,Happy hour 5-7 Monday - Friday,ulQ8Nyj7jCUR8M83SUMoRQ
4,ESzO3Av0b1_TzKOiqzbQYQ,2017-01-28,0,"Parking is a premium, keep circling, you will ...",ulQ8Nyj7jCUR8M83SUMoRQ


# Baseline Model

In [17]:
n_features = 100000

text = reviews_df["text"]

print("Fitting Count Vectorizer")
# vectorizer = CountVectorizer(max_df=0.95, min_df=2,
#                                 max_features=n_features,
#                                 stop_words='english')
# word_vector = vectorizer.fit_transform(text)

# No setting of hyper-parameters
vectorizer = CountVectorizer()
word_vector = vectorizer.fit_transform(text)

print(np.shape(word_vector))

Fitting Count Vectorizer
(10000, 24872)


In [18]:
#Print example text, stars, and embeddings

print(reviews_df["text"][102])
print(reviews_df["stars"][102])
print(word_vector[102])

At ces trade show and looking for lunch. I show up at 2:03 and the host jokingly says we are closed. We laughed. But he meant it. Last year my burger ordered medium came out almost raw. I am never going back
1
  (0, 17650)	1
  (0, 3376)	1
  (0, 13684)	1
  (0, 12582)	1
  (0, 4549)	1
  (0, 19037)	1
  (0, 11962)	1
  (0, 22483)	1
  (0, 3953)	1
  (0, 10897)	1
  (0, 13729)	1
  (0, 24528)	1
  (0, 12)	1
  (0, 12556)	1
  (0, 13164)	1
  (0, 15363)	1
  (0, 13056)	1
  (0, 19747)	2
  (0, 1101)	1
  (0, 10472)	1
  (0, 1133)	1
  (0, 3582)	1
  (0, 15453)	1
  (0, 14751)	1
  (0, 2016)	1
  (0, 1762)	2
  (0, 9793)	1
  (0, 23190)	1
  (0, 23929)	2
  (0, 1555)	1
  (0, 1239)	2
  (0, 8885)	1
  (0, 3440)	1
  (0, 22022)	1
  (0, 11748)	1
  (0, 14510)	1


## NB Training and Scoring

In [19]:
# x_train_user_reviews = reviews_df["text"][0:6000]
# x_dev_user_reviews = reviews_df["text"][6001:8000]
# x_test_user_reviews = reviews_df["text"][8001:10000]


# x_train_user_reviews = word_vector[0:6000]
# x_dev_user_reviews = word_vector[6001:8000]
x_train_user_reviews = word_vector[0:8000]
x_test_user_reviews = word_vector[8001:10000]

# print("x_train_user_reviews", x_train_user_reviews)
# print("shape x_train_user_reviews", np.shape(x_train_user_reviews))



# y_train_user_stars = reviews_df["stars"][0:6000]
# y_dev_user_stars = reviews_df["stars"][6001:8000]
y_train_user_stars = reviews_df["stars"][0:8000]
y_test_user_stars = reviews_df["stars"][8001:10000]

In [26]:
nb = MultinomialNB()

nb.fit(x_train_user_reviews, y_train_user_stars)

y_pred = nb.predict(x_test_user_reviews)

acc = accuracy_score(y_pred, y_test_user_stars)
print("Accuracy on test set: {:.02%}".format(acc))
pred_proba = nb.predict_proba(y_pred)
log_loss_metric = log_loss(y_test_user_stars, pred_proba)
print("Log-loss on test set: {:.02%}".format(log_loss_metric))

Accuracy on test set: 55.93%


ValueError: Expected 2D array, got 1D array instead:
array=[5 5 5 ..., 4 4 5].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [27]:
# Print example prediction

print(y_pred[400])

4
