# Importing Libraries

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [1]:
import requests
from bs4 import BeautifulSoup as bs

In [2]:
from pprint import pprint
import random
import json

In [6]:
from nltk.sentiment import SentimentAnalyzer
import nltk.sentiment.util
from nltk.classify import NaiveBayesClassifier

# Fetching Reviews through Yelp's API

In [7]:
accessToken = '---'

In [8]:
r = requests.get('https://api.yelp.com/v3/businesses/search?location=Toronto&limit=50', headers={'Authorization': 'Bearer {}'.format(accessToken)})

In [9]:
print(r.status_code) # status code '200' implies the request was successful

200


In [36]:
pprint(r.json()['businesses'][:5])

[{'alias': 'pai-northern-thai-kitchen-toronto-5',
  'categories': [{'alias': 'thai', 'title': 'Thai'}],
  'coordinates': {'latitude': 43.64784, 'longitude': -79.38872},
  'display_phone': '+1 416-901-4724',
  'distance': 3009.3347434869856,
  'id': 'r_BrIgzYcwo1NAuG9dLbpg',
  'image_url': 'https://s3-media1.fl.yelpcdn.com/bphoto/qEbOKh9rDmTrmHdaUDDnNg/o.jpg',
  'is_closed': False,
  'location': {'address1': '18 Duncan Street',
               'address2': '',
               'address3': '',
               'city': 'Toronto',
               'country': 'CA',
               'display_address': ['18 Duncan Street',
                                   'Toronto, ON M5H 3G8',
                                   'Canada'],
               'state': 'ON',
               'zip_code': 'M5H 3G8'},
  'name': 'Pai Northern Thai Kitchen',
  'phone': '+14169014724',
  'price': '$$',
  'rating': 4.5,
  'review_count': 3010,
  'transactions': [],
  'url': 'https://www.yelp.com/biz/pai-northern-thai-kitchen-toront

# Scraping Reviews using BeautifulSoup

In [12]:
reviews = []
for business in r.json()['businesses']:
  reviews_requests = requests.get('https://api.yelp.com/v3/businesses/{}/reviews'.format(business['id']), headers={'Authorization': 'Bearer {}'.format(accessToken)}).json()
  review_url = reviews_requests['reviews'][0]['url'] # extracting URL for a specific business
  page = requests.get(review_url)
  soup = bs(page.content, 'html.parser') # parsing page contents using HTML parser
  reviews_list = soup.find_all('div', class_ = lambda x: x and x.startswith('review__')) # list of all reviews
  for review in reviews_list:
    reviews.append((review.find('span', class_ = lambda x: x and x.startswith('raw__')).text.rstrip('.'), reviews_requests['reviews'][0]['rating'])) # creating tuple of review and its rating

In [13]:
reviews[:5] # checking the first 5 reviews extracted

[("A friend recommended this restaurant and I would have waked right by otherwise. Down a flight of stairs, once you walk in, it's a cozy hub of bustling activity, feeling intimate, warm, and hip all at the same time. We had the Thai chicken wings, gai satay, moo ping, and spring rolls to start. The chicken wings were crispy with just enough sauce and the tamarind came through well - we ordered a second serving immediately after getting the first. We opted to dine family style, sharing the Khao Soi, Massaman curry, and the Chef Nuit Pad Thai. These dishes were all well-executed, with the typical flavours of lime, coconut milk, pepper, peanut, and tamarind balanced appropriately. Serving sizes were also great for a group of four to share and get a taste of everything. Service was quick, considering how incredibly busy it was for a Saturday night. Would certainly recommend making reservations in advance, as this place gets PACKED around dinnertime. Prices are only slightly higher than ot

# Storing Results in JSON

In [14]:
with open('data.json', 'w') as file:
  json.dump(list(map(list, {x: 'Positive' if r > 3 else 'Negative' for (x, r) in reviews}.items())), file) # reviews with rating > 3 are defined to be positive, else negative

# Extracting Features

In [15]:
review_features = [(x.split(' '), 'Positive' if y > 3 else 'Negative') for (x, y) in reviews]

# Modeling

In [25]:
random.shuffle(review_features)
training_docs = review_features[:int(len(review_features)*0.25)]
test_docs = review_features[int(len(review_features)*0.25):]

print("Training: %d, Testing: %d" % (len(training_docs), len(test_docs)))

sentim_analyzer = SentimentAnalyzer()

Training: 122, Testing: 368


In [26]:
all_words_neg = sentim_analyzer.all_words([nltk.sentiment.util.mark_negation(doc) for doc in training_docs])

In [27]:
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
sentim_analyzer.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigram_feats)

In [28]:
training_set = sentim_analyzer.apply_features(training_docs)
test_set = sentim_analyzer.apply_features(test_docs)

In [29]:
trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)
for key,value in sorted(sentim_analyzer.evaluate(test_set).items()):
  print('{0}: {1}'.format(key, value))

Training classifier
Evaluating NaiveBayesClassifier results...
Accuracy: 0.9728260869565217
F-measure [Negative]: None
F-measure [Positive]: 0.9862258953168043
Precision [Negative]: None
Precision [Positive]: 0.9728260869565217
Recall [Negative]: 0.0
Recall [Positive]: 1.0


The accuracy obtained through a Naive Bayes Classifier is 97.28%