# Part I

## Import Libraries

In [24]:
# importing all libraries I need
import requests
import nltk
# used to traverse and then extract the data
from bs4 import BeautifulSoup
#
import re
import pandas as pd
import numpy as np
from textblob import Word
from textblob import TextBlob

## Get Request

In [7]:
# Fetch the first page of reviews. Without a timeout, requests waits forever on
# a stalled connection and the cell would hang; 30 seconds is a generous bound.
r = requests.get("https://www.yelp.com/biz/nobi-asian-grill-friendswood", timeout=30)


## Getting the reviews

In [74]:
# Parse the first results page (the response currently held in `r`).
soup = BeautifulSoup(r.text, 'html.parser')
# Each review paragraph has an HTML class that contains "comment".
regex = re.compile(".*comment.*")
# Keep one entry per results page, each entry being the list of matching <p>
# tags found on that page. Wrapping the first page in a list gives every page
# the same nesting level, so consumers can iterate pages -> paragraphs uniformly
# instead of iterating the children of the first page's tags.
review_info = [soup.find_all('p', {'class': regex})]

# Loop through the remaining pages (offsets 10, 20, ..., 80) and gather their reviews.
for i in range(10, 90, 10):
    # The timeout stops a stalled connection from hanging the whole cell.
    r = requests.get("https://www.yelp.com/biz/nobi-asian-grill-friendswood?start=" + str(i), timeout=30)
    soup = BeautifulSoup(r.text, 'html.parser')
    review_info.append(soup.find_all('p', {'class': regex}))


In [53]:
# Collect the text of every scraped review paragraph.
# `review_info` may hold individual paragraph tags, per-page lists of tags, or a
# mix of both, so each entry is normalised to a list of tags first. Taking the
# text of the whole paragraph (rather than iterating over a tag's children)
# guarantees exactly one entry per review, and the loop variables no longer
# reuse the name `r`, which holds the HTTP response.
reviews = []
for entry in review_info:
    paragraphs = entry if isinstance(entry, list) else [entry]
    for paragraph in paragraphs:
        reviews.append(paragraph.get_text())

## Exploring Data 

In [54]:
# One-column frame holding the raw text of each scraped review.
df = pd.Series(np.array(reviews), name='review').to_frame()

def avg_word(review):
    """Return the mean length, in characters, of the words in ``review``.

    Words are the whitespace-separated tokens produced by ``str.split()``.

    Args:
        review: Review text as a string.

    Returns:
        The average word length as a float, or 0.0 when the review contains
        no words (empty or whitespace-only text), which would otherwise
        cause a division by zero.
    """
    words = review.split()
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

# Derive simple size metrics from each review and store them as new columns.
df['word_count'] = df['review'].map(lambda text: len(text.split()))  # whitespace-separated tokens
df['char_count'] = df['review'].map(len)  # characters, including spaces
df['avg_word'] = df['review'].map(avg_word)  # mean token length


In [55]:
from nltk.corpus import stopwords

# A list of common English words that add no value to a review.
# On a fresh environment the stopword corpus may not be installed yet, in which
# case NLTK raises LookupError; download it once and retry so the cell also
# works after a kernel restart on a new machine.
try:
    stop_words = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stop_words = stopwords.words('english')

# Split each review on whitespace and count the tokens found in the stopword list.
# NOTE(review): the match is case-sensitive and runs on the raw text, so a token
# with attached punctuation (e.g. "is.") is not counted, and — assuming the NLTK
# list is all lowercase — neither is a capitalised one such as "The". Confirm
# whether that is intended.
df['stopword_count'] = df['review'].apply(
    lambda text: sum(1 for token in text.split() if token in stop_words)
)
# Share of each review's words that are stopwords.
df['stopword_rate'] = df['stopword_count'] / df['word_count']

## Cleaning Data

In [56]:
# Clean the review text step by step, keeping each intermediate stage as its own column.
# 1) Lowercase every token and collapse runs of whitespace into single spaces.
df['review_lower'] = df['review'].apply(lambda text: " ".join(token.lower() for token in text.split()))
# 2) Strip punctuation (anything that is neither a word character nor whitespace).
#    regex=True is passed explicitly because the pattern is a regular expression:
#    recent pandas versions treat the pattern as a literal string by default,
#    which would silently leave all punctuation in place. The raw string avoids
#    invalid-escape warnings for \w and \s.
df['review_nopunc'] = df['review_lower'].str.replace(r'[^\w\s]', '', regex=True)
# 3) Drop the stopwords.
df['review_nostopwords'] = df['review_nopunc'].apply(lambda text: " ".join(token for token in text.split() if token not in stop_words))

df.head()

  df['review_nopunc'] = df['review_lower'].str.replace('[^\w\s]', '')


Unnamed: 0,review,word_count,char_count,avg_word,stopword_count,stopword_rate,review_lower,review_nopunc,review_nostopwords
0,It's cool place. All the art is by Donkey boy ...,46,211,3.608696,19,0.413043,it's cool place. all the art is by donkey boy ...,its cool place all the art is by donkey boy or...,cool place art donkey boy alex ramos lot astro...
1,Highly recommend the shaking beef.... so much ...,28,152,4.464286,12,0.428571,highly recommend the shaking beef.... so much ...,highly recommend the shaking beef so much flav...,highly recommend shaking beef much flavor fres...
2,I really love the food at this place. I've be...,131,715,4.412214,60,0.458015,i really love the food at this place. i've bee...,i really love the food at this place ive been ...,really love food place ive pub well place perf...
3,Nice place to pick up or order & wait. It isn'...,310,1606,4.183871,139,0.448387,nice place to pick up or order & wait. it isn'...,nice place to pick up or order wait it isnt t...,nice place pick order wait isnt pleasant seat ...
4,"Nobi is always good, any time of the day, no m...",73,398,4.465753,32,0.438356,"nobi is always good, any time of the day, no m...",nobi is always good any time of the day no mat...,nobi always good time day matter get bánh mis ...


In [57]:
# Rank the tokens that survived stopword removal to spot further candidate
# stopwords; only the 30 most frequent are kept.
corpus_tokens = " ".join(df['review_nostopwords']).split()
freq = pd.Series(corpus_tokens).value_counts().head(30)
print(freq)

place         65
food          62
good          61
pork          57
great         48
sandwich      44
one           33
nobi          32
order         31
always        30
rolls         30
go            30
mi            30
fried         29
also          29
service       28
chicken       28
rice          26
time          25
fresh         25
sandwiches    24
get           24
banh          24
best          23
ordered       23
beer          23
like          22
egg           22
grilled       21
people        21
dtype: int64


In [59]:
# Extra corpus-specific words to drop on top of the standard stopword list.
more_stop_words = ['asian', 'us', 'mi', 'still', 'place', 'one', 'also']
# Rebuild each review from the tokens that are not in the extra list.
df['cleanreviews'] = df['review_nostopwords'].map(
    lambda text: " ".join(token for token in text.split() if token not in more_stop_words)
)
df.head()

Unnamed: 0,review,word_count,char_count,avg_word,stopword_count,stopword_rate,review_lower,review_nopunc,review_nostopwords,cleanreviews
0,It's cool place. All the art is by Donkey boy ...,46,211,3.608696,19,0.413043,it's cool place. all the art is by donkey boy ...,its cool place all the art is by donkey boy or...,cool place art donkey boy alex ramos lot astro...,cool art donkey boy alex ramos lot astros art ...
1,Highly recommend the shaking beef.... so much ...,28,152,4.464286,12,0.428571,highly recommend the shaking beef.... so much ...,highly recommend the shaking beef so much flav...,highly recommend shaking beef much flavor fres...,highly recommend shaking beef much flavor fres...
2,I really love the food at this place. I've be...,131,715,4.412214,60,0.458015,i really love the food at this place. i've bee...,i really love the food at this place ive been ...,really love food place ive pub well place perf...,really love food ive pub well perfect quick me...
3,Nice place to pick up or order & wait. It isn'...,310,1606,4.183871,139,0.448387,nice place to pick up or order & wait. it isn'...,nice place to pick up or order wait it isnt t...,nice place pick order wait isnt pleasant seat ...,nice pick order wait isnt pleasant seat eat du...
4,"Nobi is always good, any time of the day, no m...",73,398,4.465753,32,0.438356,"nobi is always good, any time of the day, no m...",nobi is always good any time of the day no mat...,nobi always good time day matter get bánh mis ...,nobi always good time day matter get bánh mis ...


## Lemmatization (taking a word back to the basics)

In [60]:
# Reduce every token of the cleaned review to its lemma and re-join with spaces.
df['lemmatized'] = df['cleanreviews'].map(
    lambda text: " ".join(Word(token).lemmatize() for token in text.split())
)

In [61]:
# Display the frame with every derived column (pandas elides the middle rows).
df

Unnamed: 0,review,word_count,char_count,avg_word,stopword_count,stopword_rate,review_lower,review_nopunc,review_nostopwords,cleanreviews,lemmatized
0,It's cool place. All the art is by Donkey boy ...,46,211,3.608696,19,0.413043,it's cool place. all the art is by donkey boy ...,its cool place all the art is by donkey boy or...,cool place art donkey boy alex ramos lot astro...,cool art donkey boy alex ramos lot astros art ...,cool art donkey boy alex ramos lot astros art ...
1,Highly recommend the shaking beef.... so much ...,28,152,4.464286,12,0.428571,highly recommend the shaking beef.... so much ...,highly recommend the shaking beef so much flav...,highly recommend shaking beef much flavor fres...,highly recommend shaking beef much flavor fres...,highly recommend shaking beef much flavor fres...
2,I really love the food at this place. I've be...,131,715,4.412214,60,0.458015,i really love the food at this place. i've bee...,i really love the food at this place ive been ...,really love food place ive pub well place perf...,really love food ive pub well perfect quick me...,really love food ive pub well perfect quick me...
3,Nice place to pick up or order & wait. It isn'...,310,1606,4.183871,139,0.448387,nice place to pick up or order & wait. it isn'...,nice place to pick up or order wait it isnt t...,nice place pick order wait isnt pleasant seat ...,nice pick order wait isnt pleasant seat eat du...,nice pick order wait isnt pleasant seat eat du...
4,"Nobi is always good, any time of the day, no m...",73,398,4.465753,32,0.438356,"nobi is always good, any time of the day, no m...",nobi is always good any time of the day no mat...,nobi always good time day matter get bánh mis ...,nobi always good time day matter get bánh mis ...,nobi always good time day matter get bánh mi b...
...,...,...,...,...,...,...,...,...,...,...,...
92,"The food speaks for it self, fantastic. Howeve...",72,385,4.361111,31,0.430556,"the food speaks for it self, fantastic. howeve...",the food speaks for it self fantastic however ...,food speaks self fantastic however one thing r...,food speaks self fantastic however thing reall...,food speaks self fantastic however thing reall...
93,I love everything this place serves. Nice owne...,44,248,4.659091,14,0.318182,i love everything this place serves. nice owne...,i love everything this place serves nice owner...,love everything place serves nice owners great...,love everything serves nice owners great staff...,love everything serf nice owner great staff mi...
94,My wife and I love this place. They combine my...,50,273,4.480000,20,0.400000,my wife and i love this place. they combine my...,my wife and i love this place they combine my ...,wife love place combine two favorites craft be...,wife love combine two favorites craft beer vie...,wife love combine two favorite craft beer viet...
95,I eat here 1 or 2 times a week. Everything is ...,36,172,3.805556,11,0.305556,i eat here 1 or 2 times a week. everything is ...,i eat here 1 or 2 times a week everything is g...,eat 1 2 times week everything great alway go b...,eat 1 2 times week everything great alway go b...,eat 1 2 time week everything great alway go ba...


## Sentiment Analysis

In [64]:
# Run TextBlob's sentiment analysis once per review and reuse the result for
# both columns instead of analysing every review twice.
sentiments = [TextBlob(text).sentiment for text in df['lemmatized']]
# Polarity metric: how negative or positive the review is.
df['polarity'] = [sentiment.polarity for sentiment in sentiments]
# Subjectivity metric: how opinionated (rather than factual) the review is.
df['subjectivity'] = [sentiment.subjectivity for sentiment in sentiments]
df[['lemmatized', 'polarity', 'subjectivity']]

Unnamed: 0,lemmatized,polarity,subjectivity
0,cool art donkey boy alex ramos lot astros art ...,0.462500,0.537500
1,highly recommend shaking beef much flavor fres...,0.372000,0.468000
2,really love food ive pub well perfect quick me...,0.329683,0.679524
3,nice pick order wait isnt pleasant seat eat du...,0.172364,0.534410
4,nobi always good time day matter get bánh mi b...,0.595833,0.620833
...,...,...,...
92,food speaks self fantastic however thing reall...,0.387037,0.627778
93,love everything serf nice owner great staff mi...,0.391667,0.491667
94,wife love combine two favorite craft beer viet...,0.179762,0.540476
95,eat 1 2 time week everything great alway go ba...,0.480000,0.550000


In [65]:
# Order the reviews from most negative to most positive polarity.
df.sort_values('polarity', ascending=True)

Unnamed: 0,review,word_count,char_count,avg_word,stopword_count,stopword_rate,review_lower,review_nopunc,review_nostopwords,cleanreviews,lemmatized,polarity,subjectivity
33,"Have us wrong food, brought it back gave and u...",45,239,4.333333,11,0.244444,"have us wrong food, brought it back gave and u...",have us wrong food brought it back gave and us...,us wrong food brought back gave us cold food a...,wrong food brought back gave cold food actuall...,wrong food brought back gave cold food actuall...,-0.145000,0.710000
30,I tried this place based on a friend's recomme...,85,454,4.352941,31,0.364706,i tried this place based on a friend's recomme...,i tried this place based on a friends recommen...,tried place based friends recommendation order...,tried based friends recommendation ordered por...,tried based friend recommendation ordered pork...,-0.005556,0.516667
13,Great food. Not much in house seating. Nice pl...,92,469,4.108696,40,0.434783,great food. not much in house seating. nice pl...,great food not much in house seating nice plac...,great food much house seating nice place grab ...,great food much house seating nice grab beer m...,great food much house seating nice grab beer m...,0.031746,0.478571
23,Had a couple of sandwiches one chicken one por...,66,355,4.393939,21,0.318182,had a couple of sandwiches one chicken one por...,had a couple of sandwiches one chicken one por...,couple sandwiches one chicken one pork called ...,couple sandwiches chicken pork called ahead pi...,couple sandwich chicken pork called ahead pick...,0.032727,0.427879
88,This place sure knows how to market itself. T...,203,1134,4.541872,86,0.423645,this place sure knows how to market itself. th...,this place sure knows how to market itself the...,place sure knows market theres nothing wrong f...,sure knows market theres nothing wrong food be...,sure know market there nothing wrong food bett...,0.070625,0.523194
...,...,...,...,...,...,...,...,...,...,...,...,...,...
53,I LOVE this place! The bread is amazing. I'm v...,37,190,4.162162,12,0.324324,i love this place! the bread is amazing. i'm v...,i love this place the bread is amazing im vega...,love place bread amazing im vegan always get t...,love bread amazing im vegan always get tofu sa...,love bread amazing im vegan always get tofu sa...,0.775000,0.700000
49,Best sandwiches ever i so recommend this place...,9,62,6.000000,3,0.333333,best sandwiches ever i so recommend this place...,best sandwiches ever i so recommend this place...,best sandwiches ever recommend place amazing,best sandwiches ever recommend amazing,best sandwich ever recommend amazing,0.800000,0.600000
63,Incredible food that the staff recommended. We...,17,104,5.176471,6,0.352941,incredible food that the staff recommended. we...,incredible food that the staff recommended we ...,incredible food staff recommended great pork s...,incredible food staff recommended great pork s...,incredible food staff recommended great pork s...,0.900000,0.883333
5,Incredible food that the staff recommended. We...,17,104,5.176471,6,0.352941,incredible food that the staff recommended. we...,incredible food that the staff recommended we ...,incredible food staff recommended great pork s...,incredible food staff recommended great pork s...,incredible food staff recommended great pork s...,0.900000,0.883333


# Part II (BERT Model)

In [41]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

## Setup Model and Neural Network

In [42]:
# Hugging Face checkpoint shared by the tokenizer and the classifier.
MODEL_NAME = 'nlptown/bert-base-multilingual-uncased-sentiment'
# The tokenizer converts strings to numbers (token ids).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# The model rates how positive a review is by predicting its number of stars on Yelp.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

## Calculating Sentiment

In [43]:
# Smoke-test the tokenizer and model on a simple string.
tokens = tokenizer.encode('This was amazing! I loved it.', return_tensors='pt')
# Inference only, so skip gradient tracking.
with torch.no_grad():
    result = model(tokens)
# The sentiment is the index of the largest logit, shifted by one so that the
# score starts at 1 (higher number = better sentiment).
int(torch.argmax(result.logits))+1

5

## Score Model

In [66]:
# Fresh frame holding only the raw review text, used as input for the BERT model.
df2 = pd.DataFrame({'review': np.array(reviews)})
# Peek at the first review.
df2['review'].iat[0]

"It's cool place. All the art is by Donkey boy or Alex Ramos who does a lot of the astros art work. In fact he did the mural at the BBVA stadium. Good food. Good stuff. Check it out and if you want pho, go early."

In [68]:
def sentiment_score(review):
    """Return the sentiment score the BERT model predicts for ``review``.

    The text is tokenized with the module-level ``tokenizer`` and run through
    the module-level ``model``.

    Args:
        review: Review text as a string.

    Returns:
        The index of the largest logit plus one, as an int (a higher number
        means better sentiment).
    """
    # Truncate at the token level so an over-long review cannot exceed the
    # model's 512-token input limit and fail inside the forward pass.
    tokens = tokenizer.encode(review, return_tensors='pt', truncation=True, max_length=512)
    # Inference only, so there is no need to track gradients.
    with torch.no_grad():
        result = model(tokens)
    return int(torch.argmax(result.logits))+1

In [72]:
# Score every review with the BERT model. Only the first 512 characters of each
# review are passed in; note this is a character slice, not a token count, used
# to keep the input within the model's limit.
df2['sentiment'] = df2['review'].map(lambda text: sentiment_score(text[:512]))

In [73]:
# Display each review alongside its predicted sentiment (pandas elides the middle rows).
df2

Unnamed: 0,review,sentiment
0,It's cool place. All the art is by Donkey boy ...,4
1,Highly recommend the shaking beef.... so much ...,5
2,I really love the food at this place. I've be...,4
3,Nice place to pick up or order & wait. It isn'...,3
4,"Nobi is always good, any time of the day, no m...",5
...,...,...
92,"The food speaks for it self, fantastic. Howeve...",4
93,I love everything this place serves. Nice owne...,5
94,My wife and I love this place. They combine my...,4
95,I eat here 1 or 2 times a week. Everything is ...,4
