<a href="https://colab.research.google.com/github/frizchar/Sentiment-Analysis-on-Web-Scraped-Yelp-Reviews/blob/main/Sentiment_Analysis_on_Web_Scraped_Yelp_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Collect Yelp Reviews

In [1]:
import requests
from bs4 import BeautifulSoup
import re

In [2]:
r = requests.get('https://www.yelp.ie/biz/the-cake-caf%C3%A9-dublin-2')
soup = BeautifulSoup(r.text, 'html.parser')
regex = re.compile('.*comment.*')
results = soup.find_all('p', {'class':regex})
reviews = [result.text for result in results]

In [3]:
reviews

["Lovely little café off Camden Street. You walk through a book shop to get to a pretty courtyard which is very peaceful considering it's in the middle of Dublin City. Staff were lovely and service was quick too. We ordered the Green Goddess poached eggs and the potato farl. The green goddess was delicious - three thick slices of seedy toast, pea guacamole, 2 poached eggs, feta, spring onion, coriander and a drizzle of their house green goddess dressing. I highly recommend! Potato farl was served with sautéed onions, feta, fried egg, bacon and a drizzle of the green goddess dressing. We couldn't resist taking away a slice of the carrot cake and the chocolate fudge cake - divine!",
 'I ordered online for a cake and a tea towel to be delivered to my friend living in Dublin. I live in the U.S. and relied on Yelp ratings to choose the Cake Cafe. The cake looked beautiful (my friend sent a picture), she loved the tea towel, and she said the cake was delicious. Ray, from the Cake Cafe was ve

# Analyse The Data

In [4]:
import pandas as pd
import numpy as np

In [5]:
df = pd.DataFrame(np.array(reviews), columns=['review'])
df.tail()

Unnamed: 0,review
5,Quirky cakey goodnessA lovely selection of tea...
6,"Amazing meal at Cake Café. Very nice people, b..."
7,What can be said that hasn't already been said...
8,We visited The Cake Café while we were visitin...
9,Super awesome place to order cakes made and de...


In [6]:
df['word_count'] = df['review'].apply(lambda x: len(x.split()))
df['word_count'].tail()

5     42
6     49
7    109
8     37
9     44
Name: word_count, dtype: int64

In [7]:
df['char_count'] = df['review'].apply(lambda x: len(x))
df['char_count'].tail()

5    250
6    444
7    630
8    227
9    250
Name: char_count, dtype: int64

In [8]:
df

Unnamed: 0,review,word_count,char_count
0,Lovely little café off Camden Street. You walk...,119,685
1,I ordered online for a cake and a tea towel to...,70,379
2,Yummy cake bites (mini slices of sponge cake w...,29,167
3,I went to The Cake Cafe during my first trip t...,137,699
4,You had me at cake. We came here to share a do...,57,295
5,Quirky cakey goodnessA lovely selection of tea...,42,250
6,"Amazing meal at Cake Café. Very nice people, b...",49,444
7,What can be said that hasn't already been said...,109,630
8,We visited The Cake Café while we were visitin...,37,227
9,Super awesome place to order cakes made and de...,44,250


In [9]:
#drop reviews with 0 words
df = df.drop(df.index[df['word_count'] == 0.0])
df

Unnamed: 0,review,word_count,char_count
0,Lovely little café off Camden Street. You walk...,119,685
1,I ordered online for a cake and a tea towel to...,70,379
2,Yummy cake bites (mini slices of sponge cake w...,29,167
3,I went to The Cake Cafe during my first trip t...,137,699
4,You had me at cake. We came here to share a do...,57,295
5,Quirky cakey goodnessA lovely selection of tea...,42,250
6,"Amazing meal at Cake Café. Very nice people, b...",49,444
7,What can be said that hasn't already been said...,109,630
8,We visited The Cake Café while we were visitin...,37,227
9,Super awesome place to order cakes made and de...,44,250


In [10]:
def average_words(x):
    words = x.split()
    return sum(len(word) for word in words) / len(words)    

In [11]:
df['average_word_length'] = df['review'].apply(lambda x: average_words(x))

In [12]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [13]:
stop_words = stopwords.words('greek')
stop_words[:5]

['αλλα', 'αν', 'αντι', 'απο', 'αυτα']

In [14]:
df['stopword_count'] = df['review'].apply(lambda x: len([word for word in x.split() if word.lower() in stop_words]))
df['stopword_rate'] = df['stopword_count'] / df['word_count']
df.head()

Unnamed: 0,review,word_count,char_count,average_word_length,stopword_count,stopword_rate
0,Lovely little café off Camden Street. You walk...,119,685,4.764706,0,0.0
1,I ordered online for a cake and a tea towel to...,70,379,4.428571,0,0.0
2,Yummy cake bites (mini slices of sponge cake w...,29,167,4.793103,0,0.0
3,I went to The Cake Cafe during my first trip t...,137,699,4.109489,0,0.0
4,You had me at cake. We came here to share a do...,57,295,4.192982,0,0.0


In [15]:
df.describe()

Unnamed: 0,word_count,char_count,average_word_length,stopword_count,stopword_rate
count,10.0,10.0,10.0,10.0,10.0
mean,69.3,402.6,4.997856,0.0,0.0
std,38.363901,201.681377,1.131655,0.0,0.0
min,29.0,167.0,4.109489,0.0,0.0
25%,42.5,250.0,4.497565,0.0,0.0
50%,53.0,337.0,4.776848,0.0,0.0
75%,99.25,583.5,4.912562,0.0,0.0
max,137.0,699.0,8.081633,0.0,0.0


# Data Cleansing

In [16]:
#generate the lowercase reviews
df['lowercase'] = df['review'].apply(lambda x: " ".join(word.lower() for word in x.split()))
#remove everything that is not a word or a space
df['punctuation'] = df['lowercase'].str.replace('[^\w\s]', '')
#remove stopwords from the punctuated reviews
df['cleaned_review'] = df['punctuation'].apply(lambda x: " ".join(word for word in x.split() if word not in stop_words))
#calculate rate of clean words per review
df['cleaned_review_word_count'] = df['cleaned_review'].apply(lambda x: len(x.split()))
df['clean_rate'] = df['cleaned_review_word_count'] / df['word_count']

In [17]:
#inspect dataframe
df.head()

Unnamed: 0,review,word_count,char_count,average_word_length,stopword_count,stopword_rate,lowercase,punctuation,cleaned_review,cleaned_review_word_count,clean_rate
0,Lovely little café off Camden Street. You walk...,119,685,4.764706,0,0.0,lovely little café off camden street. you walk...,lovely little café off camden street you walk ...,lovely little café off camden street you walk ...,117,0.983193
1,I ordered online for a cake and a tea towel to...,70,379,4.428571,0,0.0,i ordered online for a cake and a tea towel to...,i ordered online for a cake and a tea towel to...,i ordered online for a cake and a tea towel to...,70,1.0
2,Yummy cake bites (mini slices of sponge cake w...,29,167,4.793103,0,0.0,yummy cake bites (mini slices of sponge cake w...,yummy cake bites mini slices of sponge cake wi...,yummy cake bites mini slices of sponge cake wi...,26,0.896552
3,I went to The Cake Cafe during my first trip t...,137,699,4.109489,0,0.0,i went to the cake cafe during my first trip t...,i went to the cake cafe during my first trip t...,i went to the cake cafe during my first trip t...,137,1.0
4,You had me at cake. We came here to share a do...,57,295,4.192982,0,0.0,you had me at cake. we came here to share a do...,you had me at cake we came here to share a dou...,you had me at cake we came here to share a dou...,57,1.0


#Lemmatization

In [18]:
#Import textblob
from textblob import Word
nltk.download('wordnet')
from nltk import wordnet

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [19]:
df['lemmatized'] = df['cleaned_review'].apply(lambda x: " ".join(Word(word).lemmatize() for word in x.split()))
df.head()

Unnamed: 0,review,word_count,char_count,average_word_length,stopword_count,stopword_rate,lowercase,punctuation,cleaned_review,cleaned_review_word_count,clean_rate,lemmatized
0,Lovely little café off Camden Street. You walk...,119,685,4.764706,0,0.0,lovely little café off camden street. you walk...,lovely little café off camden street you walk ...,lovely little café off camden street you walk ...,117,0.983193,lovely little café off camden street you walk ...
1,I ordered online for a cake and a tea towel to...,70,379,4.428571,0,0.0,i ordered online for a cake and a tea towel to...,i ordered online for a cake and a tea towel to...,i ordered online for a cake and a tea towel to...,70,1.0,i ordered online for a cake and a tea towel to...
2,Yummy cake bites (mini slices of sponge cake w...,29,167,4.793103,0,0.0,yummy cake bites (mini slices of sponge cake w...,yummy cake bites mini slices of sponge cake wi...,yummy cake bites mini slices of sponge cake wi...,26,0.896552,yummy cake bite mini slice of sponge cake with...
3,I went to The Cake Cafe during my first trip t...,137,699,4.109489,0,0.0,i went to the cake cafe during my first trip t...,i went to the cake cafe during my first trip t...,i went to the cake cafe during my first trip t...,137,1.0,i went to the cake cafe during my first trip t...
4,You had me at cake. We came here to share a do...,57,295,4.192982,0,0.0,you had me at cake. we came here to share a do...,you had me at cake we came here to share a dou...,you had me at cake we came here to share a dou...,57,1.0,you had me at cake we came here to share a dou...


#Sentiment Analysis

In [20]:
from textblob import TextBlob

In [21]:
df['polarity'] = df['lemmatized'].apply(lambda x: TextBlob(x).sentiment[0] )
df['subjectivity'] = df['lemmatized'].apply(lambda x: TextBlob(x).sentiment[1] )

In [22]:
df.head()

Unnamed: 0,review,word_count,char_count,average_word_length,stopword_count,stopword_rate,lowercase,punctuation,cleaned_review,cleaned_review_word_count,clean_rate,lemmatized,polarity,subjectivity
0,Lovely little café off Camden Street. You walk...,119,685,4.764706,0,0.0,lovely little café off camden street. you walk...,lovely little café off camden street you walk ...,lovely little café off camden street you walk ...,117,0.983193,lovely little café off camden street you walk ...,0.127202,0.526071
1,I ordered online for a cake and a tea towel to...,70,379,4.428571,0,0.0,i ordered online for a cake and a tea towel to...,i ordered online for a cake and a tea towel to...,i ordered online for a cake and a tea towel to...,70,1.0,i ordered online for a cake and a tea towel to...,0.507727,0.69
2,Yummy cake bites (mini slices of sponge cake w...,29,167,4.793103,0,0.0,yummy cake bites (mini slices of sponge cake w...,yummy cake bites mini slices of sponge cake wi...,yummy cake bites mini slices of sponge cake wi...,26,0.896552,yummy cake bite mini slice of sponge cake with...,0.175,0.266667
3,I went to The Cake Cafe during my first trip t...,137,699,4.109489,0,0.0,i went to the cake cafe during my first trip t...,i went to the cake cafe during my first trip t...,i went to the cake cafe during my first trip t...,137,1.0,i went to the cake cafe during my first trip t...,0.275,0.561667
4,You had me at cake. We came here to share a do...,57,295,4.192982,0,0.0,you had me at cake. we came here to share a do...,you had me at cake we came here to share a dou...,you had me at cake we came here to share a dou...,57,1.0,you had me at cake we came here to share a dou...,0.5,0.45
