In this notebook additional analysis is done on the review text data. The review text is split up in bigrams to help find the most important topics in competitor reviews.

In [1]:
#import packages
%matplotlib inline
import matplotlib.pyplot as plt 
import pandas as pd
import numpy as np
import seaborn as sns
import nltk

sns.set()

In [2]:
df = pd.read_csv('yelp_reviews_filtered.csv')

### Reviews mentioning Food and Restaurant

In [3]:
#create strings for the reviews mentioning food and spa
food = df.text[df.text.str.contains('food')]
rest = df.text[df.text.str.contains('restaurant')]
foodstr = food.str.cat()
reststr = rest.str.cat()

In [4]:
print('Reviews containing food: ', len(food))
print('Reviews containing restaurant: ', len(rest))

Reviews containing food:  10637
Reviews containing restaurant:  8890


In [5]:
from collections import Counter
from nltk.tokenize import word_tokenize

# Tokenize
tokensf = word_tokenize(foodstr)

# Convert the tokens into lowercase: lower_tokens
lower_tokensf = [t.lower() for t in tokensf]

In [6]:
# Tokenize
tokensr = word_tokenize(reststr)

# Convert the tokens into lowercase: lower_tokens
lower_tokensr = [t.lower() for t in tokensr]

In [7]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
stop_words.update(['hotel', 'stay', 'stayed', 'one', 'get', 'u'])

# Retain alphabetic words: alpha_only
alpha_onlyf = [t for t in lower_tokensf if t.isalpha()]

# Remove all stop words: no_stops
no_stopsf = [t  for t in alpha_onlyf if t not in stop_words]

# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatize all tokens into a new list: lemmatized
lemmatizedf = [wordnet_lemmatizer.lemmatize(t) for t in no_stopsf]

In [8]:
# Retain alphabetic words: alpha_only
alpha_onlyr = [t for t in lower_tokensr if t.isalpha()]

# Remove all stop words: no_stops
no_stopsr = [t  for t in alpha_onlyr if t not in stop_words]

# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatize all tokens into a new list: lemmatized
lemmatizedr = [wordnet_lemmatizer.lemmatize(t) for t in no_stopsr]

In [9]:
#Create your bigrams
bgsf = nltk.bigrams(lemmatizedf)

#compute frequency distribution for all the bigrams in the text
fdistf = nltk.FreqDist(bgsf)

In [10]:
#Create your bigrams
bgsr = nltk.bigrams(lemmatizedr)

#compute frequency distribution for all the bigrams in the text
fdistr = nltk.FreqDist(bgsr)





In [11]:
# Print the 20 most common bigrams
print('Reviews mentioning food: ', fdistf.most_common(20))



Reviews mentioning food:  [(('front', 'desk'), 1998), (('la', 'vega'), 1664), (('food', 'court'), 1628), (('resort', 'fee'), 1215), (('room', 'service'), 1183), (('customer', 'service'), 761), (('pool', 'area'), 673), (('room', 'clean'), 660), (('room', 'nice'), 611), (('monte', 'carlo'), 607), (('first', 'time'), 589), (('good', 'food'), 580), (('mandalay', 'bay'), 572), (('food', 'option'), 571), (('even', 'though'), 553), (('pretty', 'good'), 527), (('come', 'back'), 515), (('food', 'good'), 464), (('great', 'food'), 461), (('casino', 'floor'), 441)]


In [12]:
print('Reviews mentioning restaurant: ', fdistr.most_common(20))

Reviews mentioning restaurant:  [(('la', 'vega'), 1551), (('front', 'desk'), 1434), (('resort', 'fee'), 1014), (('room', 'service'), 775), (('mandalay', 'bay'), 677), (('pool', 'area'), 622), (('room', 'nice'), 610), (('room', 'clean'), 593), (('food', 'court'), 582), (('customer', 'service'), 571), (('new', 'york'), 464), (('first', 'time'), 460), (('feel', 'like'), 453), (('monte', 'carlo'), 441), (('casino', 'floor'), 438), (('even', 'though'), 435), (('hard', 'rock'), 420), (('great', 'restaurant'), 418), (('red', 'rock'), 410), (('pretty', 'good'), 403)]


### Reviews mentioning Spa

In [13]:
spa = df.text[df.text.str.contains('spa')]
spastr = spa.str.cat()

In [14]:
# Tokenize
tokenss = word_tokenize(spastr)

# Convert the tokens into lowercase: lower_tokens
lower_tokenss = [t.lower() for t in tokenss]

In [15]:
# Retain alphabetic words: alpha_only
alpha_onlys = [t for t in lower_tokenss if t.isalpha()]

# Remove all stop words: no_stops
no_stopss = [t  for t in alpha_onlys if t not in stop_words]

# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatize all tokens into a new list: lemmatized
lemmatizeds = [wordnet_lemmatizer.lemmatize(t) for t in no_stopss]

In [16]:
#Create your bigrams
bgss = nltk.bigrams(lemmatizeds)

#compute frequency distribution for all the bigrams in the text
fdists = nltk.FreqDist(bgss)

In [17]:
# Print the 20 most common bigrams
print('Reviews mentioning spa: ', fdists.most_common(20))

Reviews mentioning spa:  [(('front', 'desk'), 1998), (('la', 'vega'), 1664), (('food', 'court'), 1628), (('resort', 'fee'), 1215), (('room', 'service'), 1183), (('customer', 'service'), 761), (('pool', 'area'), 673), (('room', 'clean'), 660), (('room', 'nice'), 611), (('monte', 'carlo'), 607), (('first', 'time'), 589), (('good', 'food'), 580), (('mandalay', 'bay'), 572), (('food', 'option'), 571), (('even', 'though'), 553), (('pretty', 'good'), 527), (('come', 'back'), 515), (('food', 'good'), 464), (('great', 'food'), 461), (('casino', 'floor'), 441)]


### Top Rival Reviews

In [18]:
#look at who the highest rated competitors are
rivals = df.groupby('name').mean()
rivals.review_count.describe()

count      80.000000
mean      770.219318
std       993.217078
min         3.000000
25%        63.000000
50%       340.500000
75%      1017.000000
max      4041.000000
Name: review_count, dtype: float64

In [19]:
big_rivals = rivals[rivals.review_count > 1000]
big_rivals.sort_values('company_rating', ascending=False)

Unnamed: 0_level_0,cool,funny,review_rating,useful,is_open,review_count,company_rating,text_length
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
M Resort Spa Casino,0.864943,0.793103,3.94636,1.58908,1.0,1044.0,4.0,771.402299
ARIA Resort & Casino,0.731502,0.816382,3.492947,1.663202,1.0,4041.0,3.5,987.181886
"South Point Hotel, Casino & Spa",0.564729,0.588507,3.422721,1.449802,1.0,1514.0,3.5,731.21004
Red Rock Casino Resort & Spa,0.79361,0.735463,3.594249,1.804473,1.0,1564.0,3.5,831.402556
Caesars Palace Las Vegas Hotel & Casino,0.870526,0.86329,3.186976,1.631759,1.0,2627.0,3.0,841.00495
Hard Rock Hotel & Casino,0.738631,0.76068,3.030776,1.602205,1.0,2175.0,3.0,962.060634
Mandalay Bay Resort & Casino,0.861629,0.821256,3.245687,1.597308,1.0,2898.0,3.0,899.205314
Bally's Las Vegas Hotel & Casino,0.605711,0.783635,2.878638,1.719385,1.0,1821.0,3.0,844.846787
New York New York Hotel & Casino,0.779516,0.745711,3.232089,1.436428,1.0,1982.0,3.0,801.771443
Palms Casino Resort,0.842912,0.958621,2.954789,1.914943,1.0,1305.0,3.0,916.8659


In [20]:
top_rivals = big_rivals[big_rivals.company_rating >=3.5]

In [21]:
tr = top_rivals.merge(df, on='name')
neg_tr = tr[tr.review_rating_y < 3]
pos_tr = tr[tr.review_rating_y > 3]
trn = neg_tr.text.str.cat()
trp = pos_tr.text.str.cat()

In [22]:
# Tokenize
tokenstrp = word_tokenize(trp)

# Convert the tokens into lowercase: lower_tokens
lower_tokenstrp = [t.lower() for t in tokenstrp]

In [23]:
# Tokenize
tokenstrn = word_tokenize(trn)

# Convert the tokens into lowercase: lower_tokens
lower_tokenstrn = [t.lower() for t in tokenstrn]

In [24]:
# Retain alphabetic words: alpha_only
alpha_onlytrp = [t for t in lower_tokenstrp if t.isalpha()]

# Remove all stop words: no_stops
no_stopstrp = [t  for t in alpha_onlytrp if t not in stop_words]

# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatize all tokens into a new list: lemmatized
lemmatizedtrp = [wordnet_lemmatizer.lemmatize(t) for t in no_stopstrp]


In [25]:
# Retain alphabetic words: alpha_only
alpha_onlytrn = [t for t in lower_tokenstrn if t.isalpha()]

# Remove all stop words: no_stops
no_stopstrn = [t  for t in alpha_onlytrn if t not in stop_words]

# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatize all tokens into a new list: lemmatized
lemmatizedtrn = [wordnet_lemmatizer.lemmatize(t) for t in no_stopstrn]

In [26]:
#Create your bigrams
bgstrp = nltk.bigrams(lemmatizedtrp)

#compute frequency distribution for all the bigrams in the text
fdiststrp = nltk.FreqDist(bgstrp)

In [27]:
#Create your bigrams
bgstrn = nltk.bigrams(lemmatizedtrn)

#compute frequency distribution for all the bigrams in the text
fdiststrn = nltk.FreqDist(bgstrn)

In [28]:
print('Top Rival positive reviews: ', fdiststrp.most_common(20))

Top Rival positive reviews:  [(('red', 'rock'), 728), (('la', 'vega'), 672), (('bowling', 'alley'), 422), (('movie', 'theater'), 401), (('front', 'desk'), 394), (('south', 'point'), 389), (('room', 'service'), 364), (('pool', 'area'), 327), (('room', 'nice'), 294), (('first', 'time'), 251), (('really', 'nice'), 245), (('customer', 'service'), 241), (('resort', 'fee'), 228), (('room', 'clean'), 224), (('great', 'place'), 214), (('sky', 'suite'), 209), (('next', 'time'), 205), (('would', 'definitely'), 193), (('casino', 'floor'), 192), (('feel', 'like'), 192)]


In [29]:
print('Top Rival negative reviews: ', fdiststrn.most_common(20))

Top Rival negative reviews:  [(('front', 'desk'), 724), (('customer', 'service'), 369), (('red', 'rock'), 226), (('resort', 'fee'), 177), (('la', 'vega'), 176), (('room', 'service'), 140), (('told', 'u'), 114), (('south', 'point'), 113), (('called', 'front'), 107), (('even', 'though'), 104), (('come', 'back'), 103), (('next', 'day'), 100), (('came', 'back'), 96), (('room', 'ready'), 95), (('first', 'time'), 85), (('said', 'would'), 83), (('go', 'back'), 80), (('gave', 'u'), 80), (('room', 'nice'), 78), (('got', 'room'), 75)]
