In [None]:
%matplotlib inline
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from math import pi
import seaborn as sns

import csv

# Clean Dataset

The data used in this project is the Yelp dataset. We selected only the boba shops from the business dataset and combined each business with its reviews.

In [None]:
# Load the boba-shop reviews.
df = pd.read_csv("yelp_reviews_boba_categories.csv")

# Derive a "Sentiment" label from the star rating:
#   >= 4      -> Positive
#   3 or 3.5  -> Neutral
#   <= 2.5    -> Negative
# Rows matching none of the rules are left without a Sentiment value.
star_rating = df["stars"]
df.loc[star_rating >= 4, "Sentiment"] = 'Positive'
df.loc[star_rating.isin([3, 3.5]), "Sentiment"] = 'Neutral'
df.loc[star_rating <= 2.5, "Sentiment"] = 'Negative'
df.head()

In [None]:
# Number of distinct businesses in the clean dataset.
# dropna=False keeps a missing id as its own value, matching unique().
print(df["business_id"].nunique(dropna=False))

# EDA for the Clean Dataset

In [None]:
# Total number of reviews for each restaurant, drawn as a wide bar chart.
reviewsCount = df.groupby("name")["stars"].count()
wide_fig = plt.figure()
wide_fig.set_figwidth(20)  # widen the canvas so rotated labels stay legible
plt.bar(x=reviewsCount.index, height=reviewsCount)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Distribution of star ratings in the dataframe: one pie slice per
# review_stars value, sized by the count of non-null 'text' entries.
per_rating_counts = df.groupby(['review_stars']).count()
per_rating_counts.plot(kind='pie', y='text', ylabel="Review Ratings", autopct='%1.0f%%')

In [None]:
# Report the number of rows, then cast every attributes entry to a string
# so the string slicing / substring checks in the following cells work.
print(len(df['attributes']))
df['attributes'] = df['attributes'].astype(str)
df.head(50)

This graph shows the relationship between whether Wi-Fi is free and the star rating given by the user. Because there are more low ratings when Wi-Fi is not free, there could be a relationship between the cost of Wi-Fi and the star rating given by the user.

In [None]:
# Label each row by whether its attributes string marks Wi-Fi as free.
# The WiFi entry is matched wherever it appears in the string instead of at
# a fixed character offset, which only works when WiFi is the first key.
# Both 'free' and u'free' spellings of the value are accepted.
# NOTE(review): assumes attributes is a dict-like string such as
# {'WiFi': "u'free'", ...} -- confirm against the raw data.
attributes_text = df['attributes'].astype(str)
has_free_wifi = attributes_text.str.contains(
    r"""['"]WiFi['"]\s*:\s*['"]?u?['"]free['"]""", regex=True
)
# One vectorized assignment replaces the per-row .loc writes and no longer
# depends on the frame having a default 0..n-1 index.
df['wifi'] = np.where(has_free_wifi, "Wi-Fi is free", "Wi-Fi is not free")
sns.boxplot(x='wifi', y='stars', data=df)

In [None]:
# Flag each row by whether its attributes string contains the literal
# "'RestaurantsTakeOut': 'True'" entry. A single vectorized substring check
# replaces the per-row .loc writes and no longer depends on the frame
# having a default 0..n-1 index.
offers_takeout = df['attributes'].astype(str).str.contains(
    "'RestaurantsTakeOut': 'True'", regex=False
)
df['take-out'] = np.where(offers_takeout, "Yes", "No")

This graph shows the relationship between restaurant take-out and the star rating given by the user. We can see that most of the restaurants offer take-out and have significantly good ratings from users. The restaurants that don't offer take-out have a similar distribution. Therefore, take-out does not appear to have much effect on the star rating given by the user.

In [None]:
# Count rows per (take-out, stars) pair, pivot the star values into columns,
# and draw one horizontal bar group per take-out value.
df2 = df.groupby(['take-out', 'stars']).size().unstack()
df2.plot(kind="barh")

In [None]:
# Number of rows (with a non-null star rating) per city, as a wide bar chart.
reviewsCount = df.groupby("city")["stars"].count()
wide_fig = plt.figure()
wide_fig.set_figwidth(20)  # widen the canvas so rotated labels stay legible
plt.bar(x=reviewsCount.index, height=reviewsCount)
plt.xticks(rotation=90)
plt.show()

The bar graph shows the distribution of places with boba on yelp based on cities in the data set.

In [None]:
# Number of rows (with a non-null star rating) per state, as a wide bar chart.
reviewsCount = df.groupby("state")["stars"].count()
wide_fig = plt.figure()
wide_fig.set_figwidth(20)  # widen the canvas so rotated labels stay legible
plt.bar(x=reviewsCount.index, height=reviewsCount)
plt.xticks(rotation=90)
plt.show()

The bar graph shows the distribution of places with boba on yelp based on states in the data set.

In [None]:
# Pie chart of the 'stars' values: one slice per value, sized by the count
# of non-null 'text' entries.
per_star_counts = df.groupby(['stars']).count()
per_star_counts.plot(kind='pie', y='text', ylabel="Review Stars", autopct='%1.0f%%')

This is the distribution of review stars rounded to half-stars.

In [None]:
# Histogram of the star ratings. sns.distplot is deprecated in seaborn;
# histplot is its replacement for a plain histogram (no density curve).
sns.histplot(df['stars'], kde=False); #distribution of stars

This visualization shows the distribution of star ratings across all boba businesses.

# Text Mining with K-Nearest Neighbors

Using a k-nearest-neighbors (KNN) classifier on bag-of-words features, we are trying to predict whether a review is negative or positive and what rating the user will give for that business.

In [None]:
#natural language toolkit
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from sklearn import metrics, neighbors
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import MultinomialNB

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [None]:
# Show only the review text, its sentiment label, and the star rating.
review_columns = ['text', 'Sentiment', 'stars']
display(df[review_columns])

In [None]:
# Add a 'length' column holding the number of characters in each review.
df['length'] = df['text'].map(len)
df.head()

In [None]:
# Box plot of review length (characters) for each star rating.
sns.boxplot(data=df, x='stars', y='length')

The above boxplot shows the length of a review in characters against its star rating. We can see that shorter reviews tend to have a lower star rating than longer ones. However, this might be because there are fewer negative ratings.

In [None]:
import nltk
from nltk.corpus import stopwords

# Build one lowercase string from every review and split it into word tokens.
full_text = ' '.join(df['text'])
lower_full_text = full_text.lower()
word_tokens = word_tokenize(lower_full_text)

# Fetch the stop-word list once and keep it in a set: calling
# stopwords.words('english') for every token repeats the same lookup, and a
# set makes each membership test constant-time.
english_stopwords = set(stopwords.words('english'))

# Keep purely alphabetic tokens that are not stop words, then count them.
tokens = [
    word for word in word_tokens
    if word.isalpha() and word not in english_stopwords
]
token_dist = FreqDist(tokens)
# The 30 most frequent tokens as a two-column table.
dist = pd.DataFrame(token_dist.most_common(30), columns=['Word', 'Frequency'])

In [None]:
# Show the table of most frequent words built in the previous cell
# (columns: 'Word', 'Frequency').
print(dist)

In [None]:
# Independent frame holding just the star rating, sentiment label, and text.
model_columns = ['stars', 'Sentiment', 'text']
starsdf = df[model_columns].copy()
starsdf.head()

In [None]:
# Encode the sentiment labels as integers:
#   Neutral -> 0, Positive -> 1, Negative -> 2.
# The result is assigned back to the column. Calling
# replace(..., inplace=True) on a selected column acts on an intermediate
# object and is not guaranteed to update starsdf (pandas warns about this
# pattern, and it has no effect under Copy-on-Write).
sentiment_codes = {'Negative': 2, 'Positive': 1, 'Neutral': 0}
starsdf['Sentiment'] = starsdf['Sentiment'].replace(sentiment_codes)
starsdf.head()

In [None]:
# Number of reviews per encoded sentiment (1 = positive, 2 = negative,
# 0 = neutral); the counts are computed once and looked up by label.
sentiment_counts = starsdf['Sentiment'].value_counts()
print('Number of positive reviews: ', sentiment_counts[1])
print('Number of negative reviews: ', sentiment_counts[2])
print('Number of neutral reviews: ', sentiment_counts[0])

In [None]:
# Frame with only the positive reviews (Sentiment == 1); the cell output is
# the number of such reviews.
is_positive = starsdf['Sentiment'] == 1
pos = starsdf.loc[is_positive]
posrev = pos[['stars', 'Sentiment', 'text']].copy()
len(pos.index)

In [None]:
# Frame with only the neutral reviews (Sentiment == 0); the cell output is
# the number of such reviews.
is_neutral = starsdf['Sentiment'] == 0
neu = starsdf.loc[is_neutral]
neurev = neu[['stars', 'Sentiment', 'text']].copy()
len(neu.index)

In [None]:
# Frame with only the negative reviews (Sentiment == 2); the cell output is
# the number of such reviews.
is_negative = starsdf['Sentiment'] == 2
neg = starsdf.loc[is_negative]
negrev = neg[['stars', 'Sentiment', 'text']].copy()
len(neg.index)

In [None]:
# Frame with the positive (1) and negative (2) reviews only, i.e. neutral
# reviews removed; the cell output is the number of rows kept.
is_pos_or_neg = starsdf['Sentiment'].isin([1, 2])
posneg = starsdf.loc[is_pos_or_neg]
posnegrev = posneg[['stars', 'Sentiment', 'text']].copy()
len(posneg.index)

We will be using the dataframe above to distinguish between strictly positive and negative reviews when testing.

In [None]:
# Data for classification: review text as input, star rating as target.
X, y = posneg['text'], posneg['stars']

In [None]:
#remove punctuation in reviews
def rm(text):
    """Strip punctuation from ``text`` and return its non-stop-word tokens.

    Args:
        text: A review as a string.

    Returns:
        list[str]: The whitespace-separated words of ``text`` with
        punctuation characters removed and original casing kept, excluding
        every word whose lowercase form is an English stop word.
    """
    # Drop every punctuation character, then split on whitespace.
    nopunc = ''.join(char for char in text if char not in string.punctuation)
    # Fetch the stop-word list once per call rather than once per word, and
    # keep it in a set so each membership test is constant-time.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in nopunc.split() if word.lower() not in english_stopwords]

In [None]:
# Word frequency in positive reviews.
# Build one lowercase string from every positive review and tokenize it.
full_text = ' '.join(pos['text'])
lower_full_text = full_text.lower()
word_tokens = word_tokenize(lower_full_text)

# Fetch the stop-word list once and keep it in a set: calling
# stopwords.words('english') for every token repeats the same lookup, and a
# set makes each membership test constant-time.
english_stopwords = set(stopwords.words('english'))

# Keep purely alphabetic tokens that are not stop words, then count them.
tokens = [
    word for word in word_tokens
    if word.isalpha() and word not in english_stopwords
]
token_dist = FreqDist(tokens)
# The 10 most frequent tokens as a two-column table.
dist = pd.DataFrame(token_dist.most_common(10), columns=['Word', 'Frequency'])
print(dist)

The above table shows the 10 most common words used in positive reviews.

In [None]:
# Learn the bag-of-words vocabulary from X, using rm() as the analyzer so
# each review is tokenized with punctuation and stop words removed.
testReview = CountVectorizer(analyzer=rm)
testReview.fit(X)

Below we are testing a single review. First, we print the size of the vocabulary learned from the reviews (17734 unique tokens). Then the review is vectorized with transform, which goes through each word in the review and counts how many times each vocabulary word occurs in it.

In [None]:
# Inspect one review as a bag-of-words vector.
# Number of distinct tokens in the fitted vocabulary.
print(len(testReview.vocabulary_))
# The review stored under index label 43, followed by its sparse count vector.
rev43 = X.loc[43]
print(rev43)
test1 = testReview.transform([rev43])
print(test1)

In [None]:
# Look up the vocabulary words behind two feature indices; in this review
# they let us read that the drinks are sweet.
feature_names = testReview.get_feature_names_out()
print(feature_names[8504])
print(feature_names[15955])

Here we are transforming data X into a sparse matrix to speed up processing, since there are many zero occurrences:

In [None]:
# Transform the review text into a sparse document-term matrix. The result
# gets its own name instead of overwriting X, so this cell can be re-run:
# transforming an X that is already a matrix would fail.
X_bow = testReview.transform(posneg['text'])
#shape of the matrix:
print("Shape of the sparse matrix: ", X_bow.shape)
#non-zero occurences:
print("Non-Zero occurences: ", X_bow.nnz)

# Density = percentage of cells in the matrix that are non-zero.
n_rows, n_cols = X_bow.shape
density = (X_bow.nnz/(n_rows*n_cols))*100
print("Density of the matrix = ",density)

In [None]:
# Vectorization: learn the vocabulary (English stop words excluded) and
# build the document-term matrix in a single step.
vect = CountVectorizer(stop_words=stopwords.words('english'))
X = vect.fit_transform(posneg.text)
# NOTE(review): toarray() materializes the full dense matrix, so memory
# grows with (number of reviews x vocabulary size).
X_df = pd.DataFrame(X.toarray(), columns=vect.get_feature_names_out())

# Matrix of features and vector of targets (encoded sentiment).
X = X_df
y = posneg.Sentiment

# 70/30 train-test split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [None]:
# k-nearest-neighbors classifier (k = 10): fit on the training split and
# predict the test split.
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)
predknn = knn.predict(X_test)

# Accuracy as a percentage rounded to two decimals.
knn_accuracy_pct = round(accuracy_score(y_test, predknn) * 100, 2)

print("Confusion Matrix for K Nearest Neighbors:")
print(confusion_matrix(y_test, predknn))
print("Score: ", knn_accuracy_pct)
print("Classification Report:")
print(classification_report(y_test, predknn))

Above are the confusion matrix and accuracy score for K nearest neighbors. We can see that the model has an accuracy of 98.14%, which is high. We will use this model to predict whether a review is positive or negative.

Below we are predicting a positive or negative review using KNN. The review below is positive. 

In [None]:
# Classify one review (index label 0) with the fitted KNN model.
posrev = posneg['text'][0]
print("Positive review: ", posrev)
print("Star Rating: ", posneg['stars'][0])
# Turn the review into a count vector with the vectorizer fitted above.
posrev_t = vect.transform([posrev])

# Print the model's prediction itself rather than the sparse feature
# vector, which says nothing about the predicted class.
# 1 = positive, 2 = negative
posrev_pred = knn.predict(posrev_t)[0]
print("Predicted sentiment (1 = positive, 2 = negative):", posrev_pred)

Below we are predicting a positive or negative review using KNN. The review below is negative.

In [None]:
# Classify one review (index label 562) with the fitted KNN model.
negrev = posneg['text'][562]
print("Negative review: ", negrev)
# Looking the rating up in `neg` also confirms that row 562 is a negative
# review (the lookup raises KeyError otherwise).
print("Star Rating: ", neg['stars'][562])
# Turn the review into a count vector with the vectorizer fitted above.
negrev_t = vect.transform([negrev])

# Print the model's prediction itself rather than the sparse feature
# vector, which says nothing about the predicted class.
# 1 = positive, 2 = negative
negrev_pred = knn.predict(negrev_t)[0]
print("Predicted sentiment (1 = positive, 2 = negative):", negrev_pred)

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna()

# Text Mining with Naïve Bayes

Using naïve bayes, we are trying to predict whether a review is negative or positive and what rating the user will give for that business.

In [None]:
# Bag-of-words vectorizer for the Naive Bayes model: unigrams only, English
# stop words removed, ignoring terms found in more than 80% of the reviews
# or in fewer than 4 reviews.
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 1), max_df=.80, min_df=4)

# Train/test split (80/20) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["Sentiment"], random_state=1, test_size=0.2
)

# Use the vectorizer configured above. Re-fitting `vect` would ignore these
# settings and overwrite the vocabulary the KNN section depends on.
# The vocabulary is learned from the training reviews only.
X_train_dtm = vectorizer.fit_transform(X_train)
X_test_dtm = vectorizer.transform(X_test)

NB = MultinomialNB()
NB.fit(X_train_dtm, y_train)
y_pred = NB.predict(X_test_dtm)

print('Naïve Bayes')
print()
print('Accuracy Score: ',metrics.accuracy_score(y_test,y_pred)*100,'%',sep='')
print()
# Show the third test review next to its prediction; both are taken at
# position 2 so they refer to the same review.
print("Test Review: ", X_test.iloc[2])
print()
print("Predicted Sentiment for Review: ", y_pred[2])