In [1]:
# Numerical / tabular stack
import numpy as np
import pandas as pd

# Plotting stack. `plt` has to alias the pyplot submodule: the top-level
# `matplotlib` package does not expose plot()/subplots()/show().
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Shell escape: list the contents of the data directory next to this notebook
!ls ../data

yelp_reviews.json


# Load Processed Dataset

In [3]:
# Read the reviews JSON file into a DataFrame; the path is relative to the
# notebook's working directory.
dataset_raw = pd.read_json("../data/yelp_reviews.json")

In [4]:
# Preview the first rows to check which columns were loaded
dataset_raw.head()

Unnamed: 0,business_id,business_name,business_stars,review_id,review_stars,review_text
0,--DaPTJW3-tB1vP-PfdTEg,Sunnyside Grill,3.5,DhjYsxxGlGa2Y4EaJV99XA,4,My sister and in laws were in town and we want...
1,--DaPTJW3-tB1vP-PfdTEg,Sunnyside Grill,3.5,5sg8KeTfTiLFC0hx75Xt9g,4,We really enjoy this breakfast restaurant in t...
2,--DaPTJW3-tB1vP-PfdTEg,Sunnyside Grill,3.5,-DcBOwz6gRTW0iQBSdJX3A,4,"Just tried this breakfast joint, didn't want t..."
3,--DaPTJW3-tB1vP-PfdTEg,Sunnyside Grill,3.5,WxvsMJUo8RVvuS_5rXgEVg,4,Stopped in on a weekday while in Toronto just ...
4,--DaPTJW3-tB1vP-PfdTEg,Sunnyside Grill,3.5,0TBECKZ08JmnPYw7zxRIsg,2,Not great. Ordered the eggs florentine with ho...


In [5]:
# Number of reviews for each star rating (shows the class balance before labelling)
dataset_raw['review_stars'].value_counts()

4    118792
5    106918
3     65154
2     37520
1     34170
Name: review_stars, dtype: int64

# Further Data Processing

In [6]:
def get_sentiment(n):
    """Map a 1-5 star rating to a sentiment label.

    Parameters
    ----------
    n : int or numpy integer
        Star rating in the inclusive range 1..5. Booleans, floats and any
        other non-integer type are rejected.

    Returns
    -------
    int
        0  for a negative review (1 or 2 stars),
        -1 for a neutral review (3 stars),
        1  for a positive review (4 or 5 stars).

    Raises
    ------
    AssertionError
        If ``n`` is not an integer or lies outside 1..5.
    """
    # Validate the type first so that non-numeric input fails the assertion
    # instead of raising TypeError from the range comparison below.
    # NumPy integer scalars (e.g. np.int64 taken from a pandas column) are
    # accepted; bool is excluded explicitly because it subclasses int.
    assert isinstance(n, (int, np.integer)) and not isinstance(n, bool), \
        "star rating must be an integer, got %r" % (type(n),)
    assert 1 <= n <= 5, "star rating must be between 1 and 5, got %r" % (n,)

    if n <= 2:
        return 0
    if n == 3:
        return -1
    return 1

In [7]:
# Keep only the rating and the text, then append a `sentiment` column
# computed from each row's star rating.
dataset = (
    dataset_raw
    .loc[:, ['review_stars', 'review_text']]
    .assign(sentiment=lambda frame: frame['review_stars'].apply(get_sentiment))
)

In [8]:
# Number of reviews per sentiment label (1, 0 and -1 as returned by get_sentiment)
dataset['sentiment'].value_counts()

 1    225710
 0     71690
-1     65154
Name: sentiment, dtype: int64

In [9]:
# Discard rows labelled -1 (neutral) and renumber the index from 0
dataset = (
    dataset[dataset['sentiment'].ne(-1)]
    .reset_index(drop=True)
)

In [10]:
# Remove the stars column and shorten the text column's name.
# `errors='ignore'` keeps this cell re-runnable: once 'review_stars' has been
# dropped, executing the cell again would otherwise raise KeyError. `rename`
# already ignores labels that are not present.
dataset = (
    dataset
    .drop(columns='review_stars', errors='ignore')
    .rename(columns={'review_text': 'review'})
)

In [11]:
# Preview the processed frame: one text column and its sentiment label
dataset.head()

Unnamed: 0,review,sentiment
0,My sister and in laws were in town and we want...,1
1,We really enjoy this breakfast restaurant in t...,1
2,"Just tried this breakfast joint, didn't want t...",1
3,Stopped in on a weekday while in Toronto just ...,1
4,Not great. Ordered the eggs florentine with ho...,0


In [12]:
# Summary statistics of the numeric column(s). With the neutral rows removed
# the label is 0/1, so its mean is the share of positive reviews.
dataset.describe()

Unnamed: 0,sentiment
count,297400.0
mean,0.758944
std,0.427725
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [19]:
# Full text of the first review labelled negative (sentiment == 0)
dataset.loc[dataset['sentiment'].eq(0), 'review'].iat[0]

"Not great. Ordered the eggs florentine with hollandaise on the side. The spinach was tossed on raw and half of it was black and mushy/wilted. I guess that would have normally been hidden by the hollandaise... but it wasn't. I couldn't believe they served it like that. The chef clearly doesn't care about the quality of food."

In [20]:
# Full text of the first review labelled positive (sentiment == 1)
dataset.loc[dataset['sentiment'].eq(1), 'review'].iat[0]

'My sister and in laws were in town and we wanted a central meeting point. My sister wanted more of a basic breakfast and we decided on here. We are so glad that we did. Great food, good service and good prices. They only have drip coffee (if you are a coffee snob) but I am told it is good. It is small, tables of 2 or 4. We had 5 so we had to wait about 15 minutes for a table which was fine. We got to check out the food as it came out of the kitchen. The home fries were awesome, the eggs were cooked to order and I will be back.'