In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import json
from pathlib import Path
from time import time

import warnings
warnings.filterwarnings('ignore')

### Read Yelp review and business data

In [3]:
def load_rows(file_path, nrows=None, only_return_count=False, verbose=True):
    """
    Load a line-delimited JSON file (one JSON object per line) into a DataFrame.

    Args:
        file_path (str or Path): location of the JSON file.
        nrows (int, optional): maximum number of lines to read. ``None`` reads
            the whole file.
        only_return_count (bool): when True, lines are counted but not parsed
            and the count is returned instead of a DataFrame.
        verbose (bool): when True, print the file name, the number of lines
            read and the elapsed time.

    Returns:
        int if ``only_return_count`` is True, otherwise a DataFrame with one
        row per parsed line.
    """
    tic = time()
    count = 0
    objs = []
    with open(file_path, encoding='utf-8') as json_file:
        for line in json_file:
            # Stop before consuming more than the requested number of rows
            if nrows is not None and count >= nrows:
                break
            count += 1
            if not only_return_count:
                objs.append(json.loads(line))
    toc = time()

    if verbose:
        # Path(...).name works for both str and Path inputs and uses the
        # platform's own separator, unlike splitting on '/'
        print(Path(file_path).name, 'loaded. Count =', count, ', Time =', round(toc-tic,2), 'secs.')

    if only_return_count:
        return count

    return pd.DataFrame(objs)

In [4]:
# Location of the Yelp review JSON file.
# NOTE(review): absolute, machine-specific path - adjust before running on another machine.
path_reviews = 'D:\\Chicago_MScA\\Winter 2022\\Data Mining Principles\\Project\\yelp_academic_dataset_review.json'

In [5]:
df_reviews = load_rows(path_reviews)

D:\Chicago_MScA\Winter 2022\Data Mining Principles\Project\yelp_academic_dataset_review.json loaded. Count = 8635403 , Time = 55.53 secs.


In [6]:
df_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,lWC-xP3rd6obsecCYsGZRg,ak0TdVmGKo4pwqdJSTLwWw,buF9druCkbuXLX526sGELQ,4.0,3,1,1,Apparently Prides Osteria had a rough summer a...,2014-10-11 03:34:02
1,8bFej1QE5LXp4O05qjGqXA,YoVfDbnISlW0f7abNQACIg,RA4V8pr014UyUbDvI-LW2A,4.0,1,0,0,This store is pretty good. Not as great as Wal...,2015-07-03 20:38:25
2,NDhkzczKjLshODbqDoNLSg,eC5evKn1TWDyHCyQAwguUw,_sS2LBIGNT5NQb6PD1Vtjw,5.0,0,0,0,I called WVM on the recommendation of a couple...,2013-05-28 20:38:06
3,T5fAqjjFooT4V0OeZyuk1w,SFQ1jcnGguO0LYWnbbftAA,0AzLzHfOJgL7ROwhdww2ew,2.0,1,1,1,I've stayed at many Marriott and Renaissance M...,2010-01-08 02:29:15
4,sjm_uUcQVxab_EeLCqsYLg,0kA0PAJ8QFMeveQWHFqz2A,8zehGz9jnxPqXtOc7KaJxA,4.0,0,0,0,The food is always great here. The service fro...,2011-07-28 18:05:01


In [14]:
import datetime
# Parse the whole column in one vectorised call instead of converting row by row
df_reviews['date'] = pd.to_datetime(df_reviews['date'], format='%Y-%m-%d %H:%M:%S')

In [20]:
# Vectorised year extraction via the datetime accessor (no per-row lambda)
df_reviews['year'] = df_reviews['date'].dt.year

In [22]:
df_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,year
0,lWC-xP3rd6obsecCYsGZRg,ak0TdVmGKo4pwqdJSTLwWw,buF9druCkbuXLX526sGELQ,4.0,3,1,1,Apparently Prides Osteria had a rough summer a...,2014-10-11 03:34:02,2014
1,8bFej1QE5LXp4O05qjGqXA,YoVfDbnISlW0f7abNQACIg,RA4V8pr014UyUbDvI-LW2A,4.0,1,0,0,This store is pretty good. Not as great as Wal...,2015-07-03 20:38:25,2015
2,NDhkzczKjLshODbqDoNLSg,eC5evKn1TWDyHCyQAwguUw,_sS2LBIGNT5NQb6PD1Vtjw,5.0,0,0,0,I called WVM on the recommendation of a couple...,2013-05-28 20:38:06,2013
3,T5fAqjjFooT4V0OeZyuk1w,SFQ1jcnGguO0LYWnbbftAA,0AzLzHfOJgL7ROwhdww2ew,2.0,1,1,1,I've stayed at many Marriott and Renaissance M...,2010-01-08 02:29:15,2010
4,sjm_uUcQVxab_EeLCqsYLg,0kA0PAJ8QFMeveQWHFqz2A,8zehGz9jnxPqXtOc7KaJxA,4.0,0,0,0,The food is always great here. The service fro...,2011-07-28 18:05:01,2011


In [23]:
df_reviews['year'].value_counts()

2018    1084335
2019    1037569
2017    1029557
2016     960527
2015     907529
2014     726119
2020     601891
2013     555740
2012     472441
2011     431192
2010     317583
2009     213797
2008     150436
2007      71916
2021      44461
2006      23819
2005       6439
2004         52
Name: year, dtype: int64

In [7]:
# Location of the Yelp business JSON file.
# NOTE(review): absolute, machine-specific path - adjust before running on another machine.
path_business = 'D:\\Chicago_MScA\\Winter 2022\\Data Mining Principles\\Project\\yelp_academic_dataset_business.json'

In [8]:
df_business = load_rows(path_business)

D:\Chicago_MScA\Winter 2022\Data Mining Principles\Project\yelp_academic_dataset_business.json loaded. Count = 160585 , Time = 2.74 secs.


In [9]:
df_business['review_count'].describe()

count    160585.000000
mean         51.964548
std         130.030448
min           5.000000
25%           8.000000
50%          17.000000
75%          44.000000
max        9185.000000
Name: review_count, dtype: float64

In [10]:
df_business[df_business['review_count'] == 5].shape

(13844, 14)

### Data Filtering

In [11]:
# Keep businesses whose categories mention 'Restaurant', excluding those flagged
# is_open == 0, then drop rows with any missing field.
# A single boolean mask avoids an in-place drop on a filtered slice.
rest = df_business.categories.str.contains('Restaurant', na=False)
df_business = df_business[rest & (df_business.is_open != 0)].dropna()

In [20]:
df_business.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,1,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'..."
1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ..."
5,D4JtQNTI4X3KcbzacDJsMw,Bob Likes Thai Food,3755 Main St,Vancouver,BC,V5V,49.251342,-123.101333,3.5,169,1,"{'GoodForKids': 'True', 'Alcohol': 'u'none'', ...","Restaurants, Thai","{'Monday': '17:0-21:0', 'Tuesday': '17:0-21:0'..."
12,HPA_qyMEddpAEtFof02ixg,Mr G's Pizza & Subs,474 Lowell St,Peabody,MA,01960,42.541155,-70.973438,4.0,39,1,"{'RestaurantsGoodForGroups': 'True', 'HasTV': ...","Food, Pizza, Restaurants","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'..."
13,ufCxltuh56FF4-ZFZ6cVhg,Sister Honey's,247 E Michigan St,Orlando,FL,32806,28.513265,-81.374707,4.5,135,1,"{'BusinessParking': '{'garage': False, 'street...","Restaurants, American (New), Bakeries, Dessert...","{'Tuesday': '11:0-18:0', 'Wednesday': '11:0-18..."


In [13]:
print('Business Data Shape:', df_business.shape)

Business Data Shape: (28684, 14)


In [14]:
# take a look at the distribution of review counts
print(df_business[df_business['review_count'] > 7000].shape)
df_business['review_count'].describe()

(3, 14)


count    28684.000000
mean       141.777995
std        247.854217
min          5.000000
25%         25.000000
50%         64.000000
75%        162.000000
max       9185.000000
Name: review_count, dtype: float64

In [21]:
df_business_part = df_business[['city','business_id','review_count']]

In [22]:
# take a look at the distribution of review stars
# NOTE(review): how='outer' also keeps reviews whose business_id is not in
# df_business_part (their city / review_count are NaN), so the counts below
# cover those reviews too - confirm whether an inner join was intended.
merged_data = pd.merge(df_reviews, df_business_part, on='business_id', how='outer')
merged_data['stars'].value_counts()

5.0    3814532
4.0    1920037
1.0    1262800
3.0     926656
2.0     711378
Name: stars, dtype: int64

In [29]:
# Split the merged reviews into two city groups and print each group's star distribution
merged_c1 = merged_data[merged_data['city'].isin(['Portland', 'Vancouver'])]
merged_c2 = merged_data[merged_data['city'].isin(['Atlanta', 'Orlando', 'Austin'])]
print(merged_c1['stars'].value_counts())
print(merged_c2['stars'].value_counts())

5.0    357487
4.0    215154
3.0     98748
1.0     71370
2.0     64860
Name: stars, dtype: int64
5.0    630556
4.0    359593
3.0    170132
1.0    153135
2.0    118899
Name: stars, dtype: int64


### Define stratified sampling function

In [9]:
def stratify_data(df_data, stratify_column_name, stratify_values, stratify_proportions, threshold, random_state=0):
    """Draw a stratified sample of roughly ``threshold`` rows from ``df_data``.

    For every stratum value, ``int(threshold * proportion)`` rows are sampled
    without replacement; a stratum whose target falls strictly between 0 and 1
    still contributes one row. The request is capped at the number of rows
    actually available in the stratum.

    Args:
        df_data (DataFrame): source data
        stratify_column_name (str): name of the single column whose values define the strata
        stratify_values (list): the stratum values to sample, e.g. [5.0, 4.0, 1.0]
        stratify_proportions (list of float): desired proportion for each entry of
            stratify_values, e.g. [0.5, 0.3, 0.2]. Must have the same length as
            stratify_values; the proportions are expected to sum to 1.
        threshold (int): target total number of rows in the stratified sample
        random_state (int, optional): seed passed to DataFrame.sample for every stratum. Defaults to 0.
    Returns:
        DataFrame: the sampled rows of df_data (original index labels kept),
        concatenated in the order of stratify_values
    Raises:
        ValueError: if stratify_values and stratify_proportions differ in length
    """
    if len(stratify_values) != len(stratify_proportions):
        raise ValueError('stratify_values and stratify_proportions must have the same length')

    sampled_parts = []  # collect per-stratum samples, concatenate once at the end

    for value, proportion in zip(stratify_values, stratify_proportions):
        target = threshold * proportion
        if 0 < target < 1:
            ratio_len = 1  # Extract at least 1 row for a stratum that exists
        else:
            ratio_len = int(target)  # Truncate to a whole number of rows
        if ratio_len < 0:
            break
        df_filtered = df_data[df_data[stratify_column_name] == value]
        # Sampling without replacement cannot return more rows than the stratum holds
        ratio_len = min(ratio_len, len(df_filtered))
        sampled_parts.append(df_filtered.sample(replace=False, n=ratio_len, random_state=random_state))

    if not sampled_parts:
        # Nothing sampled: return an empty frame with df_data's columns and dtypes
        return df_data.iloc[0:0].copy()

    return pd.concat(sampled_parts)

### Define sampling function

In [None]:
def sample(city, threshold=10, output_dir=None):
    """Sample reviews for every business of one city and save them to CSV.

    Businesses with at most ``threshold`` reviews keep all of them; for the
    others, ``stratify_data`` draws about ``threshold`` reviews stratified by
    star rating. The columns business_id, stars, text and city are written to
    ``yelp_review_samples_<city>.csv`` and summary counts are printed.

    Reads the notebook-level ``df_business`` and ``df_reviews`` frames.

    Args:
        city (str): value of the ``city`` column in ``df_business`` to sample.
        threshold (int): per-business review cap. Defaults to 10.
        output_dir (str or Path, optional): directory for the CSV. When None,
            the original hard-coded project directory is used.
    """
    columns = ['business_id', 'stars', 'text']

    df = df_business[df_business['city'] == city]
    len_df_business = len(df)
    business_list = df['business_id'].drop_duplicates()
    len_business_list = len(business_list)

    # Scan the full review table once, then split it per business, instead of
    # re-filtering all reviews for every business (and every retained row)
    city_reviews = df_reviews[df_reviews['business_id'].isin(business_list)]
    reviews_by_business = {
        business_id: group
        for business_id, group in city_reviews.groupby('business_id', sort=False)
    }

    sampled_parts = []
    total_original_reviews = 0

    for business in business_list:
        business_reviews = reviews_by_business.get(business)
        if business_reviews is None:  # business has no reviews
            continue
        N = len(business_reviews)
        total_original_reviews += N
        # If number of reviews <= threshold, retain all reviews, else implement stratified sampling
        if N <= threshold:
            sampled_parts.append(business_reviews[columns])
        else:
            # Count reviews per star; use the Series directly so the code does not
            # depend on the column name value_counts() produces
            star_counts = business_reviews['stars'].value_counts()
            bus_stars = list(star_counts.index)
            stratify_proportions = list(star_counts / N)  # Review proportion for each star
            stratified_reviews = stratify_data(business_reviews, 'stars', bus_stars, stratify_proportions, threshold=threshold)
            sampled_parts.append(stratified_reviews[columns])

    if sampled_parts:
        reviews = pd.concat(sampled_parts)
    else:
        reviews = pd.DataFrame(columns=columns)
    reviews['city'] = city
    len_reviews_sample = len(reviews)

    if output_dir is None:
        output_path = 'D:\\Chicago_MScA\\Winter 2022\\Data Mining Principles\\Project\\yelp_review_samples_'+city+'.csv'
    else:
        output_path = Path(output_dir) / ('yelp_review_samples_'+city+'.csv')
    reviews.to_csv(output_path)

    print('city:', city)
    print('len_df_business:', len_df_business)
    print('len_business_list:', len_business_list)
    print('total_original_reviews:', total_original_reviews)
    print('len_reviews_sample:', len_reviews_sample)

In [15]:
# Run the sampling for each city; sample() writes one CSV per city and prints summary counts
city_list = ['Portland','Vancouver']
for city in city_list:
    sample(city)

city: Portland
len_df_business: 2848
len_business_list: 2848
total_original_reviews: 530917
len_reviews_sample: 26399
city: Vancouver
len_df_business: 2675
len_business_list: 2675
total_original_reviews: 276702
len_reviews_sample: 23587
