In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import csv

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [4]:
# Load the data
yelp_file_to_load = "Resources/yelp_data2.csv"
# Open the city_file to filter zip codes for metropolitan areas.
yelp_file_df = pd.read_csv(yelp_file_to_load)

yelp_file_df


Unnamed: 0.1,Unnamed: 0,categories,rating,price,review_count,name,city,state
0,1,Mexican,4.5,2,865,La Contenta,New York,NY
1,2,American (New),4.0,2,495,The Cabin NYC,New York,NY
2,4,Shanghainese,4.0,2,6937,Joe's Shanghai,New York,NY
3,7,Dim Sum,4.0,2,206,3 Times,New York,NY
4,8,Thai,4.5,3,545,Wayla,New York,NY
...,...,...,...,...,...,...,...,...
45539,65042,Wine Bars,4.0,2,86,Tap & Barrel,Richland,WA
45540,65043,Barbeque,4.0,2,107,Gangnam Style BBQ,Kennewick,WA
45541,65044,Tapas/Small Plates,3.0,2,34,Flight Tap & Table,Richland,WA
45542,65046,American (New),4.0,2,185,3 Eyed Fish,Richland,WA


## Prepare Data for Ensemble Learning

In [5]:
# identify all unique categories
yelp_file_df.groupby('categories').nunique()

Unnamed: 0_level_0,Unnamed: 0,rating,price,review_count,name,city,state
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
African,18,2,2,3,3,3,1
American (New),3557,7,4,169,184,82,6
American (Traditional),1532,6,4,90,101,59,3
Arabic,16,1,1,1,1,1,1
Argentine,19,2,2,4,4,3,3
...,...,...,...,...,...,...,...
Venezuelan,36,2,3,6,6,4,2
Venues & Event Spaces,2,2,1,2,2,2,2
Vietnamese,1593,6,3,42,44,19,3
Wine Bars,212,4,3,22,22,13,3


### Understand the Frequesncy of the top 5 categories

In [6]:
# Understand the frequency of the data
yelp_categories_count = pd.DataFrame(yelp_file_df['categories'].value_counts(normalize=True)*100)
yelp_categories_count = yelp_categories_count.sort_values(by=['categories'], ascending=False)
yelp_categories_count.head(30)

Unnamed: 0,categories
Italian,8.962761
American (New),7.81003
Chinese,5.818549
Mexican,5.462849
Asian Fusion,5.173019
Caribbean,4.362814
Vietnamese,3.497716
American (Traditional),3.36378
Thai,3.326454
Turkish,3.102494


Taking the count of the categories, we can see out of 156 unique categories the top 5 categories hold 33% of the restauarants being evaluated in this analysis. Therefore, we can classify whether a restaurant is or is not one of the 5 then include all others as a classifier of "other"

In [7]:
evaluation_categories = ["Asian Fusion", "Mexican", "Chinese", "American (New)", "Italian"]
non_evalu_categories = yelp_file_df[~yelp_file_df['categories'].isin(evaluation_categories)]

In [8]:
# Create condition for italian restaurants
conditions = [
    (yelp_file_df['categories'] == "Italian"),
    (yelp_file_df['categories'] != "Italian")
]

values = [1, 0]

yelp_file_df['Italian'] = np.select(conditions, values)

yelp_file_df

Unnamed: 0.1,Unnamed: 0,categories,rating,price,review_count,name,city,state,Italian
0,1,Mexican,4.5,2,865,La Contenta,New York,NY,0
1,2,American (New),4.0,2,495,The Cabin NYC,New York,NY,0
2,4,Shanghainese,4.0,2,6937,Joe's Shanghai,New York,NY,0
3,7,Dim Sum,4.0,2,206,3 Times,New York,NY,0
4,8,Thai,4.5,3,545,Wayla,New York,NY,0
...,...,...,...,...,...,...,...,...,...
45539,65042,Wine Bars,4.0,2,86,Tap & Barrel,Richland,WA,0
45540,65043,Barbeque,4.0,2,107,Gangnam Style BBQ,Kennewick,WA,0
45541,65044,Tapas/Small Plates,3.0,2,34,Flight Tap & Table,Richland,WA,0
45542,65046,American (New),4.0,2,185,3 Eyed Fish,Richland,WA,0


In [9]:
# Create condition for American (New) restaurants
conditions = [
    (yelp_file_df['categories'] == "American (New)"),
    (yelp_file_df['categories'] != "American (New)")
]

values = [1, 0]

yelp_file_df['American(New)'] = np.select(conditions, values)

yelp_file_df

Unnamed: 0.1,Unnamed: 0,categories,rating,price,review_count,name,city,state,Italian,American(New)
0,1,Mexican,4.5,2,865,La Contenta,New York,NY,0,0
1,2,American (New),4.0,2,495,The Cabin NYC,New York,NY,0,1
2,4,Shanghainese,4.0,2,6937,Joe's Shanghai,New York,NY,0,0
3,7,Dim Sum,4.0,2,206,3 Times,New York,NY,0,0
4,8,Thai,4.5,3,545,Wayla,New York,NY,0,0
...,...,...,...,...,...,...,...,...,...,...
45539,65042,Wine Bars,4.0,2,86,Tap & Barrel,Richland,WA,0,0
45540,65043,Barbeque,4.0,2,107,Gangnam Style BBQ,Kennewick,WA,0,0
45541,65044,Tapas/Small Plates,3.0,2,34,Flight Tap & Table,Richland,WA,0,0
45542,65046,American (New),4.0,2,185,3 Eyed Fish,Richland,WA,0,1


In [10]:
# Create condition for Chinese restaurants
conditions = [
    (yelp_file_df['categories'] == "Chinese"),
    (yelp_file_df['categories'] != "Chinese")
]

values = [1, 0]

yelp_file_df['Chinese'] = np.select(conditions, values)

yelp_file_df

Unnamed: 0.1,Unnamed: 0,categories,rating,price,review_count,name,city,state,Italian,American(New),Chinese
0,1,Mexican,4.5,2,865,La Contenta,New York,NY,0,0,0
1,2,American (New),4.0,2,495,The Cabin NYC,New York,NY,0,1,0
2,4,Shanghainese,4.0,2,6937,Joe's Shanghai,New York,NY,0,0,0
3,7,Dim Sum,4.0,2,206,3 Times,New York,NY,0,0,0
4,8,Thai,4.5,3,545,Wayla,New York,NY,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
45539,65042,Wine Bars,4.0,2,86,Tap & Barrel,Richland,WA,0,0,0
45540,65043,Barbeque,4.0,2,107,Gangnam Style BBQ,Kennewick,WA,0,0,0
45541,65044,Tapas/Small Plates,3.0,2,34,Flight Tap & Table,Richland,WA,0,0,0
45542,65046,American (New),4.0,2,185,3 Eyed Fish,Richland,WA,0,1,0


In [11]:
# Create condition for Mexican restaurants
conditions = [
    (yelp_file_df['categories'] == "Mexican"),
    (yelp_file_df['categories'] != "Mexican")
]

values = [1, 0]

yelp_file_df['Mexican'] = np.select(conditions, values)

yelp_file_df

Unnamed: 0.1,Unnamed: 0,categories,rating,price,review_count,name,city,state,Italian,American(New),Chinese,Mexican
0,1,Mexican,4.5,2,865,La Contenta,New York,NY,0,0,0,1
1,2,American (New),4.0,2,495,The Cabin NYC,New York,NY,0,1,0,0
2,4,Shanghainese,4.0,2,6937,Joe's Shanghai,New York,NY,0,0,0,0
3,7,Dim Sum,4.0,2,206,3 Times,New York,NY,0,0,0,0
4,8,Thai,4.5,3,545,Wayla,New York,NY,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
45539,65042,Wine Bars,4.0,2,86,Tap & Barrel,Richland,WA,0,0,0,0
45540,65043,Barbeque,4.0,2,107,Gangnam Style BBQ,Kennewick,WA,0,0,0,0
45541,65044,Tapas/Small Plates,3.0,2,34,Flight Tap & Table,Richland,WA,0,0,0,0
45542,65046,American (New),4.0,2,185,3 Eyed Fish,Richland,WA,0,1,0,0


In [12]:
# Create condition for Asian Fusion restaurants
conditions = [
    (yelp_file_df['categories'] == "Asian Fusion"),
    (yelp_file_df['categories'] != "Asian Fusion")
]

values = [1, 0]

yelp_file_df['Asian Fusion'] = np.select(conditions, values)

yelp_file_df

Unnamed: 0.1,Unnamed: 0,categories,rating,price,review_count,name,city,state,Italian,American(New),Chinese,Mexican,Asian Fusion
0,1,Mexican,4.5,2,865,La Contenta,New York,NY,0,0,0,1,0
1,2,American (New),4.0,2,495,The Cabin NYC,New York,NY,0,1,0,0,0
2,4,Shanghainese,4.0,2,6937,Joe's Shanghai,New York,NY,0,0,0,0,0
3,7,Dim Sum,4.0,2,206,3 Times,New York,NY,0,0,0,0,0
4,8,Thai,4.5,3,545,Wayla,New York,NY,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
45539,65042,Wine Bars,4.0,2,86,Tap & Barrel,Richland,WA,0,0,0,0,0
45540,65043,Barbeque,4.0,2,107,Gangnam Style BBQ,Kennewick,WA,0,0,0,0,0
45541,65044,Tapas/Small Plates,3.0,2,34,Flight Tap & Table,Richland,WA,0,0,0,0,0
45542,65046,American (New),4.0,2,185,3 Eyed Fish,Richland,WA,0,1,0,0,0


In [13]:
# Create condition for American (New) restaurants
conditions = [
    (yelp_file_df['state'] == "NY"),
    (yelp_file_df['state'] != "NY")
]

values = [1, 0]

yelp_file_df['EC'] = np.select(conditions, values)

yelp_file_df

Unnamed: 0.1,Unnamed: 0,categories,rating,price,review_count,name,city,state,Italian,American(New),Chinese,Mexican,Asian Fusion,EC
0,1,Mexican,4.5,2,865,La Contenta,New York,NY,0,0,0,1,0,1
1,2,American (New),4.0,2,495,The Cabin NYC,New York,NY,0,1,0,0,0,1
2,4,Shanghainese,4.0,2,6937,Joe's Shanghai,New York,NY,0,0,0,0,0,1
3,7,Dim Sum,4.0,2,206,3 Times,New York,NY,0,0,0,0,0,1
4,8,Thai,4.5,3,545,Wayla,New York,NY,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45539,65042,Wine Bars,4.0,2,86,Tap & Barrel,Richland,WA,0,0,0,0,0,0
45540,65043,Barbeque,4.0,2,107,Gangnam Style BBQ,Kennewick,WA,0,0,0,0,0,0
45541,65044,Tapas/Small Plates,3.0,2,34,Flight Tap & Table,Richland,WA,0,0,0,0,0,0
45542,65046,American (New),4.0,2,185,3 Eyed Fish,Richland,WA,0,1,0,0,0,0


In [14]:
# Create condition for American (New) restaurants
conditions = [
    (yelp_file_df['state'] == "IL"),
    (yelp_file_df['state'] != "IL")
]

values = [1, 0]

yelp_file_df['MW'] = np.select(conditions, values)

yelp_file_df

Unnamed: 0.1,Unnamed: 0,categories,rating,price,review_count,name,city,state,Italian,American(New),Chinese,Mexican,Asian Fusion,EC,MW
0,1,Mexican,4.5,2,865,La Contenta,New York,NY,0,0,0,1,0,1,0
1,2,American (New),4.0,2,495,The Cabin NYC,New York,NY,0,1,0,0,0,1,0
2,4,Shanghainese,4.0,2,6937,Joe's Shanghai,New York,NY,0,0,0,0,0,1,0
3,7,Dim Sum,4.0,2,206,3 Times,New York,NY,0,0,0,0,0,1,0
4,8,Thai,4.5,3,545,Wayla,New York,NY,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45539,65042,Wine Bars,4.0,2,86,Tap & Barrel,Richland,WA,0,0,0,0,0,0,0
45540,65043,Barbeque,4.0,2,107,Gangnam Style BBQ,Kennewick,WA,0,0,0,0,0,0,0
45541,65044,Tapas/Small Plates,3.0,2,34,Flight Tap & Table,Richland,WA,0,0,0,0,0,0,0
45542,65046,American (New),4.0,2,185,3 Eyed Fish,Richland,WA,0,1,0,0,0,0,0


In [15]:
yelp_df = yelp_file_df.drop(["Unnamed: 0", "categories", "name", "city", "state"], axis=1)

In [16]:
yelp_df.to_csv("yelp_eval_data.csv", index=False)
yelp_df

Unnamed: 0,rating,price,review_count,Italian,American(New),Chinese,Mexican,Asian Fusion,EC,MW
0,4.5,2,865,0,0,0,1,0,1,0
1,4.0,2,495,0,1,0,0,0,1,0
2,4.0,2,6937,0,0,0,0,0,1,0
3,4.0,2,206,0,0,0,0,0,1,0
4,4.5,3,545,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
45539,4.0,2,86,0,0,0,0,0,0,0
45540,4.0,2,107,0,0,0,0,0,0,0
45541,3.0,2,34,0,0,0,0,0,0,0
45542,4.0,2,185,0,1,0,0,0,0,0


## Split the Data into Training and Testing

In [19]:
# Create our features
X = yelp_df

X = pd.get_dummies(X)

# Create our target
y = yelp_df["price"]

In [20]:
y.describe()

count    45544.000000
mean         2.043628
std          0.425364
min          1.000000
25%          2.000000
50%          2.000000
75%          2.000000
max          4.000000
Name: price, dtype: float64

In [21]:
X.describe()

Unnamed: 0,rating,price,review_count,Italian,American(New),Chinese,Mexican,Asian Fusion,EC,MW
count,45544.0,45544.0,45544.0,45544.0,45544.0,45544.0,45544.0,45544.0,45544.0,45544.0
mean,4.212179,2.043628,381.995038,0.089628,0.0781,0.058185,0.054628,0.05173,0.779554,0.209073
std,0.392673,0.425364,528.709782,0.285651,0.268332,0.234096,0.227256,0.221484,0.414552,0.406651
min,1.5,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,2.0,79.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,4.0,2.0,211.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,4.5,2.0,469.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,5.0,4.0,9515.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
# Check the balance of our target values
y.value_counts()

2    37742
3     4632
1     2995
4      175
Name: price, dtype: int64

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)