In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import csv

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [4]:
# Load the data
yelp_file_to_load = "Resources/yelp_data2.csv"
# Open the city_file to filter zip codes for metropolitan areas.
yelp_file_df = pd.read_csv(yelp_file_to_load)

yelp_file_df


Unnamed: 0.1,Unnamed: 0,categories,rating,price,review_count,name,city,state
0,1,Mexican,4.5,2,865,La Contenta,New York,NY
1,2,American (New),4.0,2,495,The Cabin NYC,New York,NY
2,4,Shanghainese,4.0,2,6937,Joe's Shanghai,New York,NY
3,7,Dim Sum,4.0,2,206,3 Times,New York,NY
4,8,Thai,4.5,3,545,Wayla,New York,NY
...,...,...,...,...,...,...,...,...
45539,65042,Wine Bars,4.0,2,86,Tap & Barrel,Richland,WA
45540,65043,Barbeque,4.0,2,107,Gangnam Style BBQ,Kennewick,WA
45541,65044,Tapas/Small Plates,3.0,2,34,Flight Tap & Table,Richland,WA
45542,65046,American (New),4.0,2,185,3 Eyed Fish,Richland,WA


## Prepare Data for Ensemble Learning

In [5]:
# identify all unique categories
yelp_file_df.groupby('categories').nunique()

Unnamed: 0_level_0,Unnamed: 0,rating,price,review_count,name,city,state
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
African,18,2,2,3,3,3,1
American (New),3557,7,4,169,184,82,6
American (Traditional),1532,6,4,90,101,59,3
Arabic,16,1,1,1,1,1,1
Argentine,19,2,2,4,4,3,3
...,...,...,...,...,...,...,...
Venezuelan,36,2,3,6,6,4,2
Venues & Event Spaces,2,2,1,2,2,2,2
Vietnamese,1593,6,3,42,44,19,3
Wine Bars,212,4,3,22,22,13,3


### Understand the Frequesncy of the top 5 categories

In [6]:
# Understand the frequency of the data
yelp_categories_count = pd.DataFrame(yelp_file_df['categories'].value_counts(normalize=True)*100)
yelp_categories_count = yelp_categories_count.sort_values(by=['categories'], ascending=False)
yelp_categories_count.head(30)

Unnamed: 0,categories
Italian,8.962761
American (New),7.81003
Chinese,5.818549
Mexican,5.462849
Asian Fusion,5.173019
Caribbean,4.362814
Vietnamese,3.497716
American (Traditional),3.36378
Thai,3.326454
Turkish,3.102494


Taking the count of the categories, we can see out of 156 unique categories the top 5 categories hold 33% of the restauarants being evaluated in this analysis. Therefore, we can classify whether a restaurant is or is not one of the 5 then include all others as a classifier of "other"

In [7]:
evaluation_categories = ["Asian Fusion", "Mexican", "Chinese", "American (New)", "Italian"]
non_evalu_categories = yelp_file_df[~yelp_file_df['categories'].isin(evaluation_categories)]

In [8]:
# Create condition for italian restaurants
conditions = [
    (yelp_file_df['categories'] == "Italian"),
    (yelp_file_df['categories'] != "Italian")
]

values = [1, 0]

yelp_file_df['Italian'] = np.select(conditions, values)

yelp_file_df

Unnamed: 0.1,Unnamed: 0,categories,rating,price,review_count,name,city,state,Italian
0,1,Mexican,4.5,2,865,La Contenta,New York,NY,0
1,2,American (New),4.0,2,495,The Cabin NYC,New York,NY,0
2,4,Shanghainese,4.0,2,6937,Joe's Shanghai,New York,NY,0
3,7,Dim Sum,4.0,2,206,3 Times,New York,NY,0
4,8,Thai,4.5,3,545,Wayla,New York,NY,0
...,...,...,...,...,...,...,...,...,...
45539,65042,Wine Bars,4.0,2,86,Tap & Barrel,Richland,WA,0
45540,65043,Barbeque,4.0,2,107,Gangnam Style BBQ,Kennewick,WA,0
45541,65044,Tapas/Small Plates,3.0,2,34,Flight Tap & Table,Richland,WA,0
45542,65046,American (New),4.0,2,185,3 Eyed Fish,Richland,WA,0


In [9]:
# Create condition for American (New) restaurants
conditions = [
    (yelp_file_df['categories'] == "American (New)"),
    (yelp_file_df['categories'] != "American (New)")
]

values = [1, 0]

yelp_file_df['American(New)'] = np.select(conditions, values)

yelp_file_df

Unnamed: 0.1,Unnamed: 0,categories,rating,price,review_count,name,city,state,Italian,American(New)
0,1,Mexican,4.5,2,865,La Contenta,New York,NY,0,0
1,2,American (New),4.0,2,495,The Cabin NYC,New York,NY,0,1
2,4,Shanghainese,4.0,2,6937,Joe's Shanghai,New York,NY,0,0
3,7,Dim Sum,4.0,2,206,3 Times,New York,NY,0,0
4,8,Thai,4.5,3,545,Wayla,New York,NY,0,0
...,...,...,...,...,...,...,...,...,...,...
45539,65042,Wine Bars,4.0,2,86,Tap & Barrel,Richland,WA,0,0
45540,65043,Barbeque,4.0,2,107,Gangnam Style BBQ,Kennewick,WA,0,0
45541,65044,Tapas/Small Plates,3.0,2,34,Flight Tap & Table,Richland,WA,0,0
45542,65046,American (New),4.0,2,185,3 Eyed Fish,Richland,WA,0,1


In [10]:
# Create condition for Chinese restaurants
conditions = [
    (yelp_file_df['categories'] == "Chinese"),
    (yelp_file_df['categories'] != "Chinese")
]

values = [1, 0]

yelp_file_df['Chinese'] = np.select(conditions, values)

yelp_file_df

Unnamed: 0.1,Unnamed: 0,categories,rating,price,review_count,name,city,state,Italian,American(New),Chinese
0,1,Mexican,4.5,2,865,La Contenta,New York,NY,0,0,0
1,2,American (New),4.0,2,495,The Cabin NYC,New York,NY,0,1,0
2,4,Shanghainese,4.0,2,6937,Joe's Shanghai,New York,NY,0,0,0
3,7,Dim Sum,4.0,2,206,3 Times,New York,NY,0,0,0
4,8,Thai,4.5,3,545,Wayla,New York,NY,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
45539,65042,Wine Bars,4.0,2,86,Tap & Barrel,Richland,WA,0,0,0
45540,65043,Barbeque,4.0,2,107,Gangnam Style BBQ,Kennewick,WA,0,0,0
45541,65044,Tapas/Small Plates,3.0,2,34,Flight Tap & Table,Richland,WA,0,0,0
45542,65046,American (New),4.0,2,185,3 Eyed Fish,Richland,WA,0,1,0


In [11]:
# Create condition for Mexican restaurants
conditions = [
    (yelp_file_df['categories'] == "Mexican"),
    (yelp_file_df['categories'] != "Mexican")
]

values = [1, 0]

yelp_file_df['Mexican'] = np.select(conditions, values)

yelp_file_df

Unnamed: 0.1,Unnamed: 0,categories,rating,price,review_count,name,city,state,Italian,American(New),Chinese,Mexican
0,1,Mexican,4.5,2,865,La Contenta,New York,NY,0,0,0,1
1,2,American (New),4.0,2,495,The Cabin NYC,New York,NY,0,1,0,0
2,4,Shanghainese,4.0,2,6937,Joe's Shanghai,New York,NY,0,0,0,0
3,7,Dim Sum,4.0,2,206,3 Times,New York,NY,0,0,0,0
4,8,Thai,4.5,3,545,Wayla,New York,NY,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
45539,65042,Wine Bars,4.0,2,86,Tap & Barrel,Richland,WA,0,0,0,0
45540,65043,Barbeque,4.0,2,107,Gangnam Style BBQ,Kennewick,WA,0,0,0,0
45541,65044,Tapas/Small Plates,3.0,2,34,Flight Tap & Table,Richland,WA,0,0,0,0
45542,65046,American (New),4.0,2,185,3 Eyed Fish,Richland,WA,0,1,0,0


In [12]:
# Create condition for Asian Fusion restaurants
conditions = [
    (yelp_file_df['categories'] == "Asian Fusion"),
    (yelp_file_df['categories'] != "Asian Fusion")
]

values = [1, 0]

yelp_file_df['Asian Fusion'] = np.select(conditions, values)

yelp_file_df

Unnamed: 0.1,Unnamed: 0,categories,rating,price,review_count,name,city,state,Italian,American(New),Chinese,Mexican,Asian Fusion
0,1,Mexican,4.5,2,865,La Contenta,New York,NY,0,0,0,1,0
1,2,American (New),4.0,2,495,The Cabin NYC,New York,NY,0,1,0,0,0
2,4,Shanghainese,4.0,2,6937,Joe's Shanghai,New York,NY,0,0,0,0,0
3,7,Dim Sum,4.0,2,206,3 Times,New York,NY,0,0,0,0,0
4,8,Thai,4.5,3,545,Wayla,New York,NY,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
45539,65042,Wine Bars,4.0,2,86,Tap & Barrel,Richland,WA,0,0,0,0,0
45540,65043,Barbeque,4.0,2,107,Gangnam Style BBQ,Kennewick,WA,0,0,0,0,0
45541,65044,Tapas/Small Plates,3.0,2,34,Flight Tap & Table,Richland,WA,0,0,0,0,0
45542,65046,American (New),4.0,2,185,3 Eyed Fish,Richland,WA,0,1,0,0,0


In [13]:
# Create condition for American (New) restaurants
conditions = [
    (yelp_file_df['state'] == "NY"),
    (yelp_file_df['state'] != "NY")
]

values = [1, 0]

yelp_file_df['EC'] = np.select(conditions, values)

yelp_file_df

Unnamed: 0.1,Unnamed: 0,categories,rating,price,review_count,name,city,state,Italian,American(New),Chinese,Mexican,Asian Fusion,EC
0,1,Mexican,4.5,2,865,La Contenta,New York,NY,0,0,0,1,0,1
1,2,American (New),4.0,2,495,The Cabin NYC,New York,NY,0,1,0,0,0,1
2,4,Shanghainese,4.0,2,6937,Joe's Shanghai,New York,NY,0,0,0,0,0,1
3,7,Dim Sum,4.0,2,206,3 Times,New York,NY,0,0,0,0,0,1
4,8,Thai,4.5,3,545,Wayla,New York,NY,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45539,65042,Wine Bars,4.0,2,86,Tap & Barrel,Richland,WA,0,0,0,0,0,0
45540,65043,Barbeque,4.0,2,107,Gangnam Style BBQ,Kennewick,WA,0,0,0,0,0,0
45541,65044,Tapas/Small Plates,3.0,2,34,Flight Tap & Table,Richland,WA,0,0,0,0,0,0
45542,65046,American (New),4.0,2,185,3 Eyed Fish,Richland,WA,0,1,0,0,0,0


In [14]:
# Create condition for American (New) restaurants
conditions = [
    (yelp_file_df['state'] == "IL"),
    (yelp_file_df['state'] != "IL")
]

values = [1, 0]

yelp_file_df['MW'] = np.select(conditions, values)

yelp_file_df

Unnamed: 0.1,Unnamed: 0,categories,rating,price,review_count,name,city,state,Italian,American(New),Chinese,Mexican,Asian Fusion,EC,MW
0,1,Mexican,4.5,2,865,La Contenta,New York,NY,0,0,0,1,0,1,0
1,2,American (New),4.0,2,495,The Cabin NYC,New York,NY,0,1,0,0,0,1,0
2,4,Shanghainese,4.0,2,6937,Joe's Shanghai,New York,NY,0,0,0,0,0,1,0
3,7,Dim Sum,4.0,2,206,3 Times,New York,NY,0,0,0,0,0,1,0
4,8,Thai,4.5,3,545,Wayla,New York,NY,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45539,65042,Wine Bars,4.0,2,86,Tap & Barrel,Richland,WA,0,0,0,0,0,0,0
45540,65043,Barbeque,4.0,2,107,Gangnam Style BBQ,Kennewick,WA,0,0,0,0,0,0,0
45541,65044,Tapas/Small Plates,3.0,2,34,Flight Tap & Table,Richland,WA,0,0,0,0,0,0,0
45542,65046,American (New),4.0,2,185,3 Eyed Fish,Richland,WA,0,1,0,0,0,0,0


In [15]:
yelp_df = yelp_file_df.drop(["Unnamed: 0", "categories", "name", "city", "state"], axis=1)

In [16]:
yelp_df.to_csv("yelp_eval_data.csv", index=False)
yelp_df

Unnamed: 0,rating,price,review_count,Italian,American(New),Chinese,Mexican,Asian Fusion,EC,MW
0,4.5,2,865,0,0,0,1,0,1,0
1,4.0,2,495,0,1,0,0,0,1,0
2,4.0,2,6937,0,0,0,0,0,1,0
3,4.0,2,206,0,0,0,0,0,1,0
4,4.5,3,545,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
45539,4.0,2,86,0,0,0,0,0,0,0
45540,4.0,2,107,0,0,0,0,0,0,0
45541,3.0,2,34,0,0,0,0,0,0,0
45542,4.0,2,185,0,1,0,0,0,0,0


## All Combined - Price

### Split the Data into Training and Testing

In [71]:
# Create our features
X = yelp_df.drop("price", axis=1)

X = pd.get_dummies(X)

# Create our target
y = yelp_df["price"]

In [72]:
y.describe()

count    45544.000000
mean         2.043628
std          0.425364
min          1.000000
25%          2.000000
50%          2.000000
75%          2.000000
max          4.000000
Name: price, dtype: float64

In [73]:
X.describe()

Unnamed: 0,rating,review_count,Italian,American(New),Chinese,Mexican,Asian Fusion,EC,MW
count,45544.0,45544.0,45544.0,45544.0,45544.0,45544.0,45544.0,45544.0,45544.0
mean,4.212179,381.995038,0.089628,0.0781,0.058185,0.054628,0.05173,0.779554,0.209073
std,0.392673,528.709782,0.285651,0.268332,0.234096,0.227256,0.221484,0.414552,0.406651
min,1.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,79.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,4.0,211.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,4.5,469.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,5.0,9515.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [74]:
# Check the balance of our target values
y.value_counts()

2    37742
3     4632
1     2995
4      175
Name: price, dtype: int64

In [75]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [76]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
rf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
model = rf_model.fit(X_train, y_train)

In [77]:
# Calculated the balanced accuracy score
from sklearn.metrics import accuracy_score
y_pred = rf_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8835290266133755

In [78]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 649,   78,    4,    3],
       [ 883, 7628,  533,  369],
       [  21,   75, 1069,   35],
       [   0,    1,    1,   37]], dtype=int64)

In [79]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.42      0.88      0.92      0.57      0.90      0.81       734
          2       0.98      0.81      0.92      0.89      0.86      0.74      9413
          3       0.67      0.89      0.95      0.76      0.92      0.84      1200
          4       0.08      0.95      0.96      0.15      0.96      0.91        39

avg / total       0.91      0.82      0.92      0.85      0.87      0.75     11386



In [80]:
# List the features sorted in descending order by feature importance

importances = rf_model.feature_importances_
#
# Sort the feature importance in descending order
#
sorted_indices = np.argsort(importances)[::-1]
 
feat_labels = X.columns[1:]
 
for f in range(X_train.shape[0]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[sorted_indices[f]],
                            importances[sorted_indices[f]]))

 1) Italian                        0.670864
 2) review_count                   0.124332
 3) EC                             0.048132
 4) Chinese                        0.035824
 5) American(New)                  0.034733
 6) MW                             0.030066


IndexError: index 8 is out of bounds for axis 0 with size 8

## West Coast - Price

In [81]:
# Filter dataset to west coast
west_coast = yelp_df[(yelp_df.EC == 0) &
                    (yelp_df.MW == 0)]

west_coast = west_coast.loc[:,["rating", "price", "review_count", "Asian Fusion", 
                               "Mexican", "Chinese", "American(New)", "Italian"]]
west_coast.head(10)

Unnamed: 0,rating,price,review_count,Asian Fusion,Mexican,Chinese,American(New),Italian
101,4.0,3,1138,0,0,0,1,0
102,4.0,3,123,0,0,0,0,0
1147,4.5,2,348,0,0,0,1,0
13892,3.0,4,2,0,0,0,0,0
13902,4.0,3,447,0,0,0,1,0
13904,4.0,2,425,0,0,0,0,0
13907,4.5,2,328,0,0,0,0,0
13926,4.5,2,412,0,0,0,0,1
32343,2.5,2,27,0,0,0,0,0
37662,4.0,2,100,0,0,0,0,0


In [82]:
# Create our features
X = west_coast.drop("price", axis=1)

X = pd.get_dummies(X)

# Create our target
y = west_coast["price"]

In [83]:
y.describe()

count    518.000000
mean       2.222008
std        0.572532
min        1.000000
25%        2.000000
50%        2.000000
75%        2.000000
max        4.000000
Name: price, dtype: float64

In [84]:
X.describe()

Unnamed: 0,rating,review_count,Asian Fusion,Mexican,Chinese,American(New),Italian
count,518.0,518.0,518.0,518.0,518.0,518.0,518.0
mean,4.198842,324.173745,0.005792,0.050193,0.009653,0.046332,0.055985
std,0.448551,282.100584,0.075955,0.218554,0.097866,0.210406,0.230114
min,2.0,2.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,107.75,0.0,0.0,0.0,0.0,0.0
50%,4.0,236.0,0.0,0.0,0.0,0.0,0.0
75%,4.5,497.0,0.0,0.0,0.0,0.0,0.0
max,5.0,1755.0,1.0,1.0,1.0,1.0,1.0


In [85]:
# Check the balance of our target values
y.value_counts()

2    383
3     95
4     20
1     20
Name: price, dtype: int64

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [87]:
# Resample the training data with the BalancedRandomForestClassifier
rf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
model = rf_model.fit(X_train, y_train)

In [88]:
# Calculated the balanced accuracy score
y_pred = rf_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7267525773195876

In [89]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 2,  0,  1,  1],
       [22, 55, 18,  2],
       [ 3,  1, 21,  0],
       [ 0,  0,  0,  4]], dtype=int64)

In [90]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.07      0.50      0.80      0.13      0.63      0.39         4
          2       0.98      0.57      0.97      0.72      0.74      0.53        97
          3       0.53      0.84      0.82      0.65      0.83      0.69        25
          4       0.57      1.00      0.98      0.73      0.99      0.98         4

avg / total       0.85      0.63      0.94      0.69      0.76      0.57       130



In [91]:
# List the features sorted in descending order by feature importance

importances = rf_model.feature_importances_
#
# Sort the feature importance in descending order
#
sorted_indices = np.argsort(importances)[::-1]
 
feat_labels = X.columns[1:]
 
for f in range(X_train.shape[0]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[sorted_indices[f]],
                            importances[sorted_indices[f]]))

 1) Asian Fusion                   0.654189
 2) review_count                   0.242290
 3) Chinese                        0.027046


IndexError: index 6 is out of bounds for axis 0 with size 6

## East Coast - Price

In [92]:
# Filter dataset to west coast
east_coast = yelp_df[(yelp_df.EC == 1) &
                    (yelp_df.MW == 0)]

east_coast = east_coast.loc[:,["rating", "price", "review_count", "Asian Fusion", 
                               "Mexican", "Chinese", "American(New)", "Italian"]]
east_coast.head(10)

Unnamed: 0,rating,price,review_count,Asian Fusion,Mexican,Chinese,American(New),Italian
0,4.5,2,865,0,1,0,0,0
1,4.0,2,495,0,0,0,1,0
2,4.0,2,6937,0,0,0,0,0
3,4.0,2,206,0,0,0,0,0
4,4.5,3,545,0,0,0,0,0
5,4.0,2,1849,0,0,0,0,0
6,4.0,2,346,0,0,0,0,0
7,5.0,2,13,0,0,0,0,0
8,4.5,2,73,0,0,0,0,0
9,4.0,2,305,0,0,0,0,0


In [93]:
# Create our features
X = east_coast.drop("price", axis=1)

X = pd.get_dummies(X)

# Create our target
y = east_coast["price"]

In [94]:
y.describe()

count    35504.000000
mean         2.063429
std          0.443754
min          1.000000
25%          2.000000
50%          2.000000
75%          2.000000
max          4.000000
Name: price, dtype: float64

In [95]:
X.describe()

Unnamed: 0,rating,review_count,Asian Fusion,Mexican,Chinese,American(New),Italian
count,35504.0,35504.0,35504.0,35504.0,35504.0,35504.0,35504.0
mean,4.190176,382.780306,0.066105,0.014928,0.074161,0.085258,0.108438
std,0.40193,484.879164,0.248469,0.121266,0.262036,0.279269,0.310938
min,2.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,89.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,214.0,0.0,0.0,0.0,0.0,0.0
75%,4.5,466.0,0.0,0.0,0.0,0.0,0.0
max,5.0,6937.0,1.0,1.0,1.0,1.0,1.0


In [96]:
# Check the balance of our target values
y.value_counts()

2    28778
3     4285
1     2305
4      136
Name: price, dtype: int64

In [97]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [98]:
# Resample the training data with the BalancedRandomForestClassifier
rf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
model = rf_model.fit(X_train, y_train)

In [99]:
# Calculated the balanced accuracy score
y_pred = rf_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9040885633213394

In [100]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 536,   42,    4,    4],
       [ 673, 5786,  373,  322],
       [  16,   49, 1011,   23],
       [   0,    0,    1,   36]], dtype=int64)

In [101]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.44      0.91      0.92      0.59      0.92      0.84       586
          2       0.98      0.81      0.95      0.89      0.88      0.76      7154
          3       0.73      0.92      0.95      0.81      0.94      0.87      1099
          4       0.09      0.97      0.96      0.17      0.97      0.94        37

avg / total       0.91      0.83      0.95      0.86      0.89      0.78      8876



In [102]:
# List the features sorted in descending order by feature importance

importances = rf_model.feature_importances_
#
# Sort the feature importance in descending order
#
sorted_indices = np.argsort(importances)[::-1]
 
feat_labels = X.columns[1:]
 
for f in range(X_train.shape[0]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[sorted_indices[f]],
                            importances[sorted_indices[f]]))

 1) Asian Fusion                   0.736962
 2) review_count                   0.103536
 3) Mexican                        0.057366
 4) Italian                        0.038031


IndexError: index 6 is out of bounds for axis 0 with size 6

## MidWest - Price

In [103]:
# Filter dataset to west coast
mwest_coast = yelp_df[(yelp_df.EC == 0) &
                    (yelp_df.MW == 1)]

mwest_coast = mwest_coast.loc[:,["rating", "price", "review_count", "Asian Fusion", 
                               "Mexican", "Chinese", "American(New)", "Italian"]]
mwest_coast.head(10)

Unnamed: 0,rating,price,review_count,Asian Fusion,Mexican,Chinese,American(New),Italian
35513,4.0,2,47,0,0,0,0,0
35514,4.0,2,47,0,0,0,0,0
35515,4.0,2,47,0,0,0,0,0
35516,4.0,2,47,0,0,0,0,0
35517,4.0,2,47,0,0,0,0,0
35518,4.0,2,47,0,0,0,0,0
35519,4.0,2,47,0,0,0,0,0
35520,4.0,2,47,0,0,0,0,0
35521,4.0,2,47,0,0,0,0,0
35522,4.0,2,47,0,0,0,0,0


In [104]:
# Create our features
X = mwest_coast.drop("price", axis=1)

X = pd.get_dummies(X)

# Create our target
y = mwest_coast["price"]

In [105]:
y.describe()

count    9522.000000
mean        1.960092
std         0.321291
min         1.000000
25%         2.000000
50%         2.000000
75%         2.000000
max         4.000000
Name: price, dtype: float64

In [106]:
X.describe()

Unnamed: 0,rating,review_count,Asian Fusion,Mexican,Chinese,American(New),Italian
count,9522.0,9522.0,9522.0,9522.0,9522.0,9522.0,9522.0
mean,4.294949,382.21256,0.00063,0.202899,0.00126,0.05314,0.021319
std,0.339963,675.224272,0.025096,0.402179,0.035479,0.224325,0.144453
min,1.5,1.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,50.0,0.0,0.0,0.0,0.0,0.0
50%,4.5,193.0,0.0,0.0,0.0,0.0,0.0
75%,4.5,469.0,0.0,0.0,0.0,0.0,0.0
max,5.0,9515.0,1.0,1.0,1.0,1.0,1.0


In [107]:
y.value_counts()

2    8581
1     670
3     252
4      19
Name: price, dtype: int64

In [108]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [109]:
# Resample the training data with the BalancedRandomForestClassifier
rf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
model = rf_model.fit(X_train, y_train)

In [110]:
# Calculated the balanced accuracy score
y_pred = rf_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8386630047214256

In [111]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 119,   28,    9,    3],
       [ 319, 1399,  280,  151],
       [   1,    1,   64,    1],
       [   0,    0,    0,    6]], dtype=int64)

In [112]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.27      0.75      0.86      0.40      0.80      0.63       159
          2       0.98      0.65      0.88      0.78      0.75      0.56      2149
          3       0.18      0.96      0.88      0.30      0.91      0.84        67
          4       0.04      1.00      0.93      0.07      0.97      0.94         6

avg / total       0.91      0.67      0.87      0.74      0.76      0.57      2381



In [113]:
# List the features sorted in descending order by feature importance

importances = rf_model.feature_importances_
#
# Sort the feature importance in descending order
#
sorted_indices = np.argsort(importances)[::-1]
 
feat_labels = X.columns[1:]
 
for f in range(X_train.shape[0]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[sorted_indices[f]],
                            importances[sorted_indices[f]]))

 1) Asian Fusion                   0.652205
 2) review_count                   0.189441
 3) Italian                        0.088279
 4) Chinese                        0.047779


IndexError: index 6 is out of bounds for axis 0 with size 6