In [6]:
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [7]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import csv

In [8]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [13]:
# Load the Yelp evaluation data set (path is relative to the working directory)
yelp_file_to_load = "Resources/yelp_eval_data.csv"
# Read the CSV into a DataFrame using pandas' default parsing options.
yelp_file_df = pd.read_csv(yelp_file_to_load)

# Display the frame to confirm the load
yelp_file_df


Unnamed: 0,categories,rating,price,review count,name,city,state
0,Chinese,4.5,1,1116,Shu Jiao Fu Zhou,New York,NY
1,Mediterranean,4.5,1,222,Lava Shawarma,New York,NY
2,Belgian,4.5,1,86,Bel-Fries,New York,NY
3,Chinese,4.0,1,609,Kong Sihk Tong 港食堂,New York,NY
4,Fast Food,4.5,1,1887,Wah Fung No 1,New York,NY
...,...,...,...,...,...,...,...
14137,Steakhouses,4.0,4,79,Holu,Chicago,IL
14138,Steakhouses,4.0,4,1936,Maple & Ash,Chicago,IL
14139,American (New),4.5,4,331,Goosefoot,Chicago,IL
14140,Desserts,5.0,4,17,The Chef behind the curtain,Snohomish,WA


## Prepare Data for Ensemble Learning

In [14]:
# Group by category and count the distinct values found in each remaining
# column; the result has one row per unique category.
yelp_file_df.groupby('categories').nunique()

Unnamed: 0_level_0,rating,price,review count,name,city,state
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
African,3,2,4,4,3,1
American (New),7,4,161,179,80,5
American (Traditional),7,4,81,92,53,3
Arabic,1,1,1,1,1,1
Argentine,2,2,3,3,3,3
...,...,...,...,...,...,...
Venezuelan,2,3,6,6,4,2
Venues & Event Spaces,2,1,2,2,2,2
Vietnamese,6,3,39,40,16,3
Wine Bars,4,3,21,21,13,3


### Understand the Frequency of the top 5 categories

In [15]:
# Share of restaurants per category, expressed as a percentage of all rows.
# value_counts() already returns the categories ordered from most to least
# frequent, so no additional sort is needed.
# The series name and the index name are set explicitly because their defaults
# differ between pandas versions: pandas >= 2.0 names the result "proportion"
# and the index "categories", so sorting the frame by "categories" would order
# the rows by category label instead of by share.
yelp_categories_count = (
    yelp_file_df['categories']
    .value_counts(normalize=True)
    .mul(100)
    .rename('categories')
    .rename_axis(None)
    .to_frame()
)
yelp_categories_count.head(30)

Unnamed: 0,categories
American (New),7.311554
Italian,7.226701
Mexican,5.204356
Caribbean,4.815443
Chinese,4.207326
American (Traditional),3.585066
Asian Fusion,3.309291
Bars,3.252722
Thai,3.210296
Pizza,2.786027


Taking the count of the categories, we can see that, out of 156 unique categories, the top 5 categories hold 33% of the restaurants being evaluated in this analysis. Therefore, we can flag whether or not a restaurant belongs to each of these 5 categories and treat all other categories as "other".

In [16]:
# The five cuisine categories that each get their own indicator column.
evaluation_categories = [
    "Asian Fusion",
    "Mexican",
    "Chinese",
    "American (New)",
    "Italian",
]
# Rows whose category is not one of the five listed above.
non_evalu_categories = yelp_file_df.loc[
    ~yelp_file_df['categories'].isin(evaluation_categories)
]

In [17]:
# Indicator column for Italian restaurants: 1 when the category is exactly
# "Italian", 0 for every other category.
conditions = [
    yelp_file_df['categories'].eq("Italian"),
    yelp_file_df['categories'].ne("Italian"),
]
values = [1, 0]

# np.select picks the value paired with the first condition that holds
yelp_file_df['Italian'] = np.select(condlist=conditions, choicelist=values)

yelp_file_df

Unnamed: 0,categories,rating,price,review count,name,city,state,Italian
0,Chinese,4.5,1,1116,Shu Jiao Fu Zhou,New York,NY,0
1,Mediterranean,4.5,1,222,Lava Shawarma,New York,NY,0
2,Belgian,4.5,1,86,Bel-Fries,New York,NY,0
3,Chinese,4.0,1,609,Kong Sihk Tong 港食堂,New York,NY,0
4,Fast Food,4.5,1,1887,Wah Fung No 1,New York,NY,0
...,...,...,...,...,...,...,...,...
14137,Steakhouses,4.0,4,79,Holu,Chicago,IL,0
14138,Steakhouses,4.0,4,1936,Maple & Ash,Chicago,IL,0
14139,American (New),4.5,4,331,Goosefoot,Chicago,IL,0
14140,Desserts,5.0,4,17,The Chef behind the curtain,Snohomish,WA,0


In [18]:
# Indicator column for American (New) restaurants: 1 when the category is
# exactly "American (New)", 0 for every other category.
# Note the column label has no space before the parenthesis: 'American(New)'.
conditions = [
    yelp_file_df['categories'].eq("American (New)"),
    yelp_file_df['categories'].ne("American (New)"),
]
values = [1, 0]

# np.select picks the value paired with the first condition that holds
yelp_file_df['American(New)'] = np.select(condlist=conditions, choicelist=values)

yelp_file_df

Unnamed: 0,categories,rating,price,review count,name,city,state,Italian,American(New)
0,Chinese,4.5,1,1116,Shu Jiao Fu Zhou,New York,NY,0,0
1,Mediterranean,4.5,1,222,Lava Shawarma,New York,NY,0,0
2,Belgian,4.5,1,86,Bel-Fries,New York,NY,0,0
3,Chinese,4.0,1,609,Kong Sihk Tong 港食堂,New York,NY,0,0
4,Fast Food,4.5,1,1887,Wah Fung No 1,New York,NY,0,0
...,...,...,...,...,...,...,...,...,...
14137,Steakhouses,4.0,4,79,Holu,Chicago,IL,0,0
14138,Steakhouses,4.0,4,1936,Maple & Ash,Chicago,IL,0,0
14139,American (New),4.5,4,331,Goosefoot,Chicago,IL,0,1
14140,Desserts,5.0,4,17,The Chef behind the curtain,Snohomish,WA,0,0


In [19]:
# Indicator column for Chinese restaurants: 1 when the category is exactly
# "Chinese", 0 for every other category.
conditions = [
    yelp_file_df['categories'].eq("Chinese"),
    yelp_file_df['categories'].ne("Chinese"),
]
values = [1, 0]

# np.select picks the value paired with the first condition that holds
yelp_file_df['Chinese'] = np.select(condlist=conditions, choicelist=values)

yelp_file_df

Unnamed: 0,categories,rating,price,review count,name,city,state,Italian,American(New),Chinese
0,Chinese,4.5,1,1116,Shu Jiao Fu Zhou,New York,NY,0,0,1
1,Mediterranean,4.5,1,222,Lava Shawarma,New York,NY,0,0,0
2,Belgian,4.5,1,86,Bel-Fries,New York,NY,0,0,0
3,Chinese,4.0,1,609,Kong Sihk Tong 港食堂,New York,NY,0,0,1
4,Fast Food,4.5,1,1887,Wah Fung No 1,New York,NY,0,0,0
...,...,...,...,...,...,...,...,...,...,...
14137,Steakhouses,4.0,4,79,Holu,Chicago,IL,0,0,0
14138,Steakhouses,4.0,4,1936,Maple & Ash,Chicago,IL,0,0,0
14139,American (New),4.5,4,331,Goosefoot,Chicago,IL,0,1,0
14140,Desserts,5.0,4,17,The Chef behind the curtain,Snohomish,WA,0,0,0


In [20]:
# Indicator column for Mexican restaurants: 1 when the category is exactly
# "Mexican", 0 for every other category.
conditions = [
    yelp_file_df['categories'].eq("Mexican"),
    yelp_file_df['categories'].ne("Mexican"),
]
values = [1, 0]

# np.select picks the value paired with the first condition that holds
yelp_file_df['Mexican'] = np.select(condlist=conditions, choicelist=values)

yelp_file_df

Unnamed: 0,categories,rating,price,review count,name,city,state,Italian,American(New),Chinese,Mexican
0,Chinese,4.5,1,1116,Shu Jiao Fu Zhou,New York,NY,0,0,1,0
1,Mediterranean,4.5,1,222,Lava Shawarma,New York,NY,0,0,0,0
2,Belgian,4.5,1,86,Bel-Fries,New York,NY,0,0,0,0
3,Chinese,4.0,1,609,Kong Sihk Tong 港食堂,New York,NY,0,0,1,0
4,Fast Food,4.5,1,1887,Wah Fung No 1,New York,NY,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
14137,Steakhouses,4.0,4,79,Holu,Chicago,IL,0,0,0,0
14138,Steakhouses,4.0,4,1936,Maple & Ash,Chicago,IL,0,0,0,0
14139,American (New),4.5,4,331,Goosefoot,Chicago,IL,0,1,0,0
14140,Desserts,5.0,4,17,The Chef behind the curtain,Snohomish,WA,0,0,0,0


In [21]:
# Indicator column for Asian Fusion restaurants: 1 when the category is
# exactly "Asian Fusion", 0 for every other category.
conditions = [
    yelp_file_df['categories'].eq("Asian Fusion"),
    yelp_file_df['categories'].ne("Asian Fusion"),
]
values = [1, 0]

# np.select picks the value paired with the first condition that holds
yelp_file_df['Asian Fusion'] = np.select(condlist=conditions, choicelist=values)

yelp_file_df

Unnamed: 0,categories,rating,price,review count,name,city,state,Italian,American(New),Chinese,Mexican,Asian Fusion
0,Chinese,4.5,1,1116,Shu Jiao Fu Zhou,New York,NY,0,0,1,0,0
1,Mediterranean,4.5,1,222,Lava Shawarma,New York,NY,0,0,0,0,0
2,Belgian,4.5,1,86,Bel-Fries,New York,NY,0,0,0,0,0
3,Chinese,4.0,1,609,Kong Sihk Tong 港食堂,New York,NY,0,0,1,0,0
4,Fast Food,4.5,1,1887,Wah Fung No 1,New York,NY,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
14137,Steakhouses,4.0,4,79,Holu,Chicago,IL,0,0,0,0,0
14138,Steakhouses,4.0,4,1936,Maple & Ash,Chicago,IL,0,0,0,0,0
14139,American (New),4.5,4,331,Goosefoot,Chicago,IL,0,1,0,0,0
14140,Desserts,5.0,4,17,The Chef behind the curtain,Snohomish,WA,0,0,0,0,0


In [22]:
# Region indicator EC (East Coast): 1 for restaurants whose state is "NY",
# 0 for every other state.
# NOTE(review): only "NY" is mapped to EC — confirm that no other eastern
# states are present in the data.
conditions = [
    yelp_file_df['state'].eq("NY"),
    yelp_file_df['state'].ne("NY"),
]
values = [1, 0]

# np.select picks the value paired with the first condition that holds
yelp_file_df['EC'] = np.select(condlist=conditions, choicelist=values)

yelp_file_df

Unnamed: 0,categories,rating,price,review count,name,city,state,Italian,American(New),Chinese,Mexican,Asian Fusion,EC
0,Chinese,4.5,1,1116,Shu Jiao Fu Zhou,New York,NY,0,0,1,0,0,1
1,Mediterranean,4.5,1,222,Lava Shawarma,New York,NY,0,0,0,0,0,1
2,Belgian,4.5,1,86,Bel-Fries,New York,NY,0,0,0,0,0,1
3,Chinese,4.0,1,609,Kong Sihk Tong 港食堂,New York,NY,0,0,1,0,0,1
4,Fast Food,4.5,1,1887,Wah Fung No 1,New York,NY,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14137,Steakhouses,4.0,4,79,Holu,Chicago,IL,0,0,0,0,0,0
14138,Steakhouses,4.0,4,1936,Maple & Ash,Chicago,IL,0,0,0,0,0,0
14139,American (New),4.5,4,331,Goosefoot,Chicago,IL,0,1,0,0,0,0
14140,Desserts,5.0,4,17,The Chef behind the curtain,Snohomish,WA,0,0,0,0,0,0


In [23]:
# Region indicator MW (Midwest): 1 for restaurants whose state is "IL",
# 0 for every other state.
# NOTE(review): only "IL" is mapped to MW — confirm that no other midwestern
# states are present in the data.
conditions = [
    yelp_file_df['state'].eq("IL"),
    yelp_file_df['state'].ne("IL"),
]
values = [1, 0]

# np.select picks the value paired with the first condition that holds
yelp_file_df['MW'] = np.select(condlist=conditions, choicelist=values)

yelp_file_df

Unnamed: 0,categories,rating,price,review count,name,city,state,Italian,American(New),Chinese,Mexican,Asian Fusion,EC,MW
0,Chinese,4.5,1,1116,Shu Jiao Fu Zhou,New York,NY,0,0,1,0,0,1,0
1,Mediterranean,4.5,1,222,Lava Shawarma,New York,NY,0,0,0,0,0,1,0
2,Belgian,4.5,1,86,Bel-Fries,New York,NY,0,0,0,0,0,1,0
3,Chinese,4.0,1,609,Kong Sihk Tong 港食堂,New York,NY,0,0,1,0,0,1,0
4,Fast Food,4.5,1,1887,Wah Fung No 1,New York,NY,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14137,Steakhouses,4.0,4,79,Holu,Chicago,IL,0,0,0,0,0,0,1
14138,Steakhouses,4.0,4,1936,Maple & Ash,Chicago,IL,0,0,0,0,0,0,1
14139,American (New),4.5,4,331,Goosefoot,Chicago,IL,0,1,0,0,0,0,1
14140,Desserts,5.0,4,17,The Chef behind the curtain,Snohomish,WA,0,0,0,0,0,0,0


In [27]:
# Build the modelling frame by removing the text and location columns; the
# numeric columns and the indicator columns are kept.
yelp_df = yelp_file_df.drop(columns=["categories", "name", "city", "state"])

In [28]:
# Save the modelling frame to the working directory, without the index column
yelp_df.to_csv("yelp_eval_data.csv", index=False)
# Display the frame that was just written
yelp_df

Unnamed: 0,rating,price,review count,Italian,American(New),Chinese,Mexican,Asian Fusion,EC,MW
0,4.5,1,1116,0,0,1,0,0,1,0
1,4.5,1,222,0,0,0,0,0,1,0
2,4.5,1,86,0,0,0,0,0,1,0
3,4.0,1,609,0,0,1,0,0,1,0
4,4.5,1,1887,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
14137,4.0,4,79,0,0,0,0,0,0,1
14138,4.0,4,1936,0,0,0,0,0,0,1
14139,4.5,4,331,0,1,0,0,0,0,1
14140,5.0,4,17,0,0,0,0,0,0,0


## All Combined - Price

### Split the Data into Training and Testing

In [29]:
# Features: every column except the price target. get_dummies() one-hot
# encodes any text columns it finds and passes numeric columns through.
X = pd.get_dummies(yelp_df.drop(columns="price"))

# Target: the price level of each restaurant
y = yelp_df["price"]

In [30]:
# Summary statistics of the price target
y.describe()

count    14142.000000
mean         2.006859
std          0.478838
min          1.000000
25%          2.000000
50%          2.000000
75%          2.000000
max          4.000000
Name: price, dtype: float64

In [31]:
# Summary statistics of every feature column
X.describe()

Unnamed: 0,rating,review count,Italian,American(New),Chinese,Mexican,Asian Fusion,EC,MW
count,14142.0,14142.0,14142.0,14142.0,14142.0,14142.0,14142.0,14142.0,14142.0
mean,4.139478,351.573186,0.072267,0.073116,0.042073,0.052044,0.033093,0.745015,0.226559
std,0.425533,547.2792,0.258939,0.260335,0.200763,0.222123,0.178886,0.435868,0.41862
min,1.5,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,82.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,193.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,4.5,408.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,5.0,9535.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [32]:
# Check the balance of our target values: number of rows per price level,
# most frequent level first
y.value_counts()

2    11094
1     1508
3     1475
4       65
Name: price, dtype: int64

In [33]:
# Split into training and test sets with a fixed seed so the split is
# reproducible; the split size is left at the train_test_split default.
# NOTE(review): the split is not stratified although the price classes are
# heavily imbalanced — consider passing stratify=y.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [34]:
# Fit a balanced random forest (100 trees, fixed seed) on the training split.
# `model` is bound to whatever fit() returns (by scikit-learn convention, the
# fitted estimator itself).
from imblearn.ensemble import BalancedRandomForestClassifier
rf_model = BalancedRandomForestClassifier(
    n_estimators=100,
    random_state=1,
)
model = rf_model.fit(X_train, y_train)

In [35]:
# Predict the price level of the held-out rows, then score the predictions
# with balanced accuracy (the average of the per-class recalls).
from sklearn.metrics import accuracy_score
y_pred = rf_model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.7907179996423526

In [36]:
# Confusion matrix: rows are the actual price levels, columns the predicted ones
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[ 317,   61,    9,   11],
       [ 550, 1626,  389,  196],
       [  12,   36,  311,    5],
       [   1,    0,    0,   12]], dtype=int64)

In [37]:
# Print the imbalanced classification report: one row of metrics per price
# level, followed by the averaged totals
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.36      0.80      0.82      0.50      0.81      0.65       398
          2       0.94      0.59      0.87      0.73      0.72      0.50      2761
          3       0.44      0.85      0.87      0.58      0.86      0.75       364
          4       0.05      0.92      0.94      0.10      0.93      0.87        13

avg / total       0.82      0.64      0.87      0.68      0.74      0.54      3536



In [46]:
# Rank the features by the fitted forest's importance scores, highest first,
# and collect the (importance, feature name) pairs in a DataFrame.
# The frame keeps pandas' default column labels 0 and 1.

importances = rf_model.feature_importances_

combined_importance = pd.DataFrame(
    sorted(zip(importances, X.columns), reverse=True)
)

combined_importance

Unnamed: 0,0,1
0,0.642373,review count
1,0.151054,rating
2,0.0363,EC
3,0.035963,American(New)
4,0.035382,Italian
5,0.031623,Asian Fusion
6,0.02904,MW
7,0.022489,Mexican
8,0.015776,Chinese


## West Coast - Price

In [48]:
# West Coast subset: rows flagged as neither East Coast (EC) nor Midwest (MW).
# NOTE(review): this keeps every row whose state is not "NY" or "IL" — confirm
# that all such rows really are West Coast restaurants.
west_coast = yelp_df[yelp_df["EC"].eq(0) & yelp_df["MW"].eq(0)]

# Keep the price target plus the rating, review count and cuisine indicators
west_coast = west_coast.loc[:, [
    "rating", "price", "review count",
    "Asian Fusion", "Mexican", "Chinese", "American(New)", "Italian",
]]
west_coast.head(10)

Unnamed: 0,rating,price,review count,Asian Fusion,Mexican,Chinese,American(New),Italian
1215,4.0,1,110,0,0,0,1,0
1221,4.5,1,32,0,0,0,0,1
1227,4.0,1,47,0,0,1,0,0
1487,4.0,1,125,0,0,0,0,0
1488,4.5,1,293,0,0,0,0,0
1489,4.0,1,461,0,0,1,0,0
1490,4.0,1,369,0,0,0,0,0
1491,4.5,1,883,0,0,0,0,0
1492,4.5,1,165,0,0,0,0,0
1493,3.5,1,169,0,0,0,0,0


In [49]:
# Features: every West Coast column except the price target. get_dummies()
# one-hot encodes any text columns and passes numeric columns through.
X = pd.get_dummies(west_coast.drop(columns="price"))

# Target: the price level of each West Coast restaurant
y = west_coast["price"]

In [50]:
# Summary statistics of the West Coast price target
y.describe()

count    402.000000
mean       2.072139
std        0.449183
min        1.000000
25%        2.000000
50%        2.000000
75%        2.000000
max        4.000000
Name: price, dtype: float64

In [51]:
# Summary statistics of the West Coast feature columns
X.describe()

Unnamed: 0,rating,review count,Asian Fusion,Mexican,Chinese,American(New),Italian
count,402.0,402.0,402.0,402.0,402.0,402.0,402.0
mean,4.151741,317.761194,0.00995,0.052239,0.017413,0.087065,0.049751
std,0.395561,287.415108,0.099377,0.222786,0.130967,0.282281,0.217702
min,2.5,2.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,117.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,215.0,0.0,0.0,0.0,0.0,0.0
75%,4.5,412.25,0.0,0.0,0.0,0.0,0.0
max,5.0,1763.0,1.0,1.0,1.0,1.0,1.0


In [52]:
# Check the balance of our target values: number of West Coast rows per
# price level, most frequent level first
y.value_counts()

2    328
3     47
1     24
4      3
Name: price, dtype: int64

In [53]:
# Split the West Coast data into training and test sets with a fixed seed.
# NOTE(review): not stratified, and the rarest price level has very few rows
# in this subset — consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [54]:
# Fit a fresh balanced random forest (100 trees, fixed seed) on the West Coast
# training split; this rebinds the notebook-level rf_model and model names.
rf_model = BalancedRandomForestClassifier(
    n_estimators=100,
    random_state=1,
)
model = rf_model.fit(X_train, y_train)

In [55]:
# Predict the held-out West Coast rows and score them with balanced accuracy
# (the average of the per-class recalls).
y_pred = rf_model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.41539634146341464

In [56]:
# Confusion matrix: rows are the actual price levels, columns the predicted ones
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[ 5,  3,  0,  0],
       [19, 44, 12,  7],
       [ 3,  2,  5,  0],
       [ 1,  0,  0,  0]], dtype=int64)

In [57]:
# Print the imbalanced classification report for the West Coast model: one
# row of metrics per price level, followed by the averaged totals
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.18      0.62      0.75      0.28      0.69      0.46         8
          2       0.90      0.54      0.74      0.67      0.63      0.39        82
          3       0.29      0.50      0.87      0.37      0.66      0.42        10
          4       0.00      0.00      0.93      0.00      0.00      0.00         1

avg / total       0.77      0.53      0.75      0.60      0.63      0.39       101



In [59]:
# Rank the West Coast model's features by importance score, highest first,
# and collect the (importance, feature name) pairs in a DataFrame.

importances = rf_model.feature_importances_

wc_importance = pd.DataFrame(
    sorted(zip(importances, X.columns), reverse=True)
)

# DataFrame.rename() returns a new frame instead of modifying in place, so the
# result must be assigned back; otherwise the columns keep their default
# 0 / 1 labels.
wc_importance = wc_importance.rename(
    columns={0: 'Importance Percentage', 1: 'Importance Variable'}
)

wc_importance

Unnamed: 0,0,1
0,0.528139,review count
1,0.301112,rating
2,0.102722,Italian
3,0.026221,American(New)
4,0.014311,Asian Fusion
5,0.014139,Chinese
6,0.013356,Mexican


## East Coast - Price

In [61]:
# East Coast subset: rows flagged EC == 1 (and not flagged as Midwest).
east_coast = yelp_df[yelp_df["EC"].eq(1) & yelp_df["MW"].eq(0)]

# Keep the price target plus the rating, review count and cuisine indicators
east_coast = east_coast.loc[:, [
    "rating", "price", "review count",
    "Asian Fusion", "Mexican", "Chinese", "American(New)", "Italian",
]]
east_coast.head(10)

Unnamed: 0,rating,price,review count,Asian Fusion,Mexican,Chinese,American(New),Italian
0,4.5,1,1116,0,0,1,0,0
1,4.5,1,222,0,0,0,0,0
2,4.5,1,86,0,0,0,0,0
3,4.0,1,609,0,0,1,0,0
4,4.5,1,1887,0,0,0,0,0
5,4.0,1,121,0,0,0,0,0
6,4.0,1,403,0,0,0,0,0
7,4.5,1,69,0,1,0,0,0
8,4.5,1,62,0,0,0,0,0
9,4.5,1,9,0,0,0,0,0


In [62]:
# Features: every East Coast column except the price target. get_dummies()
# one-hot encodes any text columns and passes numeric columns through.
X = pd.get_dummies(east_coast.drop(columns="price"))

# Target: the price level of each East Coast restaurant
y = east_coast["price"]

In [63]:
# Summary statistics of the East Coast price target
y.describe()

count    10536.000000
mean         2.041667
std          0.487597
min          1.000000
25%          2.000000
50%          2.000000
75%          2.000000
max          4.000000
Name: price, dtype: float64

In [64]:
# Summary statistics of the East Coast feature columns
X.describe()

Unnamed: 0,rating,review count,Asian Fusion,Mexican,Chinese,American(New),Italian
count,10536.0,10536.0,10536.0,10536.0,10536.0,10536.0,10536.0
mean,4.117787,338.17891,0.04347,0.030847,0.054765,0.074981,0.08694
std,0.427887,466.836132,0.203922,0.17291,0.227531,0.263373,0.281761
min,1.5,2.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,89.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,193.0,0.0,0.0,0.0,0.0,0.0
75%,4.5,405.0,0.0,0.0,0.0,0.0,0.0
max,5.0,6947.0,1.0,1.0,1.0,1.0,1.0


In [65]:
# Check the balance of our target values: number of East Coast rows per
# price level, most frequent level first
y.value_counts()

2    8166
3    1328
1     991
4      51
Name: price, dtype: int64

In [66]:
# Split the East Coast data into training and test sets with a fixed seed.
# NOTE(review): the split is not stratified although the price classes are
# imbalanced — consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [67]:
# Fit a fresh balanced random forest (100 trees, fixed seed) on the East Coast
# training split; this rebinds the notebook-level rf_model and model names.
rf_model = BalancedRandomForestClassifier(
    n_estimators=100,
    random_state=1,
)
model = rf_model.fit(X_train, y_train)

In [68]:
# Predict the held-out East Coast rows and score them with balanced accuracy
# (the average of the per-class recalls).
y_pred = rf_model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.7785023114687127

In [69]:
# Confusion matrix: rows are the actual price levels, columns the predicted ones
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[ 224,   24,    7,    5],
       [ 469, 1170,  245,  158],
       [  19,   30,  251,   22],
       [   0,    0,    1,    9]], dtype=int64)

In [70]:
# Print the imbalanced classification report for the East Coast model: one
# row of metrics per price level, followed by the averaged totals
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.31      0.86      0.79      0.46      0.83      0.69       260
          2       0.96      0.57      0.91      0.72      0.72      0.50      2042
          3       0.50      0.78      0.89      0.61      0.83      0.69       322
          4       0.05      0.90      0.93      0.09      0.91      0.83        10

avg / total       0.83      0.63      0.90      0.68      0.75      0.55      2634



In [71]:
# Rank the East Coast model's features by importance score, highest first,
# and collect the (importance, feature name) pairs in a DataFrame.

importances = rf_model.feature_importances_

ec_importance = pd.DataFrame(
    sorted(zip(importances, X.columns), reverse=True)
)

# DataFrame.rename() returns a new frame instead of modifying in place, so the
# result must be assigned back; otherwise the columns keep their default
# 0 / 1 labels.
ec_importance = ec_importance.rename(
    columns={0: 'Importance Percentage', 1: 'Importance Variable'}
)

ec_importance

Unnamed: 0,0,1
0,0.709437,review count
1,0.149101,rating
2,0.038183,Asian Fusion
3,0.033335,American(New)
4,0.031596,Italian
5,0.021795,Chinese
6,0.016553,Mexican


## MidWest - Price

In [72]:
# Midwest subset: rows flagged MW == 1 (and not flagged as East Coast).
mwest_coast = yelp_df[yelp_df["EC"].eq(0) & yelp_df["MW"].eq(1)]

# Keep the price target plus the rating, review count and cuisine indicators
mwest_coast = mwest_coast.loc[:, [
    "rating", "price", "review count",
    "Asian Fusion", "Mexican", "Chinese", "American(New)", "Italian",
]]
mwest_coast.head(10)

Unnamed: 0,rating,price,review count,Asian Fusion,Mexican,Chinese,American(New),Italian
991,4.5,1,326,0,0,0,0,0
992,4.0,1,178,0,0,0,0,0
993,4.5,1,240,0,1,0,0,0
994,4.0,1,31,0,0,0,0,0
995,4.5,1,47,0,0,0,0,0
996,4.0,1,250,0,0,0,0,0
997,2.5,1,146,0,1,0,0,0
998,4.5,1,246,0,0,0,0,1
999,4.5,1,44,0,0,0,0,0
1000,5.0,1,707,0,0,0,0,0


In [73]:
# Features: every Midwest column except the price target. get_dummies()
# one-hot encodes any text columns and passes numeric columns through.
X = pd.get_dummies(mwest_coast.drop(columns="price"))

# Target: the price level of each Midwest restaurant
y = mwest_coast["price"]

In [74]:
# Summary statistics of the Midwest price target
y.describe()

count    3204.000000
mean        1.884207
std         0.430655
min         1.000000
25%         2.000000
50%         2.000000
75%         2.000000
max         4.000000
Name: price, dtype: float64

In [75]:
# Summary statistics of the Midwest feature columns
X.describe()

Unnamed: 0,rating,review count,Asian Fusion,Mexican,Chinese,American(New),Italian
count,3204.0,3204.0,3204.0,3204.0,3204.0,3204.0,3204.0
mean,4.20927,399.861111,0.001873,0.121723,0.003433,0.065231,0.026841
std,0.413794,769.487815,0.04324,0.327016,0.058502,0.246971,0.161645
min,1.5,2.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,75.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,175.0,0.0,0.0,0.0,0.0,0.0
75%,4.5,426.0,0.0,0.0,0.0,0.0,0.0
max,5.0,9535.0,1.0,1.0,1.0,1.0,1.0


In [76]:
# Check the balance of the target: number of Midwest rows per price level,
# most frequent level first
y.value_counts()

2    2600
1     493
3     100
4      11
Name: price, dtype: int64

In [77]:
# Split the Midwest data into training and test sets with a fixed seed.
# NOTE(review): the split is not stratified although the price classes are
# imbalanced — consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [78]:
# Fit a fresh balanced random forest (100 trees, fixed seed) on the Midwest
# training split; this rebinds the notebook-level rf_model and model names.
rf_model = BalancedRandomForestClassifier(
    n_estimators=100,
    random_state=1,
)
model = rf_model.fit(X_train, y_train)

In [79]:
# Predict the held-out Midwest rows and score them with balanced accuracy
# (the average of the per-class recalls).
y_pred = rf_model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.6075457875457875

In [80]:
# Confusion matrix: rows are the actual price levels, columns the predicted ones
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[ 90,  24,   4,   8],
       [119, 357, 139,  35],
       [  2,   4,  14,   1],
       [  0,   1,   1,   2]], dtype=int64)

In [81]:
# Print the imbalanced classification report for the Midwest model: one row
# of metrics per price level, followed by the averaged totals
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          1       0.43      0.71      0.82      0.53      0.77      0.58       126
          2       0.92      0.55      0.81      0.69      0.67      0.43       650
          3       0.09      0.67      0.82      0.16      0.74      0.54        21
          4       0.04      0.50      0.94      0.08      0.69      0.45         4

avg / total       0.82      0.58      0.81      0.65      0.68      0.46       801



In [82]:
# Rank the Midwest model's features by importance score, highest first,
# and collect the (importance, feature name) pairs in a DataFrame.

importances = rf_model.feature_importances_

mw_importance = pd.DataFrame(
    sorted(zip(importances, X.columns), reverse=True)
)

# DataFrame.rename() returns a new frame instead of modifying in place, so the
# result must be assigned back; otherwise the columns keep their default
# 0 / 1 labels.
mw_importance = mw_importance.rename(
    columns={0: 'Importance Percentage', 1: 'Importance Variable'}
)

mw_importance

Unnamed: 0,0,1
0,0.580381,review count
1,0.28926,rating
2,0.07529,American(New)
3,0.041391,Mexican
4,0.010558,Italian
5,0.002278,Chinese
6,0.000842,Asian Fusion
