In [81]:
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from pathlib import Path
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

In [82]:
# Read the cleaned restaurant data set; the path is relative to the notebook's folder.
file_path = Path("../Resources/cleaned_data2.csv")
df = pd.read_csv(file_path)

In [83]:
# Derive one boolean flag column per service type from the `transactions`
# string column (values look like "delivery, pickup"). Each flag column is
# named after the keyword it searches for, matched case-insensitively.
# `na=False` already maps a missing `transactions` value to False, so the
# result never contains NaN and no follow-up fillna pass is needed.
for service in ('delivery', 'pickup', 'restaurant_reservation'):
    df[service] = df['transactions'].str.contains(service, case=False, na=False)

In [84]:
def is_success(rating, threshold=3.7):
    """Tell whether a rating counts as a success.

    Args:
        rating: Numeric rating to evaluate.
        threshold: Exclusive lower bound a rating must exceed to count as a
            success. Defaults to 3.7, the cut-off this notebook uses.

    Returns:
        The result of ``rating > threshold``. A rating equal to the threshold
        is not a success, and a NaN rating compares as False.
    """
    return rating > threshold

# Label every row by passing its rating through is_success, then preview the frame.
df['success'] = df['rating'].map(is_success)

df.head()

Unnamed: 0,id,name,image_url,is_closed,url,review_count,rating,transactions,price,group_city,cuisines,latitude,longitude,state,delivery,pickup,restaurant_reservation,success
0,a0IET3_yCFcO36OqGSsisg,Eataly NYC Flatiron,https://s3-media4.fl.yelpcdn.com/bphoto/1UDlnu...,False,https://www.yelp.com/biz/eataly-nyc-flatiron-n...,6102,4.0,"delivery, pickup",2,New York City,Italian,40.742101,-73.989922,NY,True,True,False,True
1,zj8Lq1T8KIC5zwFief15jg,Prince Street Pizza,https://s3-media4.fl.yelpcdn.com/bphoto/PfI8oV...,False,https://www.yelp.com/biz/prince-street-pizza-n...,5031,4.5,"delivery, pickup",1,New York City,Italian,40.723088,-73.99453,NY,True,True,False,True
2,16ZnHpuaaBt92XWeJHCC5A,Olio e Più,https://s3-media4.fl.yelpcdn.com/bphoto/CUpPgz...,False,https://www.yelp.com/biz/olio-e-pi%C3%B9-new-y...,4858,4.5,"delivery, pickup",2,New York City,Italian,40.733798,-73.999774,NY,True,True,False,True
3,vyoA8dxwScuMV_AsTcjQcg,L & B Spumoni Gardens,https://s3-media1.fl.yelpcdn.com/bphoto/hN5xKw...,False,https://www.yelp.com/biz/l-and-b-spumoni-garde...,4647,4.0,"delivery, pickup",2,New York City,Italian,40.594715,-73.981316,NY,True,True,False,True
4,22nKUyCIbpnzR6R3_g1ptQ,Carmine's Italian Restaurant - Times Square,https://s3-media1.fl.yelpcdn.com/bphoto/0UszeE...,False,https://www.yelp.com/biz/carmines-italian-rest...,4644,4.0,"delivery, pickup",2,New York City,Italian,40.757498,-73.986653,NY,True,True,False,True


In [85]:
# Target vector: the boolean success label.
y = df['success']

# Feature frame: drop the label, the rating it was derived from, and the
# identifier / URL columns.
x = df.drop(['rating', 'success', 'image_url', 'url', 'id', 'name'], axis=1)

In [86]:

# Replace every missing feature value with 0, keeping the result under a new name.
# NOTE(review): the fill applies to all columns, including the text columns
# still present at this point — confirm that 0 is the intended placeholder there.
X = x.fillna(value=0)

In [87]:
# Preview the first 20 rows of the filled feature frame.
X.head(20)

Unnamed: 0,is_closed,review_count,transactions,price,group_city,cuisines,latitude,longitude,state,delivery,pickup,restaurant_reservation
0,False,6102,"delivery, pickup",2,New York City,Italian,40.742101,-73.989922,NY,True,True,False
1,False,5031,"delivery, pickup",1,New York City,Italian,40.723088,-73.99453,NY,True,True,False
2,False,4858,"delivery, pickup",2,New York City,Italian,40.733798,-73.999774,NY,True,True,False
3,False,4647,"delivery, pickup",2,New York City,Italian,40.594715,-73.981316,NY,True,True,False
4,False,4644,"delivery, pickup",2,New York City,Italian,40.757498,-73.986653,NY,True,True,False
5,False,3813,delivery,2,New York City,Mexican,40.742255,-74.005958,NY,True,False,False
6,False,3317,"delivery, pickup",2,New York City,Mexican,40.72293,-73.99419,NY,True,True,False
7,False,3314,"delivery, pickup",2,New York City,Italian,40.70493,-73.93399,NY,True,True,False
8,False,3155,pickup,2,New York City,Italian,40.722766,-73.996233,NY,False,True,False
9,False,3139,"delivery, pickup",3,New York City,Italian,40.76083,-73.98981,NY,True,True,False


In [88]:
from sklearn import preprocessing, utils

# Encode the boolean success label as integers; in the displayed output the
# True rows come out as 1.
lab = preprocessing.LabelEncoder().fit(y)
y_transformed = lab.transform(y)

In [89]:
# Inspect the first ten encoded labels.
y_transformed[:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0], dtype=int64)

In [90]:
# One-hot encode the remaining text columns; X is rebound to the encoded frame.
X = pd.get_dummies(data=X)

In [91]:
# List the dtype of every column in the encoded feature frame.
X.dtypes

is_closed          bool
review_count      int64
price             int64
latitude        float64
longitude       float64
                 ...   
state_TN           bool
state_TX           bool
state_VA           bool
state_WA           bool
state_WI           bool
Length: 119, dtype: object

In [92]:
# Split features and encoded labels into train and test partitions; the fixed
# seed makes the split repeatable and the split size is scikit-learn's default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_transformed,
    random_state=42,
)

In [93]:
# Learn the scaling parameters from the training partition only.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply that one fitted scaler to both partitions (train first, then test).
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [94]:
# Random forest regressor with 100 trees and a fixed seed.
# NOTE(review): this model is fitted on the 0/1 encoded success label, so its
# predictions are fractional scores rather than class labels — confirm that a
# regressor is intended here.
model = RandomForestRegressor(random_state=42, n_estimators=100)

In [95]:
# Train the regressor on the scaled training features and encoded labels.
model = model.fit(X=X_train_scaled, y=y_train)

In [96]:
# Score the scaled test features with the fitted regressor.
predictions = model.predict(X=X_test_scaled)

In [97]:
# Show the regressor's predictions for the test partition.
print(predictions)

[0.53 0.76 0.34 ... 0.43 0.85 0.39]


In [103]:
from sklearn import metrics
import numpy as np

# Error metrics comparing the test labels with the regressor's predictions.
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=predictions)
mse = metrics.mean_squared_error(y_true=y_test, y_pred=predictions)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

# Emit the heading and the three metric lines with a single print call.
print(
    'Random Forest Results:',
    f'Mean Absolute Error (MAE): {mae:.2f}',
    f'Mean Squared Error (MSE): {mse:.2f}',
    f'Root Mean Squared Error (RMSE): {rmse:.2f}',
    sep='\n',
)


Random Forest Results:
Mean Absolute Error (MAE): 0.41
Mean Squared Error (MSE): 0.23
Root Mean Squared Error (RMSE): 0.48


In [99]:
# Random forest classifier with 500 trees and a fixed seed.
class_model = RandomForestClassifier(random_state=42, n_estimators=500)

In [100]:
# Train the classifier on the scaled training features and encoded labels.
class_model = class_model.fit(X=X_train_scaled, y=y_train)

In [101]:
# Predict a class label for every row of the scaled test partition.
class_predictions = class_model.predict(X=X_test_scaled)

In [102]:
# Per-class precision, recall and F1 for the classifier on the test partition.
print(classification_report(y_true=y_test, y_pred=class_predictions))

              precision    recall  f1-score   support

           0       0.62      0.60      0.61      2892
           1       0.63      0.65      0.64      2995

    accuracy                           0.63      5887
   macro avg       0.63      0.62      0.62      5887
weighted avg       0.63      0.63      0.63      5887

