In [325]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
sns.set_style("whitegrid")

## Presentation and Cleaning of Data

We downloaded the restaurant data scraped from Google Maps. There are some unnecessary columns in the dataframe, along with various other anomalies and missing entries through which we sift.

In [242]:
data = pd.read_csv("/Users/dominiquekemp/Downloads/Outscraper-20250306234315s02_restaurant 4.csv")
print(data.shape)
del data['query']

(3786, 94)


### Preliminary Data Cleaning

In [243]:
len(data[data.state.isnull() == True]), len(data[data.state == 'New Jersey']), len(data[data.state == "New York"])

(2, 6, 4)

In [244]:
#clear data of out-of-state restaurants
data = data[(data.state == 'PA') | (data.state == 'Pennsylvania')]


In [245]:

data.columns

Index(['name', 'name_for_emails', 'site', 'subtypes', 'category', 'type',
       'phone', 'full_address', 'borough', 'street', 'city', 'postal_code',
       'state', 'us_state', 'country', 'country_code', 'latitude', 'longitude',
       'h3', 'time_zone', 'plus_code', 'area_service', 'rating', 'reviews',
       'reviews_link', 'reviews_tags', 'reviews_per_score',
       'reviews_per_score_1', 'reviews_per_score_2', 'reviews_per_score_3',
       'reviews_per_score_4', 'reviews_per_score_5', 'photos_count', 'photo',
       'street_view', 'located_in', 'working_hours',
       'working_hours_old_format', 'other_hours', 'popular_times',
       'business_status', 'about', 'range', 'posts', 'logo', 'description',
       'typical_time_spent', 'verified', 'owner_id', 'owner_title',
       'owner_link', 'reservation_links', 'booking_appointment_link',
       'menu_link', 'order_links', 'location_link', 'location_reviews_link',
       'place_id', 'google_id', 'cid', 'kgmid', 'reviews_id',
       

In [246]:
#determine tallies of missing values by column
miss_values = pd.isnull(data).sum()
miss_values

name                      0
name_for_emails           1
site                   1209
subtypes                  1
category                  2
                       ... 
company_sales          3774
company_sales_code     3774
number_of_employees    3774
employee_code          3774
company_num_emp        3774
Length: 93, dtype: int64

In [247]:
#remove all fully incomplete columns 
for index in data.columns:
    if miss_values[index] == len(data):
        del data[index]

print(data.shape)
#drop redundant columns
data = data.drop(["city", 'state', 'us_state', 'country', 'country_code', 'time_zone'], axis = 1)
data.columns

(3774, 56)


Index(['name', 'name_for_emails', 'site', 'subtypes', 'category', 'type',
       'phone', 'full_address', 'borough', 'street', 'postal_code', 'latitude',
       'longitude', 'h3', 'area_service', 'rating', 'reviews', 'reviews_link',
       'reviews_per_score_1', 'reviews_per_score_2', 'reviews_per_score_3',
       'reviews_per_score_4', 'reviews_per_score_5', 'photos_count', 'photo',
       'street_view', 'located_in', 'working_hours',
       'working_hours_old_format', 'other_hours', 'business_status', 'about',
       'range', 'logo', 'description', 'verified', 'owner_id', 'owner_title',
       'owner_link', 'reservation_links', 'booking_appointment_link',
       'order_links', 'location_link', 'location_reviews_link', 'place_id',
       'google_id', 'cid', 'kgmid', 'reviews_id', 'located_google_id'],
      dtype='object')

In [248]:
missing_values = pd.isnull(data).sum()
missing_values

name                           0
name_for_emails                1
site                        1209
subtypes                       1
category                       2
type                           1
phone                        282
full_address                   0
borough                       59
street                        15
postal_code                    1
latitude                       0
longitude                      0
h3                             0
area_service                   0
rating                       115
reviews                      115
reviews_link                 116
reviews_per_score_1          115
reviews_per_score_2          115
reviews_per_score_3          115
reviews_per_score_4          115
reviews_per_score_5          115
photos_count                   8
photo                          8
street_view                    8
located_in                  3773
working_hours                350
working_hours_old_format     350
other_hours                 2905
business_s

In [249]:
#delete 'located_google_id' column since only one filled entry
del data['located_google_id']

In [250]:
#remove 'located_in' for same reason, as well as restaurants with missing target feature
del data['located_in']
df = data[data.rating.isnull() == False]

In [251]:
df.shape
missing_values = df.isnull().sum()
missing_values

name                           0
name_for_emails                1
site                        1122
subtypes                       1
category                       2
type                           1
phone                        228
full_address                   0
borough                       59
street                        12
postal_code                    0
latitude                       0
longitude                      0
h3                             0
area_service                   0
rating                         0
reviews                        0
reviews_link                   1
reviews_per_score_1            0
reviews_per_score_2            0
reviews_per_score_3            0
reviews_per_score_4            0
reviews_per_score_5            0
photos_count                   3
photo                          3
street_view                    3
working_hours                279
working_hours_old_format     279
other_hours                 2791
business_status                0
about     

### Operational Status and Cuisine Variety

We observe that the data includes some restaurants which are closed, either temporarily or permanently. We do not remove these entries, as the Philly restaurant scene is dynamic with a constant flux of new restaurants and relocation of old ones and moreover Google Maps only retains the information of closed restaurants for just a few months.

In [252]:
#record of the businesses that are closed permamently or temporarily
closed = df.loc[df.business_status != 'OPERATIONAL']
df_nonoper = pd.get_dummies(closed['business_status'])
df_nonoper

Unnamed: 0,CLOSED_PERMANENTLY,CLOSED_TEMPORARILY
96,True,False
154,False,True
155,False,True
156,False,True
157,False,True
...,...,...
3598,False,True
3599,False,True
3678,False,True
3679,False,True


In [253]:
#a sample of some closed restaurants
df.iloc[153:159]

Unnamed: 0,name,name_for_emails,site,subtypes,category,type,phone,full_address,borough,street,...,reservation_links,booking_appointment_link,order_links,location_link,location_reviews_link,place_id,google_id,cid,kgmid,reviews_id
154,Bing Bing Dim Sum,Bing Bing Dim Sum,,"Chinese restaurant, Asian fusion restaurant, A...",restaurants,Chinese restaurant,,"1648 E Passyunk Ave, Philadelphia, PA 19148",East Passyunk Crossing,1648 E Passyunk Ave,...,,,,https://www.google.com/maps/place/Bing+Bing+Di...,https://www.google.com/maps/place/Bing+Bing+Di...,ChIJFxvwNQXGxokRMfV_Gwcj8ZQ,0x89c6c60535f01b17:0x94f123071b7ff531,10732397900433716529,/g/11b70bmmmz,-7714346173275835087
155,Black Cat Tavern on 12th,Black Cat Tavern On 12Th,http://www.blackcattavernphilly.com/,"Gastropub, Art gallery, ATM, Cocktail bar, Ecl...",restaurants,Gastropub,+1 267-519-3574,"2654 S 12th St, Philadelphia, PA 19148",Lower Moyamensing,2654 S 12th St,...,,,,https://www.google.com/maps/place/Black+Cat+Ta...,https://www.google.com/maps/place/Black+Cat+Ta...,ChIJLYgEGOPFxokR6XiNpM0hlD0,0x89c6c5e31804882d:0x3d9421cda48d78e9,4437208699979528425,/g/11b_2r0vd2,4437208699979528425
156,SciFood-Crown,Scifood Crown,,Restaurant,restaurants,Restaurant,+1 908-487-8687,"330 Oregon Ave, Philadelphia, PA 19148",South Philadelphia East,330 Oregon Ave,...,,,,https://www.google.com/maps/place/SciFood-Crow...,https://www.google.com/maps/place/SciFood-Crow...,ChIJN0nT-j7FxokRnLR1vNwFP9U,0x89c6c53efad34937:0xd53f05dcbc75b49c,15366006899224196252,/g/11j2gfd2n8,-3080737174485355364
157,Pizza Shop,Pizza Shop,http://pizzashop.com/,"Pizza restaurant, Cheesesteak restaurant, Fast...",restaurants,Pizza restaurant,+1 215-551-3333,"2700 S 7th St, Philadelphia, PA 19148",South Philadelphia East,2700 S 7th St,...,,,,https://www.google.com/maps/place/Pizza+Shop/@...,https://www.google.com/maps/place/Pizza+Shop/@...,ChIJ3ZK3dKDFxokR9owY2NE78mQ,0x89c6c5a074b792dd:0x64f23bd1d8188cf6,7273942120616463606,/g/11fp4ry9sk,7273942120616463606
158,Stadium Pizza & Grill,Stadium Pizza And Grill,,"Pizza restaurant, Italian restaurant, Delivery...",restaurants,Pizza restaurant,+1 215-755-5411,"2400 S 10th St, Philadelphia, PA 19148",Lower Moyamensing,2400 S 10th St,...,,,,https://www.google.com/maps/place/Stadium+Pizz...,https://www.google.com/maps/place/Stadium+Pizz...,ChIJGbLEhAfGxokRatQqMx0RHiI,0x89c6c60784c4b219:0x221e111d332ad46a,2458421263701038186,/g/1vbnqcf4,2458421263701038186
159,La Tienda,La Tienda,http://orderlatienda.com/,"Mexican restaurant, Grocery store, Mexican gro...",restaurants,Mexican restaurant,+1 215-334-1159,"1247 Snyder Ave, Philadelphia, PA 19148",East Passyunk Crossing,1247 Snyder Ave,...,,,,https://www.google.com/maps/place/La+Tienda/@3...,https://www.google.com/maps/place/La+Tienda/@3...,ChIJ4a08tAjGxokRqq1SBymojgI,0x89c6c608b43cade1:0x28ea8290752adaa,184269528938753450,/g/12613gj0b,184269528938753450


In anticipation, it is interesting to see what cuisine types are noted.

In [254]:
#record cuisine features 
cuisines= data.type.unique()
print(cuisines, cuisines.shape)

['Restaurant' 'Diner' 'Italian restaurant' 'Bar' 'Brunch restaurant'
 'Gastropub' 'Sicilian restaurant' 'American restaurant'
 'Cambodian restaurant' 'Asian restaurant' 'Thai restaurant' 'Steak house'
 'Vietnamese restaurant' 'Breakfast restaurant' 'Hamburger restaurant'
 'Mexican restaurant' 'Honduran restaurant' 'Bar & grill'
 'Cheesesteak restaurant' 'Indian restaurant' 'Taco restaurant'
 'Swedish restaurant' 'Spanish restaurant' 'Pizza restaurant'
 'Chinese restaurant' 'French restaurant' 'Delivery Chinese restaurant'
 'Sandwich shop' 'Southern restaurant (US)' 'Indonesian restaurant'
 'Mediterranean restaurant' 'Latin American restaurant' 'Event venue'
 'Chicken restaurant' 'Halal restaurant' 'Barbecue restaurant'
 'New American restaurant' 'Cafe' 'Fast food restaurant'
 'Seafood restaurant' 'Vegan restaurant' 'Caribbean restaurant'
 'Takeout Restaurant' 'Pizza Takeout' 'West African restaurant'
 'Colombian restaurant' 'Portuguese restaurant' 'Brazilian restaurant'
 'Korean restau

### Redundant Features, Boolean Exchange, and Sufficient Customer Feedback

In [255]:
#determine some redundant columns 
print(df[(df.booking_appointment_link.isnull() == True) & ((df.order_links.isnull() == False) | (df.reservation_links.isnull() == False))].shape)
print(df[(df.verified.isnull() == True) & ((df.owner_link.isnull() == False) | (df.owner_id.isnull() == False))].shape)

#consolidate and delete unnecessary info
del  df['logo'], df['owner_title'], df['owner_id'], df['owner_link'], df['street_view'], df['working_hours'], df['verified'], df['name_for_emails'], df['area_service'], df['reservation_links'], df['order_links']

(0, 48)
(0, 48)


In [256]:
#convert some columns to Boolean data
missing_values = df.isnull().sum()
print(missing_values)
rd = df.copy()
rd['site'] = (df.site.isnull() == False)
rd['phone'] = (df.phone.isnull() == False)
rd['booking_appointment_link'] = (df.booking_appointment_link.isnull() == False)

#convert missing values in 'range' and 'borough'
rd.range = rd.range.fillna(0)
rd.borough = rd.borough.fillna('Unlabeled')
#ensure that there is sufficient customer feedback
rd = rd[rd.reviews > 29]

name                           0
site                        1122
subtypes                       1
category                       2
type                           1
phone                        228
full_address                   0
borough                       59
street                        12
postal_code                    0
latitude                       0
longitude                      0
h3                             0
rating                         0
reviews                        0
reviews_link                   1
reviews_per_score_1            0
reviews_per_score_2            0
reviews_per_score_3            0
reviews_per_score_4            0
reviews_per_score_5            0
photos_count                   3
photo                          3
working_hours_old_format     279
other_hours                 2791
business_status                0
about                          0
range                        779
description                 2180
booking_appointment_link    1203
location_l

In [259]:
#primary features for customer targets
features = ['type', 'category', 'site','phone', 'booking_appointment_link', 'range', 'latitude','longitude', 'postal_code', 'borough', 'working_hours_old_format', 'other_hours', 'range', 'photos_count', 'reviews_per_score_1', 'reviews_per_score_2']

## Train/Test Split of Restaurant Data

Taken collectively, Philly has a lot of neighborhood diversity, so it seems naive to apply a uniform perspective to the data. Therefore, our approach will be founded on categorization by boroughs.

We first need to exclude the boroughs with only marginal representation.

In [None]:

marginal_boroughs = [bor for bor in rd.borough.value_counts().index if rd.borough.value_counts()[bor] == 1]
marginal_boroughs

['Cecil B. Moore', 'Roxborough', 'Mayfair', 'Holmes']

We split the data into training and test sets, preserving the proportions of restaurants in each borough.

In [261]:
from sklearn.model_selection import train_test_split
rd_bor = rd[~rd.borough.isin(marginal_boroughs)]
rd_train, rd_test = train_test_split(rd_bor,
                                        test_size=0.2,
                                        random_state=50201,
                                        stratify = rd_bor['borough'],
                                        shuffle=True)



In [401]:
#write test set immediately to file
rd_test.to_csv("../'Will It Restaurant?'/test_data.csv", index = False)

In [399]:
rd_train.to_csv("../'Will It Restaurant?'/train_data.csv", index = False)