In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from scipy import stats


# remove non-member restaurants, they have no further information populated
df = df[df['is_member'] != 0]

# convert price_tier to numeric
price_tier_dict = {'$30 and under': 1, '$31 to $50': 2, '$50 and over':3}
df['price_tier'] = df['price_tier'].map(lambda x: price_tier_dict[x])


# clean review_count and conver to int
df['review_count']= df['review_count'].map(lambda s: str.replace(s, ' Reviews', '')).map(lambda s: str.replace(s, 'No', '0')).map(lambda x: int(x))

# any restaurant with no reviews has no category reviews
df[df['overall'].isnull()].loc[:,['name','overall', 'food', 'service', 'ambience','value']]

# create newly_added field
df['newly_added'] = df['name'].map(lambda s: 'Newly added ' in s)

# clean name field
df['name'] = df['name'].map(lambda s: str.replace(s, ' restaurant', '')).map(lambda s: str.replace(s, 'Newly added ', ''))

# remove duplicates
dupe_mask = df.duplicated(keep = 'first') == False
df = df[dupe_mask]

# convert promoted status to bool
df['promoted'] = df['promoted'].apply(lambda x: bool(x))

# create weighted overall rating column
df['weighted_overall'] = df['review_count'] * df['overall']

# create total covid precautions column
df['precautions'] = sum((df['sanitizing'], df['distancing'], df['ppe'], df['screening']))

# convert NaN values in precautions to 0s
df['precautions'] = df['precautions'].apply(lambda x: 0 if pd.isnull(x) else x)
df['sanitizing'] = df['sanitizing'].apply(lambda x: 0 if pd.isnull(x) else x)
df['ppe'] = df['ppe'].apply(lambda x: 0 if pd.isnull(x) else x)
df['screening'] = df['screening'].apply(lambda x: 0 if pd.isnull(x) else x)
df['distancing'] = df['distancing'].apply(lambda x: 0 if pd.isnull(x) else x)

# correctly handle encoding of special characters in name field
df['name'] = df['name'].apply(lambda s: bytes(s,'iso-8859-1').decode('utf-8'))

# clean query fields out of restaurant page url
df['url'] = df['url'].apply(lambda s: re.search('^.*\?', s).group(0)).apply(lambda s: s[:-1])

# remove unnamed columns
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# fix noise null values
df['noise'] = df['noise'].map(lambda s: None if s not in ['Quiet', 'Moderate', 'Energetic'] else s)

# filter to remove outliers in column col
df_no_out = df[(np.abs(df.col-df.col.mean())<=(3*df.col.std()))]

NameError: name 'df' is not defined

In [None]:
# import bookings from given day, format urls, update total bookings
total_bookings = pd.read_csv('total_bookings_day.csv')


bookings_today = pd.DataFrame()
boroughs = ['manhattan','bronx', 'queens', 'brooklyn', 'staten_island']
date = '2021-07-XX' # today's date
for borough in boroughs:
    bookings_today = bookings_today.append(pd.read_csv(f'./bookings/bookings_{borough}_{date}.csv'), ignore_index = True)

bookings_today['url'] = bookings_today['url'].apply(lambda s: re.search('^.*\?', s).group(0)).apply(lambda s: s[:-1])

total_bookings_day = pd.merge(total_bookings, bookings_today, on='url', how='outer')

total_bookings.to_csv(f'total_bookings_day.csv', index = False)

In [2]:
# merging restaurants and bookings

rest_df = pd.read_csv('nyc_restaurants_clean.csv', encoding = 'utf-8')
df = pd.merge(rest_df, total_bookings, on='url', how = 'left')

# calculate average bookings

avg_bookings = df.loc[: , "7/19/2021":]
df['avg_bookings'] = avg_bookings.mean(axis = 1)
df.reset_index(inplace = True, drop = True)

# remove unnamed columns
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# fix noise null values
df['noise'] = df['noise'].map(lambda s: None if s not in ['Quiet', 'Moderate', 'Energetic'] else s)

# the same restaurant sometimes comes up twice in the results for one day of bookings
# this groups by url and averages avg_bookings, other fields left the same

url_bookings_means = df.groupby(['url'], as_index = False).agg({'avg_bookings':'mean'})
df = df[df['url'].duplicated() == False]
df = df.drop('avg_bookings', 1)
df = pd.merge(df, url_bookings_means, how = 'inner', on = 'url')

df.to_csv('X_day_results.csv', index = False) # then can just import this to have compiled results

# replace restaurants that have no bookings on any day examined with 0
df['avg_bookings'] = df['avg_bookings'].fillna(0)

# add bool precautions column
df4['any_precautions'] = df4['precautions'] != 0

# filter to remove outliers in column col
df_no_out = df[(np.abs(df.col-df.col.mean())<=(3*df.col.std()))]

NameError: name 'total_bookings' is not defined

In [None]:
# EDA

# filter restaurants that have null

df = df[~df['weighted_overall'].isnull()==True] # reviews
df = df[~df['avg_bookings'].isnull()==True] # bookings

# filter restaurants that have 0 bookings for all days observed
df_bk = df[df['avg_bookings']!= 0]

# filter for promoted status
promoted = df[df['promoted']==True]
not_promoted = df[df['promoted']==False]