# About
This notebook contains an exploratory data analysis (EDA) of the restaurants in Team 7's sample dataset.

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import folium
from folium.plugins import HeatMap

# Load datasets

In [None]:
# Restaurant-level records of the sample; the JSON file is read from the working directory.
restaurant_df = pd.read_json('yelp_team7_dataset_restaurant.json')

In [None]:
# (rows, columns) of the restaurant table.
restaurant_df.shape

In [None]:
# Category table (joined to the restaurants on business_id further down); read from the working directory.
categories_df = pd.read_json('yelp_team7_dataset_category.json')

In [None]:
# (rows, columns) of the category table.
categories_df.shape

# EDA

In [None]:
# Number of sampled restaurants per city, most frequent city first.
restaurant_df['city'].value_counts()

In [None]:
# Bar chart of the per-city restaurant counts.
city_counts = restaurant_df['city'].value_counts()
city_counts.plot.bar()
plt.show()

In [None]:
# Share of the sample contributed by each city (fractions sum to 1).
restaurant_df['city'].value_counts(normalize = True)

In [None]:
# Bar chart of each city's share of the sample, written to cities_prop.png.
city_share = restaurant_df['city'].value_counts(normalize = True)
city_share.plot.bar()
plt.title('Proportion of restaurants in the sample')
plt.xticks(rotation = 0)  # keep the city names horizontal
plt.savefig('cities_prop.png')

In [None]:
# Share of restaurants per is_open value (fractions sum to 1).
restaurant_df['is_open'].value_counts(normalize = True)

In [None]:
# Proportion of restaurants that were open vs. not open, written to isopen_prop.png.
#
# value_counts() orders its result by frequency, not by value, so hard-coding the
# tick labels as ['open', 'not open'] mislabels the bars whenever closed restaurants
# are the majority. Relabel the index instead, so every bar carries the label of its
# own status (is_open is compared against 1 / 0 elsewhere in this notebook).
status_labels = {1: 'open', 0: 'not open'}
open_share = restaurant_df.is_open.value_counts(normalize = True).rename(index = status_labels)
fig = open_share.plot(kind = 'bar', rot = 0)  # rot = 0 keeps the labels horizontal
plt.title('Proportion of restaurants that were open vs. not open')
fig.set_xlabel('Restaurant survival status')
plt.savefig('isopen_prop.png')

In [None]:
# Bar per city whose height is the mean of is_open, i.e. the share of restaurants
# still open; written to prop_open_by_city.png.
fig = sns.catplot(data = restaurant_df, kind = 'bar', x = 'city', y = 'is_open')
plt.title("Proportion of restaurants that were open")
fig.set(xlabel = None, ylabel = 'Proportion')
plt.savefig('prop_open_by_city.png', bbox_inches = 'tight')

In [None]:
# Mean of is_open per city, highest open rate first.
(
    restaurant_df
    .groupby('city')[['is_open']]
    .mean()
    .sort_values(by = 'is_open', ascending = False)
)

In [None]:
# Peek at the first rows to recall the available columns.
restaurant_df.head()

### Number of reviews

In [None]:
# Summary statistics of the number of reviews per restaurant.
restaurant_df['review_count'].describe()

In [None]:
# Histogram of review counts across all restaurants.
review_counts = restaurant_df['review_count']
review_counts.plot(kind = 'hist', title = "Review Count")
plt.show()

The distribution of review counts is very skewed, with only a few restaurants receiving a lot of reviews.

In [None]:
# One histogram of review_count per city, laid out as side-by-side columns.
g = sns.FacetGrid(restaurant_df, col='city')
g.map_dataframe(sns.histplot, x = 'review_count')

In [None]:
# Box plot of review counts, one box per city.
sns.catplot(data = restaurant_df, kind = 'box', x = 'city', y = 'review_count')
plt.title("Boxplot of city vs. review count")

In [None]:
# Box plot of review counts per city, with each city split by is_open (hue).
sns.catplot(
    data = restaurant_df,
    kind = 'box',
    x = 'city',
    y = 'review_count',
    hue = 'is_open',
)
plt.title("Boxplot of city vs. review count")

### Review Stars

In [None]:
# Histogram of star ratings on half-star bin edges from 1 to 5, written to dist_stars.png.
star_bins = [1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]
sns.displot(data = restaurant_df, x = 'stars', bins = star_bins)
plt.title('Distribution of review stars')
plt.savefig('dist_stars.png', bbox_inches = 'tight')

In [None]:
# Summary statistics of the star ratings.
restaurant_df['stars'].describe()

In [None]:
# Star-rating histograms overlaid per city as step outlines, on half-star bin edges.
star_bins = [1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]
sns.displot(
    data = restaurant_df,
    x = 'stars',
    hue = 'city',
    bins = star_bins,
    element = 'step',
)

In [None]:
# Star-rating histogram with the open / closed bars drawn side by side in every bin.
star_bins = [1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]
sns.displot(
    data = restaurant_df,
    x = 'stars',
    hue = 'is_open',
    bins = star_bins,
    multiple = 'dodge',
)
plt.title('Distribution of review stars by restaurant survival status')

In [None]:
# One star-rating histogram per is_open value, written to dist_stars_by_open.png.
star_bins = [1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]
sns.displot(data = restaurant_df, x = 'stars', col = 'is_open', bins = star_bins)
plt.savefig('dist_stars_by_open.png')

In [None]:
# Star-rating summary for restaurants with is_open == 1.
restaurant_df.loc[restaurant_df['is_open'] == 1, 'stars'].describe()

In [None]:
# Star-rating summary for restaurants with is_open == 0.
restaurant_df.loc[restaurant_df['is_open'] == 0, 'stars'].describe()

In [None]:
# Star-rating histograms, one panel per city, coloured by is_open.
star_bins = [1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]
sns.displot(restaurant_df, x = 'stars', bins = star_bins, col = 'city', hue = 'is_open')
# plt.title() acts on the current axes only, which here is the last city panel: it
# would replace that panel's "city = ..." label and leave the other panels untitled.
# A figure-level suptitle labels the whole grid; y > 1 keeps it above the panel titles.
plt.suptitle('Distribution of review stars by city and restaurant status', y = 1.05)

### Categories

In [None]:
# Left join: keep every restaurant row and attach its category columns by business_id.
restaurant_category = pd.merge(
    restaurant_df,
    categories_df,
    how = 'left',
    on = 'business_id',
)

In [None]:
# (rows, columns) after the left join; compare with restaurant_df.shape to spot duplicated rows.
restaurant_category.shape

In [None]:
# Peek at the first rows of the joined table.
restaurant_category.head()

In [None]:
# How many rows have a missing cuisine_type? (True = missing, False = present)
restaurant_category.cuisine_type.isna().value_counts()

In [None]:
# Which cuisine types are most common? Bars are ordered by frequency.
cuisine_counts = categories_df['cuisine_type'].value_counts(sort = True)
cuisine_counts.plot.bar()

In [None]:
# Share of each cuisine type among the non-missing values, most common first.
categories_df['cuisine_type'].value_counts(sort = True, normalize = True)

In [None]:
# Count of restaurants per cuisine type, one panel per city.
chart = sns.displot(data = restaurant_category, col = 'city', x = 'cuisine_type')
chart.set_xticklabels(rotation = 90)  # vertical labels so the cuisine names do not overlap

In [None]:
# Mean of is_open (share still open) for each cuisine type.
chart = sns.catplot(
    data = restaurant_category,
    kind = 'bar',
    x = 'cuisine_type',
    y = 'is_open',
)
chart.set_xticklabels(rotation = 90)  # vertical labels so the cuisine names do not overlap

In [None]:
# Mean of is_open (share still open) for each cuisine type, one panel per city.
chart = sns.catplot(
    data = restaurant_category,
    kind = 'bar',
    col = 'city',
    x = 'cuisine_type',
    y = 'is_open',
)
chart.set_xticklabels(rotation = 90)  # vertical labels so the cuisine names do not overlap

In [None]:
# Binary flag derived from hi_transmission_risk: 1.0 where a value is present,
# 0.0 where it is missing.
has_risk_label = restaurant_category['hi_transmission_risk'].notna()
restaurant_category['hi_risk'] = has_risk_label.astype(float)

In [None]:
# Sanity check of the new flag; dropna = False would surface any rows left unassigned.
restaurant_category['hi_risk'].value_counts(dropna = False)

In [None]:
# Mean of is_open (share still open) for each value of the hi_risk flag, which is
# 1 where hi_transmission_risk is present and 0 where it is missing.
sns.catplot(x = 'hi_risk', y = 'is_open', kind = 'bar', data = restaurant_category)
# Title typo fixed: 'tranmission' -> 'transmission'.
plt.title('Proportion of open: high vs. low transmission risk restaurants')

### Food type

In [None]:
# What share of rows have a missing food_type? (True = missing, False = present)
restaurant_category.food_type.isna().value_counts(normalize = True)

In [None]:
# Which food types are most common? Bars are ordered by frequency.
food_counts = categories_df['food_type'].value_counts(sort = True)
food_counts.plot.bar()

In [None]:
# Share of each food type among the non-missing values, most common first.
categories_df['food_type'].value_counts(sort = True, normalize = True)

In [None]:
# NOTE(review): scratch arithmetic with hard-coded operands; presumably the combined
# share (in %) of the top three food types from the cell above — confirm, or compute
# it from the data instead of typing the numbers in.
13+12+12

### Atmosphere type

In [None]:
# What share of rows have a missing atmosphere_type? (True = missing, False = present)
restaurant_category.atmosphere_type.isna().value_counts(normalize = True)

In [None]:
# What are the most popular atmosphere types? Bars are ordered by frequency.
categories_df.atmosphere_type.value_counts(sort = True).plot(kind = 'bar')

# Maps of 'is_open' 

In [None]:
# Map with folium's default settings; folium_map is re-created with a city-specific
# location in each of the map cells below.
folium_map = folium.Map()

In [None]:
# Coordinates of open (is_open == 1) and closed (is_open == 0) restaurants, all cities together.
coord_cols = ["latitude", "longitude"]
open_mask = restaurant_df['is_open'] == 1
closed_mask = restaurant_df['is_open'] == 0

lat_log_open_yes = restaurant_df.loc[open_mask, coord_cols]
lat_log_open_no = restaurant_df.loc[closed_mask, coord_cols]

#### Atlanta

In [None]:
# Plain base map centred on Atlanta (no markers); shown to check the centre and zoom level.
atlanta_coords = [33.749,-84.388] # lat, long
folium_map = folium.Map(location=atlanta_coords, zoom_start=12)
folium_map

In [None]:
# Atlanta view: one circle per restaurant, blue = open, red = closed.
# NOTE(review): lat_log_open_yes / lat_log_open_no are filtered on is_open only, not on
# city (unlike the Austin cell), so markers for every city are added to this map —
# confirm whether that is intended.
atlanta_coords = [33.749, -84.388]  # lat, long

folium_map = folium.Map(
    location=atlanta_coords,
    zoom_start=12,
    tiles="CartoDB positron",
)

# Open restaurants are drawn first, closed ones on top of them.
for marker_color, coords in (("blue", lat_log_open_yes), ("red", lat_log_open_no)):
    for lat, lon in zip(coords["latitude"].values, coords["longitude"].values):
        marker = folium.CircleMarker(
            location=[lat, lon],
            radius=5,
            color=marker_color,
            weight=1.0,
            fill=True,
        )
        marker.add_to(folium_map)

folium_map

#### Portland

In [None]:
# Plain base map centred on Portland (no markers); shown to check the centre and zoom level.
portland_coords = [45.515,-122.678] # lat, long
folium_map = folium.Map(location=portland_coords, zoom_start=13)
folium_map

In [None]:
# Portland view: one circle per restaurant, blue = open, red = closed.
# NOTE(review): lat_log_open_yes / lat_log_open_no are filtered on is_open only, not on
# city (unlike the Austin cell), so markers for every city are added to this map —
# confirm whether that is intended.
folium_map = folium.Map(
    location=portland_coords,
    zoom_start=12,
    tiles="CartoDB positron",
)

# Open restaurants are drawn first, closed ones on top of them.
for marker_color, coords in (("blue", lat_log_open_yes), ("red", lat_log_open_no)):
    for lat, lon in zip(coords["latitude"].values, coords["longitude"].values):
        marker = folium.CircleMarker(
            location=[lat, lon],
            radius=5,
            weight=1.0,
            color=marker_color,
            fill=True,
        )
        marker.add_to(folium_map)

folium_map

#### Boston

In [None]:
# Plain base map centred on Boston (no markers); shown to check the centre and zoom level.
boston_coords = [42.360,-71.059] # lat, long
folium_map = folium.Map(location=boston_coords, zoom_start=12)
folium_map

In [None]:
# Boston view: one circle per restaurant, blue = open, red = closed.
# NOTE(review): lat_log_open_yes / lat_log_open_no are filtered on is_open only, not on
# city (unlike the Austin cell), so markers for every city are added to this map —
# confirm whether that is intended.
folium_map = folium.Map(
    location=boston_coords,
    zoom_start=13,
    tiles="CartoDB positron",
)

# Open restaurants are drawn first, closed ones on top of them.
for marker_color, coords in (("blue", lat_log_open_yes), ("red", lat_log_open_no)):
    for lat, lon in zip(coords["latitude"].values, coords["longitude"].values):
        marker = folium.CircleMarker(
            location=[lat, lon],
            radius=5,
            weight=1.0,
            color=marker_color,
            fill=True,
        )
        marker.add_to(folium_map)

folium_map

#### Austin

In [None]:
# Plain base map centred on Austin (no markers); shown to check the centre and zoom level.
austin_coords = [30.267,-97.743] # lat, long
folium_map = folium.Map(location=austin_coords, zoom_start=12)
folium_map

In [None]:
# Size of the all-city open-restaurant coordinate table, for comparison with the Austin-only subset below.
lat_log_open_yes.shape

In [None]:
# Boolean row masks: Austin restaurants that are open / closed.
in_austin = restaurant_df['city'] == 'AUSTIN'
austin_open = in_austin & (restaurant_df['is_open'] == 1)
austin_close = in_austin & (restaurant_df['is_open'] == 0)

In [None]:
# Coordinates of the Austin restaurants, split into open and closed.
coord_cols = ["latitude", "longitude"]
austin_lat_log_open_yes = restaurant_df.loc[austin_open, coord_cols]
austin_lat_log_open_no = restaurant_df.loc[austin_close, coord_cols]

In [None]:
# Number of open Austin restaurants (rows) and coordinate columns.
austin_lat_log_open_yes.shape

In [None]:
# Austin view: one circle per Austin restaurant, blue = open, red = closed.
folium_map = folium.Map(
    location=austin_coords,
    zoom_start=12,
    tiles="CartoDB positron",
)

# Open restaurants are drawn first, closed ones on top of them.
for marker_color, coords in (("blue", austin_lat_log_open_yes), ("red", austin_lat_log_open_no)):
    for lat, lon in zip(coords["latitude"].values, coords["longitude"].values):
        marker = folium.CircleMarker(
            location=[lat, lon],
            radius=5,
            weight=1.0,
            color=marker_color,
            fill=True,
        )
        marker.add_to(folium_map)

folium_map

#### Orlando

In [None]:
# Plain base map centred on Orlando (no markers); shown to check the centre and zoom level.
orlando_coords = [28.538,-81.379] # lat, long
folium_map = folium.Map(location=orlando_coords, zoom_start=12)
folium_map

In [None]:
# Orlando view: one circle per restaurant, blue = open, red = closed.
# NOTE(review): lat_log_open_yes / lat_log_open_no are filtered on is_open only, not on
# city (unlike the Austin cell), so markers for every city are added to this map —
# confirm whether that is intended.
folium_map = folium.Map(
    location=orlando_coords,
    zoom_start=12,
    tiles="CartoDB positron",
)

# Open restaurants are drawn first, closed ones on top of them.
for marker_color, coords in (("blue", lat_log_open_yes), ("red", lat_log_open_no)):
    for lat, lon in zip(coords["latitude"].values, coords["longitude"].values):
        marker = folium.CircleMarker(
            location=[lat, lon],
            radius=5,
            weight=1.0,
            color=marker_color,
            fill=True,
        )
        marker.add_to(folium_map)

folium_map

# Map of star reviews

In [None]:
# Heat-map input: one (latitude, longitude, stars) tuple per restaurant, all cities together.
heat_columns = ('latitude', 'longitude', 'stars')
my_zip = zip(*(restaurant_df[column] for column in heat_columns))
list_of_my_zip = [point for point in my_zip]
list_of_my_zip[:5]

#### Atlanta

In [None]:
# Heat map of the (latitude, longitude, stars) points, viewed over Atlanta.
heat_options = {"min_opacity": 0.2, "radius": 8, "blur": 6}

folium_hmap = folium.Map(tiles="CartoDB positron", location=atlanta_coords, zoom_start=13)
hm_layer = HeatMap(list_of_my_zip, **heat_options)
hm_layer.add_to(folium_hmap)

folium_hmap

#### Portland

In [None]:
# Heat map of the (latitude, longitude, stars) points, viewed over Portland.
heat_options = {"min_opacity": 0.2, "radius": 8, "blur": 6}

folium_hmap = folium.Map(tiles="CartoDB positron", location=portland_coords, zoom_start=13)
hm_layer = HeatMap(list_of_my_zip, **heat_options)
hm_layer.add_to(folium_hmap)

folium_hmap

#### Boston

In [None]:
# Heat map of the (latitude, longitude, stars) points, viewed over Boston.
heat_options = {"min_opacity": 0.2, "radius": 8, "blur": 6}

folium_hmap = folium.Map(tiles="CartoDB positron", location=boston_coords, zoom_start=13)
hm_layer = HeatMap(list_of_my_zip, **heat_options)
hm_layer.add_to(folium_hmap)

folium_hmap

#### Austin

In [None]:
# Heat map of the (latitude, longitude, stars) points, viewed over Austin.
heat_options = {"min_opacity": 0.2, "radius": 8, "blur": 6}

folium_hmap = folium.Map(tiles="CartoDB positron", location=austin_coords, zoom_start=13)
hm_layer = HeatMap(list_of_my_zip, **heat_options)
hm_layer.add_to(folium_hmap)

folium_hmap

#### Orlando

In [None]:
# Heat map of the (latitude, longitude, stars) points, viewed over Orlando.
heat_options = {"min_opacity": 0.2, "radius": 8, "blur": 6}

folium_hmap = folium.Map(tiles="CartoDB positron", location=orlando_coords, zoom_start=13)
hm_layer = HeatMap(list_of_my_zip, **heat_options)
hm_layer.add_to(folium_hmap)

folium_hmap