# About

This notebook cleans and wrangles `yelp_team7_dataset_restaurant.json`.

In [2]:
import json
import pandas as pd

# Load the restaurant dataset

In [3]:
# Read the restaurant JSON file (path is relative to the notebook's working directory)
# into a DataFrame.
# NOTE(review): the head() preview further down shows Boston postal codes as 2215 / 2128,
# i.e. without a leading zero — confirm whether that was lost upstream or needs
# restoring during cleaning.
restaurant_df = pd.read_json('yelp_team7_dataset_restaurant.json')

In [4]:
# (number of rows, number of columns) of the loaded frame.
restaurant_df.shape

(20693, 14)

In [5]:
# Preview the first rows to see what each column holds.
restaurant_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ..."
1,ufCxltuh56FF4-ZFZ6cVhg,Sister Honey's,247 E Michigan St,Orlando,FL,32806,28.513265,-81.374707,4.5,135,1,"{'BusinessParking': '{'garage': False, 'street...","Restaurants, American (New), Bakeries, Dessert...","{'Tuesday': '11:0-18:0', 'Wednesday': '11:0-18..."
2,hcRxdDg7DYryCxCoI8ySQA,Longwood Galleria,340-350 Longwood Ave,Boston,MA,2215,42.338544,-71.106842,2.5,24,1,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Restaurants, Shopping, Shopping Centers","{'Monday': '6:30-22:0', 'Tuesday': '6:30-22:0'..."
3,jGennaZUr2MsJyRhijNBfA,Legal Sea Foods,1 Harborside Dr,Boston,MA,2128,42.363442,-71.025781,3.5,856,1,"{'NoiseLevel': 'u'average'', 'BikeParking': 'F...","Sandwiches, Food, Restaurants, Breakfast & Bru...","{'Monday': '6:0-21:0', 'Tuesday': '6:0-21:0', ..."
4,iPD8BBvea6YldQZPHzVrSQ,Espresso Minute,334 Mass Ave,Boston,MA,2115,42.342673,-71.084239,4.5,7,0,"{'NoiseLevel': ''quiet'', 'GoodForKids': 'True...","Creperies, Restaurants, Food, Coffee & Tea, Br...","{'Tuesday': '8:0-20:0', 'Wednesday': '8:0-20:0..."


In [7]:
# Column dtypes and non-null counts. In the recorded run, `attributes` and `hours`
# are the only columns with fewer non-null values than rows.
restaurant_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20693 entries, 0 to 20692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   business_id   20693 non-null  object 
 1   name          20693 non-null  object 
 2   address       20693 non-null  object 
 3   city          20693 non-null  object 
 4   state         20693 non-null  object 
 5   postal_code   20693 non-null  object 
 6   latitude      20693 non-null  float64
 7   longitude     20693 non-null  float64
 8   stars         20693 non-null  float64
 9   review_count  20693 non-null  int64  
 10  is_open       20693 non-null  int64  
 11  attributes    20591 non-null  object 
 12  categories    20693 non-null  object 
 13  hours         17500 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 2.4+ MB


# Wrangle *categories* column

In [12]:
# Keep only the `categories` column, still as a one-column DataFrame.
categories_df = restaurant_df.loc[:, ['categories']]
categories_df

Unnamed: 0,categories
0,"Salad, Soup, Sandwiches, Delis, Restaurants, C..."
1,"Restaurants, American (New), Bakeries, Dessert..."
2,"Restaurants, Shopping, Shopping Centers"
3,"Sandwiches, Food, Restaurants, Breakfast & Bru..."
4,"Creperies, Restaurants, Food, Coffee & Tea, Br..."
...,...
20688,"Italian, Restaurants"
20689,"Japanese, Sushi Bars, Restaurants"
20690,"Event Planning & Services, Steakhouses, Seafoo..."
20691,"Restaurants, Noodles, Salad, Vietnamese, Veget..."


In [11]:
# The frame display truncates long strings with "...", so pull out the
# full categories string of the first row by position.
categories_df.iloc[0,0]

'Salad, Soup, Sandwiches, Delis, Restaurants, Cafes, Vegetarian'

In [15]:
# Full categories string of the second row (position 1).
categories_df.iloc[1,0]

'Restaurants, American (New), Bakeries, Desserts, Food, Cupcakes'

In [16]:
# Full categories string of the third row (position 2).
categories_df.iloc[2,0]

'Restaurants, Shopping, Shopping Centers'

In [17]:
# Full categories string of the fourth row (position 3).
categories_df.iloc[3,0]

'Sandwiches, Food, Restaurants, Breakfast & Brunch, Seafood, Italian, Beer, Wine & Spirits, Cocktail Bars, Gluten-Free, Nightlife, Bars, Salad'

In [19]:
# Take the first 11 rows as a small sample for prototyping the category parsing.
tmp_df = categories_df.head(11)
tmp_df

Unnamed: 0,categories
0,"Salad, Soup, Sandwiches, Delis, Restaurants, C..."
1,"Restaurants, American (New), Bakeries, Dessert..."
2,"Restaurants, Shopping, Shopping Centers"
3,"Sandwiches, Food, Restaurants, Breakfast & Bru..."
4,"Creperies, Restaurants, Food, Coffee & Tea, Br..."
5,"Nightlife, Sandwiches, Seafood, Restaurants"
6,"Food Trucks, Restaurants, Specialty Food, Food..."
7,"Bars, Nightlife, Cocktail Bars, Seafood, Resta..."
8,"Vegetarian, Vegan, Sandwiches, Soup, American ..."
9,"Italian, Restaurants"


In [22]:
# Select the column as a Series (rather than a one-column DataFrame) so it can be iterated directly.
tmp_df['categories']

0     Salad, Soup, Sandwiches, Delis, Restaurants, C...
1     Restaurants, American (New), Bakeries, Dessert...
2               Restaurants, Shopping, Shopping Centers
3     Sandwiches, Food, Restaurants, Breakfast & Bru...
4     Creperies, Restaurants, Food, Coffee & Tea, Br...
5           Nightlife, Sandwiches, Seafood, Restaurants
6     Food Trucks, Restaurants, Specialty Food, Food...
7     Bars, Nightlife, Cocktail Bars, Seafood, Resta...
8     Vegetarian, Vegan, Sandwiches, Soup, American ...
9                                  Italian, Restaurants
10                      Fast Food, Restaurants, Burgers
Name: categories, dtype: object

## Create a full list of categories

### A mini pilot: make it a list

In [25]:
# Print each sample row's position next to its full categories string.
for position, categories_text in enumerate(tmp_df['categories']):
    print(position, categories_text)

0 Salad, Soup, Sandwiches, Delis, Restaurants, Cafes, Vegetarian
1 Restaurants, American (New), Bakeries, Desserts, Food, Cupcakes
2 Restaurants, Shopping, Shopping Centers
3 Sandwiches, Food, Restaurants, Breakfast & Brunch, Seafood, Italian, Beer, Wine & Spirits, Cocktail Bars, Gluten-Free, Nightlife, Bars, Salad
4 Creperies, Restaurants, Food, Coffee & Tea, Breakfast & Brunch
5 Nightlife, Sandwiches, Seafood, Restaurants
6 Food Trucks, Restaurants, Specialty Food, Food, Mexican, Ethnic Food
7 Bars, Nightlife, Cocktail Bars, Seafood, Restaurants, Sushi Bars
8 Vegetarian, Vegan, Sandwiches, Soup, American (New), Fast Food, Restaurants, Wraps, American (Traditional)
9 Italian, Restaurants
10 Fast Food, Restaurants, Burgers


In [27]:
# Split each categories string on commas and show the raw pieces.
# Every piece after the first keeps its leading space, so the pieces
# need stripping before they can be compared or counted.
for categories_text in tmp_df['categories']:
    print(categories_text.split(','))

['Salad', ' Soup', ' Sandwiches', ' Delis', ' Restaurants', ' Cafes', ' Vegetarian']
['Restaurants', ' American (New)', ' Bakeries', ' Desserts', ' Food', ' Cupcakes']
['Restaurants', ' Shopping', ' Shopping Centers']
['Sandwiches', ' Food', ' Restaurants', ' Breakfast & Brunch', ' Seafood', ' Italian', ' Beer', ' Wine & Spirits', ' Cocktail Bars', ' Gluten-Free', ' Nightlife', ' Bars', ' Salad']
['Creperies', ' Restaurants', ' Food', ' Coffee & Tea', ' Breakfast & Brunch']
['Nightlife', ' Sandwiches', ' Seafood', ' Restaurants']
['Food Trucks', ' Restaurants', ' Specialty Food', ' Food', ' Mexican', ' Ethnic Food']
['Bars', ' Nightlife', ' Cocktail Bars', ' Seafood', ' Restaurants', ' Sushi Bars']
['Vegetarian', ' Vegan', ' Sandwiches', ' Soup', ' American (New)', ' Fast Food', ' Restaurants', ' Wraps', ' American (Traditional)']
['Italian', ' Restaurants']
['Fast Food', ' Restaurants', ' Burgers']


In [28]:
# Look up the categories string stored under index label 0 of the sample.
tmp_df['categories'][0]

'Salad, Soup, Sandwiches, Delis, Restaurants, Cafes, Vegetarian'

In [31]:
# Try out the negated membership test used later for de-duplication.
# NOTE: against a plain string this is a *substring* test, so it would also
# match e.g. 'Soups'; the later cells test against split-and-stripped tags instead.
'Soup' not in tmp_df['categories'][0]

False

In [35]:
# Collect every distinct category tag in the sample, in first-seen order.
# A dict is used as an insertion-ordered set: membership checks are O(1),
# whereas `x in some_list` rescans the list for every tag.
seen_categories = {}

for item in tmp_df['categories']:
    for element in item.split(','):
        # Pieces after the first carry a leading space from the ', ' separator.
        seen_categories.setdefault(element.strip(), None)

# Drop the 'Restaurants' tag from the result. pop() with a default is a no-op
# when the tag is absent, whereas list.remove() would raise ValueError.
seen_categories.pop('Restaurants', None)

tmp_categories_all = list(seen_categories)

print(tmp_categories_all)

['Salad', 'Soup', 'Sandwiches', 'Delis', 'Cafes', 'Vegetarian', 'American (New)', 'Bakeries', 'Desserts', 'Food', 'Cupcakes', 'Shopping', 'Shopping Centers', 'Breakfast & Brunch', 'Seafood', 'Italian', 'Beer', 'Wine & Spirits', 'Cocktail Bars', 'Gluten-Free', 'Nightlife', 'Bars', 'Creperies', 'Coffee & Tea', 'Food Trucks', 'Specialty Food', 'Mexican', 'Ethnic Food', 'Sushi Bars', 'Vegan', 'Fast Food', 'Wraps', 'American (Traditional)', 'Burgers']


### A mini pilot: make it a dictionary

In [37]:
# Count how often each category tag occurs across the sample rows.
tmp_dict = {}

for categories_text in tmp_df['categories']:
    for raw_tag in categories_text.split(','):
        tag = raw_tag.strip()
        # get() supplies 0 for a tag seen for the first time.
        tmp_dict[tag] = tmp_dict.get(tag, 0) + 1

print(tmp_dict)

{'Salad': 2, 'Soup': 2, 'Sandwiches': 4, 'Delis': 1, 'Restaurants': 11, 'Cafes': 1, 'Vegetarian': 2, 'American (New)': 2, 'Bakeries': 1, 'Desserts': 1, 'Food': 4, 'Cupcakes': 1, 'Shopping': 1, 'Shopping Centers': 1, 'Breakfast & Brunch': 2, 'Seafood': 3, 'Italian': 2, 'Beer': 1, 'Wine & Spirits': 1, 'Cocktail Bars': 2, 'Gluten-Free': 1, 'Nightlife': 3, 'Bars': 2, 'Creperies': 1, 'Coffee & Tea': 1, 'Food Trucks': 1, 'Specialty Food': 1, 'Mexican': 1, 'Ethnic Food': 1, 'Sushi Bars': 1, 'Vegan': 1, 'Fast Food': 2, 'Wraps': 1, 'American (Traditional)': 1, 'Burgers': 1}


In [39]:
# Tally category tags over the sample: first flatten all rows into one
# stream of stripped tags, then count each tag in a single pass.
tmp_dict = {}

stripped_tags = (
    piece.strip()
    for row_text in tmp_df['categories']
    for piece in row_text.split(',')
)

for tag in stripped_tags:
    if tag in tmp_dict:
        tmp_dict[tag] += 1
    else:
        tmp_dict[tag] = 1

print(tmp_dict)

{'Salad': 2, 'Soup': 2, 'Sandwiches': 4, 'Delis': 1, 'Restaurants': 11, 'Cafes': 1, 'Vegetarian': 2, 'American (New)': 2, 'Bakeries': 1, 'Desserts': 1, 'Food': 4, 'Cupcakes': 1, 'Shopping': 1, 'Shopping Centers': 1, 'Breakfast & Brunch': 2, 'Seafood': 3, 'Italian': 2, 'Beer': 1, 'Wine & Spirits': 1, 'Cocktail Bars': 2, 'Gluten-Free': 1, 'Nightlife': 3, 'Bars': 2, 'Creperies': 1, 'Coffee & Tea': 1, 'Food Trucks': 1, 'Specialty Food': 1, 'Mexican': 1, 'Ethnic Food': 1, 'Sushi Bars': 1, 'Vegan': 1, 'Fast Food': 2, 'Wraps': 1, 'American (Traditional)': 1, 'Burgers': 1}


Scribble:
* Use 'Restaurants' as a sanity check
* See which businesses tag themselves as 'Food', 'Shopping', or 'Shopping Centers'
* Consider removing businesses that tag themselves as 'Food Trucks'