# About
This notebook cleans the `categories` column in `yelp_team7_dataset_restaurant.json`: it tallies the distinct categories and maps each restaurant to a cuisine type.

In [1]:
import json
import pandas as pd

# Load restaurant dataset

In [2]:
# read the restaurant records from the JSON dataset into a DataFrame
restaurant_df = pd.read_json('yelp_team7_dataset_restaurant.json')

In [3]:
restaurant_df

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,PORTLAND,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ..."
1,ufCxltuh56FF4-ZFZ6cVhg,Sister Honey's,247 E Michigan St,ORLANDO,FL,32806,28.513265,-81.374707,4.5,135,1,"{'BusinessParking': '{'garage': False, 'street...","Restaurants, American (New), Bakeries, Dessert...","{'Tuesday': '11:0-18:0', 'Wednesday': '11:0-18..."
2,jGennaZUr2MsJyRhijNBfA,Legal Sea Foods,1 Harborside Dr,BOSTON,MA,02128,42.363442,-71.025781,3.5,856,1,"{'NoiseLevel': 'u'average'', 'BikeParking': 'F...","Sandwiches, Food, Restaurants, Breakfast & Bru...","{'Monday': '6:0-21:0', 'Tuesday': '6:0-21:0', ..."
3,iPD8BBvea6YldQZPHzVrSQ,Espresso Minute,334 Mass Ave,BOSTON,MA,02115,42.342673,-71.084239,4.5,7,0,"{'NoiseLevel': ''quiet'', 'GoodForKids': 'True...","Creperies, Restaurants, Food, Coffee & Tea, Br...","{'Tuesday': '8:0-20:0', 'Wednesday': '8:0-20:0..."
4,jx91IMdGOmLOo8h_F9z39g,Cleary's Restaurant & Spirits,12429 NE Glisan St,PORTLAND,OR,97230,45.526473,-122.535323,3.5,19,1,"{'RestaurantsGoodForGroups': 'True', 'Alcohol'...","Nightlife, Sandwiches, Seafood, Restaurants","{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14965,m5eUPVD0Hu39Ff-Uqe-FLA,The Italian Joint,3145 SE Hawthorne Blvd,PORTLAND,OR,97203,45.512196,-122.632865,3.5,20,0,"{'RestaurantsGoodForGroups': 'False', 'GoodFor...","Italian, Restaurants",
14966,87f7kR7nTz8WHnmtLM_S6w,O Ya,9 East St Pl,BOSTON,MA,02111,42.351408,-71.056867,4.5,737,1,"{'RestaurantsPriceRange2': '4', 'RestaurantsGo...","Japanese, Sushi Bars, Restaurants","{'Monday': '0:0-0:0', 'Tuesday': '17:0-21:30',..."
14967,jYgqSazE0gUyI7qq086Dzw,Chart House,5700 SW Terwilliger,PORTLAND,OR,97239,45.483154,-122.682748,3.5,457,1,"{'BusinessAcceptsCreditCards': 'True', 'Outdoo...","Event Planning & Services, Steakhouses, Seafoo...","{'Tuesday': '16:0-21:0', 'Wednesday': '16:0-21..."
14968,r5Uag1JqYjr2nbxQCVqm8A,Saigon Noodle & Grill,101 N Bumby Ave,ORLANDO,FL,32803,28.544430,-81.351606,4.5,437,1,"{'Alcohol': 'u'beer_and_wine'', 'RestaurantsAt...","Restaurants, Noodles, Salad, Vietnamese, Veget...","{'Monday': '0:0-0:0', 'Tuesday': '10:30-21:30'..."


In [4]:
restaurant_df.shape

(14970, 14)

# Identify categories

In [5]:
# single-column frame holding only the raw comma-separated categories
categories_df = restaurant_df.loc[:, ['categories']]
categories_df

Unnamed: 0,categories
0,"Salad, Soup, Sandwiches, Delis, Restaurants, C..."
1,"Restaurants, American (New), Bakeries, Dessert..."
2,"Sandwiches, Food, Restaurants, Breakfast & Bru..."
3,"Creperies, Restaurants, Food, Coffee & Tea, Br..."
4,"Nightlife, Sandwiches, Seafood, Restaurants"
...,...
14965,"Italian, Restaurants"
14966,"Japanese, Sushi Bars, Restaurants"
14967,"Event Planning & Services, Steakhouses, Seafoo..."
14968,"Restaurants, Noodles, Salad, Vietnamese, Veget..."


In [6]:
def count_categories(categories):
    """Count how often each category name occurs.

    Parameters
    ----------
    categories : iterable
        One comma-separated category string per restaurant, e.g.
        ``'Italian, Restaurants'``. Entries that are not strings (such as
        ``None``/``NaN`` for a missing value) are skipped.

    Returns
    -------
    dict
        Maps each category name (surrounding whitespace stripped) to the
        number of times it occurs, keyed in order of first appearance.
    """
    counts = {}
    for entry in categories:
        if not isinstance(entry, str):
            # a missing value has no .split(); nothing to count
            continue
        for raw_name in entry.split(','):
            name = raw_name.strip()
            counts[name] = counts.get(name, 0) + 1
    return counts


# full tally of categories across all restaurants
categories_dict = count_categories(restaurant_df['categories'])

In [7]:
# Tabulate the tally: one row per category with its count.
# Naming the columns at construction keeps this a single step and still yields
# a well-formed (empty) two-column frame when the tally is empty.
categories_agg = pd.DataFrame(
    list(categories_dict.items()),
    columns=['category', 'number_of_restaurants'],
)
categories_agg.head()

Unnamed: 0,category,number_of_restaurants
0,Salad,882
1,Soup,317
2,Sandwiches,2102
3,Delis,425
4,Restaurants,14970


In [8]:
# rank categories from most to least frequent, then write the ranking to CSV
categories_agg_sorted = categories_agg.sort_values('number_of_restaurants', ascending=False)
categories_agg_sorted.to_csv('categories_agg.csv', index=False)

# Map categories: cuisine type

In [12]:
# load mapping csv for cuisine type
# (columns used below: `category` and `cuisine type`)
mapping_cuisine_df = pd.read_csv('categories_mapping_cuisine_type.csv')

In [10]:
mapping_cuisine_df.head()

Unnamed: 0,category,cuisine type
0,American (Traditional),American
1,American (New),American
2,Mexican,Mexican
3,Italian,Italian
4,Chinese,Chinese


In [13]:
mapping_cuisine_df.shape

(90, 2)

In [16]:
mapping_cuisine_df['category']

0     American (Traditional)
1             American (New)
2                    Mexican
3                    Italian
4                    Chinese
               ...          
85                   Iberian
86                Nicaraguan
87                  Scottish
88                  Sicilian
89                  Georgian
Name: category, Length: 90, dtype: object

In [68]:
# lookup table: category name -> cuisine type
cuisine_dict = mapping_cuisine_df.set_index('category')['cuisine type'].to_dict()

In [69]:
cuisine_dict['American (New)']

'American'

In [23]:
# category names that count as a cuisine type, in mapping-file order
categories_cuisine = mapping_cuisine_df['category'].tolist()
categories_cuisine[:9]

['American (Traditional)',
 'American (New)',
 'Mexican',
 'Italian',
 'Chinese',
 'Japanese',
 'Asian Fusion',
 'Mediterranean',
 'Thai']

In [18]:
restaurant_df['categories']

0        Salad, Soup, Sandwiches, Delis, Restaurants, C...
1        Restaurants, American (New), Bakeries, Dessert...
2        Sandwiches, Food, Restaurants, Breakfast & Bru...
3        Creperies, Restaurants, Food, Coffee & Tea, Br...
4              Nightlife, Sandwiches, Seafood, Restaurants
                               ...                        
14965                                 Italian, Restaurants
14966                    Japanese, Sushi Bars, Restaurants
14967    Event Planning & Services, Steakhouses, Seafoo...
14968    Restaurants, Noodles, Salad, Vietnamese, Veget...
14969                 Restaurants, Mexican, Latin American
Name: categories, Length: 14970, dtype: object

In [21]:
case_1 = restaurant_df['categories'].iloc[1]
case_1

'Restaurants, American (New), Bakeries, Desserts, Food, Cupcakes'

In [24]:
'American (New)' in categories_cuisine

True

In [46]:
def check_category_type(item, reference_list):
    """Tell whether a restaurant lists at least one category of a given type.

    Parameters
    ----------
    item : str
        Comma-separated categories of one restaurant, e.g.
        ``'Restaurants, American (New), Bakeries'``.
    reference_list : collection of str
        Category names that belong to the type of interest.

    Returns
    -------
    bool
        ``True`` if any category, after stripping surrounding whitespace, is
        in ``reference_list``; ``False`` otherwise, including when ``item`` is
        not a string (e.g. ``None``/``NaN`` for a missing value).
    """
    if not isinstance(item, str):
        # a missing categories value has nothing to match (and no .split)
        return False
    return any(name.strip() in reference_list for name in item.split(','))

In [42]:
check_category_type(case_1, categories_cuisine)

item: Restaurants ; status: False
item: American (New) ; status: True


True

In [83]:
case_2 = restaurant_df['categories'].iloc[0]
case_2

'Salad, Soup, Sandwiches, Delis, Restaurants, Cafes, Vegetarian'

In [43]:
check_category_type(case_2, categories_cuisine)

item: Salad ; status: False
item: Soup ; status: False
item: Sandwiches ; status: False
item: Delis ; status: False
item: Restaurants ; status: False
item: Cafes ; status: False
item: Vegetarian ; status: False


False

In [84]:
# NOTE(review): map_category_type is only defined in a later cell, so on a
# fresh kernel (Restart & Run All) this call raises NameError; move this cell
# below that definition or run it afterwards.
map_category_type(case_2, reference_list = categories_cuisine, reference_dict = cuisine_dict)

In [32]:
case_3 = restaurant_df['categories'].iloc[8]
case_3

'Coffee & Tea, Tex-Mex, Restaurants, Mexican, Food'

In [44]:
check_category_type(case_3, categories_cuisine)

item: Coffee & Tea ; status: False
item: Tex-Mex ; status: True


True

In [47]:
# boolean mask: True where a restaurant lists at least one cuisine-type category
condition = restaurant_df['categories'].apply(
    lambda categories: check_category_type(categories, categories_cuisine)
)

In [49]:
condition.value_counts()

True     10637
False     4333
Name: categories, dtype: int64

In [54]:
# keep only the restaurants flagged by the mask, with the two columns of interest
cuisine_df = restaurant_df.loc[condition, ['business_id', 'categories']]

In [55]:
cuisine_df.shape

(10637, 2)

In [56]:
cuisine_df.head()

Unnamed: 0,business_id,categories
1,ufCxltuh56FF4-ZFZ6cVhg,"Restaurants, American (New), Bakeries, Dessert..."
2,jGennaZUr2MsJyRhijNBfA,"Sandwiches, Food, Restaurants, Breakfast & Bru..."
6,w4qVflIAbdklzG3mnKmQsg,"Italian, Restaurants"
8,z-0oY7VxQMQw3JHvdPejrA,"Coffee & Tea, Tex-Mex, Restaurants, Mexican, Food"
12,TbZDLpBOl-EbO2LfMySrEg,"Restaurants, Chinese"


In [57]:
cuisine_df['categories']

1        Restaurants, American (New), Bakeries, Dessert...
2        Sandwiches, Food, Restaurants, Breakfast & Bru...
6                                     Italian, Restaurants
8        Coffee & Tea, Tex-Mex, Restaurants, Mexican, Food
12                                    Restaurants, Chinese
                               ...                        
14965                                 Italian, Restaurants
14966                    Japanese, Sushi Bars, Restaurants
14967    Event Planning & Services, Steakhouses, Seafoo...
14968    Restaurants, Noodles, Salad, Vietnamese, Veget...
14969                 Restaurants, Mexican, Latin American
Name: categories, Length: 10637, dtype: object

In [59]:
case_1 = cuisine_df['categories'].iloc[0]
case_1

'Restaurants, American (New), Bakeries, Desserts, Food, Cupcakes'

In [81]:
def map_category_type(item, reference_list, reference_dict):
    """Map a restaurant's categories to a single higher-level type.

    The categories are scanned left to right and the first one found in
    ``reference_list`` decides the result, so a restaurant listing several
    matching categories is labelled by the earliest one only.

    Parameters
    ----------
    item : str
        Comma-separated categories of one restaurant, e.g.
        ``'Restaurants, Nightlife, Mexican, Bars'``.
    reference_list : collection of str
        Category names that are eligible for mapping.
    reference_dict : dict
        Maps each eligible category name to its higher-level type.

    Returns
    -------
    object or None
        The ``reference_dict`` value of the first matching category, or
        ``None`` when no category matches or ``item`` is not a string
        (e.g. ``None``/``NaN`` for a missing value).

    Raises
    ------
    KeyError
        If a category is in ``reference_list`` but has no entry in
        ``reference_dict``.
    """
    if not isinstance(item, str):
        # a missing categories value cannot be split; treat it as "no match"
        return None
    for raw_name in item.split(','):
        name = raw_name.strip()
        if name in reference_list:
            return reference_dict[name]
    return None

In [82]:
map_category_type(case_1, reference_list = categories_cuisine, reference_dict = cuisine_dict)

'American'

In [73]:
case_2 = cuisine_df['categories'].iloc[7]
case_2

'Restaurants, Southern'

In [74]:
map_category_type(case_2, reference_list = categories_cuisine, reference_dict = cuisine_dict)

'American'

In [76]:
case_3 = cuisine_df['categories'].iloc[9]
case_3

'Restaurants, Nightlife, Mexican, Bars, Sports Bars, Tex-Mex'

In [77]:
map_category_type(case_3, reference_list = categories_cuisine, reference_dict = cuisine_dict)

'Mexican'

In [89]:
# cuisine type per restaurant (None where no category maps to a cuisine)
cuisine_mapped = restaurant_df['categories'].apply(
    lambda categories: map_category_type(categories, categories_cuisine, cuisine_dict)
)
cuisine_mapped

0              None
1          American
2           Italian
3              None
4              None
            ...    
14965       Italian
14966      Japanese
14967      American
14968    Vietnamese
14969       Mexican
Name: categories, Length: 14970, dtype: object

In [90]:
# approach 1: map directly to the restaurant df
# Adds (or overwrites) the `cuisine_type` column on restaurant_df in place.
# Only rows where `condition` is True are assigned; pandas matches
# `cuisine_mapped` to those rows by index label, and the remaining rows are
# left without a cuisine_type.
restaurant_df.loc[condition, 'cuisine_type'] = cuisine_mapped

In [91]:
restaurant_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,cuisine_type
0,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,PORTLAND,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...",
1,ufCxltuh56FF4-ZFZ6cVhg,Sister Honey's,247 E Michigan St,ORLANDO,FL,32806,28.513265,-81.374707,4.5,135,1,"{'BusinessParking': '{'garage': False, 'street...","Restaurants, American (New), Bakeries, Dessert...","{'Tuesday': '11:0-18:0', 'Wednesday': '11:0-18...",American
2,jGennaZUr2MsJyRhijNBfA,Legal Sea Foods,1 Harborside Dr,BOSTON,MA,2128,42.363442,-71.025781,3.5,856,1,"{'NoiseLevel': 'u'average'', 'BikeParking': 'F...","Sandwiches, Food, Restaurants, Breakfast & Bru...","{'Monday': '6:0-21:0', 'Tuesday': '6:0-21:0', ...",Italian
3,iPD8BBvea6YldQZPHzVrSQ,Espresso Minute,334 Mass Ave,BOSTON,MA,2115,42.342673,-71.084239,4.5,7,0,"{'NoiseLevel': ''quiet'', 'GoodForKids': 'True...","Creperies, Restaurants, Food, Coffee & Tea, Br...","{'Tuesday': '8:0-20:0', 'Wednesday': '8:0-20:0...",
4,jx91IMdGOmLOo8h_F9z39g,Cleary's Restaurant & Spirits,12429 NE Glisan St,PORTLAND,OR,97230,45.526473,-122.535323,3.5,19,1,"{'RestaurantsGoodForGroups': 'True', 'Alcohol'...","Nightlife, Sandwiches, Seafood, Restaurants","{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ...",


In [93]:
restaurant_df.iloc[2]['categories']

'Sandwiches, Food, Restaurants, Breakfast & Brunch, Seafood, Italian, Beer, Wine & Spirits, Cocktail Bars, Gluten-Free, Nightlife, Bars, Salad'