# About
This notebook cleans the `categories` column in `yelp_team7_dataset_restaurant.json`.

In [1]:
import json
import pandas as pd

# Load restaurant dataset

In [2]:
restaurant_df = pd.read_json('yelp_team7_dataset_restaurant.json')

In [3]:
restaurant_df

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,PORTLAND,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ..."
1,ufCxltuh56FF4-ZFZ6cVhg,Sister Honey's,247 E Michigan St,ORLANDO,FL,32806,28.513265,-81.374707,4.5,135,1,"{'BusinessParking': '{'garage': False, 'street...","Restaurants, American (New), Bakeries, Dessert...","{'Tuesday': '11:0-18:0', 'Wednesday': '11:0-18..."
2,jGennaZUr2MsJyRhijNBfA,Legal Sea Foods,1 Harborside Dr,BOSTON,MA,02128,42.363442,-71.025781,3.5,856,1,"{'NoiseLevel': 'u'average'', 'BikeParking': 'F...","Sandwiches, Food, Restaurants, Breakfast & Bru...","{'Monday': '6:0-21:0', 'Tuesday': '6:0-21:0', ..."
3,iPD8BBvea6YldQZPHzVrSQ,Espresso Minute,334 Mass Ave,BOSTON,MA,02115,42.342673,-71.084239,4.5,7,0,"{'NoiseLevel': ''quiet'', 'GoodForKids': 'True...","Creperies, Restaurants, Food, Coffee & Tea, Br...","{'Tuesday': '8:0-20:0', 'Wednesday': '8:0-20:0..."
4,jx91IMdGOmLOo8h_F9z39g,Cleary's Restaurant & Spirits,12429 NE Glisan St,PORTLAND,OR,97230,45.526473,-122.535323,3.5,19,1,"{'RestaurantsGoodForGroups': 'True', 'Alcohol'...","Nightlife, Sandwiches, Seafood, Restaurants","{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14965,m5eUPVD0Hu39Ff-Uqe-FLA,The Italian Joint,3145 SE Hawthorne Blvd,PORTLAND,OR,97203,45.512196,-122.632865,3.5,20,0,"{'RestaurantsGoodForGroups': 'False', 'GoodFor...","Italian, Restaurants",
14966,87f7kR7nTz8WHnmtLM_S6w,O Ya,9 East St Pl,BOSTON,MA,02111,42.351408,-71.056867,4.5,737,1,"{'RestaurantsPriceRange2': '4', 'RestaurantsGo...","Japanese, Sushi Bars, Restaurants","{'Monday': '0:0-0:0', 'Tuesday': '17:0-21:30',..."
14967,jYgqSazE0gUyI7qq086Dzw,Chart House,5700 SW Terwilliger,PORTLAND,OR,97239,45.483154,-122.682748,3.5,457,1,"{'BusinessAcceptsCreditCards': 'True', 'Outdoo...","Event Planning & Services, Steakhouses, Seafoo...","{'Tuesday': '16:0-21:0', 'Wednesday': '16:0-21..."
14968,r5Uag1JqYjr2nbxQCVqm8A,Saigon Noodle & Grill,101 N Bumby Ave,ORLANDO,FL,32803,28.544430,-81.351606,4.5,437,1,"{'Alcohol': 'u'beer_and_wine'', 'RestaurantsAt...","Restaurants, Noodles, Salad, Vietnamese, Veget...","{'Monday': '0:0-0:0', 'Tuesday': '10:30-21:30'..."


In [4]:
restaurant_df.shape

(14970, 14)

# Identify categories

In [5]:
categories_df = restaurant_df[['categories']]
categories_df

Unnamed: 0,categories
0,"Salad, Soup, Sandwiches, Delis, Restaurants, C..."
1,"Restaurants, American (New), Bakeries, Dessert..."
2,"Sandwiches, Food, Restaurants, Breakfast & Bru..."
3,"Creperies, Restaurants, Food, Coffee & Tea, Br..."
4,"Nightlife, Sandwiches, Seafood, Restaurants"
...,...
14965,"Italian, Restaurants"
14966,"Japanese, Sushi Bars, Restaurants"
14967,"Event Planning & Services, Steakhouses, Seafoo..."
14968,"Restaurants, Noodles, Salad, Vietnamese, Veget..."


In [6]:
# Count how many restaurants list each category.
# TODO: turn this into a function.

categories_dict = {}

for categories_text in restaurant_df['categories']:
    # `categories` is a single comma-separated string per restaurant
    for raw_name in categories_text.split(','):
        name = raw_name.strip()
        categories_dict[name] = categories_dict.get(name, 0) + 1

In [7]:
# create a data frame
categories_agg = pd.DataFrame(list(categories_dict.items()))
categories_agg.columns = ['category', 'number_of_restaurants']
categories_agg.head()

Unnamed: 0,category,number_of_restaurants
0,Salad,882
1,Soup,317
2,Sandwiches,2102
3,Delis,425
4,Restaurants,14970


In [8]:
# sort and export
categories_agg_sorted = categories_agg.sort_values(by = 'number_of_restaurants', ascending = False)
categories_agg_sorted.to_csv('categories_agg.csv', index = False)

# Map categories: cuisine type

In [6]:
# load mapping csv for cuisine type
mapping_cuisine_df = pd.read_csv('categories_mapping_cuisine_type.csv')

In [7]:
mapping_cuisine_df.head()

Unnamed: 0,category,cuisine type
0,American (Traditional),American
1,American (New),American
2,Mexican,Mexican
3,Italian,Italian
4,Chinese,Chinese


In [8]:
mapping_cuisine_df.shape

(90, 2)

In [16]:
mapping_cuisine_df['category']

0     American (Traditional)
1             American (New)
2                    Mexican
3                    Italian
4                    Chinese
               ...          
85                   Iberian
86                Nicaraguan
87                  Scottish
88                  Sicilian
89                  Georgian
Name: category, Length: 90, dtype: object

In [3]:
def gen_category_dict(df, category_type):
    """Build a lookup from raw category name to its mapped value.

    df: a mapping data frame with a 'category' column and a column
        named by `category_type`
    category_type: a string, the name of the column holding the mapped values

    Return a dict keyed by the values of the 'category' column.
    Raises KeyError if either column is missing.
    """
    indexed = df.set_index('category')
    return indexed[category_type].to_dict()

In [11]:
# create dictionary of mapping
cuisine_dict = gen_category_dict(mapping_cuisine_df, 'cuisine_type')

In [68]:
# create dictionary of mapping
#cuisine_dict = mapping_cuisine_df.set_index('category').to_dict()['cuisine type']

In [12]:
cuisine_dict['American (New)']

'American'

In [13]:
# create a list of categories that are considered as cuisine types
cuisine_list = list(mapping_cuisine_df['category'])
cuisine_list[0:9]

['American (Traditional)',
 'American (New)',
 'Mexican',
 'Italian',
 'Chinese',
 'Japanese',
 'Asian Fusion',
 'Mediterranean',
 'Thai']

In [18]:
restaurant_df['categories']

0        Salad, Soup, Sandwiches, Delis, Restaurants, C...
1        Restaurants, American (New), Bakeries, Dessert...
2        Sandwiches, Food, Restaurants, Breakfast & Bru...
3        Creperies, Restaurants, Food, Coffee & Tea, Br...
4              Nightlife, Sandwiches, Seafood, Restaurants
                               ...                        
14965                                 Italian, Restaurants
14966                    Japanese, Sushi Bars, Restaurants
14967    Event Planning & Services, Steakhouses, Seafoo...
14968    Restaurants, Noodles, Salad, Vietnamese, Veget...
14969                 Restaurants, Mexican, Latin American
Name: categories, Length: 14970, dtype: object

In [14]:
case_1 = restaurant_df['categories'].iloc[1]
case_1

'Restaurants, American (New), Bakeries, Desserts, Food, Cupcakes'

In [24]:
# sanity check: a known cuisine category is present in the reference list
'American (New)' in cuisine_list

True

In [4]:
def identify_category_type(item, reference_list):
    """Tell whether a business' categories include any reference category.

    item: a string of comma-separated category names (one restaurant's
          `categories` value); a missing value (None / NaN) is accepted
    reference_list: a collection of category names to look for

    Return True if at least one category in `item`, after stripping
    surrounding whitespace, is in `reference_list`; otherwise False.
    A non-string `item` (missing value) yields False.
    """
    # Missing categories have no .split(); treat them as "no match"
    # instead of raising inside DataFrame.apply.
    if not isinstance(item, str):
        return False
    return any(part.strip() in reference_list for part in item.split(','))

In [17]:
identify_category_type(case_1, cuisine_list)

item: Restaurants ; status: False
item: American (New) ; status: True


True

In [18]:
case_2 = restaurant_df['categories'].iloc[0]
case_2

'Salad, Soup, Sandwiches, Delis, Restaurants, Cafes, Vegetarian'

In [19]:
identify_category_type(case_2, cuisine_list)

item: Salad ; status: False
item: Soup ; status: False
item: Sandwiches ; status: False
item: Delis ; status: False
item: Restaurants ; status: False
item: Cafes ; status: False
item: Vegetarian ; status: False


False

In [84]:
#map_category_type(case_2, reference_list = cuisine_list, reference_dict = cuisine_dict)

In [20]:
case_3 = restaurant_df['categories'].iloc[8]
case_3

'Coffee & Tea, Tex-Mex, Restaurants, Mexican, Food'

In [21]:
identify_category_type(case_3, cuisine_list)

item: Coffee & Tea ; status: False
item: Tex-Mex ; status: True


True

In [24]:
# boolean mask: True for businesses whose categories include at least one cuisine-type category
condition = restaurant_df['categories'].apply(identify_category_type, args=(cuisine_list,))

In [25]:
condition.value_counts()

True     10637
False     4333
Name: categories, dtype: int64

In [26]:
cuisine_df = restaurant_df[condition][['business_id', 'categories']]

In [27]:
cuisine_df.shape

(10637, 2)

In [28]:
cuisine_df.head()

Unnamed: 0,business_id,categories
1,ufCxltuh56FF4-ZFZ6cVhg,"Restaurants, American (New), Bakeries, Dessert..."
2,jGennaZUr2MsJyRhijNBfA,"Sandwiches, Food, Restaurants, Breakfast & Bru..."
6,w4qVflIAbdklzG3mnKmQsg,"Italian, Restaurants"
8,z-0oY7VxQMQw3JHvdPejrA,"Coffee & Tea, Tex-Mex, Restaurants, Mexican, Food"
12,TbZDLpBOl-EbO2LfMySrEg,"Restaurants, Chinese"


In [57]:
cuisine_df['categories']

1        Restaurants, American (New), Bakeries, Dessert...
2        Sandwiches, Food, Restaurants, Breakfast & Bru...
6                                     Italian, Restaurants
8        Coffee & Tea, Tex-Mex, Restaurants, Mexican, Food
12                                    Restaurants, Chinese
                               ...                        
14965                                 Italian, Restaurants
14966                    Japanese, Sushi Bars, Restaurants
14967    Event Planning & Services, Steakhouses, Seafoo...
14968    Restaurants, Noodles, Salad, Vietnamese, Veget...
14969                 Restaurants, Mexican, Latin American
Name: categories, Length: 10637, dtype: object

In [29]:
case_1 = cuisine_df['categories'].iloc[0]
case_1

'Restaurants, American (New), Bakeries, Desserts, Food, Cupcakes'

In [5]:
def map_category_type(item, reference_list, reference_dict):
    """Map a business' categories to a single value of a category type.

    item: a string of comma-separated category names (one restaurant's
          `categories` value); a missing value (None / NaN) is accepted
    reference_list: a collection of category names that belong to the type
    reference_dict: a dict from category name to its mapped value

    Return the mapped value of the first category in `item` (left to
    right, whitespace stripped) that is in `reference_list`, or None when
    no category matches or `item` is not a string.
    Raises KeyError if a matched category is absent from `reference_dict`.
    """
    # Missing categories have no .split(); treat them as "nothing to map"
    # instead of raising inside DataFrame.apply.
    if not isinstance(item, str):
        return None
    for raw_name in item.split(','):
        name = raw_name.strip()
        if name in reference_list:
            return reference_dict[name]
    return None

In [31]:
map_category_type(case_1, reference_list = cuisine_list, reference_dict = cuisine_dict)

'American'

In [32]:
case_2 = cuisine_df['categories'].iloc[7]
case_2

'Restaurants, Southern'

In [33]:
map_category_type(case_2, reference_list = cuisine_list, reference_dict = cuisine_dict)

'American'

In [34]:
case_3 = cuisine_df['categories'].iloc[9]
case_3

'Restaurants, Nightlife, Mexican, Bars, Sports Bars, Tex-Mex'

In [35]:
map_category_type(case_3, reference_list = cuisine_list, reference_dict = cuisine_dict)

'Mexican'

In [36]:
cuisine_mapped = restaurant_df.categories.apply(map_category_type, args = (cuisine_list, cuisine_dict))
cuisine_mapped

0              None
1          American
2           Italian
3              None
4              None
            ...    
14965       Italian
14966      Japanese
14967      American
14968    Vietnamese
14969       Mexican
Name: categories, Length: 14970, dtype: object

In [37]:
# approach 1: map directly to the restaurant df
restaurant_df.loc[condition, 'cuisine_type'] = cuisine_mapped

In [38]:
restaurant_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,cuisine_type
0,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,PORTLAND,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...",
1,ufCxltuh56FF4-ZFZ6cVhg,Sister Honey's,247 E Michigan St,ORLANDO,FL,32806,28.513265,-81.374707,4.5,135,1,"{'BusinessParking': '{'garage': False, 'street...","Restaurants, American (New), Bakeries, Dessert...","{'Tuesday': '11:0-18:0', 'Wednesday': '11:0-18...",American
2,jGennaZUr2MsJyRhijNBfA,Legal Sea Foods,1 Harborside Dr,BOSTON,MA,2128,42.363442,-71.025781,3.5,856,1,"{'NoiseLevel': 'u'average'', 'BikeParking': 'F...","Sandwiches, Food, Restaurants, Breakfast & Bru...","{'Monday': '6:0-21:0', 'Tuesday': '6:0-21:0', ...",Italian
3,iPD8BBvea6YldQZPHzVrSQ,Espresso Minute,334 Mass Ave,BOSTON,MA,2115,42.342673,-71.084239,4.5,7,0,"{'NoiseLevel': ''quiet'', 'GoodForKids': 'True...","Creperies, Restaurants, Food, Coffee & Tea, Br...","{'Tuesday': '8:0-20:0', 'Wednesday': '8:0-20:0...",
4,jx91IMdGOmLOo8h_F9z39g,Cleary's Restaurant & Spirits,12429 NE Glisan St,PORTLAND,OR,97230,45.526473,-122.535323,3.5,19,1,"{'RestaurantsGoodForGroups': 'True', 'Alcohol'...","Nightlife, Sandwiches, Seafood, Restaurants","{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ...",


In [39]:
restaurant_df.iloc[2]['categories']

'Sandwiches, Food, Restaurants, Breakfast & Brunch, Seafood, Italian, Beer, Wine & Spirits, Cocktail Bars, Gluten-Free, Nightlife, Bars, Salad'

## Compile steps

In [6]:
# create a function to compile the steps
def gen_category_column(restaurant_df, mapping_csv, category_type):
    """Add a mapped category column to a restaurant data frame.

       restaurant_df: a restaurant data frame from Yelp business dataset;
                      it must have a `categories` column
       mapping_csv: a string, the name of the csv file that maps categories;
                    it must have a 'category' column and a column named
                    by `category_type`
       category_type: a string, for example 'cuisine_type', 'atmosphere_type';
                      also used as the name of the new column

       Return the same data frame object with the new column set.
       Note: `restaurant_df` is modified in place. Only rows whose
       categories match the mapping are assigned a value.
    """
    # read the mapping file
    mapping_df = pd.read_csv(mapping_csv)

    # lookup: raw category -> mapped value
    category_dict = gen_category_dict(mapping_df, category_type)

    # raw categories that belong to this category type
    category_list = list(mapping_df['category'])

    categories_col = restaurant_df.categories

    # boolean mask: rows whose categories include this category type
    condition = categories_col.apply(identify_category_type, args=(category_list,))

    # mapped value per row; assign it only where the mask is True
    category_mapped = categories_col.apply(map_category_type, args=(category_list, category_dict))
    restaurant_df.loc[condition, category_type] = category_mapped

    return restaurant_df
    
    

In [7]:
updated_restaurant_df = gen_category_column(restaurant_df, 'categories_mapping_cuisine_type.csv', 'cuisine_type')

In [8]:
updated_restaurant_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,cuisine_type
0,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,PORTLAND,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...",
1,ufCxltuh56FF4-ZFZ6cVhg,Sister Honey's,247 E Michigan St,ORLANDO,FL,32806,28.513265,-81.374707,4.5,135,1,"{'BusinessParking': '{'garage': False, 'street...","Restaurants, American (New), Bakeries, Dessert...","{'Tuesday': '11:0-18:0', 'Wednesday': '11:0-18...",American
2,jGennaZUr2MsJyRhijNBfA,Legal Sea Foods,1 Harborside Dr,BOSTON,MA,2128,42.363442,-71.025781,3.5,856,1,"{'NoiseLevel': 'u'average'', 'BikeParking': 'F...","Sandwiches, Food, Restaurants, Breakfast & Bru...","{'Monday': '6:0-21:0', 'Tuesday': '6:0-21:0', ...",Italian
3,iPD8BBvea6YldQZPHzVrSQ,Espresso Minute,334 Mass Ave,BOSTON,MA,2115,42.342673,-71.084239,4.5,7,0,"{'NoiseLevel': ''quiet'', 'GoodForKids': 'True...","Creperies, Restaurants, Food, Coffee & Tea, Br...","{'Tuesday': '8:0-20:0', 'Wednesday': '8:0-20:0...",
4,jx91IMdGOmLOo8h_F9z39g,Cleary's Restaurant & Spirits,12429 NE Glisan St,PORTLAND,OR,97230,45.526473,-122.535323,3.5,19,1,"{'RestaurantsGoodForGroups': 'True', 'Alcohol'...","Nightlife, Sandwiches, Seafood, Restaurants","{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ...",


In [9]:
updated_restaurant_df.shape

(14970, 15)

# Loop over all category types

In [7]:
categories = [['categories_mapping_cuisine_type.csv', 'cuisine_type'],
              ['categories_mapping_food_type.csv', 'food_type'],
              ['categories_mapping_atmosphere_type.csv', 'atmosphere_type'],
              ['categories_mapping_dietary_type.csv', 'dietary_type']
]

In [8]:
# add one column per category type; gen_category_column updates restaurant_df in place
for mapping_csv, category_type in categories:
    gen_category_column(restaurant_df, mapping_csv, category_type)

In [9]:
restaurant_df.shape

(14970, 18)

In [10]:
restaurant_df

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,cuisine_type,food_type,atmosphere_type,dietary_type
0,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,PORTLAND,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...",,Salad,Delis,Vegetarian
1,ufCxltuh56FF4-ZFZ6cVhg,Sister Honey's,247 E Michigan St,ORLANDO,FL,32806,28.513265,-81.374707,4.5,135,1,"{'BusinessParking': '{'garage': False, 'street...","Restaurants, American (New), Bakeries, Dessert...","{'Tuesday': '11:0-18:0', 'Wednesday': '11:0-18...",American,Bakeries,,
2,jGennaZUr2MsJyRhijNBfA,Legal Sea Foods,1 Harborside Dr,BOSTON,MA,02128,42.363442,-71.025781,3.5,856,1,"{'NoiseLevel': 'u'average'', 'BikeParking': 'F...","Sandwiches, Food, Restaurants, Breakfast & Bru...","{'Monday': '6:0-21:0', 'Tuesday': '6:0-21:0', ...",Italian,Sandwiches & Wraps,Bars related,Gluten-Free
3,iPD8BBvea6YldQZPHzVrSQ,Espresso Minute,334 Mass Ave,BOSTON,MA,02115,42.342673,-71.084239,4.5,7,0,"{'NoiseLevel': ''quiet'', 'GoodForKids': 'True...","Creperies, Restaurants, Food, Coffee & Tea, Br...","{'Tuesday': '8:0-20:0', 'Wednesday': '8:0-20:0...",,Desserts,,
4,jx91IMdGOmLOo8h_F9z39g,Cleary's Restaurant & Spirits,12429 NE Glisan St,PORTLAND,OR,97230,45.526473,-122.535323,3.5,19,1,"{'RestaurantsGoodForGroups': 'True', 'Alcohol'...","Nightlife, Sandwiches, Seafood, Restaurants","{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ...",,Sandwiches & Wraps,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14965,m5eUPVD0Hu39Ff-Uqe-FLA,The Italian Joint,3145 SE Hawthorne Blvd,PORTLAND,OR,97203,45.512196,-122.632865,3.5,20,0,"{'RestaurantsGoodForGroups': 'False', 'GoodFor...","Italian, Restaurants",,Italian,,,
14966,87f7kR7nTz8WHnmtLM_S6w,O Ya,9 East St Pl,BOSTON,MA,02111,42.351408,-71.056867,4.5,737,1,"{'RestaurantsPriceRange2': '4', 'RestaurantsGo...","Japanese, Sushi Bars, Restaurants","{'Monday': '0:0-0:0', 'Tuesday': '17:0-21:30',...",Japanese,,Bars related,
14967,jYgqSazE0gUyI7qq086Dzw,Chart House,5700 SW Terwilliger,PORTLAND,OR,97239,45.483154,-122.682748,3.5,457,1,"{'BusinessAcceptsCreditCards': 'True', 'Outdoo...","Event Planning & Services, Steakhouses, Seafoo...","{'Tuesday': '16:0-21:0', 'Wednesday': '16:0-21...",American,Steakhouses,,
14968,r5Uag1JqYjr2nbxQCVqm8A,Saigon Noodle & Grill,101 N Bumby Ave,ORLANDO,FL,32803,28.544430,-81.351606,4.5,437,1,"{'Alcohol': 'u'beer_and_wine'', 'RestaurantsAt...","Restaurants, Noodles, Salad, Vietnamese, Veget...","{'Monday': '0:0-0:0', 'Tuesday': '10:30-21:30'...",Vietnamese,Noodles related,,Vegetarian


In [11]:
restaurant_df.cuisine_type.value_counts(dropna = False)

NaN                     4333
American                3922
Italian                 1094
Mexican                 1084
Other Asian              726
Chinese                  703
Japanese                 609
Other Mediterranean      428
Other Latin American     386
Other European           371
Thai                     350
Vietnamese               289
Caribean                 223
Tex-Mex                  219
Middle Eastern           139
African                   89
Other                      5
Name: cuisine_type, dtype: int64

In [12]:
restaurant_df.food_type.value_counts(dropna = False)

NaN                       5860
Pizza                     1180
Breakfast & Brunch        1144
Sandwiches & Wraps        1084
Coffee & Tea               757
Seafood                    718
Other food                 608
Burgers                    608
Desserts                   477
Bakeries                   448
Salad                      385
Barbeque                   379
Chicken Wings              306
Steakhouses                280
Alcoholic drinks           266
Noodles related            191
Juice Bars & Smoothies     140
Soup                       139
Name: food_type, dtype: int64

In [13]:
restaurant_df.atmosphere_type.value_counts(dropna = False)

NaN                                9222
Bars related                       3648
Cafes related                       845
Delis                               371
Diners and cafeteria related        319
Pubs related                        282
Buffets                             186
Brewery and beer garden related      97
Name: atmosphere_type, dtype: int64

In [14]:
restaurant_df.dietary_type.value_counts(dropna = False)

NaN            13791
Vegetarian       422
Vegan            316
Gluten-Free      290
Halal            135
Kosher            16
Name: dietary_type, dtype: int64