# About
This notebook wrangles the `categories` column in `yelp_team7_dataset_restaurant.json`. It creates one new column per category type:
* `cuisine_type`
* `food_type`
* `atmosphere_type`
* `dietary_type`
* `hi_transmission_risk`

These columns, together with `business_id`, are saved to a separate file, `yelp_team7_dataset_category.json`. Restaurants with missing values in all of these columns are excluded from that file.

In [1]:
import json
import pandas as pd
import preprocess



# Load restaurant dataset

In [2]:
# Read the team's restaurant dataset (JSON) into a DataFrame; the file is
# resolved relative to the notebook's working directory.
restaurant_df = pd.read_json('yelp_team7_dataset_restaurant.json')

In [3]:
# (rows, columns) of the dataset as loaded, before any category columns are added
restaurant_df.shape

(14970, 14)

# Create new columns of category types

See `categories_mapping.xls` for the detailed steps used to generate the category mapping files that this section relies on.

In [4]:
# Each entry is a [mapping CSV file, new column name] pair; the pairs are
# handed one by one to preprocess.gen_category_column.
categories = [
    [
        'categories_mapping_cuisine_type.csv',
        'cuisine_type',
    ],
    [
        'categories_mapping_food_type.csv',
        'food_type',
    ],
    [
        'categories_mapping_atmosphere_type.csv',
        'atmosphere_type',
    ],
    [
        'categories_mapping_dietary_type.csv',
        'dietary_type',
    ],
    [
        'categories_mapping_transmission_risk.csv',
        'hi_transmission_risk',
    ],
]

In [5]:
# Map the `categories` column into the five new category-type columns,
# one mapping file per column. gen_category_column is called only for its
# effect on restaurant_df (its return value is not used here).
for mapping_file, column_name in categories:
    preprocess.gen_category_column(restaurant_df, mapping_file, column_name)

In [6]:
# Re-check the shape: the row count should be unchanged and there should be
# one additional column per category type.
restaurant_df.shape

(14970, 19)

In [7]:
# Find observations that are missing a value in every category-type column.
# One boolean mask per column: True where that column is NaN.
missing_cuisine_type = restaurant_df['cuisine_type'].isna()
missing_food_type = restaurant_df['food_type'].isna()
missing_atmosphere_type = restaurant_df['atmosphere_type'].isna()
missing_dietary_type = restaurant_df['dietary_type'].isna()
missing_transmission_risk = restaurant_df['hi_transmission_risk'].isna()

# A row counts as uncategorised only when all five masks are True for it.
missing_all = (
    missing_cuisine_type
    & missing_food_type
    & missing_atmosphere_type
    & missing_dietary_type
    & missing_transmission_risk
)

# Rows with no category type at all, kept for inspection.
no_category_types = restaurant_df.loc[missing_all]

In [8]:
# Peek at the first few restaurants that matched no category type
no_category_types.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,cuisine_type,food_type,atmosphere_type,dietary_type,hi_transmission_risk
1018,vzyplzkOTTBAgjVJtBA5Qg,Northcreek Cafe,3715 Northside Pkwy NW,ATLANTA,GA,30327,33.857038,-84.429692,4.5,6,1,"{'Alcohol': 'u'none'', 'HasTV': 'True', 'Busin...",Restaurants,,,,,,
1115,kJRZuKXMY3pqfunLCzEG7Q,Au Bon Pain,75 Federal St,BOSTON,MA,2110,42.355071,-71.056717,3.0,6,0,"{'RestaurantsTakeOut': 'True', 'GoodForKids': ...",Restaurants,,,,,,
1360,EXOsmAB1s71WePlQk0WZrA,Linwood Grill & BBQ Restaurant,69 Kilmarnock St,BOSTON,MA,2215,42.342541,-71.099522,2.5,14,0,"{'RestaurantsAttire': ''casual'', 'BusinessPar...",Restaurants,,,,,,
1450,OC2bWcsJqhBHKowi8smMgw,Tiger Lily,8 Westland Ave,BOSTON,MA,2115,42.343153,-71.086494,2.5,28,0,"{'RestaurantsAttire': ''casual'', 'Restaurants...",Restaurants,,,,,,
1857,XVjsDe9cncy0s-K5kb6qAQ,City Grill,50 Hurt Plz SE Ste 200,ATLANTA,GA,30303,33.754129,-84.38721,3.5,7,0,"{'BusinessAcceptsCreditCards': 'True', 'Restau...",Restaurants,,,,,,


In [9]:
# Tally the raw `categories` strings of the uncategorised restaurants to see
# which values were not covered by any mapping
no_category_types.categories.value_counts()

Restaurants                                       77
Restaurants, Comfort Food                          1
Specialty Food, Restaurants, Food, Ethnic Food     1
Name: categories, dtype: int64

# Clean and export

In [10]:
# keep only restaurants that have at least one category type
# (i.e., remove restaurants that have missing values in all category types)
columns = ['business_id', 'cuisine_type', 'food_type', 'atmosphere_type', 'dietary_type', 'hi_transmission_risk']

# Filter rows and pick columns in a single .loc call instead of chained
# indexing: the result is the same, but no intermediate frame holding every
# column of restaurant_df is built first.
categories_df = restaurant_df.loc[~missing_all, columns]

In [11]:
# Rows remaining after dropping uncategorised restaurants, and the number of
# exported columns
categories_df.shape

(14891, 6)

In [12]:
# Preview the frame that is about to be exported
categories_df

Unnamed: 0,business_id,cuisine_type,food_type,atmosphere_type,dietary_type,hi_transmission_risk
0,tCbdrRPZA0oiIYSmHG3J0w,,Salad,Delis,Vegetarian,
1,ufCxltuh56FF4-ZFZ6cVhg,American,Bakeries,,,
2,jGennaZUr2MsJyRhijNBfA,Italian,Sandwiches & Wraps,Bars related,Gluten-Free,Bars related
3,iPD8BBvea6YldQZPHzVrSQ,,Desserts,,,
4,jx91IMdGOmLOo8h_F9z39g,,Sandwiches & Wraps,,,Nightlife
...,...,...,...,...,...,...
14965,m5eUPVD0Hu39Ff-Uqe-FLA,Italian,,,,
14966,87f7kR7nTz8WHnmtLM_S6w,Japanese,,Bars related,,Bars related
14967,jYgqSazE0gUyI7qq086Dzw,American,Steakhouses,,,Event Planning & Services
14968,r5Uag1JqYjr2nbxQCVqm8A,Vietnamese,Noodles related,,Vegetarian,


In [13]:
# Write business_id and the category-type columns of the filtered
# restaurants to a JSON file next to the notebook.
categories_df.to_json('yelp_team7_dataset_category.json')