# About

This notebook is to clean and wrangle `yelp_team7_dataset_restaurant.json`

In [1]:
import json
import pandas as pd

# Load the restaurant dataset

In [2]:
# Read the restaurant records from the JSON file into a DataFrame.
# NOTE(review): some postal codes display without a leading zero (e.g. 2215 for
# a Boston address) while others keep it (02228) — confirm whether that comes
# from the source file or from read_json's type inference.
restaurant_df = pd.read_json('yelp_team7_dataset_restaurant.json')

In [3]:
# (number of rows, number of columns) of the loaded data
restaurant_df.shape

(20693, 14)

In [4]:
# Preview the first rows to see what each column holds
restaurant_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ..."
1,ufCxltuh56FF4-ZFZ6cVhg,Sister Honey's,247 E Michigan St,Orlando,FL,32806,28.513265,-81.374707,4.5,135,1,"{'BusinessParking': '{'garage': False, 'street...","Restaurants, American (New), Bakeries, Dessert...","{'Tuesday': '11:0-18:0', 'Wednesday': '11:0-18..."
2,hcRxdDg7DYryCxCoI8ySQA,Longwood Galleria,340-350 Longwood Ave,Boston,MA,2215,42.338544,-71.106842,2.5,24,1,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Restaurants, Shopping, Shopping Centers","{'Monday': '6:30-22:0', 'Tuesday': '6:30-22:0'..."
3,jGennaZUr2MsJyRhijNBfA,Legal Sea Foods,1 Harborside Dr,Boston,MA,2128,42.363442,-71.025781,3.5,856,1,"{'NoiseLevel': 'u'average'', 'BikeParking': 'F...","Sandwiches, Food, Restaurants, Breakfast & Bru...","{'Monday': '6:0-21:0', 'Tuesday': '6:0-21:0', ..."
4,iPD8BBvea6YldQZPHzVrSQ,Espresso Minute,334 Mass Ave,Boston,MA,2115,42.342673,-71.084239,4.5,7,0,"{'NoiseLevel': ''quiet'', 'GoodForKids': 'True...","Creperies, Restaurants, Food, Coffee & Tea, Br...","{'Tuesday': '8:0-20:0', 'Wednesday': '8:0-20:0..."


In [5]:
# Column dtypes and non-null counts (reveals which columns have missing values)
restaurant_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20693 entries, 0 to 20692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   business_id   20693 non-null  object 
 1   name          20693 non-null  object 
 2   address       20693 non-null  object 
 3   city          20693 non-null  object 
 4   state         20693 non-null  object 
 5   postal_code   20693 non-null  object 
 6   latitude      20693 non-null  float64
 7   longitude     20693 non-null  float64
 8   stars         20693 non-null  float64
 9   review_count  20693 non-null  int64  
 10  is_open       20693 non-null  int64  
 11  attributes    20591 non-null  object 
 12  categories    20693 non-null  object 
 13  hours         17500 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 2.4+ MB


# Wrangle *categories* column

In [6]:
# Work on a one-column frame holding only the comma-separated category text.
categories_df = restaurant_df.loc[:, ['categories']]
categories_df

Unnamed: 0,categories
0,"Salad, Soup, Sandwiches, Delis, Restaurants, C..."
1,"Restaurants, American (New), Bakeries, Dessert..."
2,"Restaurants, Shopping, Shopping Centers"
3,"Sandwiches, Food, Restaurants, Breakfast & Bru..."
4,"Creperies, Restaurants, Food, Coffee & Tea, Br..."
...,...
20688,"Italian, Restaurants"
20689,"Japanese, Sushi Bars, Restaurants"
20690,"Event Planning & Services, Steakhouses, Seafoo..."
20691,"Restaurants, Noodles, Salad, Vietnamese, Veget..."


In [7]:
# Full, untruncated category text of the first row
categories_df['categories'].iloc[0]

'Salad, Soup, Sandwiches, Delis, Restaurants, Cafes, Vegetarian'

In [8]:
# Full, untruncated category text of the second row
categories_df['categories'].iloc[1]

'Restaurants, American (New), Bakeries, Desserts, Food, Cupcakes'

In [16]:
# Full, untruncated category text of the third row
categories_df['categories'].iloc[2]

'Restaurants, Shopping, Shopping Centers'

In [17]:
# Full, untruncated category text of the fourth row
categories_df['categories'].iloc[3]

'Sandwiches, Food, Restaurants, Breakfast & Brunch, Seafood, Italian, Beer, Wine & Spirits, Cocktail Bars, Gluten-Free, Nightlife, Bars, Salad'

In [9]:
# Small sample (first 11 rows) used to pilot the parsing logic below.
tmp_df = categories_df.head(11)
tmp_df

Unnamed: 0,categories
0,"Salad, Soup, Sandwiches, Delis, Restaurants, C..."
1,"Restaurants, American (New), Bakeries, Dessert..."
2,"Restaurants, Shopping, Shopping Centers"
3,"Sandwiches, Food, Restaurants, Breakfast & Bru..."
4,"Creperies, Restaurants, Food, Coffee & Tea, Br..."
5,"Nightlife, Sandwiches, Seafood, Restaurants"
6,"Food Trucks, Restaurants, Specialty Food, Food..."
7,"Bars, Nightlife, Cocktail Bars, Seafood, Resta..."
8,"Vegetarian, Vegan, Sandwiches, Soup, American ..."
9,"Italian, Restaurants"


In [10]:
# The sample's category text as a Series
tmp_df['categories']

0     Salad, Soup, Sandwiches, Delis, Restaurants, C...
1     Restaurants, American (New), Bakeries, Dessert...
2               Restaurants, Shopping, Shopping Centers
3     Sandwiches, Food, Restaurants, Breakfast & Bru...
4     Creperies, Restaurants, Food, Coffee & Tea, Br...
5           Nightlife, Sandwiches, Seafood, Restaurants
6     Food Trucks, Restaurants, Specialty Food, Food...
7     Bars, Nightlife, Cocktail Bars, Seafood, Resta...
8     Vegetarian, Vegan, Sandwiches, Soup, American ...
9                                  Italian, Restaurants
10                      Fast Food, Restaurants, Burgers
Name: categories, dtype: object

## Create a full list of categories

### A mini pilot: make it a list

In [25]:
# Show each sample row's full category text next to its position.
for position, category_text in enumerate(tmp_df['categories']):
    print(position, category_text)

0 Salad, Soup, Sandwiches, Delis, Restaurants, Cafes, Vegetarian
1 Restaurants, American (New), Bakeries, Desserts, Food, Cupcakes
2 Restaurants, Shopping, Shopping Centers
3 Sandwiches, Food, Restaurants, Breakfast & Brunch, Seafood, Italian, Beer, Wine & Spirits, Cocktail Bars, Gluten-Free, Nightlife, Bars, Salad
4 Creperies, Restaurants, Food, Coffee & Tea, Breakfast & Brunch
5 Nightlife, Sandwiches, Seafood, Restaurants
6 Food Trucks, Restaurants, Specialty Food, Food, Mexican, Ethnic Food
7 Bars, Nightlife, Cocktail Bars, Seafood, Restaurants, Sushi Bars
8 Vegetarian, Vegan, Sandwiches, Soup, American (New), Fast Food, Restaurants, Wraps, American (Traditional)
9 Italian, Restaurants
10 Fast Food, Restaurants, Burgers


In [27]:
# Split each row on commas; note the pieces keep their leading space,
# which is why later cells strip them.
for category_text in tmp_df['categories']:
    print(category_text.split(','))

['Salad', ' Soup', ' Sandwiches', ' Delis', ' Restaurants', ' Cafes', ' Vegetarian']
['Restaurants', ' American (New)', ' Bakeries', ' Desserts', ' Food', ' Cupcakes']
['Restaurants', ' Shopping', ' Shopping Centers']
['Sandwiches', ' Food', ' Restaurants', ' Breakfast & Brunch', ' Seafood', ' Italian', ' Beer', ' Wine & Spirits', ' Cocktail Bars', ' Gluten-Free', ' Nightlife', ' Bars', ' Salad']
['Creperies', ' Restaurants', ' Food', ' Coffee & Tea', ' Breakfast & Brunch']
['Nightlife', ' Sandwiches', ' Seafood', ' Restaurants']
['Food Trucks', ' Restaurants', ' Specialty Food', ' Food', ' Mexican', ' Ethnic Food']
['Bars', ' Nightlife', ' Cocktail Bars', ' Seafood', ' Restaurants', ' Sushi Bars']
['Vegetarian', ' Vegan', ' Sandwiches', ' Soup', ' American (New)', ' Fast Food', ' Restaurants', ' Wraps', ' American (Traditional)']
['Italian', ' Restaurants']
['Fast Food', ' Restaurants', ' Burgers']


In [28]:
# Category text of the row labelled 0
tmp_df.loc[0, 'categories']

'Salad, Soup, Sandwiches, Delis, Restaurants, Cafes, Vegetarian'

In [31]:
# Substring test on the raw text: is 'Soup' absent from row 0's categories?
'Soup' not in tmp_df['categories'][0]

False

In [11]:
# Collect every distinct category label from the sample rows, in first-seen
# order. dict.fromkeys keeps insertion order and drops repeats.
tmp_categories_all = list(dict.fromkeys(
    raw_label.strip()
    for category_text in tmp_df['categories']
    for raw_label in category_text.split(',')
))

# Drop the generic 'Restaurants' label from the list.
tmp_categories_all.remove('Restaurants')

print(tmp_categories_all)

['Salad', 'Soup', 'Sandwiches', 'Delis', 'Cafes', 'Vegetarian', 'American (New)', 'Bakeries', 'Desserts', 'Food', 'Cupcakes', 'Shopping', 'Shopping Centers', 'Breakfast & Brunch', 'Seafood', 'Italian', 'Beer', 'Wine & Spirits', 'Cocktail Bars', 'Gluten-Free', 'Nightlife', 'Bars', 'Creperies', 'Coffee & Tea', 'Food Trucks', 'Specialty Food', 'Mexican', 'Ethnic Food', 'Sushi Bars', 'Vegan', 'Fast Food', 'Wraps', 'American (Traditional)', 'Burgers']


## mini pilot: make it a dictionary

In [19]:
# Pilot: count how often each category label occurs in the sample rows.
tmp_dict = {}

for category_text in tmp_df['categories']:
    for raw_label in category_text.split(','):
        label = raw_label.strip()  # pieces after the first carry a leading space
        tmp_dict[label] = tmp_dict.get(label, 0) + 1

print(tmp_dict)

{'Salad': 2, 'Soup': 2, 'Sandwiches': 4, 'Delis': 1, 'Restaurants': 11, 'Cafes': 1, 'Vegetarian': 2, 'American (New)': 2, 'Bakeries': 1, 'Desserts': 1, 'Food': 4, 'Cupcakes': 1, 'Shopping': 1, 'Shopping Centers': 1, 'Breakfast & Brunch': 2, 'Seafood': 3, 'Italian': 2, 'Beer': 1, 'Wine & Spirits': 1, 'Cocktail Bars': 2, 'Gluten-Free': 1, 'Nightlife': 3, 'Bars': 2, 'Creperies': 1, 'Coffee & Tea': 1, 'Food Trucks': 1, 'Specialty Food': 1, 'Mexican': 1, 'Ethnic Food': 1, 'Sushi Bars': 1, 'Vegan': 1, 'Fast Food': 2, 'Wraps': 1, 'American (Traditional)': 1, 'Burgers': 1}


scribble:
* 'Restaurants' for sanity check
* see which businesses tag themselves as 'Food', 'Shopping', 'Shopping Centers'
* consider removing businesses that tag themselves as 'Food Trucks'

### Apply to all restaurants

In [15]:
# Count how often each category label occurs across all restaurants.
# NOTE(review): splitting on ',' also splits a label that itself contains a
# comma (the pilot shows 'Beer' and 'Wine & Spirits' as separate entries) —
# confirm whether those should be counted as one category.
categories_dict = {}

for category_text in restaurant_df['categories']:
    for raw_label in category_text.split(','):
        label = raw_label.strip()
        categories_dict[label] = categories_dict.get(label, 0) + 1

In [23]:
# create a data frame
categories_agg = pd.DataFrame(list(categories_dict.items()))
categories_agg.columns = ['category', 'number_of_restaurants']
categories_agg.head()

Unnamed: 0,category,number_of_restaurants
0,Salad,1080
1,Soup,376
2,Sandwiches,3099
3,Delis,771
4,Restaurants,20693


In [25]:
# Summary statistics of the per-category counts
categories_agg['number_of_restaurants'].describe()

count      558.000000
mean       182.577061
std       1015.494437
min          1.000000
25%          1.000000
50%          6.000000
75%         42.750000
max      20693.000000
Name: number_of_restaurants, dtype: float64

In [27]:
# Categories at or below the median count (6, per describe()).
is_rare = categories_agg['number_of_restaurants'] <= 6
categories_agg[is_rare]

Unnamed: 0,category,number_of_restaurants
89,Mass Media,2
90,Print Media,1
95,Austrian,4
102,Nutritionists,6
104,Vitamins & Supplements,1
...,...,...
553,Drive-Thru Bars,1
554,Veterinarians,1
555,Tires,1
556,Trampoline Parks,1


In [30]:
# Most frequent categories first
categories_agg_sorted = categories_agg.sort_values('number_of_restaurants', ascending=False)
categories_agg_sorted

Unnamed: 0,category,number_of_restaurants
4,Restaurants,20693
10,Food,7378
21,Nightlife,4232
22,Bars,4030
2,Sandwiches,3099
...,...,...
452,Axe Throwing,1
454,Dumpster Rental,1
455,Junk Removal & Hauling,1
456,Boat Charters,1


In [32]:
# Write the sorted category counts to CSV, without the DataFrame index column
categories_agg_sorted.to_csv('categories_agg.csv', index = False)

## Check potential exclusions

In [66]:
# Restaurants whose category text mentions 'Pop-Up Restaurants' (first 50 at most)
is_popup = restaurant_df['categories'].str.contains('Pop-Up Restaurants')
restaurant_df.loc[is_popup, ['name', 'attributes', 'categories']].head(50)

Unnamed: 0,name,attributes,categories
1826,AIX,"{'BusinessParking': '{'garage': True, 'street'...","Restaurants, Nightlife, Pop-Up Restaurants, Am..."
2236,Meatballerz Restaurant,"{'GoodForMeal': '{'dessert': False, 'latenight...","Food Trucks, Food, Italian, Breakfast & Brunch..."
2872,Smoke & Donuts BBQ,"{'RestaurantsTakeOut': 'True', 'Ambience': '{'...","Pop-Up Restaurants, Food, Barbeque, Event Plan..."
2931,Hungry Heart PDX,"{'BikeParking': 'True', 'Caters': 'True', 'Amb...","Caterers, Ice Cream & Frozen Yogurt, Restauran..."
3958,Kamayan ATL,"{'RestaurantsTakeOut': 'False', 'RestaurantsRe...","Restaurants, Event Planning & Services, Catere..."
4889,Ok Yaki,"{'DogsAllowed': 'False', 'Alcohol': 'u'none'',...","Restaurants, Teppanyaki, Street Vendors, Japan..."
5313,Bangrak Thai Street Kitchen,"{'BusinessParking': '{'garage': False, 'street...","Pop-Up Restaurants, Restaurants, Thai"
6787,Oisa Ramen,"{'WiFi': 'u'free'', 'NoiseLevel': 'u'average''...","Japanese, Poke, Pop-Up Restaurants, Ramen, Sho..."
6924,a mano,"{'HappyHour': 'True', 'CoatCheck': 'False', 'B...","Breakfast & Brunch, Italian, Wine Bars, Pop-Up..."
7921,Hernandez Hospitality,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Specialty Food, Restaurants, Personal Chefs, F..."


# Wrangle *attributes* column

In [4]:
# Work on a one-column frame holding only the attributes.
attributes_df = restaurant_df.loc[:, ['attributes']]
attributes_df

Unnamed: 0,attributes
0,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt..."
1,"{'BusinessParking': '{'garage': False, 'street..."
2,"{'RestaurantsPriceRange2': '1', 'BusinessAccep..."
3,"{'NoiseLevel': 'u'average'', 'BikeParking': 'F..."
4,"{'NoiseLevel': ''quiet'', 'GoodForKids': 'True..."
...,...
20688,"{'RestaurantsGoodForGroups': 'False', 'GoodFor..."
20689,"{'RestaurantsPriceRange2': '4', 'RestaurantsGo..."
20690,"{'BusinessAcceptsCreditCards': 'True', 'Outdoo..."
20691,"{'Alcohol': 'u'beer_and_wine'', 'RestaurantsAt..."


In [5]:
# Full attributes value of the first row
attributes_df['attributes'].iloc[0]

{'RestaurantsTakeOut': 'True',
 'RestaurantsAttire': "u'casual'",
 'GoodForKids': 'True',
 'BikeParking': 'False',
 'OutdoorSeating': 'False',
 'Ambience': "{'romantic': False, 'intimate': False, 'touristy': False, 'hipster': False, 'divey': False, 'classy': False, 'trendy': False, 'upscale': False, 'casual': True}",
 'Caters': 'True',
 'RestaurantsReservations': 'False',
 'RestaurantsDelivery': 'False',
 'HasTV': 'False',
 'RestaurantsGoodForGroups': 'False',
 'BusinessAcceptsCreditCards': 'True',
 'NoiseLevel': "u'average'",
 'ByAppointmentOnly': 'False',
 'RestaurantsPriceRange2': '2',
 'WiFi': "u'free'",
 'BusinessParking': "{'garage': True, 'street': False, 'validated': False, 'lot': False, 'valet': False}",
 'Alcohol': "u'beer_and_wine'",
 'GoodForMeal': "{'dessert': False, 'latenight': False, 'lunch': True, 'dinner': False, 'brunch': False, 'breakfast': True}"}

In [6]:
# Full attributes value of the third row
attributes_df['attributes'].iloc[2]

{'RestaurantsPriceRange2': '1',
 'BusinessAcceptsCreditCards': 'True',
 'HasTV': 'False',
 'Ambience': "{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': True}",
 'WiFi': "'free'",
 'GoodForKids': 'True',
 'RestaurantsAttire': "u'casual'",
 'Caters': 'True',
 'BikeParking': 'True',
 'OutdoorSeating': 'False',
 'Alcohol': "'full_bar'",
 'RestaurantsGoodForGroups': 'True',
 'RestaurantsReservations': 'False',
 'GoodForMeal': "{'dessert': False, 'latenight': False, 'lunch': False, 'dinner': False, 'brunch': False, 'breakfast': False}",
 'RestaurantsDelivery': 'None',
 'BusinessParking': "{'garage': True, 'street': False, 'validated': False, 'lot': False, 'valet': False}",
 'RestaurantsTakeOut': 'None',
 'NoiseLevel': "u'average'"}

In [9]:
# Small sample (first 6 rows) used to pilot the attribute parsing below.
attr_tmp_df = attributes_df.head(6)
attr_tmp_df

Unnamed: 0,attributes
0,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt..."
1,"{'BusinessParking': '{'garage': False, 'street..."
2,"{'RestaurantsPriceRange2': '1', 'BusinessAccep..."
3,"{'NoiseLevel': 'u'average'', 'BikeParking': 'F..."
4,"{'NoiseLevel': ''quiet'', 'GoodForKids': 'True..."
5,"{'RestaurantsGoodForGroups': 'True', 'Alcohol'..."


## Create a full list of attributes

### mini pilot

In [11]:
# Check which Python type each sample attributes value has.
for position, attributes_value in enumerate(attr_tmp_df['attributes']):
    print(position, type(attributes_value))

0 <class 'dict'>
1 <class 'dict'>
2 <class 'dict'>
3 <class 'dict'>
4 <class 'dict'>
5 <class 'dict'>


In [13]:
# Take the third row's attributes as a concrete example to experiment with.
test_dict = attributes_df['attributes'].iloc[2]
test_dict

{'RestaurantsPriceRange2': '1',
 'BusinessAcceptsCreditCards': 'True',
 'HasTV': 'False',
 'Ambience': "{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': True}",
 'WiFi': "'free'",
 'GoodForKids': 'True',
 'RestaurantsAttire': "u'casual'",
 'Caters': 'True',
 'BikeParking': 'True',
 'OutdoorSeating': 'False',
 'Alcohol': "'full_bar'",
 'RestaurantsGoodForGroups': 'True',
 'RestaurantsReservations': 'False',
 'GoodForMeal': "{'dessert': False, 'latenight': False, 'lunch': False, 'dinner': False, 'brunch': False, 'breakfast': False}",
 'RestaurantsDelivery': 'None',
 'BusinessParking': "{'garage': True, 'street': False, 'validated': False, 'lot': False, 'valet': False}",
 'RestaurantsTakeOut': 'None',
 'NoiseLevel': "u'average'"}

In [22]:
# Attribute names present in the example (iterating a dict yields its keys)
list(test_dict)

['RestaurantsPriceRange2',
 'BusinessAcceptsCreditCards',
 'HasTV',
 'Ambience',
 'WiFi',
 'GoodForKids',
 'RestaurantsAttire',
 'Caters',
 'BikeParking',
 'OutdoorSeating',
 'Alcohol',
 'RestaurantsGoodForGroups',
 'RestaurantsReservations',
 'GoodForMeal',
 'RestaurantsDelivery',
 'BusinessParking',
 'RestaurantsTakeOut',
 'NoiseLevel']

In [26]:
# Pilot: count in how many sample rows each attribute name appears.
attr_tmp_dict = {}

for attributes_value in attr_tmp_df['attributes']:
    for raw_name in attributes_value.keys():
        name = raw_name.strip()
        attr_tmp_dict[name] = attr_tmp_dict.get(name, 0) + 1

print(attr_tmp_dict)

{'RestaurantsTakeOut': 6, 'RestaurantsAttire': 5, 'GoodForKids': 5, 'BikeParking': 6, 'OutdoorSeating': 5, 'Ambience': 5, 'Caters': 5, 'RestaurantsReservations': 5, 'RestaurantsDelivery': 6, 'HasTV': 5, 'RestaurantsGoodForGroups': 5, 'BusinessAcceptsCreditCards': 6, 'NoiseLevel': 6, 'ByAppointmentOnly': 1, 'RestaurantsPriceRange2': 6, 'WiFi': 4, 'BusinessParking': 6, 'Alcohol': 5, 'GoodForMeal': 5, 'DogsAllowed': 2, 'Music': 1, 'BusinessAcceptsBitcoin': 1, 'GoodForDancing': 1, 'BestNights': 1, 'HappyHour': 1, 'RestaurantsTableService': 2, 'WheelchairAccessible': 2}


### Apply to the *attributes* column of all restaurants

In [34]:
# Count in how many restaurants each attribute name appears, and record the
# positions of rows whose attributes value is not a dict (i.e. missing).
#
# attributes_dict : attribute name -> number of rows containing it
# index_to_check  : row positions (from enumerate) with no usable attributes
attributes_dict = {}
index_to_check = []

for i, dict_item in enumerate(restaurant_df['attributes']):
    # Test for the missing case explicitly instead of catching every
    # exception, so genuine errors are not silently filed as "missing".
    if not isinstance(dict_item, dict):
        index_to_check.append(i)
        continue
    for element in dict_item:
        cleaned_element = element.strip()
        attributes_dict[cleaned_element] = attributes_dict.get(cleaned_element, 0) + 1

In [31]:
# Row positions that could not be parsed as an attributes dict
print(index_to_check)

[27, 134, 206, 296, 694, 1031, 1218, 1528, 1752, 2051, 2070, 2361, 2722, 2822, 2849, 3523, 3767, 3798, 3931, 3941, 3963, 4380, 5024, 5403, 5509, 5518, 5600, 5706, 5731, 5801, 5843, 6054, 6080, 6247, 7253, 7341, 7595, 7846, 8105, 8247, 8463, 8602, 8901, 9038, 9043, 9211, 9290, 10017, 10434, 10490, 10494, 10628, 10889, 11212, 11276, 11793, 12152, 12474, 12781, 12942, 12949, 13185, 13250, 13273, 13420, 13661, 13662, 13672, 13783, 14118, 14318, 14362, 14476, 14512, 14552, 14735, 14998, 15027, 15158, 15245, 15410, 15655, 15823, 15825, 15832, 15940, 15962, 16175, 16800, 17301, 17618, 18004, 18068, 18273, 18275, 18530, 18624, 19409, 19617, 20040, 20285, 20375]


In [32]:
# How many rows were flagged
len(index_to_check)

102

In [35]:
# index_to_check holds row positions (it was filled from enumerate), so select
# by position with .iloc; .loc would treat the numbers as index labels.
restaurant_df.iloc[index_to_check]

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
27,opdd6sSWjwiy-doXnlpMeA,Tin Lizzy's Cantina,229 Peachtree St,Atlanta,GA,30308,33.759923,-84.386444,3.5,10,1,,"Restaurants, Nightlife, Mexican, Bars, Sports ...",
134,0c8NZgbg1Q0vwXMPZTUlvA,Campo di Fiori,580 Commonwealth Ave,Boston,MA,02228,42.348810,-71.099424,3.5,6,0,,"Restaurants, Italian",
206,lKhF50ZDNZoz6jA5YynRtg,Al Forno's,27 12th St NW,Atlanta,GA,30309,33.784570,-84.388229,3.0,5,0,,"Pizza, Restaurants","{'Monday': '16:0-22:0', 'Tuesday': '16:0-22:0'..."
296,8ehEKnoY-ZFjRfM1q0Ybhg,Moda Cars,12832 NE Airport Way,Portland,OR,97230,45.562559,-122.528696,1.0,6,0,,"Car Dealers, Automotive, Restaurants, Pizza","{'Monday': '10:0-18:0', 'Tuesday': '10:0-18:0'..."
694,af3n7eYe1D8M45VA_1kIXA,Jin-Dak,1112 SE Tacoma St,Portland,OR,97202,45.464096,-122.654565,5.0,12,1,,"Restaurants, Food Trucks, Food, Korean",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19409,scDAcEG8GUbfbfSI-ZS5Iw,Mario's Deli,3202 W Anderson Ln,Austin,TX,78757,30.362109,-97.741429,3.0,7,0,,"Restaurants, Delis","{'Monday': '10:0-20:0', 'Tuesday': '10:0-20:0'..."
19617,kDqsJKCWnLSKbSzfMDJqVg,Have A Taco,4602 Welekta Dr,Austin,TX,78734,30.406835,-97.925985,1.5,19,0,,"Food Trucks, Food, American (Traditional), Res...","{'Wednesday': '12:0-21:0', 'Thursday': '12:0-2..."
20040,IDv-q4te2TsaPUbeWIgShQ,Veronica's Papusas Y Tacos,4276-4298 Lockhart Hwy,Austin,TX,78744,30.189091,-97.686166,3.5,6,0,,"Food Trucks, Mexican, Salvadoran, Restaurants,...","{'Monday': '10:30-19:0', 'Tuesday': '10:30-19:..."
20285,m9tVJ_9hWQI3WFTefEkF9Q,Bonehead's,2349 Peachtree Rd NE,Atlanta,GA,30305,33.820130,-84.388335,3.0,5,0,,"American (New), Restaurants",


In [36]:
# create a attributes-aggregate data frame
attributes_agg = pd.DataFrame(list(attributes_dict.items()))
attributes_agg.columns = ['attribute', 'number_of_restaurants']
attributes_agg.head()

Unnamed: 0,attribute,number_of_restaurants
0,RestaurantsTakeOut,19530
1,RestaurantsAttire,17047
2,GoodForKids,17624
3,BikeParking,14493
4,OutdoorSeating,18511


In [37]:
# sort and export
attributes_agg_sorted = attributes_agg.sort_values(by = 'number_of_restaurants', ascending = False)
attributes_agg_sorted.to_csv('attributes_agg.csv', index = False)

## Check for possible exclusion (by attributes)

In [207]:
# Attribute name inspected by the cells below
attr = 'BusinessAcceptsBitcoin'

# Keep only the rows that have an attributes value
has_attributes = restaurant_df['attributes'].notna()
restaurants_nomissing_attr_df = restaurant_df[has_attributes]


In [135]:
# Missing-value count per column; the attributes column should show 0 here
restaurants_nomissing_attr_df.isna().sum() # check no missing values in attributes column

business_id        0
name               0
address            0
city               0
state              0
postal_code        0
latitude           0
longitude          0
stars              0
review_count       0
is_open            0
attributes         0
categories         0
hours           3140
dtype: int64

In [177]:
# The attributes column of the filtered frame
restaurants_nomissing_attr_df.attributes

0        {'RestaurantsTakeOut': 'True', 'RestaurantsAtt...
1        {'BusinessParking': '{'garage': False, 'street...
2        {'RestaurantsPriceRange2': '1', 'BusinessAccep...
3        {'NoiseLevel': 'u'average'', 'BikeParking': 'F...
4        {'NoiseLevel': ''quiet'', 'GoodForKids': 'True...
                               ...                        
20688    {'RestaurantsGoodForGroups': 'False', 'GoodFor...
20689    {'RestaurantsPriceRange2': '4', 'RestaurantsGo...
20690    {'BusinessAcceptsCreditCards': 'True', 'Outdoo...
20691    {'Alcohol': 'u'beer_and_wine'', 'RestaurantsAt...
20692    {'WiFi': 'u'no'', 'BusinessAcceptsCreditCards'...
Name: attributes, Length: 20591, dtype: object

In [183]:
# Value of the chosen attribute per restaurant (None where the dict lacks it)
restaurants_nomissing_attr_df['attributes'].apply(lambda attrs: attrs.get(attr))

0         None
1         None
2         None
3        False
4         None
         ...  
20688     None
20689     None
20690     None
20691     None
20692     None
Name: attributes, Length: 20591, dtype: object

In [188]:
# How often each value of the chosen attribute occurs, including absent ones
attr_values = restaurants_nomissing_attr_df['attributes'].apply(lambda attrs: attrs.get(attr))
attr_values.value_counts(dropna=False)

NaN      17349
False     3187
True        55
Name: attributes, dtype: int64

In [203]:
# One example attributes dict (the 9th) among the restaurants whose value for
# the chosen attribute is the string 'False'.
attr_values = restaurants_nomissing_attr_df['attributes'].apply(lambda attrs: attrs.get(attr))
restaurants_nomissing_attr_df.loc[attr_values == 'False', 'attributes'].iloc[8]


{'HasTV': 'True',
 'BusinessParking': "{'garage': False, 'street': True, 'validated': False, 'lot': False, 'valet': False}",
 'RestaurantsTakeOut': 'True',
 'NoiseLevel': "u'average'",
 'RestaurantsAttire': "u'casual'",
 'BusinessAcceptsCreditCards': 'True',
 'RestaurantsPriceRange2': '2',
 'BikeParking': 'True',
 'RestaurantsGoodForGroups': 'True',
 'RestaurantsTableService': 'True',
 'BusinessAcceptsBitcoin': 'False',
 'WiFi': "u'free'",
 'RestaurantsReservations': 'True',
 'OutdoorSeating': 'True',
 'Smoking': "u'no'",
 'WheelchairAccessible': 'True',
 'CoatCheck': 'False',
 'GoodForKids': 'True',
 'ByAppointmentOnly': 'False',
 'BestNights': "{'monday': False, 'tuesday': True, 'friday': True, 'wednesday': False, 'thursday': False, 'sunday': False, 'saturday': True}",
 'Music': "{'dj': False, 'background_music': False, 'no_music': False, 'jukebox': False, 'live': False, 'video': False, 'karaoke': False}",
 'BYOB': 'False',
 'Caters': 'True',
 'HappyHour': 'True',
 'Alcohol': "u'full

In [208]:
# Last few restaurants whose value for the chosen attribute is the string 'False'
is_false = restaurants_nomissing_attr_df['attributes'].apply(lambda attrs: attrs.get(attr)) == 'False'
restaurants_nomissing_attr_df[is_false].tail()


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
20660,s5eDVoLc5Es1b6lI1W-DVw,Gemma Love,415 Jessie St,Austin,TX,78704,30.250362,-97.74996,4.0,82,0,"{'RestaurantsAttire': 'u'casual'', 'NoiseLevel...","Caribbean, Restaurants, Food Trucks, Gluten-Fr...","{'Monday': '11:0-19:0', 'Tuesday': '11:0-19:0'..."
20662,znkU8_QJ4Dksd8XUVjXBcg,Fuku Boston Seaport,43 Northern Ave,Boston,MA,2210,42.353272,-71.046872,3.5,182,0,"{'RestaurantsReservations': 'False', 'HappyHou...","Restaurants, Chicken Shop","{'Monday': '11:0-20:0', 'Tuesday': '11:0-20:0'..."
20663,um4x6dIkv8w2a4e0VkRNmA,Rocco's Cucina & Bar,450 Commercial St,Boston,MA,2109,42.36738,-71.053411,3.5,164,1,"{'RestaurantsTakeOut': 'True', 'WiFi': 'u'free...","Beer, Wine & Spirits, Italian, American (Tradi...","{'Monday': '14:30-1:30', 'Tuesday': '16:0-22:0..."
20673,WoI1IisL_AgmWdiJLRb-Zw,Democracy Brewing,35 Temple Pl,Boston,MA,2111,42.355088,-71.06215,4.0,216,1,"{'RestaurantsPriceRange2': '2', 'RestaurantsAt...","Restaurants, Food, Bars, Venues & Event Spaces...","{'Monday': '0:0-0:0', 'Tuesday': '12:0-20:0', ..."
20687,E42dulQ-OfZ3caiKKh7J5w,LongHorn Steakhouse,3101 E Colonial Dr,Orlando,FL,32803,28.553748,-81.344369,4.0,73,1,"{'BusinessAcceptsBitcoin': 'False', 'OutdoorSe...","Barbeque, Restaurants, American (Traditional),...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-22:0', ..."


#### check 'AcceptsInsurance'

In [209]:
# Switch the attribute inspected by the following cells
attr = 'AcceptsInsurance'

In [211]:
# How often each value of the chosen attribute occurs, including absent ones
attr_values = restaurants_nomissing_attr_df['attributes'].apply(lambda attrs: attrs.get(attr))
attr_values.value_counts(dropna=False)

NaN     20587
True        4
Name: attributes, dtype: int64

In [212]:
# Restaurants whose value for the chosen attribute is the string 'True'
is_true = restaurants_nomissing_attr_df['attributes'].apply(lambda attrs: attrs.get(attr)) == 'True'
restaurants_nomissing_attr_df[is_true]

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
989,bTSV6N7nKu0Ai-L_ud_hEg,Peoples Rx,4018 N Lamar Blvd,Austin,TX,78756,30.309143,-97.741172,4.0,102,1,"{'RestaurantsPriceRange2': '2', 'HasTV': 'Fals...","Restaurants, Delis, Gluten-Free, Drugstores, F...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-20:0', '..."
9668,7Kumug8bCzwDu4STXQhUjg,Peoples Rx,3801 S Lamar Blvd,Austin,TX,78704,30.237994,-97.789093,4.0,104,1,"{'RestaurantsReservations': 'False', 'Business...","Specialty Food, Nutritionists, Drugstores, Foo...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-20:0', '..."
11825,on1lHO28hVhcLpj1E4dQ-A,Peoples Rx,4201 Westbank Dr,Austin,TX,78746,30.275571,-97.816528,4.5,50,1,"{'GoodForKids': 'False', 'BikeParking': 'False...","Drugstores, Restaurants, Medical Centers, Shop...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-19:0', '..."
17254,t4idMNEUIzLgs2hDNwoNNg,"Geoffrey Cox, MD","7900 Fm 1826, Bldg 2, Ste 202",Austin,TX,78737,30.226264,-97.891933,4.0,23,1,"{'ByAppointmentOnly': 'True', 'BusinessAccepts...","Restaurants, Doctors, Health & Medical, Family...","{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ..."


#### check 'AgesAllowed'

In [215]:
# Switch the attribute inspected by the following cells
attr = 'AgesAllowed'

In [216]:
# How often each value of the chosen attribute occurs, including absent ones
attr_values = restaurants_nomissing_attr_df['attributes'].apply(lambda attrs: attrs.get(attr))
attr_values.value_counts(dropna=False)

NaN           20575
u'21plus'        11
u'allages'        4
u'18plus'         1
Name: attributes, dtype: int64

In [217]:
# Restaurants whose chosen attribute equals the raw string "u'21plus'"
is_21plus = restaurants_nomissing_attr_df['attributes'].apply(lambda attrs: attrs.get(attr)) == "u'21plus'"
restaurants_nomissing_attr_df[is_21plus]

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
790,4t-RvT_2SJbxCOR1SYkYFg,SHOTS - Orlando,"69 E Pine St, Fl 1",Orlando,FL,32801,28.541491,-81.377613,4.0,68,1,"{'RestaurantsGoodForGroups': 'True', 'Wheelcha...","American (New), Dance Clubs, Food, Restaurants...","{'Monday': '0:0-0:0', 'Tuesday': '20:0-2:0', '..."
2062,DIRXMPneSiHoOani8QnI3A,Hong Kong At Faneuil Hall,65 Chatham St,Boston,MA,2109,42.359669,-71.053908,3.0,429,1,"{'RestaurantsGoodForGroups': 'True', 'GoodForK...","Restaurants, Karaoke, Pubs, Chinese, Dance Clu...","{'Monday': '16:0-2:0', 'Tuesday': '16:0-2:0', ..."
10548,geqIxijPUD8nVDI2VKnPuw,Day Dreams,257 Trinity Ave SW,Atlanta,GA,30303,33.750785,-84.396315,2.5,20,0,"{'RestaurantsReservations': 'True', 'WiFi': 'u...","Breakfast & Brunch, Dance Clubs, Bars, Nightli...","{'Tuesday': '20:0-1:0', 'Thursday': '23:0-3:0'..."
11596,9yoVzC9qgUU6bM3tVdEggA,Moondogs,3179 Peachtree Rd NE,Atlanta,GA,30305,33.840803,-84.375263,3.5,129,1,"{'GoodForKids': 'False', 'BikeParking': 'True'...","American (New), Dance Clubs, Restaurants, Kara...","{'Wednesday': '20:0-3:0', 'Thursday': '20:0-3:..."
12745,FDCVhmfBuIVomUSeqIZMqQ,MOTHER,447 Edgewood Ave SE,Atlanta,GA,30312,33.754156,-84.372881,3.5,125,0,"{'WiFi': 'u'free'', 'Caters': 'True', 'Busines...","Nightlife, Bars, Dance Clubs, American (Tradit...","{'Monday': '17:0-2:30', 'Tuesday': '17:0-2:30'..."
12753,5zaDIn0r3GvF8Ecsvgen_Q,Jones,107 NW Couch St,Portland,OR,97209,45.524083,-122.67167,3.0,199,1,"{'RestaurantsTakeOut': 'False', 'RestaurantsRe...","Dance Clubs, Diners, Nightlife, Pubs, Bars, Re...","{'Friday': '20:0-2:30', 'Saturday': '20:0-2:30'}"
13582,AYQ8IWdau__yAmL7Jrl1YA,Latitudes Rooftop Tiki Bar,33 W Church St,Orlando,FL,32801,28.540621,-81.379767,3.5,114,1,"{'Alcohol': 'u'full_bar'', 'RestaurantsTakeOut...","Dance Clubs, Nightlife, Tiki Bars, Restaurants...","{'Monday': '16:30-2:0', 'Tuesday': '16:30-2:0'..."
14429,lQLsl-Y2pTEOXZClTqKAig,The Highball,1120 S Lamar Blvd,Austin,TX,78704,30.256129,-97.763212,3.0,153,1,"{'BestNights': '{'monday': True, 'tuesday': Fa...","Tapas/Small Plates, Bars, Karaoke, Dance Clubs...","{'Monday': '16:0-0:0', 'Tuesday': '16:0-0:0', ..."
15485,iEqC57gAJ3abHs8ERz_9Qg,Chillers Frozen Drink Bar,33 W Church St,Orlando,FL,32801,28.540621,-81.379794,3.0,64,1,"{'RestaurantsAttire': ''casual'', 'Alcohol': '...","Bars, Restaurants, Nightlife, Local Flavor, Da...","{'Monday': '16:30-2:30', 'Tuesday': '16:30-2:3..."
16205,u0MJq6N7PsRGE_T8HjZfrQ,Club Ellery's,2008 Campbellton Rd,Atlanta,GA,30311,33.706613,-84.455378,4.0,33,1,"{'RestaurantsPriceRange2': '2', 'Alcohol': 'u'...","Pubs, American (New), Nightlife, Restaurants, ...","{'Wednesday': '18:0-3:0', 'Thursday': '18:0-3:..."


In [218]:
# Restaurants whose chosen attribute equals the raw string "u'allages'"
is_allages = restaurants_nomissing_attr_df['attributes'].apply(lambda attrs: attrs.get(attr)) == "u'allages'"
restaurants_nomissing_attr_df[is_allages]

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
988,G4enu8vxbQk4h_2Cd3TEgA,West End Johnnies,138 Portland St,Boston,MA,2114,42.363774,-71.061382,3.0,311,1,"{'WiFi': 'u'no'', 'HasTV': 'True', 'BikeParkin...","Nightlife, Dance Clubs, Restaurants, American ...","{'Thursday': '16:0-2:0', 'Friday': '16:0-2:0',..."
2687,I3wU08pdP80roBXD9rf3XQ,GEM,42 Province St,Boston,MA,2108,42.356939,-71.059906,3.0,98,0,"{'GoodForKids': 'False', 'NoiseLevel': 'u'aver...","Bars, Restaurants, Lounges, Italian, Dance Clu...","{'Wednesday': '22:0-2:0', 'Friday': '21:0-2:0'..."
18689,Yt9F5q71-aGROQCWEMHAjg,Mango's Tropical Cafe,8126 International Dr,Orlando,FL,32819,28.448139,-81.471754,3.5,500,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Restaurants, Dance Clubs, Bars, Cabaret, Cockt...","{'Monday': '18:0-2:0', 'Tuesday': '18:0-2:0', ..."
18720,HTxDHuQMcmcxyqBlxxiTIQ,Cuba Libre Restaurant & Rum Bar - Orlando,9101 International Dr,Orlando,FL,32819,28.43127,-81.469966,4.0,783,1,"{'RestaurantsGoodForGroups': 'True', 'Caters':...","Dance Clubs, Cuban, Restaurants, Latin America...","{'Monday': '0:0-0:0', 'Tuesday': '17:0-20:0', ..."
