# About

Second pass at extracting restaurants from the Yelp business dataset. This notebook is to be deleted once the related functions are incorporated into `preprocess.py` and `create-sample-ids.ipynb`. 


In [1]:
import json
import pandas as pd

# Load dataset and file

## Load Yelp business dataset

In [3]:
# Read the newline-delimited JSON file: each line is one business record.
# The context manager closes the handle even if a line fails to parse, and the
# explicit encoding keeps the read independent of the platform default.
with open("yelp_academic_dataset_business.json", encoding="utf-8") as data_file:
    data = [json.loads(line) for line in data_file]
business_df = pd.DataFrame(data)

In [4]:
# preview the first rows of the loaded business table
business_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,1,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'..."
1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ..."
2,bvN78flM8NLprQ1a1y5dRg,The Reclaimory,4720 Hawthorne Ave,Portland,OR,97214,45.511907,-122.613693,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Antiques, Fashion, Used, Vintage & Consignment...","{'Thursday': '11:0-18:0', 'Friday': '11:0-18:0..."
3,oaepsyvc0J17qwi8cfrOWg,Great Clips,2566 Enterprise Rd,Orange City,FL,32763,28.914482,-81.295979,3.0,8,1,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Beauty & Spas, Hair Salons",
4,PE9uqAjdw0E4-8mjGl3wVA,Crossfit Terminus,1046 Memorial Dr SE,Atlanta,GA,30316,33.747027,-84.353424,4.0,14,1,"{'GoodForKids': 'False', 'BusinessParking': '{...","Gyms, Active Life, Interval Training Gyms, Fit...","{'Monday': '16:0-19:0', 'Tuesday': '16:0-19:0'..."


In [5]:
# (rows, columns) of the full business table
business_df.shape

(160585, 14)

## Load category selection file

In [6]:
# category table with a per-category restaurant count and an `exclude` flag;
# the flag is used further down to build the list of categories to exclude
category_df = pd.read_csv('categories_agg_211011.csv')

In [7]:
# preview the first rows of the category table
category_df.head()

Unnamed: 0,category,number_of_restaurants,exclude,notes
0,Restaurants,20693,0,
1,Food,7378,0,
2,Nightlife,4232,0,
3,Bars,4030,0,
4,Sandwiches,3099,0,


## Create a full list of categories that have *Hotels* in them

In [118]:
# Boolean mask: True where the comma-separated category string mentions 'Hotels'.
# Rows with a missing category string count as non-matches (na=False).
include_categ_hotels = business_df['categories'].str.contains('Hotels', na=False)

# Keep only the rows flagged by the mask.
hotels_df = business_df.loc[include_categ_hotels]

In [126]:
# Collect every distinct category label used by the hotel businesses, in
# first-seen order. dict.fromkeys acts as an ordered set, so each membership
# check is a hash lookup rather than a scan of the growing list.
hotels_categories_raw = list(dict.fromkeys(
    element.strip()
    for item in hotels_df['categories']
    for element in item.split(',')
))

print("numbers of categories:", len(hotels_categories_raw))
print(hotels_categories_raw[:10])

numbers of categories: 582
['Restaurants', 'American (New)', 'Food Court', 'Flowers & Gifts', 'Hotels & Travel', 'Gift Shops', 'Resorts', 'Shopping', 'Hotels', 'Vacation Rentals']


In [132]:
# Keep only the category labels whose text contains the keyword 'Hotels'.
hotels_categories_clean = [label for label in hotels_categories_raw if 'Hotels' in label]

print("number of categories that has keyword 'Hotels':", len(hotels_categories_clean))
print(hotels_categories_clean)

number of categories that has keyword 'Hotels': 2
['Hotels & Travel', 'Hotels']


## Subset for restaurants

In [11]:
# identify businesses that tag themselves in the 'Restaurants' category
# (substring match on the comma-separated category string; missing values do not match)
include_categ_restaurant = business_df['categories'].str.contains('Restaurants', na = False)

# extract those businesses; .copy() makes an independent frame so that adding the
# 'not_restaurant' column later does not raise SettingWithCopyWarning
restaurant_raw_df = business_df[include_categ_restaurant].copy()

In [12]:
# number of businesses matched by the 'Restaurants' filter, and the column count
restaurant_raw_df.shape

(50763, 14)

### Subset by categories

In [116]:
# Build the exclusion list from the category_df rows flagged with exclude == 1
# (the 'Hotels' categories are not part of it yet).
is_excluded = category_df['exclude'] == 1
category_excluded = list(category_df.loc[is_excluded, 'category'])
print("number of categories to exclude:", len(category_excluded))

number of categories to exclude: 345


In [134]:
# append categories that have keyword 'Hotels'; skip any that are already present
# so that re-running this cell does not add duplicates to the exclusion list
for category in hotels_categories_clean:
    if category not in category_excluded:
        category_excluded.append(category)

In [135]:
# sanity check: the list should now also hold the 'Hotels' categories appended above
print("number of categories to exclude:", len(category_excluded))

number of categories to exclude: 347


In [72]:
# category string of the first restaurant row, used as a test input for check_categories
case_1 = restaurant_raw_df['categories'].iloc[0]
case_1

'Gastropubs, Food, Beer Gardens, Restaurants, Bars, American (Traditional), Beer Bar, Nightlife, Breweries'

In [74]:
# category string of the row at position 6, used as a second test input
# (it includes 'Food Court' and 'Hotels & Travel')
case_2 = restaurant_raw_df['categories'].iloc[6]
case_2

'Restaurants, American (New), Food Court, Flowers & Gifts, Hotels & Travel, Gift Shops, Resorts, Shopping'

In [136]:
def check_categories(item, reference_list):
    """Return True if any category in a comma-separated string is in `reference_list`.

    Parameters
    ----------
    item : str
        Comma-separated category labels, e.g. 'Restaurants, Food Court'.
        Non-string values (such as None or NaN for a missing entry) are
        treated as having no categories.
    reference_list : collection of str
        Category labels to look for; anything that supports `in`.

    Returns
    -------
    bool
        True if at least one whitespace-stripped label from `item` is in
        `reference_list`, otherwise False.
    """
    # A missing category cell is None/NaN rather than str and cannot match anything.
    if not isinstance(item, str):
        return False
    # Labels are separated by ', ' in the data, so strip each piece before comparing.
    return any(label.strip() in reference_list for label in item.split(','))

In [96]:
# expect False: none of the categories in case_1 is in the exclusion list
check_categories(case_1, category_excluded)

item: Gastropubs status: False
item: Food status: False
item: Beer Gardens status: False
item: Restaurants status: False
item: Bars status: False
item: American (Traditional) status: False
item: Beer Bar status: False
item: Nightlife status: False
item: Breweries status: False


False

In [95]:
# expect True: case_2 contains 'Food Court', which is in the exclusion list
check_categories(case_2, category_excluded)

item: Restaurants status: False
item: American (New) status: False
item: Food Court status: True


True

In [137]:
# Row-wise flag: True when a business lists at least one excluded category.
condition = restaurant_raw_df['categories'].apply(
    lambda categories: check_categories(categories, category_excluded)
)

In [139]:
# Tag every row with the exclusion flag in a single step. `assign` returns a new
# frame, so this avoids the SettingWithCopyWarning that masked .loc writes can raise
# on a filtered slice, and the column is built directly from the boolean `condition`
# instead of in two passes through an intermediate NaN-filled column.
restaurant_raw_df = restaurant_raw_df.assign(not_restaurant=condition)

In [140]:
# the frame should have gained exactly one column (not_restaurant)
restaurant_raw_df.shape

(50763, 15)

In [141]:
# how many businesses were tagged for exclusion (True) versus kept (False)
restaurant_raw_df['not_restaurant'].value_counts()

False    38606
True     12157
Name: not_restaurant, dtype: int64

In [145]:
# Keep only the businesses that were not tagged for exclusion.
keep_mask = restaurant_raw_df['not_restaurant'] == False
restaurant_clean_by_categories = restaurant_raw_df.loc[keep_mask]

In [147]:
# the row count should match the number of rows tagged False above
restaurant_clean_by_categories.shape

(38606, 15)

In [149]:
# list the columns; the helper column not_restaurant is still present at this point
restaurant_clean_by_categories.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours', 'not_restaurant'],
      dtype='object')

In [154]:
# The helper flag has served its purpose; remove it from the cleaned frame.
restaurant_clean_by_categories = restaurant_clean_by_categories.drop(columns=['not_restaurant'])

In [155]:
# confirm the helper column not_restaurant is gone
restaurant_clean_by_categories.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours'],
      dtype='object')

In [156]:
# confirm the column count is back to that of the raw business table
restaurant_clean_by_categories.shape

(38606, 14)

### Subset by attributes

In [205]:
# Count how many businesses carry each attribute key.
# Rows whose `attributes` value is not a dict (e.g. a missing value) are recorded
# by position in `index_to_check` for later inspection instead of being counted.
# An explicit type check replaces the bare `except:`, which would also have hidden
# unrelated errors (including KeyboardInterrupt).
attributes_dict = {}
index_to_check = []

for i, dict_item in enumerate(restaurant_clean_by_categories['attributes']):
    if not isinstance(dict_item, dict):
        index_to_check.append(i)
        continue
    for element in dict_item:
        cleaned_element = element.strip()
        attributes_dict[cleaned_element] = attributes_dict.get(cleaned_element, 0) + 1

In [212]:
# check whether attribute 'AcceptsInsurance' is still present among the attribute
# keys counted in attributes_dict for the category-filtered restaurants
'AcceptsInsurance' in attributes_dict

False

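No attribute to subset on: `AcceptsInsurance` does not appear in the attributes of any remaining business, so no further attribute-based filtering is needed.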