# Yelp Feature Preprocessing

## Dataset Information
Source: Yelp API

The data was pulled with the script `yelp_scrape.py` (in the repo), using the following parameters:

```py
DEFAULT_TERM = 'restaurants'
DEFAULT_LOCATION = 'New York'
NUM_BUSINESSES = 500
```

The pulled data is stored in `businesses.json` (not in the repo).
- date pulled: 11/08/2018

## Relevant Yelp Features
We want preprocess and save the following list of features for use in our app's recommendation functionality:

- Diversity of food??
- Type of cuisine  
- Type of restaurant - cafe, (sit-down) restaurant, diner, breakfast & brunch 
- Price ($)
- Restaurant Rating (>3.5 stars)
- Vegan, vegetarianism, gluten- free

In [1]:
import pandas as pd
import numpy as np
import sklearn
import sklearn.preprocessing
import re
import pickle
import json

In [2]:
businesses = json.load(open('businesses.json'))

# structure of businesses is basically a list of business entries, each represented as a dictionary

In [3]:
# e.g. the first business entry looks like this:
businesses[0]

{'id': 'ETgJqJHV7BW6pIr9Ox74sA',
 'alias': 'amélie-new-york',
 'name': 'Amélie',
 'image_url': 'https://s3-media4.fl.yelpcdn.com/bphoto/cSDgVuPMnJgMLTrTNSEXug/o.jpg',
 'is_claimed': True,
 'is_closed': False,
 'url': 'https://www.yelp.com/biz/am%C3%A9lie-new-york?adjust_creative=vSc4zqXxou5UdN6T_sbcqg&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_lookup&utm_source=vSc4zqXxou5UdN6T_sbcqg',
 'phone': '+12125332962',
 'display_phone': '(212) 533-2962',
 'review_count': 2220,
 'categories': [{'alias': 'french', 'title': 'French'},
  {'alias': 'wine_bars', 'title': 'Wine Bars'}],
 'rating': 4.5,
 'location': {'address1': '22 W 8th St',
  'address2': '',
  'address3': '',
  'city': 'New York',
  'zip_code': '10011',
  'country': 'US',
  'state': 'NY',
  'display_address': ['22 W 8th St', 'New York, NY 10011'],
  'cross_streets': '5th Ave & Mac Dougal St'},
 'coordinates': {'latitude': 40.7327, 'longitude': -73.99766},
 'photos': ['https://s3-media4.fl.yelpcdn.com/bphoto/cSDgVuPMnJgMLTr

In [4]:
businesses_frame = pd.io.json.json_normalize(businesses)
businesses_frame.shape

(500, 27)

In [5]:
businesses_frame.columns

Index(['alias', 'categories', 'coordinates.latitude', 'coordinates.longitude',
       'display_phone', 'hours', 'id', 'image_url', 'is_claimed', 'is_closed',
       'location.address1', 'location.address2', 'location.address3',
       'location.city', 'location.country', 'location.cross_streets',
       'location.display_address', 'location.state', 'location.zip_code',
       'name', 'phone', 'photos', 'price', 'rating', 'review_count',
       'transactions', 'url'],
      dtype='object')

In [6]:
# Diversity of food??
# Type of cuisine
# Type of restaurant - cafe, (sit-down) restaurant, diner, breakfast & brunch
# Price ($)
# Restaurant Rating (>3.5 stars)
# Vegan, vegetarianism, gluten- free

KEEP_COLS = ['id',
             'name',
             'url',
             'price',
             'rating',
             'coordinates.latitude',
             'coordinates.longitude',
             'categories'
            ]

In [7]:
businesses_frame = businesses_frame[KEEP_COLS]
businesses_frame.head()

Unnamed: 0,id,name,url,price,rating,coordinates.latitude,coordinates.longitude,categories
0,ETgJqJHV7BW6pIr9Ox74sA,Amélie,https://www.yelp.com/biz/am%C3%A9lie-new-york?...,$$,4.5,40.7327,-73.99766,"[{'alias': 'french', 'title': 'French'}, {'ali..."
1,UA2M9QFZghe-9th2KwLoWQ,Burger & Lobster,https://www.yelp.com/biz/burger-and-lobster-ne...,$$,4.0,40.74007,-73.99344,"[{'alias': 'seafood', 'title': 'Seafood'}, {'a..."
2,8Oo2AtQEPDfxIOnA8wfXoQ,886,https://www.yelp.com/biz/886-new-york?adjust_c...,$$,4.0,40.72877,-73.98873,"[{'alias': 'taiwanese', 'title': 'Taiwanese'}]"
3,pL2tig3FxJcrZ4wDIO4TsA,Eataly Downtown,https://www.yelp.com/biz/eataly-downtown-new-y...,$$,3.5,40.709716,-74.011714,"[{'alias': 'grocery', 'title': 'Grocery'}, {'a..."
4,jjJc_CrkB2HodEinB6cWww,LoveMama,https://www.yelp.com/biz/lovemama-new-york?adj...,$$,4.0,40.730386,-73.986061,"[{'alias': 'thai', 'title': 'Thai'}, {'alias':..."


In [8]:
# get all unique (title) values in "categories"

all_categories_title_set = set()
for i in range(businesses_frame.shape[0]):
    categories_list = businesses_frame.loc[i, 'categories']
    for j in range(len(categories_list)):
        all_categories_title_set.add(categories_list[j]['title'])
        
all_categories_title_set

{'African',
 'American (New)',
 'American (Traditional)',
 'Argentine',
 'Asian Fusion',
 'Australian',
 'Bakeries',
 'Barbeque',
 'Bars',
 'Basque',
 'Beer Bar',
 'Beer Gardens',
 'Beer, Wine & Spirits',
 'Brasseries',
 'Brazilian',
 'Breakfast & Brunch',
 'British',
 'Bubble Tea',
 'Buffets',
 'Burgers',
 'Cafes',
 'Cajun/Creole',
 'Cambodian',
 'Cantonese',
 'Caribbean',
 'Chicken Wings',
 'Chinese',
 'Cocktail Bars',
 'Colombian',
 'Comfort Food',
 'Cuban',
 'Delis',
 'Desserts',
 'Dim Sum',
 'Diners',
 'Dominican',
 'Ethiopian',
 'Fast Food',
 'Filipino',
 'Food Court',
 'Food Stands',
 'French',
 'Gastropubs',
 'Georgian',
 'German',
 'Gluten-Free',
 'Greek',
 'Grocery',
 'Halal',
 'Hot Pot',
 'Ice Cream & Frozen Yogurt',
 'Indian',
 'Indonesian',
 'Irish',
 'Italian',
 'Izakaya',
 'Japanese',
 'Jazz & Blues',
 'Karaoke',
 'Korean',
 'Laotian',
 'Latin American',
 'Lebanese',
 'Live/Raw Food',
 'Lounges',
 'Malaysian',
 'Meat Shops',
 'Mediterranean',
 'Mexican',
 'Middle Eastern

In [9]:
# flatten "categories" into one-hot encoded columns

# initialize a column for each of the category titles
for title in all_categories_title_set:
    businesses_frame[title] = 0  # recycle value 0
    
for i in range(businesses_frame.shape[0]):
    categories_list = businesses_frame.loc[i, 'categories']
    categories_title_list = []
    for j in range(len(categories_list)):
        categories_title_list.append(categories_list[j]['title'])
    
    for title in all_categories_title_set:
        if title in categories_title_list:
            businesses_frame.at[i, title] = 1  # one-hot encoding

In [10]:
# check that one-hot encoding is correct, e.g.

print(businesses_frame.loc[0,'categories'])  # original format of category titles
print(businesses_frame.loc[0, all_categories_title_set].unique())  # some encoding columns are on, some are off
print(businesses_frame.loc[0,'French'])  # the correct category title is on
print(businesses_frame.loc[0,'Wine Bars'])  # the correct category title is on
print(businesses_frame.loc[0, all_categories_title_set].drop(['French', 'Wine Bars']).unique())  # once we drop the correct category titles, no encoding column is on

[{'alias': 'french', 'title': 'French'}, {'alias': 'wine_bars', 'title': 'Wine Bars'}]
[0 1]
1
1
[0]


In [11]:
businesses_frame.drop('categories', axis = 1, inplace = True)

In [12]:
# only include restaurants with >= 4 rating, as most restaurants have >= 4 rating
businesses_frame = businesses_frame[businesses_frame['rating'] >= 4]
businesses_frame.shape[0]  # 481/500 restaurants have >= 4 rating

481

In [13]:
# reset index
businesses_frame.reset_index(drop=True, inplace=True)

In [14]:
businesses_frame.head()

Unnamed: 0,id,name,url,price,rating,coordinates.latitude,coordinates.longitude,Pubs,Indian,Taiwanese,...,Steakhouses,Tex-Mex,Peruvian,Gluten-Free,Diners,Portuguese,American (New),Ramen,Szechuan,Shanghainese
0,ETgJqJHV7BW6pIr9Ox74sA,Amélie,https://www.yelp.com/biz/am%C3%A9lie-new-york?...,$$,4.5,40.7327,-73.99766,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,UA2M9QFZghe-9th2KwLoWQ,Burger & Lobster,https://www.yelp.com/biz/burger-and-lobster-ne...,$$,4.0,40.74007,-73.99344,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,8Oo2AtQEPDfxIOnA8wfXoQ,886,https://www.yelp.com/biz/886-new-york?adjust_c...,$$,4.0,40.72877,-73.98873,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,jjJc_CrkB2HodEinB6cWww,LoveMama,https://www.yelp.com/biz/lovemama-new-york?adj...,$$,4.0,40.730386,-73.986061,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CwOAKJdX8AMz5iAoA-ZEuA,Uglyduckling,https://www.yelp.com/biz/uglyduckling-brooklyn...,$$,4.0,40.686023,-73.991302,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# save
pickle.dump(businesses_frame, open("yelp_businesses_cleaned.pickle", "wb"))