# Yelp Data - Pulling Bars

In [57]:
from __future__ import print_function

import argparse
import json
import numpy as np
import math
from math import pi
import matplotlib.pyplot as plt
import os
import pandas as pd
import pprint
import pylab as pl
import requests
from sklearn import preprocessing
import shutil 
import sys
import urllib
import urllib.request 
import zipfile
%matplotlib inline

try:
    # For Python 3.0 and later
    from urllib.error import HTTPError
    from urllib.parse import quote
    from urllib.parse import urlencode
except ImportError:
    # Fall back to Python 2's urllib2 and urllib
    from urllib2 import HTTPError
    from urllib import quote
    from urllib import urlencode
    
from pandas.io.json import json_normalize
from shapely.geometry import Point

In [91]:
# Constants for accessing the API
API_KEY = open('yelpKey.txt', 'r').readlines()[0] # api stored in seperate .txt file
API_HOST = 'https://api.yelp.com'
SEARCH_PATH = '/v3/businesses/search'
BUSINESS_PATH = '/v3/businesses/'

### Search Terms
params = {
    'term': 'bar',
    'location': 'Park Slope, Brooklyn',
    'limit':'50'
}

In [59]:
def request(host=API_HOST, path=SEARCH_PATH, api_key=API_KEY, url_params=params):
    """Given your API_KEY, send a GET request to the API.
    Args:
        host (str): The domain host of the API.
        path (str): The path of the API after the domain.
        API_KEY (str): Your API Key.
        url_params (dict): An optional set of query parameters in the request.
    Returns:
        dict: The JSON response from the request.
    Raises:
        HTTPError: An error occurs from the HTTP request.
    """
    url_params = url_params or {}
    url = '{0}{1}'.format(host, quote(path.encode('utf8')))
    headers = {
        'Authorization': 'Bearer %s' % api_key,
    }

    response = requests.request('GET', url, headers=headers, params=url_params)

    return response.json()

In [60]:
def search(api_key, term, location, categories, offset, price):
    """Query the Search API by a search term and location.
    Args:
        term (str): The search term passed to the API.
        location (str): The search location passed to the API.
    Returns:
        dict: The JSON response from the request.
    """

    url_params = {
        'term': term.replace(' ', '+'),
        'location': location.replace(' ', '+'),
        'limit': int(params['limit']),
        'offset': offset,
        'categories': categories,
        'price':price
    }
    
    find_locs = request(API_HOST, SEARCH_PATH, api_key, url_params=url_params)
    try:
        return json_normalize(find_locs['businesses'])
    except AttributeError:
        return find_locs

In [33]:
# don't really need this, but useful for initially analyzing the pulls from yelp
def print_pretty_yelp_json(json):
    for i in range(len(json['businesses'])):
        item = json['businesses'][i] #['alias']

        print(' Store Name:',item['name'], '\n',
              'Categories:', [item['categories'][i]['title'] \
                              for i in range(len(item['categories']))], '\n',
              'Latitude:', item['coordinates']['latitude'], '\n',
              'Longitude:', item['coordinates']['longitude'], '\n',
              'City:', item['location']['city'], '\n',
              'Zip Code:', item['location']['zip_code'], '\n',
              'Price:', item['price'],  '\n',
              'Rating:', item['rating'], '\n',
              'Review Count:', item['review_count'], '\n')

In [34]:
def filtered_bar(df):
    '''
    Returns dataframe with only necessary columns, and cleaned in such a way as we will need them later.
    '''
    category_list = [[df['categories'][j][i]['title'] \
    for i in range(len(df['categories'][j]))] \
    for j in range(len(df['categories']))]

    location = list(zip(df['coordinates.latitude'], 
                             df['coordinates.longitude']))

    columns = ['Store_Name', 'Categories', 'Location', 'City', 
                   'Zip_Code', 'Price', 'Rating', 'Review_Count']
    data = [list(df['name']), category_list, location, list(df['location.city']),
                list(df['location.zip_code']), list(df['price']), list(df['rating']), 
                list(df['review_count'])]


    filtered_bar = pd.DataFrame(data = dict(zip(columns, data)))
    filtered_bar = filtered_bar[columns]
    return filtered_bar

In [49]:
neighborhoods = pd.read_csv('data/neighborhoods.csv', index_col=0)
neighborhoods = neighborhoods[neighborhoods.City == 'New York'][:15]
neighborhoods.head()

Unnamed: 0,State,County,City,Name,search_term
1859,NY,Queens,New York,Rego Park,"Rego Park, New York"
1860,NY,Queens,New York,Belle Harbor,"Belle Harbor, New York"
1861,NY,New York,New York,Roosevelt Island,"Roosevelt Island, New York"
1862,NY,Queens,New York,Howard Beach,"Howard Beach, New York"
1863,NY,Queens,New York,Breezy Point,"Breezy Point, New York"


In [None]:
# If you would rather search by borough
boroughs = ['Brooklyn','Manhattan','Bronx','Staten Island','Queens']
for i in range(len(boroughs)):
     boroughs[i] += ', New York'

In [105]:
def find_bar(search_term='bars', loc='New York',category=None, loops=2, price=[1,2,3,4]):
    
    '''
    The main workhorse here, this pulls all of the bar shops from yelp, doing many exhausting cycles.
    '''
    
    bar_shops_df = None
    print(loc)
    
    for neighborhood in neighborhoods[neighborhoods.City==loc].search_term:
#     for neighborhood in boroughs:
        print(neighborhood)

        for i in range(loops):

            try:
                # max api allows is 50 per pull
                bar_results = search(API_KEY, search_term, neighborhood, category, i*50, price)
            except IndexError:
                # error raised once no more bar shops in the area are found.
                # this loop allows to capture the largest remaining no. of bars
                for j in range(49, -1, -1): 
                    if j == 0:
                        break
                    try:
                        bar_results = search(API_KEY, search_term, neighborhood, category, i*j, price)
                    except IndexError:
                        continue
                    break

            try:
                bar_results = filtered_bar(bar_results)

                if isinstance(bar_shops_df, pd.DataFrame):
                    bar_shops_df = pd.concat([bar_shops_df, bar_results])
                    bar_shops_df.reset_index(drop=True, inplace=True)
                else:
                    bar_shops_df = bar_results
            except KeyError:
                break
        
    bar_shops_df = bar_shops_df.drop_duplicates('Location')
    bar_shops_df = bar_shops_df[bar_shops_df['Review_Count'] > 5]
    bar_shops_df = bar_shops_df[bar_shops_df['Categories'].apply(lambda x:('Pubs' or 'Dive Bars' or 'Breweries' or 'Cocktail Bars' or 'Beer Gardens' or 'Beer Halls') in x)]
            
    return bar_shops_df.reset_index(drop=True)

In [9]:
def pd_to_gpd(df):
    '''
    Convert coffee df into geodataframe to allow for merge and analysis with neighborhood boundaries from Zillow.
    '''
    geom = df.apply(lambda x: Point(x['Location'][1], x['Location'][0]), axis=1)
    geo_df = gpd.GeoDataFrame(df, geometry=geom)
    geo_df.crs = {'init' :'epsg:4326'}
    return geo_df

In [107]:
# neighborhoods to work with
neighborhoods = pd.read_csv('data/neighborhoods.csv', index_col=0)
neighborhoods = neighborhoods[neighborhoods.City == 'New York'][:50]

bars_final = find_bar(loops=2)


New York
Rego Park, New York
Belle Harbor, New York
Roosevelt Island, New York
Howard Beach, New York
Breezy Point, New York
Neponsit, New York
East Elmhurst, New York
West Farms, New York
Cambria Heights, New York
Fresh Meadows, New York
Maspeth, New York
Fort Hamilton, New York
Bay Terrace, New York
Far Rockaway, New York
Hollis, New York
Jamaica, New York
South Ozone Park, New York
Ellis Island, New York
Bayside, New York
La Guardia Airport, New York
Woodhaven, New York
Arverne, New York
Broad Channel, New York
Flushing, New York
Oakland Gardens, New York
Ozone Park, New York
Fort Tilden, New York
Malba, New York
Briarwood, New York
College Point, New York
Corona, New York
Pomonok, New York


KeyError: 'businesses'