In [189]:
import datetime
import itertools
import os

import numpy as np
import pandas as pd
from dateutil import relativedelta
from yelpapi import YelpAPI

In [238]:
# Read the Yelp API key from the YELP_API_KEY environment variable so the
# secret never has to be typed into (or committed with) the notebook.
# When the variable is unset this falls back to the empty string, which is
# the value that was previously hard-coded here.
api_key = os.environ.get("YELP_API_KEY", "")

# Client object used by every search cell below.
yelp_api = YelpAPI(api_key)


In [150]:
# Categories that are fetched with one paginated query per category.
simple_category_list = [
    "whiskeybars", "tikibars", "pubs", "lounges",
    "beerbar", "divebars", "speakeasies",
]

# Categories that are fetched once per price tier instead of in a single query.
complex_category_list = ["bars", "cocktailbars"]

In [128]:
# Collect every business in each "simple" category by paging through the
# search results, best rated first.
simple_businesses = []

page_size = 50      # results requested per call
# Never request past offset + limit = 1000, the same ceiling the price-tier
# loop in this notebook enforces (presumably Yelp's search result limit --
# confirm against the Fusion API docs).
result_cap = 1000

for category in simple_category_list:
    work_businesses = []
    cur_offset = 0

    # Stop either on the first empty page or when the next request would
    # reach past the ceiling.
    while cur_offset + page_size <= result_cap:
        search_results = yelp_api.search_query(
            categories=category,
            location="nyc",
            sort_by='rating',
            limit=page_size,
            offset=cur_offset
        )

        new_businesses = search_results["businesses"]
        if not new_businesses:
            break

        work_businesses.extend(new_businesses)
        cur_offset += page_size

    # Sanity check: everything the API reported (up to the ceiling) was fetched.
    assert len(work_businesses) == min(result_cap, search_results["total"])
    simple_businesses.extend(work_businesses)

    print(f"{category}, {search_results['total']}")

    # Make truncation visible instead of silently dropping the remainder.
    if search_results["total"] > result_cap:
        print(f"  warning: only the first {result_cap} results were fetched for {category}")


whiskeybars, 39
tikibars, 17
pubs, 403
lounges, 658
beerbar, 212
divebars, 165
speakeasies, 28


In [169]:
# Build the Unix timestamp (as a string) of the upcoming local midnight; it is
# passed as the open_at filter of the narrowed search further down.
work_time = datetime.datetime.now() + datetime.timedelta(days=1)

# Zero every sub-day field. Clearing only hour/minute would leave the current
# seconds and microseconds in place, so the value would not be midnight and
# would differ from run to run.
work_time = work_time.replace(hour=0, minute=0, second=0, microsecond=0)

# strftime("%s") is a platform-specific extension that is not available
# everywhere; timestamp() is the portable way to get epoch seconds and, for a
# naive datetime like this one, also interprets it as local time.
open_at = str(int(work_time.timestamp()))


In [170]:
# Collect businesses for the "complex" categories. Each category is queried
# once per price tier; a tier that reports more matches than can be paged
# through is queried a second time with the open_at filter added.
complex_businesses = []

page_size = 50      # results requested per call
# Never request past offset + limit = 1000 (presumably Yelp's search result
# limit -- confirm against the Fusion API docs).
result_cap = 1000

for category in complex_category_list:
    for price in range(4):
        # Pass 1 is the plain price-tier query. Pass 2 repeats it with
        # open_at and only runs when pass 1 reported more than result_cap
        # matches (see the break at the bottom of this loop).
        for use_open_at in (False, True):
            # Built lazily so open_at is only looked up when actually needed.
            extra_filters = {"open_at": open_at} if use_open_at else {}

            work_businesses = []
            cur_offset = 0

            # Stop on the first empty page or when the next request would
            # reach past the ceiling.
            while cur_offset + page_size <= result_cap:
                search_results = yelp_api.search_query(
                    categories=category,
                    location="nyc",
                    sort_by='rating',
                    limit=page_size,
                    offset=cur_offset,
                    price=str(price+1),
                    **extra_filters
                )

                new_businesses = search_results["businesses"]
                if not new_businesses:
                    break

                work_businesses.extend(new_businesses)
                cur_offset += page_size

            # Sanity check: everything reachable for this query was fetched.
            assert len(work_businesses) == min(result_cap, search_results["total"])
            complex_businesses.extend(work_businesses)

            print(f"{category} - {price+1}, {search_results['total']}")

            # The tier fit under the ceiling, so no narrowed second pass.
            if search_results["total"] <= result_cap:
                break
                  

bars - 1, 538
bars - 2, 2900
bars - 2, 940
bars - 3, 484
bars - 4, 60
cocktailbars - 1, 23
cocktailbars - 2, 645
cocktailbars - 3, 153
cocktailbars - 4, 18


In [221]:
class Business:
    """Plain record holding the fields kept for one search result.

    Attributes are stored exactly as passed in; no validation or conversion
    is performed.
    """

    def __init__(self, name, latitude, longitude, review_count, rating):
        # Attributes are assigned in parameter order so that the key order of
        # the instance __dict__ stays name, latitude, longitude,
        # review_count, rating for any code that reads it.
        self.name = name
        self.latitude, self.longitude = latitude, longitude
        self.review_count, self.rating = review_count, rating
        

In [223]:
# Turn every raw search result (simple categories first, then complex ones)
# into a Business record.
# Latitude and longitude are looked up by key rather than by unpacking
# coordinates.values(), so the mapping no longer depends on the order in which
# the keys happen to appear in the API payload.
business_list = [
    Business(
        cur_business["name"],
        cur_business["coordinates"]["latitude"],
        cur_business["coordinates"]["longitude"],
        cur_business["review_count"],
        cur_business["rating"],
    )
    for cur_business in itertools.chain(simple_businesses, complex_businesses)
]


In [235]:
# One dict per Business, taken straight from the instance attributes.
business_records = [vars(cur_business) for cur_business in business_list]

# Build the table in a single chain: drop exact duplicate rows, order by
# rating and then review count (both descending), and renumber the index.
business_data = (
    pd.DataFrame.from_records(business_records)
    .drop_duplicates()
    .sort_values(by=["rating", "review_count"], ascending=False)
    .reset_index(drop=True)
)


In [236]:
# Create the output directory first so the write also works on a fresh
# checkout where ./data/pickles does not exist yet.
os.makedirs("./data/pickles", exist_ok=True)

# Persist the cleaned table for later use.
business_data.to_pickle("./data/pickles/business_data.pkl")
