In [1]:
import pandas as pd
import numpy as np
import requests
import yaml
import time
from selenium import webdriver
import random
import json
from pymongo import MongoClient
from collections import Counter
from selenium.common.exceptions import WebDriverException
import boto3
from collections import defaultdict,Counter
import pickle
pd.set_option('display.max_columns', 250)

In [2]:
def get_yelp_ids_from_region(locations, n_from_each):
    """Page through Yelp's business search per location and store raw responses in MongoDB.

    Each successful response (one page of up to 50 businesses) is inserted as a
    single document into the ``restaurants.yelp_restaurant_search`` collection
    of the MongoDB instance at ``localhost:27017``.

    Parameters
    ----------
    locations : sequence of str
        Values passed as Yelp's ``location`` query parameter (city names,
        zip codes, ...).
    n_from_each : int
        Number of businesses to request per location, fetched 50 at a time.

    Notes
    -----
    Paging for a location stops after the first non-200 response; the error
    payload is printed and the function moves on to the next location.
    """
    # NOTE(review): machine-specific absolute path; consider an environment
    # variable or a configurable secrets location.
    with open('/Users/ElliottC/.secrets/yelp_keys.txt') as f:
        # safe_load: plain yaml.load() without a Loader can construct arbitrary
        # objects and is rejected outright by PyYAML >= 6.
        keys = yaml.safe_load(f)
    headers = {'Authorization': f"Bearer {keys['api_key']}"}

    url_biz = "https://api.yelp.com/v3/businesses/search"
    page_size = 50
    # Ceiling division: exactly enough pages to cover n_from_each results
    # (the previous int(n/50)+1 asked for one page too many on multiples of 50).
    n_pages = int(-(-n_from_each // page_size))

    client = MongoClient('mongodb://localhost:27017/')
    try:
        yelp_restaurant_search = client['restaurants']['yelp_restaurant_search']
        for location_name in locations:
            print(location_name)
            for i in range(n_pages):
                # Let requests build and escape the query string
                # (location names may contain spaces).
                response = requests.get(
                    url_biz,
                    params={
                        'location': location_name,
                        'offset': i * page_size,
                        'limit': page_size,
                        'term': 'restaurant',
                    },
                    headers=headers,
                )
                status = response.status_code
                if status != 200:
                    time.sleep(10)
                    print(response.json())
                else:
                    yelp_restaurant_search.insert_one(response.json())
                print(n_from_each - i*50)
                print(status)
                time.sleep(1)  # stay polite between consecutive API calls
                if status != 200:
                    # Give up on this location after the first failed page.
                    break
    finally:
        client.close()

def clean_price(row):
    """Turn a price cell into a number.

    A ``str`` value is mapped to its length (so ``'$$'`` becomes ``2``);
    anything else (e.g. a missing value) is mapped to ``1.5``.
    """
    return len(row) if type(row) == str else 1.5

def list_categories(row):
    """Return the ``'title'`` of every category dict in ``row``, in order."""
    return [category['title'] for category in row]

def add_restaurant_count_column(dataframe):
    """Return ``dataframe`` with a ``restaurant_count`` column added.

    ``restaurant_count`` is, for each row, the number of rows in ``dataframe``
    that share its ``name`` and have a non-null ``address``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``name`` and ``address`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame (left merge on ``name``); the input is not modified and
        its row order is preserved.
    """
    restaurant_frequency = (
        dataframe.groupby('name')['address']
        .count()
        .rename('restaurant_count')
        # reset_index() yields a plain 'name' column; keeping 'name' as both
        # index level and column would make the merge key ambiguous.
        .reset_index()
    )

    # Merge back onto the frame that was passed in (not a notebook global).
    return dataframe.merge(restaurant_frequency, how='left', on='name')


In [None]:
# Zip codes used as Yelp search locations. Each code appears once: a repeated
# entry would trigger a second identical crawl and duplicate stored documents.
seattle_zip_codes = ['98104', '98154', '98101', '98121', '98122', '98109',
                     '98102', '98119', '98199', '98107', '98103', '98105']

In [None]:
# City names usable as Yelp search locations.
us_cities = [
    'los angeles', 'san francisco', 'new york city', 'portland',
    'chicago', 'boston', 'houston', 'denver',
    'philadelphia', 'phoenix', 'tacoma', 'bellevue',
    'san antonio', 'san diego', 'dallas', 'san jose',
    'austin', 'jacksonville', 'columbus', 'fort worth',
    'charlotte', 'el paso', 'detroit', 'new orleans',
    'baltimore', 'louisville', 'milwaukee', 'albuquerque',
    'tucson', 'fresno', 'sacramento', 'kansas city',
    'long beach', 'mesa', 'atlanta', 'colorado springs',
    'virginia beach', 'raleigh', 'omaha', 'miami',
    'oakland', 'minneapolis', 'tulsa', 'wichita',
    'arlington', 'salt lake city',
]

In [None]:
# Crawl every Seattle zip code, asking for up to 10000 businesses from each
# (the helper stops paging a location at its first non-200 response).
get_yelp_ids_from_region(locations=seattle_zip_codes, n_from_each=10000)

In [53]:
# Read back every stored search-response document from the local MongoDB.
client = MongoClient('mongodb://localhost:27017/')
restaurants = client['restaurants']
yelp_restaurant_search = restaurants['yelp_restaurant_search']
results = list(yelp_restaurant_search.find())

# Each document holds one page of results under 'businesses'; flatten all
# pages into a single list with one entry per business.
all_results = [business for page in results for business in page['businesses']]
results_df = pd.DataFrame(all_results)

# Map the price cell to a number (string length, or 1.5 when it is not a string).
results_df['price'] = results_df['price'].apply(clean_price)

# Category titles per business, e.g. ['Pizza', 'Italian'].
results_df['list_categories'] = results_df['categories'].apply(list_categories)

# Every distinct category title seen in the data. Derived from the column built
# above rather than a label-indexed double loop; the old loop's
# `except KeyError: print('error')` only postponed the same KeyError to the
# list_categories call.
categories = set()
for titles in results_df['list_categories']:
    categories.update(titles)

# Flatten the nested 'location' / 'coordinates' dicts into plain columns.
location_fields = {
    'address': 'address1',
    'city': 'city',
    'country': 'country',
    'state': 'state',
    'zip_code': 'zip_code',
}
for column, field in location_fields.items():
    results_df[column] = results_df['location'].apply(lambda loc, field=field: loc[field])
for axis in ('latitude', 'longitude'):
    results_df[axis] = results_df['coordinates'].apply(lambda coords, axis=axis: coords[axis])

# One boolean column per category. Built as a single frame and concatenated
# once (inserting hundreds of columns one at a time fragments the DataFrame),
# and in sorted order so the column layout is the same on every run.
category_flags = pd.DataFrame(
    {
        f"Category|{key}_true": results_df['list_categories'].apply(
            lambda titles, key=key: key in titles
        )
        for key in sorted(categories)
    },
    index=results_df.index,
)
results_df = pd.concat(
    [results_df.drop(columns=category_flags.columns, errors='ignore'), category_flags],
    axis=1,
)

# Category titles that have a feature column in the model input, in order.
_model_category_titles = [
    'Restaurants', 'Food', 'Nightlife', 'Bars', 'Fast Food',
    'American (Traditional)', 'Sandwiches', 'Pizza', 'Mexican', 'Burgers',
    'American (New)', 'Breakfast & Brunch', 'Coffee & Tea', 'Grocery',
    'Italian', 'Specialty Food', 'Shopping', 'Chinese',
    'Event Planning & Services', 'Chicken Wings', 'Salad', 'Bakeries',
    'Desserts', 'Convenience Stores', 'Ice Cream & Frozen Yogurt',
    'Sports Bars', 'Seafood', 'Caterers', 'Delis', 'Cafes', 'Drugstores',
    'Japanese', 'Arts & Entertainment', 'Juice Bars & Smoothies', 'Pubs',
    'Steakhouses', 'Sushi Bars', 'Asian Fusion', 'Barbeque', 'Diners',
    'Lounges', 'Gas Stations', 'Cocktail Bars', 'Mediterranean', 'Wine Bars',
    'Food Trucks', 'Tex-Mex',
]

# Columns handed to the model: chain-size features, review stats, the category
# flags above, then the price attribute.
basic_yelp_columns = (
    ['restaurant_count', 'restaurant_count > 1', 'restaurant_count > 5',
     'restaurant_count > 25', 'review_count', 'stars']
    + [f"Category|{title}_true" for title in _model_category_titles]
    + ['Attribute|RestaurantsPriceRange2 value:']
)

# Human-readable columns carried through to the exported table.
info_columns = [
    'name', 'display_phone', 'price', 'rating', 'review_count',
    'url', 'address', 'zip_code', 'latitude', 'longitude',
]

# Chain-size features keyed by restaurant name, taken from the featurized
# dataset on disk.
chain_flag_columns = ['restaurant_count > 1', 'restaurant_count > 5', 'restaurant_count > 25']
restaurant_count_df = pd.read_csv('../data/featurized_dataframe.csv')[
    ['name', 'restaurant_count'] + chain_flag_columns
]

# Join on one row per name: a left merge against repeated names would emit one
# output row per match and multiply the scraped rows.
results_df = results_df.merge(
    restaurant_count_df.drop_duplicates('name'), how='left', on='name'
)

# Names absent from the featurized data are treated as single-location restaurants.
results_df['restaurant_count'] = results_df['restaurant_count'].fillna(1)
for flag in chain_flag_columns:
    # Cast explicitly so the dtype does not depend on fillna's downcasting rules.
    results_df[flag] = results_df[flag].fillna(False).astype(bool)

# Aliases under the names listed in basic_yelp_columns.
results_df['stars'] = results_df['rating']
results_df['Attribute|RestaurantsPriceRange2 value:'] = results_df['price']

# The street address column on this frame is 'address' ('address1' exists only
# inside the nested 'location' dict), so dedupe on (name, address).
# drop=True keeps the old positional index from leaking in as an 'index' column.
results_df = results_df.drop_duplicates(['name', 'address']).reset_index(drop=True)

# SECURITY NOTE: pickle.load can execute arbitrary code; only load model files
# that come from a trusted source.
with open('../models/basic_gb_model.pkl','rb') as f:
    basic_gb_model = pickle.load(f)

# Second predict_proba column, exported below as the closing probability.
# NOTE(review): assumes class index 1 is the "closed" label; confirm against
# the model's classes_.
predictions = basic_gb_model.predict_proba(results_df[basic_yelp_columns])[:,1]

# Share results_df's index so the column-wise concat below lines rows up by
# construction, with no throwaway 'index' column to strip afterwards.
predictions_df = pd.DataFrame({'closing_probability': predictions}, index=results_df.index)

# Rename by column name rather than by position, so a change to info_columns
# cannot silently mislabel the export.
display_names = {
    'name': 'Name',
    'display_phone': 'Phone',
    'price': 'Price',
    'rating': 'Rating',
    'review_count': 'Review Count',
    'url': 'Link',
    'address': 'Address',
    'zip_code': 'Zip Code',
    'latitude': 'Latitude',
    'longitude': 'Longitude',
    'closing_probability': 'Closing Probability',
}

final_df = pd.concat([results_df[info_columns], predictions_df], axis=1).rename(columns=display_names)

final_df.to_csv('../data/50_cities_df.csv')