In [82]:
import pandas as pd
import numpy as np
import requests
import yaml
import time
from selenium import webdriver
import random
import json
from pymongo import MongoClient
from collections import Counter
from selenium.common.exceptions import WebDriverException
import boto3

In [6]:
def create_pandas_df_from_json(path):
    '''
    INPUT: filepath string
    OUTPUT: pandas database
    '''
    return pd.read_json(file_path, lines=True)

def is_food(item):
    '''
    INPUT: cell from pandas dataframe
    OUTPUT: boolean
    '''
    restaurants_and_related_categories = ['Restaurants', 'Italian','Food', 'Bars','Fast Food', 'Coffee & Tea', 'Sandwiches']
    if len(set(restaurants_and_related_categories) & set(item)) >= 1:
        return True
    else:
        return False
    
def current_google_data(keys, index, dataframe, radius):
    name = dataframe[['name']].iloc[index,0]
    latitude = dataframe[['latitude']].iloc[index,0]
    longitude = dataframe[['longitude']].iloc[index,0]
    
    link = 'https://maps.googleapis.com/maps/api/place/nearbysearch/json?location=' + \
    str(latitude) + ',' + str(longitude) + '&radius=' + str(radius) + '&keyword=' + str(name) + '&key=' + str(keys)
    
    response = requests.get(link)
    response_dict = response.json()
    
    response_dict['yelp_business_id'] = dataframe[['business_id']].iloc[index,0]
    response_dict['queried_name'] = name
    response_dict['queried_latitude'] = latitude
    response_dict['queried_longitude'] = longitude
    
    
    if response.status_code != 200:
        print(response.status_code)
        time.sleep(10)
        response = requests.get(link)
        if response.status_code != 200:
            print(response.status_code)
            time.sleep(10)
            return "Came back empty"
    if len(str(response.json())) < 100:
        return response_dict
    else:
        return response_dict
    
def bulk_google_places_search(google_keys, dataframe, start_idx, end_idx, failed_rows,
                              radius=10, update_frequency=100, print_updates = True):
    
    client = MongoClient('mongodb://localhost:27017/')
    restaurants = client['restaurants']
    google_places = restaurants['google_places']
    start_time = time.time()
    
    
    with open('/Users/ElliottC/.secrets/google_keys.txt') as f:
        google_keys = yaml.load(f)
    
    for i in range(start_idx, end_idx):
        try:
            google_places.insert_one(current_google_data(google_keys, i, dataframe, radius))
        except requests.exceptions.SSLError:
            failed_rows.append({'time':time.time(), 'index': i})
            print(f"Error at index {i}")
            time.sleep(60)
        if (i % update_frequency == 0) and print_updates:
            print(f"At index {i}: {end_idx-i} remaining requests")
            elapsed = round(time.time() - start_time, 2)
            speed = round(elapsed / update_frequency, 2)
            remaining_time = str(round(((end_idx-i) * speed),2)/60/60) + " hours"
            print(f"{elapsed} per {update_frequency} requests, or {speed} per request\nRemaining time: {remaining_time}")
            start_time = time.time()
    return failed_rows
            
class ScrapeCensus:
    def __init__(self, url):
        self.browser = webdriver.Chrome()
        self.browser.get(url)
        self.s3 = boto3.client('s3')
    def scrape(self, list_of_zip_codes, start_idx, end_idx):
        for i in range(len(list_of_zip_codes[start_idx:end_idx])):
            search_box = self.browser.find_element_by_css_selector("input#cfsearchtextbox")
            search_box.click()
            search_box.send_keys(list_of_zip_codes[i])
            search_button = self.browser.find_element_by_css_selector("a#communityfactssubmit")
            search_button.click()
            time.sleep(2)
            try:
                show_all = self.browser.find_element_by_css_selector("a.leftnav_btn.all-measures")
                show_all.click()
            except WebDriverException:
                self.browser.get("https://factfinder.census.gov/faces/nav/jsf/pages/community_facts.xhtml")
                time.sleep(1)
                show_all = self.browser.find_element_by_css_selector("a.leftnav_btn.all-measures")
                show_all.click()
            time.sleep(2)
            page_source = self.browser.page_source
            self.s3.put_object(Bucket='zip-code-economic-data', Key='zip_code: '+list_of_zip_codes[i], Body=page_source)
            print(f"{i}: {list_of_zip_codes[i]}")
            
            
def get_zipped_postcode_data_from_s3_bucket(postcodes):
    s3 = boto3.client('s3')
    zip_code_data = []
    for code in postcodes:
        response = s3.get_object(Bucket='zip-code-economic-data', Key=f'zip_code: {code}')
        body = response['Body'].read()
        df = pd.read_html(body)[0][pd.read_html(body)[0]['Measure'].map(type) == str][['Description', 'Measure']]
        keys = [str(x) for x in list(df['Description'].values)]
        vals = [str(x) for x in list(df['Measure'].values)]
        zipped = dict(zip(keys, vals))
        zipped['Zip Code'] = code
        zip_code_data.append(zipped)
    return zip_code_data

In [7]:
file_path = 'https://s3-us-west-2.amazonaws.com/businesspredictiondata/business.json'
yelp_business_data = create_pandas_df_from_json(file_path)

#filters businesses that were open when this dataset was published Jan. 2018
open_businesses = yelp_business_data.loc[yelp_business_data['is_open'] == 1, :].copy()

#creates column that says if business is restaurant and creates df of just open restaurants
open_businesses['is_food'] = open_businesses.loc[:, 'categories'].apply(is_food)
open_restaurants = open_businesses.loc[open_businesses['is_food'] == True, :].copy()

#creates column that says if business is in USA and creates df of just
#restaurants open in the US as of January 2018
states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
      "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
      "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
      "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
      "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]
open_restaurants['in_US'] = open_restaurants['state'].isin(states)
previously_open_US_restaurants = open_restaurants[open_restaurants['in_US'] == True]

In [2]:
#scrapes google to get updated business information
client = MongoClient('mongodb://localhost:27017/')
restaurants = client['restaurants']
google_places = restaurants['google_places']

with open('/Users/ElliottC/.secrets/google_keys.txt') as f:
    google_keys = yaml.load(f)

failed_rows = []

bulk_google_places_search(google_keys, previously_open_US_restaurants, 0, len(previously_open_US_restaurants), failed_rows, 10, 100)

NameError: name 'MongoClient' is not defined

In [71]:
#gets the valid postal codes from the dataframe and then scrapes the census for data on each
postcodes = list(previously_open_US_restaurants['postal_code'].unique())

postcodes = [x for x in postcodes if len(x) > 2]

scraper = ScrapeCensus('https://factfinder.census.gov/faces/nav/jsf/pages/community_facts.xhtml')

scrape_this.scrape(postcodes, 0, len(postcodes))

In [389]:
#turns the zip code data into a dataframe and saves it
zip_code_dicts = get_zipped_postcode_data_from_s3_bucket(postcodes)
zip_code_df = pd.DataFrame(zip_code_dicts)
zip_code_df.to_csv('/Users/ElliottC/g/projects/yelp/predicting_restaurant_closure/data/zip_code_data.csv')

In [145]:
def google_nearby_restaurants(keys, index, dataframe, radius):
    latitude = dataframe[['latitude']].iloc[index,0]
    longitude = dataframe[['longitude']].iloc[index,0]
    
    link = 'https://maps.googleapis.com/maps/api/place/nearbysearch/json?location=' + \
    str(latitude) + ',' + str(longitude) + '&radius=' + str(radius) + '&key=' + str(keys) + '&type=restaurant'
    
    response = requests.get(link)
    response_dict = response.json()
    
    response_dict['yelp_business_id'] = dataframe[['business_id']].iloc[index,0]
    response_dict['radius'] = radius
    response_dict['queried_latitude'] = latitude
    response_dict['queried_longitude'] = longitude
    
    
    if response.status_code != 200:
        print(response.status_code)
        time.sleep(10)
        response = requests.get(link)
        if response.status_code != 200:
            print(response.status_code)
            time.sleep(10)
            return "Came back empty"
    if len(str(response.json())) < 100:
        return response_dict
    else:
        return response_dict

In [152]:
def bulk_google_nearby(start_idx, end_idx, dataframe, radius, update_frequency=100, print_updates=True):
    failed_rows = []
    
    client = MongoClient('mongodb://localhost:27017/')
    restaurants = client['restaurants']
    maps_nearby = restaurants['maps_nearby']
    start_time = time.time()
    
    with open('/Users/ElliottC/.secrets/google_keys.txt') as f:
        google_keys = yaml.load(f)
    
    for i in range(start_idx, end_idx):
        try:
            maps_nearby.insert_one(google_nearby_restaurants(google_keys, i, dataframe, radius))
        except requests.exceptions.SSLError:
            failed_rows.append({'time':time.time(), 'index': i})
            print(f"Error at index {i}")
            time.sleep(60)
        if (i % update_frequency == 0) and print_updates:
            print(f"At index {i}: {end_idx-i} remaining requests")
            elapsed = round(time.time() - start_time, 2)
            speed = round(elapsed / update_frequency, 2)
            remaining_time = str(round(((end_idx-i) * speed),2)/60/60) + " hours"
            print(f"{elapsed} per {update_frequency} requests, or {speed} per request\nRemaining time: {remaining_time}")
            start_time = time.time()
    return failed_rows

In [159]:
client = MongoClient('mongodb://localhost:27017/')
restaurants = client['restaurants']
maps_nearby = restaurants['maps_nearby']
start_time = time.time()

In [None]:
test = bulk_google_nearby(10000, 20000, previously_open_US_restaurants, 400)

At index 10000: 10000 remaining requests
0.3 per 100 requests, or 0.0 per request
Remaining time: 0.0 hours
At index 10100: 9900 remaining requests
25.81 per 100 requests, or 0.26 per request
Remaining time: 0.715 hours
At index 10200: 9800 remaining requests
28.1 per 100 requests, or 0.28 per request
Remaining time: 0.7622222222222222 hours
At index 10300: 9700 remaining requests
26.5 per 100 requests, or 0.27 per request
Remaining time: 0.7274999999999999 hours
At index 10400: 9600 remaining requests
28.38 per 100 requests, or 0.28 per request
Remaining time: 0.7466666666666666 hours
At index 10500: 9500 remaining requests
26.36 per 100 requests, or 0.26 per request
Remaining time: 0.6861111111111111 hours
At index 10600: 9400 remaining requests
24.67 per 100 requests, or 0.25 per request
Remaining time: 0.6527777777777778 hours
At index 10700: 9300 remaining requests
27.18 per 100 requests, or 0.27 per request
Remaining time: 0.6975 hours
At index 10800: 9200 remaining requests
25.2

In [79]:
for i in range()
google_nearby_restaurants(google_keys,0,previously_open_US_restaurants, 5000)

In [126]:
rand_idxs = [random.randint(0, len(previously_open_US_restaurants)) for i in range(100)]

In [133]:
for radius in radiuses:
    biz_results = []
    for val in rand_idxs:
        results = google_nearby_restaurants(google_keys,val, previously_open_US_restaurants,radius)
        biz_results.append(len(results['results']))
    radius_values[radius] = biz_results

In [134]:
for key, value in radius_values.items():
    avg_results[key] = (sum(value) / len(value))

In [135]:
avg_results

{10: 0.39,
 21: 0.74,
 46: 1.43,
 100: 2.88,
 215: 5.83,
 300: 8.09,
 322: 8.57,
 344: 9.02,
 366: 9.36,
 388: 9.71,
 411: 10.26,
 433: 10.53,
 455: 10.91,
 464: 10.97,
 477: 11.17,
 500: 11.55,
 1000: 15.75,
 2154: 18.85,
 4641: 19.87,
 10000: 20.0}

In [None]:
#400 meters will provide an average number of restaurants close to 10. Next step is to do run this on all of my businesses

In [125]:
radiuses = np.logspace(1, 4, num=10).astype(int)

In [132]:
radiuses = np.linspace(300, 500, num=10).astype(int)

In [None]:
#scrapes google to get updated business information
client = MongoClient('mongodb://localhost:27017/')
restaurants = client['restaurants']
google_nearby_restaurants = restaurants['google_nearby_restaurants']


In [None]:
previously_open_US_restaurants