In [1]:
import pandas as pd
import numpy as np
import requests
import yaml
import time
from selenium import webdriver
import random
import json
from pymongo import MongoClient
from collections import Counter
from selenium.common.exceptions import WebDriverException
import boto3

In [3]:
def create_pandas_df_from_json(path):
    '''
    INPUT: filepath string
    OUTPUT: pandas database
    '''
    return pd.read_json(file_path, lines=True)

def is_food(item):
    '''
    INPUT: cell from pandas dataframe
    OUTPUT: boolean
    '''
    restaurants_and_related_categories = ['Restaurants', 'Italian','Food', 'Bars','Fast Food', 'Coffee & Tea', 'Sandwiches']
    if len(set(restaurants_and_related_categories) & set(item)) >= 1:
        return True
    else:
        return False
    
def current_google_data(keys, index, dataframe, radius):
    name = dataframe[['name']].iloc[index,0]
    latitude = dataframe[['latitude']].iloc[index,0]
    longitude = dataframe[['longitude']].iloc[index,0]
    
    link = 'https://maps.googleapis.com/maps/api/place/nearbysearch/json?location=' + \
    str(latitude) + ',' + str(longitude) + '&radius=' + str(radius) + '&keyword=' + str(name) + '&key=' + str(keys)
    
    response = requests.get(link)
    response_dict = response.json()
    
    response_dict['yelp_business_id'] = dataframe[['business_id']].iloc[index,0]
    response_dict['queried_name'] = name
    response_dict['queried_latitude'] = latitude
    response_dict['queried_longitude'] = longitude
    
    
    if response.status_code != 200:
        print(response.status_code)
        time.sleep(10)
        response = requests.get(link)
        if response.status_code != 200:
            print(response.status_code)
            time.sleep(10)
            return "Came back empty"
    if len(str(response.json())) < 100:
        return response_dict
    else:
        return response_dict
    
def bulk_google_places_search(google_keys, dataframe, start_idx, end_idx, failed_rows,
                              radius=10, update_frequency=100, print_updates = True):
    
    client = MongoClient('mongodb://localhost:27017/')
    restaurants = client['restaurants']
    google_places = restaurants['google_places']
    start_time = time.time()
    
    
    with open('/Users/ElliottC/.secrets/google_keys.txt') as f:
        google_keys = yaml.load(f)
    
    for i in range(start_idx, end_idx):
        try:
            google_places.insert_one(current_google_data(google_keys, i, dataframe, radius))
        except requests.exceptions.SSLError:
            failed_rows.append({'time':time.time(), 'index': i})
            print(f"Error at index {i}")
            time.sleep(60)
        if (i % update_frequency == 0) and print_updates:
            print(f"At index {i}: {end_idx-i} remaining requests")
            elapsed = round(time.time() - start_time, 2)
            speed = round(elapsed / update_frequency, 2)
            remaining_time = str(round(((end_idx-i) * speed),2)/60/60) + " hours"
            print(f"{elapsed} per {update_frequency} requests, or {speed} per request\nRemaining time: {remaining_time}")
            start_time = time.time()
    return failed_rows
            
class ScrapeCensus:
    def __init__(self, url):
        self.browser = webdriver.Chrome()
        self.browser.get(url)
        self.s3 = boto3.client('s3')
    def scrape(self, list_of_zip_codes, start_idx, end_idx):
        for i in range(len(list_of_zip_codes[start_idx:end_idx])):
            search_box = self.browser.find_element_by_css_selector("input#cfsearchtextbox")
            search_box.click()
            search_box.send_keys(list_of_zip_codes[i])
            search_button = self.browser.find_element_by_css_selector("a#communityfactssubmit")
            search_button.click()
            time.sleep(2)
            try:
                show_all = self.browser.find_element_by_css_selector("a.leftnav_btn.all-measures")
                show_all.click()
            except WebDriverException:
                self.browser.get("https://factfinder.census.gov/faces/nav/jsf/pages/community_facts.xhtml")
                time.sleep(1)
                show_all = self.browser.find_element_by_css_selector("a.leftnav_btn.all-measures")
                show_all.click()
            time.sleep(2)
            page_source = self.browser.page_source
            self.s3.put_object(Bucket='zip-code-economic-data', Key='zip_code: '+list_of_zip_codes[i], Body=page_source)
            print(f"{i}: {list_of_zip_codes[i]}")
            
            
def get_zipped_postcode_data_from_s3_bucket(postcodes):
    s3 = boto3.client('s3')
    zip_code_data = []
    for code in postcodes:
        response = s3.get_object(Bucket='zip-code-economic-data', Key=f'zip_code: {code}')
        body = response['Body'].read()
        df = pd.read_html(body)[0][pd.read_html(body)[0]['Measure'].map(type) == str][['Description', 'Measure']]
        keys = [str(x) for x in list(df['Description'].values)]
        vals = [str(x) for x in list(df['Measure'].values)]
        zipped = dict(zip(keys, vals))
        zipped['Zip Code'] = code
        zip_code_data.append(zipped)
    return zip_code_data

def google_nearby_restaurants(keys, index, dataframe, radius):
    latitude = dataframe[['latitude']].iloc[index,0]
    longitude = dataframe[['longitude']].iloc[index,0]
    
    link = 'https://maps.googleapis.com/maps/api/place/nearbysearch/json?location=' + \
    str(latitude) + ',' + str(longitude) + '&radius=' + str(radius) + '&key=' + str(keys) + '&type=restaurant'
    
    response = requests.get(link)
    response_dict = response.json()
    
    response_dict['yelp_business_id'] = dataframe[['business_id']].iloc[index,0]
    response_dict['radius'] = radius
    response_dict['queried_latitude'] = latitude
    response_dict['queried_longitude'] = longitude
    
    
    if response.status_code != 200:
        print(response.status_code)
        time.sleep(10)
        response = requests.get(link)
        if response.status_code != 200:
            print(response.status_code)
            time.sleep(10)
            return "Came back empty"
    if len(str(response.json())) < 100:
        return response_dict
    else:
        return response_dict

def bulk_google_nearby(start_idx, end_idx, dataframe, radius, update_frequency=100, print_updates=True):
    failed_rows = []
    
    client = MongoClient('mongodb://localhost:27017/')
    restaurants = client['restaurants']
    maps_nearby = restaurants['maps_nearby']
    start_time = time.time()
    
    with open('/Users/ElliottC/.secrets/google_keys.txt') as f:
        google_keys = yaml.load(f)
    
    for i in range(start_idx, end_idx):
        try:
            maps_nearby.insert_one(google_nearby_restaurants(google_keys, i, dataframe, radius))
        except requests.exceptions.SSLError:
            failed_rows.append({'time':time.time(), 'index': i})
            print(f"Error at index {i}")
            time.sleep(60)
        if (i % update_frequency == 0) and print_updates:
            print(f"At index {i}: {end_idx-i} remaining requests")
            elapsed = round(time.time() - start_time, 2)
            speed = round(elapsed / update_frequency, 2)
            remaining_time = str(round(((end_idx-i) * speed),2)/60/60) + " hours"
            print(f"{elapsed} per {update_frequency} requests, or {speed} per request\nRemaining time: {remaining_time}")
            start_time = time.time()
    return failed_rows

def summaries_from_google(dataframe, key):
    summaries = []
    key_errors = 0
    for i in range(len(dataframe)):
        total = 0
        count = 0
        for j in range(len(dataframe['results'][i])):
            try:
                total += dataframe['results'][i][j][key]
                count += 1
            except KeyError:
                key_errors += 1
        try:
            summaries.append({'business_id': nearby_df['yelp_business_id'][i], 'avg_'+key: (total / count)})
        except ZeroDivisionError:
            summaries.append({'business_id': nearby_df['yelp_business_id'][i], 'avg_'+key: 0})
    return pd.DataFrame(summaries)

In [7]:
file_path = 'https://s3-us-west-2.amazonaws.com/businesspredictiondata/business.json'
yelp_business_data = create_pandas_df_from_json(file_path)

#filters businesses that were open when this dataset was published Jan. 2018
open_businesses = yelp_business_data.loc[yelp_business_data['is_open'] == 1, :].copy()

#creates column that says if business is restaurant and creates df of just open restaurants
open_businesses['is_food'] = open_businesses.loc[:, 'categories'].apply(is_food)
open_restaurants = open_businesses.loc[open_businesses['is_food'] == True, :].copy()

#creates column that says if business is in USA and creates df of just
#restaurants open in the US as of January 2018
states = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
      "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
      "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
      "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
      "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]
open_restaurants['in_US'] = open_restaurants['state'].isin(states)
previously_open_US_restaurants = open_restaurants[open_restaurants['in_US'] == True]

In [2]:
#scrapes google to get updated business information
client = MongoClient('mongodb://localhost:27017/')
restaurants = client['restaurants']
google_places = restaurants['google_places']

with open('/Users/ElliottC/.secrets/google_keys.txt') as f:
    google_keys = yaml.load(f)

failed_rows = []

bulk_google_places_search(google_keys, previously_open_US_restaurants, 0, len(previously_open_US_restaurants), failed_rows, 10, 100)

NameError: name 'MongoClient' is not defined

In [None]:
scraper = ScrapeCensus('https://factfinder.census.gov/faces/nav/jsf/pages/community_facts.xhtml')

scrape_this.scrape(postcodes, 0, len(postcodes))

In [15]:
scraper = ScrapeCensus('https://factfinder.census.gov/faces/nav/jsf/pages/community_facts.xhtml')

scraper.scrape(zips, 0, len(zips))

0: 28223
1: 85264
2: 16930
3: 28219
4: 15135
5: 15031
6: 85311
7: 44211
8: 28127
9: 89138


In [14]:
zips = postcodes[520:530]

In [9]:
#gets the valid postal codes from the dataframe and then scrapes the census for data on each
postcodes = list(previously_open_US_restaurants['postal_code'].unique())

postcodes = [x for x in postcodes if len(x) > 2]

In [8]:
#turns the zip code data into a dataframe and saves it
zip_code_dicts = get_zipped_postcode_data_from_s3_bucket(postcodes)
zip_code_df = pd.DataFrame(zip_code_dicts)
zip_code_df.to_csv('../data/zip_code_data.csv')

KeyboardInterrupt: 

In [159]:
client = MongoClient('mongodb://localhost:27017/')
restaurants = client['restaurants']
maps_nearby = restaurants['maps_nearby']
start_time = time.time()
test = bulk_google_nearby(0, len(previously_open_US_restaurants), previously_open_US_restaurants, 400, 100, True)

In [164]:
test = bulk_google_nearby(20000, len(previously_open_US_restaurants), previously_open_US_restaurants, 400)

At index 20000: 12188 remaining requests
0.19 per 100 requests, or 0.0 per request
Remaining time: 0.0 hours
At index 20100: 12088 remaining requests
26.95 per 100 requests, or 0.27 per request
Remaining time: 0.9066 hours
At index 20200: 11988 remaining requests
26.19 per 100 requests, or 0.26 per request
Remaining time: 0.8658 hours
At index 20300: 11888 remaining requests
26.24 per 100 requests, or 0.26 per request
Remaining time: 0.8585777777777779 hours
At index 20400: 11788 remaining requests
25.73 per 100 requests, or 0.26 per request
Remaining time: 0.8513555555555555 hours
At index 20500: 11688 remaining requests
25.81 per 100 requests, or 0.26 per request
Remaining time: 0.8441333333333334 hours
At index 20600: 11588 remaining requests
26.78 per 100 requests, or 0.27 per request
Remaining time: 0.8691 hours
At index 20700: 11488 remaining requests
28.67 per 100 requests, or 0.29 per request
Remaining time: 0.9254222222222223 hours
At index 20800: 11388 remaining requests
27.4

At index 26700: 5488 remaining requests
27.06 per 100 requests, or 0.27 per request
Remaining time: 0.4116 hours
At index 26800: 5388 remaining requests
27.28 per 100 requests, or 0.27 per request
Remaining time: 0.40409999999999996 hours
At index 26900: 5288 remaining requests
29.04 per 100 requests, or 0.29 per request
Remaining time: 0.4259777777777778 hours
At index 27000: 5188 remaining requests
27.18 per 100 requests, or 0.27 per request
Remaining time: 0.3891 hours
At index 27100: 5088 remaining requests
26.34 per 100 requests, or 0.26 per request
Remaining time: 0.3674666666666667 hours
At index 27200: 4988 remaining requests
26.63 per 100 requests, or 0.27 per request
Remaining time: 0.37410000000000004 hours
At index 27300: 4888 remaining requests
26.79 per 100 requests, or 0.27 per request
Remaining time: 0.3666 hours
At index 27400: 4788 remaining requests
24.21 per 100 requests, or 0.24 per request
Remaining time: 0.3192 hours
At index 27500: 4688 remaining requests
25.75 

In [126]:
rand_idxs = [random.randint(0, len(previously_open_US_restaurants)) for i in range(100)]

In [133]:
for radius in radiuses:
    biz_results = []
    for val in rand_idxs:
        results = google_nearby_restaurants(google_keys,val, previously_open_US_restaurants,radius)
        biz_results.append(len(results['results']))
    radius_values[radius] = biz_results

In [134]:
for key, value in radius_values.items():
    avg_results[key] = (sum(value) / len(value))

In [135]:
avg_results

{10: 0.39,
 21: 0.74,
 46: 1.43,
 100: 2.88,
 215: 5.83,
 300: 8.09,
 322: 8.57,
 344: 9.02,
 366: 9.36,
 388: 9.71,
 411: 10.26,
 433: 10.53,
 455: 10.91,
 464: 10.97,
 477: 11.17,
 500: 11.55,
 1000: 15.75,
 2154: 18.85,
 4641: 19.87,
 10000: 20.0}

In [None]:
#400 meters will provide an average number of restaurants close to 10. Next step is to do run this on all of my businesses

In [125]:
radiuses = np.logspace(1, 4, num=10).astype(int)

In [132]:
radiuses = np.linspace(300, 500, num=10).astype(int)

In [None]:
#scrapes google to get updated business information
client = MongoClient('mongodb://localhost:27017/')
restaurants = client['restaurants']
google_nearby_restaurants = restaurants['google_nearby_restaurants']


In [363]:
df = pd.read_csv('../data/featurized_dataframe.csv')

In [364]:
nearby_df = pd.DataFrame(list(maps_nearby.find()))

In [251]:
nearby_df['num_nearby_restaurants'] = nearby_df['results'].apply(lambda x: len(x))

In [349]:
rating_summaries = []
key_errors = 0
for i in range(len(nearby_df)):
    rating_total = 0
    rating_count = 0
    for j in range(len(nearby_df['results'][i])):
        try:
            rating_total += nearby_df['results'][i][j]['rating']
            rating_count += 1
        except KeyError:
            key_errors += 1
    try:
        rating_summaries.append({'business_id': nearby_df['yelp_business_id'][i], 'avg_rating': (rating_total / rating_count)})
    except ZeroDivisionError:
        rating_summaries.append({'business_id': nearby_df['yelp_business_id'][i], 'avg_rating': 0})
rating_summaries = pd.DataFrame(rating_summaries)

In [353]:
price_summaries = []
key_errors = 0
for i in range(len(nearby_df)):
    price_total = 0
    price_count = 0
    for j in range(len(nearby_df['results'][i])):
        try:
            price_total += nearby_df['results'][i][j]['price_level']
            price_count += 1
        except KeyError:
            key_errors += 1
    try:
        price_summaries.append({'business_id': nearby_df['yelp_business_id'][i], 'avg_price': (price_total / price_count)})
    except ZeroDivisionError:
        price_summaries.append({'business_id': nearby_df['yelp_business_id'][i], 'avg_price': 0})
price_summaries = pd.DataFrame(price_summaries)

In [354]:
nearby_prices_and_rating = rating_summaries.merge(price_summaries, how='outer', on='business_id')

nearby_trimmed_df = nearby_df.merge(nearby_prices_and_rating, how='outer', left_on='yelp_business_id', right_on='business_id')[['business_id','num_nearby_restaurants','avg_price', 'avg_rating']]

restaurants_with_nearby_data = df.merge(nearby_trimmed_df, how='left', on='business_id')

In [374]:
nearby_prices = summaries_from_google(nearby_df, 'price_level')
nearby_ratings = summaries_from_google(nearby_df, 'rating')
nearby_prices_and_rating = nearby_prices.merge(nearby_ratings, how='outer', on='business_id')

In [376]:
restaurants_with_nearby_data = df.merge(nearby_prices_and_rating, how='left', on='business_id')