In [1]:
import pandas as pd
import numpy as np
import json
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait 
import time
import os
from collections import defaultdict

In [2]:
# get addresses from trulia files
df = pd.read_csv('../data/trulia_cleaned.csv')

In [3]:
# function creates part of yelp url string

def get_link_part(row):

    address_string = row.replace(',','')
    address_elements = address_string.split(' ')
    adrs_part = ('+').join(address_elements)
    return(adrs_part)

In [4]:
df['adrs_part'] = df.address.apply(get_link_part)

In [5]:
yelp_df = df[['url','address','adrs_part']].copy()

In [6]:
yelp_df.head(10)

Unnamed: 0,url,address,adrs_part
0,https://www.trulia.com/p/ny/manhattan/66-w-38t...,"66 West 38th Street, Manhattan, NY",66+West+38th+Street+Manhattan+NY
1,https://www.trulia.com/p/ny/manhattan/435-w-45...,"435 West 45th Street, Manhattan, NY",435+West+45th+Street+Manhattan+NY
2,https://www.trulia.com/p/ny/manhattan/56-w-11t...,"56 West 11th Street, Manhattan, NY",56+West+11th+Street+Manhattan+NY
3,https://www.trulia.com/p/ny/manhattan/298-mulb...,"298 Mulberry Street, Manhattan, NY",298+Mulberry+Street+Manhattan+NY
4,https://www.trulia.com/p/ny/manhattan/71-w-107...,"71 West 107th Street, Manhattan, NY",71+West+107th+Street+Manhattan+NY
5,https://www.trulia.com/p/ny/manhattan/200-e-72...,"200 East 72nd Street, Manhattan, NY",200+East+72nd+Street+Manhattan+NY
6,https://www.trulia.com/p/ny/new-york/516-e-78t...,"516 East 78th Street, New York, NY",516+East+78th+Street+New+York+NY
7,https://www.trulia.com/p/ny/manhattan/175-w-95...,"175 West 95th Street, Manhattan, NY",175+West+95th+Street+Manhattan+NY
8,https://www.trulia.com/p/ny/manhattan/95-e-7th...,"95 East 7th Street, Manhattan, NY",95+East+7th+Street+Manhattan+NY
9,https://www.trulia.com/p/ny/manhattan/39-bedfo...,"39 Bedford Street, Manhattan, NY",39+Bedford+Street+Manhattan+NY


# Scraper 

In [7]:
def url_link(address):
    url = "https://www.yelp.com/search?find_loc=" + address + "&cflt=restaurants"
    return(url)

In [8]:
def yelp_scraper(url):
    driver.get(url)
    # get total number of restaurants 
    total_rest_loc = '//span[contains(text(),"Showing 1")]'
    total_rest_raw = driver.find_element_by_xpath(total_rest_loc).text
    total_rest = int(re.sub(r'Showing 1.*of\s','',total_rest_raw))
    
    button1 = driver.find_element_by_xpath('//span[@class="filter-label filters-toggle js-all-filters-toggle show-tooltip"]')
    button1.click()
    time.sleep(1)
    
    button2 = driver.find_element_by_xpath('//span[contains(text(),"Walking (1 mi.)")]')
    button2.click()
    time.sleep(2)
    
    rest_num_loc = '//span[contains(text(),"Showing 1")]'
    rest_num_raw = driver.find_element_by_xpath(rest_num_loc).text
    rest_num = int(re.sub(r'Showing 1.*of\s','',rest_num_raw))
    
    if total_rest==rest_num:
        
        button3 = driver.find_element_by_xpath('//span[contains(text(),"Biking (2 mi.)")]')
        button3.click()
        time.sleep(2)
        
        button4 = driver.find_element_by_xpath('//span[contains(text(),"Walking (1 mi.)")]')
        button4.click()
        time.sleep(2)
        
        rest_num_loc = '//span[contains(text(),"Showing 1")]'
        rest_num_raw = driver.find_element_by_xpath(rest_num_loc).text
        rest_num = int(re.sub(r'Showing 1.*of\s','',rest_num_raw))

        
    return(rest_num)

In [9]:
# create yelp links
yelp_df['url_total_rest'] = yelp_df.address.apply(url_link)

# get a list of unqiue addresses
urls_total_rest = list(set(yelp_df.url_total_rest))
len(urls_total_rest)

4815

In [13]:
# split list into smaller lists
cut_offs = list(np.arange(10, 5000, 10))

lol = []
i = 0
for c in cut_offs:
    list_slice = urls_total_rest[i:c]
    i = c
    lol.append(list_slice)

In [14]:
len(lol)

499

In [15]:
lol[0]

['https://www.yelp.com/search?find_loc=119 West 69th Street, Manhattan, NY&cflt=restaurants',
 'https://www.yelp.com/search?find_loc=250 West 90th Street, Manhattan, NY&cflt=restaurants',
 'https://www.yelp.com/search?find_loc=24060 65th Avenue, Douglaston, NY&cflt=restaurants',
 'https://www.yelp.com/search?find_loc=321 61st Street, Brooklyn, NY&cflt=restaurants',
 'https://www.yelp.com/search?find_loc=7525 153rd Street, Flushing, NY&cflt=restaurants',
 'https://www.yelp.com/search?find_loc=350 West 43rd Street, Manhattan, NY&cflt=restaurants',
 'https://www.yelp.com/search?find_loc=2475 Palisade Avenue, Bronx, NY&cflt=restaurants',
 'https://www.yelp.com/search?find_loc=484 Broome Street, Manhattan, NY&cflt=restaurants',
 'https://www.yelp.com/search?find_loc=21 West 16th Street, Manhattan, NY&cflt=restaurants',
 'https://www.yelp.com/search?find_loc=405 Main Street, New York, NY&cflt=restaurants']

In [None]:
# Scrape!
chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)

for url_list in lol[4:]:
    
    yelp_data = defaultdict(list)
    list_idx = lol.index(url_list)
   
    for url in url_list:
        yelp_data[url] = yelp_scraper(url)
        
    json.dump(yelp_data, open(f'../data/yelp_json/yelp_{list_idx}.json', 'w'), indent="\t")
    print(f'Scraped file {list_idx}')
    
driver.close()