In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Notebook display setting: with "all", IPython echoes the value of every
# top-level expression statement in a cell instead of only the last one.
InteractiveShell.ast_node_interactivity = "all"

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import re
import time
import json
import csv
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm_notebook
import pandas as pd

# HTTP request headers attached to every restaurant page request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml',
}

# Column order of the per-city details DataFrame / CSV.
COLUMNS_LIST = [
    # identity
    'name', 'url', 'city',
    # listing attributes
    'cuisine', 'avgRating', 'numReviews', 'priceRange',
    # location and contact
    'latitude', 'longitude', 'telephone',
    'postalCode', 'streetAddress', 'addressLocality', 'addressRegion',
    'openingHoursSpecification',
    # raw (unflattened) JSON-LD sections
    'addressString', 'geoString', 'ratingString',
]

In [2]:
def get_rest_details(row):
    """Download one restaurant page and extract its JSON-LD details.

    Parameters
    ----------
    row : mapping
        Must support ``row['city']`` and ``row['url']``.

    Returns
    -------
    dict
        On success, a dict with the keys name, url, city, cuisine, avgRating,
        numReviews, priceRange, latitude, longitude, telephone, postalCode,
        streetAddress, addressLocality, addressRegion,
        openingHoursSpecification, addressString, geoString and ratingString.
        On any failure, only ``{'city': ..., 'url': ..., 'name': None}``.

    Notes
    -----
    Errors are printed, never raised, so one bad page does not abort a batch.
    """
    rest_city = row['city']
    rest_url = row['url']
    # Fallback record returned unchanged when anything below fails.
    dict1 = {'city': rest_city, 'url': rest_url, 'name': None}
    try:
        req = Request(rest_url, headers=HEADERS)
        # Close the HTTP response deterministically instead of relying on
        # garbage collection; many of these run concurrently.
        with urlopen(req, timeout=20) as response:
            webpage = response.read()

        soup = BeautifulSoup(webpage, 'html.parser')
        ld_json_tag = soup.find("script", type="application/ld+json")
        if ld_json_tag is None:
            # Give a readable reason instead of "'NoneType' has no attribute 'text'".
            raise ValueError("no application/ld+json script tag found")
        obj = json.loads(ld_json_tag.text.replace('&amp;', '&'))

        def section(key):
            # A key that is present but null (or not an object) must not lose
            # the whole record; treat it as an empty section instead.
            value = obj.get(key)
            return value if isinstance(value, dict) else {}

        rating = section('aggregateRating')
        geo = section('geo')
        address = section('address')

        dict1 = {
            'name': obj['name'],
            'url': rest_url,
            'city': rest_city,
            'cuisine': obj.get('servesCuisine'),
            'avgRating': rating.get('ratingValue'),
            'numReviews': rating.get('reviewCount'),
            'priceRange': obj.get('priceRange'),
            'latitude': geo.get('latitude'),
            'longitude': geo.get('longitude'),
            'telephone': obj.get('telephone'),
            'postalCode': address.get('postalCode'),
            'streetAddress': address.get('streetAddress'),
            'addressLocality': address.get('addressLocality'),
            'addressRegion': address.get('addressRegion'),
            'openingHoursSpecification': obj.get('openingHoursSpecification'),
            # Raw sections are kept as-is for later inspection.
            'addressString': obj.get('address'),
            'geoString': obj.get('geo'),
            'ratingString': obj.get('aggregateRating'),
        }
    except Exception as e:
        # Deliberate best-effort: log and fall through to the fallback record.
        print(f"Restaurant error: {e} url: {rest_url}")

    return dict1

In [3]:
def scrap_city_restaurants(cityname, columnsList=COLUMNS_LIST, max_workers=100):
    """Scrape details for every restaurant URL listed for one city.

    Reads ``./archive/<cityname>.csv``, passes each of its rows to
    ``get_rest_details`` on a thread pool, and collects the results.

    Parameters
    ----------
    cityname : str
        City slug; selects the input CSV.
    columnsList : list of str, optional
        Columns (and their order) of the returned frame.
    max_workers : int, optional
        Size of the thread pool. Defaults to 100.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in the same order as the input CSV. Keys
        missing from a result dict show up as missing values.
    """
    urls_df = pd.read_csv('./archive/' + cityname + '.csv')
    num_restaurants = urls_df.shape[0]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Futures are kept in submission order so the output order is
        # reproducible rather than depending on which request finishes first.
        futures = [
            executor.submit(get_rest_details, row)
            for _, row in urls_df.iterrows()
        ]
        # as_completed is used only to advance the progress bar; `total` lets
        # the bar show a percentage, since as_completed has no length.
        for _ in tqdm_notebook(as_completed(futures), total=num_restaurants, desc=cityname):
            pass

    restaurant_list = [future.result() for future in futures]
    details_df = pd.DataFrame(restaurant_list, columns=columnsList)
    return details_df

In [4]:
def store_city_restaurants(df, cityname):
    """Write a city's restaurant details to CSV.

    The file is written to
    ``./scraped-data/rest-details/<cityname>-details.csv`` without the index
    column. The output directory is created if it does not exist yet, so a
    finished scrape is not lost to a missing folder.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    cityname : str
        City slug used to build the file name.
    """
    import os  # local import keeps this cell self-contained

    filename = './scraped-data/rest-details/'+cityname+'-details.csv'
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    df.to_csv(filename, index=False)

In [5]:
def execute_city(cityname):
    """Scrape one city's restaurants and persist the resulting frame.

    Chains ``scrap_city_restaurants`` and ``store_city_restaurants`` for the
    given city slug. Returns nothing.
    """
    store_city_restaurants(scrap_city_restaurants(cityname), cityname)

In [6]:
# Run the scrape-and-store pipeline for the "new-york" city slug.
execute_city(cityname="new-york")

Restaurant error: HTTP Error 410: Gone url: https://www.ubereats.com/new-york/food-delivery/le-tea/ZFKlr3UdQ2WhJjYp5FU02A
Restaurant error: Remote end closed connection without response url: https://www.ubereats.com/new-york/food-delivery/umami-burger-57th-street/2RpbVoSRTjefmzq1sd85BQ
Restaurant error: HTTP Error 404: Not Found url: https://www.ubereats.com/new-york/food-delivery/healthy-market-%26-deli/D1Sw07WwQi-39-mzSDcuVQ
Restaurant error: HTTP Error 410: Gone url: https://www.ubereats.com/new-york/food-delivery/lions-tigers-%26-squares-detroit-pizza-2nd-ave/HHYBFvGuRY-_WTy3chiNrg
Restaurant error: HTTP Error 410: Gone url: https://www.ubereats.com/new-york/food-delivery/bar-pa-tea/n22IIJ6fQ4qPqzZ6W5SnpQ
Restaurant error: HTTP Error 410: Gone url: https://www.ubereats.com/new-york/food-delivery/the-wayfarer/4aNGnGZZSg-0A1gIksZHwQ
Restaurant error: Remote end closed connection without response url: https://www.ubereats.com/new-york/food-delivery/seven-grams-caffe-chelsea/0Fmj7UHyS5