# Fetch restaurant details from URL database of cities

In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import re
import time
import json
import csv
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm
import pandas as pd

# HTTP headers attached to every restaurant-page request made by get_rest_details:
# a browser-like User-Agent plus an Accept list of HTML/XML content types.
# NOTE(review): presumably needed because the site rejects urllib's default agent — confirm.
HEADERS = {'User-Agent': 'Mozilla/5.0', 'Accept': 'text/html,application/xhtml+xml,application/xml'}

# Column names (and their order) of the per-city details table; used as the default
# `columnsList` of scrap_city_restaurants. The names match the keys of the dict that
# get_rest_details returns on success.
COLUMNS_LIST = ['name', 'url', 'city', 'cuisine', 'avgRating', 'numReviews', 'priceRange', 'latitude', 'longitude', 'telephone', 'postalCode', 'streetAddress', 'addressLocality', 'addressRegion', 'openingHoursSpecification', 'addressString', 'geoString', 'ratingString']

In [None]:
def _json_section(parent, key):
    """Return ``parent[key]`` when it is a dict, otherwise an empty dict."""
    # A key that is present but null (or a plain string) would break a chained
    # ``.get(key, {}).get(...)`` and cost us the whole restaurant row.
    value = parent.get(key)
    return value if isinstance(value, dict) else {}


def get_rest_details(row):
    """Fetch one restaurant page and extract fields from its JSON-LD block.

    Parameters
    ----------
    row : mapping
        Must provide ``row['city']`` and ``row['url']``.

    Returns
    -------
    dict
        On success: ``name``, ``url``, ``city`` and the fields read from the
        first ``<script type="application/ld+json">`` element of the page
        (missing optional fields are ``None``).
        On any failure while fetching or parsing (network error, no JSON-LD
        tag, invalid JSON, missing ``name``): the placeholder
        ``{'city': ..., 'url': ..., 'name': None}``. The failure is logged at
        DEBUG level instead of being raised.

    Raises
    ------
    KeyError
        If ``row`` lacks ``'city'`` or ``'url'``.
    """
    import logging  # local import: not part of the notebook's import cell

    rest_city = row['city']
    rest_url = row['url']
    fallback = {'city': rest_city, 'url': rest_url, 'name': None}
    try:
        req = Request(rest_url, headers=HEADERS)
        # Context manager guarantees the connection is released even on error.
        with urlopen(req, timeout=20) as response:
            webpage = response.read()

        soup = BeautifulSoup(webpage, 'html.parser')
        ld_tag = soup.find("script", type="application/ld+json")
        if ld_tag is None:
            raise ValueError('no application/ld+json script tag in page')
        obj = json.loads(ld_tag.text.replace('&amp;', '&'))

        rating = _json_section(obj, 'aggregateRating')
        geo = _json_section(obj, 'geo')
        address = _json_section(obj, 'address')
        return {
            'name': obj['name'],  # mandatory: a page without a name counts as failed
            'url': rest_url,
            'city': rest_city,
            'cuisine': obj.get('servesCuisine'),
            'avgRating': rating.get('ratingValue'),
            'numReviews': rating.get('reviewCount'),
            'priceRange': obj.get('priceRange'),
            'latitude': geo.get('latitude'),
            'longitude': geo.get('longitude'),
            'telephone': obj.get('telephone'),
            'postalCode': address.get('postalCode'),
            'streetAddress': address.get('streetAddress'),
            'addressLocality': address.get('addressLocality'),
            'addressRegion': address.get('addressRegion'),
            'openingHoursSpecification': obj.get('openingHoursSpecification'),
            # Raw sub-objects are kept alongside the flattened fields.
            'addressString': obj.get('address'),
            'geoString': obj.get('geo'),
            'ratingString': obj.get('aggregateRating'),
        }
    except Exception as e:
        # Deliberately broad: this is the per-page boundary of a best-effort
        # scrape, so a bad page yields a placeholder row rather than an error.
        logging.getLogger(__name__).debug(
            "Restaurant error: %s url: %s", e, rest_url
        )
        return fallback

In [None]:
def scrap_city_restaurants(cityname, columnsList=COLUMNS_LIST, max_workers=500):
    """Scrape details for every restaurant URL listed for one city.

    Reads ``./scraped-data/restaurant-urls/<cityname>.csv``, calls
    ``get_rest_details`` on each row in a thread pool, and collects the
    returned dicts into a DataFrame.

    Parameters
    ----------
    cityname : str
        Base name of the city's URL CSV (without the ``.csv`` extension).
    columnsList : list of str
        Columns (and order) of the resulting DataFrame.
    max_workers : int
        Upper bound on concurrent fetch threads.

    Returns
    -------
    pandas.DataFrame
        One row per input URL, in the same order as the URL CSV.
    """
    urls_df = pd.read_csv('./scraped-data/restaurant-urls/'+cityname+'.csv')
    # Pre-sized so each result lands at its input position regardless of
    # which thread finishes first; keeps the output order reproducible.
    restaurant_list = [None] * len(urls_df)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_pos = {
            executor.submit(get_rest_details, row): pos
            for pos, (_, row) in enumerate(urls_df.iterrows())
        }
        # as_completed is a generator, so tqdm needs an explicit total to
        # show a percentage / ETA.
        progress = tqdm(as_completed(future_to_pos),
                        total=len(future_to_pos), desc=cityname)
        for future in progress:
            restaurant_list[future_to_pos[future]] = future.result()

    return pd.DataFrame(restaurant_list, columns=columnsList)

In [None]:
def store_city_restaurants(df, cityname):
    """Save *df* as ``<cityname>-details.csv`` in the restaurant-details folder.

    The DataFrame index is not written to the file.
    """
    out_dir = './scraped-data/restaurant-details/'
    suffix = '-details.csv'
    df.to_csv(out_dir + cityname + suffix, index=False)

In [None]:
def execute_city(city_url):
    """Scrape and store the restaurant details of the city behind *city_url*.

    The city name is taken to be the last path segment of the URL; it selects
    the input CSV read by ``scrap_city_restaurants`` and names the output CSV
    written by ``store_city_restaurants``.
    """
    # Strip trailing slashes first: "…/paris/".split('/')[-1] would be '' and
    # the city would be read from / written to a file named ".csv".
    cityname = city_url.rstrip('/').rsplit('/', 1)[-1]
    city_restaurants_df = scrap_city_restaurants(cityname)
    store_city_restaurants(city_restaurants_df, cityname)

In [None]:
# Driver: run the scrape-and-store pipeline once per city listed in the cities CSV.
city_df = pd.read_csv("./scraped-data/UE-cities.csv")
# Iterate the 'url' column directly (no per-row Series from iterrows) and give
# tqdm the row count so the bar can show progress against a known total.
for city_url in tqdm(city_df['url'], total=len(city_df), desc="cities"):
    execute_city(city_url)