# Fetch URLs of all Uber Eats restaurants in a city

In [None]:
# Notebook display setting: with "all", IPython echoes the value of every
# top-level expression in a cell instead of only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import re
import time
import json
import csv
import os
import pandas as pd
from tqdm.notebook import tqdm
from urllib.error import URLError
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# Default HTTP headers for page requests; used as the `headers` default of
# get_city_restaurants.
# NOTE(review): presumably the browser-like User-Agent is there because the
# site rejects urllib's default one -- confirm.
HEADERS = {'User-Agent': 'Mozilla/5.0', 'Accept': 'text/html,application/xhtml+xml,application/xml'}

### Popular Uber Eats cuisines

Popular cuisines are those that have a large number of restaurants tagged with them. New York City data was used to choose these cuisines, which makes the script run faster (at the risk of missing a few restaurants). Out of the 204 cuisines in total, the top 52 were chosen by sorting on the number of tagged restaurants.

We don't miss out on many restaurants, as there are hardly any restaurants with only a single tag.

If you still want to run the script for all cuisines, open the `UE-categories-all.txt` file instead.

In [None]:
# Load the category (cuisine) names, one per line.
# The context manager closes the file handle as soon as reading is done.
# Each line is stripped and blank lines are skipped, so stray whitespace or
# extra empty lines never turn into bogus categories (a trailing space would
# otherwise become a trailing "-" once the name is slugified for the URL).
with open("./scraped-data/UE-categories-popular.txt", encoding="utf-8") as f:
    categories = [line.strip() for line in f if line.strip()]
len(categories)  # echoed by the notebook as a quick sanity check

### Fetch all restaurants in a city

 1) Check if the city webpage exists
 
 2) If city exists, then for each category, fetch all restaurants with the current category (cuisine) tag.
 
 3) Store URLs fetched from all categories in a single set for the city `restaurant_set`. This will ensure that if a restaurant has more than one category tag, then it won't be stored twice.

In [None]:
def get_city_restaurants(city_url, categories, headers=HEADERS):
    """Collect the restaurant links listed on a city's category pages.

    The city page is requested first; if that request fails, the error is
    printed and an empty set is returned. Otherwise each category name is
    lowercased, its spaces are replaced by hyphens, and the page
    ``https://www.ubereats.com/category/<city>/<category>`` is fetched.
    Every anchor whose href has ``food-delivery`` as its third
    ``/``-separated component is collected.

    Args:
        city_url: URL of the city page; its last path segment is used as the
            city name in the category URLs.
        categories: Iterable of category (cuisine) names.
        headers: HTTP headers sent with every request.

    Returns:
        A set of href strings exactly as they appear in the pages. A
        restaurant listed under several categories appears only once.
    """
    restaurant_set = set()
    # Ignore a trailing slash so ".../<city>/" still yields the city name.
    cityname = city_url.rstrip('/').split('/')[-1]

    # checking if city exists
    try:
        # The context manager closes the connection; with many worker threads
        # unclosed responses would pile up.
        with urlopen(Request(city_url, headers=headers), timeout=20) as response:
            response.read()
    except Exception as e:
        print(f"City error: {e}, url: {city_url}")
        return restaurant_set

    # now fetching all restaurants urls category-wise, and storing urls in restaurant_set
    for category in tqdm(categories, desc=cityname):
        slug = category.lower().replace(" ", "-")
        category_city_url = "https://www.ubereats.com/category/" + cityname + "/" + slug
        try:
            # fetching restaurants in a category
            with urlopen(Request(category_city_url, headers=headers), timeout=20) as response:
                webpage = response.read()
            soup = BeautifulSoup(webpage, 'html.parser')

            # href=True skips anchors that have no href attribute; indexing
            # them would raise KeyError and discard the whole category.
            for anchor in soup.find_all("a", href=True):
                href = anchor['href']
                parts = href.split('/')
                if len(parts) >= 3 and parts[2] == 'food-delivery':
                    restaurant_set.add(href)
        except Exception as e:
            # Best effort: one failing category must not abort the city, but
            # the failure is reported so incomplete results are noticeable.
            print(f"Category error: {e}, url: {category_city_url}")

    return restaurant_set

### Storing the restaurant set as CSV file

In [None]:
def store_urls(city_url, city_restaurants):
    """Write a city's restaurant URLs to a CSV file.

    The file is ``./scraped-data/restaurant-urls/<city>.csv`` with the columns
    ``url`` and ``city`` and no index column. An empty iterable produces a
    header-only file.

    Args:
        city_url: URL of the city page; its last path segment names the CSV
            file and fills the ``city`` column.
        city_restaurants: Iterable of site-relative restaurant paths; each is
            prefixed with ``https://www.ubereats.com``.
    """
    # Ignore a trailing slash so ".../<city>/" does not give an empty name.
    cityname = city_url.rstrip('/').split('/')[-1]
    out_dir = './scraped-data/restaurant-urls/'
    filename = out_dir + cityname + '.csv'

    # Build every row first and construct the frame once; assigning
    # city_df.loc[i] row by row enlarges the frame on each insert.
    rows = [
        ("https://www.ubereats.com" + rest_url, cityname)
        for rest_url in city_restaurants
    ]
    city_df = pd.DataFrame(rows, columns=['url', 'city'])

    # Create the output directory if needed so a finished scrape is not lost
    # to a missing folder.
    os.makedirs(out_dir, exist_ok=True)
    city_df.to_csv(filename, index=False)

### Running the script for a single city

In [None]:
def execute_city(city_url, categories=categories):
    """Scrape one city and save its restaurant URLs to CSV.

    Fetches the restaurant links for ``city_url`` with
    ``get_city_restaurants`` and hands them to ``store_urls``.
    ``categories`` defaults to the module-level category list.
    """
    store_urls(city_url, get_city_restaurants(city_url, categories))

### Parallel processing for scraping all cities
Set max_workers to a suitable value. When run on a Google Colab instance, I was able to run ~80 threads in parallel

In [None]:
# Scrape every city listed in UE-cities.csv, running up to max_workers cities
# concurrently.
city_df = pd.read_csv("./scraped-data/UE-cities.csv")
with ThreadPoolExecutor(max_workers = 100) as executor:
    future_to_url = {executor.submit(execute_city, city_url): city_url for city_url in city_df['url']}
    # as_completed() is a generator with no length, so pass total= explicitly
    # to give the progress bar a known end.
    for future in tqdm(as_completed(future_to_url), total=len(future_to_url), desc="cities"):
        city_url = future_to_url[future]
        try:
            # result() re-raises whatever the worker raised; without this call
            # a failed city would be reported as done.
            future.result()
        except Exception as e:
            print(f"URL Failed: {city_url}, error: {e}")
        else:
            print("URL Done: ", city_url)