Create TXT with urls of each restaurant page (just let it run, usually 2 minutes to finish)

In [1]:
import requests
from bs4 import BeautifulSoup

# URL di partenza
base_url = 'https://guide.michelin.com/en/it/restaurants/page/'

def scrape_restaurant_links():
    print("I'm starting to Scrape!")
    page = 1
    all_links = []

    while True:
        # Costruisci l'URL della pagina corrente
        url = f"{base_url}{page}"
        response = requests.get(url)
        
        # Verifica che la richiesta sia andata a buon fine
        if response.status_code != 200:
            print(f"Errore nel caricamento della pagina {page}")
            break

        # Parsing della pagina HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Trova tutti i div con la classe specificata
        for class1_div in soup.select("div.card__menu-content.card__menu-content--flex.js-match-height-content"):
            # Cerca il tag <h3> con classe specificata e il tag <a> figlio
            h3 = class1_div.select_one("h3.card__menu-content--title.pl-text.pl-big.js-match-height-title a")
            if h3:
                link = h3.get("href")
                full_link = "https://guide.michelin.com" + link if link else None
                if full_link:
                    all_links.append(full_link)

        # Trova la sezione di paginazione
        pagination_lis = soup.select("div.js-restaurant__bottom-pagination ul li")
        
        # Trova l'elemento <li> con la classe "active"
        active_index = None
        for i, li in enumerate(pagination_lis):
            if li.select_one("a.active"):
                active_index = i
                break

        # Se c'è una pagina successiva, incrementa il numero di pagina
        if active_index is not None and active_index + 1 < len(pagination_lis):
            next_page = pagination_lis[active_index + 1].select_one("a")
            if next_page and next_page.get("href"):
                page += 1
            else:
                break
        else:
            break

    # Salva tutti i link dei ristoranti in un file
    with open("soupUrls.txt", "w") as file:
        for link in all_links:
            file.write(link + "\n")

    print(f"Scraping completed. {len(all_links)} link saved in soupUrls.txt.")

# Avvia lo scraping
scrape_restaurant_links()

I'm starting to Scrape!


KeyboardInterrupt: 

Download each HTML page using .txt file just created.

In [96]:
#Scaricare html content da url

import aiohttp
import asyncio
import aiofiles
import os

CONCURRENT_REQUESTS = 20  # Lowered to reduce load on the server

async def load_urls(file_path):
    async with aiofiles.open(file_path, 'r') as f:
        urls = [line.strip() for line in await f.readlines()]
    return urls

async def download_url(session, url, output_dir):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        'Referer': 'https://guide.michelin.com/',
        'Accept-Language': 'en-US,en;q=0.9',
    }
    try:
        async with session.get(url, headers=headers) as response:
            if response.status == 200:
                content = await response.text()
                filename = f"{output_dir}/{hash(url)}.html"
                async with aiofiles.open(filename, 'w') as f:
                    await f.write(content)
                print(f"Downloaded: {url}")
            else:
                print(f"Failed to download {url}: Status {response.status}")
    except Exception as e:
        print(f"Error downloading {url}: {e}")

async def download_all(urls, output_dir):
    connector = aiohttp.TCPConnector(limit=CONCURRENT_REQUESTS)
    async with aiohttp.ClientSession(connector=connector) as session:
        tasks = [download_url(session, url, output_dir) for url in urls]
        await asyncio.gather(*tasks)

if __name__ == "__main__":
    file_path = 'soupUrls.txt'
    output_dir = 'downloads'
    os.makedirs(output_dir, exist_ok=True)

    urls = await load_urls(file_path)
    await download_all(urls, output_dir)

Downloaded: https://guide.michelin.com/en/abruzzo/popoli_1845563/restaurant/donevandro
Downloaded: https://guide.michelin.com/en/basilicata/matera/restaurant/da-mo
Downloaded: https://guide.michelin.com/en/campania/gragnano/restaurant/o-me-o-il-mare
Downloaded: https://guide.michelin.com/en/piemonte/alba/restaurant/ape-vino-e-cucina
Downloaded: https://guide.michelin.com/en/campania/sorrento/restaurant/da-bob-cook-fish
Downloaded: https://guide.michelin.com/en/toscana/bibbiena/restaurant/il-tirabuscio262517
Downloaded: https://guide.michelin.com/en/lombardia/marne_7770347/restaurant/metodo-1213628
Downloaded: https://guide.michelin.com/en/emilia-romagna/montegridolfo_1826929/restaurant/osteria-dell-accademia
Downloaded: https://guide.michelin.com/en/piemonte/torino/restaurant/fratelli-bruzzone
Downloaded: https://guide.michelin.com/en/lombardia/aprica/restaurant/gimmy-s
Downloaded: https://guide.michelin.com/en/toscana/fiesole/restaurant/serrae-villa-fiesole
Downloaded: https://guide.m

Scrape each HTML page and create dataframe from data.

In [None]:
import os
import pandas as pd
import re
from bs4 import BeautifulSoup

# Directory containing the downloaded HTML files
output_dir = 'downloads'

# List to store restaurant data
restaurants_data = []

# Function to extract restaurant information from HTML
def extract_restaurant_info(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

        # Extract information using CSS selectors or HTML structure of the page
        restaurant_info = {}
        
        # Ricerca div contenente info principali
        restaurantDetailsDiv = soup.find("div", class_="restaurant-details__components")

        # Ottiene tutte le row contenenti: Nome del ristorante (row1), 
        # Indirizzo, prezzo e tipo cucina (row2), row 3 da scartare 
        mainInfo = restaurantDetailsDiv.select("div.data-sheet > div.row")

        if mainInfo[0]: 
            restaurant_info['restaurantName'] = mainInfo[0].find("h1", class_="data-sheet__title").text
        if mainInfo[1]:
            indirizzo_price = mainInfo[1].select("div.data-sheet__block > div.data-sheet__block--text")

            # Splitta la stringa contenente indirizzo, citta, CAP e nazione
            indirizzoList = indirizzo_price[0].text.strip().split(",")
            # Assegno i valori della lista ai vari 

            # Seleziona gli ultimi tre e li assegna a country, postalCode e city, tutto il resto verrà assegnato ad indirizzo
            restaurant_info['city'] = indirizzoList[-3]
            restaurant_info['postalCode'] = indirizzoList[-2]
            restaurant_info['country'] = indirizzoList[-1]
            restaurant_info['address'] = " ".join(indirizzoList[:-3]).strip().replace("\n", "") # Unisce tutti gli elementi precedenti agli ultimi tre

            # if len(indirizzoList) == 4:
            #     restaurant_info['address'], restaurant_info['city'], restaurant_info['postalCode'], restaurant_info['country'] = indirizzoList
            # elif len(indirizzoList) == 5:
            #     restaurant_info['address'] = indirizzoList[0] + indirizzoList[1]
            #     restaurant_info['city'], restaurant_info['postalCode'], restaurant_info['country'] = indirizzoList[2:]

            # Split della riga contenente price e cuisineType info
            restaurant_info['priceRange'], restaurant_info['cuisineType'] = indirizzo_price[1].text.strip().split("·")
            
            restaurant_info['priceRange'] = restaurant_info['priceRange'].strip()
            # Possibili multiple cuisineType, dividi in lista
            restaurant_info['cuisineType'] = restaurant_info['cuisineType'].strip().split(",")

        # Description
        restaurant_info['description'] = soup.find("div", class_="data-sheet__description").text.strip().replace("\n", "")

        # Facilities and Services
        facilities = soup.select("div.restaurant-details__services ul li")
        restaurant_info['facilitiesServices'] = [s.text.strip() for s in facilities]
        
        # Accepted Credit Cards
        credit_cards = soup.select("div.list--card img")
        restaurant_info['creditCards'] = [re.search(r"(?<=\/)[a-z]*(?=-)", c.get("data-src"))[0] for c in credit_cards]
        
        # Phone Number
        spansDetails = restaurantDetailsDiv.select("section.section.section-main.section__text-componets.section__text-separator div.collapse__block-title div.d-flex span")
        restaurant_info['phoneNumber'] = spansDetails[0].text.strip()

        # Website URL (TODO)
        # restaurant_info['website'] = soup.select_one("a.website-link-selector")["href"].strip() if soup.select_one("a.website-link-selector") else None
        
    return restaurant_info

# Loop through all files in the directory and extract information
for filename in os.listdir(output_dir):
    if filename.endswith(".html"):
        print(filename)
        file_path = os.path.join(output_dir, filename)
        restaurant_info = extract_restaurant_info(file_path)
        restaurants_data.append(restaurant_info)

# Create a pandas DataFrame
df = pd.DataFrame(restaurants_data)

# Save the data to a CSV file
df.to_csv("restaurants_data.tsv", sep='\t', index=False)
print("Data saved to restaurants_data.csv")

-100070665522736091.html
-1012608591583606085.html
-1013275789882566635.html
-1026459696075863068.html
-1037761043025625296.html
-1041772283627193997.html
-1051921375776575268.html
-1056458440651219940.html
-1087933568841972890.html
-111196332836184501.html
-1122466102939635517.html
-1128731602424202350.html
-1136325965565758712.html
-1137639846157090016.html
-1150722271067412820.html
-1156190570324232514.html
-1160168931336690724.html
-1166996782643266922.html
-1171918725511351568.html
-1197012305151801552.html
-1204532554245692151.html
-1215754382005287082.html
-121923663188428701.html
-1219720164453460267.html
-1229511487850320122.html
-1229693020031429287.html
-1230549677587707491.html
-1231419902611711250.html
-1233550557235923350.html
-1233931975949475388.html
-123660809214175220.html
-1245036024120404542.html
-1254648223110685005.html
-1255825721844022357.html
-1256794833802928154.html
-1278408564502704111.html
-1301612643745234153.html
-1303913916405864803.html
-130807611001226