In [2]:
!pip install aiofiles aiohttp

Collecting aiofiles
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Downloading aiofiles-24.1.0-py3-none-any.whl (15 kB)
Installing collected packages: aiofiles
Successfully installed aiofiles-24.1.0


In [12]:
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/235.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.8


# 1. Data Collection

Create TXT with urls of each restaurant page (just let it run, usually 2 minutes to finish)

In [1]:
import requests
from bs4 import BeautifulSoup

# URL di partenza
base_url = 'https://guide.michelin.com/en/it/restaurants/page/'

def scrape_restaurant_links():
    print("I'm starting to Scrape!")
    page = 1
    all_links = []

    while True:
        # Costruisci l'URL della pagina corrente
        url = f"{base_url}{page}"
        response = requests.get(url)

        # Verifica che la richiesta sia andata a buon fine
        if response.status_code != 200:
            print(f"Errore nel caricamento della pagina {page}")
            break

        # Parsing della pagina HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Trova tutti i div con la classe specificata
        for class1_div in soup.select("div.card__menu-content.card__menu-content--flex.js-match-height-content"):
            # Cerca il tag <h3> con classe specificata e il tag <a> figlio
            h3 = class1_div.select_one("h3.card__menu-content--title.pl-text.pl-big.js-match-height-title a")
            if h3:
                link = h3.get("href")
                full_link = "https://guide.michelin.com" + link if link else None
                if full_link:
                    all_links.append(full_link)

        # Trova la sezione di paginazione
        pagination_lis = soup.select("div.js-restaurant__bottom-pagination ul li")

        # Trova l'elemento <li> con la classe "active"
        active_index = None
        for i, li in enumerate(pagination_lis):
            if li.select_one("a.active"):
                active_index = i
                break

        # Se c'è una pagina successiva, incrementa il numero di pagina
        if active_index is not None and active_index + 1 < len(pagination_lis):
            next_page = pagination_lis[active_index + 1].select_one("a")
            if next_page and next_page.get("href"):
                page += 1
            else:
                break
        else:
            break

    # Salva tutti i link dei ristoranti in un file
    with open("soupUrls.txt", "w") as file:
        for link in all_links:
            file.write(link + "\n")

    print(f"Scraping completed. {len(all_links)} link saved in soupUrls.txt.")

# Avvia lo scraping
scrape_restaurant_links()

I'm starting to Scrape!
Scraping completed. 1983 link saved in soupUrls.txt.


Download each HTML page using .txt file just created.

In [3]:
#Scaricare html content da url

import aiohttp
import asyncio
import aiofiles
import os

CONCURRENT_REQUESTS = 20  # Lowered to reduce load on the server

async def load_urls(file_path):
    async with aiofiles.open(file_path, 'r') as f:
        urls = [line.strip() for line in await f.readlines()]
    return urls

async def download_url(session, url, output_dir):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        'Referer': 'https://guide.michelin.com/',
        'Accept-Language': 'en-US,en;q=0.9',
    }
    try:
        async with session.get(url, headers=headers) as response:
            if response.status == 200:
                content = await response.text()
                filename = f"{output_dir}/{hash(url)}.html"
                async with aiofiles.open(filename, 'w') as f:
                    await f.write(content)
                print(f"Downloaded: {url}")
            else:
                print(f"Failed to download {url}: Status {response.status}")
    except Exception as e:
        print(f"Error downloading {url}: {e}")

async def download_all(urls, output_dir):
    connector = aiohttp.TCPConnector(limit=CONCURRENT_REQUESTS)
    async with aiohttp.ClientSession(connector=connector) as session:
        tasks = [download_url(session, url, output_dir) for url in urls]
        await asyncio.gather(*tasks)

if __name__ == "__main__":
    file_path = 'soupUrls.txt'
    output_dir = 'downloads'
    os.makedirs(output_dir, exist_ok=True)

    urls = await load_urls(file_path)
    await download_all(urls, output_dir)

Downloaded: https://guide.michelin.com/en/emilia-romagna/noceto_1827072/restaurant/palazzo-utini
Downloaded: https://guide.michelin.com/en/liguria/genova/restaurant/etra
Downloaded: https://guide.michelin.com/en/campania/marina-di-casal-velino/restaurant/alessandro-feo
Downloaded: https://guide.michelin.com/en/toscana/castiglione-della-pescaia/restaurant/la-trattoria-enrico-bartolini
Downloaded: https://guide.michelin.com/en/lombardia/cervesina/restaurant/dama-1213583
Downloaded: https://guide.michelin.com/en/campania/sorrento/restaurant/da-bob-cook-fish
Downloaded: https://guide.michelin.com/en/abruzzo/popoli_1845563/restaurant/donevandro
Downloaded: https://guide.michelin.com/en/piemonte/alba/restaurant/ape-vino-e-cucina
Downloaded: https://guide.michelin.com/en/lombardia/milano/restaurant/procaccini
Downloaded: https://guide.michelin.com/en/toscana/bibbiena/restaurant/il-tirabuscio262517
Downloaded: https://guide.michelin.com/en/campania/sorrento/restaurant/soul-fish
Downloaded: htt

Scrape each HTML page and create dataframe from data. (TODO: add URL of each restaurant to dataset)

In [4]:
import os
import pandas as pd
import re
from bs4 import BeautifulSoup

# Directory containing the downloaded HTML files
output_dir = 'downloads'

# List to store restaurant data
restaurants_data = []

# Function to extract restaurant information from HTML
def extract_restaurant_info(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

        # Extract information using CSS selectors or HTML structure of the page
        restaurant_info = {}

        # Ricerca div contenente info principali
        restaurantDetailsDiv = soup.find("div", class_="restaurant-details__components")

        # Ottiene tutte le row contenenti: Nome del ristorante (row1),
        # Indirizzo, prezzo e tipo cucina (row2), row 3 da scartare
        mainInfo = restaurantDetailsDiv.select("div.data-sheet > div.row")

        if mainInfo[0]:
            restaurant_info['restaurantName'] = mainInfo[0].find("h1", class_="data-sheet__title").text
        if mainInfo[1]:
            indirizzo_price = mainInfo[1].select("div.data-sheet__block > div.data-sheet__block--text")

            # Splitta la stringa contenente indirizzo, citta, CAP e nazione
            indirizzoList = indirizzo_price[0].text.strip().split(",")

            # Seleziona gli ultimi tre e li assegna a country, postalCode e city, tutto il resto verrà assegnato ad indirizzo
            restaurant_info['city'] = indirizzoList[-3]
            restaurant_info['postalCode'] = indirizzoList[-2]
            restaurant_info['country'] = indirizzoList[-1]
            restaurant_info['address'] = " ".join(indirizzoList[:-3]).strip().replace("\n", "") # Unisce tutti gli elementi precedenti agli ultimi tre

            # Split della riga contenente price e cuisineType info
            restaurant_info['priceRange'], restaurant_info['cuisineType'] = indirizzo_price[1].text.strip().split("·")

            restaurant_info['priceRange'] = restaurant_info['priceRange'].strip()
            # Possibili multiple cuisineType, dividi in lista
            restaurant_info['cuisineType'] = restaurant_info['cuisineType'].strip().split(",")

        # Description
        restaurant_info['description'] = soup.find("div", class_="data-sheet__description").text.strip().replace("\n", "")

        # Facilities and Services
        facilities = soup.select("div.restaurant-details__services ul li")
        restaurant_info['facilitiesServices'] = [s.text.strip() for s in facilities]

        # Accepted Credit Cards
        credit_cards = soup.select("div.list--card img")
        restaurant_info['creditCards'] = [re.search(r"(?<=\/)[a-z]*(?=-)", c.get("data-src"))[0] for c in credit_cards]

        # Phone Number
        spansDetails = restaurantDetailsDiv.select("section.section.section-main.section__text-componets.section__text-separator div.collapse__block-title div.d-flex span")
        restaurant_info['phoneNumber'] = spansDetails[0].text.strip()

        # URL
        restaurant_info['website'] = soup.find("meta", property="og:url")["content"]

    return restaurant_info

# Loop through all files in the directory and extract information
for filename in os.listdir(output_dir):
    if filename.endswith(".html"):
        print(filename)
        file_path = os.path.join(output_dir, filename)
        restaurant_info = extract_restaurant_info(file_path)
        restaurants_data.append(restaurant_info)

# Create a pandas DataFrame
df = pd.DataFrame(restaurants_data)

# Save the data to a CSV file
df.to_csv("restaurants_data.tsv", sep='\t', index=False)
print("Data saved to restaurants_data.csv")

-1555992944052291252.html
3474728177582747135.html
-6621960012199011169.html
-1819259419380261519.html
1580202467623608718.html
-5795041316137700025.html
3539819720297207242.html
-9084943818133947763.html
-5321099145178601626.html
2130969920192603014.html
-2443611346587866788.html
-9171318619265868045.html
-9055519237437065912.html
-8617842784970996678.html
-4398084470863324850.html
-8071530692275292597.html
-6585996695224019982.html
-1367473743188929020.html
1865842064598906739.html
-7307536003896134834.html
6416168087842470367.html
2520161104594234066.html
-7134396952618494636.html
5650483420522940488.html
4029520904349495783.html
-3037575973914685608.html
-2895158356682905199.html
821146815773968440.html
-4418089486786369173.html
-2933120095093466019.html
-859447051601005571.html
-5157291223448444325.html
-6039241764189205550.html
4090626460166397892.html
-2121357832143728458.html
3220979462530893667.html
3945082836243799109.html
-1184704043871200801.html
7659224419088951333.html
36

# 2. Search Engine

## 2.0 Preprocessing

In [63]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from unidecode import unidecode
import string
import unicodedata
import pandas as pd
#from nltk.corpus import wordnet as wn
from collections import defaultdict
#nltk.download('wordnet')
#nltk.download('omw-1.4')
import re
import pickle

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [67]:
def preprocessing(doc):
    '''
    Function that preprocesses a document
    Input:
    doc: document to preprocess
    Output:
    tokens: list of cleaned tokens
    '''
    # Tokenize the document
    tokens = word_tokenize(doc)

    # Turn all words to lowercase
    tokens = [token.lower() for token in tokens]

    # Remove stopwords
    stops = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stops]

    # Remove puntuation
    tokens = [token for token in tokens if token not in string.punctuation]
    # tokens = [re.sub(r'[^\w\s]','',token) for token in tokens]

    # Apply stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    # Handle possessivenes
    def handle_possessives(token):
      if token.endswith("'s"):
        return token[:-2]  # Remove the "'s" part
      return token
    tokens = [handle_possessives(token) for token in tokens]

    # Normalize tokens
    tokens = [unidecode(token) for token in tokens]

    # Remove apostrophes
    tokens = [token.replace("'"," ").replace("-"," ") for token in tokens]

    # Remove numbers and empty strings
    tokens = [token for token in tokens if token != "" and not token.isdigit()]

    # Now split any token that contains a space into separate words
    final_tokens = []
    for token in tokens:
        # If the token contains spaces, split it into individual words
        if " " in token:
            final_tokens.extend(token.split())  # Extend adds each word separately to the list
        else:
            final_tokens.append(token)

    return final_tokens

In [51]:
# Test description
text = '''After many years' experience in Michelin-starred restaurants, Luigi Tramontano and his wife Nicoletta
have opened their first restaurant in the chef's native Gargnano. Previously a pasta factory, the building has been converted
into an elegant, contemporary-style restaurant which has nonetheless retained its charming high ceilings.
The cuisine is inspired by regional traditions which are reinterpreted to create gourmet dishes,
all prepared with respect for the ingredients used and a strong focus on local produce.'''

# Test preprocessing on test description
print(preprocessing(text))

['mani', 'year', 'experi', 'michelin', 'star', 'restaur', 'luigi', 'tramontano', 'wife', 'nicoletta', 'open', 'first', 'restaur', 'chef', 'nativ', 'gargnano', 'previous', 'pasta', 'factori', 'build', 'convert', 'eleg', 'contemporary', 'styl', 'restaur', 'nonetheless', 'retain', 'charm', 'high', 'ceil', 'cuisin', 'inspir', 'region', 'tradit', 'reinterpret', 'creat', 'gourmet', 'dish', 'prepar', 'respect', 'ingredi', 'use', 'strong', 'focu', 'local', 'produc']


## 2.1 Conjunctive Query

In [52]:
# Load the restaurants_data.tsv to a pandas DataFrame
df = pd.read_csv('restaurants_data.tsv', sep='\t')

### 2.1.1 Create your Index!

In [54]:
# 1. Vocabulary File

doc_tokens = [] # initialize list to store all tokens

for idx, row in enumerate(df.description):
  doc_tokens.extend(preprocessing(row))
  doc_tokens = list(set(doc_tokens)) # remove duplicates

vocabulary_df = pd.DataFrame({'term': doc_tokens,
                              'term_id': range(len(doc_tokens))})

vocabulary_df.to_csv('vocabulary.csv', index=False)

In [64]:
# 2. Inverted Index

inverted_index = defaultdict(list)

for doc_id, row in enumerate(df.description):
  tokens = preprocessing(row) # preprocess the description
  for token in tokens:
    # Look up the term_id of the current term/token
    term_id = vocabulary_df[vocabulary_df['term']==token]['term_id'].iloc[0]
    # If the doc_id is not in the term_id's list in inverted_index, add it
    if term_id not in inverted_index[term_id]:
      inverted_index[term_id].append(doc_id)


In [65]:
# Save the inverted_index dictionary to a file
with open("inverted_index.pkl", "wb") as file:
    pickle.dump(inverted_index, file)

In [66]:
df.columns

Index(['restaurantName', 'city', 'postalCode', 'country', 'address',
       'priceRange', 'cuisineType', 'description', 'facilitiesServices',
       'creditCards', 'phoneNumber', 'website'],
      dtype='object')

In [78]:
def find_restaurants(query, vocabulary_df, inverted_index, df):
    '''
    Find restaurants that match the given query using the inverted index
    Inputs:
    query: query string
    inverted_index: inverted index dictionary
    df: dataframe with restaurants data
    Outputs:
    restaurants_df: dataframe with restaurants that match the query
    '''
    # Preprocess the query
    query_tokens = preprocessing(query)

    target_docs = []

    try:
      # Retrieve the term_ids for each token in the query
      term_ids = [vocabulary_df[vocabulary_df['term'] == token]['term_id'].iloc[0] for token in query_tokens]

      # Retrieve the document IDs for each term_id (from inverted index)
      # Create a list of sets containing document IDs for each term in the query
      doc_sets = [set(inverted_index[term_id]) for term_id in term_ids]

      # Find the common document IDs across all query terms
      common_docs = set.intersection(*doc_sets)

      # If there are any common documents, add them to target_docs
      if common_docs:
          target_docs.extend(common_docs)

      # Convert target_docs to a list (if it's not already)
      target_docs = list(target_docs)

      # Retrieve the rows that match doc_ids in target_docs
      restaurants_df = df.loc[target_docs][['restaurantName', 'address', 'description', 'website']]

      # Return the DataFrame with the matching restaurants
      return restaurants_df

    except:
      print("No restaurants found for the given query.")

In [85]:
# Query

query = "modern seasonal cuisine" # Example query

# Query Results
find_restaurants(query, vocabulary_df, inverted_index, df)

Unnamed: 0,restaurantName,address,description,website
0,Chichibio,via Guglielmo Marconi 1,"Despite its lack of awards, this restaurant st...",https://guide.michelin.com/en/abruzzo/roccaras...
517,Materia | Spazio Cucina,via Teatro Massimo 29,The entrance to this restaurant is typical of ...,https://guide.michelin.com/en/sicilia/catania/...
7,Savô,piazza XXV Aprile 8,The reopening in 2022 of the Hotel Windsor wit...,https://guide.michelin.com/en/liguria/laiguegl...
1806,Vesta Mare,viale Roma 41,"This typical, elegant Versilian beach club wit...",https://guide.michelin.com/en/toscana/marina-d...
1808,Quadri Bistrot,Via Solferino 48,"A modern bistro with a cocktail-bar, trendy de...",https://guide.michelin.com/en/lombardia/milano...
1938,Retrobottega,via della Stelletta 4,Minimalist decor and clean lines characterise ...,https://guide.michelin.com/en/lazio/roma/resta...
406,Piccolo Lord,corso San Maurizio 69 bis/g,"Professional service in a welcoming, modern re...",https://guide.michelin.com/en/piemonte/torino/...
1303,La Valle,via Umberto I 25 località Valle Sauglio,A well - run restaurant in a quiet area just o...,https://guide.michelin.com/en/piemonte/trofare...
1945,Pipero Roma,corso Vittorio Emanuele II 250,Situated opposite the church of Santa Maria in...,https://guide.michelin.com/en/lazio/roma/resta...
1577,Winter Garden Florence,piazza Ognissanti 1,Horse-drawn carriages once entered the old cou...,https://guide.michelin.com/en/toscana/firenze/...
