# ADM-HWK3: GROUP #14

In [20]:
!pip install unidecode



In [19]:
!pip install aiofiles aiohttp



# 1. Data Collection

Create TXT with urls of each restaurant page (just let it run, usually 2 minutes to finish)

In [21]:
import requests
from bs4 import BeautifulSoup

# URL di partenza
base_url = 'https://guide.michelin.com/en/it/restaurants/page/'

def scrape_restaurant_links():
    print("I'm starting to Scrape!")
    page = 1
    all_links = []

    while True:
        # Costruisci l'URL della pagina corrente
        url = f"{base_url}{page}"
        response = requests.get(url)

        # Verifica che la richiesta sia andata a buon fine
        if response.status_code != 200:
            print(f"Errore nel caricamento della pagina {page}")
            break

        # Parsing della pagina HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Trova tutti i div con la classe specificata
        for class1_div in soup.select("div.card__menu-content.card__menu-content--flex.js-match-height-content"):
            # Cerca il tag <h3> con classe specificata e il tag <a> figlio
            h3 = class1_div.select_one("h3.card__menu-content--title.pl-text.pl-big.js-match-height-title a")
            if h3:
                link = h3.get("href")
                full_link = "https://guide.michelin.com" + link if link else None
                if full_link:
                    all_links.append(full_link)

        # Trova la sezione di paginazione
        pagination_lis = soup.select("div.js-restaurant__bottom-pagination ul li")

        # Trova l'elemento <li> con la classe "active"
        active_index = None
        for i, li in enumerate(pagination_lis):
            if li.select_one("a.active"):
                active_index = i
                break

        # Se c'è una pagina successiva, incrementa il numero di pagina
        if active_index is not None and active_index + 1 < len(pagination_lis):
            next_page = pagination_lis[active_index + 1].select_one("a")
            if next_page and next_page.get("href"):
                page += 1
            else:
                break
        else:
            break

    # Salva tutti i link dei ristoranti in un file
    with open("soupUrls.txt", "w") as file:
        for link in all_links:
            file.write(link + "\n")

    print(f"Scraping completed. {len(all_links)} link saved in soupUrls.txt.")

# Avvia lo scraping
scrape_restaurant_links()

I'm starting to Scrape!
Scraping completed. 1983 link saved in soupUrls.txt.


Download each HTML page using .txt file just created.

In [22]:
#Scaricare html content da url

import aiohttp
import asyncio
import aiofiles
import os

CONCURRENT_REQUESTS = 20  # Lowered to reduce load on the server

async def load_urls(file_path):
    async with aiofiles.open(file_path, 'r') as f:
        urls = [line.strip() for line in await f.readlines()]
    return urls

async def download_url(session, url, output_dir):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        'Referer': 'https://guide.michelin.com/',
        'Accept-Language': 'en-US,en;q=0.9',
    }
    try:
        async with session.get(url, headers=headers) as response:
            if response.status == 200:
                content = await response.text()
                filename = f"{output_dir}/{hash(url)}.html"
                async with aiofiles.open(filename, 'w') as f:
                    await f.write(content)
                print(f"Downloaded: {url}")
            else:
                print(f"Failed to download {url}: Status {response.status}")
    except Exception as e:
        print(f"Error downloading {url}: {e}")

async def download_all(urls, output_dir):
    connector = aiohttp.TCPConnector(limit=CONCURRENT_REQUESTS)
    async with aiohttp.ClientSession(connector=connector) as session:
        tasks = [download_url(session, url, output_dir) for url in urls]
        await asyncio.gather(*tasks)

if __name__ == "__main__":
    file_path = 'soupUrls.txt'
    output_dir = 'downloads'
    os.makedirs(output_dir, exist_ok=True)

    urls = await load_urls(file_path)
    await download_all(urls, output_dir)

Downloaded: https://guide.michelin.com/en/abruzzo/popoli_1845563/restaurant/donevandro
Downloaded: https://guide.michelin.com/en/sicilia/catania/restaurant/menage
Downloaded: https://guide.michelin.com/en/sardegna/cagliari/restaurant/sa-domu-sarda
Downloaded: https://guide.michelin.com/en/toscana/castiglione-della-pescaia/restaurant/la-trattoria-enrico-bartolini
Downloaded: https://guide.michelin.com/en/emilia-romagna/noceto_1827072/restaurant/palazzo-utini
Downloaded: https://guide.michelin.com/en/piemonte/alba/restaurant/ape-vino-e-cucina
Downloaded: https://guide.michelin.com/en/liguria/genova/restaurant/20tre
Downloaded: https://guide.michelin.com/en/piemonte/torino/restaurant/fratelli-bruzzone
Downloaded: https://guide.michelin.com/en/toscana/bibbiena/restaurant/il-tirabuscio262517
Downloaded: https://guide.michelin.com/en/lombardia/milano/restaurant/procaccini
Downloaded: https://guide.michelin.com/en/lombardia/aprica/restaurant/gimmy-s
Downloaded: https://guide.michelin.com/en/t

Scrape each HTML page and create dataframe from data. (TODO: add URL of each restaurant to dataset)

In [23]:
import os
import pandas as pd
import re
from bs4 import BeautifulSoup

# Directory containing the downloaded HTML files
output_dir = 'downloads'

# List to store restaurant data
restaurants_data = []

# Function to extract restaurant information from HTML
def extract_restaurant_info(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

        # Extract information using CSS selectors or HTML structure of the page
        restaurant_info = {}

        # Ricerca div contenente info principali
        restaurantDetailsDiv = soup.find("div", class_="restaurant-details__components")

        # Ottiene tutte le row contenenti: Nome del ristorante (row1),
        # Indirizzo, prezzo e tipo cucina (row2), row 3 da scartare
        mainInfo = restaurantDetailsDiv.select("div.data-sheet > div.row")

        if mainInfo[0]:
            restaurant_info['restaurantName'] = mainInfo[0].find("h1", class_="data-sheet__title").text
        if mainInfo[1]:
            indirizzo_price = mainInfo[1].select("div.data-sheet__block > div.data-sheet__block--text")

            # Splitta la stringa contenente indirizzo, citta, CAP e nazione
            indirizzoList = indirizzo_price[0].text.strip().split(",")

            # Seleziona gli ultimi tre e li assegna a country, postalCode e city, tutto il resto verrà assegnato ad indirizzo
            restaurant_info['city'] = indirizzoList[-3]
            restaurant_info['postalCode'] = indirizzoList[-2]
            restaurant_info['country'] = indirizzoList[-1]
            restaurant_info['address'] = " ".join(indirizzoList[:-3]).strip().replace("\n", "") # Unisce tutti gli elementi precedenti agli ultimi tre

            # Split della riga contenente price e cuisineType info
            restaurant_info['priceRange'], restaurant_info['cuisineType'] = indirizzo_price[1].text.strip().split("·")

            restaurant_info['priceRange'] = restaurant_info['priceRange'].strip()
            # Possibili multiple cuisineType, dividi in lista
            restaurant_info['cuisineType'] = restaurant_info['cuisineType'].strip().split(",")

        # Description
        restaurant_info['description'] = soup.find("div", class_="data-sheet__description").text.strip().replace("\n", "")

        # Facilities and Services
        facilities = soup.select("div.restaurant-details__services ul li")
        restaurant_info['facilitiesServices'] = [s.text.strip() for s in facilities]

        # Accepted Credit Cards
        credit_cards = soup.select("div.list--card img")
        restaurant_info['creditCards'] = [re.search(r"(?<=\/)[a-z]*(?=-)", c.get("data-src"))[0] for c in credit_cards]

        # Phone Number
        spansDetails = restaurantDetailsDiv.select("section.section.section-main.section__text-componets.section__text-separator div.collapse__block-title div.d-flex span")
        restaurant_info['phoneNumber'] = spansDetails[0].text.strip()

        # URL
        restaurant_info['website'] = soup.find("meta", property="og:url")["content"]

    return restaurant_info

# Loop through all files in the directory and extract information
for filename in os.listdir(output_dir):
    if filename.endswith(".html"):
        print(filename)
        file_path = os.path.join(output_dir, filename)
        restaurant_info = extract_restaurant_info(file_path)
        restaurants_data.append(restaurant_info)

# Create a pandas DataFrame
df = pd.DataFrame(restaurants_data)

# Save the data to a CSV file
df.to_csv("restaurants_data.tsv", sep='\t', index=False)
print("Data saved to restaurants_data.csv")

6706894439729502413.html
4153014220348861622.html
-6031369243301873507.html
-7192872647037498508.html
466811023100640000.html
-3883504420800038177.html
4883142669730757591.html
-4309768831515783443.html
8979407818899109607.html
2305095741972052947.html
464086669065753798.html
-8016628866166017019.html
-4945726240752299744.html
-1862626890284924252.html
-1062127915983694768.html
-2391288853130004288.html
-7489653741732853955.html
20129926633294648.html
-4779826814049809605.html
6439505240394296306.html
2026964129338760377.html
4832835010505461473.html
8484576122150917283.html
-547487292752067216.html
377170581107277924.html
7403813488529855440.html
4757595202191428180.html
-7983129608896405636.html
3572998235815623020.html
-3646215321018368336.html
-6494709554291634859.html
-4900184062205703815.html
8400371506885590659.html
-6958575128227690967.html
8173086101722147720.html
-3872465610697458210.html
-3519008496912572058.html
-4827132141011159151.html
-8282993806174867414.html
6210866260

# 2. Search Engine

## 2.0 Preprocessing

In [3]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from unidecode import unidecode
import string
import unicodedata
import pandas as pd
#from nltk.corpus import wordnet as wn
from collections import defaultdict
#nltk.download('wordnet')
#nltk.download('omw-1.4')
import re
import pickle
import numpy as np

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
def preprocessing(doc):
    '''
    Function that preprocesses a document
    Input:
    doc: document to preprocess
    Output:
    tokens: list of cleaned tokens
    '''
    # Tokenize the document
    tokens = word_tokenize(doc)

    # Turn all words to lowercase
    tokens = [token.lower() for token in tokens]

    # Remove stopwords
    stops = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stops]

    # Remove puntuation
    tokens = [token for token in tokens if token not in string.punctuation]
    # tokens = [re.sub(r'[^\w\s]','',token) for token in tokens]

    # Apply stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    # Handle possessivenes
    def handle_possessives(token):
      if token.endswith("'s"):
        return token[:-2]  # Remove the "'s" part
      return token
    tokens = [handle_possessives(token) for token in tokens]

    # Normalize tokens
    tokens = [unidecode(token) for token in tokens]

    # Remove apostrophes
    tokens = [token.replace("'"," ").replace("-"," ") for token in tokens]

    # Remove numbers and empty strings
    tokens = [token for token in tokens if token != "" and not token.isdigit()]

    # Now split any token that contains a space into separate words
    final_tokens = []
    for token in tokens:
        # If the token contains spaces, split it into individual words
        if " " in token:
            final_tokens.extend(token.split())  # Extend adds each word separately to the list
        else:
            final_tokens.append(token)

    return final_tokens

In [5]:
# Test description
text = '''After many years' experience in Michelin-starred restaurants, Luigi Tramontano and his wife Nicoletta
have opened their first restaurant in the chef's native Gargnano. Previously a pasta factory, the building has been converted
into an elegant, contemporary-style restaurant which has nonetheless retained its charming high ceilings.
The cuisine is inspired by regional traditions which are reinterpreted to create gourmet dishes,
all prepared with respect for the ingredients used and a strong focus on local produce.'''

# Test preprocessing on test description
print(preprocessing(text))

['mani', 'year', 'experi', 'michelin', 'star', 'restaur', 'luigi', 'tramontano', 'wife', 'nicoletta', 'open', 'first', 'restaur', 'chef', 'nativ', 'gargnano', 'previous', 'pasta', 'factori', 'build', 'convert', 'eleg', 'contemporary', 'styl', 'restaur', 'nonetheless', 'retain', 'charm', 'high', 'ceil', 'cuisin', 'inspir', 'region', 'tradit', 'reinterpret', 'creat', 'gourmet', 'dish', 'prepar', 'respect', 'ingredi', 'use', 'strong', 'focu', 'local', 'produc']


## 2.1 Conjunctive Query

In [6]:
# Load the restaurants_data.tsv to a pandas DataFrame
df = pd.read_csv('restaurants_data.tsv', sep='\t')

### 2.1.1 Create your Index!

In [115]:
# 1. Vocabulary File

doc_tokens = [] # initialize list to store all tokens

for idx, row in enumerate(df.description):
  doc_tokens.extend(preprocessing(row))
  doc_tokens = list(set(doc_tokens)) # remove duplicates

vocabulary_dict = {term: i for i,term in enumerate(doc_tokens)}

vocabulary_df = pd.DataFrame({'term': vocabulary_dict.keys(),
                              'term_id': vocabulary_dict.values()})

vocabulary_df.to_csv('vocabulary.csv', index=False)

In [128]:
# 2. Inverted Index

inverted_index = defaultdict(list) # initialize inverted_index dictionary
preprocessed_docs = defaultdict(list) # initialize dictionary to store pre-processed docs (restaurant descriptions)

for doc_id, row in enumerate(df.description):
  preprocessed_docs[doc_id] = preprocessing(row) # preprocess the description
  tokens = set(preprocessed_docs[doc_id]) # preprocess the description
  for token in tokens: # eliminate duplicates
    # Look up the term_id of the current term/token
    term_id = vocabulary_dict[token]
    # If the doc_id is not in the term_id's list in inverted_index, add it
    if doc_id not in inverted_index[term_id]:
      inverted_index[term_id].append(doc_id)

In [129]:
''' debugging cell
for term_id, docs in inverted_index.items():
  if len(inverted_index[term_id]) == 0:
    print(f"Term {term_id} has no documents")
'''

In [118]:
# Save the inverted_index dictionary to a file
with open("inverted_index.pkl", "wb") as file:
    pickle.dump(inverted_index, file)

In [119]:
def find_restaurants(query, vocabulary_df, inverted_index, df):
    '''
    Find restaurants that match the given query using the inverted index
    Inputs:
    query: query string
    inverted_index: inverted index dictionary
    df: dataframe with restaurants data
    Outputs:
    restaurants_df: dataframe with restaurants that match the query
    '''
    # Preprocess the query
    query_tokens = preprocessing(query)

    target_docs = []

    try:
      # Retrieve the term_ids for each token in the query
      term_ids = [vocabulary_df[vocabulary_df['term'] == token]['term_id'].iloc[0] for token in query_tokens]

      # Retrieve the document IDs for each term_id (from inverted index)
      # Create a list of sets containing document IDs for each term in the query
      doc_sets = [set(inverted_index[term_id]) for term_id in term_ids]

      # Find the common document IDs across all query terms
      common_docs = set.intersection(*doc_sets)

      # If there are any common documents, add them to target_docs
      if common_docs:
          target_docs.extend(common_docs)

      # Convert target_docs to a list (if it's not already)
      target_docs = list(target_docs)

      # Retrieve the rows that match doc_ids in target_docs
      restaurants_df = df.loc[target_docs][['restaurantName', 'address', 'description', 'website']]

      # Return the DataFrame with the matching restaurants
      return restaurants_df

    except:
      print("No restaurants found for the given query.")

In [106]:
# Query
query = "modern seasonal cuisine" # Example query

# Query Results
find_restaurants(query, vocabulary_df, inverted_index, df)

Unnamed: 0,restaurantName,address,description,website
512,[àbitat],via Henry Dunant 1,"A young, enthusiastic and professional couple ...",https://guide.michelin.com/en/lombardia/san-fe...
1924,Winter Garden Florence,piazza Ognissanti 1,Horse-drawn carriages once entered the old cou...,https://guide.michelin.com/en/toscana/firenze/...
520,Mima,via Madonnelle 9,You’ll be won over by the seasonal Mediterrane...,https://guide.michelin.com/en/campania/vico-eq...
18,Ronchi Rò,località Cime di Dolegna 12,Ronchi Rò is an estate-cum-agriturismo surroun...,https://guide.michelin.com/en/friuli-venezia-g...
1561,Degusteria del Gigante,via degli Anelli 19,Situated in the charming historic centre of th...,https://guide.michelin.com/en/marche/san-bened...
25,Osteria Taviani,piazza Vittorio Emanuele II 28,"This pleasant, warmly decorated restaurant is ...",https://guide.michelin.com/en/toscana/bientina...
539,Ca' Del Moro,località Erbin 31,Situated within the La Collina dei Ciliegi win...,https://guide.michelin.com/en/veneto/grezzana/...
30,Savô,piazza XXV Aprile 8,The reopening in 2022 of the Hotel Windsor wit...,https://guide.michelin.com/en/liguria/laiguegl...
1311,Il Luogo Aimo e Nadia,via Montecuccoli 6,This long-established restaurant has been part...,https://guide.michelin.com/en/lombardia/milano...
673,Contrasto,via Roma 55,"Having returned to his native village, owner-c...",https://guide.michelin.com/en/molise/cercemagg...


## 2.2 Ranked Search Engine with TF-IDF and Cosine Similarity

### 2.2.1 Inverted Index with TF-IDF Scores

In [120]:
def tf_idf(term_id, inverted_index, preprocessed_docs, vocabulary_df, n):
  '''
  Calculate the TF-IDF scores for a given term
  Inputs:
  term_id: term id
  inverted_index: dictionary storing the documents that each term appears in
  preprocessed_docs: dictionary storing all the preprocessed documents
  vocabulary_df: dataframe containing the vocabulary of terms
  n = total number of documents
  Output:
  tf_idf_scores: vector of TF-IDF scores for the given term
  '''

  term = vocabulary_df['term'][term_id] # get term from term_id
  n_term = len(inverted_index[term_id]) # number of documents that contain the term
  #print(n_term)
  IDF = np.log10(n / n_term) # calculate IDF of the term, inverse document frequency
  #print(f"IDF= {IDF}")
  tf_idf_scores = [] # initialize list to store TF-IDF scores

  for doc_id in inverted_index[term_id]:
    #TF = preprocessed_docs[doc_id].count(term) / len(preprocessed_docs[doc_id]) # RELATIVE term frequency (tf)
    TF = preprocessed_docs[doc_id].count(term) # raw term frequency (tf)
    #print(f"TF = {TF}")
    tf_idf_scores.append(TF * IDF) # calculate TF-IDF score

  return tf_idf_scores


In [121]:
# @title Compute updated_inverted_index

n = len(preprocessed_docs)
updated_inverted_index = defaultdict(list) # initialize default dictionary to store the inverted_index values with TF-IDF scores

# Create a copy of the inverted_index to iterate over
inverted_index_copy = inverted_index.copy()

for term_id, docs in inverted_index_copy.items():
  tf_idf_scores = tf_idf(int(term_id), inverted_index, preprocessed_docs, vocabulary_df, n)
  updated_inverted_index[term_id] = list(zip(docs, tf_idf_scores))

with open('updated_inverted_index.pkl', 'wb') as file:
    pickle.dump(updated_inverted_index, file)

In [122]:
# @title Compute the TF-IDF vectors of all documents and store them in a file

doc_tf_idf_scores = defaultdict(list) # initialize dictionary to store non-zero TF-IDF scores for each document

for term_id, docs_scores in updated_inverted_index.items():
  for doc_id, tf_idf_score in docs_scores:
    if tf_idf_score != 0:
      doc_tf_idf_scores[doc_id].append((term_id,tf_idf_score))
  doc_tf_idf_scores[doc_id].sort(key=lambda x: x[0]) # sort the terms

with open('doc_tf_idf_scores.pkl', 'wb') as file:
    pickle.dump(doc_tf_idf_scores, file)

In [162]:
def top_k_restaurants(query, inverted_index, vocabulary_dict, doc_tf_idf_scores, df, k=5):
  '''
  Find the top k restaurants that match the given query using the TF-IDF scores
  Inputs:
  query: query string
  inverted_index: inverted index dictionary
  vocabulary_dict: dictionary containing the vocabulary of terms and their indeces
  doc_tf_idf_scores: dictionary storing the TF-IDF scores for each term in each document
  df: dataframe with restaurants data
  k: number of restaurants to return
  Outputs:
  restaurants_df: dataframe with restaurants that match the query
  '''
  processed_query = preprocessing(query) # processed query
  query_tokens = list(set(processed_query)) # unique query tokens
  # print(query_tokens) # debugging line
  # Find all docs to consider
  docs_to_consider = [] # initialize list to store documents to consider (non-zero intersection with the query tokens)

  for token in query_tokens:
    if vocabulary_dict[token]: # check if the token is in the vocabulary
      token_id = vocabulary_dict[token] # get the term_id of the token
      docs_to_consider.extend(inverted_index[token_id]) # add the documents that contain the token to the docs to consider

  docs_to_consider = list(set(docs_to_consider)) # remove duplicates

  # Calculate the TF-IDF score of the query
  query_tf_idf_scores = [] # initialize list to store the TF-IDF scores of the query
  for term in query_tokens:
    term_id = vocabulary_dict[term] # get the term_id of the term
    #print(inverted_index[term_id]) # debugging line
    n_term = len(inverted_index[term_id]) # number of documents that contain the term
    IDF = np.log10(n / n_term) # calculate IDF of the term
    TF = processed_query.count(term) # calculate TF of the term
    #print(f"TF = {TF}") # debugging line
    #print(f"IDF = {IDF}") # debuggin line
    query_tf_idf_scores.append((term_id, TF * IDF)) # calculate TF-IDF score

  query_tf_idf_scores.sort(key=lambda x: x[0]) # sort the query_tf_idf_scores in order of term_id

  query_norm = np.linalg.norm(np.array([score for _, score in query_tf_idf_scores])) # calculate the norm of the query
  #print(f"query tf_idf_scores: {query_tf_idf_scores}") # debuggin line
  #print(f"query norm: {query_norm}") # debugging line
  # calculate document norms
  doc_norms = {doc_id: np.linalg.norm(np.array([doc_tf_idf_scores[doc_id][i][1] for i in range(len(doc_tf_idf_scores[doc_id]))])) for doc_id in docs_to_consider}

  # Function that returns two lists of tuples (term, query_tf_idf) and (term, doc_tf_idf) such that
  # the terms are in the intersection of the query terms and the doc's terms
  def query_doc_intersection(query_tf_idf_scores, doc_tf_idf_scores):
    '''
    Calculate the intersection of the query and the document
    Inputs:
    query_terms: list of sorted unique query terms
    doc_terms: list of sorted unique document terms
    Output:
    query_intersection: list of tuples (term, query_tf_idf)
    doc_intersection: list of tuples (term, doc_tf_idf)
    '''
    query_intersection = [] # initialize list to store (term, query_tf_idf) tuples in the intersection
    doc_intersection = [] # initialize list to store (term, doc_tf_idf) tuples in the intersection
    i, j = 0, 0 # initialize two pointers
    while i<len(query_tf_idf_scores) and j<len(doc_tf_idf_scores):
      if query_tf_idf_scores[i][0] == doc_tf_idf_scores[j][0]:
        query_intersection.append(query_tf_idf_scores[i])
        doc_intersection.append(doc_tf_idf_scores[j])
        i += 1
        j += 1
      elif query_tf_idf_scores[i][0] < doc_tf_idf_scores[j][0]:
        i += 1
      else:
        j += 1
    return query_intersection, doc_intersection

  # Calculate cosine-similarity between the query and each document
  cosine_similarity = defaultdict(float) # initialize dictionary to store the cosine similarity results
  for doc_id in docs_to_consider:
    query_intersection, doc_intersection = query_doc_intersection(query_tf_idf_scores, doc_tf_idf_scores[doc_id]) # find the
    cosine_similarity[doc_id] = np.dot(np.array([score for _, score in query_intersection]), np.array([score for _, score in doc_intersection])) / (query_norm * doc_norms[doc_id])

  # Sort the cosine similarities in descending order
  sorted_cosine_similarity = sorted(cosine_similarity.items(), key=lambda x: x[1], reverse=True) # list of tuples

  # Get the top k restaurants
  top_k_restaurants = sorted_cosine_similarity[:min(k,len(sorted_cosine_similarity))]

  '''
  Information to store in the end data frame:
  restaurantName
  address
  description
  website
  Similarity score (between 0 and 1)
  '''

  top_k_restaurant_idx = [doc_id for doc_id, _ in top_k_restaurants]
  top_k_restaurant_scores = [score for _, score in top_k_restaurants]
  #print([score for _, score in top_k_restaurants])

  # build result dataframe
  restaurants_df = df.loc[top_k_restaurant_idx][['restaurantName', 'address', 'description', 'website']]
  restaurants_df['Similarity score'] = top_k_restaurant_scores

  return restaurants_df

In [158]:
''' debugging cell
for term_id, docs in inverted_index.items():
  if len(inverted_index[term_id]) == 0:
    print(f"Term {term_id} has no documents")
'''

In [165]:
query = 'modern seasonal cuisine' # Example query
k = 6 # number of restaurants to display in the results

with open('inverted_index.pkl', 'rb') as file:
    inverted_index = pickle.load(file)

top_k_restaurants(query, inverted_index, vocabulary_dict, doc_tf_idf_scores, df, k)

Unnamed: 0,restaurantName,address,description,website,Similarity score
1319,La Botte,via Giuseppe Garibaldi 8,A modern and welcoming contemporary bistro sit...,https://guide.michelin.com/en/piemonte/stresa/...,0.22759
1700,RistoFante,via Mazzini 41,The motto of this restaurant is “In step with ...,https://guide.michelin.com/en/lombardia/alzano...,0.169987
1216,Chichibio,via Guglielmo Marconi 1,"Despite its lack of awards, this restaurant st...",https://guide.michelin.com/en/abruzzo/roccaras...,0.164407
1455,Guallina,via Molino Faenza 19 località Guallina,Situated in a small house in an outlying villa...,https://guide.michelin.com/en/lombardia/mortar...,0.163298
485,20Tre,via David Chiossone 20 r,Situated in the heart of Genoa’s historic cent...,https://guide.michelin.com/en/liguria/genova/r...,0.162945
1887,La Corte,via San Pancrazio 41,Mediterranean-style dishes take pride of place...,https://guide.michelin.com/en/lombardia/palazz...,0.145908
