In [77]:
#Importing all necessary libraries
import numpy as np
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
from IPython.display import HTML
import re
#this is the basic url we will need to use
url = 'https://guide.michelin.com/en/it/restaurants'

response = requests.get(url)
raw_html = response.text
html = BeautifulSoup(raw_html, "html.parser")

In [78]:
#This selects the buttons corresponding to the page selectors at the bottom of the page. 
# The rightmost button is always the last page, so we can extract the total number of pages from its href.
pages = html.select('.btn-outline-secondary:not(.active)')
page_count = 1

for page in pages:
  #e.g. 'www.example.com/page/15 -> number = 15
  number = int(page.get('href').split('/')[-1])
  if number > page_count:
    page_count = number
urls = []

#The URLs for all the pages follow a common pattern, 
# so we can construct the URLs as soon as we know the total amount of pages.

urls.append(url)

for i in range(2, page_count + 1):
  new_url = url + '/page/' + str(i)
  urls.append(new_url)

#Making the directories for sorting all the html files in.

for page in range(1, page_count + 1):
  dir_name="Page " + str(page)
  os.mkdir(dir_name)

print(f"here are the urls of the first 10 pages:\n")
display(urls[0:10])

here are the urls of the first 10 pages:



['https://guide.michelin.com/en/it/restaurants',
 'https://guide.michelin.com/en/it/restaurants/page/2',
 'https://guide.michelin.com/en/it/restaurants/page/3',
 'https://guide.michelin.com/en/it/restaurants/page/4',
 'https://guide.michelin.com/en/it/restaurants/page/5',
 'https://guide.michelin.com/en/it/restaurants/page/6',
 'https://guide.michelin.com/en/it/restaurants/page/7',
 'https://guide.michelin.com/en/it/restaurants/page/8',
 'https://guide.michelin.com/en/it/restaurants/page/9',
 'https://guide.michelin.com/en/it/restaurants/page/10']

In [79]:
restaurants = []
base_url_restaurants = 'https://guide.michelin.com'

In [80]:
# By iterating through the pages and the restaurants in them, 
# we can build the URLs for each restaurant by appending its relative href to the common beginning they all share. 
# Each page has up to 20 restaurants.

for url in urls:
  response = requests.get(url)
  raw_html = response.text
  html = BeautifulSoup(raw_html, "html.parser")
  links = html.select('a[class = "link"]')
  hrefs = [link.get('href') for link in links]
  new_urls = [base_url_restaurants + href for href in hrefs]
  for n in new_urls:
    restaurants.append(n)

restaurants[0:10]

['https://guide.michelin.com/en/campania/gragnano/restaurant/o-me-o-il-mare',
 'https://guide.michelin.com/en/abruzzo/popoli_1845563/restaurant/donevandro',
 'https://guide.michelin.com/en/piemonte/alba/restaurant/ape-vino-e-cucina',
 'https://guide.michelin.com/en/campania/sorrento/restaurant/da-bob-cook-fish',
 'https://guide.michelin.com/en/basilicata/matera/restaurant/da-mo',
 'https://guide.michelin.com/en/sardegna/cagliari/restaurant/sa-domu-sarda',
 'https://guide.michelin.com/en/sicilia/palermo/restaurant/charleston',
 'https://guide.michelin.com/en/toscana/bibbiena/restaurant/il-tirabuscio262517',
 'https://guide.michelin.com/en/emilia-romagna/cesenatico/restaurant/la-buca130947',
 'https://guide.michelin.com/en/campania/marina-di-casal-velino/restaurant/alessandro-feo']

In [81]:
#File to write the dataset to
filename = "michelin.tsv"
rows = []

for i in range(0, page_count):
  
  for j in range(20):
    #For example, the 35th restaurant will be the 15th restaurant on the 2nd page, with 1*20 + 14 restaurants behind it.
    nr = j + i*20
    #Since the first n-1 pages have 20 restaurants, but the last one may have less,
    #  we want to break out of the loop when the index goes out of bounds - it means we've gone through all restaurants.
    try:
      r = requests.get(restaurants[nr])
    except:
      break
    #Name to be used for writing the HTML file to the appropriate page folder.
    name = restaurants[nr].split('/')[-1]
    path = '/Users/augustodeluzenberger/Documents/Python/ADM-HW3/Page '+ str(i+1) + '/' + name + '.html'
    
    with open(path, 'wb+') as f:
      f.write(r.content)
    raw_html = r.text
    html = BeautifulSoup(raw_html, "html.parser")
    #The restaurant's name is in the first <h1> element on its page.
    rname = html.select_one('h1').text
    
    # Most of the attributes we are scraping come from the div with this class.
    # Temp contains two lines of text with a heavy amount of whitespace that
    # needs to be stripped.
    temp = html.select('.data-sheet__block--text')[:2]
    attrs = []
    
    for t in temp:
      attrs.append(t.text.strip())
    
    #Each row has the attributes we need delimited in a different way, so we split them with different separators.
    first_row = attrs[0].split(',')
    second_row = attrs[1].split('·')
    price = second_row[0].strip()
    categories = second_row[1].strip().split(',')
    
    #This indexing is made necessary by there being more fields than we're interested in for the city column, 
    # with some restaurants also having a neighborhood/comune listed with a comma next to the city.
    #Taking only the -3rd index allows to retain only the city.
    
    address, postal_code, country, city = first_row[0], first_row[-2], first_row[-1], first_row[-3]
    
    #We need to substitute away expressions like \r and \n from the scraped address and desc rows,
    #because they interfere with the tsv parsing and what should be one entry gets split into two
    
    address = re.sub(r'[\r\n\t\f]+', ' ', address)
    desc = html.select('.data-sheet__description')[0].text.strip()
    desc = re.sub(r'[\r\n\t\f]+', ' ', desc)
    
    #The services are an unordered list, so we can obtain a list of them from the get go instead of splitting.
    services = []
    
    for s in html.select('.restaurant-details__services ul li'):
      services.append(s.text.strip())
    
    #There is no alt text for the credit card images, but the credit card types are contained in the image source file name, 
    # so we take a capitalized version of that.
    cards = []
    
    for card in html.select('.restaurant-details__services .list--card .lazy'):
      c = card.get('data-src').split('/')[4].split('-')[0].capitalize()
      cards.append(c)
    phone = html.select('.collapse__block span')[0].text.strip()
    #A small number of restaurants do not have a website listed, so the try block serves to avoid a NoneType error.
    try:
      website = html.select('.collapse__block a')[1].get('href')
    except:
      website = ""
    #Each row is delimited by tabs in order to build a TSV instead of CSV, as instructed.
    row = "\t".join([rname, address, city, postal_code, country, price, str(categories), desc, str(services), str(cards), phone, website])
    rows.append(row)


In [82]:
#finally we create the tsv file

with open(filename, mode="w") as out:
  #The first row will be used as a header containing column names when reading the dataset.
  c = ["restaurantName", "address", "city", "postalCode", "country", "priceRange", "cuisineType", "description", "facilitiesServices", "creditCards", "phoneNumber", "website"]
  columns = "\t".join(c)
  #The print function is used with stdout changed for the file, because it automatically writes a newline after each call (unlike other file writing methods),
  #  making it convenient for making sure the rows are written properly.
  print(columns, file = out)
  for row in rows:
    print(row, file=out)


---

# 2.0

In [83]:
import pandas as pd
import ast
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import defaultdict
import csv
import numpy as np

# Get the English stopwords list
nltk.download('stopwords')
# Set the display option to show the full width of each column
pd.set_option('display.max_colwidth', None)
df = pd.read_csv('michelin.tsv', sep='\t', header=0)

# we remove the square brakets and ' from text
# This will come useful later for the map visualization
df['cuisineType'] = df['cuisineType'].str.replace('[', '', regex=False).str.replace(']', '', regex=False).str.replace('\'', '', regex=False)
df['facilitiesServices'] = df['facilitiesServices'].str.replace('[', '', regex=False).str.replace(']', '', regex=False).str.replace('\'', '', regex=False)
df['creditCards'] = df['creditCards'].str.replace('[', '', regex=False).str.replace(']', '', regex=False).str.replace('\'', '', regex=False)


df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/augustodeluzenberger/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,restaurantName,address,city,postalCode,country,priceRange,cuisineType,description,facilitiesServices,creditCards,phoneNumber,website
0,O Me O Il Mare,Via Roma 45/47,Gragnano,80054,Italy,€€€€,"Italian Contemporary, Modern Cuisine","Known around the world as the town of pasta, Gragnano is still home to small-scale pasta-makers renowned for their top-quality pasta. To reinforce this fact, this restaurant is housed in a building (dating back to 1695) once used as pasta factory. Now boasting a modern, spacious dining room with a vaulted ceiling and open-view kitchen, the restaurant serves three tasting menus, all with strong links to the region but also featuring more creative touches. The region also takes pride of place on the wine list – it’s worth asking the talented and experienced sommelier for her recommendations.","Air conditioning, Interesting wine list, Wheelchair access","Amex, Dinersclub, Mastercard, Visa",+39 081 620 0550,http://omeoilmare.com
1,Donevandro,via Garibaldi 2,Popoli,65026,Italy,€€,"Contemporary, Seasonal Cuisine","Up until a few years ago, the owner-chef at this restaurant was working as a painter – a fact that is evident from the artistic touch in his cuisine. His recipes are modern and personalised, with careful attention naturally paid to harmonious presentation, while the flavour of his dishes is brought out by ingredients that are skilfully chosen from the Abruzzo inland area. In 2024, the restaurant moved to new, centrally located premises which have an intimate feel and are elegant and minimalist in style.",Air conditioning,"Mastercard, Visa",+39 388 887 6858,http://www.donevandroristorante.it
2,Ape Vino e Cucina,Piazza Risorgimento 3,Alba,12051,Italy,€€,"Piedmontese, Contemporary","This attractive restaurant in the heart of Alba, the gastronomic capital of the Langhe, is home to chef Damiano Nigro who has previous experience in Michelin-starred restaurants in the area. The restaurant serves aperitif snacks, tapas and top-quality cocktails alongside its delicious, generously portioned meals which, as you might expect in Piedmont, have a traditional flavour. The pasta, bread, desserts and ice-cream are all homemade, while on our last visit in the summer we particularly enjoyed the maccheroncini with scampi and peppers, and the exquisite Fassona beef with a red-wine sauce and seasonal vegetables. Booked ahead is recommended.","Air conditioning, Terrace, Wheelchair access","Amex, Dinersclub, Maestrocard, Mastercard, Visa",+39 0173 363453,https://www.apewinebar.it/alba/
3,Da Bob Cook Fish,largo Parsano vecchio 16,Sorrento,80067,Italy,€€,Seafood,"Working in partnership with the nearby fishmongers which supplies it with fresh fish and seafood, this restaurant occupies a building once used for storing oranges that dates back 400 years. Although there is a menu, given the restaurant’s close relationship with the fishmongers we recommend asking the serving staff about the catch of the day and taking a look in the glass cabinet where the fish are displayed. Dishes are classic and traditional, allowing the full flavour of the fresh fish to shine through.","Air conditioning, Terrace","Amex, Dinersclub, Mastercard, Visa",+39 081 1778 3873,https://www.dabobcookfish.com/
4,DA_MÓ,Via Bruno Buozzi 20,Matera,75100,Italy,€€,"Regional Cuisine, Contemporary","This new, restored restaurant in the upper part of the Sassi (yet still near the centre of Matera and its car parks) has a warm friendly ambience and boasts a small, charming outdoor space. Here, an entire family (father in the kitchen, mother and daughter front of house), who moved to Matera from Venosa (home of the Latin poet Horatio), serves regional cuisine with a contemporary twist and no little skill and professionalism. Various tasting menus are available, as well as an à la carte. An interesting wine list completes the picture.","Air conditioning, Terrace","Amex, Dinersclub, Mastercard, Visa",+39 0835 686548,https://www.damoristorante.it/


### 2.0.0) Preprocessing the Text

Before building the search engine, you must clean and prepare the text in each restaurant’s description. We will:

- Remove stopwords.
- Remove punctuation.
- Apply stemming.
- Perform any other necessary cleaning to improve search accuracy.

For this, we use the nltk library.



In [84]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Let's create a function to remove the stopwords and ponctuation marks

def PROC(text):
    # Tokenize the text
    
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)

    tokens = word_tokenize(text)
    
    # Remove stopwords and make words lowercase
    
    filtered_tokens = [word.lower() for word in tokens if word.lower() not in stop_words]

    
    filtered_tokens = [stemmer.stem(word) for word in filtered_tokens]

    return filtered_tokens


In [85]:
def build_vocabulary(documents):
    # Initialize a defaultdict to automatically assign unique term IDs
    vocabulary = defaultdict(lambda: len(vocabulary) + 1)
    tokens_set = []
    new_column = []
    
    # Populate vocabulary with unique terms and assign term IDs
    for i, doc in enumerate(documents):
        processed = PROC(doc)
        tokens_set += processed
        new_column.append(processed)

    df['Proc_Descr'] = new_column
        

    for token in set(tokens_set):
        _ = vocabulary[token]   # Access the token in the vocabulary to trigger the assignment
    
    # Convert defaultdict to a regular dictionary
    vocabulary = dict(vocabulary)
    
    # Write vocabulary to a CSV file
    with open('vocabulary.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['term', 'term_id'])
        for term, term_id in vocabulary.items():
            writer.writerow([term, term_id])
    


build_vocabulary(df['description'])

In [86]:
def build_inverted_index(documents, vocabulary):

    voc = pd.read_csv(vocabulary)

    # Convert the DataFrame to a dictionary
    voc = voc.set_index('term')['term_id'].to_dict()
    
    inverted_index = defaultdict(list)
    
    for doc_id, doc in enumerate(documents):
        for token in voc:  # Use set to avoid duplicate entries in the same document
            term_id = voc[token]
            if token in doc:
                inverted_index[term_id].append(doc_id)
    
    inverted_index = dict(inverted_index)

    with open('Inverted_Index.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['term_id', 'document_id'])
        for term, term_id in inverted_index.items():
            writer.writerow([term, term_id])
    
    

build_inverted_index(df['Proc_Descr'], 'vocabulary.csv')



In [87]:
def query_fun(query, df):

    vocabulary = pd.read_csv('vocabulary.csv')
    # Convert the DataFrame to a dictionary
    vocabulary = vocabulary.set_index('term')['term_id'].to_dict()
    
    inv_index  = pd.read_csv('Inverted_Index.csv')
    #corrects the strings back to the lists they were
    inv_index['document_id'] = inv_index['document_id'].apply(ast.literal_eval)
    inv_index = inv_index.set_index('term_id')['document_id'].to_dict()

    query_tokens = PROC(query)

    doc_results = []

    for token in query_tokens:
        if token in vocabulary:
            idx = vocabulary[token]
            
            doc = inv_index[idx]
            
            doc_results.append(doc)
        
    
    # Find the intersection of document IDs that contain all query terms
    if doc_results:
        intersection = set(doc_results[0]).intersection(*doc_results[1:])
    else:
        intersection = set()
    
    output = df[['restaurantName', 'address', 'description', 'website']].loc[list(intersection)]
    
    output = output.rename(columns={'restaurantName': 'Restaurant Name','address': 'Address','description': 'Description','website': 'Website'
    })

    display(output.head())
    

query_fun("modern seasonal cuisine", df)


        

Unnamed: 0,Restaurant Name,Address,Description,Website
1281,Flurin,Laubengasse 2,"Flurin occupies an old medieval tower in Glorenza, a small hamlet in the Val Venosta which is surrounded by walls that also date back to the Middle Ages. Furnished in a modern style that contrasts with its old vaulted ceiling, the restaurant serves contemporary cuisine that the enthusiastic owner-chef prepares from carefully selected seasonal ingredients from the region. His creative and imaginative recipes include dishes such as Alpine char ceviche with grapefruit, cucumber and sesame seeds. A bar and guest suites complete the picture.",https://www.flurin.it
1546,[àbitat],via Henry Dunant 1,"A young, enthusiastic and professional couple has taken over the reins at this modern and welcoming restaurant, serving cuisine that is inspired by a sustainable and seasonal approach. Ingredients include herbs harvested from their own garden, as well as typical produce from the mountains and the sea, all sourced from trusted suppliers who demonstrate full respect for nature and animals. Food waste is kept to a minimum to ensure a circular-based cuisine.",https://www.abitatproject.it
1678,Osteria Taviani,piazza Vittorio Emanuele II 28,"This pleasant, warmly decorated restaurant is run with passion and enthusiasm by Elena and Alessandro, the former supervising the front of house, the latter at the helm in the kitchen. The cuisine here is modern and full of flavour, with a focus on meat dishes (although a few fish options also feature) with their roots firmly anchored in Tuscan traditions (such as the fried steak). During the hunting season, the contemporary-style game dishes are particularly delicious. The wine list includes a few French labels.",
17,20Tre,via David Chiossone 20 r,"Run by three partners, this contemporary-style restaurant is situated in the heart of Genova’s historic centre. The cuisine here is prepared using the best seasonal ingredients, all presented in an array of modern dishes. Meat and fish feature on the menu, with a focus on regional recipes and the occasional Asian twist (for example, the amberjack and ponzu bonbon).",https://www.ristorante20tregenova.it/
913,Pipero Roma,corso Vittorio Emanuele II 250,"Situated opposite the church of Santa Maria in Vallicella (known to locals as Chiesa Nuova or new church), this restaurant is named after its owner and maître-d’ Alessandro Pipero. Young chef Ciro Scamardella from Campania is at the helm in the kitchen, where he prepares modern dishes with a focus on the seasons, including a few specialities from his native region. His cuisine demonstrates a continuous search for balance and colour – the recipe that particularly impressed our inspectors was his reinterpretation of mussel soup, in which the full flavour of the sea shone through. A second dining room on the first floor, decorated in the same refurbished style, is available for small groups and private dinners.",https://www.piperoroma.it/


## 2.2 Ranked Search Engine with TF-IDF and Cosine Similarity
For the second search engine, given a query, retrieve the top-k restaurants ranked by relevance to the query.

## 2.2.1 Inverted Index with TF-IDF Scores

- tfIdf Scores: Calculate TF-IDF scores for each term in each restaurant’s description.
- Updated Inverted Index: Build a new inverted index where each entry is a term, and the value is a list of tuples containing document IDs and TF-IDF scores.


In [88]:
N = len(df['Proc_Descr'])

vocabulary = pd.read_csv('vocabulary.csv')
# Convert the DataFrame to a dictionary
vocabulary = vocabulary.set_index('term')['term_id'].to_dict()

def tf(token, document):
    return document.count(token)/ float(len(document))

# create a dictionary with the IDF values for each term in the vocabulary
IDF_dict = {}
    
for token in vocabulary:
    count = 0
    for doc in df['Proc_Descr']:
        if token in doc:
            count +=1
    IDF_dict[token] = float(np.log(N/count))




In [89]:
# create the inverted index
inverted_index = defaultdict(list)

for token in vocabulary: #this iterates over the vocabulary keys, in other words the terms
    for i ,doc in enumerate(df['Proc_Descr']): 
        #here I don't need to check that the term is in the document, I want this index to contain the zeros TF
        #this is useful to construct the matrix with tfidf scores later and do the dot product with vectors all of the same lenght
        TF = tf(token, doc)
        inverted_index[vocabulary[token]].append((i, TF*IDF_dict[token]))

inverted_index = dict(inverted_index)

In [90]:
# Optionally, save the inverted index to a CSV file
with open('Updated_Inverted_Index.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['term_id', 'document_id and TF-IDF'])
    for term, postings in inverted_index.items():
        writer.writerow([term, postings])

## 2.2.2 Execute the Ranked Query
For the ranked search engine:

- Process the query terms.
- Use Cosine Similarity to rank matching restaurants based on the TF-IDF vectors of the query and each document.
- Return the top-k results or all matching restaurants if fewer than k have non-zero similarity.

In [91]:
import heapq
import math
from collections import defaultdict
import ast


# Step 1: Use the Inverted index to create a matrix, with each column corresponding to a normalized document vector
doc_matrix = defaultdict(list)

for term_id in inverted_index:

    for doc_id, doc_tf_idf in inverted_index[term_id]:

        doc_matrix[doc_id].append(float(doc_tf_idf)) 
    
#normalization

for key in doc_matrix:
    doc_matrix[key] = np.array(doc_matrix[key]) / np.linalg.norm(np.array(doc_matrix[key]))


def fast_cosine_score(query, K, IDF):

    scores = defaultdict(float)
    
    #preprocess query and then create the tfidf vector for the query
    query_tokens = set(PROC(query))

    query_tokens = [i for i in list(query_tokens) if i in vocabulary]
    
    vector_query = np.array([IDF[token] * tf(token, query_tokens) for token in vocabulary])

    vector_query /= np.linalg.norm(vector_query)

    #calculate the score for each document

    for doc_id, doc_tf_idf in inverted_index[vocabulary[token]]:
        
        scores[doc_id] = np.dot(vector_query, doc_matrix[doc_id])

    #Retrieve the top K documents
    top_k_docs = heapq.nlargest(K, scores.items(), key=lambda x: x[1])

    return top_k_docs

def print_top_k(top_docs):
    doc_results = []
    score_results = []

    for result in top_docs:
    
        doc_id, score = result
        doc_results.append(doc_id)
        score_results.append(score)

    data_results = df[['restaurantName','address','description','website']].loc[doc_results]

    data_results['Similarity Score'] = score_results

    display(data_results)

In [92]:
# Example usage #1: search with the entire text of a document in the collection
query = str(df['Proc_Descr'].loc[1672])
top_k = 5
top_docs = fast_cosine_score(query, top_k, IDF_dict)
print_top_k(top_docs)

Unnamed: 0,restaurantName,address,description,website,Similarity Score
1672,Gennaro Di Pace,vicolo della Chiesa 8,The owner-chef’s Calabrian origins are evident on a menu that combines Mediterranean cuisine (including fish and seafood) with Piedmontese specialities and showcases beautifully presented dishes prepared using contemporary techniques. This small modern restaurant situated near Perno castle is also open at lunchtime (by prior reservation only).,https://gennarodipace.it/,0.943742
1428,Mirepuà Food Lab,via Umberto I 69,"Situated in a picturesque medieval village with a castle in the Monferrato hills, this restaurant serves classic Piedmontese cuisine with plenty of meat options, alongside a few specialities from the chef’s native Liguria. Raw fish, Piedmontese-style fritto misto and grilled meat and vegetables are available by prior reservation.",https://www.mirepua.it/,0.237324
263,Trequarti,piazza del Donatore 3/4,This modern and minimalist-style restaurant with three small dining rooms full of character provides the backdrop for contemporary-style cuisine which is constantly evolving. The menu includes a few snack options for those in a hurry or looking to sample a variety of dishes in smaller portions. Open at lunchtime by prior reservation.,https://www.ristorantetrequarti.com/,0.236425
557,La Perla del Mare,via della Meloria 9,The occasional creative touch is evident in the fish and seafood dishes served in this welcoming Mediterranean - style restaurant with an attractive outdoor terrace. The restaurant stands right on the beach!,https://laperladelmare.it/,0.169946
1741,Osteria dell'Oca Bianca,via Umberto I 2,"In the heart of a small town, right opposite the church, this typical village osteria run by a friendly family serves traditional cuisine from the Piedmont, including goose dishes. Highlights here include the rustic, welcoming ambience, an excellent wine list, a wine cellar that can be visited, and a charming glass-fronted veranda that is used throughout the year. There are also three beautiful guestrooms reserved exclusively for guests dining at the restaurant.",https://www.osteriadellocabianca.it/,0.166317


In [93]:
# Example usage #2: search with custom query
query = "tasty italian food"
top_k = 5
top_docs = fast_cosine_score(query, top_k, IDF_dict)
print_top_k(top_docs)

Unnamed: 0,restaurantName,address,description,website,Similarity Score
1710,Sot'Ajarchi,via Marconi 93,"An informal atmosphere in a small trattoria where you can really feel at ease, perhaps in company, eating tasty seafood dishes, based on the daily fish catch.",https://www.ristorantesotajarchi.it/,0.276789
1517,Al Carroponte,via De Amicis 4,"This restaurant is renowned in Bergamo for its good choice of varied cuisine, as well as its lively, buzzing atmosphere. The excellent, tasty cuisine is typically Italian and contemporary in style, and includes luxury ingredients such as lobster and caviar, while the “classic” à la carte is enhanced by a selection of cured hams, cheeses and finger food. But the Carroponte’s beating heart is owner Oscar Mazzoleni’s passion for the world of wine, with an incredible selection of over 2 000 labels that puts many more famous Michelin-starred restaurants in the shade.",https://www.alcarroponte.it/,0.234513
800,La Chioccia d'Oro,via Novi 2,"For over 40 years, La Chioccia d'Oro has been renowned for its delicious, bountiful cuisine inspired by the Cilento region, including excellent pasta dishes (fresh and dried pasta served with tasty sauces) and equally delicious meat-based main courses. The ambience is decidedly unpretentious and the prices are some of the most reasonable you’ll find in Italy!",,0.196821
127,Corte Matilde,via Pelate 38,"The owners' professionalism and passion go hand in hand with a cuisine made with first - class products. The dishes are simply prepared but tasty, exalting the flavour of the ingredients. It is located in a pretty, refurbished farmhouse on the road that Matilda of Canossa once travelled on.",https://www.cortematilde.it/,0.189862
491,Rosita,via Manie 67,"Decorated in the same simple and rustic style as the Hotel Rosita in which it is housed, this restaurant boasts an attractive terrace overlooking the sea and the coast (call in advance to book one of the tables with the best view), which more than compensates for the narrow, winding road that you have to follow to get here. The delicious cuisine is regional in flavour, including seafood options such as the excellent stuffed baby squid in a sauce served with mashed potatoes, and meat dishes that include a tasty rabbit casserole.",https://www.hotelrosita.it/,0.186224


---
# 3.0 Define a New Score!

## Steps:
- User Query: The user provides a text query. We’ll retrieve relevant documents using the search engine built in Step 2.1.
- New Ranking Metric: After retrieving relevant documents, we’ll rank them using a new custom score. Instead of limiting the scoring to only the description field, we can include other attributes like priceRange, facilitiesServices, and cuisineType.
- You will use a heap data structure (e.g., Python’s heapq library) to maintain the top-k restaurants.

New Scoring Function:
Define a scoring function that takes into account various attributes:

- Description Match: Give weight based on the query similarity to the description (using TF-IDF scores).
- Cuisine Match: Increase the score for matching cuisine types.
- Facilities and Services: Give more points for matching facilities/services (e.g., “Terrace,” “Air conditioning”).
- Price Range: Higher scores could be given to more affordable options based on the user’s choice.




In [94]:
# first we create a vocabulary containing all the words in the cuisineType, facilitiesServices
def PROC_2(text):
    # Tokenize the text
    
    text = re.sub(r'[^a-zA-Z0-9-]', ' ', text) # in this version of the function we will keep the - signs

    tokens = word_tokenize(text)
    
    # Remove stopwords and make words lowercase
    
    filtered_tokens = [word.lower() for word in tokens if word.lower() not in stop_words]

    
    filtered_tokens = [stemmer.stem(word) for word in filtered_tokens]

    return filtered_tokens


def build_vocabulary_2(documents):
    
    # Initialize a vocabulary list to automatically assign unique term IDs
    Vocabulary = []
    new_column = []
    
    
    # Populate vocabulary with unique terms and assign term IDs
    for i, doc in enumerate(documents):
        processed = list(filter(lambda x: x != 'cuisin', PROC_2(doc))) #this is just to eliminate problems with the word cuisine in the cuisinetype column
        Vocabulary += processed
        new_column.append(processed)
        
    
    return list(set(Vocabulary)), new_column


voc_cuisine, df['proc_cuisine'] = build_vocabulary_2(df['cuisineType'])
voc_facserv, df['proc_facserv'] = build_vocabulary_2(df['facilitiesServices'])


In [95]:
# Now we asked a LLM to provide us with two lists of word that relates to price range

affordable_words = [
    "cheap", 
    "affordable", 
    "budget", 
    "inexpensive", 
    "economical", 
    "low-cost", 
    "reasonably priced", 
    "value for money", 
    "bargain", 
    "moderate", 
    "cost-effective", 
    "wallet-friendly", 
    "low-price", 
    "pocket-friendly", 
    "deal", 
    "discount", 
    "fair-priced", 
    "mid-range", 
    "accessible"
]

expensive_words = [
    "luxury", 
    "expensive", 
    "high-end", 
    "fine-dining", 
    "upscale", 
    "premium", 
    "exclusive", 
    "posh", 
    "fancy", 
    "lavish", 
    "gourmet", 
    "elegant", 
    "high-class", 
    "sophisticated", 
    "deluxe", 
    "top-tier", 
    "five-star", 
    "prestigious", 
    "opulent", 
    "elite", 
    "high-quality", 
    "refined", 
    "exquisite"
]

#we now process this list the same way we will process the query
affordable_words = PROC_2(str(affordable_words))
expensive_words = PROC_2(str(expensive_words))

#we finally create a dictionary to relate this terms to a price range in the df
price_dict = {}

for i in affordable_words:
    price_dict[i] = ['€', '€€']

for i in expensive_words:
    price_dict[i] = ['€€€', '€€€€']


In [113]:
# now we modify our query function to incorporate points for matching cuisine type and facilities and price range
# this will use the same doc_matrix of point 2.2
def new_cosine_score(query, K, IDF):

    scores = defaultdict(float)
    
    #preprocess query and then create the tfidf vector for the query
    query_tokens = set(PROC(query))

    query_tokens = [i for i in list(query_tokens) if i in vocabulary]
    
    vector_query = np.array([IDF[token] * tf(token, query_tokens) for token in vocabulary])

    vector_query /= np.linalg.norm(vector_query)

    #calculate the score for each document

    for doc_id, doc_tf_idf in inverted_index[vocabulary[token]]:
        
        scores[doc_id] = np.dot(vector_query, doc_matrix[doc_id])
    
    # Now we add the other metrics to the scores

    #first we check if the query contains any word from our voc_cuisine list
    query_cuisine = []
    
    for q in query_tokens:
        if q in voc_cuisine:
            query_cuisine.append(q)
   
    # then we add points for matching cuisin type
    for doc_id, doc_cuisine in enumerate(df['proc_cuisine']):
        # Check if any token in query_cuisine is in the specific restaurant description
        if any(token in doc_cuisine for token in query_cuisine):
            scores[doc_id] += 2
        else:
            scores[doc_id] += -2

    # we do the same for facilities and services 
    query_facserv = []
    
    for q in query_tokens:
        if q in voc_facserv:
            query_facserv.append(q)
   
    # then we add points for matching cuisin type
    for doc_id, doc_facserv in enumerate(df['proc_facserv']):
        # Check if any token in query_cuisine is in the specific restaurant description
        if any(token in doc_facserv for token in query_facserv):
            scores[doc_id] += 0.5
        else:
            scores[doc_id] += -0.5


    #finally we add or remove points for price match


    if any(q in affordable_words for q in query_tokens):
        indexes = df.index[df['priceRange'].isin(['€', '€€'])].tolist()
        for idx in indexes:
            scores[idx] += 1

    elif any(q in expensive_words for q in query_tokens):
        indexes = df.index[df['priceRange'].isin(['€€€', '€€€€'])].tolist()
        for idx in indexes:
            scores[idx] += 1       


    #Retrieve the top K documents
    top_k_docs = heapq.nlargest(K, scores.items(), key=lambda x: x[1])

    return top_k_docs


In [97]:
# we rewrite the printing function
def print_top_k(top_docs):
    doc_results = []
    score_results = []

    for result in top_docs:
    
        doc_id, score = result
        doc_results.append(doc_id)
        score_results.append(score)

    data_results = df[['restaurantName','address','description','cuisineType','facilitiesServices', 'priceRange']].loc[doc_results]

    data_results['Similarity Score'] = score_results

    display(data_results)

In [114]:
# Example usage #1: search with custom query 
query = "tasty campanian food with affordable price and terrace"
top_k = 5
top_docs = new_cosine_score(query, top_k, IDF_dict)
print_top_k(top_docs)

Unnamed: 0,restaurantName,address,description,cuisineType,facilitiesServices,priceRange,Similarity Score
800,La Chioccia d'Oro,via Novi 2,"For over 40 years, La Chioccia d'Oro has been renowned for its delicious, bountiful cuisine inspired by the Cilento region, including excellent pasta dishes (fresh and dried pasta served with tasty sauces) and equally delicious meat-based main courses. The ambience is decidedly unpretentious and the prices are some of the most reasonable you’ll find in Italy!","Country cooking, Campanian","Air conditioning, Car park, Terrace",€,3.661622
818,Lo Stuzzichino,via Deserto 1/a,"Situated in the centre of Sant'Agata, this welcoming, informal restaurant boasts traditional ceramics and an open-view kitchen. When ordering your meal, make sure you ask the charismatic owner, who is a real connoisseur of Campanian cuisine, for his recommendations. The regional dishes are full of memorable flavours, with a focus on vegetables, many of which are grown in the restaurant’s own kitchen garden nearby. This is somewhere you’d happily come to eat every day!","Campanian, Traditional Cuisine","Air conditioning, Terrace",€€,3.601885
924,La Torre,piazza Annunziata 7,"Before sitting down at this restaurant, why not take a short stroll to the nearby viewpoint looking out towards Capri – a perfect prelude to the excellent home-cooked and authentic Campanian cuisine that awaits here. In fact, Capri-style ravioli is one of the most popular dishes at the restaurant, which also serves aubergine parmigiana (highly recommended), potato “gâteau”, and various fish options among the main courses.","Campanian, Home Cooking","Air conditioning, Terrace",€,3.585572
300,Angiolina,via Passariello 2,"This trattoria is situated at the end of the marina in Pisciotta, a location which ensures a quiet, tranquil ambience. The excellent, reasonably priced seafood cuisine is another reason that we heartily recommend eating here. With a focus on local fish, the menu features a wide array of specialities from Campania, including anchovies served in different ways (stuffed, “alla scapece”, salted and served with butter, and with spaghetti).","Campanian, Traditional Cuisine","Restaurant offering vegetarian menus, Terrace",€,3.546535
1050,La Dispensa di Armatore,via Cantone 1,"A very small and unusual restaurant with a bistro feel and tables outside only, either at the counter with stools overlooking the sea or in the more traditional outdoor dining space. Although the menu is relatively small, the quality of the fish (including options such as bluefin tuna, anchovies, squid etc) is excellent – which is hardly surprising, as the restaurant owners have been involved in the fishing industry for four generations. The menu features just one dessert, “spumone”, which is made for the restaurant by a nearby artisanal ice-cream parlour, and the restaurant doesn’t serve coffee, but people flock here for the excellent fish and the attractive prices, which are very reasonable given its coastal location.","Seafood, Campanian","Counter dining, Great view, Terrace",€,3.538773


In [116]:
# Example usage #1: search with custom query 
query = "fancy spicy asian cuisine with air conditioning"
top_k = 5
top_docs = new_cosine_score(query, top_k, IDF_dict)
print_top_k(top_docs)

Unnamed: 0,restaurantName,address,description,cuisineType,facilitiesServices,priceRange,Similarity Score
766,Ba Restaurant,via Raffaello Sanzio 22,"Two huge red lamps welcome guests to this minimalist-style restaurant decorated in dark, typically Asian colours. The carefully prepared cuisine focuses on modern, reinterpreted Chinese dishes, while the wine list also includes a good selection of wines by the glass.","Chinese, Asian Contemporary",Air conditioning,€€€,3.584524
238,Angelo Sabatelli,via Santa Chiara 1,"A talented and brilliant chef is at the helm in this restaurant in the town’s historic centre, where elegant, contemporary-style dining rooms provide the backdrop for technical cuisine that showcases local produce with the occasional added Asian flavour. A visit to the large, well-stocked wine cellar is highly recommended (there’s an excellent selection of sparkling wines and champagnes), where a couple of beautiful tables are used for tasting and private dinners.","Country cooking, Asian Influences","Air conditioning, Interesting wine list, Wheelchair access",€€€,3.572542
745,Pacifico,via Moscova 29,"Although its dining rooms are fairly small, this restaurant, which acts as an ambassador for Peruvian cuisine with the occasional Asian influence, has an attractive, trendy ambience and a theatrical feel. Excellent choice of ceviche – raw fish or seafood dishes marinated in lemon and flavoured with spices such as chilli pepper and coriander – which are a typical speciality of Latin American countries along the Pacific coast.","Peruvian, Asian Influences",Air conditioning,€€€,3.551628
1116,Ichikawa,via Lazzaro Papi 18,"Ichikawa is one of the culinary masters who played a part in introducing Japanese cuisine to Italy. After decades in the country working in various restaurants, this chef has finally opened his own eatery, where he serves top-quality Japanese cuisine ranging from familiar favourites such as sushi and sashimi to other less-known family dishes and street food from the land of the Rising Sun. A good place to discover the joys of Japanese cuisine!","Japanese, Asian","Air conditioning, Wheelchair access",€€€,3.502118
923,Armani/Ristorante,via Manzoni 31,Elegant and carefully prepared contemporary cuisine served on the seventh floor of a palazzo which is totally dedicated to the world of Armani. Superb views of Milan combine with a decor of black marble and backlit onyx to create an exclusive and fashionable ambience.,"Italian Contemporary, Asian Influences","Air conditioning, Great view, Wheelchair access",€€€€,3.500994


### Let's compare them with the results obtained with the previous score method:

In [112]:
query = "tasty campanian food with affordable price and terrace"
top_k = 5
top_docs = fast_cosine_score(query, top_k, IDF_dict)
print_top_k(top_docs)

Unnamed: 0,restaurantName,address,description,cuisineType,facilitiesServices,priceRange,Similarity Score
1418,Trattoria Pomposa - Al Re gras,via Castel Maraldo 57,"It’s not just the affordable prices that will keep you coming back here, as the real highlight is its flavourful Emilian cuisine, which has been given a fresh, more modern twist. Among the first courses, we can recommend the tortellini in chicken broth or with a 24-month Parmesan cream; and for the main course, a delicious tripe with bread croutons. The Lambrusco is a must on the wine list here (we can recommend a good Grasparossa di Castelvetro). In the summer, meals are served outdoors on the small Piazza della Pomposa.","Emilian, Country cooking","Air conditioning, Terrace, Wheelchair access",€,0.209405
800,La Chioccia d'Oro,via Novi 2,"For over 40 years, La Chioccia d'Oro has been renowned for its delicious, bountiful cuisine inspired by the Cilento region, including excellent pasta dishes (fresh and dried pasta served with tasty sauces) and equally delicious meat-based main courses. The ambience is decidedly unpretentious and the prices are some of the most reasonable you’ll find in Italy!","Country cooking, Campanian","Air conditioning, Car park, Terrace",€,0.161622
1710,Sot'Ajarchi,via Marconi 93,"An informal atmosphere in a small trattoria where you can really feel at ease, perhaps in company, eating tasty seafood dishes, based on the daily fish catch.","Seafood, Cuisine from the Marches",Air conditioning,€€,0.154027
1297,Il Principe,via Colle San Bartolomeo 4,"This restaurant focuses on a skilful and creative reinterpretation of Campanian specialities, serving the type of cuisine that you’d be happy to eat every day. Although the dishes are inspired by the chef’s childhood memories and his grandmother’s cooking, he adds his own contemporary touch and careful presentation. Both meat and fish feature, showcasing all the colour and imagination of the best regional cuisine.","Modern Cuisine, Contemporary","Air conditioning, Terrace",€€,0.134607
491,Rosita,via Manie 67,"Decorated in the same simple and rustic style as the Hotel Rosita in which it is housed, this restaurant boasts an attractive terrace overlooking the sea and the coast (call in advance to book one of the tables with the best view), which more than compensates for the narrow, winding road that you have to follow to get here. The delicious cuisine is regional in flavour, including seafood options such as the excellent stuffed baby squid in a sauce served with mashed potatoes, and meat dishes that include a tasty rabbit casserole.","Ligurian, Classic Cuisine","Car park, Terrace",€€,0.120975


We can clearly see that for the first example the old code does a worse job at identifing restaurants that have the correct cuisine requested. This is key for returning good query responses.

In [117]:
query = "fancy spicy asian cuisine with air conditioning"
top_k = 5
top_docs = fast_cosine_score(query, top_k, IDF_dict)
print_top_k(top_docs)

Unnamed: 0,restaurantName,address,description,cuisineType,facilitiesServices,priceRange,Similarity Score
815,La Cascina 1899,strada statale 106,"Situated on the main road running alongside the sea, this pleasant restaurant with a large car park occupies a late-19C building that has been transformed into an attractive, rustic-style eatery with plenty of outdoor areas and air-conditioned rooms with exposed-stone walls and wooden ceilings that provide welcome respite from the heat of summer. It’s no surprise that fish and seafood take pride of place on the menu, although meat dishes also feature. The shop next door sells regional specialities, some produced by the restaurant itself, including the famous local bergamot citrus fruit.","Calabrian, Classic Cuisine","Air conditioning, Car park, Garden or park, Terrace",€€,0.229536
652,Bon Wei,via Castelvetro 16/18,"China on a plate! This attractive restaurant with an intriguing decor combining elegance, modern style and Asian influences serves a wide selection of Chinese specialities. Dishes include appetisers, dumplings, noodles and meat options, as well as a huge array of specialities from eight different regions, from the spicy dishes of Szechuan to the delicate flavours of Zhejiang.","Chinese, Asian","Air conditioning, Wheelchair access",€€,0.163202
1587,Le Nove Scodelle,viale Monza 4,"Now available everywhere in Italy, ethnic cuisine is gradually becoming more specialised, as is the case in this restaurant serving specialities from the province of Szechuan in south-west China. Exciting and original cuisine full of spicy flavours.","Chinese, Sichuan","Air conditioning, Wheelchair access",€,0.118349
1698,La Terrazza,via Collegio 33,"The aptly named Terrazza is a delightful open - air restaurant overlooking Asolo’s historic centre. In an elegant, fashionable ambience enjoy innovative delicacies that nonetheless make full use of the best local traditions and produce. Ideal for a romantic dinner.",Modern Cuisine,"Air conditioning, Car park, Terrace",€€,0.106301
235,Giglio,piazza del Giglio 2,"Housed in a beautiful 18C palazzo, Giglio boasts a superb location in one of the many squares in Lucca’s charming historic centre, as well as a delightful outdoor space. The meat and fish dishes on the menu here are inspired by traditional Italian recipes and reinterpreted with a light modern touch – a good example is the siphoned potato which is sprinkled with bottarga and served with a squid sauce, chanterelle mushrooms and spicy parsley cream. The bread is always excellent.","Classic Cuisine, Italian","Air conditioning, Terrace",€€€,0.101441


The same is true for this second example, we also can see that the new metric return results that have the correct price range requested.

---
# 4. Visualizing the Most Relevant Restaurants
Maps can provide users with an easy way to see where restaurants are located. This is especially useful for understanding which regions in Italy have more options.

## Steps for Visualization:

- Geocode Locations: Collect information on unique restaurant locations in Italy (in the format of City and Region). You can use tools such as Google API, OpenStreetMap, or a pre-defined list to retrieve representative coordinates for each region.

- Ask a Large Language Model (LLM): Alternatively, you can compile a list of unique cities and regions in Italy, formatted as (City, Region), and ask an LLM (e.g., ChatGPT) to provide coordinates for these locations. This can be an efficient way to gather data without using API calls. Just make sure that the retrieved information is correct and helpful.

- Map Setup: Use a mapping library like plotly or folium to create a visual display of restaurants by region.

- Encoding Price Ranges: Incorporate a visual representation for price ranges:

    - Use color-coding or marker size to represent the restaurant’s price range (€, €€, €€€, €€€€).
    - Include a legend for interpreting price levels.
- Plot Top-K Restaurants: Use the custom score from Step 3 to select the top-k restaurants for display.

This map will give users an overview of restaurant options across different regions in Italy, with an indication of cost based on visual cues.

In [None]:
#We will use geopy with a google api key to get the coordinates

from geopy.geocoders import GoogleV3
from geopy.extra.rate_limiter import RateLimiter #this is needed to prevent errors related to sending too many requests at a time

# We will of course use the same dataframe we created
# Drop rows with NaN in 'address' or 'city' columns because we will need those informations to do the geocoding
df.dropna(subset=['address', 'city'], inplace=True)

# Your Google API key, we removed ours for obvious privacy reasons
GOOGLE_API_KEY = 'YOUR_API_KEY'

# Initialize the Google geocoder
geolocator = GoogleV3(api_key=GOOGLE_API_KEY)

# This code line is strongly suggested to limit the number of request sent per second, we set it to 1 per sec
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)


> We chose here to use google instead of the free and more common Nominatim, becouse the latter explicitaly forbids large bulk requests to their servers. It was way too slow and gave too many errors. Google instead offers a basic free account with the equivalent of circa 40000 requests per month, which is more than enough as we will show.

In [101]:
# we now create a function to geocode the locations of the restaurants

def geocode_location(row):
    try:
        # we use adresses and city infos instead of the zip codes to have a more precise map
        location = geocode(f"{row['address']}, {row['city']}, Italy")
        if location:
            return pd.Series([location.latitude, location.longitude])
        else:
            return pd.Series([None, None]) #this is to catch errors
    except Exception as e:
        return pd.Series([None, None]) #this is to catch errors



In [102]:
# we now use the funxtion we created
df[['latitude', 'longitude']] = df.apply(geocode_location, axis=1)

# Save the updated DataFrame
df.to_csv('geocoded_michelin_restaurants.csv', index=False)

In [105]:
import folium

# Drop rows without latitude and longitude
df.dropna(subset=['latitude', 'longitude'], inplace=True)

# Initialize a Folium map centered around Italy
italy_coords = [41.9028, 12.4964]  # here we used the coordinates fo the city of Rome
restaurant_map = folium.Map(location=italy_coords, zoom_start=6)

# We now create a function that associate each price level to a color to be shown in the markers on the map
def get_marker_color(price_range):
    if price_range == '€':
        return 'green'
    elif price_range == '€€':
        return 'blue'
    elif price_range == '€€€':
        return 'orange'
    elif price_range == '€€€€':
        return 'red'
    else:
        return 'gray'


# Add a marker for each restaurant
for _, row in df.iterrows():
    location = (row['latitude'], row['longitude'])
    popup_text = f"{row['restaurantName']}<br>Price: {row['priceRange']}<br>Cuisine: {row['cuisineType']}" #this is to edit what the pop up text shows on each marker
    marker_color = get_marker_color(row['priceRange']) #we use our previously created function to assign the colors
    folium.Marker(location, popup=popup_text, icon=folium.Icon(color=marker_color)).add_to(restaurant_map)

# Save the map to an HTML file and display it
map_file_path = 'michelin_restaurants_map.html'
restaurant_map.save(map_file_path)

# Finally we can display the created interactive map
display(restaurant_map)


### Now we will generate a map for the top_k results of our query

In [118]:
#to generate a map that only give us a representation of the top k results for the query in point 3 we adjust the code
query = "fancy spicy asian cuisine with air conditioning"
top_k = 10

top_docs = new_cosine_score(query, top_k, IDF_dict)
doc_results = []
score_results = []

for result in top_docs:
    
    doc_id, score = result
    doc_results.append(doc_id)
    score_results.append(score)

data_results = df[['restaurantName','address','description','cuisineType','facilitiesServices', 'priceRange', 'latitude', 'longitude']].loc[doc_results]

data_results['Similarity Score'] = score_results

restaurant_map_2 = folium.Map(location=italy_coords, zoom_start=6)

# Add a marker for each restaurant
for _, row in data_results.iterrows():
    location = (row['latitude'], row['longitude'])
    popup_text = f"{row['restaurantName']}<br>Price: {row['priceRange']}<br>Cuisine: {row['cuisineType']}" 
    marker_color = get_marker_color(row['priceRange']) #we use our previously created function to assign the colors
    folium.Marker(location, popup=popup_text, icon=folium.Icon(color=marker_color)).add_to(restaurant_map_2)

# Save the map to an HTML file and display it
map_file_path = 'top_k_map.html'
restaurant_map_2.save(map_file_path)

# Finally we can display the created interactive map
display(restaurant_map_2)
