<a href="https://colab.research.google.com/github/ddivyansh04/Gen-ai-nugget/blob/main/zomato1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install required libraries
!pip install requests beautifulsoup4

# Import necessary modules
import requests
from bs4 import BeautifulSoup
import json




In [2]:
# Hand-written HTML fixture standing in for a real restaurant page, so the
# parsing code can be exercised against a known, fixed document.
# Layout: one <h1> holding the restaurant name, followed by one
# <div class="menu-item"> per dish; each dish has an <h4> (name),
# a <p class="description"> and a <span class="price">.
html = """
<html>
  <body>
    <h1>Testaurant</h1>
    <div class="menu-item">
      <h4>Cheese Pizza</h4>
      <p class="description">Classic mozzarella pizza.</p>
      <span class="price">$9.99</span>
    </div>
    <div class="menu-item">
      <h4>Veggie Burger</h4>
      <p class="description">Patty with lettuce and tomato.</p>
      <span class="price">$7.49</span>
    </div>
  </body>
</html>
"""


In [3]:
# Build a parse tree from the sample markup with Python's built-in HTML parser
soup = BeautifulSoup(html, "html.parser")

# The first <h1> on the page carries the restaurant name
restaurant_name = soup.find("h1").text.strip()


def _menu_entry(block):
    """Turn one ``div.menu-item`` element into a dict of whitespace-trimmed strings."""
    return {
        "name": block.find("h4").text.strip(),
        "description": block.find("p", class_="description").text.strip(),
        "price": block.find("span", class_="price").text.strip(),
    }


# One dict per menu-item <div>, in document order
menu_items = [_menu_entry(block) for block in soup.select("div.menu-item")]

# Bundle the name and the dishes into a single JSON-serialisable structure
data = {"restaurant": restaurant_name, "menu": menu_items}

# Show what was scraped
print("Scraped Data:")
print(json.dumps(data, indent=2))


Scraped Data:
{
  "restaurant": "Testaurant",
  "menu": [
    {
      "name": "Cheese Pizza",
      "description": "Classic mozzarella pizza.",
      "price": "$9.99"
    },
    {
      "name": "Veggie Burger",
      "description": "Patty with lettuce and tomato.",
      "price": "$7.49"
    }
  ]
}


In [4]:
# Persist the scraped structure to disk as pretty-printed JSON
with open("testaurant_data.json", "w") as out_file:
    out_file.write(json.dumps(data, indent=2))

print("Saved as testaurant_data.json ✅")


Saved as testaurant_data.json ✅


In [5]:
# Read the saved JSON back in, so the cleaning step works from the file on disk
with open("testaurant_data.json") as saved_file:
    loaded = json.loads(saved_file.read())

# Define price cleaner
def clean_price(price_str):
    """Convert a scraped price label such as ``"$9.99"`` into a float.

    Args:
        price_str: Price text. Dollar signs are removed and surrounding
            whitespace is trimmed before conversion.

    Returns:
        The price as a ``float``, or ``None`` when ``price_str`` is not a
        string or the remaining text is not a valid number.
    """
    try:
        return float(price_str.replace("$", "").strip())
    except (AttributeError, TypeError, ValueError):
        # AttributeError: the input has no .replace (e.g. None or a number).
        # TypeError: .replace rejected the str arguments (e.g. bytes input).
        # ValueError: the text left after stripping is not numeric.
        # Everything else (KeyboardInterrupt, SystemExit, ...) must propagate,
        # which a bare `except:` would have silently swallowed.
        return None

# Normalise every menu entry: lower-cased, trimmed text plus a numeric price
cleaned_menu = [
    {
        "name": entry["name"].lower().strip(),
        "description": entry["description"].lower().strip(),
        "price": clean_price(entry["price"]),
    }
    for entry in loaded["menu"]
]

# The restaurant name is carried over unchanged next to the cleaned menu
cleaned_data = {"restaurant": loaded["restaurant"], "menu": cleaned_menu}

# Show the cleaned structure
print("✅ Cleaned Data:")
print(json.dumps(cleaned_data, indent=2))


✅ Cleaned Data:
{
  "restaurant": "Testaurant",
  "menu": [
    {
      "name": "cheese pizza",
      "description": "classic mozzarella pizza.",
      "price": 9.99
    },
    {
      "name": "veggie burger",
      "description": "patty with lettuce and tomato.",
      "price": 7.49
    }
  ]
}


In [6]:
!pip install -U sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_

In [7]:
from sentence_transformers import SentenceTransformer

# Fetch the pre-trained sentence-embedding model by name
model = SentenceTransformer('all-MiniLM-L6-v2')

# One "name: description" sentence per menu entry,
# e.g. "cheese pizza: classic mozzarella pizza."
corpus = [f"{entry['name']}: {entry['description']}" for entry in cleaned_data["menu"]]

# Encode every sentence into its embedding vector
# (384-dimensional for this model, according to its model card)
corpus_embeddings = model.encode(corpus)

# Preview the first sentence and the head of its vector
first_text, first_vector = corpus[0], corpus_embeddings[0]
print("Text:", first_text)
print("Embedding (first 5 numbers):", first_vector[:5])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Text: cheese pizza: classic mozzarella pizza.
Embedding (first 5 numbers): [-0.05814555  0.05731873 -0.02101626  0.07333442 -0.10252272]


In [8]:
from sklearn.metrics.pairwise import cosine_similarity

def search(query, corpus, corpus_embeddings, menu=None):
    """Return the single menu entry most similar to a free-text query.

    The query is encoded with the notebook-level ``model`` and compared with
    every corpus embedding using ``cosine_similarity``.

    Args:
        query: Free-text question or phrase from the user.
        corpus: Sequence of sentences, one per menu item.
        corpus_embeddings: Embeddings of ``corpus``; row ``i`` belongs to
            ``corpus[i]``.
        menu: Optional sequence of menu records aligned index-for-index with
            ``corpus``. Defaults to the notebook-level ``cleaned_data["menu"]``
            so existing three-argument calls behave exactly as before.

    Returns:
        A ``(sentence, record)`` tuple for the highest-scoring item.

    Raises:
        ValueError: If ``corpus`` is empty.
    """
    if menu is None:
        # Fall back to the global the corpus was built from in this notebook
        menu = cleaned_data["menu"]
    if len(corpus) == 0:
        raise ValueError("corpus is empty; there is nothing to search")

    # Encode the query as a one-element batch, mirroring how the corpus was encoded
    query_embedding = model.encode([query])

    # Similarity of the query against every corpus item
    similarities = cosine_similarity(query_embedding, corpus_embeddings)

    # Position of the highest score; int() so the index is a plain Python int
    best_match_idx = int(similarities.argmax())

    return corpus[best_match_idx], menu[best_match_idx]

# Smoke-test the search with a sample question
query = "Show me something with tomato"
best_match_text, best_match_item = search(query, corpus, corpus_embeddings)

# Show the winning sentence and its full menu record
print(f"Best Match Text: {best_match_text}")
print(f"Best Match Item: {best_match_item}")


Best Match Text: veggie burger: patty with lettuce and tomato.
Best Match Item: {'name': 'veggie burger', 'description': 'patty with lettuce and tomato.', 'price': 7.49}


In [9]:
def search_top_n(query, corpus, corpus_embeddings, n=3, menu=None):
    """Return up to ``n`` menu entries ranked by similarity to a query.

    The query is encoded with the notebook-level ``model`` and compared with
    every corpus embedding using ``cosine_similarity``.

    Args:
        query: Free-text question or phrase from the user.
        corpus: Sequence of sentences, one per menu item.
        corpus_embeddings: Embeddings of ``corpus``; row ``i`` belongs to
            ``corpus[i]``.
        n: Maximum number of matches to return. Values of zero or below
            yield an empty list.
        menu: Optional sequence of menu records aligned index-for-index with
            ``corpus``. Defaults to the notebook-level ``cleaned_data["menu"]``
            so existing calls behave exactly as before.

    Returns:
        A list of ``(sentence, record, score)`` tuples, best match first,
        containing at most ``n`` entries.
    """
    if menu is None:
        # Fall back to the global the corpus was built from in this notebook
        menu = cleaned_data["menu"]
    if n <= 0:
        # Slicing with [-0:] would select *every* item rather than none,
        # and a negative n would select an arbitrary subset.
        return []

    # Encode the query as a one-element batch, mirroring how the corpus was encoded
    query_embedding = model.encode([query])

    # Similarity of the query against every corpus item
    similarities = cosine_similarity(query_embedding, corpus_embeddings)
    scores = similarities[0]

    # argsort is ascending: reverse for best-first, then keep the first n
    top_n_idx = scores.argsort()[::-1][:n]

    return [(corpus[i], menu[i], scores[i]) for i in top_n_idx]

# Try the ranked search, asking for up to three hits
query = "Show me something with cheese"
top_matches = search_top_n(query, corpus, corpus_embeddings, n=3)

# Print each hit with its 1-based rank and similarity score
for rank, (text, item, score) in enumerate(top_matches, start=1):
    print(f"Rank {rank}: {text} | Similarity: {score:.4f}")
    print(f"Menu Item: {item}\n")


Rank 1: cheese pizza: classic mozzarella pizza. | Similarity: 0.5729
Menu Item: {'name': 'cheese pizza', 'description': 'classic mozzarella pizza.', 'price': 9.99}

Rank 2: veggie burger: patty with lettuce and tomato. | Similarity: 0.3605
Menu Item: {'name': 'veggie burger', 'description': 'patty with lettuce and tomato.', 'price': 7.49}



In [11]:
def chat():
    """Run an interactive question/answer loop over the menu.

    Each non-blank line the user enters is passed to ``search`` together
    with the notebook-level ``corpus`` and ``corpus_embeddings``, and the
    best match is printed. The loop ends when the user types ``exit`` (any
    casing, surrounding whitespace ignored) or when the input stream is
    closed.
    """
    print("Welcome to the Testaurant chatbot!")
    print("Ask me about the menu (e.g., 'What's vegetarian?') or type 'exit' to quit.")

    while True:
        try:
            # Trim so that "exit " or " exit" is still recognised below
            user_query = input("\nYour query: ").strip()
        except EOFError:
            # Input stream closed: leave cleanly instead of dumping a traceback
            print("\nGoodbye! 👋")
            break

        if not user_query:
            # Nothing was typed, so there is nothing meaningful to search for
            continue

        if user_query.lower() == 'exit':
            print("Goodbye! 👋")
            break

        best_match_text, best_match_item = search(user_query, corpus, corpus_embeddings)

        print("\nI found this for you:")
        print(f"Menu: {best_match_text}")
        print(f"Item Details: {best_match_item}\n")

# Start the chat. This call blocks on input() and keeps looping until the
# user types 'exit', so it needs an interactive session to complete.
chat()


Welcome to the Testaurant chatbot!
Ask me about the menu (e.g., 'What's vegetarian?') or type 'exit' to quit.

Your query: What’s spicy?

I found this for you:
Menu: veggie burger: patty with lettuce and tomato.
Item Details: {'name': 'veggie burger', 'description': 'patty with lettuce and tomato.', 'price': 7.49}


Your query: exit
Goodbye! 👋
