<a href="https://colab.research.google.com/github/ddivyansh04/Gen-ai-nugget/blob/main/Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ----------------------------
# STEP 1: INSTALL DEPENDENCIES
# ----------------------------
!apt-get update -y && apt-get install -y tesseract-ocr
!pip install pytesseract sentence-transformers scikit-learn Pillow requests

# ----------------------------
# STEP 2: IMPORT LIBRARIES
# ----------------------------
import requests
from PIL import Image
from io import BytesIO
import pytesseract
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# ----------------------------
# STEP 3: CONFIGURATION
# ----------------------------
# Maps each restaurant name to the list of menu-page image URLs that are
# downloaded and OCR'd for it.
RESTAURANT_MENUS = {
    "Pukhtaan": [
        "https://b.zmtcdn.com/data/menus/853/21580853/feb85f6fa20259fe481a7bc440c24476.jpg",
    ],
    "Connaught_Club_House": [
        "https://b.zmtcdn.com/data/menus/106/19295106/faf36abd62cb22e25492a1e51d36d971.jpg",
        "https://b.zmtcdn.com/data/menus/106/19295106/6d533623bfb316c927d5087a1be26f1b.jpg",
        "https://b.zmtcdn.com/data/menus/106/19295106/92fdde9055013c6ab9ba7e9b76c1770d.jpg",
        "https://b.zmtcdn.com/data/menus/106/19295106/d59c8c08538a220438fab471790d2b3b.jpg",
        "https://b.zmtcdn.com/data/menus/106/19295106/d5ff7bf34e11499d79ee59b5492e892d.jpg",
        "https://b.zmtcdn.com/data/menus/106/19295106/5757370e09010293de96576c65c499f3.jpg",
        "https://b.zmtcdn.com/data/menus/106/19295106/e784555ca0fac7f4b2c78ad12d708108.jpg",
        "https://b.zmtcdn.com/data/menus/106/19295106/cf56b2e45236ab81e3adeebde09937e5.jpg",
        "https://b.zmtcdn.com/data/menus/106/19295106/b3a95e356d17115f48f31e488fa14510.jpg",
        "https://b.zmtcdn.com/data/menus/106/19295106/db6351e453b6a553e2a95a029ccd5c93.jpg",
        "https://b.zmtcdn.com/data/menus/106/19295106/63668b33e2cd94b084aba2400a32302d.jpg",
        "https://b.zmtcdn.com/data/menus/106/19295106/28c47c6101f4b6525ae0f00054685f7b.jpg",
        "https://b.zmtcdn.com/data/menus/106/19295106/8580a3bc941162b7ac0fcf6a2b29408f.jpg",
        "https://b.zmtcdn.com/data/menus/106/19295106/9e199b47a96c620137325a2edaa257ea.jpg",
        "https://b.zmtcdn.com/data/menus/106/19295106/a48b905eda7e351ec9d16be38263c6ce.jpg",
        "https://b.zmtcdn.com/data/menus/106/19295106/e1256a12cfc8d2671887daf5849462c3.jpg",
        "https://b.zmtcdn.com/data/menus/106/19295106/407cf6d0c92226a1e8d3c2e306bc7eee.jpg",
        "https://b.zmtcdn.com/data/menus/106/19295106/a06bc4526e30e16cd66e7fc33680ddd1.jpg",
        "https://b.zmtcdn.com/data/menus/106/19295106/f43d42f35215a54255c2b3c5e7bc361d.jpg",
    ],
    "Local": [
        "https://b.zmtcdn.com/data/menus/360/18382360/4a261b83c8d83e45c90ba18738606383.jpg",
        "https://b.zmtcdn.com/data/menus/360/18382360/e594cde054e3b9cd74bada9713d45647.jpg",
        "https://b.zmtcdn.com/data/menus/360/18382360/09f07621c6c933873eaa0e3cc2d8cee6.jpg",
        "https://b.zmtcdn.com/data/menus/360/18382360/9df0e7ea170f93df343a776cbb433b2e.jpg",
        "https://b.zmtcdn.com/data/menus/360/18382360/c7a32dfe8174158daf3395a2d8a3122a.jpg",
        "https://b.zmtcdn.com/data/menus/360/18382360/6213b5c0e0e6304b496aa40ec29ee7c0.jpg",
        "https://b.zmtcdn.com/data/menus/360/18382360/62bb0f0ab165aed9aac2f07386be6ce0.jpg",
        "https://b.zmtcdn.com/data/menus/360/18382360/9bb62a9813fd95783befe1f938dff819.jpg",
        "https://b.zmtcdn.com/data/menus/360/18382360/39c8d9378a98634edad40bd2e21c8269.jpg",
        "https://b.zmtcdn.com/data/menus/360/18382360/9059e84501429b654a3b187f8fa05f3b.jpg",
    ],
}

# ----------------------------
# STEP 4: OCR FUNCTIONS
# ----------------------------
def extract_text_from_image_url(image_url):
    """Download the image at *image_url* and return the text Tesseract reads from it.

    The download uses a 10-second timeout and a non-2xx HTTP status is treated
    as an error. Any exception raised while downloading, decoding or OCR-ing
    the image is reported on stdout and an empty string is returned instead.
    """
    try:
        http_response = requests.get(image_url, timeout=10)
        http_response.raise_for_status()
        picture = Image.open(BytesIO(http_response.content))
        recognised_text = pytesseract.image_to_string(picture)
    except Exception as err:
        # Best-effort: one unreadable page must not abort the whole menu.
        print(f"⚠️ Error processing {image_url}: {str(err)}")
        return ""
    return recognised_text

def clean_ocr_text(text):
    """Strip OCR noise from *text* and repair a few known misreads.

    First every character that is not a letter, digit, whitespace or one of
    ``. , ₹ $ € ¥ ¢ & + ( ) -`` is removed. Then a fixed list of regex
    substitutions is applied in order (e.g. ``R 250`` -> ``₹250`` and
    ``100 - 200`` -> ``₹100-200``).
    """
    cleaned = re.sub(r'[^a-zA-Z0-9\s\.,₹$€¥¢&+()-]', '', text)

    # (pattern, replacement) pairs, applied in this exact order.
    ocr_fixes = (
        (r'\bOe\b', 'Of'),
        (r'\bee\b', 'and'),
        (r'\bBe ary\b', 'Biryani'),
        (r'\bR (\d+)', r'₹\1'),
        (r'\b(\d+)\s*-\s*(\d+)\b', r'₹\1-\2'),
    )
    for misread, correction in ocr_fixes:
        cleaned = re.sub(misread, correction, cleaned)
    return cleaned

# ----------------------------
# STEP 5: ENHANCED MENU PARSER
# ----------------------------
def parse_menu_from_ocr_text(ocr_text):
    """Turn raw OCR text into a list of menu-item dicts.

    The text is cleaned with ``clean_ocr_text`` and walked line by line:

    * a line containing a price is handed to ``handle_price_line``, which
      attaches the price to the item currently being built (or to the last
      stored item that still has no price);
    * a line that ``is_new_item`` accepts starts a new item;
    * any other line is collected as description text for the current item.

    Args:
        ocr_text: Text as produced by the OCR step; may span several pages.

    Returns:
        The parsed items after ``split_combined_items`` post-processing. Each
        item is a dict with the keys created by ``create_new_item``.
    """
    lines = [line.strip() for line in clean_ocr_text(ocr_text).split("\n") if line.strip()]
    items = []
    current_item = None
    description_buffer = []

    # Fixes relative to the earlier pattern:
    #  * ``\d+`` instead of ``\d{1,3}`` so "₹1500" is not truncated to "₹150";
    #  * ``\$`` is escaped -- a bare ``$`` is the end-of-string anchor and could
    #    never match a dollar sign in front of an amount.
    price_pattern = re.compile(
        r'(?:₹|Rs?\.?|INR|MRP|Price)\s*(\d+(?:,\d{3})*(?:\.\d{2})?)|'
        r'(?:USD|\$|Price)\s*(\d+\.\d{2})',
        re.IGNORECASE
    )

    dietary_config = {
        'vegetarian': ['vegetarian', 'veg', 'paneer', 'tofu', 'cheese'],
        'non-vegetarian': ['chicken', 'mutton', 'fish', 'prawn', 'meat', 'lamb', 'beef', 'egg'],
        'gluten-free': ['gluten-free', 'gluten free', 'gf'],
        'vegan': ['vegan', 'dairy-free']
    }

    for line in lines:
        # Detect price and item separators first
        if handle_price_line(line, price_pattern, items, current_item, description_buffer):
            if current_item is not None:
                # The price was written onto current_item (and any leftover
                # text onto description_buffer). Store the item now; simply
                # resetting current_item would silently drop every priced item.
                finalize_item(current_item, description_buffer, dietary_config, items)
                current_item = None
                description_buffer = []
            continue

        if is_new_item(line):
            if current_item:
                finalize_item(current_item, description_buffer, dietary_config, items)
            current_item = create_new_item(line)
            description_buffer = []
        elif current_item:
            description_buffer.append(line)

    # Handle final item
    if current_item:
        finalize_item(current_item, description_buffer, dietary_config, items)

    return split_combined_items(items)

def handle_price_line(line, price_pattern, items, current_item, description_buffer):
    """Attach the price found in *line* to the appropriate menu item.

    Args:
        line: One cleaned OCR line.
        price_pattern: Compiled regex used to locate (and strip) the price.
        items: Items stored so far; the last one may receive the price.
        current_item: Item currently being built, or a falsy value if none.
        description_buffer: Description lines of ``current_item``; text left
            over after removing the price is appended to it.

    Returns:
        True when a price was found and assigned (to ``current_item`` or, if
        there is none, to the last stored item that has no price yet);
        False when the line holds no price or there is nowhere to put it.
    """
    found = price_pattern.search(line)
    if found is None:
        return False

    price_text = found.group().replace("R ", "₹").strip()
    leftover = price_pattern.sub('', line).strip()

    # Preferred target: the item that is still being assembled.
    if current_item:
        current_item['price'] = price_text
        if leftover:
            description_buffer.append(leftover)
        return True

    # Fallback target: the most recently stored item, if it has no price yet.
    last_needs_price = bool(items) and not items[-1]['price']
    if not last_needs_price:
        return False

    items[-1]['price'] = price_text
    if leftover:
        items[-1]['description'] += ' ' + leftover
    return True

def is_new_item(line):
    """Return True when *line* looks like a dish heading.

    A heading is longer than three characters, starts with an uppercase
    character, contains no digits, and more than ``len(line) // 2`` of its
    characters are uppercase.
    """
    if len(line) <= 3 or not line[0].isupper():
        return False
    if any(ch.isdigit() for ch in line):
        return False
    uppercase_total = len([ch for ch in line if ch.isupper()])
    return uppercase_total > len(line) // 2

def create_new_item(name):
    """Return a fresh menu-item dict for *name* with empty/default fields."""
    item = dict(name=name, price='', description='')
    item['dietary'] = []
    item['spice_level'] = 0
    return item

def finalize_item(item, description_buffer, dietary_config, items):
    """Fill in the derived fields of *item* and append a copy of it to *items*.

    The description becomes the space-joined ``description_buffer``; dietary
    tags and spice level are then derived via ``detect_dietary_tags`` and
    ``detect_spice_level``. *item* itself is updated in place and a shallow
    copy of it is what gets stored.
    """
    joined_description = ' '.join(description_buffer)
    item['description'] = joined_description
    # Tag detection reads item['description'], so it must run after the join.
    item['dietary'] = detect_dietary_tags(item, dietary_config)
    item['spice_level'] = detect_spice_level(joined_description)
    items.append(dict(item))

# ----------------------------
# STEP 6: POST-PROCESSING
# ----------------------------
def split_combined_items(items):
    """Split items whose description contains stand-alone 3-4 digit numbers.

    Each such number is treated as a price: the description text in front of
    it becomes a separate copy of the item with that price (prefixed with
    ``₹``). Text after the last number is not carried over. Items without
    such a number are passed through unchanged (same object).
    """
    embedded_price = re.compile(r'\b(\d{3,4})\b')
    result = []
    for entry in items:
        # With a capturing group, split() yields text, number, text, number, ..., text.
        pieces = embedded_price.split(entry['description'])
        if len(pieces) == 1:
            result.append(entry)
            continue
        for text, amount in zip(pieces[0::2], pieces[1::2]):
            clone = entry.copy()
            clone['description'] = text.strip()
            clone['price'] = f"₹{amount.strip()}"
            result.append(clone)
    return result

def detect_dietary_tags(item, config):
    """Return the dietary categories whose keywords occur in the item's text.

    The lower-cased ``name`` and ``description`` of *item* are searched for
    each keyword as a plain substring. ``'non-vegetarian'`` (which must be a
    key of *config*) is checked first and therefore listed first; the other
    categories follow in *config* order.
    """
    haystack = f"{item['name']} {item['description']}".lower()

    def mentions_any(keywords):
        return any(keyword in haystack for keyword in keywords)

    found = ['non-vegetarian'] if mentions_any(config['non-vegetarian']) else []
    found.extend(
        category
        for category, keywords in config.items()
        if category != 'non-vegetarian' and mentions_any(keywords)
    )
    return found

def detect_spice_level(description):
    """Return a 0-3 spice level inferred from keywords in *description*.

    Levels are tested from hottest to mildest and the first level with a
    matching keyword (case-insensitive substring) wins; 0 means no keyword
    was found.
    """
    lowered = description.lower()
    tiers = (
        (3, ('extra spicy', 'fiery', 'blazing')),
        (2, ('spicy', 'chilli', 'hot')),
        (1, ('mild', 'light spice')),
    )
    for level, keywords in tiers:
        for keyword in keywords:
            if keyword in lowered:
                return level
    return 0

# ----------------------------
# STEP 7: DATA PROCESSING
# ----------------------------
def process_restaurant_menus(restaurant_name, menu_urls):
    """OCR every menu page of one restaurant and parse the combined text.

    A progress line (showing the last 20 characters of the URL) is printed
    before each page is fetched. The page texts are joined with a blank line
    between them and parsed with ``parse_menu_from_ocr_text``.
    """
    page_texts = []
    for page_url in menu_urls:
        print(f"Processing {restaurant_name} menu: {page_url[-20:]}")
        page_texts += [extract_text_from_image_url(page_url)]
    combined_text = "\n\n".join(page_texts)
    return parse_menu_from_ocr_text(combined_text)

def prepare_embeddings(restaurant_data):
    """Compute sentence embeddings for every restaurant's items, in place.

    For each restaurant entry the "name description" text of every item is
    encoded with the ``all-MiniLM-L6-v2`` model and stored under
    ``'embeddings'``; entries without items get ``None``. The same
    (mutated) mapping is returned.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    for entry in restaurant_data.values():
        texts = []
        for dish in entry['items']:
            texts.append(f"{dish['name']} {dish['description']}")
        if texts:
            entry['embeddings'] = encoder.encode(texts)
        else:
            entry['embeddings'] = None
    return restaurant_data

# ----------------------------
# STEP 8: SEARCH ENGINE
# ----------------------------
# Loaded SentenceTransformer instances, keyed by model name, so repeated
# searches in one session do not reload the model from disk.
_EMBEDDING_MODELS = {}


def _get_embedding_model(model_name='all-MiniLM-L6-v2'):
    """Return the SentenceTransformer for *model_name*, loading it at most once."""
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODELS[model_name] = model
    return model


def enhanced_search(query, restaurant_data, top_k=5):
    """Search all restaurants' menu items for *query* and return the best matches.

    Query features:
      * ``under ₹N`` / ``under N`` keeps only items whose detected price is
        at most N (items with no detectable price are treated as 9999);
      * ``<dish> in <restaurant>`` restricts the search to restaurants whose
        name appears after the first `` in `` (underscores in restaurant
        names also match spaces) and searches for ``<dish>`` only;
      * the word ``spicy`` keeps only items with ``spice_level >= 1``.

    Scoring: 1.0 when the search text is a substring of the item name,
    otherwise the cosine similarity between the query embedding and the item
    embedding. Item embeddings stored under ``data['embeddings']`` are reused
    when they line up with ``data['items']``; otherwise the item is encoded on
    demand.

    Args:
        query: Free-text user query.
        restaurant_data: Mapping of restaurant name to a dict with an
            ``'items'`` list and, optionally, an ``'embeddings'`` array.
        top_k: Maximum number of results to return.

    Returns:
        Up to *top_k* result dicts sorted by descending ``'score'``.
    """
    results = []

    # Extract price filter
    price_filter = re.search(r'under ₹?(\d+)', query, re.IGNORECASE)
    max_price = int(price_filter.group(1)) if price_filter else None

    # Split "<dish> in <restaurant>" ONCE, before the loop. Re-assigning the
    # query inside the loop dropped the restaurant filter for every
    # restaurant visited after the first match.
    search_text = query
    restaurant_hint = None
    if ' in ' in query.lower():
        search_text, _, restaurant_hint = query.lower().partition(' in ')

    needle = search_text.lower()
    wants_spicy = 'spicy' in needle
    query_emb = None  # encoded lazily, at most once per search

    for rest_name, data in restaurant_data.items():
        # Handle restaurant-specific queries
        if restaurant_hint is not None:
            known_as = rest_name.lower()
            if (known_as not in restaurant_hint
                    and known_as.replace('_', ' ') not in restaurant_hint):
                continue

        menu_items = data['items']
        embeddings = data.get('embeddings')
        has_cached = embeddings is not None and len(embeddings) == len(menu_items)

        for idx, item in enumerate(menu_items):
            # Price validation
            price = extract_price(item['price']) or extract_price(item['description'])
            if max_price is not None and (price or 9999) > max_price:
                continue

            # Spice level filter (checked before scoring to skip needless encoding)
            if wants_spicy and item['spice_level'] < 1:
                continue

            # Calculate relevance score
            if needle in item['name'].lower():
                score = 1.0
            else:
                if query_emb is None:
                    query_emb = _get_embedding_model().encode([search_text])
                if has_cached:
                    # Slice (not index) to keep the 2-D shape cosine_similarity expects.
                    item_emb = embeddings[idx:idx + 1]
                else:
                    item_emb = _get_embedding_model().encode(
                        [f"{item['name']} {item['description']}"]
                    )
                score = float(cosine_similarity(query_emb, item_emb)[0][0])

            results.append({
                'name': item['name'],
                'price': format_price(item['price']),
                'description': item['description'],
                'dietary': item['dietary'],
                'spice_level': item['spice_level'],
                'restaurant': rest_name,
                'score': score
            })

    return sorted(results, key=lambda x: x['score'], reverse=True)[:top_k]

def extract_price(text):
    """Return the last price-like number in *text* as an int, or None.

    A price-like number is either a comma-grouped amount such as ``1,200``
    (read as 1200) or a run of 3-4 digits. Previously ``1,200`` was read as
    200 because only the digits after the comma were matched.
    """
    # The lookbehind keeps the comma-grouped branch from starting in the
    # middle of a longer digit run.
    matches = re.findall(r'(?<!\d)\d{1,3}(?:,\d{3})+|\d{3,4}', text)
    if not matches:
        return None
    return int(matches[-1].replace(',', ''))

def format_price(price_str):
    """Format a stored price string for display as ``₹<amount>``.

    Only the first number in *price_str* is used, with thousands separators
    removed and any decimal fraction ignored. Previously every digit in the
    string was concatenated, so ``₹250.00`` was shown as ``₹25,000``.

    Returns:
        The formatted price, or ``"Check price"`` when *price_str* is empty
        or contains no digits.
    """
    if not price_str:
        return "Check price"
    number = re.search(r'\d[\d,]*', price_str)
    if number is None:
        return "Check price"
    whole_amount = int(number.group().replace(',', ''))
    return f"₹{whole_amount:,}"

# ----------------------------
# STEP 9: USER INTERFACE
# ----------------------------
def display_results(results):
    """Print search results to stdout as a numbered, human-readable list.

    Prints a "no matches" hint when *results* is empty. Otherwise each result
    shows name, restaurant, price, a three-star spice meter, dietary tags and
    (when present) the first 100 characters of the description.
    """
    if not results:
        print("\n🔍 No matching dishes found. Try different keywords!")
        return

    print("\n🍽️ Top Results:")
    for rank, dish in enumerate(results, 1):
        print(f"\n{rank}. {dish['name']} @ {dish['restaurant']}")
        print(f"   💵 Price: {dish['price']}")

        filled = dish['spice_level']
        spice_meter = '★' * filled + '☆' * (3 - filled)
        print(f"   🌶️ Spice: {spice_meter}")

        dietary_text = ', '.join(dish['dietary']) or 'Not specified'
        print(f"   🥗 Dietary: {dietary_text}")

        if dish['description']:
            preview = dish['description'][:100]
            print(f"   📝 Description: {preview}...")
    print("\n" + "="*60)

def restaurant_chatbot(restaurant_data):
    """Run the interactive query loop on stdin/stdout.

    Each entered line is passed to ``enhanced_search`` and the results are
    shown with ``display_results``. The loop ends when the user types
    ``exit`` or ``quit``, presses Ctrl-C, or when stdin is closed (EOF).
    Any other error raised while handling a query is reported and the loop
    continues.

    Args:
        restaurant_data: Mapping handed through unchanged to ``enhanced_search``.
    """
    print("\n🍴 Welcome to Food Explorer!")
    print("Ask about dishes (e.g., 'spicy vegetarian under ₹500', 'non-vegetarian in Local', or 'exit')")

    while True:
        try:
            query = input("\nYou: ").strip()
            if query.lower() in ('exit', 'quit'):
                print("\n👋 Thank you for using Food Explorer!")
                break

            results = enhanced_search(query, restaurant_data)
            display_results(results)

        except (KeyboardInterrupt, EOFError):
            # EOFError must end the session too: input() raises it on every
            # call once stdin is closed, so treating it as a per-query error
            # would spin forever.
            print("\n👋 Session ended by user")
            break
        except Exception as e:
            print(f"\n⚠️ Error processing request: {str(e)}")

# ----------------------------
# STEP 10: MAIN EXECUTION
# ----------------------------
if __name__ == "__main__":
    # Data processing pipeline: OCR and parse every configured restaurant's
    # menu pages. Each entry starts with 'embeddings' set to None; it is
    # filled in by prepare_embeddings() below.
    restaurant_data = {}
    for name, urls in RESTAURANT_MENUS.items():
        items = process_restaurant_menus(name, urls)
        restaurant_data[name] = {'items': items, 'embeddings': None}
        print(f"\n✅ {name}: Processed {len(items)} menu items")

    # Prepare semantic embeddings (prepare_embeddings mutates and returns the
    # same mapping).
    restaurant_data = prepare_embeddings(restaurant_data)

    # Start interactive chatbot (the launch statements follow in the next
    # notebook cell)


0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
0% [Connecting to archive.ubuntu.com (185.125.190.82)] [Connecting to security.                                                                               Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,605 kB]
Get:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]
Hit:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:11 https://pp

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [2]:
    # Print a ready banner, then hand control to the interactive loop.
    # restaurant_chatbot() returns only when the user ends the session.
    print("\n" + "="*60)
    print("🚀 System Ready for Queries!")
    print("="*60)
    restaurant_chatbot(restaurant_data)


🚀 System Ready for Queries!

🍴 Welcome to Food Explorer!
Ask about dishes (e.g., 'spicy vegetarian under ₹500', 'non-vegetarian in Local', or 'exit')

You: exit

👋 Thank you for using Food Explorer!
