In [1]:
# Authentication to Google API
import os
import math
from collections import Counter
from google.cloud import vision
import re

# Point the Google client libraries at the service-account key file.
# `setdefault` keeps a GOOGLE_APPLICATION_CREDENTIALS value that is already
# exported in the environment, so the machine-specific path below only acts
# as a fallback instead of silently overriding the user's configuration.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/Users/Cwkf_89/Library/CloudStorage/OneDrive-SingaporeManagementUniversity/Y3S2/data analytics/Project/google_vision_key.json',
)
# Pattern matching runs of word characters.
WORD = re.compile(r"\w+")

In [40]:
def detect_text(path):
    """Run document-style OCR on an image file with Google Cloud Vision.

    Args:
        path: Location of the image file to read.

    Returns:
        A list with one string per text annotation in the response, each
        prefixed with "\\r\\n".

    Raises:
        Exception: If the response carries an error message.
    """
    client = vision.ImageAnnotatorClient()

    with open(path, "rb") as handle:
        image = vision.Image(content=handle.read())

    # document_text_detection is the dense-text variant; text_detection is
    # the alternative for sparse text.
    response = client.document_text_detection(image=image)
    ocr_text = [
        f"\r\n{annotation.description}"
        for annotation in response.text_annotations
    ]

    error_message = response.error.message
    if error_message:
        raise Exception(
            "{}\nFor more info on error messages, check: "
            "https://cloud.google.com/apis/design/errors".format(error_message)
        )
    return ocr_text

In [55]:
# Menu image to OCR; the relative path is resolved against the current working directory.
image_path = "./Menu_8.png"
# `text` is the list of annotation strings returned by detect_text; later cells read text[0].
text = detect_text(image_path)

In [56]:
# Show the first element of the OCR result as the cell output.
text[0]


'\r\nBRUNCH\nsalmon\ncured\npaina\n&\ncream\n340.-\nlay low breakfast\nhouse cured salmon | dill cream\npickled shallot caper | sliced radish\ndill oil salad\ntomato & burrata\n280.-\negg & avocado\n260.\nscrambled eggs | baked beans | bacon\ngrilled mushroom & onion\nhouse-baked sourdough\nshrimp on toast\n300.-\n300.-\nmarinated tomatoes | burrata cheese\nparsley oil | house-baked sourdough\ngrilled cheese toast\n{260.-\nsmashed avocado | poached eggs\nsalad | house-baked sourdough\ncroque madame\nshrimp mayonnaise | herb yogurt\ndill oil shrimp oil pickled shallot\nhouse-baked sourdough & salad\n300.-\ncheddar, mozzarella, parmesan & gruyere cheese\nhouse-baked sourdough\ntomato sauce & salad\nhám cheddar | mozzarella | parmesan\ngruyere cheese mornay sauce\nhouse-baked sourdough\nsunny side up egg I tomato sauce & salad\n++ add on ++\n+sous vide egg\n+sourdough\n+avocado\n+burrata\n+cured salmon\n30-\n30-\n80.-\n100-\n100.-\nN\now'

In [53]:
import re
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

class MenuParser:
    """Extract menu items and prices from OCR text with an instruction-tuned LLM.

    The OCR text is normalised by :meth:`preprocess_text` and sent to the
    model. When the model's answer contains no parseable JSON array, the
    regex-based :meth:`_fallback_extraction` is used instead.
    """

    # A price token as it appears in the OCR output: digits, an optional dot
    # and one or more dashes ("340.-", "30-"). Shared by preprocessing and the
    # fallback so both agree on what a price looks like.
    _PRICE_PATTERN = r'\d+\.?-+'

    def __init__(self, model_name="mistralai/Mistral-7B-Instruct-v0.2"):
        """Initialize the menu parser with a specified open-source LLM.

        Args:
            model_name: The Hugging Face model to use for extraction
        """
        # Load model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,  # Use lower precision for efficiency
            device_map="auto"  # Automatically use available hardware
        )

    def preprocess_text(self, text):
        """Clean and normalize OCR text.

        Line breaks are normalised, runs of spaces are collapsed, and
        consecutive non-price lines are merged into one line. Every line that
        contains a price token is kept on its own line.

        Args:
            text: Raw OCR text.

        Returns:
            The cleaned text, lines joined with "\\n".
        """
        # Normalize line breaks
        text = re.sub(r'\r\n|\r', '\n', text)

        # Replace multiple spaces with single space
        text = re.sub(r' +', ' ', text)

        merged_lines = []
        pending = []  # non-price lines collected since the last price line

        for line in text.split('\n'):
            line = line.strip()
            if not line:
                continue

            # A price marks the end of an item: flush what was collected so
            # far and keep the price line separate.
            if re.search(self._PRICE_PATTERN, line):
                if pending:
                    merged_lines.append(" ".join(pending))
                    pending = []
                merged_lines.append(line)
            else:
                pending.append(line)

        # Don't forget text after the last price
        if pending:
            merged_lines.append(" ".join(pending))

        return "\n".join(merged_lines)

    def extract_items_and_prices(self, text):
        """Use the LLM to extract menu items and prices.

        Args:
            text: Raw OCR text of a menu.

        Returns:
            A list of dicts with 'item' and 'price' keys. It is parsed from the
            model's JSON answer, or produced by :meth:`_fallback_extraction`
            when no valid JSON array is found.
        """
        import json

        cleaned_text = self.preprocess_text(text)

        # The tokenizer prepends the BOS token itself, so the prompt must not
        # contain a literal "<s>" (that would yield a duplicated BOS).
        prompt = f"""[INST] 
        You are a specialized assistant for restaurant menu analysis.
        
        Extract all menu items and their prices from the following OCR-extracted menu text.
        Format your response as a JSON array of objects with 'item' and 'price' fields.
        For items with descriptions, include the main item name only.
        Only include items that have clear prices.
        
        Here's the OCR text:
        
        {cleaned_text}
        [/INST]"""

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,  # passes the attention mask along with the input ids
                max_new_tokens=2048,
                # Greedy decoding: sampling knobs such as temperature/top_p are
                # only honoured when do_sample=True, so they are left out
                # rather than silently ignored.
                do_sample=False,
                repetition_penalty=1.2,
                pad_token_id=self.tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens instead of searching the
        # decoded prompt + answer for a marker string.
        prompt_length = inputs["input_ids"].shape[-1]
        response = self.tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()

        # Use regex to isolate the JSON part
        json_match = re.search(r'\[\s*{.*}\s*\]', response, re.DOTALL)
        if json_match:
            try:
                return json.loads(json_match.group(0))
            except json.JSONDecodeError:
                # Malformed JSON from the model: fall through to the regex path
                pass
        return self._fallback_extraction(cleaned_text)

    def _fallback_extraction(self, text):
        """Fallback method using regex if LLM parsing fails.

        Args:
            text: Preprocessed menu text.

        Returns:
            A list of {"item": str, "price": str} dicts. The price keeps its
            original suffix (e.g. "340.-").
        """
        items_prices = []
        # A run of letters/whitespace/'&', then (lazily) anything up to the
        # next price token.
        pattern = r'([a-zA-Z\s&]+)(?:\n|.)*?(' + self._PRICE_PATTERN + r')'

        for match in re.finditer(pattern, text):
            item = match.group(1).strip()
            price = match.group(2).strip()
            # group(1) can be whitespace only; skip those matches
            if item:
                items_prices.append({"item": item, "price": price})

        return items_prices

# Example usage
def process_menu(ocr_text):
    """Parse OCR text with a fresh MenuParser, print the result and return it.

    Args:
        ocr_text: OCR text of a menu.

    Returns:
        The list returned by MenuParser.extract_items_and_prices.
    """
    extracted = MenuParser().extract_items_and_prices(ocr_text)

    # Summary line followed by one "<item> - <price>" line per entry
    print(f"Found {len(extracted)} menu items:")
    for entry in extracted:
        print(f"{entry['item']} - {entry['price']}")

    return extracted

# If run directly
if __name__ == "__main__":
    # Example OCR text
    # NOTE: `text` is not defined in this cell; it must already exist in the
    # namespace (it is assigned from detect_text elsewhere in the notebook).
    ocr_text = text[0]
    
    process_menu(ocr_text)

  from .autonotebook import tqdm as notebook_tqdm


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2.
401 Client Error. (Request ID: Root=1-67e679ba-0b5fd33103c690c10ae27691;03ab8e8b-ea57-479f-9402-dff07cb4053c)

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.2 is restricted. You must have access to it and be authenticated to access it. Please log in.

In [None]:
from PIL import Image, ImageDraw
# Open the menu image; the bare `image` expression makes it the cell's output.
image=Image.open(image_path)
image

Processing with the Hugging Face Inference API

In [57]:
import requests
import json
import re
import pandas as pd
from typing import List, Dict, Any

def extract_menu_with_huggingface(ocr_text: str) -> List[Dict[str, Any]]:
    """Extract menu items using the Hugging Face Inference API.

    If the environment variable HF_TOKEN (or HUGGINGFACEHUB_API_TOKEN) is set,
    it is sent as a bearer token; otherwise the request is unauthenticated.

    Args:
        ocr_text: OCR text of a restaurant menu.

    Returns:
        The parsed JSON array of {"item", "price"} objects from the model's
        answer. Returns an empty list when the request fails, the API reports
        an error, or no valid JSON array is found. A message is printed in
        each of those cases.
    """
    import os  # local import: this cell's import list does not include os

    API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"

    # The prompt ends at [/INST]. Appending the end-of-sequence marker "</s>"
    # there would close the sequence before the model has answered.
    prompt = f"""<s>[INST] I have OCR text from a restaurant menu. Please extract each menu item and its price, 
    and return a JSON array of objects with 'item' and 'price' fields.
    
    Please ignore category headers like "BEVERAGES", "coffee", "non-coffee", etc. Also ignore restaurant names.
    Format prices as integers (e.g., "85.-" should be 85).
    
    Here's the OCR text:
    {ocr_text}
    
    Return only a valid JSON array like:
    [
      {{"item": "americano", "price": 85}},
      {{"item": "caffe latte", "price": 100}}
    ]
    [/INST]"""

    headers = {"Content-Type": "application/json"}
    token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
    if token:
        headers["Authorization"] = f"Bearer {token}"

    data = {
        "inputs": prompt,
        # return_full_text=False asks the API not to echo the prompt back.
        "parameters": {"max_new_tokens": 1024, "return_full_text": False},
    }

    try:
        response = requests.post(API_URL, headers=headers, json=data, timeout=60)
        result = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Network failure or a non-JSON body: report it and let the caller
        # fall back instead of crashing.
        print(f"Hugging Face API request failed: {exc}")
        return []

    if isinstance(result, dict) and "error" in result:
        print(f"Hugging Face API error: {result['error']}")
        return []

    if (
        isinstance(result, list)
        and result
        and isinstance(result[0], dict)
        and "generated_text" in result[0]
    ):
        response_text = result[0]["generated_text"]
        # If the prompt was echoed anyway, keep only the text after the
        # instruction terminator; the prompt itself contains an example JSON
        # array that the regex below would otherwise pick up.
        response_text = response_text.rsplit("[/INST]", 1)[-1]
        json_match = re.search(r'\[\s*\{.*\}\s*\]', response_text, re.DOTALL)
        if json_match:
            try:
                return json.loads(json_match.group(0))
            except json.JSONDecodeError:
                print("Error parsing JSON from response")
                return []

    print("Unexpected response format from API")
    return []

# Example usage
def process_menu(ocr_text: str) -> pd.DataFrame:
    """Turn menu OCR text into a DataFrame of items and prices.

    The Hugging Face extraction is tried first. When it yields nothing, the
    regex-based extraction is used on the same text.
    """
    extracted = extract_menu_with_huggingface(ocr_text)
    if not extracted:
        # Nothing usable from the API: fall back to basic pattern matching
        return basic_pattern_extraction(ocr_text)
    return pd.DataFrame(extracted)

def basic_pattern_extraction(ocr_text: str) -> pd.DataFrame:
    """Regex fallback: pair each candidate line with a price on the next line.

    A line is a candidate when it is non-empty, is changed by upper-casing,
    and is longer than two characters. It is kept when the following line
    contains digits followed by an optional dot and at least one dash.

    Returns:
        A DataFrame built from {"item", "price"} records (price as int).
    """
    rows = [raw.strip() for raw in ocr_text.strip().split('\n')]
    price_re = re.compile(r'(\d+)\.?-+')
    found = []

    # Walk over (line, following line) pairs
    for name, following in zip(rows, rows[1:]):
        # Empty lines and lines without lowercase text (headers, bare prices)
        if not name or name == name.upper():
            continue

        hit = price_re.search(following)
        if hit is None:
            continue

        if len(name) > 2 and not name.isupper():
            found.append({"item": name, "price": int(hit.group(1))})

    return pd.DataFrame(found)



Processing with the Ollama API

In [20]:

import requests
import json
import re
import pandas as pd
from typing import List, Dict, Any

def process_with_ollama(
    ocr_text: str, model: str = "llama2", timeout: float = 120.0
) -> List[Dict[str, Any]]:
    """
    Process menu text using Ollama local LLM.

    Prerequisites:
    1. Install Ollama from https://ollama.ai/
    2. Run: ollama pull mistral or ollama pull llama2
    3. Start Ollama server

    Args:
        ocr_text: OCR text of a restaurant menu.
        model: Name of the Ollama model to query (e.g. "llama2" or "mistral").
        timeout: Seconds to wait for the HTTP reply before giving up.

    Returns:
        The parsed JSON array of {"item", "price"} objects. Returns an empty
        list when the server cannot be reached, reports an error, or answers
        without a valid JSON array. A message is printed in each case.
    """
    # Ollama API endpoint (running locally)
    API_URL = "http://localhost:11434/api/generate"

    # Prompt with clear instructions
    prompt = f"""
    Task: Extract menu items and prices from this OCR text of a restaurant menu.
    
    OCR Text:
    {ocr_text}
    
    Instructions:
    1. Identify each menu item and its corresponding price
    2. Ignore category headers (like "BEVERAGES", "coffee", etc.)
    3. Ignore restaurant names or repeated text at the bottom
    4. Format each price as an integer (remove the ".-" suffix)
    
    Return ONLY a valid JSON array with this structure:
    [
      {{"item": "item name", "price": price_as_integer}},
      ...
    ]
    """

    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }

    try:
        response = requests.post(API_URL, json=payload, timeout=timeout)
        result = response.json()
    except ValueError as e:
        # The server answered, but not with JSON
        print(f"Ollama returned a reply that is not valid JSON: {e}")
        return []
    except requests.RequestException as e:
        print(f"Error connecting to Ollama: {e}")
        print("Make sure Ollama is installed and running locally")
        return []

    if isinstance(result, dict):
        if "error" in result:
            # e.g. the requested model has not been pulled
            print(f"Ollama error: {result['error']}")
            return []

        if "response" in result:
            # Extract JSON array from the response
            json_match = re.search(r'\[\s*\{.*\}\s*\]', result["response"], re.DOTALL)
            if json_match:
                try:
                    return json.loads(json_match.group(0))
                except json.JSONDecodeError:
                    print("Error parsing JSON from Ollama response")
                    return []

    print("Unexpected response format from Ollama")
    return []

def categorize_items(menu_items: List[Dict[str, Any]]) -> pd.DataFrame:
    """Categorize menu items based on their names.

    Args:
        menu_items: Records with at least an 'item' key.

    Returns:
        A DataFrame of the records with an added 'category' column. The
        category is the first one (in the order COFFEE, TEA, NON-COFFEE)
        with a keyword contained in the lower-cased item name, else 'OTHER'.
        Empty input, or records without an 'item' field, yield a frame whose
        'category' is 'OTHER' throughout instead of raising.
    """
    df = pd.DataFrame(menu_items)

    # Define category patterns (checked in this order; first match wins)
    categories = {
        'COFFEE': ['americano', 'espresso', 'latte', 'cappuccino', 'mocha', 'macchiato', 'flat white', 'cortado', 'affogato'],
        'TEA': ['tea'],
        'NON-COFFEE': ['chocolate', 'frappe', 'soda', 'lemonade', 'passion fruit', 'peach']
    }

    def _category_for(name: Any) -> str:
        """Return the category for one item name; non-strings are 'OTHER'."""
        if not isinstance(name, str):
            return 'OTHER'
        lowered = name.lower()
        for cat, keywords in categories.items():
            if any(keyword in lowered for keyword in keywords):
                return cat
        return 'OTHER'

    if 'item' not in df.columns:
        # Nothing to match against (e.g. empty input)
        df['category'] = 'OTHER'
        return df

    df['category'] = [_category_for(name) for name in df['item']]
    return df

def process_menu(ocr_text: str) -> pd.DataFrame:
    """Process menu OCR text into a structured DataFrame.

    Ollama is tried first. If it returns no items, or processing its result
    raises, the regex-based extraction is used instead.

    Args:
        ocr_text: OCR text of a restaurant menu.

    Returns:
        A DataFrame of menu items.
    """
    try:
        menu_items = process_with_ollama(ocr_text)
        if menu_items:
            return categorize_items(menu_items)
    except Exception as exc:
        # Only ordinary errors trigger the fallback; KeyboardInterrupt and
        # SystemExit propagate so the cell can still be interrupted.
        print(f"Ollama processing failed ({exc}), falling back to pattern matching")

    # Fallback to pattern matching
    return basic_pattern_extraction(ocr_text)

def basic_pattern_extraction(ocr_text: str) -> pd.DataFrame:
    """Regex fallback that pairs item lines with the price on the next line.

    A line is kept when it is non-empty, is changed by upper-casing, is longer
    than two characters, and the following line contains digits followed by
    an optional dot and at least one dash.

    Returns:
        An empty DataFrame when nothing matched. Otherwise the result of
        categorize_items applied to the matched records.
    """
    stripped = [raw.strip() for raw in ocr_text.strip().split('\n')]
    records = []

    # The last line has no successor, so it can never be an item
    for idx, candidate in enumerate(stripped[:-1]):
        # Empty lines and lines without lowercase text are not items
        if not candidate or candidate.upper() == candidate:
            continue

        found = re.search(r'(\d+)\.?-+', stripped[idx + 1])
        if found and len(candidate) > 2 and not candidate.isupper():
            records.append({"item": candidate, "price": int(found.group(1))})

    frame = pd.DataFrame(records)
    if frame.empty:
        return frame
    # Round-trip through records so categorize_items receives plain dicts
    return categorize_items(frame.to_dict('records'))

In [58]:
# To use in your notebook:

ocr_text = text[0]  # Get the full text from first element

# Process the menu
# NOTE: this notebook defines `process_menu` in several cells; the call below
# uses whichever definition was executed most recently.
menu_df = process_menu(ocr_text)
print(menu_df)


Unexpected response format from API
                            item  price
0                          cream    340
1               tomato & burrata    280
2                shrimp on toast    300
3           grilled cheese toast    260
4  house-baked sourdough & salad    300
5                  +cured salmon     30


In [33]:
import re
import pandas as pd
from typing import List, Dict, Any, Tuple

def clean_text(ocr_text: str) -> str:
    """Tidy raw OCR text.

    Each '--' pair is replaced by a single '-' (one left-to-right pass), then
    all carriage returns are removed.
    """
    return ocr_text.replace('--', '-').replace('\r', '')

def extract_menu_items(ocr_text: str) -> List[Dict[str, Any]]:
    """Extract menu items using rule-based pattern matching.

    The text is cleaned with clean_text and scanned line by line:

    * an upper-case line longer than two characters becomes the current
      category;
    * known header words are skipped;
    * a line made up of just a number and punctuation (a bare price such as
      "300.-") is never used as an item name, so runs of consecutive price
      lines do not create bogus items;
    * any other line followed by a line containing a price becomes an item
      (if longer than two characters), and the price line is consumed.

    Args:
        ocr_text: Raw OCR text of a menu.

    Returns:
        A list of {"category", "item", "price"} dicts, price as int.
    """
    lines = [raw.strip() for raw in clean_text(ocr_text).strip().split('\n')]

    # Define words to skip (category headers, etc.)
    skip_words = {'beverages', 'coffee', 'non-coffee', 'organic tea', 'add-on'}

    price_pattern = re.compile(r'(\d+)\.?-+')
    # A line holding only one number plus surrounding punctuation
    price_only = re.compile(r'\W*\d+\W*')

    menu_items = []
    current_category = "UNCATEGORIZED"

    i = 0
    while i < len(lines):
        line = lines[i]
        i += 1  # from here on, i indexes the line after `line`

        # Skip empty lines
        if not line:
            continue

        # Category header
        if line.isupper() and len(line) > 2:
            current_category = line
            continue

        # Known headers and bare prices are not item names
        if line.lower() in skip_words or price_only.fullmatch(line):
            continue

        # Look ahead for a price on the next line
        if i >= len(lines):
            continue
        price_match = price_pattern.search(lines[i])
        if not price_match:
            continue

        # Very short names are dropped, but their price line is still consumed
        if len(line) > 2:
            menu_items.append({
                "category": current_category,
                "item": line,
                "price": int(price_match.group(1))
            })
        i += 1  # skip the price line

    return menu_items

def categorize_items(menu_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Fill in a category for items still marked "UNCATEGORIZED".

    The item name is lower-cased and checked against keyword lists in the
    order COFFEE, TEA, NON-COFFEE; the first list with a keyword contained in
    the name wins. Items are updated in place and the same list is returned.
    """
    keyword_table = (
        ("COFFEE", ['coffee', 'americano', 'espresso', 'latte', 'cappuccino', 'mocha', 'macchiato', 'flat white', 'cortado', 'affogato']),
        ("TEA", ['tea']),
        ("NON-COFFEE", ['chocolate', 'frappe', 'soda', 'lemonade']),
    )

    for entry in menu_items:
        # Items that already carry a real category are left untouched
        if entry["category"] != "UNCATEGORIZED":
            continue

        lowered = entry["item"].lower()
        label = next(
            (name for name, words in keyword_table if any(word in lowered for word in words)),
            None,
        )
        if label is not None:
            entry["category"] = label

    return menu_items

def identify_add_ons(menu_items: List[Dict[str, Any]], ocr_text: str) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Split menu items into regular items and add-ons.

    Add-ons come from two sources:

    1. "- name + price" entries found in the text that follows the first
       "add-on" (case-insensitive) in ``ocr_text``;
    2. menu items with a '+' in their name, or in their price when the price
       is a string, provided the price parses as an int once '+' is removed.

    A menu item whose name equals an add-on name collected so far is dropped.

    Returns:
        (regular_items, add_ons)
    """
    extras = []

    # Source 1: the explicit add-on section of the OCR text
    section = re.search(r'add-on(.*?)(?:\n\w+:|$)', ocr_text, re.DOTALL | re.IGNORECASE)
    if section is not None:
        extras.extend(
            {"category": "ADD-ON", "item": label.strip(), "price": int(amount)}
            for label, amount in re.findall(r'-\s*(.*?)\s*\+\s*(\d+)\.?-+', section.group(1))
        )

    # Source 2: '+'-marked entries among the extracted menu items
    mains = []
    for entry in menu_items:
        name = entry["item"]
        if any(extra["item"] == name for extra in extras):
            continue

        cost = entry["price"]
        plus_marked = "+" in name or (isinstance(cost, str) and "+" in cost)
        if not plus_marked:
            mains.append(entry)
            continue

        try:
            parsed = int(str(cost).replace("+", "").strip())
        except ValueError:
            # Price cannot be parsed: keep the entry as a regular item
            mains.append(entry)
        else:
            extras.append({
                "category": "ADD-ON",
                "item": name.replace("+", "").strip(),
                "price": parsed
            })

    return mains, extras

def process_menu(ocr_text: str) -> pd.DataFrame:
    """Build a menu DataFrame from OCR text using the rule-based pipeline.

    Steps: extract items, categorize them, separate add-ons, then combine
    regular items and add-ons into one frame. A non-empty frame is sorted by
    category and item.
    """
    extracted = extract_menu_items(ocr_text)
    mains, extras = identify_add_ons(categorize_items(extracted), ocr_text)

    table = pd.DataFrame(mains + extras)
    # An empty frame has no columns to sort by, so it is returned as is
    return table if table.empty else table.sort_values(by=["category", "item"])



In [36]:
# Example usage
# `detect_text` and `image_path` are not defined in this cell; they must
# already exist in the notebook namespace.
text = detect_text(image_path)
ocr_text = text[0]  # Get the full text

# Process the menu
menu_df = process_menu(ocr_text)
print(menu_df)

     category                      item  price
26     ADD-ON         - extra shot 25.-     50
27     ADD-ON           - oat milk 25.-    120
23     ADD-ON                extra shot     25
25     ADD-ON                  oat milk     25
24     ADD-ON             special beans     50
17  BEVERAGES               affogato***    140
19  BEVERAGES            black rose tea    120
1   BEVERAGES               caffe latte    100
3   BEVERAGES               caffe mocha    125
15  BEVERAGES            cappuccino (H)    110
5   BEVERAGES         caramel macchiato    140
13  BEVERAGES               cortado (H)    110
9   BEVERAGES                     dirty    110
21  BEVERAGES             earl grey tea    120
20  BEVERAGES     english breakfast tea    120
16  BEVERAGES              espresso (H)     70
11  BEVERAGES            flat white (H)    110
18  BEVERAGES                 green tea    120
22  BEVERAGES         herbal blends tea    120
14  BEVERAGES             lemonade soda    120
12  BEVERAGES

In [45]:
import requests

import os

API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-base"
# Read the access token from the HF_TOKEN environment variable. Tokens must
# never be written into the notebook: anything committed or shared with the
# file is exposed and has to be revoked.
headers = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}
data = {"inputs": "Translate to French: Hello, how are you?"}

# Bound the wait so a stalled request cannot hang the cell indefinitely.
response = requests.post(API_URL, headers=headers, json=data, timeout=30)
print(response.status_code)
print(response.json())

200
[{'generated_text': "Bonjour, c'est-à-dire?"}]
