In [None]:
import os
# Show what is in the image folder (the same path CONFIG['IMAGE_GALLERY_PATH'] uses below).
os.listdir('../../../datasets/images/text_ocr')

In [None]:
# Generic OCR Image Text Search System with GUI
import os, json, sqlite3, shutil, warnings, time
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Optional
import numpy as np
from PIL import Image
warnings.filterwarnings("ignore")

# Disable analytics to prevent connection errors
# NOTE(review): these are set before the OCR/LLM packages are imported further down;
# whether each package actually reads its variable is not visible here — confirm per package.
os.environ['DISABLE_TELEMETRY'] = '1'
os.environ['PADDLEOCR_DISABLE_TELEMETRY'] = '1'
os.environ['HUB_ANALYTICS'] = 'False'

# Configuration with OCR options
# Default settings; GenericOCRSearchSystem copies this dict when no config is passed,
# and the GUI overrides a few keys from its widgets.
CONFIG = {
    'OCR_MODEL': 'paddleocr',  # must be one of the OCR_MODELS keys
    'OCR_CONFIDENCE_THRESHOLD': 0.6,  # detections below this are dropped (paddleocr/easyocr branches)
    'LLM_PROVIDER': 'ollama',  # _init_llm handles 'ollama' and 'transformers'
    'LLM_MODEL': 'llama3.2:latest',
    'LLM_BASE_URL': 'http://localhost:11434',  # Ollama server queried by check_ollama
    'LLM_TEMPERATURE': 0.1,
    'LLM_MAX_TOKENS': 1000,  # only passed to the 'transformers' pipeline
    'EMBEDDING_PROVIDER': 'huggingface',  # NOTE(review): not read by the code visible in this file
    'EMBEDDING_MODEL': 'sentence-transformers/all-MiniLM-L6-v2',
    'VECTOR_STORE_PATH': './image_vectorstore',  # Chroma persist directory
    'CHUNK_SIZE': 500,  # text splitter chunk size
    'CHUNK_OVERLAP': 50,  # text splitter chunk overlap
    'IMAGE_GALLERY_PATH': '../../../datasets/images/text_ocr',  # folder scanned by process_images
    'DATABASE_PATH': './image_analysis_db',  # directory holding image_text.db
    'RESULTS_OUTPUT_PATH': '../../../datasets/images/forensic_results',  # export_results writes here
    'BATCH_SIZE': 4,  # NOTE(review): not read by the code visible in this file
    'MAX_RESULTS_DISPLAY': 10,  # NOTE(review): not read by the code visible in this file
}

# OCR model options for different text types
# Keys are the values accepted for CONFIG['OCR_MODEL']; 'name' is printed when the
# engine loads and 'strengths' is shown by the GUI setup test. The descriptions are
# display text only and do not influence engine selection.
OCR_MODELS = {
    'paddleocr': {'name': 'PaddleOCR', 'desc': 'Best overall, 80+ languages, handwriting capable', 'strengths': 'Asian text, complex layouts'},
    'easyocr': {'name': 'EasyOCR', 'desc': 'Good for European languages, simple setup', 'strengths': 'European text, clean images'},
    'tesseract': {'name': 'Tesseract', 'desc': 'Classic OCR, best for clean printed text', 'strengths': 'Printed documents, stable'},
    'trocr': {'name': 'TrOCR (Transformers)', 'desc': 'AI-based, excellent for handwriting', 'strengths': 'Handwritten notes, degraded images'}
}

# Startup banner showing the configured LLM provider and model.
print(f"🔍 GENERIC OCR TEXT SEARCH | LLM: {CONFIG['LLM_PROVIDER']}-{CONFIG['LLM_MODEL']} | 🔒 100% Private")

# Quick dependency check
def check_deps():
    """Print one status line per required package, based on whether it imports."""
    required = ('paddleocr', 'langchain', 'chromadb', 'sentence_transformers')
    for package in required:
        module_name = package.replace('-', '_')
        try:
            __import__(module_name)
        except ImportError:
            print(f"❌ {package} - Install: pip install {package}")
        else:
            print(f"✅ {package}")

def check_ollama():
    """Check that the Ollama server answers and offers the configured model.

    Queries ``{CONFIG['LLM_BASE_URL']}/api/tags`` with a 5 second timeout and
    prints the model names the server reports.

    Returns:
        True only when the server answers with HTTP 200 and
        ``CONFIG['LLM_MODEL']`` is among the reported model names; False for
        any other status code or when the request/parsing fails.
    """
    try:
        import requests
        r = requests.get(f"{CONFIG['LLM_BASE_URL']}/api/tags", timeout=5)
        if r.status_code == 200:
            models = [m['name'] for m in r.json().get('models', [])]
            print(f"✅ Ollama: {models}")
            return CONFIG['LLM_MODEL'] in models
        return False
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        print("❌ Ollama not running")
        return False

# Run the environment checks as soon as this cell executes. ollama_ready is only
# probed for the 'ollama' provider; any other provider is treated as ready.
check_deps()
ollama_ready = check_ollama() if CONFIG['LLM_PROVIDER'] == 'ollama' else True

# Safe OCR initialization
def init_ocr_safe():
    """Build an English PaddleOCR engine, or return None (after printing why) on failure."""
    engine = None
    try:
        from paddleocr import PaddleOCR
        engine = PaddleOCR(use_textline_orientation=True, lang='en')
    except Exception as exc:
        print(f"OCR initialization error: {exc}")
    return engine

# Generic Image Text Search System
class GenericOCRSearchSystem:
    def __init__(self, config=None):
        self.config = config or CONFIG.copy()
        self._setup_dirs()
        self._init_db()
        self._init_ocr()
        self._init_llm()

    def _setup_dirs(self):
        for k in ['IMAGE_GALLERY_PATH', 'DATABASE_PATH', 'RESULTS_OUTPUT_PATH', 'VECTOR_STORE_PATH']:
            os.makedirs(self.config[k], exist_ok=True)

    def _init_db(self):
        db_path = os.path.join(self.config['DATABASE_PATH'], 'image_text.db')
        self.conn = sqlite3.connect(db_path)
        self.conn.execute('''CREATE TABLE IF NOT EXISTS image_texts (
            id INTEGER PRIMARY KEY,
            filename TEXT UNIQUE,
            full_path TEXT,
            extracted_text TEXT,
            ocr_confidence REAL,
            processed_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            text_analysis TEXT,
            relevance_score REAL,
            vector_stored BOOLEAN DEFAULT 0
        )''')
        self.conn.execute('''CREATE TABLE IF NOT EXISTS text_searches (
            id INTEGER PRIMARY KEY,
            query TEXT,
            search_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            results_json TEXT,
            matched_images TEXT,
            model_used TEXT,
            processing_time REAL
        )''')
        self.conn.commit()

    def _init_ocr(self):
        try:
            ocr_model = self.config['OCR_MODEL']
            if ocr_model == 'paddleocr':
                from paddleocr import PaddleOCR
                self.ocr = init_ocr_safe()
            elif ocr_model == 'easyocr':
                import easyocr
                self.ocr = easyocr.Reader(['en'], gpu=False)
            elif ocr_model == 'tesseract':
                import pytesseract
                self.ocr = pytesseract
            elif ocr_model == 'trocr':
                from transformers import TrOCRProcessor, VisionEncoderDecoderModel
                self.ocr_processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed')
                self.ocr_model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-printed')
                self.ocr = 'trocr'
            print(f"✅ {OCR_MODELS[ocr_model]['name']} ready")
        except Exception as e:
            print(f"❌ OCR error: {e}");
            self.ocr = None

    def _init_llm(self):
        try:
            if self.config['LLM_PROVIDER'] == 'ollama':
                from langchain_community.llms import Ollama
                self.llm = Ollama(
                    model=self.config['LLM_MODEL'],
                    base_url=self.config['LLM_BASE_URL'],
                    temperature=self.config['LLM_TEMPERATURE']
                )
            elif self.config['LLM_PROVIDER'] == 'transformers':
                from langchain_community.llms import HuggingFacePipeline
                from transformers import pipeline
                pipe = pipeline("text-generation", model=self.config['LLM_MODEL'],
                              max_new_tokens=self.config['LLM_MAX_TOKENS'])
                self.llm = HuggingFacePipeline(pipeline=pipe)

            from langchain_community.embeddings import HuggingFaceEmbeddings
            from langchain_community.vectorstores import Chroma
            from langchain.text_splitter import RecursiveCharacterTextSplitter

            self.embeddings = HuggingFaceEmbeddings(
                model_name=self.config['EMBEDDING_MODEL'],
                model_kwargs={'device': 'cpu'}
            )
            self.vectorstore = Chroma(
                persist_directory=self.config['VECTOR_STORE_PATH'],
                embedding_function=self.embeddings
            )
            self.text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=self.config['CHUNK_SIZE'],
                chunk_overlap=self.config['CHUNK_OVERLAP']
            )
            print("✅ LangChain ready")
        except Exception as e:
            print(f"❌ LLM error: {e}")

    def extract_text(self, img_path: str) -> Dict[str, Any]:
        """Extract text from image using selected OCR model"""
        try:
            img = Image.open(img_path).convert("RGB")
            result = {
                'filename': os.path.basename(img_path),
                'full_path': img_path,
                'extracted_text': '',
                'ocr_confidence': 0.0,
                'error': None
            }

            if self.ocr:
                ocr_model = self.config['OCR_MODEL']

                if ocr_model == 'paddleocr':
                    ocr_result = self.ocr.ocr(np.array(img), cls=True)
                    if ocr_result and ocr_result[0]:
                        texts, confs = [], []
                        for line in ocr_result:
                            for detection in line:
                                bbox, (text, conf) = detection
                                if conf >= self.config['OCR_CONFIDENCE_THRESHOLD']:
                                    texts.append(text); confs.append(conf)
                        result.update({
                            'extracted_text': ' '.join(texts),
                            'ocr_confidence': np.mean(confs) if confs else 0.0
                        })

                elif ocr_model == 'easyocr':
                    ocr_result = self.ocr.readtext(np.array(img))
                    texts, confs = [], []
                    for detection in ocr_result:
                        bbox, text, conf = detection
                        if conf >= self.config['OCR_CONFIDENCE_THRESHOLD']:
                            texts.append(text); confs.append(conf)
                    result.update({
                        'extracted_text': ' '.join(texts),
                        'ocr_confidence': np.mean(confs) if confs else 0.0
                    })

                elif ocr_model == 'tesseract':
                    import pytesseract
                    text = pytesseract.image_to_string(img, config='--psm 6')
                    if text.strip():
                        result.update({
                            'extracted_text': text.strip(),
                            'ocr_confidence': 0.8
                        })

                elif ocr_model == 'trocr':
                    pixel_values = self.ocr_processor(images=img, return_tensors="pt").pixel_values
                    generated_ids = self.ocr_model.generate(pixel_values)
                    text = self.ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
                    if text.strip():
                        result.update({
                            'extracted_text': text.strip(),
                            'ocr_confidence': 0.9
                        })

            return result
        except Exception as e:
            return {
                'filename': os.path.basename(img_path),
                'full_path': img_path,
                'extracted_text': '',
                'ocr_confidence': 0.0,
                'error': str(e)
            }

    def analyze_text_relevance(self, text: str, filename: str, search_context: str = "") -> Dict[str, Any]:
        """Analyze text relevance for search queries"""
        prompt = f'''Analyze the text content from image '{filename}':
Text: {text}
Context: {search_context if search_context else "General text analysis"}

Respond with JSON only:
{{
    "contains_relevant_text": true/false,
    "relevance_score": 0.0-1.0,
    "key_phrases": ["phrase1", "phrase2"],
    "content_summary": "brief summary",
    "text_categories": ["category1", "category2"],
    "searchable_keywords": ["keyword1", "keyword2"]
}}'''

        try:
            response = str(self.llm.invoke(prompt)).strip()
            response = response.replace('```json', '').replace('```', '').strip()
            start, end = response.find('{'), response.rfind('}') + 1

            if start >= 0 and end > start:
                parsed = json.loads(response[start:end])
                return {
                    'contains_relevant_text': parsed.get('contains_relevant_text', False),
                    'relevance_score': parsed.get('relevance_score', 0.0),
                    'key_phrases': parsed.get('key_phrases', []),
                    'content_summary': parsed.get('content_summary', ''),
                    'text_categories': parsed.get('text_categories', []),
                    'searchable_keywords': parsed.get('searchable_keywords', [])
                }
            else:
                return self._fallback_analysis(text, filename)
        except:
            return self._fallback_analysis(text, filename)

    def _fallback_analysis(self, text: str, filename: str) -> Dict[str, Any]:
        """Fallback analysis when LLM fails"""
        words = text.lower().split()
        return {
            'contains_relevant_text': len(words) > 3,
            'relevance_score': min(len(words) / 50.0, 1.0),
            'key_phrases': words[:5],
            'content_summary': f'Text with {len(words)} words',
            'text_categories': ['text_document'],
            'searchable_keywords': words[:10]
        }

    def add_to_vectorstore(self, text: str, metadata: Dict[str, Any]):
        """Add text to vector store for similarity search"""
        try:
            from langchain_core.documents import Document
            chunks = self.text_splitter.split_text(text)
            docs = [Document(page_content=chunk, metadata={**metadata, 'chunk_id': i})
                   for i, chunk in enumerate(chunks)]
            self.vectorstore.add_documents(docs)
            self.vectorstore.persist()
            return True
        except:
            return False

    def process_images(self):
        """Process all images in the gallery to extract text"""
        start_time = time.time()
        gallery_path = Path(self.config['IMAGE_GALLERY_PATH'])
        print(f"🔍 PROCESSING: {gallery_path} | OCR: {self.config['OCR_MODEL']}")

        if not gallery_path.exists():
            print(f"❌ Path not found: {gallery_path}");
            return

        exts = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
        files = [f for ext in exts
                for f in list(gallery_path.glob(f"**/*{ext}")) +
                         list(gallery_path.glob(f"**/*{ext.upper()}"))]
        total, processed, vector_stored = len(files), 0, 0

        print(f"📸 Found {total} images")
        if total == 0:
            print("⚠️ No images found!");
            return

        for i, img_path in enumerate(files, 1):
            filename = img_path.name
            print(f"[{i:3d}/{total}] ({i/total*100:5.1f}%) {filename[:40]}...", end='')

            # Skip if already processed
            if self.conn.execute("SELECT id FROM image_texts WHERE filename = ?", (filename,)).fetchone():
                print(" ⏭️ SKIP");
                continue

            # Extract text
            ocr_result = self.extract_text(img_path)
            if ocr_result.get('error'):
                print(" ❌ ERROR");
                continue

            text = ocr_result['extracted_text']
            analysis = None

            if text.strip():
                analysis = self.analyze_text_relevance(text, filename)
                metadata = {
                    'filename': filename,
                    'full_path': str(img_path),
                    'ocr_confidence': ocr_result['ocr_confidence']
                }
                if self.add_to_vectorstore(text, metadata):
                    vector_stored += 1

            # Store in database
            relevance = analysis['relevance_score'] if analysis else 0.0
            self.conn.execute('''INSERT OR REPLACE INTO image_texts
                (filename, full_path, extracted_text, ocr_confidence, text_analysis,
                 relevance_score, vector_stored) VALUES (?,?,?,?,?,?,?)''',
                (filename, str(img_path), text, ocr_result['ocr_confidence'],
                 json.dumps(analysis) if analysis else None, relevance, vector_stored > 0))

            if text.strip():
                print(f" ✅ TEXT ({len(text.split())} words)")
            else:
                print(" ⚪ NO TEXT")

            processed += 1
            if i % 10 == 0:
                self.conn.commit()

        self.conn.commit()
        print(f"\n🔍 COMPLETE: {processed} processed, {vector_stored} vectorized, {time.time()-start_time:.1f}s")

    def search_text(self, query: str) -> Dict[str, Any]:
        """Search for text across all processed images"""
        print(f"🔍 SEARCHING: '{query}'")
        try:
            # Use vector similarity search
            docs = self.vectorstore.similarity_search(query, k=10)
            if not docs:
                return {
                    'matches': [],
                    'summary': 'No matches found',
                    'total_matches': 0
                }

            # Group results by image
            image_matches = {}
            for doc in docs:
                filename = doc.metadata.get('filename', 'unknown')
                if filename not in image_matches:
                    image_matches[filename] = {
                        'filename': filename,
                        'full_path': doc.metadata.get('full_path', ''),
                        'text_chunks': [],
                        'relevance_score': 0.0
                    }
                image_matches[filename]['text_chunks'].append(doc.page_content)

            # Get additional info from database
            matches = []
            for filename, data in image_matches.items():
                db_result = self.conn.execute('''SELECT extracted_text, ocr_confidence, text_analysis
                    FROM image_texts WHERE filename = ?''', (filename,)).fetchone()

                if db_result:
                    full_text, confidence, analysis = db_result

                    # Calculate relevance based on query match
                    relevance = self._calculate_relevance(query, full_text)

                    matches.append({
                        'filename': filename,
                        'full_path': data['full_path'],
                        'extracted_text': full_text,
                        'ocr_confidence': confidence,
                        'relevance_score': relevance,
                        'matching_chunks': data['text_chunks'][:3],  # Top 3 chunks
                        'analysis': json.loads(analysis) if analysis else None
                    })

            # Sort by relevance
            matches.sort(key=lambda x: x['relevance_score'], reverse=True)

            results = {
                'matches': matches,
                'summary': f"Found {len(matches)} images with matching text",
                'total_matches': len(matches),
                'query': query
            }

            # Store search in database
            self.conn.execute('''INSERT INTO text_searches
                (query, results_json, matched_images, model_used) VALUES (?,?,?,?)''',
                (query, json.dumps(results),
                 json.dumps([m['filename'] for m in matches]),
                 f"{self.config['LLM_PROVIDER']}-{self.config['LLM_MODEL']}"))
            self.conn.commit()

            print(f"✅ Found {len(matches)} matches")
            return results

        except Exception as e:
            print(f"❌ Error: {e}")
            return {
                'matches': [],
                'summary': f'Search error: {e}',
                'total_matches': 0
            }

    def _calculate_relevance(self, query: str, text: str) -> float:
        """Calculate relevance score between query and text"""
        query_words = set(query.lower().split())
        text_words = set(text.lower().split())

        if not query_words or not text_words:
            return 0.0

        # Simple word overlap scoring
        overlap = len(query_words.intersection(text_words))
        return min(overlap / len(query_words), 1.0)

    def get_all_texts(self, min_confidence=0.5):
        """Get all extracted texts above confidence threshold"""
        return self.conn.execute('''SELECT filename, full_path, extracted_text,
            ocr_confidence, text_analysis FROM image_texts
            WHERE ocr_confidence >= ? ORDER BY ocr_confidence DESC''',
            (min_confidence,)).fetchall()

    def export_results(self, results: Dict[str, Any], query: str) -> str:
        """Export search results to files"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        query_safe = query.replace(' ', '_')[:30]
        output_path = Path(self.config['RESULTS_OUTPUT_PATH']) / f"search_{query_safe}_{timestamp}"
        output_path.mkdir(parents=True, exist_ok=True)

        # Copy matching images
        copied = []
        for i, match in enumerate(results.get('matches', []), 1):
            if Path(match['full_path']).exists():
                dest = output_path / f"{i:03d}_{match['filename']}"
                shutil.copy2(match['full_path'], dest)
                copied.append(str(dest))

        # Save reports
        report = {
            'query': query,
            'timestamp': timestamp,
            'ocr_model': self.config['OCR_MODEL'],
            'llm_model': f"{self.config['LLM_PROVIDER']}-{self.config['LLM_MODEL']}",
            'results': results,
            'copied_files': copied
        }

        with open(output_path / "search_report.json", 'w') as f:
            json.dump(report, f, indent=2)

        with open(output_path / "summary.txt", 'w') as f:
            f.write(f"IMAGE TEXT SEARCH REPORT\n")
            f.write(f"Query: {query}\n")
            f.write(f"Date: {timestamp}\n")
            f.write(f"OCR: {self.config['OCR_MODEL']}\n")
            f.write(f"LLM: {self.config['LLM_PROVIDER']}-{self.config['LLM_MODEL']}\n")
            f.write(f"Matches: {len(results.get('matches', []))}\n")
            f.write(f"Summary: {results.get('summary', 'N/A')}\n\n")

            for i, match in enumerate(results.get('matches', []), 1):
                f.write(f"{i}. {match['filename']} (Confidence: {match['ocr_confidence']:.3f})\n")
                f.write(f"   Text: {match['extracted_text'][:200]}...\n\n")

        print(f"📋 Results exported to: {output_path}")
        return str(output_path)

    def close(self):
        """Close database connection"""
        if hasattr(self, 'conn') and self.conn:
            self.conn.close()

# GUI Interface
import ipywidgets as widgets
from IPython.display import display, clear_output
import matplotlib.pyplot as plt

def create_search_gui():
    """Create interactive GUI for the OCR search system"""

    # Configuration widgets
    ocr_selector = widgets.Dropdown(
        options=[
            ('PaddleOCR - Best overall, 80+ languages', 'paddleocr'),
            ('EasyOCR - Good for European languages', 'easyocr'),
            ('Tesseract - Classic OCR for printed text', 'tesseract'),
            ('TrOCR - AI-based OCR for handwriting', 'trocr')
        ],
        value='paddleocr',
        description='OCR Engine:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='500px')
    )

    llm_provider = widgets.Dropdown(
        options=[('Ollama', 'ollama'), ('Transformers', 'transformers')],
        value='ollama',
        description='LLM Provider:',
        style={'description_width': 'initial'}
    )

    llm_model = widgets.Text(
        value='llama3.2:latest',
        description='LLM Model:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='300px')
    )

    gallery_path = widgets.Text(
        value='../../../datasets/images/text_ocr',
        description='Image Folder:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='400px')
    )

    search_input = widgets.Textarea(
        value='Enter text to search for in images...',
        description='Search Query:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='500px', height='60px')
    )

    confidence_slider = widgets.FloatSlider(
        value=0.60,
        min=0.1, max=0.9, step=0.05,
        description='OCR Confidence:',
        style={'description_width': 'initial'},
        readout_format='.2f'
    )

    # Quick search examples
    quick_searches = [
        "phone numbers", "email addresses", "addresses",
        "names", "dates", "prices"
    ]
    quick_btns = [widgets.Button(
        description=q,
        tooltip=f"Search for {q}",
        layout=widgets.Layout(width='150px', margin='2px')
    ) for q in quick_searches]

    # Action buttons
    test_btn = widgets.Button(description='🔍 Test Setup', button_style='info', layout=widgets.Layout(width='120px'))
    process_btn = widgets.Button(description='📁 Process Images', button_style='primary', layout=widgets.Layout(width='140px'))
    search_btn = widgets.Button(description='🔍 Search Text', button_style='success', layout=widgets.Layout(width='120px'))
    show_all_btn = widgets.Button(description='📝 Show All Texts', button_style='warning', layout=widgets.Layout(width='140px'))
    export_btn = widgets.Button(description='📋 Export Results', button_style='success', layout=widgets.Layout(width='140px'), disabled=True)
    clear_btn = widgets.Button(description='🗑️ Clear', layout=widgets.Layout(width='120px'))

    status_label = widgets.HTML(value="<b>Status:</b> Ready")
    progress_bar = widgets.IntProgress(value=0, min=0, max=100, layout=widgets.Layout(width='400px'))
    output_area = widgets.Output()

    # State variables
    current_system = None
    current_results = None

    def update_status(msg, progress=None):
        """Show ``msg`` in the status label; also set the progress bar when a value is given."""
        status_label.value = "<b>Status:</b> " + f"{msg}"
        if progress is None:
            return
        progress_bar.value = progress

    def init_system():
        nonlocal current_system
        try:
            config = CONFIG.copy()
            config.update({
                'OCR_MODEL': ocr_selector.value,
                'LLM_PROVIDER': llm_provider.value,
                'LLM_MODEL': llm_model.value,
                'IMAGE_GALLERY_PATH': gallery_path.value,
                'OCR_CONFIDENCE_THRESHOLD': confidence_slider.value
            })
            current_system = GenericOCRSearchSystem(config)
            return True
        except Exception as e:
            print(f"❌ System initialization error: {e}")
            return False

    # Button event handlers
    def on_quick_search(search_text):
        """Return a click callback that puts ``search_text`` into the query box."""
        def fill_query(_button):
            search_input.value = search_text
        return fill_query

    for i, btn in enumerate(quick_btns):
        btn.on_click(on_quick_search(quick_searches[i]))

    def on_test(b):
        """Click handler: report whether the selected setup is usable.

        Checks, in order: the selected OCR package, the core Python
        dependencies, the Ollama server and model (only for the 'ollama'
        provider), the image folder, and finally a full system
        initialization. Results are printed into the output area and
        reflected in the status line.
        """
        with output_area:
            clear_output(wait=True)
            update_status("Testing setup...", 20)
            print("🔍 TESTING OCR TEXT SEARCH SETUP")
            print("="*50)

            # Test OCR
            print(f"Selected OCR: {ocr_selector.value}")
            ocr_info = OCR_MODELS.get(ocr_selector.value, {})
            print(f"Purpose: {ocr_info.get('strengths', 'General OCR')}")

            try:
                if ocr_selector.value == 'paddleocr':
                    import paddleocr
                    print("✅ PaddleOCR available")
                elif ocr_selector.value == 'easyocr':
                    import easyocr
                    print("✅ EasyOCR available")
                elif ocr_selector.value == 'tesseract':
                    import pytesseract
                    print("✅ Tesseract available")
                elif ocr_selector.value == 'trocr':
                    import transformers
                    print("✅ TrOCR available")
            except ImportError as e:
                print(f"❌ {ocr_selector.value} not found: {e}")

            # Test dependencies
            deps = [('langchain', 'LangChain'), ('chromadb', 'ChromaDB'), ('sentence_transformers', 'Transformers')]
            for dep, name in deps:
                try:
                    __import__(dep.replace('-', '_'))
                    print(f"✅ {name}")
                except ImportError:
                    print(f"❌ {name} not found")

            # Test Ollama if selected
            if llm_provider.value == 'ollama':
                update_status("Testing Ollama...", 60)
                try:
                    import requests
                    # Probe the same server the system will use, not a fixed URL.
                    r = requests.get(f"{CONFIG['LLM_BASE_URL']}/api/tags", timeout=5)
                    if r.status_code == 200:
                        models = [m['name'] for m in r.json().get('models', [])]
                        print(f"✅ Ollama models: {models}")
                        if llm_model.value in models:
                            print(f"✅ Model '{llm_model.value}' ready")
                        else:
                            print(f"⚠️ Model '{llm_model.value}' not found. Install: ollama pull {llm_model.value}")
                    else:
                        print("❌ Ollama not responding")
                except Exception:
                    # Exception rather than a bare except so an interrupt still works.
                    print("❌ Ollama connection failed. Start: ollama serve")

            # Test image folder
            if Path(gallery_path.value).exists():
                img_count = len([f for f in Path(gallery_path.value).glob('**/*')
                               if f.suffix.lower() in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']])
                print(f"✅ Image folder: {img_count} images found")
            else:
                print(f"⚠️ Image folder not found: {gallery_path.value}")

            update_status("Testing system initialization...", 80)
            if init_system():
                print("✅ System ready for text search")
                update_status("Setup complete!", 100)
            else:
                print("❌ System initialization failed")
                update_status("Setup failed", 0)

    def on_process(b):
        """Click handler: rebuild the system from the widgets and process the gallery."""
        with output_area:
            clear_output(wait=True)
            update_status("Processing images...", 20)
            system_ready = init_system()
            if not system_ready:
                update_status("Initialization failed", 0)
                return

            print(f"🔍 PROCESSING IMAGES | OCR: {ocr_selector.value} | LLM: {llm_provider.value}-{llm_model.value}")
            try:
                current_system.process_images()
                update_status("Processing complete!", 100)

                # Show summary
                db = current_system.conn
                (total_texts,) = db.execute("SELECT COUNT(*) FROM image_texts WHERE extracted_text != ''").fetchone()
                (total_images,) = db.execute("SELECT COUNT(*) FROM image_texts").fetchone()
                print(f"\n📊 SUMMARY: {total_texts}/{total_images} images contain extractable text")

            except Exception as err:
                update_status("Processing error", 0)
                print(f"❌ Error: {err}")

    def on_search(b):
        nonlocal current_results
        with output_area:
            clear_output(wait=True)
            if not current_system and not init_system():
                print("❌ System not ready")
                return

            query = search_input.value.strip()
            if not query or query == "Enter text to search for in images...":
                print("⚠️ Please enter a search query")
                return

            update_status("Searching...", 50)
            print(f"🔍 SEARCHING: '{query}' | OCR: {ocr_selector.value}")

            try:
                current_results = current_system.search_text(query)
                matches = current_results.get('matches', [])

                if matches:
                    update_status(f"Found {len(matches)} matches", 100)
                    export_btn.disabled = False

                    print(f"📊 {current_results.get('summary', 'No summary')}")
                    print("\n🔍 SEARCH RESULTS:")
                    print("-" * 60)

                    for i, match in enumerate(matches[:10], 1):  # Show top 10
                        print(f"{i}. {match['filename']}")
                        print(f"   Confidence: {match['ocr_confidence']:.3f} | Relevance: {match['relevance_score']:.3f}")
                        print(f"   Text: {match['extracted_text'][:150]}...")
                        if match.get('matching_chunks'):
                            print(f"   Matching: {match['matching_chunks'][0][:100]}...")
                        print()

                    # Display matching images
                    if matches:
                        print(f"🖼️ Displaying {min(6, len(matches))} matching images...")
                        display_search_results([m['filename'] for m in matches[:6]], current_system)

                else:
                    update_status("No matches found", 100)
                    export_btn.disabled = True
                    print("❌ No matches found. Try different search terms or check if images are processed.")

            except Exception as e:
                update_status("Search error", 0)
                print(f"❌ Search error: {e}")

    def on_show_all(b):
        """Button handler: list stored OCR texts at or above the slider's confidence.

        Initializes the system on demand, fetches rows from
        ``current_system.get_all_texts`` and prints at most ``max_listed``
        of them (filename, confidence, text preview, and up to eight keywords
        decoded from the analysis JSON) into the output widget.

        Args:
            b: The widget passed in by ``on_click``; not used.
        """
        max_listed = 20  # cap on printed entries so the output widget stays readable

        with output_area:
            clear_output(wait=True)
            if not current_system and not init_system():
                print("❌ System not ready")
                return

            update_status("Loading all texts...", 50)
            try:
                # Read the slider once so the query and the messages agree.
                threshold = confidence_slider.value
                all_texts = current_system.get_all_texts(threshold)
                if all_texts:
                    print(f"📝 ALL EXTRACTED TEXTS (Confidence ≥ {threshold})")
                    print("=" * 70)

                    for i, (filename, _path, text, conf, analysis) in enumerate(all_texts, 1):
                        print(f"{i}. {filename} | Confidence: {conf:.3f}")
                        # A missing text must not abort the whole listing.
                        print(f"   Text: {(text or '')[:200]}...")

                        if analysis:
                            try:
                                data = json.loads(analysis)
                                keywords = data.get('searchable_keywords', [])
                                if keywords:
                                    print(f"   Keywords: {', '.join(keywords[:8])}")
                            except (ValueError, TypeError, AttributeError):
                                # Keywords are best-effort: skip malformed or
                                # unexpectedly shaped analysis JSON.
                                pass
                        print()

                        if i >= max_listed:  # Limit display
                            remaining = len(all_texts) - max_listed
                            # Only mention a remainder when there really is one.
                            if remaining > 0:
                                print(f"... and {remaining} more images")
                            break

                    update_status(f"Showing {len(all_texts)} texts", 100)
                else:
                    print(f"📝 No texts found with confidence ≥ {threshold}")
                    print("Try lowering the confidence threshold or process more images.")
                    update_status("No texts found", 100)

            except Exception as e:
                update_status("Error loading texts", 0)
                print(f"❌ Error: {e}")

    def on_export(b):
        """Button handler: export the latest search results.

        Hands ``current_results`` to ``current_system.export_results`` (labelled
        with the search box content, or "search" when it is empty) and prints
        the value that call returns. Any failure is reported in the output
        widget instead of being raised.

        Args:
            b: The widget passed in by ``on_click``; not used.
        """
        with output_area:
            if current_results:
                try:
                    update_status("Exporting results...", 50)
                    exported_to = current_system.export_results(
                        current_results,
                        search_input.value or "search",
                    )
                    update_status("Export complete!", 100)
                    print(f"✅ Search results exported to: {exported_to}")
                except Exception as err:
                    update_status("Export error", 0)
                    print(f"❌ Export error: {err}")
            else:
                # Nothing has been searched yet (or the results were cleared).
                print("⚠️ No search results to export")

    def on_clear(b):
        nonlocal current_results
        current_results = None
        export_btn.disabled = True
        with output_area:
            clear_output()
            update_status("Cleared", 0)
            print("🗑️ Output cleared")

    # Connect button events
    test_btn.on_click(on_test)
    process_btn.on_click(on_process)
    search_btn.on_click(on_search)
    show_all_btn.on_click(on_show_all)
    export_btn.on_click(on_export)
    clear_btn.on_click(on_clear)

    # Layout components
    quick_grid = widgets.GridBox(
        quick_btns,
        layout=widgets.Layout(grid_template_columns='repeat(3, 1fr)', grid_gap='5px')
    )

    controls = widgets.VBox([
        widgets.HTML("<h3>🔍 Generic OCR Image Text Search System</h3>"),
        widgets.HTML("<p><i>Extract and search text from any images in your folder</i></p>"),

        widgets.HTML("<b>🔧 Configuration:</b>"),
        ocr_selector,
        widgets.HBox([llm_provider, llm_model]),
        gallery_path,
        widgets.HBox([confidence_slider]),

        widgets.HTML("<br><b>🔍 Search:</b>"),
        search_input,
        widgets.HTML("<b>Quick Search Examples:</b>"),
        quick_grid,

        widgets.HTML("<br><b>🎛️ Actions:</b>"),
        widgets.HBox([test_btn, process_btn, search_btn]),
        widgets.HBox([show_all_btn, export_btn, clear_btn]),

        status_label,
        progress_bar,
        widgets.HTML("<hr>")
    ])

    return widgets.VBox([controls, output_area])

def display_search_results(filenames: List[str], system: GenericOCRSearchSystem, max_images=6):
    """Display search result images with their extracted text.

    For each of the first ``max_images`` filenames the matching row is read
    from the ``image_texts`` table through ``system.conn``. The image is drawn
    with a title colored by OCR confidence, a preview of the extracted text,
    and up to four keywords decoded from the stored analysis JSON. Files that
    cannot be found or loaded get a placeholder panel instead.

    Args:
        filenames: Filenames to look up, in display order.
        system: Search system whose ``conn`` is queried for image records.
        max_images: Maximum number of panels to draw; nothing is drawn when
            this is zero or negative.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    def _placeholder(ax, message):
        # Blank panel carrying a centered message, used for every failure case.
        ax.text(0.5, 0.5, message, ha='center', va='center', transform=ax.transAxes)
        ax.axis('off')

    if not filenames:
        return

    num = min(len(filenames), max_images)
    if num <= 0:
        # plt.subplots rejects a zero-row grid, so bail out early.
        return

    cols = 3 if num > 4 else 2 if num > 1 else 1
    rows = (num + cols - 1) // cols

    # squeeze=False always yields a 2-D array of Axes, so a single flatten()
    # covers the 1x1, single-row and multi-row layouts alike.
    fig, axes = plt.subplots(rows, cols, figsize=(15, 5*rows), squeeze=False)
    axes = axes.flatten()

    for ax, filename in zip(axes, filenames[:num]):
        try:
            # Get image info from database
            result = system.conn.execute('''SELECT full_path, extracted_text, ocr_confidence, text_analysis
                FROM image_texts WHERE filename = ?''', (filename,)).fetchone()

            if not result:
                _placeholder(ax, f"Not found\n{filename}")
                continue

            full_path, text, confidence, analysis = result

            try:
                ax.imshow(plt.imread(full_path))
            except Exception:
                # Missing, unreadable or unsupported file: show a placeholder.
                _placeholder(ax, f"Image load error\n{filename}")
                continue

            # A missing confidence is treated as 0 so the panel still renders.
            if confidence is None:
                confidence = 0.0

            # Title with confidence
            conf_color = "green" if confidence > 0.8 else "orange" if confidence > 0.6 else "red"
            ax.set_title(f"{filename}\nConfidence: {confidence:.2f}", fontsize=10, color=conf_color, fontweight='bold')

            # Text overlay
            text_preview = text[:100] if text else "No text detected"
            ax.text(0.02, 0.98, f"Text: {text_preview}...", transform=ax.transAxes, fontsize=8,
                    verticalalignment='top', bbox=dict(boxstyle="round,pad=0.3", facecolor="lightblue", alpha=0.8))

            # Show keywords if available
            if analysis:
                try:
                    data = json.loads(analysis)
                    keywords = data.get('searchable_keywords', [])
                    if keywords:
                        ax.text(0.02, 0.02, f"Keywords: {', '.join(keywords[:4])}", transform=ax.transAxes, fontsize=7,
                                verticalalignment='bottom', bbox=dict(boxstyle="round,pad=0.2", facecolor="lightyellow", alpha=0.8))
                except (ValueError, TypeError, AttributeError):
                    # Keywords are a best-effort decoration; skip malformed
                    # or unexpectedly shaped analysis JSON.
                    pass

            ax.axis('off')

        except Exception as e:
            _placeholder(ax, f"Error\n{filename}\n{str(e)[:30]}")

    # Hide unused subplots
    for spare in axes[num:]:
        spare.axis('off')

    fig.tight_layout()
    fig.suptitle('🔍 Search Results', fontsize=14, y=1.02)
    plt.show()

# Quick utility functions
def quick_setup():
    """Quick dependency and setup check.

    Prints one line per required Python package (importable or not, with the
    correct ``pip install`` name when it is missing), then probes the local
    Ollama server at ``http://localhost:11434`` and reports whether the
    recommended ``llama3.2:latest`` model is listed.

    Returns:
        None. All results are printed.
    """
    import importlib

    print("🔍 QUICK SETUP CHECK FOR OCR TEXT SEARCH")
    print("=" * 50)

    # Check Python packages.
    # Maps import name -> pip distribution name; the two differ for some
    # packages (PIL is shipped by "pillow", for example).
    deps = {
        'paddleocr': 'paddleocr',
        'langchain': 'langchain',
        'chromadb': 'chromadb',
        'sentence_transformers': 'sentence-transformers',
        'PIL': 'pillow',
        'numpy': 'numpy',
    }
    for module_name, pip_name in deps.items():
        try:
            importlib.import_module(module_name)
            print(f"✅ {module_name}")
        except ImportError:
            print(f"❌ {module_name} - Install: pip install {pip_name}")

    # Check Ollama
    try:
        import requests
    except ImportError:
        # Without requests the server cannot be probed at all; say so
        # instead of blaming Ollama.
        print("❌ requests - Install: pip install requests")
        return

    try:
        r = requests.get("http://localhost:11434/api/tags", timeout=5)
        if r.status_code == 200:
            models = [m['name'] for m in r.json().get('models', [])]
            print(f"✅ Ollama: {models}")
            if 'llama3.2:latest' in models:
                print("✅ Recommended model ready")
            else:
                print("⚠️ Install recommended model: ollama pull llama3.2:latest")
        else:
            print("❌ Ollama not responding")
    except Exception:
        # Connection refused, timeout, or an unexpected response payload.
        print("❌ Ollama not available - Start: ollama serve")

def quick_search(query: str, image_folder: str = '../../../datasets/images/text_ocr'):
    """Quick text search in images.

    Builds a ``GenericOCRSearchSystem`` from a copy of ``CONFIG`` pointed at
    ``image_folder``, processes the images first when no row in
    ``image_texts`` is marked ``vector_stored``, runs the search, prints up to
    five matches and exports the results when there are any.

    Args:
        query: Text to search for.
        image_folder: Folder used as ``IMAGE_GALLERY_PATH`` for this run.

    Returns:
        The dict returned by ``search_text``, or ``None`` if any step failed
        (the error is printed rather than raised).
    """
    print(f"🔍 Quick Search: '{query}' in {image_folder}")
    print("🔒 100% Local Processing")

    system = None
    try:
        config = CONFIG.copy()
        config['IMAGE_GALLERY_PATH'] = image_folder
        system = GenericOCRSearchSystem(config)

        # Check if images are processed
        count = system.conn.execute("SELECT COUNT(*) FROM image_texts WHERE vector_stored = 1").fetchone()[0]
        if count == 0:
            print("📁 Processing images first...")
            system.process_images()

        # Search
        results = system.search_text(query)
        matches = results.get('matches', [])

        if matches:
            print(f"\n✅ Found {len(matches)} matches:")
            for i, match in enumerate(matches[:5], 1):
                print(f"{i}. {match['filename']} (Confidence: {match['ocr_confidence']:.2f})")
                print(f"   Text: {match['extracted_text'][:150]}...")
                print()

            # Export results
            path = system.export_results(results, query)
            print(f"📋 Results exported to: {path}")
        else:
            print("❌ No matches found")

        return results

    except Exception as e:
        print(f"❌ Error: {e}")
        return None

    finally:
        # Always release the system, including when processing, searching or
        # exporting failed part-way through.
        if system is not None:
            try:
                system.close()
            except Exception as close_err:
                # A cleanup failure must not mask the search outcome.
                print(f"⚠️ Cleanup warning: {close_err}")

# Build the widget UI, show it in the notebook, then print the ready banner
# and a short usage guide.
print(
    "🔍 Creating Generic OCR Text Search Interface...",
    "🔧 Features: Multi-OCR support, semantic search, text extraction",
    sep="\n",
)

search_interface = create_search_gui()
display(search_interface)

# Ready banner: privacy note, components, setup hints and quick helpers.
print(
    "\n🔍 GENERIC OCR TEXT SEARCH SYSTEM READY!",
    "=" * 60,
    "🔒 PRIVACY: 100% local processing, no external API calls",
    "🤖 COMPONENTS: Multi-OCR + Ollama/Transformers + ChromaDB + SQLite",
    "📝 OCR OPTIONS: PaddleOCR, EasyOCR, Tesseract, TrOCR",
    "🚀 SETUP: ollama serve → ollama pull llama3.2:latest",
    "💡 QUICK: quick_setup() | quick_search('phone number', './my_images')",
    "=" * 60,
    sep="\n",
)

# Step-by-step usage guide.
print(
    "\n🔍 USAGE GUIDE:",
    "1. Select OCR engine based on your image types",
    "2. Set your image folder path",
    "3. Click 'Process Images' to extract all text",
    "4. Enter search query and click 'Search Text'",
    "5. Export results with matching images",
    "=" * 60,
    sep="\n",
)