In [None]:
# Notebook sanity check: list the OCR image gallery directory. As the last
# expression of the cell, the returned list of entry names is displayed.
# Raises FileNotFoundError if the relative path does not exist.
import os
os.listdir('../../datasets/images/text_ocr')


In [None]:
# Pure Transformer OCR Image Text Search System
import os, json, sqlite3, shutil, warnings, time
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Optional
import numpy as np
from PIL import Image
import torch
warnings.filterwarnings("ignore")

# Disable analytics to prevent connection errors
os.environ['DISABLE_TELEMETRY'] = '1'
os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
os.environ['HUB_ANALYTICS'] = 'False'

# Configuration for Pure Transformer OCR
# Default settings; PureTransformerOCRSystem falls back to a copy of this dict
# when no config is passed to its constructor.
CONFIG = {
    # Key into TRANSFORMER_OCR_MODELS selecting which OCR model is loaded.
    'OCR_MODEL': 'microsoft_trocr',
    # NOTE(review): not read by any code visible in this section.
    'OCR_CONFIDENCE_THRESHOLD': 0.6,
    # Model name handed to SentenceTransformer for text embeddings.
    'EMBEDDING_MODEL': 'sentence-transformers/all-MiniLM-L6-v2',
    # Directory passed to ChromaDB as its persist_directory.
    'VECTOR_STORE_PATH': './image_vectorstore',
    # NOTE(review): CHUNK_SIZE / CHUNK_OVERLAP are not read by any code visible
    # in this section (each image's text is embedded as one document).
    'CHUNK_SIZE': 500,
    'CHUNK_OVERLAP': 50,
    # Directory scanned recursively for images by process_images().
    'IMAGE_GALLERY_PATH': '../../datasets/images/text_ocr',
    # Directory that holds the SQLite file image_text.db.
    'DATABASE_PATH': './image_analysis_db',
    # Parent directory for the per-search export folders.
    'RESULTS_OUTPUT_PATH': '../../datasets/images/forensic_results',
    # NOTE(review): not read by any code visible in this section; images are
    # processed one at a time.
    'BATCH_SIZE': 4,
    # NOTE(review): not read by any code visible in this section.
    'MAX_RESULTS_DISPLAY': 10,
    # Torch device string, decided once when this cell runs.
    'DEVICE': 'cuda' if torch.cuda.is_available() else 'cpu'
}

# Pure Transformer OCR model options (no BLIP, no Ollama dependency)
# Registry keyed by the value stored in CONFIG['OCR_MODEL']. Per entry:
#   name      - human-readable label used in log output and reports
#   model_id  - identifier passed to the Hugging Face from_pretrained() loaders
#   desc      - short description
#   strengths - text appended to the label in the GUI model dropdown
#   type      - loader family that _init_ocr() dispatches on
# NOTE(review): _init_ocr() also handles type 'multimodal_vlm' (Molmo), but no
# entry below uses that type, so that loader is currently never selected.
TRANSFORMER_OCR_MODELS = {
    'microsoft_trocr': {
        'name': 'Microsoft TrOCR Base (Printed)',
        'model_id': 'microsoft/trocr-base-printed',
        'desc': 'End-to-end transformer for printed text',
        'strengths': 'Clean printed documents, forms, receipts',
        'type': 'vision_encoder_decoder'
    },
    'microsoft_trocr_handwritten': {
        'name': 'Microsoft TrOCR Base (Handwritten)',
        'model_id': 'microsoft/trocr-base-handwritten',
        'desc': 'Specialized for handwritten text recognition',
        'strengths': 'Handwritten notes, signatures, cursive text',
        'type': 'vision_encoder_decoder'
    },
    'microsoft_trocr_large': {
        'name': 'Microsoft TrOCR Large (Printed)',
        'model_id': 'microsoft/trocr-large-printed',
        'desc': 'Large model for high-accuracy printed text',
        'strengths': 'Complex documents, high accuracy needs',
        'type': 'vision_encoder_decoder'
    },
    # Placeholder entry: for type 'multimodal_llm', _init_ocr() loads no model
    # and extract_text() reports an error for every image.
    # NOTE(review): the model_id looks like a text-only OLMo checkpoint rather
    # than an OCR model - confirm the intended identifier before implementing.
    'allenai_olmo_ocr': {
        'name': 'AllenAI OLMo OCR 7B',
        'model_id': 'allenai/OLMo-7B-0724-hf',
        'desc': 'Large multimodal model with OCR capabilities',
        'strengths': 'Complex documents, reasoning about text content',
        'type': 'multimodal_llm',
        'note': 'Requires special implementation - placeholder for now'
    }
}

# Startup banner showing the device chosen in CONFIG.
print(f"🔍 PURE TRANSFORMER OCR SEARCH | Device: {CONFIG['DEVICE']} | 🔒 100% Local Processing")

# Dependency check for Pure Transformer setup
def check_pure_transformer_deps():
    """Print an availability report for the packages this notebook relies on.

    Each package is imported by name; a successful import prints a check mark
    line, an ImportError prints the matching ``pip install`` hint. Nothing is
    returned, and a missing package never raises.
    """
    required = (
        ('transformers', 'Hugging Face Transformers'),
        ('torch', 'PyTorch'),
        ('torchvision', 'TorchVision'),
        ('chromadb', 'ChromaDB'),
        ('sentence_transformers', 'Sentence Transformers'),
        ('accelerate', 'Hugging Face Accelerate'),
    )

    print("🔧 Checking Pure Transformer OCR Dependencies...")
    for package, label in required:
        # Distribution names may contain '-', importable module names use '_'.
        module_name = package.replace('-', '_')
        try:
            __import__(module_name)
        except ImportError:
            print(f"❌ {label} - Install: pip install {package}")
        else:
            print(f"✅ {label}")

def check_gpu_availability():
    """Print the GPU status and return True when CUDA can be used.

    When CUDA is available, the name and total memory (in GiB) of device 0
    are printed; otherwise a CPU-fallback warning is printed.
    """
    if not torch.cuda.is_available():
        print("⚠️ GPU not available, using CPU (slower)")
        return False

    gpu_name = torch.cuda.get_device_name(0)
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    gpu_memory = total_bytes / 1024**3
    print(f"✅ GPU: {gpu_name} ({gpu_memory:.1f}GB)")
    return True

# Run both environment checks once when this cell executes; each prints its
# own report.
check_pure_transformer_deps()
# True when CUDA is usable. NOTE(review): not read by any code visible in this
# section (CONFIG['DEVICE'] is computed independently).
gpu_available = check_gpu_availability()

# Pure Transformer OCR System (No Ollama, No BLIP)
class PureTransformerOCRSystem:
    def __init__(self, config=None):
        self.config = config or CONFIG.copy()
        self.device = self.config['DEVICE']
        self._setup_dirs()
        self._init_db()
        self._init_ocr()
        self._init_embeddings()

    def _setup_dirs(self):
        for k in ['IMAGE_GALLERY_PATH', 'DATABASE_PATH', 'RESULTS_OUTPUT_PATH', 'VECTOR_STORE_PATH']:
            os.makedirs(self.config[k], exist_ok=True)

    def _init_db(self):
        db_path = os.path.join(self.config['DATABASE_PATH'], 'image_text.db')
        self.conn = sqlite3.connect(db_path)
        self.conn.execute('''CREATE TABLE IF NOT EXISTS image_texts (
            id INTEGER PRIMARY KEY,
            filename TEXT UNIQUE,
            full_path TEXT,
            extracted_text TEXT,
            ocr_confidence REAL,
            ocr_model TEXT,
            processed_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            text_keywords TEXT,
            text_length INTEGER,
            relevance_score REAL,
            vector_stored BOOLEAN DEFAULT 0,
            processing_time REAL
        )''')
        self.conn.execute('''CREATE TABLE IF NOT EXISTS text_searches (
            id INTEGER PRIMARY KEY,
            query TEXT,
            search_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            results_json TEXT,
            matched_images TEXT,
            model_used TEXT,
            processing_time REAL
        )''')
        self.conn.commit()

    def _init_ocr(self):
        """Initialize selected Transformer OCR model"""
        try:
            ocr_model = self.config['OCR_MODEL']
            model_info = TRANSFORMER_OCR_MODELS[ocr_model]
            model_id = model_info['model_id']
            model_type = model_info['type']
            
            print(f"🔄 Loading {model_info['name']}...")
            
            if model_type == 'vision_encoder_decoder':
                from transformers import TrOCRProcessor, VisionEncoderDecoderModel
                self.ocr_processor = TrOCRProcessor.from_pretrained(model_id)
                self.ocr_model = VisionEncoderDecoderModel.from_pretrained(model_id)
                self.ocr_model.to(self.device)
                self.ocr_type = 'trocr'
                
            elif model_type == 'multimodal_llm':
                # For OLMo OCR - placeholder implementation
                print("⚠️ OLMo OCR requires custom implementation")
                self.ocr_type = 'olmo'
                self.ocr_processor = None
                self.ocr_model = None
                return

            elif model_type == 'multimodal_vlm':
                # AllenAI Molmo 7B
                from transformers import AutoModelForCausalLM, AutoProcessor
                self.ocr_processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
                self.ocr_model = AutoModelForCausalLM.from_pretrained(
                    model_id, trust_remote_code=True,
                    torch_dtype=torch.float16 if self.device == 'cuda' else torch.float32,
                    device_map='auto' if self.device == 'cuda' else 'cpu'
                )
                self.ocr_type = 'molmo'

            elif model_type == 'multimodal_vlm':
                # AllenAI Molmo 7B - requires specific setup
                print("🤖 Loading AllenAI Molmo 7B (large download ~13GB)...")
                try:
                    from transformers import AutoModelForCausalLM, AutoProcessor
                    import torch

                    # Check transformers version
                    import transformers
                    version = transformers.__version__
                    print(f"Transformers version: {version}")

                    self.ocr_processor = AutoProcessor.from_pretrained(
                        model_id,
                        trust_remote_code=True,
                        torch_dtype='auto'
                    )

                    self.ocr_model = AutoModelForCausalLM.from_pretrained(
                        model_id,
                        trust_remote_code=True,
                        torch_dtype=torch.bfloat16 if self.device == 'cuda' else torch.float32,
                        device_map='auto' if self.device == 'cuda' else None
                    )

                    if self.device == 'cpu':
                        self.ocr_model = self.ocr_model.to('cpu')

                    self.ocr_type = 'molmo'
                    print("✅ Molmo 7B loaded successfully")

                except ImportError as e:
                    print(f"❌ Missing dependency: {e}")
                    print("💡 Install: pip install transformers>=4.44.0 einops")
                    self.ocr_processor = None
                    self.ocr_model = None
                    self.ocr_type = None
                    return
                except Exception as e:
                    print(f"❌ Molmo 7B load failed: {e}")
                    print("💡 Try: pip install --upgrade transformers accelerate")
                    self.ocr_processor = None
                    self.ocr_model = None
                    self.ocr_type = None
                    return

                
            print(f"✅ {model_info['name']} loaded on {self.device}")
            
        except Exception as e:
            print(f"❌ OCR initialization error: {e}")
            self.ocr_processor = None
            self.ocr_model = None
            self.ocr_type = None

    def _init_embeddings(self):
        """Initialize embeddings and vector store (no LLM needed)"""
        try:
            from sentence_transformers import SentenceTransformer
            from chromadb import Client
            from chromadb.config import Settings
            import chromadb.utils.embedding_functions as embedding_functions
            
            # Initialize sentence transformer for embeddings
            self.embedding_model = SentenceTransformer(self.config['EMBEDDING_MODEL'])
            
            # Initialize ChromaDB
            self.chroma_client = Client(Settings(
                persist_directory=self.config['VECTOR_STORE_PATH'],
                anonymized_telemetry=False
            ))
            
            # Create or get collection
            self.collection_name = "ocr_texts"
            try:
                self.collection = self.chroma_client.get_collection(name=self.collection_name)
            except:
                self.collection = self.chroma_client.create_collection(
                    name=self.collection_name,
                    metadata={"hnsw:space": "cosine"}
                )
            
            print("✅ Pure embedding system ready (no LLM required)")
            
        except Exception as e:
            print(f"❌ Embedding system error: {e}")
            self.embedding_model = None
            self.collection = None

    def extract_text(self, img_path: str) -> Dict[str, Any]:
        """Extract text using pure Transformer OCR models"""
        start_time = time.time()
        
        try:
            img = Image.open(img_path).convert("RGB")
            result = {
                'filename': os.path.basename(img_path),
                'full_path': img_path,
                'extracted_text': '',
                'ocr_confidence': 0.0,
                'ocr_model': self.config['OCR_MODEL'],
                'processing_time': 0.0,
                'error': None
            }
            if not self.ocr_processor or not self.ocr_model:
                result['error'] = "OCR model not initialized"
                return result

            if self.ocr_type == 'trocr':
                # Microsoft TrOCR processing
                pixel_values = self.ocr_processor(images=img, return_tensors="pt").pixel_values
                pixel_values = pixel_values.to(self.device)
                
                with torch.no_grad():
                    generated_ids = self.ocr_model.generate(
                        pixel_values, 
                        max_length=512,
                        num_beams=4,
                        early_stopping=True
                    )
                
                text = self.ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
                
                if text.strip():
                    result.update({
                        'extracted_text': text.strip(),
                        'ocr_confidence': 0.95  # TrOCR doesn't provide confidence, use high default
                    })
                    
            elif self.ocr_type == 'olmo':
                # OLMo OCR placeholder
                result.update({
                    'extracted_text': '',
                    'ocr_confidence': 0.0,
                    'error': 'OLMo OCR not implemented yet'
                })

            elif self.ocr_type == 'molmo':
                # AllenAI Molmo 7B processing
                ocr_prompt = "Extract all text from this image. Provide only the text content."
                inputs = self.ocr_processor.process(images=[img], text=ocr_prompt)
                inputs = {k: v.to(self.ocr_model.device) for k, v in inputs.items()}

                with torch.no_grad():
                    output = self.ocr_model.generate_from_batch(
                        inputs, generation_config={"max_new_tokens": 200}
                    )

                generated_tokens = output[0, inputs['input_ids'].size(1):]
                text = self.ocr_processor.decode(generated_tokens, skip_special_tokens=True)

                if text.strip():
                    result.update({
                        'extracted_text': text.strip(),
                        'ocr_confidence': 0.90
                    })

            
        except Exception as e:
            return {
                'filename': os.path.basename(img_path),
                'full_path': img_path,
                'extracted_text': '',
                'ocr_confidence': 0.0,
                'ocr_model': self.config['OCR_MODEL'],
                'processing_time': time.time() - start_time,
                'error': str(e)
            }

    def analyze_text_simple(self, text: str, filename: str) -> Dict[str, Any]:
        """Simple text analysis without LLM dependency"""
        words = text.lower().split()
        
        # Extract basic features
        keywords = []
        entities = {'numbers': [], 'emails': [], 'urls': []}
        
        for word in words:
            # Simple keyword extraction (words > 3 chars, not common words)
            if len(word) > 3 and word not in ['the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had', 'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his', 'how', 'may', 'new', 'now', 'old', 'see', 'two', 'who', 'its', 'said', 'each', 'make', 'most', 'over', 'said', 'some', 'time', 'very', 'when', 'with', 'have', 'from', 'they', 'know', 'want', 'been', 'good', 'much', 'some', 'time', 'very', 'when', 'come', 'could', 'state', 'there', 'think', 'where', 'being', 'every', 'great', 'might', 'shall', 'still', 'those', 'under', 'while']:
                keywords.append(word)
                
        # Simple entity detection
        import re
        numbers = re.findall(r'\b\d+\.?\d*\b', text)
        emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
        urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
        
        return {
            'text_length': len(text),
            'word_count': len(words),
            'keywords': keywords[:10],  # Top 10 keywords
            'entities': {
                'numbers': numbers[:5],
                'emails': emails[:3],
                'urls': urls[:3]
            },
            'has_numbers': len(numbers) > 0,
            'has_emails': len(emails) > 0,
            'has_urls': len(urls) > 0,
            'readability_score': min(len(words) / 50.0, 1.0),
            'relevance_score': min(len(keywords) / 20.0, 1.0)
        }

    def add_to_vectorstore(self, text: str, metadata: Dict[str, Any]):
        """Add text to vector store using pure embeddings"""
        try:
            if not self.embedding_model or not self.collection:
                return False
                
            # Generate embedding
            embedding = self.embedding_model.encode([text])[0].tolist()
            
            # Create unique ID
            doc_id = f"{metadata['filename']}_{int(time.time() * 1000)}"
            
            # Add to ChromaDB
            self.collection.add(
                embeddings=[embedding],
                documents=[text],
                metadatas=[metadata],
                ids=[doc_id]
            )
            
            return True
            
        except Exception as e:
            print(f"⚠️ Vector store error: {e}")
            return False

    def process_images(self):
        """Process all images using pure Transformer OCR"""
        start_time = time.time()
        gallery_path = Path(self.config['IMAGE_GALLERY_PATH'])
        
        print(f"🔍 PROCESSING: {gallery_path}")
        print(f"🤖 OCR Model: {TRANSFORMER_OCR_MODELS[self.config['OCR_MODEL']]['name']}")
        print(f"⚡ Device: {self.device}")
        print(f"📊 Pure Transformer processing (no LLM required)")

        if not gallery_path.exists():
            print(f"❌ Path not found: {gallery_path}")
            return

        # Find all image files
        exts = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
        files = [f for ext in exts
                for f in list(gallery_path.glob(f"**/*{ext}")) +
                         list(gallery_path.glob(f"**/*{ext.upper()}"))]
        
        total = len(files)
        print(f"📸 Found {total} images")
        
        if total == 0:
            print("⚠️ No images found!")
            return

        processed, vectorized = 0, 0
        total_ocr_time = 0

        for i, img_path in enumerate(files, 1):
            filename = img_path.name
            print(f"[{i:3d}/{total}] ({i/total*100:5.1f}%) Processing {filename[:40]}...", end='')

            # Skip if already processed with same model
            existing = self.conn.execute(
                "SELECT id FROM image_texts WHERE filename = ? AND ocr_model = ?",
                (filename, self.config['OCR_MODEL'])
            ).fetchone()
            
            if existing:
                print(" ⏭️ SKIP")
                continue

            # Extract text using Transformer OCR
            ocr_result = self.extract_text(img_path)
            
            if ocr_result.get('error'):
                print(f" ❌ ERROR: {ocr_result['error'][:30]}")
                continue

            text = ocr_result['extracted_text']
            processing_time = ocr_result['processing_time']
            total_ocr_time += processing_time

            analysis = None
            vector_stored = False

            if text.strip():
                # Simple analysis (no LLM needed)
                analysis = self.analyze_text_simple(text, filename)
                
                # Add to vector store
                metadata = {
                    'filename': filename,
                    'full_path': str(img_path),
                    'ocr_model': self.config['OCR_MODEL'],
                    'ocr_confidence': ocr_result['ocr_confidence']
                }
                
                if self.add_to_vectorstore(text, metadata):
                    vector_stored = True
                    vectorized += 1

            # Store in database
            relevance = analysis['relevance_score'] if analysis else 0.0
            keywords_json = json.dumps(analysis['keywords']) if analysis else None
            text_length = len(text)
            
            self.conn.execute('''INSERT OR REPLACE INTO image_texts
                (filename, full_path, extracted_text, ocr_confidence, ocr_model,
                 text_keywords, text_length, relevance_score, vector_stored, processing_time)
                VALUES (?,?,?,?,?,?,?,?,?,?)''',
                (filename, str(img_path), text, ocr_result['ocr_confidence'],
                 self.config['OCR_MODEL'], keywords_json, text_length,
                 relevance, vector_stored, processing_time))

            if text.strip():
                words = len(text.split())
                print(f" ✅ TEXT ({words} words, {processing_time:.2f}s)")
            else:
                print(f" ⚪ NO TEXT ({processing_time:.2f}s)")

            processed += 1
            
            # Commit every 10 images
            if i % 10 == 0:
                self.conn.commit()
                if self.device == 'cuda':
                    torch.cuda.empty_cache()

        self.conn.commit()
        total_time = time.time() - start_time
        avg_ocr_time = total_ocr_time / processed if processed > 0 else 0
        
        print(f"\n🔍 PROCESSING COMPLETE:")
        print(f"   📊 Processed: {processed}/{total} images")
        print(f"   💾 Vectorized: {vectorized} for search")
        print(f"   ⏱️ Total time: {total_time:.1f}s")
        print(f"   ⚡ Avg OCR time: {avg_ocr_time:.2f}s per image")
        print(f"   🤖 Model: {TRANSFORMER_OCR_MODELS[self.config['OCR_MODEL']]['name']}")

    def semantic_search(self, query: str, k: int = 10) -> Dict[str, Any]:
        """Pure embedding-based semantic search"""
        print(f"🔍 PURE SEMANTIC SEARCH: '{query}'")
        start_time = time.time()
        
        try:
            if not self.embedding_model or not self.collection:
                return {
                    'matches': [],
                    'summary': 'Embedding system not available',
                    'total_matches': 0,
                    'search_type': 'error'
                }

            # Generate query embedding
            query_embedding = self.embedding_model.encode([query])[0].tolist()
            
            # Search in ChromaDB
            results = self.collection.query(
                query_embeddings=[query_embedding],
                n_results=k,
                include=['documents', 'metadatas', 'distances']
            )
            
            if not results['documents'][0]:
                return {
                    'matches': [],
                    'summary': 'No semantic matches found',
                    'total_matches': 0,
                    'search_type': 'semantic'
                }

            # Process results
            matches = []
            documents = results['documents'][0]
            metadatas = results['metadatas'][0]
            distances = results['distances'][0]
            
            for i, (doc, metadata, distance) in enumerate(zip(documents, metadatas, distances)):
                filename = metadata.get('filename', f'unknown_{i}')
                similarity = 1.0 - distance  # Convert distance to similarity
                
                # Get full info from database
                db_result = self.conn.execute('''SELECT extracted_text, ocr_confidence, 
                    text_keywords, text_length, processing_time, full_path
                    FROM image_texts WHERE filename = ?''', (filename,)).fetchone()

                if db_result:
                    full_text, confidence, keywords_json, text_len, proc_time, full_path = db_result
                    keywords = json.loads(keywords_json) if keywords_json else []
                    
                    matches.append({
                        'filename': filename,
                        'full_path': full_path,
                        'extracted_text': full_text,
                        'ocr_confidence': confidence,
                        'ocr_model': self.config['OCR_MODEL'],
                        'processing_time': proc_time,
                        'similarity_score': similarity,
                        'matching_text': doc[:200],  # Preview of matching text
                        'keywords': keywords[:5],
                        'text_length': text_len
                    })

            # Sort by similarity score
            matches.sort(key=lambda x: x['similarity_score'], reverse=True)
            
            search_time = time.time() - start_time
            
            results = {
                'matches': matches,
                'summary': f"Found {len(matches)} semantically similar images",
                'total_matches': len(matches),
                'query': query,
                'search_type': 'pure_semantic',
                'search_time': search_time
            }

            # Store search in database
            self.conn.execute('''INSERT INTO text_searches
                (query, results_json, matched_images, model_used, processing_time)
                VALUES (?,?,?,?,?)''',
                (query, json.dumps(results),
                 json.dumps([m['filename'] for m in matches]),
                 f"pure_transformer_{self.config['OCR_MODEL']}",
                 search_time))
            self.conn.commit()

            print(f"✅ Found {len(matches)} matches in {search_time:.2f}s")
            return results

        except Exception as e:
            print(f"❌ Search error: {e}")
            return {
                'matches': [],
                'summary': f'Search error: {e}',
                'total_matches': 0,
                'search_type': 'error'
            }

    def keyword_search(self, query: str) -> Dict[str, Any]:
        """Simple keyword-based search in extracted text"""
        print(f"🔍 KEYWORD SEARCH: '{query}'")
        start_time = time.time()
        
        try:
            query_words = query.lower().split()
            
            # Search in database using SQL LIKE
            like_conditions = ' AND '.join(['LOWER(extracted_text) LIKE ?' for _ in query_words])
            like_params = [f'%{word}%' for word in query_words]
            
            sql = f'''SELECT filename, full_path, extracted_text, ocr_confidence,
                text_keywords, text_length, processing_time FROM image_texts
                WHERE {like_conditions} AND extracted_text != ""
                ORDER BY ocr_confidence DESC LIMIT 20'''
            
            results = self.conn.execute(sql, like_params).fetchall()
            
            matches = []
            for result in results:
                filename, full_path, text, confidence, keywords_json, text_len, proc_time = result
                keywords = json.loads(keywords_json) if keywords_json else []
                
                # Calculate simple relevance score
                text_lower = text.lower()
                relevance = sum(1 for word in query_words if word in text_lower) / len(query_words)
                
                matches.append({
                    'filename': filename,
                    'full_path': full_path,
                    'extracted_text': text,
                    'ocr_confidence': confidence,
                    'ocr_model': self.config['OCR_MODEL'],
                    'processing_time': proc_time,
                    'relevance_score': relevance,
                    'keywords': keywords[:5],
                    'text_length': text_len
                })
            
            search_time = time.time() - start_time
            
            search_results = {
                'matches': matches,
                'summary': f"Found {len(matches)} keyword matches",
                'total_matches': len(matches),
                'query': query,
                'search_type': 'keyword',
                'search_time': search_time
            }
            
            print(f"✅ Found {len(matches)} keyword matches in {search_time:.2f}s")
            return search_results
            
        except Exception as e:
            print(f"❌ Keyword search error: {e}")
            return {
                'matches': [],
                'summary': f'Keyword search error: {e}',
                'total_matches': 0,
                'search_type': 'error'
            }

    def get_model_performance_stats(self):
        """Get performance statistics for current OCR model"""
        stats = self.conn.execute('''SELECT 
            COUNT(*) as total_processed,
            AVG(ocr_confidence) as avg_confidence,
            AVG(processing_time) as avg_processing_time,
            COUNT(CASE WHEN extracted_text != '' THEN 1 END) as texts_found,
            AVG(text_length) as avg_text_length
            FROM image_texts WHERE ocr_model = ?''',
            (self.config['OCR_MODEL'],)).fetchone()
            
        return {
            'model_name': TRANSFORMER_OCR_MODELS[self.config['OCR_MODEL']]['name'],
            'total_processed': stats[0] or 0,
            'avg_confidence': stats[1] or 0,
            'avg_processing_time': stats[2] or 0,
            'texts_found': stats[3] or 0,
            'success_rate': (stats[3] or 0) / (stats[0] or 1) * 100,
            'avg_text_length': stats[4] or 0
        }

    def export_pure_results(self, results: Dict[str, Any], query: str) -> str:
        """Export search results with pure transformer info"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        query_safe = query.replace(' ', '_')[:30]
        output_path = Path(self.config['RESULTS_OUTPUT_PATH']) / f"pure_transformer_search_{query_safe}_{timestamp}"
        output_path.mkdir(parents=True, exist_ok=True)

        # Copy matching images
        copied = []
        for i, match in enumerate(results.get('matches', []), 1):
            if Path(match['full_path']).exists():
                dest = output_path / f"{i:03d}_{match['filename']}"
                shutil.copy2(match['full_path'], dest)
                copied.append(str(dest))

        # Get performance stats
        perf_stats = self.get_model_performance_stats()
        
        # Pure transformer report
        report = {
            'query': query,
            'timestamp': timestamp,
            'pure_transformer_ocr_model': self.config['OCR_MODEL'],
            'model_info': TRANSFORMER_OCR_MODELS[self.config['OCR_MODEL']],
            'device_used': self.device,
            'search_type': 'pure_transformer_no_llm',
            'performance_stats': perf_stats,
            'search_results': results,
            'copied_files': copied
        }

        with open(output_path / "pure_transformer_report.json", 'w') as f:
            json.dump(report, f, indent=2)

        # Summary report
        with open(output_path / "summary.txt", 'w') as f:
            f.write(f"PURE TRANSFORMER OCR SEARCH REPORT\n")
            f.write(f"=" * 50 + "\n")
            f.write(f"Query: {query}\n")
            f.write(f"Date: {timestamp}\n")
            f.write(f"OCR Model: {perf_stats['model_name']}\n")
            f.write(f"Device: {self.device}\n")
            f.write(f"Search Type: {results.get('search_type', 'unknown')}\n")
            f.write(f"Processing: Pure Transformer (No LLM/Ollama)\n")
            f.write(f"Matches: {len(results.get('matches', []))}\n")
            f.write(f"Search Time: {results.get('search_time', 0):.2f}s\n\n")
            
            f.write(f"MODEL PERFORMANCE:\n")
            f.write(f"  Total Processed: {perf_stats['total_processed']}\n")
            f.write(f"  Success Rate: {perf_stats['success_rate']:.1f}%\n")
            f.write(f"  Avg Confidence: {perf_stats['avg_confidence']:.3f}\n")
            f.write(f"  Avg Processing Time: {perf_stats['avg_processing_time']:.2f}s\n")
            f.write(f"  Avg Text Length: {perf_stats['avg_text_length']:.0f} chars\n\n")

            f.write(f"SEARCH RESULTS:\n")
            f.write(f"-" * 30 + "\n")
            for i, match in enumerate(results.get('matches', []), 1):
                f.write(f"{i}. {match['filename']}\n")
                f.write(f"   OCR Confidence: {match['ocr_confidence']:.3f}\n")
                f.write(f"   Similarity Score: {match.get('similarity_score', match.get('relevance_score', 0)):.3f}\n")
                f.write(f"   Processing Time: {match.get('processing_time', 0):.2f}s\n")
                f.write(f"   Text Length: {match.get('text_length', 0)} chars\n")
                f.write(f"   Text Preview: {match['extracted_text'][:150]}...\n")
                
                if match.get('keywords'):
                    f.write(f"   Keywords: {', '.join(match['keywords'][:5])}\n")
                f.write("\n")

        print(f"📋 Pure Transformer results exported to: {output_path}")
        return str(output_path)

    def close(self):
        """Cleanup resources"""
        if hasattr(self, 'conn') and self.conn:
            self.conn.close()
        if hasattr(self, 'ocr_model') and self.ocr_model and self.device == 'cuda':
            torch.cuda.empty_cache()

# Pure Transformer OCR GUI (No Ollama, No BLIP)
import ipywidgets as widgets
from IPython.display import display, clear_output
import matplotlib.pyplot as plt

def create_pure_transformer_gui():
    """Build the ipywidgets control panel for the Pure Transformer OCR system.

    The panel lets the user pick an OCR model and device, point at an image
    directory, process it, run semantic or keyword searches, view statistics
    and export the most recent search results. Every handler writes its
    output into one shared ``Output`` widget.

    Returns:
        widgets.VBox: the assembled interface, ready to be passed to ``display``.
    """

    # Model selection: every registered model except BLIP
    model_options = []
    for key, info in TRANSFORMER_OCR_MODELS.items():
        if key != 'salesforce_blip':  # Exclude BLIP
            desc = f"{info['name']} - {info['strengths']}"
            model_options.append((desc, key))

    ocr_selector = widgets.Dropdown(
        options=model_options,
        value='microsoft_trocr',
        description='Transformer OCR:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='600px')
    )

    # Device selection
    device_options = [('Auto (GPU if available)', 'auto')]
    if torch.cuda.is_available():
        device_options.append(('Force GPU', 'cuda'))
    device_options.append(('Force CPU', 'cpu'))

    device_selector = widgets.Dropdown(
        options=device_options,
        value='auto',
        description='Processing Device:',
        style={'description_width': 'initial'}
    )

    # Paths and settings
    gallery_path = widgets.Text(
        value='../../datasets/images/text_ocr',
        description='Image Directory:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='500px')
    )

    batch_size = widgets.IntSlider(
        value=4,
        min=1, max=16, step=1,
        description='Batch Size:',
        style={'description_width': 'initial'},
        tooltip='Higher = faster but more memory usage'
    )

    confidence_threshold = widgets.FloatSlider(
        value=0.6,
        min=0.1, max=0.95, step=0.05,
        description='Confidence Threshold:',
        style={'description_width': 'initial'},
        readout_format='.2f'
    )

    # Search interface. The hint is a placeholder rather than a real value, so
    # the user does not have to delete it before typing a query.
    search_input = widgets.Textarea(
        value='',
        placeholder='Enter search query...',
        description='Search Query:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='500px', height='80px')
    )

    # NOTE: this selector is displayed, but no handler in this function reads it;
    # the search mode is chosen by the two dedicated search buttons below.
    search_type = widgets.RadioButtons(
        options=[('Semantic Search', 'semantic'), ('Keyword Search', 'keyword')],
        value='semantic',
        description='Search Type:',
        style={'description_width': 'initial'}
    )

    # Search examples for pure OCR (only the first six get a button)
    search_examples = [
        "invoice numbers", "handwritten notes", "printed receipts",
        "form fields", "technical documents", "certificates",
        "phone numbers", "email addresses", "dates"
    ]

    quick_search_btns = [widgets.Button(
        description=example,
        tooltip=f"Search for {example}",
        layout=widgets.Layout(width='140px', margin='2px')
    ) for example in search_examples[:6]]

    # Action buttons
    model_info_btn = widgets.Button(description='📋 Model Info', button_style='info', layout=widgets.Layout(width='130px'))
    test_setup_btn = widgets.Button(description='🔧 Test Setup', button_style='info', layout=widgets.Layout(width='130px'))
    process_btn = widgets.Button(description='🚀 Process Images', button_style='primary', layout=widgets.Layout(width='150px'))
    semantic_btn = widgets.Button(description='🔍 Semantic Search', button_style='success', layout=widgets.Layout(width='150px'))
    keyword_btn = widgets.Button(description='🔤 Keyword Search', button_style='warning', layout=widgets.Layout(width='150px'))
    stats_btn = widgets.Button(description='📊 View Stats', button_style='info', layout=widgets.Layout(width='130px'))
    export_btn = widgets.Button(description='📋 Export Results', button_style='success', layout=widgets.Layout(width='150px'), disabled=True)
    clear_btn = widgets.Button(description='🗑️ Clear', layout=widgets.Layout(width='120px'))

    # Status and progress
    status_label = widgets.HTML(value="<b>Status:</b> Pure Transformer OCR Ready")
    progress_bar = widgets.IntProgress(value=0, min=0, max=100, layout=widgets.Layout(width='500px'))
    memory_info = widgets.HTML(value="<b>Memory:</b> Not monitored")
    output_area = widgets.Output()

    # State shared by the handlers below (rebound through ``nonlocal``)
    current_system = None
    current_results = None

    def update_status(msg, progress=None, memory_usage=None):
        """Update the status line and, when given, the progress bar and memory label."""
        status_label.value = f"<b>Status:</b> {msg}"
        if progress is not None:
            progress_bar.value = progress
        if memory_usage is not None:
            memory_info.value = f"<b>Memory:</b> {memory_usage}"

    def get_memory_usage():
        """Get current memory usage"""
        if torch.cuda.is_available():
            allocated = torch.cuda.memory_allocated() / 1024**3
            cached = torch.cuda.memory_reserved() / 1024**3
            return f"GPU: {allocated:.1f}GB allocated, {cached:.1f}GB cached"
        return "CPU mode"

    def init_system():
        """(Re)build the OCR system from the current widget values; return True on success."""
        nonlocal current_system
        try:
            config = CONFIG.copy()

            # Handle device selection
            if device_selector.value == 'auto':
                config['DEVICE'] = 'cuda' if torch.cuda.is_available() else 'cpu'
            else:
                config['DEVICE'] = device_selector.value

            config.update({
                'OCR_MODEL': ocr_selector.value,
                'IMAGE_GALLERY_PATH': gallery_path.value,
                'BATCH_SIZE': batch_size.value,
                'OCR_CONFIDENCE_THRESHOLD': confidence_threshold.value
            })

            if current_system:
                # Drop the reference before closing: if anything below fails, the
                # handlers must not mistake an already closed system for a ready one.
                previous, current_system = current_system, None
                previous.close()

            current_system = PureTransformerOCRSystem(config)
            return True
        except Exception as e:
            print(f"❌ System initialization error: {e}")
            return False

    def read_query():
        """Return the stripped query text, or None (after warning the user) when it is empty."""
        query = search_input.value.strip()
        if not query:
            print("⚠️ Please enter a search query")
            return None
        return query

    # Quick search button handlers
    def on_quick_search(search_text):
        def handler(b):
            search_input.value = search_text
        return handler

    for btn, example in zip(quick_search_btns, search_examples):
        btn.on_click(on_quick_search(example))

    # Button handlers
    def on_model_info(b):
        """Print details of the selected model, the device and the current settings."""
        with output_area:
            clear_output(wait=True)
            print("📋 PURE TRANSFORMER OCR MODEL INFORMATION")
            print("=" * 60)

            current_model = ocr_selector.value
            model_info = TRANSFORMER_OCR_MODELS[current_model]

            print(f"🤖 Selected Model: {model_info['name']}")
            print(f"🏷️ Model ID: {model_info['model_id']}")
            print(f"📝 Description: {model_info['desc']}")
            print(f"💪 Strengths: {model_info['strengths']}")
            print(f"🔧 Type: {model_info['type']}")

            if 'note' in model_info:
                print(f"⚠️ Note: {model_info['note']}")

            print(f"\n🖥️ Device: {device_selector.value}")
            if torch.cuda.is_available():
                gpu_name = torch.cuda.get_device_name(0)
                gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
                print(f"🎮 GPU Available: {gpu_name} ({gpu_memory:.1f}GB)")
            else:
                print("💻 CPU Only")

            print(f"\n⚙️ Configuration:")
            print(f"  Batch Size: {batch_size.value}")
            print(f"  Confidence Threshold: {confidence_threshold.value}")
            print(f"  Processing: Pure Transformer (No LLM/Ollama)")
            print(f"  Search: Embeddings + Keywords")

    def on_test_setup(b):
        """Check model access, dependencies, device and image directory, then initialise the system."""
        with output_area:
            clear_output(wait=True)
            update_status("Testing Pure Transformer setup...", 10)

            print("🔧 PURE TRANSFORMER OCR SETUP TEST")
            print("=" * 50)
            print("✨ No Ollama or BLIP dependencies required!")

            # Test model availability
            selected_model = ocr_selector.value
            model_info = TRANSFORMER_OCR_MODELS[selected_model]

            print(f"\n🤖 Testing {model_info['name']}...")
            try:
                if model_info['type'] == 'vision_encoder_decoder':
                    from transformers import TrOCRProcessor
                    # Only the success of the load matters here; the processor is not kept.
                    TrOCRProcessor.from_pretrained(model_info['model_id'])
                    print(f"✅ Model {model_info['model_id']} accessible")
                else:
                    print(f"⚠️ {model_info['name']} requires custom implementation")
            except Exception as e:
                print(f"❌ Model access error: {e}")

            update_status("Testing dependencies...", 40)

            # Test core dependencies (no Ollama needed)
            deps = ['torch', 'torchvision', 'transformers', 'sentence_transformers', 'chromadb']
            for dep in deps:
                try:
                    __import__(dep.replace('-', '_'))
                    print(f"✅ {dep}")
                except ImportError:
                    print(f"❌ {dep}")

            update_status("Testing GPU/CPU...", 70)

            # Test device
            if device_selector.value == 'cuda' and not torch.cuda.is_available():
                print("⚠️ GPU requested but not available")
            elif torch.cuda.is_available():
                print(f"✅ GPU: {torch.cuda.get_device_name(0)}")
                print(f"   Memory: {get_memory_usage()}")
            else:
                print("✅ CPU mode")

            # Test image directory
            img_path = Path(gallery_path.value)
            if img_path.exists():
                img_count = len([f for f in img_path.glob('**/*')
                               if f.suffix.lower() in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']])
                print(f"✅ Image directory: {img_count} images found")
            else:
                print(f"⚠️ Image directory not found: {gallery_path.value}")

            update_status("Initializing pure system...", 90)

            if init_system():
                print("✅ Pure Transformer OCR system ready!")
                print("🚀 No external dependencies (Ollama/BLIP) required")
                update_status("Pure setup complete!", 100, get_memory_usage())
            else:
                print("❌ System initialization failed")
                update_status("Setup failed", 0)

    def on_process(b):
        """Rebuild the system from the widgets, process the image directory and print a summary."""
        with output_area:
            clear_output(wait=True)
            update_status("Initializing processing...", 10)

            if not init_system():
                update_status("Initialization failed", 0)
                return

            print("🚀 PURE TRANSFORMER OCR PROCESSING")
            print("=" * 50)
            print(f"🤖 Model: {TRANSFORMER_OCR_MODELS[ocr_selector.value]['name']}")
            print(f"🖥️ Device: {current_system.device}")
            print(f"📦 Batch Size: {batch_size.value}")
            print(f"📁 Directory: {gallery_path.value}")
            print("✨ Pure processing - no LLM/Ollama required")

            try:
                update_status("Processing with Pure Transformer OCR...", 50, get_memory_usage())
                current_system.process_images()

                # Show performance summary
                stats = current_system.get_model_performance_stats()
                print(f"\n📊 PROCESSING SUMMARY:")
                print(f"  🎯 Success Rate: {stats['success_rate']:.1f}%")
                print(f"  ⚡ Avg Processing Time: {stats['avg_processing_time']:.2f}s per image")
                print(f"  📏 Avg Text Length: {stats['avg_text_length']:.0f} characters")
                print(f"  🎚️ Avg Confidence: {stats['avg_confidence']:.3f}")

                update_status("Pure processing complete!", 100, get_memory_usage())

            except Exception as e:
                update_status("Processing error", 0)
                print(f"❌ Processing error: {e}")

    def on_semantic_search(b):
        """Run a semantic search for the current query and show the top matches."""
        nonlocal current_results
        with output_area:
            clear_output(wait=True)

            if not current_system and not init_system():
                print("❌ System not ready")
                return

            query = read_query()
            if query is None:
                return

            update_status("Performing semantic search...", 30, get_memory_usage())

            try:
                print(f"🔍 PURE SEMANTIC SEARCH")
                print(f"Query: '{query}'")
                print(f"Model: {TRANSFORMER_OCR_MODELS[current_system.config['OCR_MODEL']]['name']}")
                print("🚀 Using pure embeddings (no LLM required)")

                current_results = current_system.semantic_search(query, k=15)
                matches = current_results.get('matches', [])

                if matches:
                    update_status(f"Found {len(matches)} semantic matches", 100, get_memory_usage())
                    export_btn.disabled = False

                    print(f"\n🎯 SEARCH RESULTS ({current_results.get('search_time', 0):.2f}s):")
                    print("-" * 60)

                    for i, match in enumerate(matches[:8], 1):  # Show top 8
                        print(f"{i}. {match['filename']}")
                        print(f"   🎯 Similarity: {match.get('similarity_score', 0):.3f}")
                        print(f"   🎚️ OCR Confidence: {match['ocr_confidence']:.3f}")
                        print(f"   📏 Text Length: {match.get('text_length', 0)} chars")
                        print(f"   📝 Text: {match['extracted_text'][:120]}...")

                        if match.get('keywords'):
                            print(f"   🔑 Keywords: {', '.join(match['keywords'][:4])}")
                        print()

                    # Display results visually
                    display_pure_results([m['filename'] for m in matches[:6]], current_system)

                else:
                    update_status("No semantic matches found", 100)
                    export_btn.disabled = True
                    print("❌ No semantic matches found")
                    print("💡 Try different search terms or check if images are processed")

            except Exception as e:
                update_status("Search error", 0)
                print(f"❌ Search error: {e}")

    def on_keyword_search(b):
        """Run a keyword search for the current query and show the top matches."""
        nonlocal current_results
        with output_area:
            clear_output(wait=True)

            if not current_system and not init_system():
                print("❌ System not ready")
                return

            query = read_query()
            if query is None:
                return

            update_status("Performing keyword search...", 30)

            try:
                print(f"🔤 KEYWORD SEARCH")
                print(f"Query: '{query}'")
                print("🔍 Searching in extracted text")

                current_results = current_system.keyword_search(query)
                matches = current_results.get('matches', [])

                if matches:
                    update_status(f"Found {len(matches)} keyword matches", 100)
                    export_btn.disabled = False

                    print(f"\n📝 KEYWORD RESULTS ({current_results.get('search_time', 0):.2f}s):")
                    print("-" * 60)

                    for i, match in enumerate(matches[:10], 1):  # Show top 10
                        print(f"{i}. {match['filename']}")
                        print(f"   🎯 Relevance: {match.get('relevance_score', 0):.3f}")
                        print(f"   🎚️ OCR Confidence: {match['ocr_confidence']:.3f}")
                        print(f"   📏 Text Length: {match.get('text_length', 0)} chars")
                        print(f"   📝 Text: {match['extracted_text'][:120]}...")
                        print()

                    # Display results visually
                    display_pure_results([m['filename'] for m in matches[:6]], current_system)

                else:
                    update_status("No keyword matches found", 100)
                    export_btn.disabled = True
                    print("❌ No keyword matches found")

            except Exception as e:
                update_status("Keyword search error", 0)
                print(f"❌ Keyword search error: {e}")

    def on_stats(b):
        """Print model performance, recent activity and the most frequent keywords."""
        with output_area:
            clear_output(wait=True)

            if not current_system and not init_system():
                print("❌ System not ready")
                return

            try:
                print("📊 PURE TRANSFORMER OCR STATISTICS")
                print("=" * 50)

                # Overall stats
                stats = current_system.get_model_performance_stats()
                print(f"📈 MODEL PERFORMANCE:")
                print(f"  Model: {stats['model_name']}")
                print(f"  Total Processed: {stats['total_processed']} images")
                print(f"  Success Rate: {stats['success_rate']:.1f}%")
                print(f"  Avg Confidence: {stats['avg_confidence']:.3f}")
                print(f"  Avg Processing Time: {stats['avg_processing_time']:.2f}s per image")
                print(f"  Avg Text Length: {stats['avg_text_length']:.0f} characters")

                # Recent activity
                recent = current_system.conn.execute('''SELECT COUNT(*) FROM image_texts 
                    WHERE datetime(processed_date) > datetime('now', '-7 days')''').fetchone()[0]
                print(f"\n📅 RECENT ACTIVITY:")
                print(f"  Images processed (last 7 days): {recent}")

                # Search history
                searches = current_system.conn.execute('''SELECT COUNT(*) FROM text_searches 
                    WHERE datetime(search_date) > datetime('now', '-7 days')''').fetchone()[0]
                print(f"  Searches performed (last 7 days): {searches}")

                # Top keywords
                all_keywords = current_system.conn.execute('''SELECT text_keywords FROM image_texts 
                    WHERE text_keywords IS NOT NULL''').fetchall()

                keyword_counts = {}
                for (keywords_json,) in all_keywords:
                    try:
                        keywords = json.loads(keywords_json)
                        for keyword in keywords:
                            keyword_counts[keyword] = keyword_counts.get(keyword, 0) + 1
                    except (TypeError, ValueError):
                        # Skip rows whose keyword column is not a JSON list of hashable
                        # items; anything else (e.g. KeyboardInterrupt) must propagate.
                        continue

                if keyword_counts:
                    top_keywords = sorted(keyword_counts.items(), key=lambda x: x[1], reverse=True)[:10]
                    print(f"\n🔑 TOP KEYWORDS:")
                    for keyword, count in top_keywords:
                        print(f"  {keyword}: {count} occurrences")

                update_status("Statistics displayed", 100)

            except Exception as e:
                print(f"❌ Statistics error: {e}")

    def on_export(b):
        """Export the most recent search results through the active system."""
        with output_area:
            # Both are required: results to write, and a live system to write them with.
            if not current_results or not current_system:
                print("⚠️ No search results to export")
                return
            try:
                update_status("Exporting pure results...", 50)
                path = current_system.export_pure_results(current_results, search_input.value or "search")
                update_status("Export complete!", 100)
                print(f"✅ Pure Transformer results exported to: {path}")
            except Exception as e:
                update_status("Export error", 0)
                print(f"❌ Export error: {e}")

    def on_clear(b):
        """Forget the last results, clear the output area and release cached GPU memory."""
        nonlocal current_results
        current_results = None
        export_btn.disabled = True
        with output_area:
            clear_output()
            update_status("Output cleared", 0)
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                memory_info.value = f"<b>Memory:</b> {get_memory_usage()}"

    # Connect events
    model_info_btn.on_click(on_model_info)
    test_setup_btn.on_click(on_test_setup)
    process_btn.on_click(on_process)
    semantic_btn.on_click(on_semantic_search)
    keyword_btn.on_click(on_keyword_search)
    stats_btn.on_click(on_stats)
    export_btn.on_click(on_export)
    clear_btn.on_click(on_clear)

    # Layout
    header = widgets.HTML("""
        <h2>🤖 Pure Transformer OCR Image Search System</h2>
        <p><i>Microsoft TrOCR and AllenAI OLMo OCR - No Ollama or BLIP dependencies</i></p>
        <p><b>Features:</b> Pure transformer processing, semantic search, GPU acceleration</p>
    """)

    config_section = widgets.VBox([
        widgets.HTML("<h3>🔧 Pure Transformer Configuration</h3>"),
        ocr_selector,
        widgets.HBox([device_selector, batch_size]),
        gallery_path,
        confidence_threshold
    ])

    search_section = widgets.VBox([
        widgets.HTML("<h3>🔍 Search Interface</h3>"),
        search_input,
        search_type,
        widgets.HTML("<b>Quick Search Examples:</b>"),
        widgets.GridBox(quick_search_btns, layout=widgets.Layout(grid_template_columns='repeat(3, 1fr)', grid_gap='5px'))
    ])

    controls_section = widgets.VBox([
        widgets.HTML("<h3>🎛️ Actions</h3>"),
        widgets.HBox([model_info_btn, test_setup_btn]),
        widgets.HBox([process_btn, semantic_btn]),
        widgets.HBox([keyword_btn, stats_btn]),
        widgets.HBox([export_btn, clear_btn])
    ])

    status_section = widgets.VBox([
        widgets.HTML("<hr>"),
        status_label,
        progress_bar,
        memory_info
    ])

    main_interface = widgets.VBox([
        header,
        config_section,
        search_section,
        controls_section,
        status_section,
        output_area
    ])

    return main_interface

def display_pure_results(filenames: List[str], system: PureTransformerOCRSystem, max_images=6):
    """Show a grid of result images annotated with their stored OCR data.

    For each filename the matching ``image_texts`` row is looked up through
    ``system.conn``; the image is drawn with a title (model, confidence,
    processing time) and overlays for the text preview, text length and up
    to three keywords. Problems with one image are rendered as a message in
    its own cell and do not affect the others.

    Args:
        filenames: Filenames to display, in display order.
        system: OCR system whose ``conn`` is queried for the stored results.
        max_images: Upper bound on the number of images shown.
    """
    # Nothing to draw; a non-positive limit would also make plt.subplots() fail on a 0-row grid.
    if not filenames or max_images < 1:
        return

    num = min(len(filenames), max_images)
    cols = 3 if num > 4 else 2 if num > 1 else 1
    rows = (num + cols - 1) // cols

    # squeeze=False always returns a 2-D array of Axes, so one flatten() covers every grid shape.
    _, axes = plt.subplots(rows, cols, figsize=(16, 6*rows), squeeze=False)
    axes = axes.flatten()

    for ax, filename in zip(axes, filenames[:num]):
        try:
            # Get info from database
            result = system.conn.execute('''SELECT full_path, extracted_text, ocr_confidence, 
                text_keywords, text_length, ocr_model, processing_time FROM image_texts WHERE filename = ?''', 
                (filename,)).fetchone()

            if not result:
                ax.text(0.5, 0.5, f"Not found\n{filename}", ha='center', va='center', transform=ax.transAxes)
                ax.axis('off')
                continue

            full_path, text, confidence, keywords_json, text_len, ocr_model, proc_time = result

            try:
                img = plt.imread(full_path)
                ax.imshow(img)
            except Exception:
                # Best effort: any load/render failure becomes a placeholder cell,
                # while KeyboardInterrupt/SystemExit still propagate.
                ax.text(0.5, 0.5, f"Image load error\n{filename}", ha='center', va='center', transform=ax.transAxes)
                ax.axis('off')
                continue

            # Title with model info
            model_name = TRANSFORMER_OCR_MODELS.get(ocr_model, {}).get('name', 'Unknown')[:20]

            # A NULL confidence or processing time would break the numeric formatting
            # and overwrite the drawn image with an error, so show "n/a" instead.
            conf_value = confidence if confidence is not None else 0.0
            conf_color = "green" if conf_value > 0.9 else "orange" if conf_value > 0.7 else "red"
            conf_label = f"{confidence:.3f}" if confidence is not None else "n/a"
            time_label = f"{proc_time:.2f}s" if proc_time is not None else "n/a"

            title = f"{filename}\n{model_name}\nConf: {conf_label} | {time_label}"
            ax.set_title(title, fontsize=9, color=conf_color, fontweight='bold')

            # Text preview
            text_preview = text[:80] if text else "No text detected"
            ax.text(0.02, 0.98, f"📝 {text_preview}...", transform=ax.transAxes, fontsize=8,
                   verticalalignment='top', 
                   bbox=dict(boxstyle="round,pad=0.3", facecolor="lightblue", alpha=0.9))

            # Text statistics
            ax.text(0.02, 0.12, f"📊 Length: {text_len} chars", transform=ax.transAxes, fontsize=7,
                   verticalalignment='top',
                   bbox=dict(boxstyle="round,pad=0.2", facecolor="lightyellow", alpha=0.8))

            # Keywords if available
            if keywords_json:
                try:
                    keywords = json.loads(keywords_json)[:3]
                    if keywords:
                        ax.text(0.02, 0.02, f"🔑 {', '.join(keywords)}", transform=ax.transAxes, fontsize=7,
                               verticalalignment='bottom',
                               bbox=dict(boxstyle="round,pad=0.2", facecolor="lightgreen", alpha=0.7))
                except (TypeError, ValueError, KeyError):
                    # Malformed keyword data is only decoration; skip the overlay.
                    pass

            ax.axis('off')

        except Exception as e:
            ax.text(0.5, 0.5, f"Display Error\n{filename}\n{str(e)[:20]}", 
                   ha='center', va='center', transform=ax.transAxes, fontsize=8)
            ax.axis('off')

    # Hide unused subplots
    for spare_ax in axes[num:]:
        spare_ax.axis('off')

    plt.tight_layout()
    plt.suptitle('🤖 Pure Transformer OCR Results', fontsize=16, y=1.02)
    plt.show()

# Utility functions for Pure Transformer OCR
def quick_pure_setup():
    """Print a quick environment report for the Pure Transformer OCR stack.

    Reports the PyTorch version, CUDA/GPU availability, the Transformers
    version and whether ``sentence_transformers`` and ``chromadb`` can be
    imported. Everything is written to stdout; nothing is returned.
    """
    import importlib  # local import: only this helper needs it

    print("🤖 PURE TRANSFORMER OCR SETUP CHECK")
    print("=" * 50)
    print("✨ No Ollama or BLIP dependencies required!")

    # Check PyTorch and GPU
    print(f"✅ PyTorch: {torch.__version__}")
    if torch.cuda.is_available():
        print(f"✅ CUDA: {torch.version.cuda}")
        print(f"🎮 GPU: {torch.cuda.get_device_name(0)}")
    else:
        print("💻 CPU only (slower processing)")

    # Check Transformers
    try:
        import transformers
        print(f"✅ Transformers: {transformers.__version__}")
    except Exception:
        # Deliberately broader than ImportError (a broken install can fail in other
        # ways at import time), but no longer swallows KeyboardInterrupt/SystemExit.
        print("❌ Transformers not found: pip install transformers")

    # Check pure dependencies
    print(f"\n📦 Pure dependencies:")
    for dep in ('sentence_transformers', 'chromadb'):
        try:
            importlib.import_module(dep)
        except Exception:
            print(f"❌ {dep}")
        else:
            print(f"✅ {dep}")

def quick_pure_search(query: str, 
                     image_folder: str = '../../datasets/images/text_ocr',
                     model: str = 'microsoft_trocr'):
    """Run a one-shot semantic search with a Pure Transformer OCR model.

    Builds a system for ``image_folder`` and ``model``, processes the images
    first when the database holds no vector-stored rows for that model, then
    runs a semantic search, prints the top matches and exports them.

    Args:
        query: Search text.
        image_folder: Directory containing the images to search.
        model: Key into ``TRANSFORMER_OCR_MODELS``.

    Returns:
        The result dict from ``semantic_search``, or ``None`` when an error
        occurred (the error is printed).

    Raises:
        KeyError: if ``model`` is not a key of ``TRANSFORMER_OCR_MODELS``.
    """
    print(f"🤖 PURE TRANSFORMER OCR SEARCH")
    print(f"Query: '{query}'")
    print(f"Model: {TRANSFORMER_OCR_MODELS[model]['name']}")
    print(f"Folder: {image_folder}")
    print("🚀 Pure processing - no Ollama/BLIP required")

    system = None
    try:
        config = CONFIG.copy()
        config.update({
            'IMAGE_GALLERY_PATH': image_folder,
            'OCR_MODEL': model,
            'DEVICE': 'cuda' if torch.cuda.is_available() else 'cpu'
        })

        system = PureTransformerOCRSystem(config)

        # Check if images are processed with this model
        count = system.conn.execute(
            "SELECT COUNT(*) FROM image_texts WHERE vector_stored = 1 AND ocr_model = ?",
            (model,)
        ).fetchone()[0]

        if count == 0:
            print("📁 Processing images with Pure Transformer OCR...")
            system.process_images()

        # Perform semantic search
        results = system.semantic_search(query, k=10)
        matches = results.get('matches', [])

        if matches:
            print(f"\n✅ Found {len(matches)} semantic matches:")
            print("-" * 50)

            for i, match in enumerate(matches[:5], 1):
                print(f"{i}. {match['filename']}")
                print(f"   🎯 Similarity: {match.get('similarity_score', 0):.3f}")
                print(f"   🎚️ OCR Confidence: {match['ocr_confidence']:.3f}")
                print(f"   📏 Text Length: {match.get('text_length', 0)} chars")
                print(f"   📝 Text: {match['extracted_text'][:100]}...")

                if match.get('keywords'):
                    print(f"   🔑 Keywords: {', '.join(match['keywords'][:4])}")
                print()

            # Export results
            path = system.export_pure_results(results, query)
            print(f"📋 Results exported to: {path}")

            # Show model performance
            stats = system.get_model_performance_stats()
            print(f"\n📊 MODEL PERFORMANCE:")
            print(f"   Success Rate: {stats['success_rate']:.1f}%")
            print(f"   Avg Processing Time: {stats['avg_processing_time']:.2f}s")
            print(f"   Avg Confidence: {stats['avg_confidence']:.3f}")

        else:
            print("❌ No semantic matches found")
            print("💡 Try different search terms or check image content")

        return results

    except Exception as e:
        print(f"❌ Error: {e}")
        return None

    finally:
        # Always release the system, including when processing, search or export
        # raised; previously the close call was skipped on any such error.
        if system is not None:
            try:
                system.close()
            except Exception as close_err:
                print(f"⚠️ Cleanup error: {close_err}")

def compare_pure_models(image_folder: str = '../../datasets/images/text_ocr',
                       models: Optional[List[str]] = None):
    """Compare Pure Transformer OCR models (no Ollama/BLIP) on a small image sample.

    For every requested model that is registered in ``TRANSFORMER_OCR_MODELS``,
    a ``PureTransformerOCRSystem`` is created, run over at most 10 images from
    ``image_folder`` and then closed. Per-model metrics are printed, and a
    ranked summary is printed when more than one model produced results.

    Args:
        image_folder: Directory whose top-level image files are sampled.
        models: Keys of ``TRANSFORMER_OCR_MODELS`` to compare. Defaults to
            ``['microsoft_trocr', 'microsoft_trocr_handwritten']`` when omitted.

    Returns:
        Dict mapping each successfully tested model key to a dict with
        ``name``, ``success_rate`` (percent), ``avg_confidence``, ``avg_time``
        (seconds per image), ``avg_text_length``, ``total_processed`` and
        ``successful_extractions``. Models that were skipped or that raised
        are absent from the result.
    """
    # Resolve the default here rather than in the signature so the default
    # list object is never shared between calls.
    if models is None:
        models = ['microsoft_trocr', 'microsoft_trocr_handwritten']

    sample_limit = 10  # upper bound on images processed per model
    exts = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']

    print("🏆 PURE TRANSFORMER OCR MODEL COMPARISON")
    print("=" * 60)
    print("✨ Comparing pure models - no external dependencies")

    comparison_results = {}

    for model in models:
        if model not in TRANSFORMER_OCR_MODELS or model == 'salesforce_blip':
            print(f"⚠️ Skipping model: {model}")
            continue

        print(f"\n🤖 Testing: {TRANSFORMER_OCR_MODELS[model]['name']}")
        print("-" * 40)

        try:
            # Validate the input folder and collect the sample BEFORE building
            # the OCR system, so nothing is constructed (or left unclosed)
            # when there is nothing to process.
            gallery_path = Path(image_folder)
            if not gallery_path.exists():
                print(f"❌ Folder not found: {image_folder}")
                continue

            # Collect at most `sample_limit` files in total, walking the
            # extensions in order. Capping the list itself keeps the metrics
            # below consistent with the number of images actually processed.
            sample_files = []
            for ext in exts:
                remaining = sample_limit - len(sample_files)
                if remaining <= 0:
                    break
                sample_files.extend(list(gallery_path.glob(f"*{ext}"))[:remaining])

            if not sample_files:
                print("❌ No images found for comparison")
                continue

            config = CONFIG.copy()
            config.update({
                'IMAGE_GALLERY_PATH': image_folder,
                'OCR_MODEL': model,
                'DEVICE': 'cuda' if torch.cuda.is_available() else 'cpu'
            })

            system = PureTransformerOCRSystem(config)
            try:
                # Process sample images
                start_time = time.time()
                successful = 0
                total_confidence = 0
                total_text_length = 0

                for img_path in sample_files:
                    result = system.extract_text(img_path)
                    # Count an extraction only if it produced non-blank text
                    # and the result carries no truthy 'error' entry.
                    if result['extracted_text'].strip() and not result.get('error'):
                        successful += 1
                        total_confidence += result['ocr_confidence']
                        total_text_length += len(result['extracted_text'])

                total_time = time.time() - start_time
            finally:
                # Always release the system, including when extraction raises.
                system.close()

            # Calculate metrics (sample_files is non-empty at this point)
            processed = len(sample_files)
            success_rate = successful / processed * 100
            avg_confidence = total_confidence / successful if successful > 0 else 0
            avg_time = total_time / processed
            avg_text_length = total_text_length / successful if successful > 0 else 0

            comparison_results[model] = {
                'name': TRANSFORMER_OCR_MODELS[model]['name'],
                'success_rate': success_rate,
                'avg_confidence': avg_confidence,
                'avg_time': avg_time,
                'avg_text_length': avg_text_length,
                'total_processed': processed,
                'successful_extractions': successful
            }

            print(f"✅ Success Rate: {success_rate:.1f}%")
            print(f"⚡ Avg Time: {avg_time:.2f}s per image")
            print(f"🎚️ Avg Confidence: {avg_confidence:.3f}")
            print(f"📏 Avg Text Length: {avg_text_length:.0f} chars")

        except Exception as e:
            # Best-effort comparison: one failing model must not abort the rest.
            print(f"❌ Error testing {model}: {e}")

    # Summary comparison (only meaningful with at least two tested models)
    if len(comparison_results) > 1:
        print("\n🏆 PURE MODEL COMPARISON SUMMARY")
        print("=" * 60)

        # Sort by success rate, best first
        sorted_results = sorted(comparison_results.items(),
                                key=lambda x: x[1]['success_rate'],
                                reverse=True)

        for i, (model, data) in enumerate(sorted_results, 1):
            print(f"{i}. {data['name']}")
            print(f"   🎯 Success: {data['success_rate']:.1f}%")
            print(f"   ⚡ Speed: {data['avg_time']:.2f}s")
            print(f"   🎚️ Quality: {data['avg_confidence']:.3f}")
            print()

        best_model = sorted_results[0]
        print(f"🏆 WINNER: {best_model[1]['name']}")
        print("   Best pure transformer performance for your dataset")

    return comparison_results

# Announce, build and render the Pure Transformer OCR widget interface.
print(
    "🤖 Creating Pure Transformer OCR Interface...",
    "🎯 Features: TrOCR, OLMo OCR, GPU acceleration, pure processing",
    "✨ No Ollama or BLIP dependencies required!",
    sep="\n",
)

pure_transformer_interface = create_pure_transformer_gui()
display(pure_transformer_interface)

# Capability overview and one-line entry points.
print(
    "\n🤖 PURE TRANSFORMER OCR SYSTEM READY!",
    "=" * 70,
    "🔒 PRIVACY: 100% local processing with pure Transformer models",
    "🤖 MODELS: Microsoft TrOCR variants, AllenAI OLMo OCR (placeholder)",
    "⚡ ACCELERATION: GPU support for faster processing",
    "🔍 SEARCH: Pure semantic search with embeddings",
    "📊 ANALYSIS: Simple text analysis without LLM dependency",
    "🚀 SETUP: Automatic model download, GPU detection",
    "✨ PURE: No Ollama, no BLIP, no external LLM required",
    "💡 QUICK: quick_pure_setup() | quick_pure_search('text', './images', 'microsoft_trocr')",
    "🏆 COMPARE: compare_pure_models('./images', ['microsoft_trocr', 'microsoft_trocr_handwritten'])",
    "=" * 70,
    sep="\n",
)

# Step-by-step usage guide for the widget.
print(
    "\n🤖 PURE TRANSFORMER OCR USAGE GUIDE:",
    "1. Select your preferred Transformer OCR model (TrOCR variants)",
    "2. Choose GPU/CPU processing (GPU recommended for speed)",
    "3. Process images to extract text with pure AI models",
    "4. Perform semantic search using embeddings (no LLM needed)",
    "5. Use keyword search for exact text matching",
    "6. View statistics and performance metrics",
    "7. Export results with detailed analysis",
    "=" * 70,
    sep="\n",
)

# Which model key to pick for which kind of document.
print(
    "\n📋 PURE MODEL RECOMMENDATIONS:",
    "• microsoft_trocr: Best for clean printed documents and forms",
    "• microsoft_trocr_handwritten: Specialized for handwritten text",
    "• microsoft_trocr_large: High accuracy for complex documents",
    "• allenai_olmo_ocr: Advanced reasoning (requires implementation)",
    "=" * 70,
    sep="\n",
)

# Closing summary of the approach's selling points.
print(
    "\n🚀 KEY BENEFITS OF PURE APPROACH:",
    "• No external dependencies (Ollama/BLIP)",
    "• Faster setup and processing",
    "• Pure Transformer architecture",
    "• GPU acceleration for speed",
    "• Semantic search with embeddings",
    "• Simple text analysis without LLM complexity",
    "• 100% local and private processing",
    "=" * 70,
    sep="\n",
)