Worked with Python_3.9

In [1]:
# Cell 1: Configuration and Setup
import os
import shutil
import sqlite3
import json
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any
import time
import numpy as np
from PIL import Image
import cv2

# Configuration Settings
CONFIG = {
    # OCR Model Selection
    'OCR_MODEL': 'paddleocr',  # 'paddleocr', 'easyocr', 'tesseract'
    'OCR_LANGUAGES': ['en'],   # Languages for OCR

    # Search and Embeddings
    'EMBEDDING_MODEL': 'sentence-transformers/all-MiniLM-L6-v2',
    'DEVICE': 'auto',  # 'auto', 'cpu', 'cuda'

    # Paths
    'SUSPECTS_GALLERY_PATH': '../../datasets/images/text_ocr',     # Input folder with suspect images
    'DATABASE_PATH': 'forensic_analysis_db',       # Database folder
    'RESULTS_OUTPUT_PATH': '../../datasets/images/forensic_results',     # Output folder for matched images

    # OCR Processing parameters
    'OCR_CONFIDENCE_THRESHOLD': 0.6,  # Minimum OCR confidence
    'TEXT_MIN_LENGTH': 3,              # Minimum text length to consider
    'BATCH_SIZE': 4,                   # Batch size for processing

    # Search parameters
    'SIMILARITY_THRESHOLD': 0.3,       # Semantic similarity threshold
    'MAX_RESULTS_DISPLAY': 10,         # Maximum results to display
    'TOP_K_RESULTS': 20,               # Top K results for queries

    # Processing settings
    'SAVE_OCR_METADATA': True,         # Save detailed OCR metadata
    'FIGURE_SIZE': (15, 10),           # Size of result visualization
}

# Available OCR models
AVAILABLE_OCR_MODELS = {
    'paddleocr': {
        'name': 'PaddleOCR',
        'description': 'High accuracy, supports 80+ languages',
        'performance': 'Best overall performance',
        'install_cmd': 'pip install paddlepaddle paddleocr'
    },
    'easyocr': {
        'name': 'EasyOCR',
        'description': 'Simple to use, good accuracy',
        'performance': 'Good performance, easy setup',
        'install_cmd': 'pip install easyocr'
    },
    'tesseract': {
        'name': 'Tesseract',
        'description': 'Classic OCR, widely supported',
        'performance': 'Baseline performance',
        'install_cmd': 'pip install pytesseract'
    }
}

print("✅ Configuration loaded successfully")
print(f"📁 Suspects gallery: {CONFIG['SUSPECTS_GALLERY_PATH']}")
print(f"📁 Database path: {CONFIG['DATABASE_PATH']}")
print(f"📁 Results output: {CONFIG['RESULTS_OUTPUT_PATH']}")
print(f"🔍 OCR model: {CONFIG['OCR_MODEL']}")
print("📝 Forensic OCR + Search System Ready")

# Cell 2: Install and Import Dependencies
def install_dependencies():
    """Install required packages based on selected OCR model"""
    import subprocess
    import sys

    try:
        # Check core dependencies
        import numpy
        import PIL
        import cv2
        print("✅ Core dependencies available")
    except ImportError as e:
        print(f"⚠️ Installing core dependencies...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "numpy", "Pillow", "opencv-python"])

    # Install OCR model
    ocr_model = CONFIG['OCR_MODEL']
    try:
        if ocr_model == 'paddleocr':
            import paddleocr
            print("✅ PaddleOCR already available")
        elif ocr_model == 'easyocr':
            import easyocr
            print("✅ EasyOCR already available")
        elif ocr_model == 'tesseract':
            import pytesseract
            print("✅ Tesseract already available")
    except ImportError:
        print(f"⚠️ Installing {ocr_model}...")
        install_cmd = AVAILABLE_OCR_MODELS[ocr_model]['install_cmd']
        subprocess.check_call([sys.executable, "-m", "pip", "install"] + install_cmd.split()[2:])

    # Install search dependencies
    try:
        import chromadb
        import sentence_transformers
        print("✅ Search dependencies available")
    except ImportError:
        print("⚠️ Installing search dependencies...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "chromadb", "sentence-transformers"])

# Run installation
install_dependencies()

# Import required libraries
try:
    import chromadb
    from sentence_transformers import SentenceTransformer

    # Import OCR model
    if CONFIG['OCR_MODEL'] == 'paddleocr':
        from paddleocr import PaddleOCR
    elif CONFIG['OCR_MODEL'] == 'easyocr':
        import easyocr
    elif CONFIG['OCR_MODEL'] == 'tesseract':
        import pytesseract

    print("✅ All dependencies imported successfully")

    # Check device compatibility
    import torch
    if torch.cuda.is_available():
        print(f"🚀 CUDA available: {torch.cuda.get_device_name(0)}")
        default_device = "cuda"
    else:
        print("🖥️ Using CPU mode")
        default_device = "cpu"

    if CONFIG['DEVICE'] == 'auto':
        CONFIG['DEVICE'] = default_device
        print(f"📍 Auto-detected device: {CONFIG['DEVICE']}")

except ImportError as e:
    print(f"❌ Import error: {e}")
    print("🔧 Please run the installation cell and restart kernel")

# Cell 3: Initialize OCR and Search Models
class ForensicOCRSearchSystem:
    """Comprehensive forensic OCR and search system"""

    def __init__(self, config=None):
        """Initialize the forensic analysis system"""
        self.config = config or CONFIG
        self.setup_directories()
        self.init_ocr_model()
        self.init_search_components()
        self.init_database()

    def setup_directories(self):
        """Create necessary directories"""
        for path_key in ['SUSPECTS_GALLERY_PATH', 'DATABASE_PATH', 'RESULTS_OUTPUT_PATH']:
            path = self.config[path_key]
            os.makedirs(path, exist_ok=True)
            print(f"📁 Directory ready: {path}")

    def init_ocr_model(self):
        """Initialize OCR model based on configuration"""
        ocr_model = self.config['OCR_MODEL']
        languages = self.config['OCR_LANGUAGES']

        print(f"📥 Loading OCR model: {ocr_model}")

        try:
            if ocr_model == 'paddleocr':
                self.ocr = PaddleOCR(
                    use_textline_orientation=True,
                    lang=languages[0] if languages else 'en',
                    # use_gpu=True if self.config['DEVICE'] == 'cuda' else False
                )
                print("✅ PaddleOCR initialized")

            elif ocr_model == 'easyocr':
                self.ocr = easyocr.Reader(
                    languages,
                    gpu=True if self.config['DEVICE'] == 'cuda' else False
                )
                print("✅ EasyOCR initialized")

            elif ocr_model == 'tesseract':
                # Tesseract doesn't need initialization, just check availability
                try:
                    import pytesseract
                    self.ocr = pytesseract
                    print("✅ Tesseract initialized")
                except Exception as e:
                    print(f"❌ Tesseract error: {e}")

        except Exception as e:
            print(f"❌ OCR initialization error: {e}")
            self.ocr = None

    def init_search_components(self):
        """Initialize search and embedding components"""
        try:
            print(f"📥 Loading embedding model: {self.config['EMBEDDING_MODEL']}")
            self.embedding_model = SentenceTransformer(self.config['EMBEDDING_MODEL'])

            print("📥 Initializing vector database...")
            self.chroma_client = chromadb.PersistentClient(path=self.config['DATABASE_PATH'])
            self.collection = self.chroma_client.get_or_create_collection(
                name="forensic_text_search",
                metadata={"hnsw:space": "cosine"}
            )
            print("✅ Search components initialized")

        except Exception as e:
            print(f"❌ Search initialization error: {e}")
            self.embedding_model = None
            self.chroma_client = None
            self.collection = None

    def init_database(self):
        """Initialize SQLite database for metadata"""
        try:
            db_path = os.path.join(self.config['DATABASE_PATH'], 'forensic_metadata.db')
            self.conn = sqlite3.connect(db_path)

            # Create tables
            self.conn.execute('''
                CREATE TABLE IF NOT EXISTS forensic_images (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    filename TEXT UNIQUE NOT NULL,
                    full_path TEXT NOT NULL,
                    extracted_text TEXT,
                    ocr_confidence REAL,
                    text_blocks_count INTEGER,
                    processed_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    file_size INTEGER,
                    image_dimensions TEXT,
                    ocr_metadata TEXT
                )
            ''')

            self.conn.execute('''
                CREATE TABLE IF NOT EXISTS search_queries (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    query_text TEXT NOT NULL,
                    query_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    results_count INTEGER,
                    processing_time REAL
                )
            ''')

            self.conn.commit()
            print("✅ Database initialized")

        except Exception as e:
            print(f"❌ Database initialization error: {e}")
            self.conn = None

    def extract_text_from_image(self, image_path: str) -> Dict[str, Any]:
        """Extract text from image using configured OCR model"""
        try:
            image_path = str(image_path)
            filename = os.path.basename(image_path)

            # Load image
            image = Image.open(image_path).convert("RGB")
            image_array = np.array(image)

            result = {
                'filename': filename,
                'full_path': image_path,
                'extracted_text': '',
                'ocr_confidence': 0.0,
                'text_blocks': [],
                'text_blocks_count': 0,
                'error': None
            }

            # Process with selected OCR model
            if self.config['OCR_MODEL'] == 'paddleocr' and self.ocr:
                ocr_result = self.ocr.ocr(image_array, cls=True)

                if ocr_result and ocr_result[0]:
                    text_blocks = []
                    all_text = []
                    confidences = []

                    for line in ocr_result:
                        for detection in line:
                            bbox, (text, confidence) = detection

                            if confidence >= self.config['OCR_CONFIDENCE_THRESHOLD']:
                                text_blocks.append({
                                    'text': text,
                                    'confidence': confidence,
                                    'bbox': bbox
                                })
                                all_text.append(text)
                                confidences.append(confidence)

                    result.update({
                        'extracted_text': ' '.join(all_text),
                        'ocr_confidence': np.mean(confidences) if confidences else 0.0,
                        'text_blocks': text_blocks,
                        'text_blocks_count': len(text_blocks)
                    })

            elif self.config['OCR_MODEL'] == 'easyocr' and self.ocr:
                ocr_result = self.ocr.readtext(image_array)

                text_blocks = []
                all_text = []
                confidences = []

                for detection in ocr_result:
                    bbox, text, confidence = detection

                    if confidence >= self.config['OCR_CONFIDENCE_THRESHOLD']:
                        text_blocks.append({
                            'text': text,
                            'confidence': confidence,
                            'bbox': bbox
                        })
                        all_text.append(text)
                        confidences.append(confidence)

                result.update({
                    'extracted_text': ' '.join(all_text),
                    'ocr_confidence': np.mean(confidences) if confidences else 0.0,
                    'text_blocks': text_blocks,
                    'text_blocks_count': len(text_blocks)
                })

            elif self.config['OCR_MODEL'] == 'tesseract' and self.ocr:
                # Simple Tesseract extraction
                text = self.ocr.image_to_string(image, config='--psm 6')
                confidence = 0.8  # Tesseract doesn't provide confidence easily

                if text.strip():
                    result.update({
                        'extracted_text': text.strip(),
                        'ocr_confidence': confidence,
                        'text_blocks': [{'text': text.strip(), 'confidence': confidence}],
                        'text_blocks_count': 1
                    })

            return result

        except Exception as e:
            return {
                'filename': os.path.basename(image_path),
                'full_path': image_path,
                'extracted_text': '',
                'ocr_confidence': 0.0,
                'text_blocks': [],
                'text_blocks_count': 0,
                'error': str(e)
            }

    def process_image_gallery(self, image_extensions=['.jpg', '.jpeg', '.png', '.bmp', '.tiff']):
        """Process all images in the suspect gallery"""
        start_time = time.time()
        gallery_path = Path(self.config['SUSPECTS_GALLERY_PATH'])

        print(f"🔍 Processing images from: {gallery_path}")
        print(f"🔍 OCR Model: {self.config['OCR_MODEL']}")
        print(f"⏰ Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

        if not gallery_path.exists():
            print(f"❌ Gallery path {gallery_path} does not exist!")
            return

        # Get all image files
        image_files = []
        for ext in image_extensions:
            image_files.extend(gallery_path.glob(f"**/*{ext}"))
            image_files.extend(gallery_path.glob(f"**/*{ext.upper()}"))

        total_files = len(image_files)
        print(f"📸 Found {total_files} images to process")

        if total_files == 0:
            print("⚠️ No images found!")
            return

        processed_count = 0
        text_found_count = 0

        print("\n" + "="*60)
        print("🔍 PROCESSING IMAGES")
        print("="*60)

        for i, image_path in enumerate(image_files, 1):
            filename = image_path.name

            # Progress indicator
            progress = (i / total_files) * 100
            print(f"[{i:3d}/{total_files}] ({progress:5.1f}%) Processing: {filename[:40]}...", end='')

            # Check if already processed
            cursor = self.conn.execute("SELECT filename FROM forensic_images WHERE filename = ?", (filename,))
            if cursor.fetchone():
                print(" ⏭️ SKIP (already processed)")
                continue

            # Extract text
            ocr_result = self.extract_text_from_image(image_path)

            if ocr_result.get('error'):
                print(f" ❌ ERROR: {ocr_result['error'][:30]}")
                continue

            # Store in database
            file_size = image_path.stat().st_size if image_path.exists() else 0
            image_dims = f"{ocr_result.get('width', 0)}x{ocr_result.get('height', 0)}"

            self.conn.execute('''
                INSERT OR REPLACE INTO forensic_images
                (filename, full_path, extracted_text, ocr_confidence, text_blocks_count,
                 file_size, image_dimensions, ocr_metadata)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                filename,
                str(image_path),
                ocr_result['extracted_text'],
                ocr_result['ocr_confidence'],
                ocr_result['text_blocks_count'],
                file_size,
                image_dims,
                json.dumps(ocr_result['text_blocks']) if self.config['SAVE_OCR_METADATA'] else None
            ))

            # Add to vector database if text found
            if ocr_result['extracted_text'].strip():
                try:
                    doc_id = f"img_{processed_count}_{filename}"
                    self.collection.add(
                        documents=[ocr_result['extracted_text']],
                        metadatas=[{
                            'filename': filename,
                            'full_path': str(image_path),
                            'ocr_confidence': ocr_result['ocr_confidence'],
                            'text_blocks_count': ocr_result['text_blocks_count']
                        }],
                        ids=[doc_id]
                    )
                    text_found_count += 1
                    print(f" ✅ TEXT ({ocr_result['text_blocks_count']} blocks, conf: {ocr_result['ocr_confidence']:.2f})")
                except Exception as e:
                    print(f" ⚠️ DB ERROR: {str(e)[:20]}")
            else:
                print(" ⚪ NO TEXT")

            processed_count += 1

            # Commit every 10 images
            if i % 10 == 0:
                self.conn.commit()

        # Final commit
        self.conn.commit()

        # Calculate timing
        end_time = time.time()
        duration = end_time - start_time

        print("\n" + "="*60)
        print("📊 PROCESSING SUMMARY")
        print("="*60)
        print(f"📸 Total images: {total_files}")
        print(f"✅ Successfully processed: {processed_count}")
        print(f"📝 Images with text: {text_found_count}")
        print(f"⚪ Images without text: {processed_count - text_found_count}")
        print(f"⏱️ Total time: {duration:.1f}s ({duration/total_files:.2f}s per image)")
        print(f"📊 Text detection rate: {(text_found_count/processed_count*100):.1f}%" if processed_count > 0 else "")
        print("="*60)

    def search_by_text_query(self, query: str, max_results: int = None) -> List[Dict[str, Any]]:
        """Search images by semantic text similarity"""
        if max_results is None:
            max_results = self.config['TOP_K_RESULTS']

        if not self.collection or not self.embedding_model:
            print("❌ Search components not available")
            return []

        start_time = time.time()

        try:
            # Perform semantic search
            results = self.collection.query(
                query_texts=[query],
                n_results=max_results
            )

            # Format results
            matches = []
            if results['documents'] and results['documents'][0]:
                for i, doc in enumerate(results['documents'][0]):
                    metadata = results['metadatas'][0][i]
                    distance = results['distances'][0][i] if 'distances' in results else 0
                    similarity = 1 - distance

                    # Filter by similarity threshold
                    if similarity >= self.config['SIMILARITY_THRESHOLD']:
                        matches.append({
                            'filename': metadata['filename'],
                            'full_path': metadata['full_path'],
                            'extracted_text': doc,
                            'similarity_score': similarity,
                            'ocr_confidence': metadata.get('ocr_confidence', 0),
                            'text_blocks_count': metadata.get('text_blocks_count', 0),
                            'text_preview': doc[:200] + "..." if len(doc) > 200 else doc
                        })

            # Log query
            processing_time = time.time() - start_time
            self.conn.execute('''
                INSERT INTO search_queries (query_text, results_count, processing_time)
                VALUES (?, ?, ?)
            ''', (query, len(matches), processing_time))
            self.conn.commit()

            return matches

        except Exception as e:
            print(f"❌ Search error: {e}")
            return []

    def search_by_keywords(self, keywords: List[str], exact_match: bool = False) -> List[Dict[str, Any]]:
        """Search images by specific keywords using SQL"""
        matches = []

        try:
            for keyword in keywords:
                if exact_match:
                    sql_query = """
                        SELECT filename, full_path, extracted_text, ocr_confidence, text_blocks_count
                        FROM forensic_images
                        WHERE extracted_text LIKE ? AND extracted_text != ''
                    """
                    params = (f"%{keyword}%",)
                else:
                    sql_query = """
                        SELECT filename, full_path, extracted_text, ocr_confidence, text_blocks_count
                        FROM forensic_images
                        WHERE LOWER(extracted_text) LIKE LOWER(?) AND extracted_text != ''
                    """
                    params = (f"%{keyword}%",)

                cursor = self.conn.execute(sql_query, params)
                results = cursor.fetchall()

                for row in results:
                    match = {
                        'filename': row[0],
                        'full_path': row[1],
                        'extracted_text': row[2],
                        'ocr_confidence': row[3],
                        'text_blocks_count': row[4],
                        'matched_keyword': keyword,
                        'search_type': 'keyword',
                        'text_preview': row[2][:200] + "..." if len(row[2]) > 200 else row[2]
                    }

                    # Avoid duplicates
                    if not any(m['filename'] == match['filename'] for m in matches):
                        matches.append(match)

            return matches

        except Exception as e:
            print(f"❌ Keyword search error: {e}")
            return []

    def forensic_query_analysis(self, query: str, include_keywords: bool = True) -> Dict[str, Any]:
        """Comprehensive forensic analysis for a query"""
        print(f"🔍 Forensic Analysis: '{query}'")
        print("-" * 50)

        start_time = time.time()

        # Semantic search
        print("📝 Performing semantic search...")
        semantic_results = self.search_by_text_query(query)

        # Keyword search
        keyword_results = []
        if include_keywords:
            print("🔍 Performing keyword search...")
            query_words = query.lower().split()
            keyword_results = self.search_by_keywords(query_words)

        # Combine and deduplicate results
        all_results = semantic_results.copy()
        for kw_result in keyword_results:
            if not any(r['filename'] == kw_result['filename'] for r in all_results):
                kw_result['search_type'] = 'keyword'
                all_results.append(kw_result)

        # Sort by relevance (similarity score or confidence)
        all_results.sort(
            key=lambda x: x.get('similarity_score', x.get('ocr_confidence', 0)),
            reverse=True
        )

        processing_time = time.time() - start_time

        analysis = {
            'query': query,
            'semantic_matches': len(semantic_results),
            'keyword_matches': len(keyword_results),
            'total_unique_matches': len(all_results),
            'processing_time': processing_time,
            'results': all_results[:self.config['MAX_RESULTS_DISPLAY']],
            'timestamp': datetime.now().isoformat()
        }

        print(f"✅ Analysis complete:")
        print(f"   📊 Semantic matches: {len(semantic_results)}")
        print(f"   🔍 Keyword matches: {len(keyword_results)}")
        print(f"   📝 Total unique: {len(all_results)}")
        print(f"   ⏱️ Time: {processing_time:.2f}s")

        return analysis

    def get_statistics(self) -> Dict[str, Any]:
        """Get comprehensive database statistics"""
        try:
            stats = {}

            # Image statistics
            cursor = self.conn.execute("SELECT COUNT(*) FROM forensic_images")
            stats['total_images'] = cursor.fetchone()[0]

            cursor = self.conn.execute("SELECT COUNT(*) FROM forensic_images WHERE extracted_text != ''")
            stats['images_with_text'] = cursor.fetchone()[0]

            cursor = self.conn.execute("SELECT AVG(ocr_confidence) FROM forensic_images WHERE extracted_text != ''")
            result = cursor.fetchone()[0]
            stats['avg_ocr_confidence'] = round(result, 3) if result else 0

            cursor = self.conn.execute("SELECT SUM(text_blocks_count) FROM forensic_images")
            result = cursor.fetchone()[0]
            stats['total_text_blocks'] = result if result else 0

            # Query statistics
            cursor = self.conn.execute("SELECT COUNT(*) FROM search_queries")
            stats['total_queries'] = cursor.fetchone()[0]

            cursor = self.conn.execute("SELECT AVG(processing_time) FROM search_queries")
            result = cursor.fetchone()[0]
            stats['avg_query_time'] = round(result, 3) if result else 0

            # File size statistics
            cursor = self.conn.execute("SELECT SUM(file_size), AVG(file_size) FROM forensic_images")
            total_size, avg_size = cursor.fetchone()
            stats['total_file_size_mb'] = round(total_size / (1024*1024), 2) if total_size else 0
            stats['avg_file_size_kb'] = round(avg_size / 1024, 2) if avg_size else 0

            return stats

        except Exception as e:
            print(f"❌ Statistics error: {e}")
            return {}

    def export_results(self, results: List[Dict], output_folder: str, copy_images: bool = True) -> str:
        """Export search results to folder with metadata"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = Path(output_folder) / f"forensic_search_{timestamp}"
        output_path.mkdir(parents=True, exist_ok=True)

        copied_files = []
        metadata = {
            'export_timestamp': timestamp,
            'total_results': len(results),
            'export_path': str(output_path),
            'results': []
        }

        for i, result in enumerate(results, 1):
            try:
                result_meta = {
                    'index': i,
                    'filename': result['filename'],
                    'original_path': result['full_path'],
                    'extracted_text': result['extracted_text'],
                    'text_preview': result.get('text_preview', ''),
                    'ocr_confidence': result.get('ocr_confidence', 0),
                    'similarity_score': result.get('similarity_score', 0),
                    'search_type': result.get('search_type', 'semantic')
                }

                if copy_images:
                    # Copy image file
                    source_path = Path(result['full_path'])
                    if source_path.exists():
                        # Create descriptive filename
                        conf_or_sim = result.get('similarity_score', result.get('ocr_confidence', 0))
                        new_filename = f"{i:03d}_{source_path.stem}_score{conf_or_sim:.2f}{source_path.suffix}"
                        dest_path = output_path / new_filename

                        shutil.copy2(source_path, dest_path)
                        copied_files.append(str(dest_path))
                        result_meta['exported_filename'] = new_filename
                        result_meta['exported_path'] = str(dest_path)

                metadata['results'].append(result_meta)

            except Exception as e:
                print(f"❌ Error exporting {result['filename']}: {e}")

        # Save metadata JSON
        metadata_file = output_path / "search_metadata.json"
        with open(metadata_file, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)

        # Save text summary
        summary_file = output_path / "search_summary.txt"
        with open(summary_file, 'w', encoding='utf-8') as f:
            f.write(f"Forensic Search Results\n")
            f.write(f"="*50 + "\n")
            f.write(f"Export Date: {timestamp}\n")
            f.write(f"Total Results: {len(results)}\n")
            f.write(f"Files Copied: {len(copied_files)}\n\n")

            for i, result in enumerate(results, 1):
                f.write(f"{i}. {result['filename']}\n")
                f.write(f"   Text: {result.get('text_preview', '')}\n")
                f.write(f"   Score: {result.get('similarity_score', result.get('ocr_confidence', 0)):.3f}\n\n")

        print(f"📁 Exported {len(results)} results to: {output_path}")
        print(f"📋 Images copied: {len(copied_files)}")
        print(f"📄 Metadata saved: {metadata_file}")

        return str(output_path)

    def close(self):
        """Close database connections and cleanup"""
        if hasattr(self, 'conn') and self.conn:
            self.conn.close()
            print("📊 Database connections closed")

# Cell 4: Batch Processing and Forensic Analysis Functions
def run_comprehensive_forensic_analysis(system, custom_queries=None):
    """Run comprehensive forensic analysis with predefined and custom queries"""

    # Default forensic text queries
    default_queries = [
        "bank account number",
        "credit card",
        "social security number",
        "phone number",
        "email address",
        "password",
        "address location",
        "threat violence",
        "weapon gun knife",
        "drugs illegal substances",
        "money cash payment",
        "meeting appointment",
        "suspicious activity",
        "personal identification",
        "financial transaction"
    ]

    # Combine with custom queries
    queries = default_queries + (custom_queries or [])

    print(f"🚀 Starting comprehensive forensic analysis")
    print(f"📋 Total queries: {len(queries)} ({len(default_queries)} default + {len(custom_queries or [])} custom)")
    print("="*60)

    all_results = {}
    high_priority_findings = []

    start_time = time.time()

    for i, query in enumerate(queries, 1):
        print(f"\n[{i:2d}/{len(queries)}] Analyzing: '{query}'")

        # Perform analysis
        analysis = system.forensic_query_analysis(query, include_keywords=True)
        all_results[query] = analysis

        # Identify high-priority findings
        for result in analysis['results'][:3]:  # Top 3 per query
            score = result.get('similarity_score', result.get('ocr_confidence', 0))
            if score > 0.5:  # High confidence threshold
                high_priority_findings.append({
                    'query': query,
                    'filename': result['filename'],
                    'score': score,
                    'text_preview': result['text_preview'],
                    'search_type': result.get('search_type', 'semantic')
                })

        # Brief status
        matches = analysis['total_unique_matches']
        status = "🔴" if matches > 5 else "🟡" if matches > 0 else "⚪"
        print(f"   {status} {matches} matches found")

    total_time = time.time() - start_time

    # Generate summary report
    print(f"\n" + "="*60)
    print("📊 COMPREHENSIVE ANALYSIS SUMMARY")
    print("="*60)

    total_matches = sum(r['total_unique_matches'] for r in all_results.values())
    queries_with_results = sum(1 for r in all_results.values() if r['total_unique_matches'] > 0)

    print(f"🎯 Queries processed: {len(queries)}")
    print(f"✅ Queries with results: {queries_with_results}")
    print(f"📊 Total matches found: {total_matches}")
    print(f"🔴 High priority findings: {len(high_priority_findings)}")
    print(f"⏱️ Total analysis time: {total_time:.1f}s")
    print(f"📈 Average per query: {total_time/len(queries):.1f}s")

    # Show top findings by query
    print(f"\n📋 TOP FINDINGS BY QUERY:")
    for query, analysis in all_results.items():
        matches = analysis['total_unique_matches']
        if matches > 0:
            status = "🔴" if matches > 5 else "🟡"
            print(f"{status} '{query}': {matches} matches")

    # Show high priority findings
    if high_priority_findings:
        print(f"\n🚨 HIGH PRIORITY EVIDENCE:")
        high_priority_findings.sort(key=lambda x: x['score'], reverse=True)
        for finding in high_priority_findings[:10]:  # Top 10
            print(f"🔴 {finding['filename']} (Query: '{finding['query']}')")
            print(f"   Score: {finding['score']:.3f} | Type: {finding['search_type']}")
            print(f"   Text: {finding['text_preview'][:80]}...")
            print()

    return {
        'all_results': all_results,
        'high_priority_findings': high_priority_findings,
        'summary': {
            'total_queries': len(queries),
            'queries_with_results': queries_with_results,
            'total_matches': total_matches,
            'high_priority_count': len(high_priority_findings),
            'processing_time': total_time
        }
    }

def batch_export_all_results(system, analysis_results, output_base_path):
    """Export all analysis results in organized folders"""

    print("📁 Exporting all forensic analysis results...")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base_export_path = Path(output_base_path) / f"forensic_analysis_{timestamp}"
    base_export_path.mkdir(parents=True, exist_ok=True)

    # Export high priority findings
    high_priority = analysis_results['high_priority_findings']
    if high_priority:
        print(f"🔴 Exporting {len(high_priority)} high priority findings...")
        hp_results = []
        for finding in high_priority:
            # Convert finding to result format
            result = {
                'filename': finding['filename'],
                'full_path': '',  # Will be filled from database
                'extracted_text': '',
                'text_preview': finding['text_preview'],
                'similarity_score': finding['score'],
                'search_type': finding['search_type']
            }

            # Get full details from database
            cursor = system.conn.execute(
                "SELECT full_path, extracted_text FROM forensic_images WHERE filename = ?",
                (finding['filename'],)
            )
            row = cursor.fetchone()
            if row:
                result['full_path'] = row[0]
                result['extracted_text'] = row[1]
                hp_results.append(result)

        if hp_results:
            hp_folder = system.export_results(hp_results, base_export_path / "high_priority", copy_images=True)

    # Export results by query category
    categories = {
        'financial': ['bank account', 'credit card', 'money', 'payment', 'transaction'],
        'personal_info': ['phone number', 'email', 'address', 'social security', 'identification'],
        'security': ['password', 'threat', 'weapon', 'suspicious'],
        'illegal': ['drugs', 'illegal']
    }

    for category, keywords in categories.items():
        category_results = []
        for query, analysis in analysis_results['all_results'].items():
            if any(keyword in query.lower() for keyword in keywords):
                category_results.extend(analysis['results'])

        if category_results:
            # Remove duplicates
            unique_results = []
            seen_files = set()
            for result in category_results:
                if result['filename'] not in seen_files:
                    unique_results.append(result)
                    seen_files.add(result['filename'])

            if unique_results:
                print(f"📂 Exporting {len(unique_results)} {category} findings...")
                system.export_results(unique_results, base_export_path / category, copy_images=True)

    # Create master summary
    summary_file = base_export_path / "analysis_summary.json"
    with open(summary_file, 'w', encoding='utf-8') as f:
        json.dump({
            'export_timestamp': timestamp,
            'summary': analysis_results['summary'],
            'high_priority_count': len(high_priority),
            'categories_exported': list(categories.keys()),
            'export_path': str(base_export_path)
        }, f, indent=2)

    print(f"✅ All results exported to: {base_export_path}")
    return str(base_export_path)

# Cell 5: Interactive Interface Functions
def create_forensic_interface(system):
    """Create interactive interface for forensic analysts"""

    def display_statistics():
        """Display system statistics"""
        stats = system.get_statistics()
        print("\n📊 FORENSIC SYSTEM STATISTICS")
        print("="*40)
        print(f"📸 Total images processed: {stats.get('total_images', 0)}")
        print(f"📝 Images with text: {stats.get('images_with_text', 0)}")
        print(f"📊 Text detection rate: {(stats.get('images_with_text', 0) / max(stats.get('total_images', 1), 1) * 100):.1f}%")
        print(f"🎯 Average OCR confidence: {stats.get('avg_ocr_confidence', 0):.3f}")
        print(f"📋 Total text blocks found: {stats.get('total_text_blocks', 0)}")
        print(f"🔍 Total queries executed: {stats.get('total_queries', 0)}")
        print(f"⏱️ Average query time: {stats.get('avg_query_time', 0):.3f}s")
        print(f"💾 Total data size: {stats.get('total_file_size_mb', 0)} MB")
        print(f"📏 Average file size: {stats.get('avg_file_size_kb', 0)} KB")
        print("="*40)

    def interactive_search():
        """Interactive search interface"""
        print("\n🔍 INTERACTIVE FORENSIC SEARCH")
        print("="*40)
        print("Enter 'help' for commands, 'quit' to exit")

        while True:
            try:
                user_input = input("\n🔍 Query> ").strip()

                if user_input.lower() in ['quit', 'exit', 'q']:
                    break
                elif user_input.lower() == 'help':
                    print("\n📋 Available commands:")
                    print("  search <query>     - Semantic text search")
                    print("  keyword <words>    - Keyword search")
                    print("  stats             - Show system statistics")
                    print("  recent            - Show recent queries")
                    print("  export <query>    - Export last search results")
                    print("  help              - Show this help")
                    print("  quit              - Exit interface")
                    continue
                elif user_input.lower() == 'stats':
                    display_statistics()
                    continue
                elif user_input.lower() == 'recent':
                    cursor = system.conn.execute(
                        "SELECT query_text, results_count, query_date FROM search_queries ORDER BY query_date DESC LIMIT 10"
                    )
                    recent = cursor.fetchall()
                    print("\n🕐 Recent queries:")
                    for query, count, date in recent:
                        print(f"  {date}: '{query}' ({count} results)")
                    continue
                elif not user_input:
                    continue

                # Parse command
                parts = user_input.split(' ', 1)
                command = parts[0].lower()
                query = parts[1] if len(parts) > 1 else ""

                if command == 'search' and query:
                    print(f"\n🔍 Searching: '{query}'")
                    results = system.search_by_text_query(query)

                    if results:
                        print(f"✅ Found {len(results)} matches:")
                        for i, result in enumerate(results[:5], 1):  # Show top 5
                            score = result.get('similarity_score', 0)
                            print(f"{i}. {result['filename']} (score: {score:.3f})")
                            print(f"   {result['text_preview']}")
                            print()
                    else:
                        print("⚪ No matches found")

                elif command == 'keyword' and query:
                    keywords = query.split()
                    print(f"\n🔍 Keyword search: {keywords}")
                    results = system.search_by_keywords(keywords)

                    if results:
                        print(f"✅ Found {len(results)} matches:")
                        for i, result in enumerate(results[:5], 1):
                            keyword = result.get('matched_keyword', '')
                            print(f"{i}. {result['filename']} (keyword: '{keyword}')")
                            print(f"   {result['text_preview']}")
                            print()
                    else:
                        print("⚪ No matches found")

                elif command == 'export' and query:
                    # Search and export
                    results = system.search_by_text_query(query)
                    if results:
                        output_path = system.export_results(results, system.config['RESULTS_OUTPUT_PATH'])
                        print(f"📁 Results exported to: {output_path}")
                    else:
                        print("⚪ No results to export")

                else:
                    print("❌ Unknown command. Type 'help' for available commands.")

            except KeyboardInterrupt:
                print("\n👋 Goodbye!")
                break
            except Exception as e:
                print(f"❌ Error: {e}")

    return interactive_search, display_statistics

# Cell 6: Main Execution and Examples
def main_forensic_analysis():
    """Main function to run the complete forensic analysis system"""

    print("🚀 INITIALIZING FORENSIC OCR + SEARCH SYSTEM")
    print("="*60)

    # Initialize system
    system = ForensicOCRSearchSystem(CONFIG)

    # Check if system is ready
    if not system.ocr:
        print("❌ OCR model not loaded. Please check installation.")
        return None

    if not system.collection:
        print("❌ Search components not loaded. Please check installation.")
        return None

    print("✅ System initialized successfully!")

    # Process images if needed
    stats = system.get_statistics()
    if stats.get('total_images', 0) == 0:
        print("\n📸 No processed images found. Processing gallery...")
        system.process_image_gallery()
    else:
        print(f"\n📊 Found {stats['total_images']} processed images, {stats['images_with_text']} with text")

    return system

# Example usage functions
def run_quick_demo(system):
    """Run a quick demonstration of the system"""
    if not system:
        print("❌ System not available")
        return

    print("\n🎯 QUICK DEMONSTRATION")
    print("="*30)

    # Example queries
    demo_queries = [
        "phone number",
        "bank account",
        "password",
        "address"
    ]

    for query in demo_queries:
        print(f"\n🔍 Demo query: '{query}'")
        results = system.search_by_text_query(query, max_results=3)

        if results:
            print(f"✅ Found {len(results)} matches")
            for result in results[:2]:  # Show top 2
                print(f"  📄 {result['filename']} (score: {result['similarity_score']:.3f})")
        else:
            print("⚪ No matches")

def list_available_ocr_models():
    """Display available OCR models"""
    print("\n🔍 AVAILABLE OCR MODELS")
    print("="*30)
    for key, info in AVAILABLE_OCR_MODELS.items():
        status = "✅" if key == CONFIG['OCR_MODEL'] else "⚪"
        print(f"{status} {info['name']} ({key})")
        print(f"   📊 {info['performance']}")
        print(f"   📝 {info['description']}")
        print(f"   💻 Install: {info['install_cmd']}")
        print()

# Display available models
list_available_ocr_models()

# Initialize system
print("\n🚀 STARTING FORENSIC SYSTEM...")
forensic_system = main_forensic_analysis()

if forensic_system:
    # Create interactive interface
    interactive_search, display_stats = create_forensic_interface(forensic_system)

    print(f"\n✅ FORENSIC OCR + SEARCH SYSTEM READY!")
    print("="*50)
    print("📋 AVAILABLE FUNCTIONS:")
    print("  • forensic_system.process_image_gallery() - Process new images")
    print("  • forensic_system.search_by_text_query(query) - Semantic search")
    print("  • forensic_system.search_by_keywords([keywords]) - Keyword search")
    print("  • forensic_system.forensic_query_analysis(query) - Full analysis")
    print("  • run_comprehensive_forensic_analysis(forensic_system) - Batch analysis")
    print("  • interactive_search() - Interactive search interface")
    print("  • display_stats() - Show system statistics")
    print("  • run_quick_demo(forensic_system) - Quick demonstration")
    print("="*50)

    # Show initial statistics
    display_stats()

    # Run quick demo
    run_quick_demo(forensic_system)

    print(f"\n💡 TIP: Use interactive_search() for hands-on searching!")
    print(f"💡 TIP: Run run_comprehensive_forensic_analysis(forensic_system) for full analysis!")

else:
    print("❌ Failed to initialize forensic system")

# Cell 7: Usage Instructions and Documentation
print("""
🎯 FORENSIC OCR + SEARCH SYSTEM - COMPLETE GUIDE
===============================================

🆕 SYSTEM FEATURES:
• 🔍 Advanced OCR with PaddleOCR/EasyOCR/Tesseract
• 📝 Semantic text search using embeddings
• 🔍 Keyword-based search with SQL
• 📊 Comprehensive forensic analysis
• 📁 Automated result export with metadata
• 💾 SQLite database for persistence
• 🎯 Interactive search interface

📋 SETUP CHECKLIST:
1. ✅ Place suspect images in './suspect_images' folder
2. ✅ Run all cells in order (1-7)
3. ✅ System automatically processes images with OCR
4. ✅ Vector database created for semantic search
5. ✅ Ready for forensic analysis

🔧 OCR MODEL OPTIONS:
• PaddleOCR: Best accuracy, 80+ languages, GPU support
• EasyOCR: Good balance, easy setup, 80+ languages
• Tesseract: Classic, reliable, widely supported

🔍 SEARCH CAPABILITIES:
• Semantic Search: "bank account information" finds related concepts
• Keyword Search: Exact/partial word matching in extracted text
• Combined Analysis: Both semantic + keyword for comprehensive results
• Batch Processing: Multiple queries for systematic investigation

🎛️ KEY FUNCTIONS:
forensic_system.search_by_text_query("bank account")
forensic_system.search_by_keywords(["phone", "number"])
forensic_system.forensic_query_analysis("suspicious activity")
run_comprehensive_forensic_analysis(forensic_system)

📊 ANALYSIS FEATURES:
• High-priority evidence detection (score > 0.5)
• Automatic categorization (financial, personal, security, illegal)
• Export results with images and metadata
• Query performance tracking
• Statistical reporting

🔍 FORENSIC QUERY EXAMPLES:
• "bank account number" - Financial information
• "phone number" - Contact information
• "threat violence" - Security concerns
• "password" - Security credentials
• "suspicious activity" - General investigation
• "weapon gun knife" - Dangerous items
• "drugs illegal substances" - Illegal content

📁 OUTPUT STRUCTURE:
forensic_results/
├── forensic_analysis_20240614_143022/
│   ├── high_priority/          # Critical findings
│   ├── financial/              # Financial evidence
│   ├── personal_info/          # Personal information
│   ├── security/               # Security-related
│   └── illegal/                # Illegal content
└── search_metadata.json       # Analysis metadata

⚙️ CONFIGURATION OPTIONS:
• OCR_CONFIDENCE_THRESHOLD: Minimum OCR confidence (0.6)
• SIMILARITY_THRESHOLD: Semantic search threshold (0.3)
• TEXT_MIN_LENGTH: Minimum text length to process (3)
• MAX_RESULTS_DISPLAY: Results shown per query (10)
• BATCH_SIZE: Processing batch size (4)

🚨 FORENSIC BEST PRACTICES:
• Start with comprehensive batch analysis
• Review high-priority findings first
• Cross-reference semantic + keyword results
• Export evidence with metadata for reports
• Document analysis parameters for court
• Use interactive search for targeted investigation

📈 PERFORMANCE OPTIMIZATION:
• PaddleOCR + GPU: Best performance for large datasets
• Adjust confidence thresholds based on image quality
• Use batch processing for efficiency
• Monitor similarity scores for relevance tuning
• Regular database maintenance for speed

🔍 TROUBLESHOOTING:
• "No text found" → Check OCR confidence threshold
• "No search results" → Lower similarity threshold
• "Slow processing" → Reduce batch size or use CPU
• "Memory issues" → Process smaller batches
• "Poor OCR accuracy" → Try different OCR model

📊 EXPECTED PERFORMANCE:
• OCR Processing: 2-5 seconds per image (CPU)
• Semantic Search: 0.1-0.5 seconds per query
• Text Detection Rate: 60-90% depending on image quality
• Search Accuracy: High for clear, well-extracted text

🎯 INVESTIGATION WORKFLOW:
1. Process all suspect images with OCR
2. Run comprehensive forensic analysis
3. Review high-priority findings
4. Conduct targeted searches based on case needs
5. Export evidence with metadata
6. Generate reports for legal proceedings

💡 ADVANCED FEATURES:
• Custom query development for specific cases
• Integration with existing forensic workflows
• Batch export for evidence management
• Statistical analysis for case patterns
• Interactive search for real-time investigation
""")

✅ Configuration loaded successfully
📁 Suspects gallery: ../../datasets/images/text_ocr
📁 Database path: forensic_analysis_db
📁 Results output: ../../datasets/images/forensic_results
🔍 OCR model: paddleocr
📝 Forensic OCR + Search System Ready
✅ Core dependencies available
✅ PaddleOCR already available
✅ Search dependencies available
✅ All dependencies imported successfully
🖥️ Using CPU mode
📍 Auto-detected device: cpu

🔍 AVAILABLE OCR MODELS
✅ PaddleOCR (paddleocr)
   📊 Best overall performance
   📝 High accuracy, supports 80+ languages
   💻 Install: pip install paddlepaddle paddleocr

⚪ EasyOCR (easyocr)
   📊 Good performance, easy setup
   📝 Simple to use, good accuracy
   💻 Install: pip install easyocr

⚪ Tesseract (tesseract)
   📊 Baseline performance
   📝 Classic OCR, widely supported
   💻 Install: pip install pytesseract


🚀 STARTING FORENSIC SYSTEM...
🚀 INITIALIZING FORENSIC OCR + SEARCH SYSTEM
📁 Directory ready: ../../datasets/images/text_ocr
📁 Directory ready: forensic_analysis

[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_doc_ori), the model files will be automatically downloaded and saved in C:\Users\scott\.paddlex\official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('UVDoc', None)[0m
[33mThe model(UVDoc) is not supported to run in MKLDNN mode! Using `paddle` instead![0m
[32mUsing official model (UVDoc), the model files will be automatically downloaded and saved in C:\Users\scott\.paddlex\official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_textline_ori), the model files will be automatically downloaded and saved in C:\Users\scott\.paddlex\official_models.[0m


Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mUsing official model (PP-OCRv5_server_det), the model files will be automatically downloaded and saved in C:\Users\scott\.paddlex\official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('PP-OCRv5_server_rec', None)[0m
[32mUsing official model (PP-OCRv5_server_rec), the model files will be automatically downloaded and saved in C:\Users\scott\.paddlex\official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

✅ PaddleOCR initialized
📥 Loading embedding model: sentence-transformers/all-MiniLM-L6-v2
📥 Initializing vector database...
✅ Search components initialized
✅ Database initialized
✅ System initialized successfully!

📸 No processed images found. Processing gallery...
🔍 Processing images from: ..\..\datasets\images\text_ocr
🔍 OCR Model: paddleocr
⏰ Start time: 2025-07-09 20:05:41
📸 Found 200 images to process

🔍 PROCESSING IMAGES
[  1/200] (  0.5%) Processing: Text_ocr_000.jpg... ❌ ERROR: predict() got an unexpected ke
[  2/200] (  1.0%) Processing: Text_ocr_001.jpg... ❌ ERROR: predict() got an unexpected ke
[  3/200] (  1.5%) Processing: Text_ocr_002.jpg... ❌ ERROR: predict() got an unexpected ke
[  4/200] (  2.0%) Processing: Text_ocr_003.jpg... ❌ ERROR: predict() got an unexpected ke
[  5/200] (  2.5%) Processing: Text_ocr_004.jpg... ❌ ERROR: predict() got an unexpected ke
[  6/200] (  3.0%) Processing: Text_ocr_005.jpg... ❌ ERROR: predict() got an unexpected ke
[  7/200] (  3.5%) Proc

  ocr_result = self.ocr.ocr(image_array, cls=True)


 ❌ ERROR: predict() got an unexpected ke
[ 15/200] (  7.5%) Processing: Text_ocr_014.jpg... ❌ ERROR: predict() got an unexpected ke
[ 16/200] (  8.0%) Processing: Text_ocr_015.jpg... ❌ ERROR: predict() got an unexpected ke
[ 17/200] (  8.5%) Processing: Text_ocr_016.jpg... ❌ ERROR: predict() got an unexpected ke
[ 18/200] (  9.0%) Processing: Text_ocr_017.jpg... ❌ ERROR: predict() got an unexpected ke
[ 19/200] (  9.5%) Processing: Text_ocr_018.jpg... ❌ ERROR: predict() got an unexpected ke
[ 20/200] ( 10.0%) Processing: Text_ocr_019.jpg... ❌ ERROR: predict() got an unexpected ke
[ 21/200] ( 10.5%) Processing: Text_ocr_020.jpg... ❌ ERROR: predict() got an unexpected ke
[ 22/200] ( 11.0%) Processing: Text_ocr_021.jpg... ❌ ERROR: predict() got an unexpected ke
[ 23/200] ( 11.5%) Processing: Text_ocr_022.jpg... ❌ ERROR: predict() got an unexpected ke
[ 24/200] ( 12.0%) Processing: Text_ocr_023.jpg... ❌ ERROR: predict() got an unexpected ke
[ 25/200] ( 12.5%) Processing: Text_ocr_024.jpg..

In [9]:
# Quick Fix: Add OCR Selector to Existing Interface
import ipywidgets as widgets
from IPython.display import display

print("🔧 QUICK FIX: Adding OCR selector to existing interface")
print("="*50)

# Create the missing OCR selector
ocr_selector_fix = widgets.Dropdown(
    options=[
        ('PaddleOCR - Best overall, 80+ languages', 'paddleocr'),
        ('EasyOCR - Good for European languages', 'easyocr'),
        ('Tesseract - Classic OCR for printed text', 'tesseract'),
        ('TrOCR - AI-based OCR for handwriting', 'trocr')
    ],
    value='paddleocr',
    description='OCR Engine:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='500px')
)

# Create enhanced interface section for OCR
ocr_section = widgets.VBox([
    widgets.HTML("<div style='background-color: #e8f4fd; padding: 15px; border-radius: 5px; margin: 10px 0;'>"),
    widgets.HTML("<h4 style='margin: 0; color: #0066cc;'>🔍 OCR Engine Configuration</h4>"),
    widgets.HTML("<p style='margin: 5px 0; font-size: 14px;'>Select the OCR engine for text extraction:</p>"),
    ocr_selector_fix,
    widgets.HTML("<p style='margin: 5px 0; font-size: 12px; color: #666;'>💡 Different engines work better for different types of evidence</p>"),
    widgets.HTML("</div>")
])

print("📱 Displaying OCR configuration section:")
display(ocr_section)

# Add functionality to show OCR info
def show_ocr_info(change):
    info_map = {
        'paddleocr': {
            'name': 'PaddleOCR',
            'best_for': 'Asian languages, handwriting, complex layouts',
            'use_cases': 'Phone screenshots, multilingual documents, handwritten notes'
        },
        'easyocr': {
            'name': 'EasyOCR',
            'best_for': 'European languages, clean text',
            'use_cases': 'Printed documents, simple layouts, European text'
        },
        'tesseract': {
            'name': 'Tesseract',
            'best_for': 'Clean printed text, stability',
            'use_cases': 'Bank statements, invoices, typed documents'
        },
        'trocr': {
            'name': 'TrOCR (Transformers)',
            'best_for': 'Handwritten text, degraded images',
            'use_cases': 'Handwritten notes, old documents, poor quality scans'
        }
    }

    if change['new'] in info_map:
        info = info_map[change['new']]
        print(f"\n🔄 Selected: {info['name']}")
        print(f"🎯 Best for: {info['best_for']}")
        print(f"📋 Use cases: {info['use_cases']}")

ocr_selector_fix.observe(show_ocr_info, names='value')

# Instructions for integrating with existing system
print("\n🔗 INTEGRATION INSTRUCTIONS:")
print("="*30)
print("To use this OCR selector with your existing forensic system:")
print()
print("1. The OCR selector is now available as 'ocr_selector_fix'")
print("2. Get current selection: ocr_selector_fix.value")
print("3. Use in your system config:")
print("   config['OCR_MODEL'] = ocr_selector_fix.value")
print()
print("4. Update your forensic system initialization:")
print("   system = CompactForensicSystem(config)")

# Make the selector available globally
globals()['main_ocr_selector'] = ocr_selector_fix

print(f"\n✅ OCR selector ready! Current selection: {ocr_selector_fix.value}")
print("🔧 This should work alongside your existing interface")

# Test the selector
print("\n🧪 Testing OCR selector functionality...")
print(f"📋 Available options: {[opt[1] for opt in ocr_selector_fix.options]}")
print(f"📋 Current value: {ocr_selector_fix.value}")
print(f"📋 Widget type: {type(ocr_selector_fix)}")

print("\n💡 Try selecting different OCR engines above to see their descriptions!")

🔧 QUICK FIX: Adding OCR selector to existing interface
📱 Displaying OCR configuration section:


VBox(children=(HTML(value="<div style='background-color: #e8f4fd; padding: 15px; border-radius: 5px; margin: 1…


🔗 INTEGRATION INSTRUCTIONS:
To use this OCR selector with your existing forensic system:

1. The OCR selector is now available as 'ocr_selector_fix'
2. Get current selection: ocr_selector_fix.value
3. Use in your system config:
   config['OCR_MODEL'] = ocr_selector_fix.value

4. Update your forensic system initialization:
   system = CompactForensicSystem(config)

✅ OCR selector ready! Current selection: paddleocr
🔧 This should work alongside your existing interface

🧪 Testing OCR selector functionality...
📋 Available options: ['paddleocr', 'easyocr', 'tesseract', 'trocr']
📋 Current value: paddleocr
📋 Widget type: <class 'ipywidgets.widgets.widget_selection.Dropdown'>

💡 Try selecting different OCR engines above to see their descriptions!


In [12]:
# Compact Local Forensic OCR System with GUI
import os, json, sqlite3, shutil, warnings, time
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Optional
import numpy as np
from PIL import Image
warnings.filterwarnings("ignore")

# Compact config with OCR options
CONFIG = {
    'OCR_MODEL': 'paddleocr', 'OCR_CONFIDENCE_THRESHOLD': 0.6,
    'LLM_PROVIDER': 'ollama', 'LLM_MODEL': 'llama3.2:3b', 'LLM_BASE_URL': 'http://localhost:11434',
    'LLM_TEMPERATURE': 0.1, 'LLM_MAX_TOKENS': 1000,
    'EMBEDDING_PROVIDER': 'huggingface', 'EMBEDDING_MODEL': 'sentence-transformers/all-MiniLM-L6-v2',
    'VECTOR_STORE_PATH': './forensic_vectorstore', 'CHUNK_SIZE': 500, 'CHUNK_OVERLAP': 50,
    'SUSPECTS_GALLERY_PATH': './suspects_gallery', 'DATABASE_PATH': './forensic_analysis_db',
    'RESULTS_OUTPUT_PATH': './forensic_results', 'BATCH_SIZE': 4, 'MAX_RESULTS_DISPLAY': 10,
}

# OCR model options for different evidence types
OCR_MODELS = {
    'paddleocr': {'name': 'PaddleOCR', 'desc': 'Best overall, 80+ languages, handwriting capable', 'strengths': 'Asian text, complex layouts'},
    'easyocr': {'name': 'EasyOCR', 'desc': 'Good for European languages, simple setup', 'strengths': 'European text, clean images'},
    'tesseract': {'name': 'Tesseract', 'desc': 'Classic OCR, best for clean printed text', 'strengths': 'Printed documents, stable'},
    'trocr': {'name': 'TrOCR (Transformers)', 'desc': 'AI-based, excellent for handwriting', 'strengths': 'Handwritten notes, degraded images'}
}

print(f"🏠 LOCAL FORENSIC SYSTEM | LLM: {CONFIG['LLM_PROVIDER']}-{CONFIG['LLM_MODEL']} | 🔒 100% Private")

# Quick dependency check
def check_deps():
    deps = ['paddleocr', 'langchain', 'chromadb', 'sentence_transformers']
    for dep in deps:
        try: __import__(dep.replace('-', '_')); print(f"✅ {dep}")
        except ImportError: print(f"❌ {dep} - Install: pip install {dep}")

def check_ollama():
    try:
        import requests
        r = requests.get(f"{CONFIG['LLM_BASE_URL']}/api/tags", timeout=5)
        if r.status_code == 200:
            models = [m['name'] for m in r.json().get('models', [])]
            print(f"✅ Ollama: {models}")
            return CONFIG['LLM_MODEL'] in models
        return False
    except: print("❌ Ollama not running"); return False

check_deps()
ollama_ready = check_ollama() if CONFIG['LLM_PROVIDER'] == 'ollama' else True

# Compact forensic system
class CompactForensicSystem:
    def __init__(self, config=None):
        self.config = config or CONFIG.copy()
        self._setup_dirs()
        self._init_db()
        self._init_ocr()
        self._init_llm()

    def _setup_dirs(self):
        for k in ['SUSPECTS_GALLERY_PATH', 'DATABASE_PATH', 'RESULTS_OUTPUT_PATH', 'VECTOR_STORE_PATH']:
            os.makedirs(self.config[k], exist_ok=True)

    def _init_db(self):
        db_path = os.path.join(self.config['DATABASE_PATH'], 'forensic.db')
        self.conn = sqlite3.connect(db_path)
        self.conn.execute('''CREATE TABLE IF NOT EXISTS forensic_images (
            id INTEGER PRIMARY KEY, filename TEXT UNIQUE, full_path TEXT, extracted_text TEXT,
            ocr_confidence REAL, processed_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            local_analysis TEXT, risk_score REAL, contains_incriminating BOOLEAN DEFAULT 0,
            vector_stored BOOLEAN DEFAULT 0)''')
        self.conn.execute('''CREATE TABLE IF NOT EXISTS local_investigations (
            id INTEGER PRIMARY KEY, query TEXT, investigation_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            results_json TEXT, flagged_images TEXT, model_used TEXT, processing_time REAL)''')
        self.conn.commit()

    def _init_ocr(self):
        try:
            ocr_model = self.config['OCR_MODEL']
            if ocr_model == 'paddleocr':
                from paddleocr import PaddleOCR
                self.ocr = PaddleOCR(use_textline_orientation=True, lang='en', show_log=False)
            elif ocr_model == 'easyocr':
                import easyocr
                self.ocr = easyocr.Reader(['en'], gpu=False)
            elif ocr_model == 'tesseract':
                import pytesseract
                self.ocr = pytesseract
            elif ocr_model == 'trocr':
                from transformers import TrOCRProcessor, VisionEncoderDecoderModel
                self.ocr_processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed')
                self.ocr_model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-printed')
                self.ocr = 'trocr'
            print(f"✅ {OCR_MODELS[ocr_model]['name']} ready")
        except Exception as e: print(f"❌ OCR error: {e}"); self.ocr = None

    def _init_llm(self):
        try:
            if self.config['LLM_PROVIDER'] == 'ollama':
                from langchain_community.llms import Ollama
                self.llm = Ollama(model=self.config['LLM_MODEL'], base_url=self.config['LLM_BASE_URL'],
                                temperature=self.config['LLM_TEMPERATURE'])
            elif self.config['LLM_PROVIDER'] == 'transformers':
                from langchain_community.llms import HuggingFacePipeline
                from transformers import pipeline
                pipe = pipeline("text-generation", model=self.config['LLM_MODEL'],
                              max_new_tokens=self.config['LLM_MAX_TOKENS'])
                self.llm = HuggingFacePipeline(pipeline=pipe)

            from langchain_community.embeddings import HuggingFaceEmbeddings
            from langchain_community.vectorstores import Chroma
            from langchain.text_splitter import RecursiveCharacterTextSplitter

            self.embeddings = HuggingFaceEmbeddings(model_name=self.config['EMBEDDING_MODEL'],
                                                  model_kwargs={'device': 'cpu'})
            self.vectorstore = Chroma(persist_directory=self.config['VECTOR_STORE_PATH'],
                                    embedding_function=self.embeddings)
            self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.config['CHUNK_SIZE'],
                                                              chunk_overlap=self.config['CHUNK_OVERLAP'])
            print("✅ LangChain ready")
        except Exception as e: print(f"❌ LLM error: {e}")

    def extract_text(self, img_path: str) -> Dict[str, Any]:
        try:
            img = Image.open(img_path).convert("RGB")
            result = {'filename': os.path.basename(img_path), 'full_path': img_path,
                     'extracted_text': '', 'ocr_confidence': 0.0, 'error': None}

            if self.ocr:
                ocr_model = self.config['OCR_MODEL']

                if ocr_model == 'paddleocr':
                    ocr_result = self.ocr.ocr(np.array(img), cls=True)
                    if ocr_result and ocr_result[0]:
                        texts, confs = [], []
                        for line in ocr_result:
                            for detection in line:
                                bbox, (text, conf) = detection
                                if conf >= self.config['OCR_CONFIDENCE_THRESHOLD']:
                                    texts.append(text); confs.append(conf)
                        result.update({'extracted_text': ' '.join(texts),
                                     'ocr_confidence': np.mean(confs) if confs else 0.0})

                elif ocr_model == 'easyocr':
                    ocr_result = self.ocr.readtext(np.array(img))
                    texts, confs = [], []
                    for detection in ocr_result:
                        bbox, text, conf = detection
                        if conf >= self.config['OCR_CONFIDENCE_THRESHOLD']:
                            texts.append(text); confs.append(conf)
                    result.update({'extracted_text': ' '.join(texts),
                                 'ocr_confidence': np.mean(confs) if confs else 0.0})

                elif ocr_model == 'tesseract':
                    import pytesseract
                    text = pytesseract.image_to_string(img, config='--psm 6')
                    if text.strip():
                        result.update({'extracted_text': text.strip(), 'ocr_confidence': 0.8})

                elif ocr_model == 'trocr':
                    pixel_values = self.ocr_processor(images=img, return_tensors="pt").pixel_values
                    generated_ids = self.ocr_model.generate(pixel_values)
                    text = self.ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
                    if text.strip():
                        result.update({'extracted_text': text.strip(), 'ocr_confidence': 0.9})

            return result
        except Exception as e:
            return {'filename': os.path.basename(img_path), 'full_path': img_path,
                   'extracted_text': '', 'ocr_confidence': 0.0, 'error': str(e)}

    def analyze_text(self, text: str, filename: str) -> Dict[str, Any]:
        prompt = f'''Analyze text from image '{filename}' for criminal evidence:
Text: {text}
Look for: weapons, drugs, threats, violence, financial crimes, personal info theft.
Respond with JSON only:
{{"contains_incriminating": true/false, "risk_score": 0.0-1.0,
"categories": ["weapons","drugs","threats","financial_crimes","violence","personal_info"],
"analysis": "explanation", "keywords": ["words"], "evidence_summary": "summary"}}'''

        try:
            response = str(self.llm.invoke(prompt)).strip()
            response = response.replace('```json', '').replace('```', '').strip()
            start, end = response.find('{'), response.rfind('}') + 1
            if start >= 0 and end > start:
                parsed = json.loads(response[start:end])
                return {k: parsed.get(k, v) for k, v in [
                    ('contains_incriminating', False), ('risk_score', 0.0), ('categories', []),
                    ('analysis', ''), ('keywords', []), ('evidence_summary', '')]}
            else: return self._fallback_analysis(response, filename)
        except: return self._fallback_analysis(response if 'response' in locals() else '', filename)

    def _fallback_analysis(self, response: str, filename: str) -> Dict[str, Any]:
        keywords = {'weapons': ['weapon','gun','knife','bomb'], 'drugs': ['drug','cocaine','meth'],
                   'threats': ['threat','kill','attack'], 'financial_crimes': ['fraud','steal']}
        found_cats, found_words, risk = [], [], 0.0
        resp_lower = response.lower()
        for cat, words in keywords.items():
            for word in words:
                if word in resp_lower: found_cats.append(cat); found_words.append(word); risk += 0.15
        return {'contains_incriminating': risk > 0.3, 'risk_score': min(risk, 1.0),
               'categories': list(set(found_cats)), 'analysis': f'Fallback analysis: {response[:100]}',
               'keywords': found_words, 'evidence_summary': f'Found {len(found_words)} terms'}

    def add_to_vectorstore(self, text: str, metadata: Dict[str, Any]):
        try:
            from langchain_core.documents import Document
            chunks = self.text_splitter.split_text(text)
            docs = [Document(page_content=chunk, metadata={**metadata, 'chunk_id': i})
                   for i, chunk in enumerate(chunks)]
            self.vectorstore.add_documents(docs); self.vectorstore.persist(); return True
        except: return False

    def process_gallery(self):
        start_time = time.time()
        gallery_path = Path(self.config['SUSPECTS_GALLERY_PATH'])
        print(f"🏠 PROCESSING: {gallery_path} | Model: {self.config['LLM_MODEL']}")

        if not gallery_path.exists(): print(f"❌ Path not found: {gallery_path}"); return

        exts = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
        files = [f for ext in exts for f in gallery_path.glob(f"**/*{ext}") + gallery_path.glob(f"**/*{ext.upper()}")]
        total, processed, vector_stored, incriminating = len(files), 0, 0, 0

        print(f"📸 Found {total} images")
        if total == 0: print("⚠️ No images found!"); return

        for i, img_path in enumerate(files, 1):
            filename = img_path.name
            print(f"[{i:3d}/{total}] ({i/total*100:5.1f}%) {filename[:40]}...", end='')

            # Skip if processed
            if self.conn.execute("SELECT id FROM forensic_images WHERE filename = ?", (filename,)).fetchone():
                print(" ⏭️ SKIP"); continue

            # Extract and analyze
            ocr_result = self.extract_text(img_path)
            if ocr_result.get('error'): print(" ❌ ERROR"); continue

            text = ocr_result['extracted_text']
            analysis = None
            if text.strip():
                analysis = self.analyze_text(text, filename)
                metadata = {'filename': filename, 'full_path': str(img_path),
                          'ocr_confidence': ocr_result['ocr_confidence']}
                if self.add_to_vectorstore(text, metadata): vector_stored += 1

            # Store in DB
            risk = analysis['risk_score'] if analysis else 0.0
            incrim = analysis['contains_incriminating'] if analysis else False
            self.conn.execute('''INSERT OR REPLACE INTO forensic_images
                (filename, full_path, extracted_text, ocr_confidence, local_analysis,
                 risk_score, contains_incriminating, vector_stored) VALUES (?,?,?,?,?,?,?,?)''',
                (filename, str(img_path), text, ocr_result['ocr_confidence'],
                 json.dumps(analysis) if analysis else None, risk, incrim, vector_stored > 0))

            if text.strip():
                if incrim: incriminating += 1; print(f" 🚨 INCRIMINATING ({risk:.2f})")
                else: print(" ✅ TEXT")
            else: print(" ⚪ NO TEXT")

            processed += 1
            if i % 10 == 0: self.conn.commit()

        self.conn.commit()
        print(f"\n🏠 COMPLETE: {processed} processed, {vector_stored} vectorized, {incriminating} flagged, {time.time()-start_time:.1f}s")

    def investigate(self, query: str) -> Dict[str, Any]:
        print(f"🏠 INVESTIGATING: '{query}'")
        try:
            docs = self.vectorstore.similarity_search(query, k=5)
            if not docs: return {'findings': [], 'summary': 'No evidence found', 'total_flagged': 0}

            evidence, files = "", set()
            for i, doc in enumerate(docs, 1):
                filename = doc.metadata.get('filename', f'unknown_{i}')
                files.add(filename)
                evidence += f"Evidence {i} - {filename}: {doc.page_content}\n---\n"

            prompt = f'''Forensic investigation: "{query}"
Evidence: {evidence}
Respond with JSON:
{{"findings": [{{"image_number": 1, "filename": "file.jpg", "matches_query": true,
"risk_level": "high", "evidence_found": "desc", "relevant_text": "text", "confidence": 0.8}}],
"summary": "summary", "total_flagged": 2, "highest_risk_image": "file.jpg"}}'''

            response = str(self.llm.invoke(prompt)).strip()
            response = response.replace('```json', '').replace('```', '').strip()
            start, end = response.find('{'), response.rfind('}') + 1

            if start >= 0 and end > start:
                results = json.loads(response[start:end])
            else:
                results = self._fallback_investigation(query, files)

            # Store investigation
            flagged = [f.get('filename', '') for f in results.get('findings', []) if f.get('matches_query')]
            self.conn.execute('''INSERT INTO local_investigations
                (query, results_json, flagged_images, model_used) VALUES (?,?,?,?)''',
                (query, json.dumps(results), json.dumps(flagged), f"{self.config['LLM_PROVIDER']}-{self.config['LLM_MODEL']}"))
            self.conn.commit()

            print(f"✅ Found {results.get('total_flagged', 0)} matches")
            return results
        except Exception as e:
            print(f"❌ Error: {e}")
            return {'findings': [], 'summary': f'Error: {e}', 'total_flagged': 0}

    def _fallback_investigation(self, query: str, files: set) -> Dict[str, Any]:
        findings = [{'image_number': i, 'filename': f, 'matches_query': True, 'risk_level': 'medium',
                    'evidence_found': f'Related to: {query}', 'relevant_text': f'Matches: {query}', 'confidence': 0.7}
                   for i, f in enumerate(list(files)[:5], 1)]
        return {'findings': findings, 'summary': f'Fallback analysis for: {query}',
               'total_flagged': len(findings), 'highest_risk_image': list(files)[0] if files else None}

    def get_flagged(self, min_risk=0.3):
        return self.conn.execute('''SELECT filename, full_path, extracted_text, risk_score,
            local_analysis, ocr_confidence FROM forensic_images
            WHERE contains_incriminating = 1 AND risk_score >= ? ORDER BY risk_score DESC''', (min_risk,)).fetchall()

    def export_results(self, results: Dict[str, Any], query: str) -> str:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        query_safe = query.replace(' ', '_')[:30]
        output_path = Path(self.config['RESULTS_OUTPUT_PATH']) / f"investigation_{query_safe}_{timestamp}"
        output_path.mkdir(parents=True, exist_ok=True)

        flagged = [f.get('filename', '') for f in results.get('findings', []) if f.get('matches_query')]
        copied = []
        for i, filename in enumerate(flagged, 1):
            result = self.conn.execute("SELECT full_path FROM forensic_images WHERE filename = ?", (filename,)).fetchone()
            if result and Path(result[0]).exists():
                dest = output_path / f"{i:03d}_{filename}"
                shutil.copy2(result[0], dest); copied.append(str(dest))

        # Save reports
        report = {'query': query, 'timestamp': timestamp, 'model': f"{self.config['LLM_PROVIDER']}-{self.config['LLM_MODEL']}",
                 'results': results, 'flagged_images': flagged, 'copied_files': copied}

        with open(output_path / "report.json", 'w') as f: json.dump(report, f, indent=2)
        with open(output_path / "summary.txt", 'w') as f:
            f.write(f"LOCAL FORENSIC REPORT\nQuery: {query}\nDate: {timestamp}\n")
            f.write(f"Model: {self.config['LLM_PROVIDER']}-{self.config['LLM_MODEL']}\n")
            f.write(f"Flagged: {len(flagged)}\nSummary: {results.get('summary', 'N/A')}\n")

        print(f"📋 Exported to: {output_path}")
        return str(output_path)

    def close(self):
        if hasattr(self, 'conn') and self.conn: self.conn.close()

# Compact GUI
import ipywidgets as widgets
from IPython.display import display, clear_output
import matplotlib.pyplot as plt

def create_compact_gui():
    # Print debug info
    print("🔧 Creating GUI with OCR selector...")
    print(f"Available OCR models: {list(OCR_MODELS.keys())}")

    # OCR Selector Widget - Make sure it's created first
    ocr_selector = widgets.Dropdown(
        options=[(f"{info['name']} - {info['desc']}", key) for key, info in OCR_MODELS.items()],
        value=CONFIG['OCR_MODEL'],
        description='OCR Engine:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='500px')
    )

    # Debug: Print OCR selector options
    print(f"OCR selector options: {ocr_selector.options}")
    print(f"OCR selector value: {ocr_selector.value}")

    # LLM Widgets
    llm_provider = widgets.Dropdown(
        options=[('Ollama', 'ollama'), ('Transformers', 'transformers')],
        value=CONFIG['LLM_PROVIDER'],
        description='LLM Provider:',
        style={'description_width': 'initial'}
    )

    llm_model = widgets.Text(
        value=CONFIG['LLM_MODEL'],
        description='LLM Model:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='300px')
    )

    # Other widgets
    gallery_path = widgets.Text(
        value=CONFIG['SUSPECTS_GALLERY_PATH'],
        description='Gallery Path:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='400px')
    )

    query_input = widgets.Textarea(
        value='Find weapons or violence mentions',
        description='Investigation Query:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='500px', height='60px')
    )

    confidence_slider = widgets.FloatSlider(
        value=CONFIG['OCR_CONFIDENCE_THRESHOLD'],
        min=0.1, max=0.9, step=0.05,
        description='OCR Confidence:',
        style={'description_width': 'initial'},
        readout_format='.2f'
    )

    risk_slider = widgets.FloatSlider(
        value=0.3,
        min=0.1, max=1.0, step=0.1,
        description='Risk Threshold:',
        style={'description_width': 'initial'},
        readout_format='.1f'
    )

    # Quick queries
    quick_queries = ["Find weapons", "Look for drugs", "Financial crimes", "Personal info theft", "Threats", "Illegal coordination"]
    quick_btns = [widgets.Button(description=q[:20], tooltip=q, layout=widgets.Layout(width='180px', margin='2px')) for q in quick_queries]

    # Action buttons
    test_btn = widgets.Button(description='🔍 Test Setup', button_style='info', layout=widgets.Layout(width='120px'))
    process_btn = widgets.Button(description='📁 Process', button_style='primary', layout=widgets.Layout(width='120px'))
    investigate_btn = widgets.Button(description='🤖 Investigate', button_style='success', layout=widgets.Layout(width='120px'))
    flagged_btn = widgets.Button(description='🚨 Flagged', button_style='warning', layout=widgets.Layout(width='120px'))
    export_btn = widgets.Button(description='📋 Export', button_style='success', layout=widgets.Layout(width='120px'), disabled=True)
    clear_btn = widgets.Button(description='🗑️ Clear', layout=widgets.Layout(width='120px'))

    status_label = widgets.HTML(value="<b>Status:</b> Ready")
    progress_bar = widgets.IntProgress(value=0, min=0, max=100, layout=widgets.Layout(width='400px'))
    output_area = widgets.Output()

    # State
    current_system = None
    current_results = None

    def update_status(msg, progress=None):
        status_label.value = f"<b>Status:</b> {msg}"
        if progress is not None: progress_bar.value = progress

    def init_system():
        nonlocal current_system
        try:
            config = CONFIG.copy()
            config.update({'OCR_MODEL': ocr_selector.value, 'LLM_PROVIDER': llm_provider.value, 'LLM_MODEL': llm_model.value,
                          'SUSPECTS_GALLERY_PATH': gallery_path.value, 'OCR_CONFIDENCE_THRESHOLD': confidence_slider.value})
            current_system = CompactForensicSystem(config)
            return True
        except Exception as e: print(f"❌ Init error: {e}"); return False

    # Button handlers
    def on_quick_query(query):
        def handler(b): query_input.value = query
        return handler

    for i, btn in enumerate(quick_btns): btn.on_click(on_quick_query(quick_queries[i]))

    def on_test(b):
        with output_area:
            clear_output(wait=True)
            update_status("Testing setup...", 20)
            print("🔍 TESTING LOCAL SETUP")
            print("="*40)

            # Test OCR first
            print(f"Testing OCR: {ocr_selector.value}")
            ocr_info = OCR_MODELS.get(ocr_selector.value, {})
            print(f"Purpose: {ocr_info.get('strengths', 'General OCR')}")

            try:
                if ocr_selector.value == 'paddleocr':
                    import paddleocr; print("✅ PaddleOCR available")
                elif ocr_selector.value == 'easyocr':
                    import easyocr; print("✅ EasyOCR available")
                elif ocr_selector.value == 'tesseract':
                    import pytesseract; print("✅ Tesseract available")
                elif ocr_selector.value == 'trocr':
                    import transformers; print("✅ TrOCR (Transformers) available")
            except ImportError as e: print(f"❌ {ocr_selector.value} not found: {e}")

            # Test other dependencies
            deps = [('paddleocr', 'PaddleOCR'), ('langchain', 'LangChain'), ('chromadb', 'ChromaDB'), ('sentence_transformers', 'Transformers')]
            for dep, name in deps:
                try: __import__(dep.replace('-', '_')); print(f"✅ {name}")
                except ImportError: print(f"❌ {name} not found")

            # Test Ollama
            if llm_provider.value == 'ollama':
                update_status("Testing Ollama...", 60)
                try:
                    import requests
                    r = requests.get("http://localhost:11434/api/tags", timeout=5)
                    if r.status_code == 200:
                        models = [m['name'] for m in r.json().get('models', [])]
                        print(f"✅ Ollama: {models}")
                        if llm_model.value in models: print(f"✅ Model '{llm_model.value}' ready")
                        else: print(f"⚠️ Model '{llm_model.value}' not found. Install: ollama pull {llm_model.value}")
                    else: print("❌ Ollama not responding")
                except: print("❌ Ollama connection failed. Start: ollama serve")

            update_status("Testing system init...", 80)
            if init_system(): print("✅ System ready"); update_status("Setup complete!", 100)
            else: print("❌ System init failed"); update_status("Setup failed", 0)

    def on_process(b):
        with output_area:
            clear_output(wait=True)
            update_status("Processing gallery...", 20)
            if not init_system(): update_status("Init failed", 0); return

            print(f"🏠 PROCESSING | OCR: {ocr_selector.value} | LLM: {llm_provider.value}-{llm_model.value}")
            try:
                current_system.process_gallery()
                update_status("Processing complete!", 100)
                flagged = current_system.get_flagged(risk_slider.value)
                if flagged: print(f"\n🚨 FLAGGED: {len(flagged)} images above risk {risk_slider.value}")
                else: print("\n✅ No high-risk content detected")
            except Exception as e: update_status("Processing error", 0); print(f"❌ Error: {e}")

    def on_investigate(b):
        nonlocal current_results
        with output_area:
            clear_output(wait=True)
            if not current_system and not init_system(): print("❌ System not ready"); return

            query = query_input.value.strip()
            if not query: print("⚠️ Enter investigation query"); return

            update_status("Investigating...", 50)
            print(f"🏠 INVESTIGATING: '{query}' | Model: {llm_model.value}")
            try:
                current_results = current_system.investigate(query)
                if current_results.get('total_flagged', 0) > 0:
                    update_status(f"Found {current_results['total_flagged']} matches", 100)
                    export_btn.disabled = False
                    print(f"📊 Summary: {current_results.get('summary', 'N/A')}")
                    print("🚨 FLAGGED:")
                    for f in current_results.get('findings', []):
                        if f.get('matches_query'):
                            print(f"  {f['risk_level'].upper()}: {f['filename']} - {f['evidence_found']}")

                    # Display images
                    flagged_files = [f['filename'] for f in current_results.get('findings', []) if f.get('matches_query')]
                    if flagged_files: display_flagged_images(flagged_files[:6], current_system)
                else:
                    update_status("No matches", 100); export_btn.disabled = True
                    print("❌ No matches. Try different terms or lower risk threshold.")
            except Exception as e: update_status("Investigation error", 0); print(f"❌ Error: {e}")

    def on_flagged(b):
        with output_area:
            clear_output(wait=True)
            if not current_system and not init_system(): print("❌ System not ready"); return

            update_status("Loading flagged images...", 50)
            try:
                flagged = current_system.get_flagged(risk_slider.value)
                if flagged:
                    print(f"🚨 FLAGGED IMAGES (Risk ≥ {risk_slider.value})")
                    for i, (filename, _, text, risk, analysis, conf) in enumerate(flagged, 1):
                        level = "CRITICAL" if risk > 0.8 else "HIGH" if risk > 0.6 else "MEDIUM" if risk > 0.3 else "LOW"
                        print(f"{i}. {filename} | {level} ({risk:.3f}) | OCR: {conf:.3f}")
                        print(f"   Text: {text[:100]}...")
                        if analysis:
                            try:
                                data = json.loads(analysis)
                                cats = data.get('categories', [])
                                if cats: print(f"   Categories: {', '.join(cats)}")
                            except: pass
                        print()

                    files = [r[0] for r in flagged[:6]]
                    display_flagged_images(files, current_system)
                    update_status(f"Showing {len(flagged)} flagged images", 100)
                else:
                    print(f"✅ No images flagged above risk {risk_slider.value}")
                    update_status("No flagged images", 100)
            except Exception as e: update_status("Error loading flagged", 0); print(f"❌ Error: {e}")

    def on_export(b):
        with output_area:
            if not current_results: print("⚠️ No results to export"); return
            try:
                update_status("Exporting...", 50)
                path = current_system.export_results(current_results, query_input.value or "investigation")
                update_status("Export complete!", 100)
                print(f"✅ Results exported to: {path}")
            except Exception as e: update_status("Export error", 0); print(f"❌ Error: {e}")

    def on_clear(b):
        nonlocal current_results
        current_results = None; export_btn.disabled = True
        with output_area: clear_output(); update_status("Cleared", 0); print("🗑️ Cleared")

    # Connect events
    test_btn.on_click(on_test); process_btn.on_click(on_process); investigate_btn.on_click(on_investigate)
    flagged_btn.on_click(on_flagged); export_btn.on_click(on_export); clear_btn.on_click(on_clear)

    # Layout
    quick_grid = widgets.GridBox(quick_btns, layout=widgets.Layout(grid_template_columns='repeat(3, 1fr)', grid_gap='5px'))
    controls = widgets.VBox([
        widgets.HTML("<h3>🏠 Compact Local Forensic System</h3>"),
        widgets.HBox([llm_provider, llm_model]), gallery_path, query_input,
        widgets.HTML("<b>Quick Queries:</b>"), quick_grid,
        widgets.HBox([confidence_slider, risk_slider]),
        widgets.HBox([test_btn, process_btn, investigate_btn, flagged_btn, export_btn, clear_btn]),
        status_label, progress_bar, widgets.HTML("<hr>")
    ])

    return widgets.VBox([controls, output_area])

    for i, filename in enumerate(filenames[:num]):
        ax = axes[i] if num > 1 else axes
        try:
            result = system.conn.execute('''SELECT full_path, extracted_text, risk_score, local_analysis
                FROM forensic_images WHERE filename = ?''', (filename,)).fetchone()
            if not result: ax.text(0.5, 0.5, f"Not found\n{filename}", ha='center', va='center', transform=ax.transAxes); ax.axis('off'); continue

            full_path, text, risk, analysis = result
            try: img = plt.imread(full_path); ax.imshow(img)
            except: ax.text(0.5, 0.5, f"Load error\n{filename}", ha='center', va='center', transform=ax.transAxes); ax.axis('off'); continue

            # Risk level and color
            level, color = ("CRITICAL", "red") if risk > 0.8 else ("HIGH", "orange") if risk > 0.6 else ("MEDIUM", "yellow") if risk > 0.3 else ("LOW", "green")
            ax.set_title(f"{filename}\n{level}: {risk:.2f}", fontsize=10, color=color, fontweight='bold')

            # Text overlay
            text_preview = text[:80] if text else "No text"
            ax.text(0.02, 0.98, f"Text: {text_preview}...", transform=ax.transAxes, fontsize=8,
                   verticalalignment='top', bbox=dict(boxstyle="round,pad=0.3", facecolor=color, alpha=0.7))

            # Categories
            if analysis:
                try:
                    data = json.loads(analysis)
                    cats = data.get('categories', [])
                    if cats: ax.text(0.02, 0.02, f"Cat: {', '.join(cats[:2])}", transform=ax.transAxes, fontsize=7,
                                   verticalalignment='bottom', bbox=dict(boxstyle="round,pad=0.2", facecolor="lightblue", alpha=0.8))
                except: pass
            ax.axis('off')
        except Exception as e: ax.text(0.5, 0.5, f"Error\n{filename}\n{str(e)[:30]}", ha='center', va='center', transform=ax.transAxes); ax.axis('off')

    for j in range(num, len(axes)): axes[j].axis('off')
    plt.tight_layout(); plt.suptitle('🏠 Local Forensic Results', fontsize=14, y=1.02); plt.show()

# Quick functions
def quick_setup():
    print("🏠 QUICK SETUP CHECK")
    deps = ['paddleocr', 'langchain', 'chromadb', 'sentence_transformers']
    for dep in deps:
        try: __import__(dep.replace('-', '_')); print(f"✅ {dep}")
        except ImportError: print(f"❌ {dep} - pip install {dep}")

    try:
        import requests
        r = requests.get("http://localhost:11434/api/tags", timeout=5)
        if r.status_code == 200:
            models = [m['name'] for m in r.json().get('models', [])]
            print(f"✅ Ollama: {models}")
            if 'llama3.2:3b' in models: print("✅ Recommended model ready")
            else: print("⚠️ Install: ollama pull llama3.2:3b")
        else: print("❌ Ollama not responding")
    except: print("❌ Ollama not available - Start: ollama serve")

def quick_investigate(query: str, gallery_path: str = './suspects_gallery'):
    print(f"🏠 Quick Investigation: '{query}' | 🔒 100% Local")
    try:
        config = CONFIG.copy(); config['SUSPECTS_GALLERY_PATH'] = gallery_path
        system = CompactForensicSystem(config)

        # Check if processed
        count = system.conn.execute("SELECT COUNT(*) FROM forensic_images WHERE vector_stored = 1").fetchone()[0]
        if count == 0: print("📁 Processing gallery first..."); system.process_gallery()

        results = system.investigate(query)
        if results.get('total_flagged', 0) > 0:
            path = system.export_results(results, query)
            print(f"📋 Exported to: {path}")
        system.close(); return results
    except Exception as e: print(f"❌ Error: {e}"); return None

# Display interface with debugging
print("🏠 Creating Compact Local Forensic Interface...")
print("🔧 Debug: OCR models available:", list(OCR_MODELS.keys()))

compact_interface = create_compact_gui()

# Verify the interface was created properly
print("✅ GUI created successfully")
print("🔍 OCR selector should be visible at the top")

display(compact_interface)

print(f"\n🏠 COMPACT LOCAL FORENSIC SYSTEM READY!")
print("="*50)
print("🔒 PRIVACY: 100% local, no API calls, offline capable")
print("🤖 COMPONENTS: Multi-OCR + Ollama/Transformers + ChromaDB + SQLite")
print("📝 OCR OPTIONS: PaddleOCR, EasyOCR, Tesseract, TrOCR")
print("🚀 SETUP: ollama serve → ollama pull llama3.2:3b")
print("💡 QUICK: quick_setup() | quick_investigate('find weapons')")
print("="*50)
print("\n🔍 OCR SELECTION GUIDE:")
print("• PaddleOCR: Best overall, Asian languages, handwriting")
print("• EasyOCR: European languages, clean text")
print("• Tesseract: Printed documents, very stable")
print("• TrOCR: Handwritten notes, degraded images")
print("="*50)

🏠 LOCAL FORENSIC SYSTEM | LLM: ollama-llama3.2:3b | 🔒 100% Private
✅ paddleocr
✅ langchain
✅ chromadb
✅ sentence_transformers
✅ Ollama: ['deepseek-r1:1.5b', 'llama3.2:latest']
🏠 Creating Compact Local Forensic Interface...
🔧 Debug: OCR models available: ['paddleocr', 'easyocr', 'tesseract', 'trocr']
🔧 Creating GUI with OCR selector...
Available OCR models: ['paddleocr', 'easyocr', 'tesseract', 'trocr']
OCR selector options: (('PaddleOCR - Best overall, 80+ languages, handwriting capable', 'paddleocr'), ('EasyOCR - Good for European languages, simple setup', 'easyocr'), ('Tesseract - Classic OCR, best for clean printed text', 'tesseract'), ('TrOCR (Transformers) - AI-based, excellent for handwriting', 'trocr'))
OCR selector value: paddleocr
✅ GUI created successfully
🔍 OCR selector should be visible at the top


VBox(children=(VBox(children=(HTML(value='<h3>🏠 Compact Local Forensic System</h3>'), HBox(children=(Dropdown(…


🏠 COMPACT LOCAL FORENSIC SYSTEM READY!
🔒 PRIVACY: 100% local, no API calls, offline capable
🤖 COMPONENTS: Multi-OCR + Ollama/Transformers + ChromaDB + SQLite
📝 OCR OPTIONS: PaddleOCR, EasyOCR, Tesseract, TrOCR
🚀 SETUP: ollama serve → ollama pull llama3.2:3b
💡 QUICK: quick_setup() | quick_investigate('find weapons')

🔍 OCR SELECTION GUIDE:
• PaddleOCR: Best overall, Asian languages, handwriting
• EasyOCR: European languages, clean text
• Tesseract: Printed documents, very stable
• TrOCR: Handwritten notes, degraded images


In [16]:
# Compact Local Forensic OCR System with GUI
import os, json, sqlite3, shutil, warnings, time
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Optional
import numpy as np
from PIL import Image
warnings.filterwarnings("ignore")

# Compact config with OCR options
CONFIG = {
    'OCR_MODEL': 'paddleocr', 'OCR_CONFIDENCE_THRESHOLD': 0.6,
    'LLM_PROVIDER': 'ollama', 'LLM_MODEL': 'llama3.2:3b', 'LLM_BASE_URL': 'http://localhost:11434',
    'LLM_TEMPERATURE': 0.1, 'LLM_MAX_TOKENS': 1000,
    'EMBEDDING_PROVIDER': 'huggingface', 'EMBEDDING_MODEL': 'sentence-transformers/all-MiniLM-L6-v2',
    'VECTOR_STORE_PATH': './forensic_vectorstore', 'CHUNK_SIZE': 500, 'CHUNK_OVERLAP': 50,
    'SUSPECTS_GALLERY_PATH': '../../', 'DATABASE_PATH': './forensic_analysis_db',
    'RESULTS_OUTPUT_PATH': './forensic_results', 'BATCH_SIZE': 4, 'MAX_RESULTS_DISPLAY': 10,
}

# OCR model options for different evidence types
OCR_MODELS = {
    'paddleocr': {'name': 'PaddleOCR', 'desc': 'Best overall, 80+ languages, handwriting capable', 'strengths': 'Asian text, complex layouts'},
    'easyocr': {'name': 'EasyOCR', 'desc': 'Good for European languages, simple setup', 'strengths': 'European text, clean images'},
    'tesseract': {'name': 'Tesseract', 'desc': 'Classic OCR, best for clean printed text', 'strengths': 'Printed documents, stable'},
    'trocr': {'name': 'TrOCR (Transformers)', 'desc': 'AI-based, excellent for handwriting', 'strengths': 'Handwritten notes, degraded images'}
}

print(f"🏠 LOCAL FORENSIC SYSTEM | LLM: {CONFIG['LLM_PROVIDER']}-{CONFIG['LLM_MODEL']} | 🔒 100% Private")

# Quick dependency check
def check_deps():
    deps = ['paddleocr', 'langchain', 'chromadb', 'sentence_transformers']
    for dep in deps:
        try: __import__(dep.replace('-', '_')); print(f"✅ {dep}")
        except ImportError: print(f"❌ {dep} - Install: pip install {dep}")

def check_ollama():
    try:
        import requests
        r = requests.get(f"{CONFIG['LLM_BASE_URL']}/api/tags", timeout=5)
        if r.status_code == 200:
            models = [m['name'] for m in r.json().get('models', [])]
            print(f"✅ Ollama: {models}")
            return CONFIG['LLM_MODEL'] in models
        return False
    except: print("❌ Ollama not running"); return False

check_deps()
ollama_ready = check_ollama() if CONFIG['LLM_PROVIDER'] == 'ollama' else True

# Compact forensic system
class CompactForensicSystem:
    def __init__(self, config=None):
        self.config = config or CONFIG.copy()
        self._setup_dirs()
        self._init_db()
        self._init_ocr()
        self._init_llm()

    def _setup_dirs(self):
        for k in ['SUSPECTS_GALLERY_PATH', 'DATABASE_PATH', 'RESULTS_OUTPUT_PATH', 'VECTOR_STORE_PATH']:
            os.makedirs(self.config[k], exist_ok=True)

    def _init_db(self):
        db_path = os.path.join(self.config['DATABASE_PATH'], 'forensic.db')
        self.conn = sqlite3.connect(db_path)
        self.conn.execute('''CREATE TABLE IF NOT EXISTS forensic_images (
            id INTEGER PRIMARY KEY, filename TEXT UNIQUE, full_path TEXT, extracted_text TEXT,
            ocr_confidence REAL, processed_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            local_analysis TEXT, risk_score REAL, contains_incriminating BOOLEAN DEFAULT 0,
            vector_stored BOOLEAN DEFAULT 0)''')
        self.conn.execute('''CREATE TABLE IF NOT EXISTS local_investigations (
            id INTEGER PRIMARY KEY, query TEXT, investigation_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            results_json TEXT, flagged_images TEXT, model_used TEXT, processing_time REAL)''')
        self.conn.commit()

    def _init_ocr(self):
        try:
            ocr_model = self.config['OCR_MODEL']
            if ocr_model == 'paddleocr':
                from paddleocr import PaddleOCR
                self.ocr = PaddleOCR(use_textline_orientation=True, lang='en', show_log=False)
            elif ocr_model == 'easyocr':
                import easyocr
                self.ocr = easyocr.Reader(['en'], gpu=False)
            elif ocr_model == 'tesseract':
                import pytesseract
                self.ocr = pytesseract
            elif ocr_model == 'trocr':
                from transformers import TrOCRProcessor, VisionEncoderDecoderModel
                self.ocr_processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed')
                self.ocr_model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-printed')
                self.ocr = 'trocr'
            print(f"✅ {OCR_MODELS[ocr_model]['name']} ready")
        except Exception as e: print(f"❌ OCR error: {e}"); self.ocr = None

    def _init_llm(self):
        try:
            if self.config['LLM_PROVIDER'] == 'ollama':
                from langchain_community.llms import Ollama
                self.llm = Ollama(model=self.config['LLM_MODEL'], base_url=self.config['LLM_BASE_URL'],
                                temperature=self.config['LLM_TEMPERATURE'])
            elif self.config['LLM_PROVIDER'] == 'transformers':
                from langchain_community.llms import HuggingFacePipeline
                from transformers import pipeline
                pipe = pipeline("text-generation", model=self.config['LLM_MODEL'],
                              max_new_tokens=self.config['LLM_MAX_TOKENS'])
                self.llm = HuggingFacePipeline(pipeline=pipe)

            from langchain_community.embeddings import HuggingFaceEmbeddings
            from langchain_community.vectorstores import Chroma
            from langchain.text_splitter import RecursiveCharacterTextSplitter

            self.embeddings = HuggingFaceEmbeddings(model_name=self.config['EMBEDDING_MODEL'],
                                                  model_kwargs={'device': 'cpu'})
            self.vectorstore = Chroma(persist_directory=self.config['VECTOR_STORE_PATH'],
                                    embedding_function=self.embeddings)
            self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.config['CHUNK_SIZE'],
                                                              chunk_overlap=self.config['CHUNK_OVERLAP'])
            print("✅ LangChain ready")
        except Exception as e: print(f"❌ LLM error: {e}")

    def extract_text(self, img_path: str) -> Dict[str, Any]:
        try:
            img = Image.open(img_path).convert("RGB")
            result = {'filename': os.path.basename(img_path), 'full_path': img_path,
                     'extracted_text': '', 'ocr_confidence': 0.0, 'error': None}

            if self.ocr:
                ocr_model = self.config['OCR_MODEL']

                if ocr_model == 'paddleocr':
                    ocr_result = self.ocr.ocr(np.array(img), cls=True)
                    if ocr_result and ocr_result[0]:
                        texts, confs = [], []
                        for line in ocr_result:
                            for detection in line:
                                bbox, (text, conf) = detection
                                if conf >= self.config['OCR_CONFIDENCE_THRESHOLD']:
                                    texts.append(text); confs.append(conf)
                        result.update({'extracted_text': ' '.join(texts),
                                     'ocr_confidence': np.mean(confs) if confs else 0.0})

                elif ocr_model == 'easyocr':
                    ocr_result = self.ocr.readtext(np.array(img))
                    texts, confs = [], []
                    for detection in ocr_result:
                        bbox, text, conf = detection
                        if conf >= self.config['OCR_CONFIDENCE_THRESHOLD']:
                            texts.append(text); confs.append(conf)
                    result.update({'extracted_text': ' '.join(texts),
                                 'ocr_confidence': np.mean(confs) if confs else 0.0})

                elif ocr_model == 'tesseract':
                    import pytesseract
                    text = pytesseract.image_to_string(img, config='--psm 6')
                    if text.strip():
                        result.update({'extracted_text': text.strip(), 'ocr_confidence': 0.8})

                elif ocr_model == 'trocr':
                    pixel_values = self.ocr_processor(images=img, return_tensors="pt").pixel_values
                    generated_ids = self.ocr_model.generate(pixel_values)
                    text = self.ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
                    if text.strip():
                        result.update({'extracted_text': text.strip(), 'ocr_confidence': 0.9})

            return result
        except Exception as e:
            return {'filename': os.path.basename(img_path), 'full_path': img_path,
                   'extracted_text': '', 'ocr_confidence': 0.0, 'error': str(e)}

    def analyze_text(self, text: str, filename: str) -> Dict[str, Any]:
        prompt = f'''Analyze text from image '{filename}' for criminal evidence:
Text: {text}
Look for: weapons, drugs, threats, violence, financial crimes, personal info theft.
Respond with JSON only:
{{"contains_incriminating": true/false, "risk_score": 0.0-1.0,
"categories": ["weapons","drugs","threats","financial_crimes","violence","personal_info"],
"analysis": "explanation", "keywords": ["words"], "evidence_summary": "summary"}}'''

        try:
            response = str(self.llm.invoke(prompt)).strip()
            response = response.replace('```json', '').replace('```', '').strip()
            start, end = response.find('{'), response.rfind('}') + 1
            if start >= 0 and end > start:
                parsed = json.loads(response[start:end])
                return {k: parsed.get(k, v) for k, v in [
                    ('contains_incriminating', False), ('risk_score', 0.0), ('categories', []),
                    ('analysis', ''), ('keywords', []), ('evidence_summary', '')]}
            else: return self._fallback_analysis(response, filename)
        except: return self._fallback_analysis(response if 'response' in locals() else '', filename)

    def _fallback_analysis(self, response: str, filename: str) -> Dict[str, Any]:
        keywords = {'weapons': ['weapon','gun','knife','bomb'], 'drugs': ['drug','cocaine','meth'],
                   'threats': ['threat','kill','attack'], 'financial_crimes': ['fraud','steal']}
        found_cats, found_words, risk = [], [], 0.0
        resp_lower = response.lower()
        for cat, words in keywords.items():
            for word in words:
                if word in resp_lower: found_cats.append(cat); found_words.append(word); risk += 0.15
        return {'contains_incriminating': risk > 0.3, 'risk_score': min(risk, 1.0),
               'categories': list(set(found_cats)), 'analysis': f'Fallback analysis: {response[:100]}',
               'keywords': found_words, 'evidence_summary': f'Found {len(found_words)} terms'}

    def add_to_vectorstore(self, text: str, metadata: Dict[str, Any]):
        try:
            from langchain_core.documents import Document
            chunks = self.text_splitter.split_text(text)
            docs = [Document(page_content=chunk, metadata={**metadata, 'chunk_id': i})
                   for i, chunk in enumerate(chunks)]
            self.vectorstore.add_documents(docs); self.vectorstore.persist(); return True
        except: return False

    def process_gallery(self):
        start_time = time.time()
        gallery_path = Path(self.config['SUSPECTS_GALLERY_PATH'])
        print(f"🏠 PROCESSING: {gallery_path} | Model: {self.config['LLM_MODEL']}")

        if not gallery_path.exists(): print(f"❌ Path not found: {gallery_path}"); return

        exts = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
        files = [f for ext in exts for f in gallery_path.glob(f"**/*{ext}") + gallery_path.glob(f"**/*{ext.upper()}")]
        total, processed, vector_stored, incriminating = len(files), 0, 0, 0

        print(f"📸 Found {total} images")
        if total == 0: print("⚠️ No images found!"); return

        for i, img_path in enumerate(files, 1):
            filename = img_path.name
            print(f"[{i:3d}/{total}] ({i/total*100:5.1f}%) {filename[:40]}...", end='')

            # Skip if processed
            if self.conn.execute("SELECT id FROM forensic_images WHERE filename = ?", (filename,)).fetchone():
                print(" ⏭️ SKIP"); continue

            # Extract and analyze
            ocr_result = self.extract_text(img_path)
            if ocr_result.get('error'): print(" ❌ ERROR"); continue

            text = ocr_result['extracted_text']
            analysis = None
            if text.strip():
                analysis = self.analyze_text(text, filename)
                metadata = {'filename': filename, 'full_path': str(img_path),
                          'ocr_confidence': ocr_result['ocr_confidence']}
                if self.add_to_vectorstore(text, metadata): vector_stored += 1

            # Store in DB
            risk = analysis['risk_score'] if analysis else 0.0
            incrim = analysis['contains_incriminating'] if analysis else False
            self.conn.execute('''INSERT OR REPLACE INTO forensic_images
                (filename, full_path, extracted_text, ocr_confidence, local_analysis,
                 risk_score, contains_incriminating, vector_stored) VALUES (?,?,?,?,?,?,?,?)''',
                (filename, str(img_path), text, ocr_result['ocr_confidence'],
                 json.dumps(analysis) if analysis else None, risk, incrim, vector_stored > 0))

            if text.strip():
                if incrim: incriminating += 1; print(f" 🚨 INCRIMINATING ({risk:.2f})")
                else: print(" ✅ TEXT")
            else: print(" ⚪ NO TEXT")

            processed += 1
            if i % 10 == 0: self.conn.commit()

        self.conn.commit()
        print(f"\n🏠 COMPLETE: {processed} processed, {vector_stored} vectorized, {incriminating} flagged, {time.time()-start_time:.1f}s")

    def investigate(self, query: str) -> Dict[str, Any]:
        print(f"🏠 INVESTIGATING: '{query}'")
        try:
            docs = self.vectorstore.similarity_search(query, k=5)
            if not docs: return {'findings': [], 'summary': 'No evidence found', 'total_flagged': 0}

            evidence, files = "", set()
            for i, doc in enumerate(docs, 1):
                filename = doc.metadata.get('filename', f'unknown_{i}')
                files.add(filename)
                evidence += f"Evidence {i} - {filename}: {doc.page_content}\n---\n"

            prompt = f'''Forensic investigation: "{query}"
Evidence: {evidence}
Respond with JSON:
{{"findings": [{{"image_number": 1, "filename": "file.jpg", "matches_query": true,
"risk_level": "high", "evidence_found": "desc", "relevant_text": "text", "confidence": 0.8}}],
"summary": "summary", "total_flagged": 2, "highest_risk_image": "file.jpg"}}'''

            response = str(self.llm.invoke(prompt)).strip()
            response = response.replace('```json', '').replace('```', '').strip()
            start, end = response.find('{'), response.rfind('}') + 1

            if start >= 0 and end > start:
                results = json.loads(response[start:end])
            else:
                results = self._fallback_investigation(query, files)

            # Store investigation
            flagged = [f.get('filename', '') for f in results.get('findings', []) if f.get('matches_query')]
            self.conn.execute('''INSERT INTO local_investigations
                (query, results_json, flagged_images, model_used) VALUES (?,?,?,?)''',
                (query, json.dumps(results), json.dumps(flagged), f"{self.config['LLM_PROVIDER']}-{self.config['LLM_MODEL']}"))
            self.conn.commit()

            print(f"✅ Found {results.get('total_flagged', 0)} matches")
            return results
        except Exception as e:
            print(f"❌ Error: {e}")
            return {'findings': [], 'summary': f'Error: {e}', 'total_flagged': 0}

    def _fallback_investigation(self, query: str, files: set) -> Dict[str, Any]:
        findings = [{'image_number': i, 'filename': f, 'matches_query': True, 'risk_level': 'medium',
                    'evidence_found': f'Related to: {query}', 'relevant_text': f'Matches: {query}', 'confidence': 0.7}
                   for i, f in enumerate(list(files)[:5], 1)]
        return {'findings': findings, 'summary': f'Fallback analysis for: {query}',
               'total_flagged': len(findings), 'highest_risk_image': list(files)[0] if files else None}

    def get_flagged(self, min_risk=0.3):
        return self.conn.execute('''SELECT filename, full_path, extracted_text, risk_score,
            local_analysis, ocr_confidence FROM forensic_images
            WHERE contains_incriminating = 1 AND risk_score >= ? ORDER BY risk_score DESC''', (min_risk,)).fetchall()

    def export_results(self, results: Dict[str, Any], query: str) -> str:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        query_safe = query.replace(' ', '_')[:30]
        output_path = Path(self.config['RESULTS_OUTPUT_PATH']) / f"investigation_{query_safe}_{timestamp}"
        output_path.mkdir(parents=True, exist_ok=True)

        flagged = [f.get('filename', '') for f in results.get('findings', []) if f.get('matches_query')]
        copied = []
        for i, filename in enumerate(flagged, 1):
            result = self.conn.execute("SELECT full_path FROM forensic_images WHERE filename = ?", (filename,)).fetchone()
            if result and Path(result[0]).exists():
                dest = output_path / f"{i:03d}_{filename}"
                shutil.copy2(result[0], dest); copied.append(str(dest))

        # Save reports
        report = {'query': query, 'timestamp': timestamp, 'model': f"{self.config['LLM_PROVIDER']}-{self.config['LLM_MODEL']}",
                 'results': results, 'flagged_images': flagged, 'copied_files': copied}

        with open(output_path / "report.json", 'w') as f: json.dump(report, f, indent=2)
        with open(output_path / "summary.txt", 'w') as f:
            f.write(f"LOCAL FORENSIC REPORT\nQuery: {query}\nDate: {timestamp}\n")
            f.write(f"Model: {self.config['LLM_PROVIDER']}-{self.config['LLM_MODEL']}\n")
            f.write(f"Flagged: {len(flagged)}\nSummary: {results.get('summary', 'N/A')}\n")

        print(f"📋 Exported to: {output_path}")
        return str(output_path)

    def close(self):
        if hasattr(self, 'conn') and self.conn: self.conn.close()

# Compact GUI
import ipywidgets as widgets
from IPython.display import display, clear_output
import matplotlib.pyplot as plt

def create_compact_gui():
    # OCR Configuration - ADD THIS TO EXISTING INTERFACE
    ocr_selector = widgets.Dropdown(
        options=[
            ('PaddleOCR - Best overall, 80+ languages', 'paddleocr'),
            ('EasyOCR - Good for European languages', 'easyocr'),
            ('Tesseract - Classic OCR for printed text', 'tesseract'),
            ('TrOCR - AI-based OCR for handwriting', 'trocr')
        ],
        value='paddleocr',
        description='OCR Engine:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='500px')
    )

    # Existing widgets from your current interface
    llm_provider = widgets.Dropdown(
        options=[('Ollama', 'ollama'), ('Transformers', 'transformers')],
        value='ollama',
        description='LLM Provider:',
        style={'description_width': 'initial'}
    )

    llm_model = widgets.Text(
        value='llama3.2:3b',
        description='LLM Model:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='300px')
    )

    gallery_path = widgets.Text(
        value='./suspects_gallery',
        description='Gallery Path:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='400px')
    )

    query_input = widgets.Textarea(
        value='Find weapons or violence mentions',
        description='Investigation Query:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='500px', height='60px')
    )

    confidence_slider = widgets.FloatSlider(
        value=0.60,
        min=0.1, max=0.9, step=0.05,
        description='OCR Confidence:',
        style={'description_width': 'initial'},
        readout_format='.2f'
    )

    risk_slider = widgets.FloatSlider(
        value=0.3,
        min=0.1, max=1.0, step=0.1,
        description='Risk Threshold:',
        style={'description_width': 'initial'},
        readout_format='.1f'
    )

    # Quick queries (same as your existing interface)
    quick_queries = ["Find weapons", "Look for drugs", "Financial crimes", "Personal info theft", "Threats", "Illegal coordination"]
    quick_btns = [widgets.Button(description=q, tooltip=q, layout=widgets.Layout(width='180px', margin='2px')) for q in quick_queries]

    # Action buttons (same as your existing interface)
    test_btn = widgets.Button(description='🔍 Test Setup', button_style='info', layout=widgets.Layout(width='120px'))
    process_btn = widgets.Button(description='📁 Process', button_style='primary', layout=widgets.Layout(width='120px'))
    investigate_btn = widgets.Button(description='🤖 Investigate', button_style='success', layout=widgets.Layout(width='120px'))
    flagged_btn = widgets.Button(description='🚨 Flagged', button_style='warning', layout=widgets.Layout(width='120px'))
    export_btn = widgets.Button(description='📋 Export', button_style='success', layout=widgets.Layout(width='120px'), disabled=True)
    clear_btn = widgets.Button(description='🗑️ Clear', layout=widgets.Layout(width='120px'))

    status_label = widgets.HTML(value="<b>Status:</b> Ready")
    progress_bar = widgets.IntProgress(value=0, min=0, max=100, layout=widgets.Layout(width='400px'))
    output_area = widgets.Output()

    # State
    current_system = None
    current_results = None

    def update_status(msg, progress=None):
        status_label.value = f"<b>Status:</b> {msg}"
        if progress is not None: progress_bar.value = progress

    def init_system():
        nonlocal current_system
        try:
            config = CONFIG.copy()
            config.update({
                'OCR_MODEL': ocr_selector.value,  # NOW USES SELECTED OCR
                'LLM_PROVIDER': llm_provider.value,
                'LLM_MODEL': llm_model.value,
                'SUSPECTS_GALLERY_PATH': gallery_path.value,
                'OCR_CONFIDENCE_THRESHOLD': confidence_slider.value
            })
            current_system = CompactForensicSystem(config)
            return True
        except Exception as e:
            print(f"❌ Init error: {e}")
            return False

    # Button handlers (same functionality, now with OCR support)
    def on_quick_query(query):
        def handler(b): query_input.value = query
        return handler

    for i, btn in enumerate(quick_btns):
        btn.on_click(on_quick_query(quick_queries[i]))

    def on_test(b):
        with output_area:
            clear_output(wait=True)
            update_status("Testing setup...", 20)
            print("🔍 TESTING LOCAL SETUP")
            print("="*40)

            # Test OCR first
            print(f"🔍 Selected OCR: {ocr_selector.value}")
            ocr_info = {
                'paddleocr': 'Best overall, 80+ languages, handwriting',
                'easyocr': 'Good for European languages, clean text',
                'tesseract': 'Excellent for printed documents, stable',
                'trocr': 'Perfect for handwritten notes, degraded images'
            }
            if ocr_selector.value in ocr_info:
                print(f"   Purpose: {ocr_info[ocr_selector.value]}")

            # Test OCR availability
            try:
                if ocr_selector.value == 'paddleocr':
                    import paddleocr; print("✅ PaddleOCR available")
                elif ocr_selector.value == 'easyocr':
                    import easyocr; print("✅ EasyOCR available")
                elif ocr_selector.value == 'tesseract':
                    import pytesseract; print("✅ Tesseract available")
                elif ocr_selector.value == 'trocr':
                    import transformers; print("✅ TrOCR available")
            except ImportError:
                print(f"❌ {ocr_selector.value} not installed")

            # Test other dependencies
            deps = [('langchain', 'LangChain'), ('chromadb', 'ChromaDB'), ('sentence_transformers', 'Transformers')]
            for dep, name in deps:
                try: __import__(dep.replace('-', '_')); print(f"✅ {name}")
                except ImportError: print(f"❌ {name} not found")

            # Test Ollama
            if llm_provider.value == 'ollama':
                update_status("Testing Ollama...", 60)
                try:
                    import requests
                    r = requests.get("http://localhost:11434/api/tags", timeout=5)
                    if r.status_code == 200:
                        models = [m['name'] for m in r.json().get('models', [])]
                        print(f"✅ Ollama: {models}")
                        if llm_model.value in models:
                            print(f"✅ Model '{llm_model.value}' ready")
                        else:
                            print(f"⚠️ Model '{llm_model.value}' not found. Install: ollama pull {llm_model.value}")
                    else: print("❌ Ollama not responding")
                except: print("❌ Ollama connection failed. Start: ollama serve")

            update_status("Testing system init...", 80)
            if init_system():
                print("✅ System ready with selected OCR");
                update_status("Setup complete!", 100)
            else:
                print("❌ System init failed");
                update_status("Setup failed", 0)

    def on_process(b):
        with output_area:
            clear_output(wait=True)
            update_status("Processing gallery...", 20)
            if not init_system():
                update_status("Init failed", 0); return

            print(f"🏠 PROCESSING | OCR: {ocr_selector.value} | LLM: {llm_provider.value}-{llm_model.value}")
            try:
                current_system.process_gallery()
                update_status("Processing complete!", 100)
                flagged = current_system.get_flagged(risk_slider.value)
                if flagged: print(f"\n🚨 FLAGGED: {len(flagged)} images above risk {risk_slider.value}")
                else: print("\n✅ No high-risk content detected")
            except Exception as e: update_status("Processing error", 0); print(f"❌ Error: {e}")

    def on_investigate(b):
        nonlocal current_results
        with output_area:
            clear_output(wait=True)
            if not current_system and not init_system(): print("❌ System not ready"); return

            query = query_input.value.strip()
            if not query: print("⚠️ Enter investigation query"); return

            update_status("Investigating...", 50)
            print(f"🏠 INVESTIGATING: '{query}' | OCR: {ocr_selector.value} | LLM: {llm_model.value}")
            try:
                current_results = current_system.investigate(query)
                if current_results.get('total_flagged', 0) > 0:
                    update_status(f"Found {current_results['total_flagged']} matches", 100)
                    export_btn.disabled = False
                    print(f"📊 Summary: {current_results.get('summary', 'N/A')}")
                    print("🚨 FLAGGED:")
                    for f in current_results.get('findings', []):
                        if f.get('matches_query'):
                            print(f"  {f['risk_level'].upper()}: {f['filename']} - {f['evidence_found']}")

                    # Display images
                    flagged_files = [f['filename'] for f in current_results.get('findings', []) if f.get('matches_query')]
                    if flagged_files:
                        print(f"🖼️ Displaying {len(flagged_files)} flagged images...")
                        display_flagged_images(flagged_files[:6], current_system)
                else:
                    update_status("No matches", 100); export_btn.disabled = True
                    print("❌ No matches. Try different terms or lower risk threshold.")
            except Exception as e: update_status("Investigation error", 0); print(f"❌ Error: {e}")

    def on_flagged(b):
        with output_area:
            clear_output(wait=True)
            if not current_system and not init_system(): print("❌ System not ready"); return

            update_status("Loading flagged images...", 50)
            try:
                flagged = current_system.get_flagged(risk_slider.value)
                if flagged:
                    print(f"🚨 FLAGGED IMAGES (Risk ≥ {risk_slider.value})")
                    for i, (filename, _, text, risk, analysis, conf) in enumerate(flagged, 1):
                        level = "CRITICAL" if risk > 0.8 else "HIGH" if risk > 0.6 else "MEDIUM" if risk > 0.3 else "LOW"
                        print(f"{i}. {filename} | {level} ({risk:.3f}) | OCR: {conf:.3f}")
                        print(f"   Text: {text[:100]}...")
                        if analysis:
                            try:
                                data = json.loads(analysis)
                                cats = data.get('categories', [])
                                if cats: print(f"   Categories: {', '.join(cats)}")
                            except: pass
                        print()

                    files = [r[0] for r in flagged[:6]]
                    print(f"🖼️ Displaying {len(files)} flagged images...")
                    display_flagged_images(files, current_system)
                    update_status(f"Showing {len(flagged)} flagged images", 100)
                else:
                    print(f"✅ No images flagged above risk {risk_slider.value}")
                    update_status("No flagged images", 100)
            except Exception as e: update_status("Error loading flagged", 0); print(f"❌ Error: {e}")

    def on_export(b):
        with output_area:
            if not current_results: print("⚠️ No results to export"); return
            try:
                update_status("Exporting...", 50)
                path = current_system.export_results(current_results, query_input.value or "investigation")
                update_status("Export complete!", 100)
                print(f"✅ Results exported to: {path}")
            except Exception as e: update_status("Export error", 0); print(f"❌ Error: {e}")

    def on_clear(b):
        nonlocal current_results
        current_results = None; export_btn.disabled = True
        with output_area: clear_output(); update_status("Cleared", 0); print("🗑️ Cleared")

    # Connect events
    test_btn.on_click(on_test); process_btn.on_click(on_process); investigate_btn.on_click(on_investigate)
    flagged_btn.on_click(on_flagged); export_btn.on_click(on_export); clear_btn.on_click(on_clear)

    # Quick query grid
    quick_grid = widgets.GridBox(quick_btns, layout=widgets.Layout(grid_template_columns='repeat(3, 1fr)', grid_gap='5px'))

    # UPDATED LAYOUT: Add OCR selector at the top
    controls = widgets.VBox([
        widgets.HTML("<h3>🏠 Compact Local Forensic System</h3>"),

        # ADD OCR SELECTOR AT THE TOP
        widgets.HTML("<b>🔍 OCR Engine Selection:</b>"),
        ocr_selector,
        widgets.HTML("<br>"),

        # Existing LLM configuration
        widgets.HBox([llm_provider, llm_model]),
        gallery_path,
        query_input,
        widgets.HTML("<b>Quick Queries:</b>"),
        quick_grid,
        widgets.HBox([confidence_slider, risk_slider]),
        widgets.HBox([test_btn, process_btn, investigate_btn, flagged_btn, export_btn, clear_btn]),
        status_label,
        progress_bar,
        widgets.HTML("<hr>")
    ])

    return widgets.VBox([controls, output_area])

    confidence_slider = widgets.FloatSlider(
        value=CONFIG['OCR_CONFIDENCE_THRESHOLD'],
        min=0.1, max=0.9, step=0.05,
        description='OCR Confidence:',
        style={'description_width': 'initial'},
        readout_format='.2f'
    )

    risk_slider = widgets.FloatSlider(
        value=0.3,
        min=0.1, max=1.0, step=0.1,
        description='Risk Threshold:',
        style={'description_width': 'initial'},
        readout_format='.1f'
    )

    # Quick queries
    quick_queries = ["Find weapons", "Look for drugs", "Financial crimes", "Personal info theft", "Threats", "Illegal coordination"]
    quick_btns = [widgets.Button(description=q[:20], tooltip=q, layout=widgets.Layout(width='180px', margin='2px')) for q in quick_queries]

    # Action buttons
    test_btn = widgets.Button(description='🔍 Test Setup', button_style='info', layout=widgets.Layout(width='120px'))
    process_btn = widgets.Button(description='📁 Process', button_style='primary', layout=widgets.Layout(width='120px'))
    investigate_btn = widgets.Button(description='🤖 Investigate', button_style='success', layout=widgets.Layout(width='120px'))
    flagged_btn = widgets.Button(description='🚨 Flagged', button_style='warning', layout=widgets.Layout(width='120px'))
    export_btn = widgets.Button(description='📋 Export', button_style='success', layout=widgets.Layout(width='120px'), disabled=True)
    clear_btn = widgets.Button(description='🗑️ Clear', layout=widgets.Layout(width='120px'))

    status_label = widgets.HTML(value="<b>Status:</b> Ready")
    progress_bar = widgets.IntProgress(value=0, min=0, max=100, layout=widgets.Layout(width='400px'))
    output_area = widgets.Output()

    # State
    current_system = None
    current_results = None

    def update_status(msg, progress=None):
        status_label.value = f"<b>Status:</b> {msg}"
        if progress is not None: progress_bar.value = progress

    def init_system():
        nonlocal current_system
        try:
            config = CONFIG.copy()
            config.update({
                'OCR_MODEL': ocr_selector.value,
                'LLM_PROVIDER': llm_provider.value,
                'LLM_MODEL': llm_model.value,
                'SUSPECTS_GALLERY_PATH': gallery_path.value,
                'OCR_CONFIDENCE_THRESHOLD': confidence_slider.value
            })
            current_system = CompactForensicSystem(config)
            return True
        except Exception as e:
            print(f"❌ Init error: {e}")
            return False

    # Button handlers
    def on_quick_query(query):
        def handler(b): query_input.value = query
        return handler

    for i, btn in enumerate(quick_btns): btn.on_click(on_quick_query(quick_queries[i]))

    def on_test(b):
        with output_area:
            clear_output(wait=True)
            update_status("Testing setup...", 20)
            print("🔍 TESTING LOCAL SETUP")
            print("="*40)

            # Test OCR first
            print(f"Testing OCR: {ocr_selector.value}")
            ocr_info = OCR_MODELS.get(ocr_selector.value, {})
            print(f"Purpose: {ocr_info.get('strengths', 'General OCR')}")

            try:
                if ocr_selector.value == 'paddleocr':
                    import paddleocr; print("✅ PaddleOCR available")
                elif ocr_selector.value == 'easyocr':
                    import easyocr; print("✅ EasyOCR available")
                elif ocr_selector.value == 'tesseract':
                    import pytesseract; print("✅ Tesseract available")
                elif ocr_selector.value == 'trocr':
                    import transformers; print("✅ TrOCR (Transformers) available")
            except ImportError as e: print(f"❌ {ocr_selector.value} not found: {e}")

            # Test other dependencies
            deps = [('paddleocr', 'PaddleOCR'), ('langchain', 'LangChain'), ('chromadb', 'ChromaDB'), ('sentence_transformers', 'Transformers')]
            for dep, name in deps:
                try: __import__(dep.replace('-', '_')); print(f"✅ {name}")
                except ImportError: print(f"❌ {name} not found")

            # Test Ollama
            if llm_provider.value == 'ollama':
                update_status("Testing Ollama...", 60)
                try:
                    import requests
                    r = requests.get("http://localhost:11434/api/tags", timeout=5)
                    if r.status_code == 200:
                        models = [m['name'] for m in r.json().get('models', [])]
                        print(f"✅ Ollama: {models}")
                        if llm_model.value in models: print(f"✅ Model '{llm_model.value}' ready")
                        else: print(f"⚠️ Model '{llm_model.value}' not found. Install: ollama pull {llm_model.value}")
                    else: print("❌ Ollama not responding")
                except: print("❌ Ollama connection failed. Start: ollama serve")

            update_status("Testing system init...", 80)
            if init_system(): print("✅ System ready"); update_status("Setup complete!", 100)
            else: print("❌ System init failed"); update_status("Setup failed", 0)

    def on_process(b):
        with output_area:
            clear_output(wait=True)
            update_status("Processing gallery...", 20)
            if not init_system(): update_status("Init failed", 0); return

            print(f"🏠 PROCESSING | OCR: {ocr_selector.value} | LLM: {llm_provider.value}-{llm_model.value}")
            try:
                current_system.process_gallery()
                update_status("Processing complete!", 100)
                flagged = current_system.get_flagged(risk_slider.value)
                if flagged: print(f"\n🚨 FLAGGED: {len(flagged)} images above risk {risk_slider.value}")
                else: print("\n✅ No high-risk content detected")
            except Exception as e: update_status("Processing error", 0); print(f"❌ Error: {e}")

    def on_investigate(b):
        nonlocal current_results
        with output_area:
            clear_output(wait=True)
            if not current_system and not init_system(): print("❌ System not ready"); return

            query = query_input.value.strip()
            if not query: print("⚠️ Enter investigation query"); return

            update_status("Investigating...", 50)
            print(f"🏠 INVESTIGATING: '{query}' | Model: {llm_model.value}")
            try:
                current_results = current_system.investigate(query)
                if current_results.get('total_flagged', 0) > 0:
                    update_status(f"Found {current_results['total_flagged']} matches", 100)
                    export_btn.disabled = False
                    print(f"📊 Summary: {current_results.get('summary', 'N/A')}")
                    print("🚨 FLAGGED:")
                    for f in current_results.get('findings', []):
                        if f.get('matches_query'):
                            print(f"  {f['risk_level'].upper()}: {f['filename']} - {f['evidence_found']}")

                    # Display images
                    flagged_files = [f['filename'] for f in current_results.get('findings', []) if f.get('matches_query')]
                    if flagged_files:
                        print(f"🖼️ Displaying {len(flagged_files)} flagged images...")
                        display_flagged_images(flagged_files[:6], current_system)
                else:
                    update_status("No matches", 100); export_btn.disabled = True
                    print("❌ No matches. Try different terms or lower risk threshold.")
            except Exception as e: update_status("Investigation error", 0); print(f"❌ Error: {e}")

    def on_flagged(b):
        with output_area:
            clear_output(wait=True)
            if not current_system and not init_system(): print("❌ System not ready"); return

            update_status("Loading flagged images...", 50)
            try:
                flagged = current_system.get_flagged(risk_slider.value)
                if flagged:
                    print(f"🚨 FLAGGED IMAGES (Risk ≥ {risk_slider.value})")
                    for i, (filename, _, text, risk, analysis, conf) in enumerate(flagged, 1):
                        level = "CRITICAL" if risk > 0.8 else "HIGH" if risk > 0.6 else "MEDIUM" if risk > 0.3 else "LOW"
                        print(f"{i}. {filename} | {level} ({risk:.3f}) | OCR: {conf:.3f}")
                        print(f"   Text: {text[:100]}...")
                        if analysis:
                            try:
                                data = json.loads(analysis)
                                cats = data.get('categories', [])
                                if cats: print(f"   Categories: {', '.join(cats)}")
                            except: pass
                        print()

                    files = [r[0] for r in flagged[:6]]
                    display_flagged_images(files, current_system)
                    update_status(f"Showing {len(flagged)} flagged images", 100)
                else:
                    print(f"✅ No images flagged above risk {risk_slider.value}")
                    update_status("No flagged images", 100)
            except Exception as e: update_status("Error loading flagged", 0); print(f"❌ Error: {e}")

    def on_export(b):
        with output_area:
            if not current_results: print("⚠️ No results to export"); return
            try:
                update_status("Exporting...", 50)
                path = current_system.export_results(current_results, query_input.value or "investigation")
                update_status("Export complete!", 100)
                print(f"✅ Results exported to: {path}")
            except Exception as e: update_status("Export error", 0); print(f"❌ Error: {e}")

    def on_clear(b):
        nonlocal current_results
        current_results = None; export_btn.disabled = True
        with output_area: clear_output(); update_status("Cleared", 0); print("🗑️ Cleared")

    # Connect events
    test_btn.on_click(on_test); process_btn.on_click(on_process); investigate_btn.on_click(on_investigate)
    flagged_btn.on_click(on_flagged); export_btn.on_click(on_export); clear_btn.on_click(on_clear)

    # Layout
    quick_grid = widgets.GridBox(quick_btns, layout=widgets.Layout(grid_template_columns='repeat(3, 1fr)', grid_gap='5px'))
    controls = widgets.VBox([
        widgets.HTML("<h3>🏠 Compact Local Forensic System</h3>"),
        widgets.HBox([llm_provider, llm_model]), gallery_path, query_input,
        widgets.HTML("<b>Quick Queries:</b>"), quick_grid,
        widgets.HBox([confidence_slider, risk_slider]),
        widgets.HBox([test_btn, process_btn, investigate_btn, flagged_btn, export_btn, clear_btn]),
        status_label, progress_bar, widgets.HTML("<hr>")
    ])

    return widgets.VBox([controls, output_area])

    for i, filename in enumerate(filenames[:num]):
        ax = axes[i] if num > 1 else axes
        try:
            result = system.conn.execute('''SELECT full_path, extracted_text, risk_score, local_analysis
                FROM forensic_images WHERE filename = ?''', (filename,)).fetchone()
            if not result: ax.text(0.5, 0.5, f"Not found\n{filename}", ha='center', va='center', transform=ax.transAxes); ax.axis('off'); continue

            full_path, text, risk, analysis = result
            try: img = plt.imread(full_path); ax.imshow(img)
            except: ax.text(0.5, 0.5, f"Load error\n{filename}", ha='center', va='center', transform=ax.transAxes); ax.axis('off'); continue

            # Risk level and color
            level, color = ("CRITICAL", "red") if risk > 0.8 else ("HIGH", "orange") if risk > 0.6 else ("MEDIUM", "yellow") if risk > 0.3 else ("LOW", "green")
            ax.set_title(f"{filename}\n{level}: {risk:.2f}", fontsize=10, color=color, fontweight='bold')

            # Text overlay
            text_preview = text[:80] if text else "No text"
            ax.text(0.02, 0.98, f"Text: {text_preview}...", transform=ax.transAxes, fontsize=8,
                   verticalalignment='top', bbox=dict(boxstyle="round,pad=0.3", facecolor=color, alpha=0.7))

            # Categories
            if analysis:
                try:
                    data = json.loads(analysis)
                    cats = data.get('categories', [])
                    if cats: ax.text(0.02, 0.02, f"Cat: {', '.join(cats[:2])}", transform=ax.transAxes, fontsize=7,
                                   verticalalignment='bottom', bbox=dict(boxstyle="round,pad=0.2", facecolor="lightblue", alpha=0.8))
                except: pass
            ax.axis('off')
        except Exception as e: ax.text(0.5, 0.5, f"Error\n{filename}\n{str(e)[:30]}", ha='center', va='center', transform=ax.transAxes); ax.axis('off')

    for j in range(num, len(axes)): axes[j].axis('off')
    plt.tight_layout(); plt.suptitle('🏠 Local Forensic Results', fontsize=14, y=1.02); plt.show()

# Quick functions
def quick_setup():
    print("🏠 QUICK SETUP CHECK")
    deps = ['paddleocr', 'langchain', 'chromadb', 'sentence_transformers']
    for dep in deps:
        try: __import__(dep.replace('-', '_')); print(f"✅ {dep}")
        except ImportError: print(f"❌ {dep} - pip install {dep}")

    try:
        import requests
        r = requests.get("http://localhost:11434/api/tags", timeout=5)
        if r.status_code == 200:
            models = [m['name'] for m in r.json().get('models', [])]
            print(f"✅ Ollama: {models}")
            if 'llama3.2:3b' in models: print("✅ Recommended model ready")
            else: print("⚠️ Install: ollama pull llama3.2:3b")
        else: print("❌ Ollama not responding")
    except: print("❌ Ollama not available - Start: ollama serve")

def quick_investigate(query: str, gallery_path: str = './suspects_gallery'):
    print(f"🏠 Quick Investigation: '{query}' | 🔒 100% Local")
    try:
        config = CONFIG.copy(); config['SUSPECTS_GALLERY_PATH'] = gallery_path
        system = CompactForensicSystem(config)

        # Check if processed
        count = system.conn.execute("SELECT COUNT(*) FROM forensic_images WHERE vector_stored = 1").fetchone()[0]
        if count == 0: print("📁 Processing gallery first..."); system.process_gallery()

        results = system.investigate(query)
        if results.get('total_flagged', 0) > 0:
            path = system.export_results(results, query)
            print(f"📋 Exported to: {path}")
        system.close(); return results
    except Exception as e: print(f"❌ Error: {e}"); return None

# Display interface with debugging
print("🏠 Creating Compact Local Forensic Interface...")
print("🔧 Debug: OCR models available:", list(OCR_MODELS.keys()))

compact_interface = create_compact_gui()

# Verify the interface was created properly
print("✅ GUI created successfully")
print("🔍 OCR selector should be visible at the top")

display(compact_interface)

print(f"\n🏠 COMPACT LOCAL FORENSIC SYSTEM READY!")
print("="*50)
print("🔒 PRIVACY: 100% local, no API calls, offline capable")
print("🤖 COMPONENTS: Multi-OCR + Ollama/Transformers + ChromaDB + SQLite")
print("📝 OCR OPTIONS: PaddleOCR, EasyOCR, Tesseract, TrOCR")
print("🚀 SETUP: ollama serve → ollama pull llama3.2:3b")
print("💡 QUICK: quick_setup() | quick_investigate('find weapons')")
print("="*50)
print("\n🔍 OCR SELECTION GUIDE:")
print("• PaddleOCR: Best overall, Asian languages, handwriting")
print("• EasyOCR: European languages, clean text")
print("• Tesseract: Printed documents, very stable")
print("• TrOCR: Handwritten notes, degraded images")
print("="*50)

🏠 LOCAL FORENSIC SYSTEM | LLM: ollama-llama3.2:3b | 🔒 100% Private
✅ paddleocr
✅ langchain
✅ chromadb
✅ sentence_transformers
✅ Ollama: ['deepseek-r1:1.5b', 'llama3.2:latest']
🏠 Creating Compact Local Forensic Interface...
🔧 Debug: OCR models available: ['paddleocr', 'easyocr', 'tesseract', 'trocr']
✅ GUI created successfully
🔍 OCR selector should be visible at the top


VBox(children=(VBox(children=(HTML(value='<h3>🏠 Compact Local Forensic System</h3>'), HTML(value='<b>🔍 OCR Eng…


🏠 COMPACT LOCAL FORENSIC SYSTEM READY!
🔒 PRIVACY: 100% local, no API calls, offline capable
🤖 COMPONENTS: Multi-OCR + Ollama/Transformers + ChromaDB + SQLite
📝 OCR OPTIONS: PaddleOCR, EasyOCR, Tesseract, TrOCR
🚀 SETUP: ollama serve → ollama pull llama3.2:3b
💡 QUICK: quick_setup() | quick_investigate('find weapons')

🔍 OCR SELECTION GUIDE:
• PaddleOCR: Best overall, Asian languages, handwriting
• EasyOCR: European languages, clean text
• Tesseract: Printed documents, very stable
• TrOCR: Handwritten notes, degraded images
