Tested with Python 3.9

In [6]:
!pip install ipywidgets paddlepaddle paddleocr

['Text_ocr_000.jpg',
 'Text_ocr_001.jpg',
 'Text_ocr_002.jpg',
 'Text_ocr_003.jpg',
 'Text_ocr_004.jpg',
 'Text_ocr_005.jpg',
 'Text_ocr_006.jpg',
 'Text_ocr_007.jpg',
 'Text_ocr_008.jpg',
 'Text_ocr_009.jpg',
 'Text_ocr_010.jpg',
 'Text_ocr_011.jpg',
 'Text_ocr_012.jpg',
 'Text_ocr_013.jpg',
 'Text_ocr_014.jpg',
 'Text_ocr_015.jpg',
 'Text_ocr_016.jpg',
 'Text_ocr_017.jpg',
 'Text_ocr_018.jpg',
 'Text_ocr_019.jpg',
 'Text_ocr_020.jpg',
 'Text_ocr_021.jpg',
 'Text_ocr_022.jpg',
 'Text_ocr_023.jpg',
 'Text_ocr_024.jpg',
 'Text_ocr_025.jpg',
 'Text_ocr_026.jpg',
 'Text_ocr_027.jpg',
 'Text_ocr_028.jpg',
 'Text_ocr_029.jpg',
 'Text_ocr_030.jpg',
 'Text_ocr_031.jpg',
 'Text_ocr_032.jpg',
 'Text_ocr_033.jpg',
 'Text_ocr_034.jpg',
 'Text_ocr_035.jpg',
 'Text_ocr_036.jpg',
 'Text_ocr_037.jpg',
 'Text_ocr_038.jpg',
 'Text_ocr_039.jpg',
 'Text_ocr_040.jpg',
 'Text_ocr_041.jpg',
 'Text_ocr_042.jpg',
 'Text_ocr_043.jpg',
 'Text_ocr_044.jpg',
 'Text_ocr_045.jpg',
 'Text_ocr_046.jpg',
 'Text_ocr_04

In [1]:
# Cell 1: Configuration and Setup
import os
import shutil
import sqlite3
import json
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any
import time
import numpy as np
from PIL import Image
import cv2

# Configuration Settings
# Single dict of tunables read by the cells below; ForensicOCRSearchSystem
# falls back to this dict when it is constructed without an explicit config.
CONFIG = {
    # OCR Model Selection
    'OCR_MODEL': 'paddleocr',  # 'paddleocr', 'easyocr', 'tesseract'
    'OCR_LANGUAGES': ['en'],   # Languages for OCR

    # Search and Embeddings
    'EMBEDDING_MODEL': 'sentence-transformers/all-MiniLM-L6-v2',
    # 'auto' is replaced in place by the detected device ('cuda' or 'cpu')
    # when the dependency-import cell runs.
    'DEVICE': 'auto',  # 'auto', 'cpu', 'cuda'

    # Paths
    'SUSPECTS_GALLERY_PATH': '../../datasets/images/text_ocr',     # Input folder with suspect images
    'DATABASE_PATH': 'forensic_analysis_db',       # Database folder
    'RESULTS_OUTPUT_PATH': '../../datasets/images/forensic_results',     # Output folder for matched images

    # OCR Processing parameters
    'OCR_CONFIDENCE_THRESHOLD': 0.6,  # Minimum OCR confidence
    'TEXT_MIN_LENGTH': 3,              # Minimum text length to consider
    'BATCH_SIZE': 4,                   # Batch size for processing

    # Search parameters
    'SIMILARITY_THRESHOLD': 0.3,       # Semantic similarity threshold
    'MAX_RESULTS_DISPLAY': 10,         # Maximum results to display
    'TOP_K_RESULTS': 20,               # Top K results for queries

    # Processing settings
    'SAVE_OCR_METADATA': True,         # Save detailed OCR metadata
    'FIGURE_SIZE': (15, 10),           # Size of result visualization
}

# Available OCR models
# Keyed by the values accepted for CONFIG['OCR_MODEL']. 'install_cmd' is
# consumed by install_dependencies(), which drops the leading "pip install"
# words and passes the remaining package names to pip.
AVAILABLE_OCR_MODELS = {
    'paddleocr': {
        'name': 'PaddleOCR',
        'description': 'High accuracy, supports 80+ languages',
        'performance': 'Best overall performance',
        'install_cmd': 'pip install paddlepaddle paddleocr'
    },
    'easyocr': {
        'name': 'EasyOCR',
        'description': 'Simple to use, good accuracy',
        'performance': 'Good performance, easy setup',
        'install_cmd': 'pip install easyocr'
    },
    'tesseract': {
        'name': 'Tesseract',
        'description': 'Classic OCR, widely supported',
        'performance': 'Baseline performance',
        'install_cmd': 'pip install pytesseract'
    }
}

# Echo the effective settings so the cell output records what was configured.
print("✅ Configuration loaded successfully")
print(f"📁 Suspects gallery: {CONFIG['SUSPECTS_GALLERY_PATH']}")
print(f"📁 Database path: {CONFIG['DATABASE_PATH']}")
print(f"📁 Results output: {CONFIG['RESULTS_OUTPUT_PATH']}")
print(f"🔍 OCR model: {CONFIG['OCR_MODEL']}")
print("📝 Forensic OCR + Search System Ready")

# Cell 2: Install and Import Dependencies
def install_dependencies():
    """Make sure every package the notebook relies on can be imported.

    Three groups are probed in turn: the core imaging stack, the OCR backend
    named by ``CONFIG['OCR_MODEL']`` and the search stack. Whenever a probe
    raises ``ImportError`` the matching packages are installed with pip, run
    through the interpreter that executes this kernel.
    """
    import subprocess
    import sys

    pip_install = [sys.executable, "-m", "pip", "install"]

    # --- core imaging / array packages ---
    try:
        for module_name in ("numpy", "PIL", "cv2"):
            __import__(module_name)
        print("✅ Core dependencies available")
    except ImportError:
        print("⚠️ Installing core dependencies...")
        subprocess.check_call(pip_install + ["numpy", "Pillow", "opencv-python"])

    # --- OCR backend selected in CONFIG ---
    ocr_model = CONFIG['OCR_MODEL']
    # (backend key, importable module, label used in the status message)
    known_backends = (
        ('paddleocr', 'paddleocr', 'PaddleOCR'),
        ('easyocr', 'easyocr', 'EasyOCR'),
        ('tesseract', 'pytesseract', 'Tesseract'),
    )
    try:
        for backend_key, module_name, label in known_backends:
            if ocr_model == backend_key:
                __import__(module_name)
                print(f"✅ {label} already available")
                break
    except ImportError:
        print(f"⚠️ Installing {ocr_model}...")
        # install_cmd reads "pip install <pkg> ..."; keep only the package names
        packages = AVAILABLE_OCR_MODELS[ocr_model]['install_cmd'].split()[2:]
        subprocess.check_call(pip_install + packages)

    # --- vector search / embedding packages ---
    try:
        for module_name in ("chromadb", "sentence_transformers"):
            __import__(module_name)
        print("✅ Search dependencies available")
    except ImportError:
        print("⚠️ Installing search dependencies...")
        subprocess.check_call(pip_install + ["chromadb", "sentence-transformers"])

# Run installation
install_dependencies()

# Import required libraries
# Everything is imported inside one try block: on ImportError only a message
# is printed, so the names below stay undefined until the kernel is restarted
# with the packages installed.
try:
    import chromadb
    from sentence_transformers import SentenceTransformer

    # Import OCR model
    # Only the backend selected in CONFIG['OCR_MODEL'] is imported.
    if CONFIG['OCR_MODEL'] == 'paddleocr':
        from paddleocr import PaddleOCR
    elif CONFIG['OCR_MODEL'] == 'easyocr':
        import easyocr
    elif CONFIG['OCR_MODEL'] == 'tesseract':
        import pytesseract

    print("✅ All dependencies imported successfully")

    # Check device compatibility
    import torch
    if torch.cuda.is_available():
        print(f"🚀 CUDA available: {torch.cuda.get_device_name(0)}")
        default_device = "cuda"
    else:
        print("🖥️ Using CPU mode")
        default_device = "cpu"

    # Resolve the 'auto' placeholder by mutating CONFIG in place; an explicit
    # 'cpu'/'cuda' setting is left as configured.
    if CONFIG['DEVICE'] == 'auto':
        CONFIG['DEVICE'] = default_device
        print(f"📍 Auto-detected device: {CONFIG['DEVICE']}")

except ImportError as e:
    print(f"❌ Import error: {e}")
    print("🔧 Please run the installation cell and restart kernel")

# Cell 3: Initialize OCR and Search Models
class ForensicOCRSearchSystem:
    """Comprehensive forensic OCR and search system"""

    def __init__(self, config=None):
        """Initialize the forensic analysis system"""
        self.config = config or CONFIG
        self.setup_directories()
        self.init_ocr_model()
        self.init_search_components()
        self.init_database()

    def setup_directories(self):
        """Create necessary directories"""
        for path_key in ['SUSPECTS_GALLERY_PATH', 'DATABASE_PATH', 'RESULTS_OUTPUT_PATH']:
            path = self.config[path_key]
            os.makedirs(path, exist_ok=True)
            print(f"📁 Directory ready: {path}")

    def init_ocr_model(self):
        """Initialize OCR model based on configuration"""
        ocr_model = self.config['OCR_MODEL']
        languages = self.config['OCR_LANGUAGES']

        print(f"📥 Loading OCR model: {ocr_model}")

        try:
            if ocr_model == 'paddleocr':
                self.ocr = PaddleOCR(
                    use_textline_orientation=True,
                    lang=languages[0] if languages else 'en',
                    # use_gpu=True if self.config['DEVICE'] == 'cuda' else False
                )
                print("✅ PaddleOCR initialized")

            elif ocr_model == 'easyocr':
                self.ocr = easyocr.Reader(
                    languages,
                    gpu=True if self.config['DEVICE'] == 'cuda' else False
                )
                print("✅ EasyOCR initialized")

            elif ocr_model == 'tesseract':
                # Tesseract doesn't need initialization, just check availability
                try:
                    import pytesseract
                    self.ocr = pytesseract
                    print("✅ Tesseract initialized")
                except Exception as e:
                    print(f"❌ Tesseract error: {e}")

        except Exception as e:
            print(f"❌ OCR initialization error: {e}")
            self.ocr = None

    def init_search_components(self):
        """Initialize search and embedding components"""
        try:
            print(f"📥 Loading embedding model: {self.config['EMBEDDING_MODEL']}")
            self.embedding_model = SentenceTransformer(self.config['EMBEDDING_MODEL'])

            print("📥 Initializing vector database...")
            self.chroma_client = chromadb.PersistentClient(path=self.config['DATABASE_PATH'])
            self.collection = self.chroma_client.get_or_create_collection(
                name="forensic_text_search",
                metadata={"hnsw:space": "cosine"}
            )
            print("✅ Search components initialized")

        except Exception as e:
            print(f"❌ Search initialization error: {e}")
            self.embedding_model = None
            self.chroma_client = None
            self.collection = None

    def init_database(self):
        """Initialize SQLite database for metadata"""
        try:
            db_path = os.path.join(self.config['DATABASE_PATH'], 'forensic_metadata.db')
            self.conn = sqlite3.connect(db_path)

            # Create tables
            self.conn.execute('''
                CREATE TABLE IF NOT EXISTS forensic_images (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    filename TEXT UNIQUE NOT NULL,
                    full_path TEXT NOT NULL,
                    extracted_text TEXT,
                    ocr_confidence REAL,
                    text_blocks_count INTEGER,
                    processed_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    file_size INTEGER,
                    image_dimensions TEXT,
                    ocr_metadata TEXT
                )
            ''')

            self.conn.execute('''
                CREATE TABLE IF NOT EXISTS search_queries (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    query_text TEXT NOT NULL,
                    query_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    results_count INTEGER,
                    processing_time REAL
                )
            ''')

            self.conn.commit()
            print("✅ Database initialized")

        except Exception as e:
            print(f"❌ Database initialization error: {e}")
            self.conn = None

    def extract_text_from_image(self, image_path: str) -> Dict[str, Any]:
        """Extract text from image using configured OCR model"""
        try:
            image_path = str(image_path)
            filename = os.path.basename(image_path)

            # Load image
            image = Image.open(image_path).convert("RGB")
            image_array = np.array(image)

            result = {
                'filename': filename,
                'full_path': image_path,
                'extracted_text': '',
                'ocr_confidence': 0.0,
                'text_blocks': [],
                'text_blocks_count': 0,
                'error': None
            }

            # Process with selected OCR model
            if self.config['OCR_MODEL'] == 'paddleocr' and self.ocr:
                ocr_result = self.ocr.ocr(image_array, cls=True)

                if ocr_result and ocr_result[0]:
                    text_blocks = []
                    all_text = []
                    confidences = []

                    for line in ocr_result:
                        for detection in line:
                            bbox, (text, confidence) = detection

                            if confidence >= self.config['OCR_CONFIDENCE_THRESHOLD']:
                                text_blocks.append({
                                    'text': text,
                                    'confidence': confidence,
                                    'bbox': bbox
                                })
                                all_text.append(text)
                                confidences.append(confidence)

                    result.update({
                        'extracted_text': ' '.join(all_text),
                        'ocr_confidence': np.mean(confidences) if confidences else 0.0,
                        'text_blocks': text_blocks,
                        'text_blocks_count': len(text_blocks)
                    })

            elif self.config['OCR_MODEL'] == 'easyocr' and self.ocr:
                ocr_result = self.ocr.readtext(image_array)

                text_blocks = []
                all_text = []
                confidences = []

                for detection in ocr_result:
                    bbox, text, confidence = detection

                    if confidence >= self.config['OCR_CONFIDENCE_THRESHOLD']:
                        text_blocks.append({
                            'text': text,
                            'confidence': confidence,
                            'bbox': bbox
                        })
                        all_text.append(text)
                        confidences.append(confidence)

                result.update({
                    'extracted_text': ' '.join(all_text),
                    'ocr_confidence': np.mean(confidences) if confidences else 0.0,
                    'text_blocks': text_blocks,
                    'text_blocks_count': len(text_blocks)
                })

            elif self.config['OCR_MODEL'] == 'tesseract' and self.ocr:
                # Simple Tesseract extraction
                text = self.ocr.image_to_string(image, config='--psm 6')
                confidence = 0.8  # Tesseract doesn't provide confidence easily

                if text.strip():
                    result.update({
                        'extracted_text': text.strip(),
                        'ocr_confidence': confidence,
                        'text_blocks': [{'text': text.strip(), 'confidence': confidence}],
                        'text_blocks_count': 1
                    })

            return result

        except Exception as e:
            return {
                'filename': os.path.basename(image_path),
                'full_path': image_path,
                'extracted_text': '',
                'ocr_confidence': 0.0,
                'text_blocks': [],
                'text_blocks_count': 0,
                'error': str(e)
            }

    def process_image_gallery(self, image_extensions=['.jpg', '.jpeg', '.png', '.bmp', '.tiff']):
        """Process all images in the suspect gallery"""
        start_time = time.time()
        gallery_path = Path(self.config['SUSPECTS_GALLERY_PATH'])

        print(f"🔍 Processing images from: {gallery_path}")
        print(f"🔍 OCR Model: {self.config['OCR_MODEL']}")
        print(f"⏰ Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

        if not gallery_path.exists():
            print(f"❌ Gallery path {gallery_path} does not exist!")
            return

        # Get all image files
        image_files = []
        for ext in image_extensions:
            image_files.extend(gallery_path.glob(f"**/*{ext}"))
            image_files.extend(gallery_path.glob(f"**/*{ext.upper()}"))

        total_files = len(image_files)
        print(f"📸 Found {total_files} images to process")

        if total_files == 0:
            print("⚠️ No images found!")
            return

        processed_count = 0
        text_found_count = 0

        print("\n" + "="*60)
        print("🔍 PROCESSING IMAGES")
        print("="*60)

        for i, image_path in enumerate(image_files, 1):
            filename = image_path.name

            # Progress indicator
            progress = (i / total_files) * 100
            print(f"[{i:3d}/{total_files}] ({progress:5.1f}%) Processing: {filename[:40]}...", end='')

            # Check if already processed
            cursor = self.conn.execute("SELECT filename FROM forensic_images WHERE filename = ?", (filename,))
            if cursor.fetchone():
                print(" ⏭️ SKIP (already processed)")
                continue

            # Extract text
            ocr_result = self.extract_text_from_image(image_path)

            if ocr_result.get('error'):
                print(f" ❌ ERROR: {ocr_result['error'][:30]}")
                continue

            # Store in database
            file_size = image_path.stat().st_size if image_path.exists() else 0
            image_dims = f"{ocr_result.get('width', 0)}x{ocr_result.get('height', 0)}"

            self.conn.execute('''
                INSERT OR REPLACE INTO forensic_images
                (filename, full_path, extracted_text, ocr_confidence, text_blocks_count,
                 file_size, image_dimensions, ocr_metadata)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                filename,
                str(image_path),
                ocr_result['extracted_text'],
                ocr_result['ocr_confidence'],
                ocr_result['text_blocks_count'],
                file_size,
                image_dims,
                json.dumps(ocr_result['text_blocks']) if self.config['SAVE_OCR_METADATA'] else None
            ))

            # Add to vector database if text found
            if ocr_result['extracted_text'].strip():
                try:
                    doc_id = f"img_{processed_count}_{filename}"
                    self.collection.add(
                        documents=[ocr_result['extracted_text']],
                        metadatas=[{
                            'filename': filename,
                            'full_path': str(image_path),
                            'ocr_confidence': ocr_result['ocr_confidence'],
                            'text_blocks_count': ocr_result['text_blocks_count']
                        }],
                        ids=[doc_id]
                    )
                    text_found_count += 1
                    print(f" ✅ TEXT ({ocr_result['text_blocks_count']} blocks, conf: {ocr_result['ocr_confidence']:.2f})")
                except Exception as e:
                    print(f" ⚠️ DB ERROR: {str(e)[:20]}")
            else:
                print(" ⚪ NO TEXT")

            processed_count += 1

            # Commit every 10 images
            if i % 10 == 0:
                self.conn.commit()

        # Final commit
        self.conn.commit()

        # Calculate timing
        end_time = time.time()
        duration = end_time - start_time

        print("\n" + "="*60)
        print("📊 PROCESSING SUMMARY")
        print("="*60)
        print(f"📸 Total images: {total_files}")
        print(f"✅ Successfully processed: {processed_count}")
        print(f"📝 Images with text: {text_found_count}")
        print(f"⚪ Images without text: {processed_count - text_found_count}")
        print(f"⏱️ Total time: {duration:.1f}s ({duration/total_files:.2f}s per image)")
        print(f"📊 Text detection rate: {(text_found_count/processed_count*100):.1f}%" if processed_count > 0 else "")
        print("="*60)

    def search_by_text_query(self, query: str, max_results: int = None) -> List[Dict[str, Any]]:
        """Search images by semantic text similarity"""
        if max_results is None:
            max_results = self.config['TOP_K_RESULTS']

        if not self.collection or not self.embedding_model:
            print("❌ Search components not available")
            return []

        start_time = time.time()

        try:
            # Perform semantic search
            results = self.collection.query(
                query_texts=[query],
                n_results=max_results
            )

            # Format results
            matches = []
            if results['documents'] and results['documents'][0]:
                for i, doc in enumerate(results['documents'][0]):
                    metadata = results['metadatas'][0][i]
                    distance = results['distances'][0][i] if 'distances' in results else 0
                    similarity = 1 - distance

                    # Filter by similarity threshold
                    if similarity >= self.config['SIMILARITY_THRESHOLD']:
                        matches.append({
                            'filename': metadata['filename'],
                            'full_path': metadata['full_path'],
                            'extracted_text': doc,
                            'similarity_score': similarity,
                            'ocr_confidence': metadata.get('ocr_confidence', 0),
                            'text_blocks_count': metadata.get('text_blocks_count', 0),
                            'text_preview': doc[:200] + "..." if len(doc) > 200 else doc
                        })

            # Log query
            processing_time = time.time() - start_time
            self.conn.execute('''
                INSERT INTO search_queries (query_text, results_count, processing_time)
                VALUES (?, ?, ?)
            ''', (query, len(matches), processing_time))
            self.conn.commit()

            return matches

        except Exception as e:
            print(f"❌ Search error: {e}")
            return []

    def search_by_keywords(self, keywords: List[str], exact_match: bool = False) -> List[Dict[str, Any]]:
        """Search images by specific keywords using SQL"""
        matches = []

        try:
            for keyword in keywords:
                if exact_match:
                    sql_query = """
                        SELECT filename, full_path, extracted_text, ocr_confidence, text_blocks_count
                        FROM forensic_images
                        WHERE extracted_text LIKE ? AND extracted_text != ''
                    """
                    params = (f"%{keyword}%",)
                else:
                    sql_query = """
                        SELECT filename, full_path, extracted_text, ocr_confidence, text_blocks_count
                        FROM forensic_images
                        WHERE LOWER(extracted_text) LIKE LOWER(?) AND extracted_text != ''
                    """
                    params = (f"%{keyword}%",)

                cursor = self.conn.execute(sql_query, params)
                results = cursor.fetchall()

                for row in results:
                    match = {
                        'filename': row[0],
                        'full_path': row[1],
                        'extracted_text': row[2],
                        'ocr_confidence': row[3],
                        'text_blocks_count': row[4],
                        'matched_keyword': keyword,
                        'search_type': 'keyword',
                        'text_preview': row[2][:200] + "..." if len(row[2]) > 200 else row[2]
                    }

                    # Avoid duplicates
                    if not any(m['filename'] == match['filename'] for m in matches):
                        matches.append(match)

            return matches

        except Exception as e:
            print(f"❌ Keyword search error: {e}")
            return []

    def forensic_query_analysis(self, query: str, include_keywords: bool = True) -> Dict[str, Any]:
        """Comprehensive forensic analysis for a query"""
        print(f"🔍 Forensic Analysis: '{query}'")
        print("-" * 50)

        start_time = time.time()

        # Semantic search
        print("📝 Performing semantic search...")
        semantic_results = self.search_by_text_query(query)

        # Keyword search
        keyword_results = []
        if include_keywords:
            print("🔍 Performing keyword search...")
            query_words = query.lower().split()
            keyword_results = self.search_by_keywords(query_words)

        # Combine and deduplicate results
        all_results = semantic_results.copy()
        for kw_result in keyword_results:
            if not any(r['filename'] == kw_result['filename'] for r in all_results):
                kw_result['search_type'] = 'keyword'
                all_results.append(kw_result)

        # Sort by relevance (similarity score or confidence)
        all_results.sort(
            key=lambda x: x.get('similarity_score', x.get('ocr_confidence', 0)),
            reverse=True
        )

        processing_time = time.time() - start_time

        analysis = {
            'query': query,
            'semantic_matches': len(semantic_results),
            'keyword_matches': len(keyword_results),
            'total_unique_matches': len(all_results),
            'processing_time': processing_time,
            'results': all_results[:self.config['MAX_RESULTS_DISPLAY']],
            'timestamp': datetime.now().isoformat()
        }

        print(f"✅ Analysis complete:")
        print(f"   📊 Semantic matches: {len(semantic_results)}")
        print(f"   🔍 Keyword matches: {len(keyword_results)}")
        print(f"   📝 Total unique: {len(all_results)}")
        print(f"   ⏱️ Time: {processing_time:.2f}s")

        return analysis

    def get_statistics(self) -> Dict[str, Any]:
        """Get comprehensive database statistics"""
        try:
            stats = {}

            # Image statistics
            cursor = self.conn.execute("SELECT COUNT(*) FROM forensic_images")
            stats['total_images'] = cursor.fetchone()[0]

            cursor = self.conn.execute("SELECT COUNT(*) FROM forensic_images WHERE extracted_text != ''")
            stats['images_with_text'] = cursor.fetchone()[0]

            cursor = self.conn.execute("SELECT AVG(ocr_confidence) FROM forensic_images WHERE extracted_text != ''")
            result = cursor.fetchone()[0]
            stats['avg_ocr_confidence'] = round(result, 3) if result else 0

            cursor = self.conn.execute("SELECT SUM(text_blocks_count) FROM forensic_images")
            result = cursor.fetchone()[0]
            stats['total_text_blocks'] = result if result else 0

            # Query statistics
            cursor = self.conn.execute("SELECT COUNT(*) FROM search_queries")
            stats['total_queries'] = cursor.fetchone()[0]

            cursor = self.conn.execute("SELECT AVG(processing_time) FROM search_queries")
            result = cursor.fetchone()[0]
            stats['avg_query_time'] = round(result, 3) if result else 0

            # File size statistics
            cursor = self.conn.execute("SELECT SUM(file_size), AVG(file_size) FROM forensic_images")
            total_size, avg_size = cursor.fetchone()
            stats['total_file_size_mb'] = round(total_size / (1024*1024), 2) if total_size else 0
            stats['avg_file_size_kb'] = round(avg_size / 1024, 2) if avg_size else 0

            return stats

        except Exception as e:
            print(f"❌ Statistics error: {e}")
            return {}

    def export_results(self, results: List[Dict], output_folder: str, copy_images: bool = True) -> str:
        """Export search results to folder with metadata"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = Path(output_folder) / f"forensic_search_{timestamp}"
        output_path.mkdir(parents=True, exist_ok=True)

        copied_files = []
        metadata = {
            'export_timestamp': timestamp,
            'total_results': len(results),
            'export_path': str(output_path),
            'results': []
        }

        for i, result in enumerate(results, 1):
            try:
                result_meta = {
                    'index': i,
                    'filename': result['filename'],
                    'original_path': result['full_path'],
                    'extracted_text': result['extracted_text'],
                    'text_preview': result.get('text_preview', ''),
                    'ocr_confidence': result.get('ocr_confidence', 0),
                    'similarity_score': result.get('similarity_score', 0),
                    'search_type': result.get('search_type', 'semantic')
                }

                if copy_images:
                    # Copy image file
                    source_path = Path(result['full_path'])
                    if source_path.exists():
                        # Create descriptive filename
                        conf_or_sim = result.get('similarity_score', result.get('ocr_confidence', 0))
                        new_filename = f"{i:03d}_{source_path.stem}_score{conf_or_sim:.2f}{source_path.suffix}"
                        dest_path = output_path / new_filename

                        shutil.copy2(source_path, dest_path)
                        copied_files.append(str(dest_path))
                        result_meta['exported_filename'] = new_filename
                        result_meta['exported_path'] = str(dest_path)

                metadata['results'].append(result_meta)

            except Exception as e:
                print(f"❌ Error exporting {result['filename']}: {e}")

        # Save metadata JSON
        metadata_file = output_path / "search_metadata.json"
        with open(metadata_file, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)

        # Save text summary
        summary_file = output_path / "search_summary.txt"
        with open(summary_file, 'w', encoding='utf-8') as f:
            f.write(f"Forensic Search Results\n")
            f.write(f"="*50 + "\n")
            f.write(f"Export Date: {timestamp}\n")
            f.write(f"Total Results: {len(results)}\n")
            f.write(f"Files Copied: {len(copied_files)}\n\n")

            for i, result in enumerate(results, 1):
                f.write(f"{i}. {result['filename']}\n")
                f.write(f"   Text: {result.get('text_preview', '')}\n")
                f.write(f"   Score: {result.get('similarity_score', result.get('ocr_confidence', 0)):.3f}\n\n")

        print(f"📁 Exported {len(results)} results to: {output_path}")
        print(f"📋 Images copied: {len(copied_files)}")
        print(f"📄 Metadata saved: {metadata_file}")

        return str(output_path)

    def close(self):
        """Close database connections and cleanup"""
        if hasattr(self, 'conn') and self.conn:
            self.conn.close()
            print("📊 Database connections closed")

# Cell 4: Batch Processing and Forensic Analysis Functions
def run_comprehensive_forensic_analysis(system, custom_queries=None, *,
                                        high_priority_threshold=0.5, top_per_query=3):
    """Run the default forensic query set, plus optional custom queries, against ``system``.

    Args:
        system: Object exposing ``forensic_query_analysis(query, include_keywords=True)``
            that returns a dict with ``'results'`` (list of result dicts) and
            ``'total_unique_matches'`` (int).
        custom_queries: Optional extra queries: a list/tuple of strings, or a single
            string. Surrounding whitespace is stripped; blank entries and entries
            that repeat an earlier query are dropped so no query is analysed twice.
        high_priority_threshold: A result is high priority when its score is
            strictly greater than this value.
        top_per_query: How many leading results of each query are considered for
            the high-priority list.

    Returns:
        dict: ``'all_results'`` (query -> analysis dict), ``'high_priority_findings'``
        (list of finding dicts, sorted by descending score) and ``'summary'``
        (query/match counts and total processing time in seconds).
    """

    # Default forensic text queries
    default_queries = [
        "bank account number",
        "credit card",
        "social security number",
        "phone number",
        "email address",
        "password",
        "address location",
        "threat violence",
        "weapon gun knife",
        "drugs illegal substances",
        "money cash payment",
        "meeting appointment",
        "suspicious activity",
        "personal identification",
        "financial transaction"
    ]

    # A bare string would otherwise fail on list concatenation; treat it as one query.
    if isinstance(custom_queries, str):
        custom_queries = [custom_queries]

    # Merge defaults and custom queries, keeping first-seen order and skipping
    # repeats: a repeated query would overwrite its own entry in all_results and
    # add its high-priority findings a second time.
    queries = []
    seen_queries = set()
    for candidate in default_queries + list(custom_queries or []):
        query_text = candidate.strip()
        if query_text and query_text not in seen_queries:
            seen_queries.add(query_text)
            queries.append(query_text)
    custom_count = len(queries) - len(default_queries)

    print("🚀 Starting comprehensive forensic analysis")
    print(f"📋 Total queries: {len(queries)} ({len(default_queries)} default + {custom_count} custom)")
    print("="*60)

    all_results = {}
    high_priority_findings = []

    start_time = time.time()

    for i, query in enumerate(queries, 1):
        print(f"\n[{i:2d}/{len(queries)}] Analyzing: '{query}'")

        # Perform analysis
        analysis = system.forensic_query_analysis(query, include_keywords=True)
        all_results[query] = analysis

        # Identify high-priority findings among the leading results of this query
        for result in analysis['results'][:top_per_query]:
            # Semantic hits carry a similarity score; otherwise fall back to OCR confidence.
            score = result.get('similarity_score', result.get('ocr_confidence', 0))
            if score > high_priority_threshold:
                high_priority_findings.append({
                    'query': query,
                    'filename': result['filename'],
                    'score': score,
                    'text_preview': result.get('text_preview', ''),
                    'search_type': result.get('search_type', 'semantic')
                })

        # Brief status
        matches = analysis['total_unique_matches']
        status = "🔴" if matches > 5 else "🟡" if matches > 0 else "⚪"
        print(f"   {status} {matches} matches found")

    total_time = time.time() - start_time

    # Generate summary report
    print("\n" + "="*60)
    print("📊 COMPREHENSIVE ANALYSIS SUMMARY")
    print("="*60)

    total_matches = sum(r['total_unique_matches'] for r in all_results.values())
    queries_with_results = sum(1 for r in all_results.values() if r['total_unique_matches'] > 0)

    print(f"🎯 Queries processed: {len(queries)}")
    print(f"✅ Queries with results: {queries_with_results}")
    print(f"📊 Total matches found: {total_matches}")
    print(f"🔴 High priority findings: {len(high_priority_findings)}")
    print(f"⏱️ Total analysis time: {total_time:.1f}s")
    # queries always contains the defaults, so the divisor is never zero.
    print(f"📈 Average per query: {total_time/len(queries):.1f}s")

    # Show top findings by query
    print("\n📋 TOP FINDINGS BY QUERY:")
    for query, analysis in all_results.items():
        matches = analysis['total_unique_matches']
        if matches > 0:
            status = "🔴" if matches > 5 else "🟡"
            print(f"{status} '{query}': {matches} matches")

    # Show high priority findings (the returned list is sorted in place)
    if high_priority_findings:
        print("\n🚨 HIGH PRIORITY EVIDENCE:")
        high_priority_findings.sort(key=lambda x: x['score'], reverse=True)
        for finding in high_priority_findings[:10]:  # Top 10
            print(f"🔴 {finding['filename']} (Query: '{finding['query']}')")
            print(f"   Score: {finding['score']:.3f} | Type: {finding['search_type']}")
            print(f"   Text: {finding['text_preview'][:80]}...")
            print()

    return {
        'all_results': all_results,
        'high_priority_findings': high_priority_findings,
        'summary': {
            'total_queries': len(queries),
            'queries_with_results': queries_with_results,
            'total_matches': total_matches,
            'high_priority_count': len(high_priority_findings),
            'processing_time': total_time
        }
    }

def batch_export_all_results(system, analysis_results, output_base_path):
    """Export a comprehensive analysis into a timestamped folder tree.

    Creates ``<output_base_path>/forensic_analysis_<YYYYmmdd_HHMMSS>/`` and, via
    ``system.export_results``, exports the high-priority findings to
    ``high_priority`` plus one export per category that has results. A master
    ``analysis_summary.json`` is written at the root of the export folder.

    Args:
        system: Object exposing ``conn`` (queried for ``full_path`` and
            ``extracted_text`` in the ``forensic_images`` table) and
            ``export_results(results, output_path, copy_images=True)``.
        analysis_results: Dict with the keys ``'high_priority_findings'``,
            ``'all_results'`` and ``'summary'``, as returned by
            ``run_comprehensive_forensic_analysis``.
        output_base_path: Directory under which the timestamped folder is created.

    Returns:
        str: Path of the timestamped export folder.
    """

    print("📁 Exporting all forensic analysis results...")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base_export_path = Path(output_base_path) / f"forensic_analysis_{timestamp}"
    base_export_path.mkdir(parents=True, exist_ok=True)

    # Export high priority findings
    high_priority = analysis_results['high_priority_findings']
    hp_results = []
    if high_priority:
        # The same image can be flagged by several queries. Keep only its
        # best-scoring finding so it is looked up and exported once, matching
        # the per-filename de-duplication applied to the category exports below.
        best_by_file = {}
        for finding in high_priority:
            kept = best_by_file.get(finding['filename'])
            if kept is None or finding['score'] > kept['score']:
                best_by_file[finding['filename']] = finding

        print(f"🔴 Exporting {len(best_by_file)} high priority findings...")
        for filename, finding in best_by_file.items():
            # Findings only carry a preview; fetch path and full text from the database.
            row = system.conn.execute(
                "SELECT full_path, extracted_text FROM forensic_images WHERE filename = ?",
                (filename,)
            ).fetchone()
            if not row:
                print(f"⚠️ No database record for {filename}; skipped")
                continue

            hp_results.append({
                'filename': filename,
                'full_path': row[0],
                'extracted_text': row[1],
                'text_preview': finding['text_preview'],
                'similarity_score': finding['score'],
                'search_type': finding['search_type']
            })

        if hp_results:
            system.export_results(hp_results, base_export_path / "high_priority", copy_images=True)

    # Export results by query category. A query belongs to a category when any
    # of the category's keywords occurs as a substring of the lower-cased query.
    categories = {
        'financial': ['bank account', 'credit card', 'money', 'payment', 'transaction'],
        'personal_info': ['phone number', 'email', 'address', 'social security', 'identification'],
        'security': ['password', 'threat', 'weapon', 'suspicious'],
        'illegal': ['drugs', 'illegal']
    }

    exported_categories = []
    for category, keywords in categories.items():
        # Collect each file once per category, in first-seen order.
        unique_results = []
        seen_files = set()
        for query, analysis in analysis_results['all_results'].items():
            if not any(keyword in query.lower() for keyword in keywords):
                continue
            for result in analysis['results']:
                if result['filename'] not in seen_files:
                    seen_files.add(result['filename'])
                    unique_results.append(result)

        if unique_results:
            print(f"📂 Exporting {len(unique_results)} {category} findings...")
            system.export_results(unique_results, base_export_path / category, copy_images=True)
            exported_categories.append(category)

    # Create master summary
    summary_file = base_export_path / "analysis_summary.json"
    with open(summary_file, 'w', encoding='utf-8') as f:
        json.dump({
            'export_timestamp': timestamp,
            'summary': analysis_results['summary'],
            'high_priority_count': len(high_priority),
            'high_priority_files_exported': len(hp_results),
            # Only categories for which an export was actually produced.
            'categories_exported': exported_categories,
            'export_path': str(base_export_path)
        }, f, indent=2)

    print(f"✅ All results exported to: {base_export_path}")
    return str(base_export_path)

# Cell 5: Interactive Interface Functions
def create_forensic_interface(system):
    """Create the console helpers forensic analysts use to query ``system``.

    Args:
        system: Forensic system object. The helpers call its
            ``get_statistics()``, ``search_by_text_query(query)``,
            ``search_by_keywords(keywords)`` and ``export_results(results, path)``
            methods, read ``system.config['RESULTS_OUTPUT_PATH']`` and query the
            ``search_queries`` table through ``system.conn``.

    Returns:
        tuple: ``(interactive_search, display_statistics)``, two zero-argument
        callables bound to ``system``.
    """

    def display_statistics():
        """Display system statistics"""
        stats = system.get_statistics()
        images_with_text = stats.get('images_with_text', 0)
        # Clamp the denominator so an empty database prints 0.0% instead of dividing by zero.
        rate_denominator = max(stats.get('total_images', 1), 1)
        print("\n📊 FORENSIC SYSTEM STATISTICS")
        print("="*40)
        print(f"📸 Total images processed: {stats.get('total_images', 0)}")
        print(f"📝 Images with text: {images_with_text}")
        print(f"📊 Text detection rate: {(images_with_text / rate_denominator * 100):.1f}%")
        print(f"🎯 Average OCR confidence: {stats.get('avg_ocr_confidence', 0):.3f}")
        print(f"📋 Total text blocks found: {stats.get('total_text_blocks', 0)}")
        print(f"🔍 Total queries executed: {stats.get('total_queries', 0)}")
        print(f"⏱️ Average query time: {stats.get('avg_query_time', 0):.3f}s")
        print(f"💾 Total data size: {stats.get('total_file_size_mb', 0)} MB")
        print(f"📏 Average file size: {stats.get('avg_file_size_kb', 0)} KB")
        print("="*40)

    def print_help():
        """Print the list of commands understood by the interactive prompt."""
        print("\n📋 Available commands:")
        print("  search <query>     - Semantic text search")
        print("  keyword <words>    - Keyword search")
        print("  stats             - Show system statistics")
        print("  recent            - Show recent queries")
        # 'export' runs its own semantic search; it does not reuse earlier results.
        print("  export <query>    - Run a semantic search and export its results")
        print("  help              - Show this help")
        print("  quit              - Exit interface")

    def show_recent_queries():
        """Print the ten most recent rows of the ``search_queries`` table."""
        cursor = system.conn.execute(
            "SELECT query_text, results_count, query_date FROM search_queries ORDER BY query_date DESC LIMIT 10"
        )
        print("\n🕐 Recent queries:")
        for query, count, date in cursor.fetchall():
            print(f"  {date}: '{query}' ({count} results)")

    def handle_line(user_input):
        """Run one line of analyst input; return False when the loop should stop."""
        lowered = user_input.lower()
        if lowered in ('quit', 'exit', 'q'):
            return False
        if not user_input:
            return True
        if lowered == 'help':
            print_help()
            return True
        if lowered == 'stats':
            display_statistics()
            return True
        if lowered == 'recent':
            show_recent_queries()
            return True

        # Parse "<command> <argument>"
        parts = user_input.split(' ', 1)
        command = parts[0].lower()
        query = parts[1].strip() if len(parts) > 1 else ""

        if command not in ('search', 'keyword', 'export'):
            print("❌ Unknown command. Type 'help' for available commands.")
            return True
        if not query:
            # A known command without its argument is a usage error, not an unknown command.
            print(f"❌ '{command}' needs an argument, e.g. '{command} bank account'. Type 'help' for details.")
            return True

        if command == 'search':
            print(f"\n🔍 Searching: '{query}'")
            results = system.search_by_text_query(query)

            if results:
                print(f"✅ Found {len(results)} matches:")
                for i, result in enumerate(results[:5], 1):  # Show top 5
                    score = result.get('similarity_score', 0)
                    print(f"{i}. {result['filename']} (score: {score:.3f})")
                    print(f"   {result['text_preview']}")
                    print()
            else:
                print("⚪ No matches found")

        elif command == 'keyword':
            keywords = query.split()
            print(f"\n🔍 Keyword search: {keywords}")
            results = system.search_by_keywords(keywords)

            if results:
                print(f"✅ Found {len(results)} matches:")
                for i, result in enumerate(results[:5], 1):
                    keyword = result.get('matched_keyword', '')
                    print(f"{i}. {result['filename']} (keyword: '{keyword}')")
                    print(f"   {result['text_preview']}")
                    print()
            else:
                print("⚪ No matches found")

        else:  # export
            results = system.search_by_text_query(query)
            if results:
                output_path = system.export_results(results, system.config['RESULTS_OUTPUT_PATH'])
                print(f"📁 Results exported to: {output_path}")
            else:
                print("⚪ No results to export")

        return True

    def interactive_search():
        """Interactive search interface"""
        print("\n🔍 INTERACTIVE FORENSIC SEARCH")
        print("="*40)
        print("Enter 'help' for commands, 'quit' to exit")

        while True:
            # Read the prompt separately from command handling: if input() itself
            # fails (closed or unavailable stdin), retrying would fail the same way
            # and the loop would never end, so any prompt failure stops the session.
            try:
                user_input = input("\n🔍 Query> ").strip()
            except (KeyboardInterrupt, EOFError):
                print("\n👋 Goodbye!")
                break
            except Exception as e:
                print(f"❌ Error: {e}")
                break

            # Command failures are reported and the session keeps going (best effort).
            try:
                if not handle_line(user_input):
                    break
            except KeyboardInterrupt:
                print("\n👋 Goodbye!")
                break
            except Exception as e:
                print(f"❌ Error: {e}")

    return interactive_search, display_statistics

# Cell 6: Main Execution and Examples
def main_forensic_analysis():
    """Build the forensic system from ``CONFIG`` and make sure the gallery is processed.

    Returns:
        The initialised ``ForensicOCRSearchSystem``, or ``None`` when its ``ocr``
        or ``collection`` attribute is falsy after construction.
    """
    print("🚀 INITIALIZING FORENSIC OCR + SEARCH SYSTEM")
    print("=" * 60)

    forensic = ForensicOCRSearchSystem(CONFIG)

    # Readiness checks, in order: each names an attribute that must be truthy
    # and the message printed when it is not.
    readiness_checks = (
        ('ocr', "❌ OCR model not loaded. Please check installation."),
        ('collection', "❌ Search components not loaded. Please check installation."),
    )
    for attribute, failure_message in readiness_checks:
        if not getattr(forensic, attribute):
            print(failure_message)
            return None

    print("✅ System initialized successfully!")

    # Run OCR over the gallery only when the database holds no images yet.
    gallery_stats = forensic.get_statistics()
    if gallery_stats.get('total_images', 0) != 0:
        print(f"\n📊 Found {gallery_stats['total_images']} processed images, {gallery_stats['images_with_text']} with text")
    else:
        print("\n📸 No processed images found. Processing gallery...")
        forensic.process_image_gallery()

    return forensic

# Example usage functions
def run_quick_demo(system):
    """Run four sample semantic searches against ``system`` and print the top hits.

    Args:
        system: Object exposing ``search_by_text_query(query, max_results=...)``.
            When falsy, a notice is printed and nothing else happens.
    """
    if not system:
        print("❌ System not available")
        return

    print("\n🎯 QUICK DEMONSTRATION")
    print("=" * 30)

    for sample_query in ("phone number", "bank account", "password", "address"):
        print(f"\n🔍 Demo query: '{sample_query}'")
        hits = system.search_by_text_query(sample_query, max_results=3)

        if not hits:
            print("⚪ No matches")
            continue

        print(f"✅ Found {len(hits)} matches")
        # Only the two best hits are listed, even though up to three are requested.
        for hit in hits[:2]:
            print(f"  📄 {hit['filename']} (score: {hit['similarity_score']:.3f})")

def list_available_ocr_models():
    """Print every entry of ``AVAILABLE_OCR_MODELS``, marking the one selected in ``CONFIG``.

    The entry whose key equals ``CONFIG['OCR_MODEL']`` is prefixed with ✅, all
    others with ⚪. Each entry prints its name, performance note, description
    and install command, followed by a blank line.
    """
    print("\n🔍 AVAILABLE OCR MODELS")
    print("=" * 30)

    # (line prefix, key inside the model's info dict), in display order
    detail_rows = (
        ("   📊 ", 'performance'),
        ("   📝 ", 'description'),
        ("   💻 Install: ", 'install_cmd'),
    )

    for model_key in AVAILABLE_OCR_MODELS:
        details = AVAILABLE_OCR_MODELS[model_key]
        if model_key == CONFIG['OCR_MODEL']:
            marker = "✅"
        else:
            marker = "⚪"
        print(f"{marker} {details['name']} ({model_key})")
        for prefix, field in detail_rows:
            print(f"{prefix}{details[field]}")
        print()

# Show which OCR back-ends exist and which one CONFIG selects
list_available_ocr_models()

# Build the system; main_forensic_analysis() returns None when a component failed to load
print("\n🚀 STARTING FORENSIC SYSTEM...")
forensic_system = main_forensic_analysis()

if not forensic_system:
    print("❌ Failed to initialize forensic system")

else:
    # Bind the console helpers to the live system so later cells can call them directly
    interactive_search, display_stats = create_forensic_interface(forensic_system)

    # Cheat sheet of the entry points available from here on
    print(
        "\n✅ FORENSIC OCR + SEARCH SYSTEM READY!",
        "=" * 50,
        "📋 AVAILABLE FUNCTIONS:",
        "  • forensic_system.process_image_gallery() - Process new images",
        "  • forensic_system.search_by_text_query(query) - Semantic search",
        "  • forensic_system.search_by_keywords([keywords]) - Keyword search",
        "  • forensic_system.forensic_query_analysis(query) - Full analysis",
        "  • run_comprehensive_forensic_analysis(forensic_system) - Batch analysis",
        "  • interactive_search() - Interactive search interface",
        "  • display_stats() - Show system statistics",
        "  • run_quick_demo(forensic_system) - Quick demonstration",
        "=" * 50,
        sep="\n",
    )

    # Initial statistics, then a short demonstration run
    display_stats()
    run_quick_demo(forensic_system)

    print(
        "\n💡 TIP: Use interactive_search() for hands-on searching!",
        "💡 TIP: Run run_comprehensive_forensic_analysis(forensic_system) for full analysis!",
        sep="\n",
    )

# Cell 7: Usage Instructions and Documentation
# Static usage guide. It is plain text, not generated from CONFIG, so it must be
# kept in step with the configuration cell and the export functions by hand.
print("""
🎯 FORENSIC OCR + SEARCH SYSTEM - COMPLETE GUIDE
===============================================

🆕 SYSTEM FEATURES:
• 🔍 Advanced OCR with PaddleOCR/EasyOCR/Tesseract
• 📝 Semantic text search using embeddings
• 🔍 Keyword-based search with SQL
• 📊 Comprehensive forensic analysis
• 📁 Automated result export with metadata
• 💾 SQLite database for persistence
• 🎯 Interactive search interface

📋 SETUP CHECKLIST:
1. ✅ Place suspect images in the folder set by CONFIG['SUSPECTS_GALLERY_PATH']
2. ✅ Run all cells in order (1-7)
3. ✅ System automatically processes images with OCR
4. ✅ Vector database created for semantic search
5. ✅ Ready for forensic analysis

🔧 OCR MODEL OPTIONS:
• PaddleOCR: Best accuracy, 80+ languages, GPU support
• EasyOCR: Good balance, easy setup, 80+ languages
• Tesseract: Classic, reliable, widely supported

🔍 SEARCH CAPABILITIES:
• Semantic Search: "bank account information" finds related concepts
• Keyword Search: Exact/partial word matching in extracted text
• Combined Analysis: Both semantic + keyword for comprehensive results
• Batch Processing: Multiple queries for systematic investigation

🎛️ KEY FUNCTIONS:
forensic_system.search_by_text_query("bank account")
forensic_system.search_by_keywords(["phone", "number"])
forensic_system.forensic_query_analysis("suspicious activity")
run_comprehensive_forensic_analysis(forensic_system)

📊 ANALYSIS FEATURES:
• High-priority evidence detection (score > 0.5)
• Automatic categorization (financial, personal, security, illegal)
• Export results with images and metadata
• Query performance tracking
• Statistical reporting

🔍 FORENSIC QUERY EXAMPLES:
• "bank account number" - Financial information
• "phone number" - Contact information
• "threat violence" - Security concerns
• "password" - Security credentials
• "suspicious activity" - General investigation
• "weapon gun knife" - Dangerous items
• "drugs illegal substances" - Illegal content

📁 OUTPUT STRUCTURE (batch_export_all_results):
<output_base_path>/
└── forensic_analysis_20240614_143022/
    ├── analysis_summary.json   # Master summary of the export
    ├── high_priority/          # Critical findings
    ├── financial/              # Financial evidence
    ├── personal_info/          # Personal information
    ├── security/               # Security-related
    └── illegal/                # Illegal content
(category folders appear only when they have results; each export also
 writes its own search_metadata.json and search_summary.txt)

⚙️ CONFIGURATION OPTIONS:
• OCR_CONFIDENCE_THRESHOLD: Minimum OCR confidence (0.6)
• SIMILARITY_THRESHOLD: Semantic search threshold (0.3)
• TEXT_MIN_LENGTH: Minimum text length to process (3)
• MAX_RESULTS_DISPLAY: Results shown per query (10)
• BATCH_SIZE: Processing batch size (4)

🚨 FORENSIC BEST PRACTICES:
• Start with comprehensive batch analysis
• Review high-priority findings first
• Cross-reference semantic + keyword results
• Export evidence with metadata for reports
• Document analysis parameters for court
• Use interactive search for targeted investigation

📈 PERFORMANCE OPTIMIZATION:
• PaddleOCR + GPU: Best performance for large datasets
• Adjust confidence thresholds based on image quality
• Use batch processing for efficiency
• Monitor similarity scores for relevance tuning
• Regular database maintenance for speed

🔍 TROUBLESHOOTING:
• "No text found" → Check OCR confidence threshold
• "No search results" → Lower similarity threshold
• "Slow processing" → Enable GPU (DEVICE='cuda') or reduce batch size
• "Memory issues" → Process smaller batches
• "Poor OCR accuracy" → Try different OCR model

📊 EXPECTED PERFORMANCE:
• OCR Processing: 2-5 seconds per image (CPU)
• Semantic Search: 0.1-0.5 seconds per query
• Text Detection Rate: 60-90% depending on image quality
• Search Accuracy: High for clear, well-extracted text

🎯 INVESTIGATION WORKFLOW:
1. Process all suspect images with OCR
2. Run comprehensive forensic analysis
3. Review high-priority findings
4. Conduct targeted searches based on case needs
5. Export evidence with metadata
6. Generate reports for legal proceedings

💡 ADVANCED FEATURES:
• Custom query development for specific cases
• Integration with existing forensic workflows
• Batch export for evidence management
• Statistical analysis for case patterns
• Interactive search for real-time investigation
""")

✅ Configuration loaded successfully
📁 Suspects gallery: ../../datasets/images/text_ocr
📁 Database path: forensic_analysis_db
📁 Results output: ../../datasets/images/forensic_results
🔍 OCR model: paddleocr
📝 Forensic OCR + Search System Ready
✅ Core dependencies available
✅ PaddleOCR already available
✅ Search dependencies available
✅ All dependencies imported successfully
🖥️ Using CPU mode
📍 Auto-detected device: cpu

🔍 AVAILABLE OCR MODELS
✅ PaddleOCR (paddleocr)
   📊 Best overall performance
   📝 High accuracy, supports 80+ languages
   💻 Install: pip install paddlepaddle paddleocr

⚪ EasyOCR (easyocr)
   📊 Good performance, easy setup
   📝 Simple to use, good accuracy
   💻 Install: pip install easyocr

⚪ Tesseract (tesseract)
   📊 Baseline performance
   📝 Classic OCR, widely supported
   💻 Install: pip install pytesseract


🚀 STARTING FORENSIC SYSTEM...
🚀 INITIALIZING FORENSIC OCR + SEARCH SYSTEM
📁 Directory ready: ../../datasets/images/text_ocr
📁 Directory ready: forensic_analysis

[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_doc_ori), the model files will be automatically downloaded and saved in C:\Users\scott\.paddlex\official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('UVDoc', None)[0m
[33mThe model(UVDoc) is not supported to run in MKLDNN mode! Using `paddle` instead![0m
[32mUsing official model (UVDoc), the model files will be automatically downloaded and saved in C:\Users\scott\.paddlex\official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_textline_ori), the model files will be automatically downloaded and saved in C:\Users\scott\.paddlex\official_models.[0m


Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mUsing official model (PP-OCRv5_server_det), the model files will be automatically downloaded and saved in C:\Users\scott\.paddlex\official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

[32mCreating model: ('PP-OCRv5_server_rec', None)[0m
[32mUsing official model (PP-OCRv5_server_rec), the model files will be automatically downloaded and saved in C:\Users\scott\.paddlex\official_models.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

✅ PaddleOCR initialized
📥 Loading embedding model: sentence-transformers/all-MiniLM-L6-v2
📥 Initializing vector database...
✅ Search components initialized
✅ Database initialized
✅ System initialized successfully!

📸 No processed images found. Processing gallery...
🔍 Processing images from: ..\..\datasets\images\text_ocr
🔍 OCR Model: paddleocr
⏰ Start time: 2025-07-09 20:05:41
📸 Found 200 images to process

🔍 PROCESSING IMAGES
[  1/200] (  0.5%) Processing: Text_ocr_000.jpg... ❌ ERROR: predict() got an unexpected ke
[  2/200] (  1.0%) Processing: Text_ocr_001.jpg... ❌ ERROR: predict() got an unexpected ke
[  3/200] (  1.5%) Processing: Text_ocr_002.jpg... ❌ ERROR: predict() got an unexpected ke
[  4/200] (  2.0%) Processing: Text_ocr_003.jpg... ❌ ERROR: predict() got an unexpected ke
[  5/200] (  2.5%) Processing: Text_ocr_004.jpg... ❌ ERROR: predict() got an unexpected ke
[  6/200] (  3.0%) Processing: Text_ocr_005.jpg... ❌ ERROR: predict() got an unexpected ke
[  7/200] (  3.5%) Proc

  ocr_result = self.ocr.ocr(image_array, cls=True)


 ❌ ERROR: predict() got an unexpected ke
[ 15/200] (  7.5%) Processing: Text_ocr_014.jpg... ❌ ERROR: predict() got an unexpected ke
[ 16/200] (  8.0%) Processing: Text_ocr_015.jpg... ❌ ERROR: predict() got an unexpected ke
[ 17/200] (  8.5%) Processing: Text_ocr_016.jpg... ❌ ERROR: predict() got an unexpected ke
[ 18/200] (  9.0%) Processing: Text_ocr_017.jpg... ❌ ERROR: predict() got an unexpected ke
[ 19/200] (  9.5%) Processing: Text_ocr_018.jpg... ❌ ERROR: predict() got an unexpected ke
[ 20/200] ( 10.0%) Processing: Text_ocr_019.jpg... ❌ ERROR: predict() got an unexpected ke
[ 21/200] ( 10.5%) Processing: Text_ocr_020.jpg... ❌ ERROR: predict() got an unexpected ke
[ 22/200] ( 11.0%) Processing: Text_ocr_021.jpg... ❌ ERROR: predict() got an unexpected ke
[ 23/200] ( 11.5%) Processing: Text_ocr_022.jpg... ❌ ERROR: predict() got an unexpected ke
[ 24/200] ( 12.0%) Processing: Text_ocr_023.jpg... ❌ ERROR: predict() got an unexpected ke
[ 25/200] ( 12.5%) Processing: Text_ocr_024.jpg..