In [None]:
'../../../datasets/images/text_ocr'

In [4]:
# Compact Local Forensic OCR System with GUI
import os, json, sqlite3, shutil, warnings, time
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any, Optional
import numpy as np
from PIL import Image
warnings.filterwarnings("ignore")


# Environment switches intended to silence telemetry/analytics calls
# (set before any OCR library is imported further down).
os.environ.update({
    'DISABLE_TELEMETRY': '1',
    'PADDLEOCR_DISABLE_TELEMETRY': '1',
    'HUB_ANALYTICS': 'False',
})

# Default settings; CompactForensicSystem copies this dict when no config is passed.
CONFIG = {
    # OCR
    'OCR_MODEL': 'paddleocr',
    'OCR_CONFIDENCE_THRESHOLD': 0.6,
    # Local LLM
    'LLM_PROVIDER': 'ollama',
    'LLM_MODEL': 'llama3.2:latest',
    'LLM_BASE_URL': 'http://localhost:11434',
    'LLM_TEMPERATURE': 0.1,
    'LLM_MAX_TOKENS': 1000,
    # Embeddings / vector store
    'EMBEDDING_PROVIDER': 'huggingface',
    'EMBEDDING_MODEL': 'sentence-transformers/all-MiniLM-L6-v2',
    'VECTOR_STORE_PATH': './forensic_vectorstore',
    'CHUNK_SIZE': 500,
    'CHUNK_OVERLAP': 50,
    # Filesystem locations
    'SUSPECTS_GALLERY_PATH': '../../../datasets/images/text_ocr',
    'DATABASE_PATH': './forensic_analysis_db',
    'RESULTS_OUTPUT_PATH': './forensic_results',
    # Misc
    'BATCH_SIZE': 4,
    'MAX_RESULTS_DISPLAY': 10,
}

# Human-readable descriptions of each selectable OCR engine, keyed by the
# value stored in CONFIG['OCR_MODEL'].
OCR_MODELS = {
    'paddleocr': {
        'name': 'PaddleOCR',
        'desc': 'Best overall, 80+ languages, handwriting capable',
        'strengths': 'Asian text, complex layouts',
    },
    'easyocr': {
        'name': 'EasyOCR',
        'desc': 'Good for European languages, simple setup',
        'strengths': 'European text, clean images',
    },
    'tesseract': {
        'name': 'Tesseract',
        'desc': 'Classic OCR, best for clean printed text',
        'strengths': 'Printed documents, stable',
    },
    'trocr': {
        'name': 'TrOCR (Transformers)',
        'desc': 'AI-based, excellent for handwriting',
        'strengths': 'Handwritten notes, degraded images',
    },
}

print(f"🏠 LOCAL FORENSIC SYSTEM | LLM: {CONFIG['LLM_PROVIDER']}-{CONFIG['LLM_MODEL']} | 🔒 100% Private")

# Quick dependency check
def check_deps(deps=None):
    """Print and return the import availability of each required package.

    Args:
        deps: Optional iterable of package names to probe. Dashes are mapped
            to underscores before importing, so pip-style names are accepted.
            Defaults to the packages this notebook relies on.

    Returns:
        dict mapping each requested name to ``True`` (importable) or
        ``False`` (``ImportError`` raised).
    """
    import importlib

    if deps is None:
        deps = ['paddleocr', 'langchain', 'chromadb', 'sentence_transformers']

    availability = {}
    for dep in deps:
        try:
            importlib.import_module(dep.replace('-', '_'))
        except ImportError:
            availability[dep] = False
            print(f"❌ {dep} - Install: pip install {dep}")
        else:
            availability[dep] = True
            print(f"✅ {dep}")
    return availability

def check_ollama():
    """Check that the Ollama server responds and has the configured model.

    Queries ``<LLM_BASE_URL>/api/tags`` with a 5 second timeout.

    Returns:
        ``True`` only when the server answers 200 and ``CONFIG['LLM_MODEL']``
        is among the listed model names; ``False`` on a non-200 reply or on
        any error while contacting/parsing the response.
    """
    try:
        import requests
        r = requests.get(f"{CONFIG['LLM_BASE_URL']}/api/tags", timeout=5)
        if r.status_code != 200:
            return False
        models = [m['name'] for m in r.json().get('models', [])]
    except Exception:
        # `Exception` rather than a bare except so Ctrl-C / SystemExit still propagate
        print("❌ Ollama not running")
        return False

    print(f"✅ Ollama: {models}")
    return CONFIG['LLM_MODEL'] in models

# Run the startup checks once at import/cell-execution time.
check_deps()
if CONFIG['LLM_PROVIDER'] == 'ollama':
    ollama_ready = check_ollama()
else:
    # Non-Ollama providers are not probed here and are assumed usable.
    ollama_ready = True


# Add error handling for OCR initialization
def init_ocr_safe():
    """Build a PaddleOCR engine, returning ``None`` instead of raising.

    Any exception raised while importing or constructing PaddleOCR is printed
    and swallowed so the caller can continue without an OCR engine.

    NOTE(review): confirm that all four keyword arguments below are accepted
    by the PaddleOCR release actually installed — TODO verify.
    """
    engine_options = {
        'use_textline_orientation': True,
        'lang': 'en',
        'show_log': False,  # Disable logging
        'use_gpu': False,   # Force CPU to avoid GPU errors
    }
    try:
        from paddleocr import PaddleOCR
        engine = PaddleOCR(**engine_options)
    except Exception as e:
        print(f"OCR initialization error: {e}")
        engine = None
    return engine

# Compact forensic system
class CompactForensicSystem:
    def __init__(self, config=None):
        self.config = config or CONFIG.copy()
        self._setup_dirs()
        self._init_db()
        self._init_ocr()
        self._init_llm()

    def _setup_dirs(self):
        for k in ['SUSPECTS_GALLERY_PATH', 'DATABASE_PATH', 'RESULTS_OUTPUT_PATH', 'VECTOR_STORE_PATH']:
            os.makedirs(self.config[k], exist_ok=True)

    def _init_db(self):
        db_path = os.path.join(self.config['DATABASE_PATH'], 'forensic.db')
        self.conn = sqlite3.connect(db_path)
        self.conn.execute('''CREATE TABLE IF NOT EXISTS forensic_images (
            id INTEGER PRIMARY KEY, filename TEXT UNIQUE, full_path TEXT, extracted_text TEXT,
            ocr_confidence REAL, processed_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            local_analysis TEXT, risk_score REAL, contains_incriminating BOOLEAN DEFAULT 0,
            vector_stored BOOLEAN DEFAULT 0)''')
        self.conn.execute('''CREATE TABLE IF NOT EXISTS local_investigations (
            id INTEGER PRIMARY KEY, query TEXT, investigation_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            results_json TEXT, flagged_images TEXT, model_used TEXT, processing_time REAL)''')
        self.conn.commit()


    def _init_ocr(self):
        try:
            ocr_model = self.config['OCR_MODEL']
            if ocr_model == 'paddleocr':
                from paddleocr import PaddleOCR
                self.ocr = init_ocr_safe()
            elif ocr_model == 'easyocr':
                import easyocr
                self.ocr = easyocr.Reader(['en'], gpu=False)
            elif ocr_model == 'tesseract':
                import pytesseract
                self.ocr = pytesseract
            elif ocr_model == 'trocr':
                from transformers import TrOCRProcessor, VisionEncoderDecoderModel
                self.ocr_processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed')
                self.ocr_model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-printed')
                self.ocr = 'trocr'
            print(f"✅ {OCR_MODELS[ocr_model]['name']} ready")
        except Exception as e: print(f"❌ OCR error: {e}"); self.ocr = None

    def _init_llm(self):
        try:
            if self.config['LLM_PROVIDER'] == 'ollama':
                from langchain_community.llms import Ollama
                self.llm = Ollama(model=self.config['LLM_MODEL'], base_url=self.config['LLM_BASE_URL'],
                                temperature=self.config['LLM_TEMPERATURE'])
            elif self.config['LLM_PROVIDER'] == 'transformers':
                from langchain_community.llms import HuggingFacePipeline
                from transformers import pipeline
                pipe = pipeline("text-generation", model=self.config['LLM_MODEL'],
                              max_new_tokens=self.config['LLM_MAX_TOKENS'])
                self.llm = HuggingFacePipeline(pipeline=pipe)

            from langchain_community.embeddings import HuggingFaceEmbeddings
            from langchain_community.vectorstores import Chroma
            from langchain.text_splitter import RecursiveCharacterTextSplitter

            self.embeddings = HuggingFaceEmbeddings(model_name=self.config['EMBEDDING_MODEL'],
                                                  model_kwargs={'device': 'cpu'})
            self.vectorstore = Chroma(persist_directory=self.config['VECTOR_STORE_PATH'],
                                    embedding_function=self.embeddings)
            self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=self.config['CHUNK_SIZE'],
                                                              chunk_overlap=self.config['CHUNK_OVERLAP'])
            print("✅ LangChain ready")
        except Exception as e: print(f"❌ LLM error: {e}")

    def extract_text(self, img_path: str) -> Dict[str, Any]:
        try:
            img = Image.open(img_path).convert("RGB")
            result = {'filename': os.path.basename(img_path), 'full_path': img_path,
                     'extracted_text': '', 'ocr_confidence': 0.0, 'error': None}

            if self.ocr:
                ocr_model = self.config['OCR_MODEL']

                if ocr_model == 'paddleocr':
                    ocr_result = self.ocr.ocr(np.array(img), cls=True)
                    if ocr_result and ocr_result[0]:
                        texts, confs = [], []
                        for line in ocr_result:
                            for detection in line:
                                bbox, (text, conf) = detection
                                if conf >= self.config['OCR_CONFIDENCE_THRESHOLD']:
                                    texts.append(text); confs.append(conf)
                        result.update({'extracted_text': ' '.join(texts),
                                     'ocr_confidence': np.mean(confs) if confs else 0.0})

                elif ocr_model == 'easyocr':
                    ocr_result = self.ocr.readtext(np.array(img))
                    texts, confs = [], []
                    for detection in ocr_result:
                        bbox, text, conf = detection
                        if conf >= self.config['OCR_CONFIDENCE_THRESHOLD']:
                            texts.append(text); confs.append(conf)
                    result.update({'extracted_text': ' '.join(texts),
                                 'ocr_confidence': np.mean(confs) if confs else 0.0})

                elif ocr_model == 'tesseract':
                    import pytesseract
                    text = pytesseract.image_to_string(img, config='--psm 6')
                    if text.strip():
                        result.update({'extracted_text': text.strip(), 'ocr_confidence': 0.8})

                elif ocr_model == 'trocr':
                    pixel_values = self.ocr_processor(images=img, return_tensors="pt").pixel_values
                    generated_ids = self.ocr_model.generate(pixel_values)
                    text = self.ocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
                    if text.strip():
                        result.update({'extracted_text': text.strip(), 'ocr_confidence': 0.9})

            return result
        except Exception as e:
            return {'filename': os.path.basename(img_path), 'full_path': img_path,
                   'extracted_text': '', 'ocr_confidence': 0.0, 'error': str(e)}

    def analyze_text(self, text: str, filename: str) -> Dict[str, Any]:
        prompt = f'''Analyze text from image '{filename}' for criminal evidence:
Text: {text}
Look for: weapons, drugs, threats, violence, financial crimes, personal info theft.
Respond with JSON only:
{{"contains_incriminating": true/false, "risk_score": 0.0-1.0,
"categories": ["weapons","drugs","threats","financial_crimes","violence","personal_info"],
"analysis": "explanation", "keywords": ["words"], "evidence_summary": "summary"}}'''

        try:
            response = str(self.llm.invoke(prompt)).strip()
            response = response.replace('```json', '').replace('```', '').strip()
            start, end = response.find('{'), response.rfind('}') + 1
            if start >= 0 and end > start:
                parsed = json.loads(response[start:end])
                return {k: parsed.get(k, v) for k, v in [
                    ('contains_incriminating', False), ('risk_score', 0.0), ('categories', []),
                    ('analysis', ''), ('keywords', []), ('evidence_summary', '')]}
            else: return self._fallback_analysis(response, filename)
        except: return self._fallback_analysis(response if 'response' in locals() else '', filename)

    def _fallback_analysis(self, response: str, filename: str) -> Dict[str, Any]:
        keywords = {'weapons': ['weapon','gun','knife','bomb'], 'drugs': ['drug','cocaine','meth'],
                   'threats': ['threat','kill','attack'], 'financial_crimes': ['fraud','steal']}
        found_cats, found_words, risk = [], [], 0.0
        resp_lower = response.lower()
        for cat, words in keywords.items():
            for word in words:
                if word in resp_lower: found_cats.append(cat); found_words.append(word); risk += 0.15
        return {'contains_incriminating': risk > 0.3, 'risk_score': min(risk, 1.0),
               'categories': list(set(found_cats)), 'analysis': f'Fallback analysis: {response[:100]}',
               'keywords': found_words, 'evidence_summary': f'Found {len(found_words)} terms'}

    def add_to_vectorstore(self, text: str, metadata: Dict[str, Any]):
        try:
            from langchain_core.documents import Document
            chunks = self.text_splitter.split_text(text)
            docs = [Document(page_content=chunk, metadata={**metadata, 'chunk_id': i})
                   for i, chunk in enumerate(chunks)]
            self.vectorstore.add_documents(docs); self.vectorstore.persist(); return True
        except: return False

    def process_gallery(self):
        start_time = time.time()
        gallery_path = Path(self.config['SUSPECTS_GALLERY_PATH'])
        print(f"🏠 PROCESSING: {gallery_path} | Model: {self.config['LLM_MODEL']}")

        if not gallery_path.exists(): print(f"❌ Path not found: {gallery_path}"); return

        exts = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
        files = [f for ext in exts for f in list(gallery_path.glob(f"**/*{ext}")) + list(gallery_path.glob(f"**/*{ext.upper()}"))]
        total, processed, vector_stored, incriminating = len(files), 0, 0, 0

        print(f"📸 Found {total} images")
        if total == 0: print("⚠️ No images found!"); return

        for i, img_path in enumerate(files, 1):
            filename = img_path.name
            print(f"[{i:3d}/{total}] ({i/total*100:5.1f}%) {filename[:40]}...", end='')

            # Skip if processed
            if self.conn.execute("SELECT id FROM forensic_images WHERE filename = ?", (filename,)).fetchone():
                print(" ⏭️ SKIP"); continue

            # Extract and analyze
            ocr_result = self.extract_text(img_path)
            if ocr_result.get('error'): print(" ❌ ERROR"); continue

            text = ocr_result['extracted_text']
            analysis = None
            if text.strip():
                analysis = self.analyze_text(text, filename)
                metadata = {'filename': filename, 'full_path': str(img_path),
                          'ocr_confidence': ocr_result['ocr_confidence']}
                if self.add_to_vectorstore(text, metadata): vector_stored += 1

            # Store in DB
            risk = analysis['risk_score'] if analysis else 0.0
            incrim = analysis['contains_incriminating'] if analysis else False
            self.conn.execute('''INSERT OR REPLACE INTO forensic_images
                (filename, full_path, extracted_text, ocr_confidence, local_analysis,
                 risk_score, contains_incriminating, vector_stored) VALUES (?,?,?,?,?,?,?,?)''',
                (filename, str(img_path), text, ocr_result['ocr_confidence'],
                 json.dumps(analysis) if analysis else None, risk, incrim, vector_stored > 0))

            if text.strip():
                if incrim: incriminating += 1; print(f" 🚨 INCRIMINATING ({risk:.2f})")
                else: print(" ✅ TEXT")
            else: print(" ⚪ NO TEXT")

            processed += 1
            if i % 10 == 0: self.conn.commit()

        self.conn.commit()
        print(f"\n🏠 COMPLETE: {processed} processed, {vector_stored} vectorized, {incriminating} flagged, {time.time()-start_time:.1f}s")

    def investigate(self, query: str) -> Dict[str, Any]:
        print(f"🏠 INVESTIGATING: '{query}'")
        try:
            docs = self.vectorstore.similarity_search(query, k=5)
            if not docs: return {'findings': [], 'summary': 'No evidence found', 'total_flagged': 0}

            evidence, files = "", set()
            for i, doc in enumerate(docs, 1):
                filename = doc.metadata.get('filename', f'unknown_{i}')
                files.add(filename)
                evidence += f"Evidence {i} - {filename}: {doc.page_content}\n---\n"

            prompt = f'''Forensic investigation: "{query}"
Evidence: {evidence}
Respond with JSON:
{{"findings": [{{"image_number": 1, "filename": "file.jpg", "matches_query": true,
"risk_level": "high", "evidence_found": "desc", "relevant_text": "text", "confidence": 0.8}}],
"summary": "summary", "total_flagged": 2, "highest_risk_image": "file.jpg"}}'''

            response = str(self.llm.invoke(prompt)).strip()
            response = response.replace('```json', '').replace('```', '').strip()
            start, end = response.find('{'), response.rfind('}') + 1

            if start >= 0 and end > start:
                results = json.loads(response[start:end])
            else:
                results = self._fallback_investigation(query, files)

            # Store investigation
            flagged = [f.get('filename', '') for f in results.get('findings', []) if f.get('matches_query')]
            self.conn.execute('''INSERT INTO local_investigations
                (query, results_json, flagged_images, model_used) VALUES (?,?,?,?)''',
                (query, json.dumps(results), json.dumps(flagged), f"{self.config['LLM_PROVIDER']}-{self.config['LLM_MODEL']}"))
            self.conn.commit()

            print(f"✅ Found {results.get('total_flagged', 0)} matches")
            return results
        except Exception as e:
            print(f"❌ Error: {e}")
            return {'findings': [], 'summary': f'Error: {e}', 'total_flagged': 0}

    def _fallback_investigation(self, query: str, files: set) -> Dict[str, Any]:
        findings = [{'image_number': i, 'filename': f, 'matches_query': True, 'risk_level': 'medium',
                    'evidence_found': f'Related to: {query}', 'relevant_text': f'Matches: {query}', 'confidence': 0.7}
                   for i, f in enumerate(list(files)[:5], 1)]
        return {'findings': findings, 'summary': f'Fallback analysis for: {query}',
               'total_flagged': len(findings), 'highest_risk_image': list(files)[0] if files else None}

    def get_flagged(self, min_risk=0.3):
        return self.conn.execute('''SELECT filename, full_path, extracted_text, risk_score,
            local_analysis, ocr_confidence FROM forensic_images
            WHERE contains_incriminating = 1 AND risk_score >= ? ORDER BY risk_score DESC''', (min_risk,)).fetchall()

    def export_results(self, results: Dict[str, Any], query: str) -> str:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        query_safe = query.replace(' ', '_')[:30]
        output_path = Path(self.config['RESULTS_OUTPUT_PATH']) / f"investigation_{query_safe}_{timestamp}"
        output_path.mkdir(parents=True, exist_ok=True)

        flagged = [f.get('filename', '') for f in results.get('findings', []) if f.get('matches_query')]
        copied = []
        for i, filename in enumerate(flagged, 1):
            result = self.conn.execute("SELECT full_path FROM forensic_images WHERE filename = ?", (filename,)).fetchone()
            if result and Path(result[0]).exists():
                dest = output_path / f"{i:03d}_{filename}"
                shutil.copy2(result[0], dest); copied.append(str(dest))

        # Save reports
        report = {'query': query, 'timestamp': timestamp, 'model': f"{self.config['LLM_PROVIDER']}-{self.config['LLM_MODEL']}",
                 'results': results, 'flagged_images': flagged, 'copied_files': copied}

        with open(output_path / "report.json", 'w') as f: json.dump(report, f, indent=2)
        with open(output_path / "summary.txt", 'w') as f:
            f.write(f"LOCAL FORENSIC REPORT\nQuery: {query}\nDate: {timestamp}\n")
            f.write(f"Model: {self.config['LLM_PROVIDER']}-{self.config['LLM_MODEL']}\n")
            f.write(f"Flagged: {len(flagged)}\nSummary: {results.get('summary', 'N/A')}\n")

        print(f"📋 Exported to: {output_path}")
        return str(output_path)

    def close(self):
        if hasattr(self, 'conn') and self.conn: self.conn.close()

# Compact GUI
import ipywidgets as widgets
from IPython.display import display, clear_output
import matplotlib.pyplot as plt

def create_compact_gui():
    # OCR Configuration - ADD THIS TO EXISTING INTERFACE
    ocr_selector = widgets.Dropdown(
        options=[
            ('PaddleOCR - Best overall, 80+ languages', 'paddleocr'),
            ('EasyOCR - Good for European languages', 'easyocr'),
            ('Tesseract - Classic OCR for printed text', 'tesseract'),
            ('TrOCR - AI-based OCR for handwriting', 'trocr')
        ],
        value='paddleocr',
        description='OCR Engine:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='500px')
    )

    # Existing widgets from your current interface
    llm_provider = widgets.Dropdown(
        options=[('Ollama', 'ollama'), ('Transformers', 'transformers')],
        value='ollama',
        description='LLM Provider:',
        style={'description_width': 'initial'}
    )

    llm_model = widgets.Text(
        value='llama3.2:latest',
        description='LLM Model:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='300px')
    )

    gallery_path = widgets.Text(
        value='../../../datasets/images/text_ocr',
        description='Gallery Path:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='400px')
    )

    query_input = widgets.Textarea(
        value='Find weapons or violence mentions',
        description='Investigation Query:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='500px', height='60px')
    )

    confidence_slider = widgets.FloatSlider(
        value=0.60,
        min=0.1, max=0.9, step=0.05,
        description='OCR Confidence:',
        style={'description_width': 'initial'},
        readout_format='.2f'
    )

    risk_slider = widgets.FloatSlider(
        value=0.3,
        min=0.1, max=1.0, step=0.1,
        description='Risk Threshold:',
        style={'description_width': 'initial'},
        readout_format='.1f'
    )

    # Quick queries (same as your existing interface)
    quick_queries = ["Find weapons", "Look for drugs", "Financial crimes", "Personal info theft", "Threats", "Illegal coordination"]
    quick_btns = [widgets.Button(description=q, tooltip=q, layout=widgets.Layout(width='180px', margin='2px')) for q in quick_queries]

    # Action buttons (same as your existing interface)
    test_btn = widgets.Button(description='🔍 Test Setup', button_style='info', layout=widgets.Layout(width='120px'))
    process_btn = widgets.Button(description='📁 Process', button_style='primary', layout=widgets.Layout(width='120px'))
    investigate_btn = widgets.Button(description='🤖 Investigate', button_style='success', layout=widgets.Layout(width='120px'))
    flagged_btn = widgets.Button(description='🚨 Flagged', button_style='warning', layout=widgets.Layout(width='120px'))
    export_btn = widgets.Button(description='📋 Export', button_style='success', layout=widgets.Layout(width='120px'), disabled=True)
    clear_btn = widgets.Button(description='🗑️ Clear', layout=widgets.Layout(width='120px'))

    status_label = widgets.HTML(value="<b>Status:</b> Ready")
    progress_bar = widgets.IntProgress(value=0, min=0, max=100, layout=widgets.Layout(width='400px'))
    output_area = widgets.Output()

    # State
    current_system = None
    current_results = None

    def update_status(msg, progress=None):
        """Show ``msg`` in the status label and, if given, set the progress bar value."""
        status_label.value = "<b>Status:</b> " + f"{msg}"
        if progress is None:
            return
        progress_bar.value = progress

    def init_system():
        nonlocal current_system
        try:
            config = CONFIG.copy()
            config.update({
                'OCR_MODEL': ocr_selector.value,  # NOW USES SELECTED OCR
                'LLM_PROVIDER': llm_provider.value,
                'LLM_MODEL': llm_model.value,
                'SUSPECTS_GALLERY_PATH': gallery_path.value,
                'OCR_CONFIDENCE_THRESHOLD': confidence_slider.value
            })
            current_system = CompactForensicSystem(config)
            return True
        except Exception as e:
            print(f"❌ Init error: {e}")
            return False

    # Button handlers (same functionality, now with OCR support)
    def on_quick_query(query):
        """Return a click handler that copies ``query`` into the query text area."""
        # `query` is bound per call, so each button gets its own text.
        return lambda b: setattr(query_input, 'value', query)

    for i, btn in enumerate(quick_btns):
        btn.on_click(on_quick_query(quick_queries[i]))

    def on_test(b):
        """Click handler: check the selected OCR package, other dependencies,
        the Ollama server (when selected) and finally a full system init,
        printing the outcome of each step into the output area."""
        with output_area:
            clear_output(wait=True)
            update_status("Testing setup...", 20)
            print("🔍 TESTING LOCAL SETUP")
            print("="*40)

            # Test OCR first
            selected_ocr = ocr_selector.value
            print(f"🔍 Selected OCR: {selected_ocr}")
            ocr_info = {
                'paddleocr': 'Best overall, 80+ languages, handwriting',
                'easyocr': 'Good for European languages, clean text',
                'tesseract': 'Excellent for printed documents, stable',
                'trocr': 'Perfect for handwritten notes, degraded images'
            }
            if selected_ocr in ocr_info:
                print(f"   Purpose: {ocr_info[selected_ocr]}")

            # Test OCR availability: engine key -> (module to import, label to print)
            ocr_modules = {
                'paddleocr': ('paddleocr', 'PaddleOCR'),
                'easyocr': ('easyocr', 'EasyOCR'),
                'tesseract': ('pytesseract', 'Tesseract'),
                'trocr': ('transformers', 'TrOCR'),
            }
            if selected_ocr in ocr_modules:
                module_name, label = ocr_modules[selected_ocr]
                try:
                    __import__(module_name)
                    print(f"✅ {label} available")
                except ImportError:
                    print(f"❌ {selected_ocr} not installed")

            # Test other dependencies
            deps = [('langchain', 'LangChain'), ('chromadb', 'ChromaDB'), ('sentence_transformers', 'Transformers')]
            for dep, name in deps:
                try:
                    __import__(dep.replace('-', '_'))
                    print(f"✅ {name}")
                except ImportError:
                    print(f"❌ {name} not found")

            # Test Ollama
            if llm_provider.value == 'ollama':
                update_status("Testing Ollama...", 60)
                try:
                    import requests
                    # Use the configured base URL so this check probes the same
                    # server the system will actually talk to.
                    r = requests.get(f"{CONFIG['LLM_BASE_URL']}/api/tags", timeout=5)
                    if r.status_code == 200:
                        models = [m['name'] for m in r.json().get('models', [])]
                        print(f"✅ Ollama: {models}")
                        if llm_model.value in models:
                            print(f"✅ Model '{llm_model.value}' ready")
                        else:
                            print(f"⚠️ Model '{llm_model.value}' not found. Install: ollama pull {llm_model.value}")
                    else:
                        print("❌ Ollama not responding")
                except Exception:
                    # not a bare except: Ctrl-C must still interrupt the check
                    print("❌ Ollama connection failed. Start: ollama serve")

            update_status("Testing system init...", 80)
            if init_system():
                print("✅ System ready with selected OCR")
                update_status("Setup complete!", 100)
            else:
                print("❌ System init failed")
                update_status("Setup failed", 0)

    def on_process(b):
        """Click handler: re-initialise the system from the widgets, process the
        gallery and report how many images are flagged at the current risk threshold."""
        with output_area:
            clear_output(wait=True)
            update_status("Processing gallery...", 20)
            system_ready = init_system()
            if not system_ready:
                update_status("Init failed", 0)
                return

            print(f"🏠 PROCESSING | OCR: {ocr_selector.value} | LLM: {llm_provider.value}-{llm_model.value}")
            try:
                current_system.process_gallery()
                update_status("Processing complete!", 100)
                flagged_rows = current_system.get_flagged(risk_slider.value)
                if not flagged_rows:
                    print("\n✅ No high-risk content detected")
                else:
                    print(f"\n🚨 FLAGGED: {len(flagged_rows)} images above risk {risk_slider.value}")
            except Exception as e:
                update_status("Processing error", 0)
                print(f"❌ Error: {e}")

    def on_investigate(b):
        nonlocal current_results
        with output_area:
            clear_output(wait=True)
            if not current_system and not init_system(): print("❌ System not ready"); return

            query = query_input.value.strip()
            if not query: print("⚠️ Enter investigation query"); return

            update_status("Investigating...", 50)
            print(f"🏠 INVESTIGATING: '{query}' | OCR: {ocr_selector.value} | LLM: {llm_model.value}")
            try:
                current_results = current_system.investigate(query)
                if current_results.get('total_flagged', 0) > 0:
                    update_status(f"Found {current_results['total_flagged']} matches", 100)
                    export_btn.disabled = False
                    print(f"📊 Summary: {current_results.get('summary', 'N/A')}")
                    print("🚨 FLAGGED:")
                    for f in current_results.get('findings', []):
                        if f.get('matches_query'):
                            print(f"  {f['risk_level'].upper()}: {f['filename']} - {f['evidence_found']}")

                    # Display images
                    flagged_files = [f['filename'] for f in current_results.get('findings', []) if f.get('matches_query')]
                    if flagged_files:
                        print(f"🖼️ Displaying {len(flagged_files)} flagged images...")
                        display_flagged_images(flagged_files[:6], current_system)
                else:
                    update_status("No matches", 100); export_btn.disabled = True
                    print("❌ No matches. Try different terms or lower risk threshold.")
            except Exception as e: update_status("Investigation error", 0); print(f"❌ Error: {e}")

    def on_flagged(b):
        """Click handler: list every stored image flagged at or above the risk
        slider value (with level, OCR confidence, text preview and categories)
        and display up to six of them."""
        with output_area:
            clear_output(wait=True)
            if not current_system and not init_system():
                print("❌ System not ready")
                return

            update_status("Loading flagged images...", 50)
            try:
                flagged = current_system.get_flagged(risk_slider.value)
                if flagged:
                    print(f"🚨 FLAGGED IMAGES (Risk ≥ {risk_slider.value})")
                    for i, (filename, _, text, risk, analysis, conf) in enumerate(flagged, 1):
                        level = "CRITICAL" if risk > 0.8 else "HIGH" if risk > 0.6 else "MEDIUM" if risk > 0.3 else "LOW"
                        print(f"{i}. {filename} | {level} ({risk:.3f}) | OCR: {conf:.3f}")
                        # `text or ''` guards against a NULL text column
                        print(f"   Text: {(text or '')[:100]}...")
                        if analysis:
                            try:
                                data = json.loads(analysis)
                                cats = data.get('categories', [])
                                if cats:
                                    print(f"   Categories: {', '.join(cats)}")
                            except (ValueError, TypeError, AttributeError):
                                # Stored analysis is best-effort LLM output; an
                                # unreadable entry only loses the category line.
                                pass
                        print()

                    files = [r[0] for r in flagged[:6]]
                    print(f"🖼️ Displaying {len(files)} flagged images...")
                    display_flagged_images(files, current_system)
                    update_status(f"Showing {len(flagged)} flagged images", 100)
                else:
                    print(f"✅ No images flagged above risk {risk_slider.value}")
                    update_status("No flagged images", 100)
            except Exception as e:
                update_status("Error loading flagged", 0)
                print(f"❌ Error: {e}")

    def on_export(b):
        """Click handler: export the most recent investigation results to disk."""
        with output_area:
            if not current_results:
                print("⚠️ No results to export")
                return
            try:
                update_status("Exporting...", 50)
                # Fall back to a generic label when the query box is empty
                export_query = query_input.value or "investigation"
                path = current_system.export_results(current_results, export_query)
                update_status("Export complete!", 100)
                print(f"✅ Results exported to: {path}")
            except Exception as e:
                update_status("Export error", 0)
                print(f"❌ Error: {e}")

    def on_clear(b):
        nonlocal current_results
        current_results = None; export_btn.disabled = True
        with output_area: clear_output(); update_status("Cleared", 0); print("🗑️ Cleared")

    # Connect events
    test_btn.on_click(on_test); process_btn.on_click(on_process); investigate_btn.on_click(on_investigate)
    flagged_btn.on_click(on_flagged); export_btn.on_click(on_export); clear_btn.on_click(on_clear)

    # Quick query grid
    quick_grid = widgets.GridBox(quick_btns, layout=widgets.Layout(grid_template_columns='repeat(3, 1fr)', grid_gap='5px'))

    # UPDATED LAYOUT: Add OCR selector at the top
    controls = widgets.VBox([
        widgets.HTML("<h3>🏠 Compact Local Forensic System</h3>"),

        # ADD OCR SELECTOR AT THE TOP
        widgets.HTML("<b>🔍 OCR Engine Selection:</b>"),
        ocr_selector,
        widgets.HTML("<br>"),

        # Existing LLM configuration
        widgets.HBox([llm_provider, llm_model]),
        gallery_path,
        query_input,
        widgets.HTML("<b>Quick Queries:</b>"),
        quick_grid,
        widgets.HBox([confidence_slider, risk_slider]),
        widgets.HBox([test_btn, process_btn, investigate_btn, flagged_btn, export_btn, clear_btn]),
        status_label,
        progress_bar,
        widgets.HTML("<hr>")
    ])

    return widgets.VBox([controls, output_area])

    confidence_slider = widgets.FloatSlider(
        value=CONFIG['OCR_CONFIDENCE_THRESHOLD'],
        min=0.1, max=0.9, step=0.05,
        description='OCR Confidence:',
        style={'description_width': 'initial'},
        readout_format='.2f'
    )

    risk_slider = widgets.FloatSlider(
        value=0.3,
        min=0.1, max=1.0, step=0.1,
        description='Risk Threshold:',
        style={'description_width': 'initial'},
        readout_format='.1f'
    )

    # Quick queries
    quick_queries = ["Find weapons", "Look for drugs", "Financial crimes", "Personal info theft", "Threats", "Illegal coordination"]
    quick_btns = [widgets.Button(description=q[:20], tooltip=q, layout=widgets.Layout(width='180px', margin='2px')) for q in quick_queries]

    # Action buttons
    test_btn = widgets.Button(description='🔍 Test Setup', button_style='info', layout=widgets.Layout(width='120px'))
    process_btn = widgets.Button(description='📁 Process', button_style='primary', layout=widgets.Layout(width='120px'))
    investigate_btn = widgets.Button(description='🤖 Investigate', button_style='success', layout=widgets.Layout(width='120px'))
    flagged_btn = widgets.Button(description='🚨 Flagged', button_style='warning', layout=widgets.Layout(width='120px'))
    export_btn = widgets.Button(description='📋 Export', button_style='success', layout=widgets.Layout(width='120px'), disabled=True)
    clear_btn = widgets.Button(description='🗑️ Clear', layout=widgets.Layout(width='120px'))

    status_label = widgets.HTML(value="<b>Status:</b> Ready")
    progress_bar = widgets.IntProgress(value=0, min=0, max=100, layout=widgets.Layout(width='400px'))
    output_area = widgets.Output()

    # State
    current_system = None
    current_results = None

    def update_status(msg, progress=None):
        status_label.value = f"<b>Status:</b> {msg}"
        if progress is not None: progress_bar.value = progress

    def init_system():
        nonlocal current_system
        try:
            config = CONFIG.copy()
            config.update({
                'OCR_MODEL': ocr_selector.value,
                'LLM_PROVIDER': llm_provider.value,
                'LLM_MODEL': llm_model.value,
                'SUSPECTS_GALLERY_PATH': gallery_path.value,
                'OCR_CONFIDENCE_THRESHOLD': confidence_slider.value
            })
            current_system = CompactForensicSystem(config)
            return True
        except Exception as e:
            print(f"❌ Init error: {e}")
            return False

    # Button handlers
    def on_quick_query(query):
        def handler(b): query_input.value = query
        return handler

    for i, btn in enumerate(quick_btns): btn.on_click(on_quick_query(quick_queries[i]))

    def on_test(b):
        with output_area:
            clear_output(wait=True)
            update_status("Testing setup...", 20)
            print("🔍 TESTING LOCAL SETUP")
            print("="*40)

            # Test OCR first
            print(f"Testing OCR: {ocr_selector.value}")
            ocr_info = OCR_MODELS.get(ocr_selector.value, {})
            print(f"Purpose: {ocr_info.get('strengths', 'General OCR')}")

            try:
                if ocr_selector.value == 'paddleocr':
                    import paddleocr; print("✅ PaddleOCR available")
                elif ocr_selector.value == 'easyocr':
                    import easyocr; print("✅ EasyOCR available")
                elif ocr_selector.value == 'tesseract':
                    import pytesseract; print("✅ Tesseract available")
                elif ocr_selector.value == 'trocr':
                    import transformers; print("✅ TrOCR (Transformers) available")
            except ImportError as e: print(f"❌ {ocr_selector.value} not found: {e}")

            # Test other dependencies
            deps = [('paddleocr', 'PaddleOCR'), ('langchain', 'LangChain'), ('chromadb', 'ChromaDB'), ('sentence_transformers', 'Transformers')]
            for dep, name in deps:
                try: __import__(dep.replace('-', '_')); print(f"✅ {name}")
                except ImportError: print(f"❌ {name} not found")

            # Test Ollama
            if llm_provider.value == 'ollama':
                update_status("Testing Ollama...", 60)
                try:
                    import requests
                    r = requests.get("http://localhost:11434/api/tags", timeout=5)
                    if r.status_code == 200:
                        models = [m['name'] for m in r.json().get('models', [])]
                        print(f"✅ Ollama: {models}")
                        if llm_model.value in models: print(f"✅ Model '{llm_model.value}' ready")
                        else: print(f"⚠️ Model '{llm_model.value}' not found. Install: ollama pull {llm_model.value}")
                    else: print("❌ Ollama not responding")
                except: print("❌ Ollama connection failed. Start: ollama serve")

            update_status("Testing system init...", 80)
            if init_system(): print("✅ System ready"); update_status("Setup complete!", 100)
            else: print("❌ System init failed"); update_status("Setup failed", 0)

    def on_process(b):
        with output_area:
            clear_output(wait=True)
            update_status("Processing gallery...", 20)
            if not init_system(): update_status("Init failed", 0); return

            print(f"🏠 PROCESSING | OCR: {ocr_selector.value} | LLM: {llm_provider.value}-{llm_model.value}")
            try:
                current_system.process_gallery()
                update_status("Processing complete!", 100)
                flagged = current_system.get_flagged(risk_slider.value)
                if flagged: print(f"\n🚨 FLAGGED: {len(flagged)} images above risk {risk_slider.value}")
                else: print("\n✅ No high-risk content detected")
            except Exception as e: update_status("Processing error", 0); print(f"❌ Error: {e}")

    def on_investigate(b):
        nonlocal current_results
        with output_area:
            clear_output(wait=True)
            if not current_system and not init_system(): print("❌ System not ready"); return

            query = query_input.value.strip()
            if not query: print("⚠️ Enter investigation query"); return

            update_status("Investigating...", 50)
            print(f"🏠 INVESTIGATING: '{query}' | Model: {llm_model.value}")
            try:
                current_results = current_system.investigate(query)
                if current_results.get('total_flagged', 0) > 0:
                    update_status(f"Found {current_results['total_flagged']} matches", 100)
                    export_btn.disabled = False
                    print(f"📊 Summary: {current_results.get('summary', 'N/A')}")
                    print("🚨 FLAGGED:")
                    for f in current_results.get('findings', []):
                        if f.get('matches_query'):
                            print(f"  {f['risk_level'].upper()}: {f['filename']} - {f['evidence_found']}")

                    # Display images
                    flagged_files = [f['filename'] for f in current_results.get('findings', []) if f.get('matches_query')]
                    if flagged_files:
                        print(f"🖼️ Displaying {len(flagged_files)} flagged images...")
                        display_flagged_images(flagged_files[:6], current_system)
                else:
                    update_status("No matches", 100); export_btn.disabled = True
                    print("❌ No matches. Try different terms or lower risk threshold.")
            except Exception as e: update_status("Investigation error", 0); print(f"❌ Error: {e}")

    def on_flagged(b):
        with output_area:
            clear_output(wait=True)
            if not current_system and not init_system(): print("❌ System not ready"); return

            update_status("Loading flagged images...", 50)
            try:
                flagged = current_system.get_flagged(risk_slider.value)
                if flagged:
                    print(f"🚨 FLAGGED IMAGES (Risk ≥ {risk_slider.value})")
                    for i, (filename, _, text, risk, analysis, conf) in enumerate(flagged, 1):
                        level = "CRITICAL" if risk > 0.8 else "HIGH" if risk > 0.6 else "MEDIUM" if risk > 0.3 else "LOW"
                        print(f"{i}. {filename} | {level} ({risk:.3f}) | OCR: {conf:.3f}")
                        print(f"   Text: {text[:100]}...")
                        if analysis:
                            try:
                                data = json.loads(analysis)
                                cats = data.get('categories', [])
                                if cats: print(f"   Categories: {', '.join(cats)}")
                            except: pass
                        print()

                    files = [r[0] for r in flagged[:6]]
                    display_flagged_images(files, current_system)
                    update_status(f"Showing {len(flagged)} flagged images", 100)
                else:
                    print(f"✅ No images flagged above risk {risk_slider.value}")
                    update_status("No flagged images", 100)
            except Exception as e: update_status("Error loading flagged", 0); print(f"❌ Error: {e}")

    def on_export(b):
        with output_area:
            if not current_results: print("⚠️ No results to export"); return
            try:
                update_status("Exporting...", 50)
                path = current_system.export_results(current_results, query_input.value or "investigation")
                update_status("Export complete!", 100)
                print(f"✅ Results exported to: {path}")
            except Exception as e: update_status("Export error", 0); print(f"❌ Error: {e}")

    def on_clear(b):
        nonlocal current_results
        current_results = None; export_btn.disabled = True
        with output_area: clear_output(); update_status("Cleared", 0); print("🗑️ Cleared")

    # Connect events
    test_btn.on_click(on_test); process_btn.on_click(on_process); investigate_btn.on_click(on_investigate)
    flagged_btn.on_click(on_flagged); export_btn.on_click(on_export); clear_btn.on_click(on_clear)

    # Layout
    quick_grid = widgets.GridBox(quick_btns, layout=widgets.Layout(grid_template_columns='repeat(3, 1fr)', grid_gap='5px'))
    controls = widgets.VBox([
        widgets.HTML("<h3>🏠 Compact Local Forensic System</h3>"),
        widgets.HBox([llm_provider, llm_model]), gallery_path, query_input,
        widgets.HTML("<b>Quick Queries:</b>"), quick_grid,
        widgets.HBox([confidence_slider, risk_slider]),
        widgets.HBox([test_btn, process_btn, investigate_btn, flagged_btn, export_btn, clear_btn]),
        status_label, progress_bar, widgets.HTML("<hr>")
    ])

    return widgets.VBox([controls, output_area])

    for i, filename in enumerate(filenames[:num]):
        ax = axes[i] if num > 1 else axes
        try:
            result = system.conn.execute('''SELECT full_path, extracted_text, risk_score, local_analysis
                FROM forensic_images WHERE filename = ?''', (filename,)).fetchone()
            if not result: ax.text(0.5, 0.5, f"Not found\n{filename}", ha='center', va='center', transform=ax.transAxes); ax.axis('off'); continue

            full_path, text, risk, analysis = result
            try: img = plt.imread(full_path); ax.imshow(img)
            except: ax.text(0.5, 0.5, f"Load error\n{filename}", ha='center', va='center', transform=ax.transAxes); ax.axis('off'); continue

            # Risk level and color
            level, color = ("CRITICAL", "red") if risk > 0.8 else ("HIGH", "orange") if risk > 0.6 else ("MEDIUM", "yellow") if risk > 0.3 else ("LOW", "green")
            ax.set_title(f"{filename}\n{level}: {risk:.2f}", fontsize=10, color=color, fontweight='bold')

            # Text overlay
            text_preview = text[:80] if text else "No text"
            ax.text(0.02, 0.98, f"Text: {text_preview}...", transform=ax.transAxes, fontsize=8,
                   verticalalignment='top', bbox=dict(boxstyle="round,pad=0.3", facecolor=color, alpha=0.7))

            # Categories
            if analysis:
                try:
                    data = json.loads(analysis)
                    cats = data.get('categories', [])
                    if cats: ax.text(0.02, 0.02, f"Cat: {', '.join(cats[:2])}", transform=ax.transAxes, fontsize=7,
                                   verticalalignment='bottom', bbox=dict(boxstyle="round,pad=0.2", facecolor="lightblue", alpha=0.8))
                except: pass
            ax.axis('off')
        except Exception as e: ax.text(0.5, 0.5, f"Error\n{filename}\n{str(e)[:30]}", ha='center', va='center', transform=ax.transAxes); ax.axis('off')

    for j in range(num, len(axes)): axes[j].axis('off')
    plt.tight_layout(); plt.suptitle('🏠 Local Forensic Results', fontsize=14, y=1.02); plt.show()

# Quick functions
def quick_setup(base_url: str = "http://localhost:11434", model: str = "llama3.2:latest"):
    """Print a quick health check of the Python dependencies and the Ollama server.

    Each dependency is imported in turn and reported as available or missing.
    The Ollama server is then asked for its model list via ``/api/tags`` and
    the presence of ``model`` in that list is reported.

    Args:
        base_url: Root URL of the Ollama server to query.
        model: Model name expected to be installed on that server.

    Returns:
        None. All findings are printed; failures are reported, never raised.
    """
    print("🏠 QUICK SETUP CHECK")
    deps = ['paddleocr', 'langchain', 'chromadb', 'sentence_transformers']
    for dep in deps:
        try:
            __import__(dep.replace('-', '_'))
        except ImportError:
            print(f"❌ {dep} - pip install {dep}")
        else:
            print(f"✅ {dep}")

    # A missing HTTP client is a different problem from a stopped server,
    # so report it separately instead of blaming Ollama.
    try:
        import requests
    except ImportError:
        print("❌ Ollama not available - 'requests' is not installed: pip install requests")
        return

    try:
        r = requests.get(f"{base_url.rstrip('/')}/api/tags", timeout=5)
        if r.status_code == 200:
            models = [m['name'] for m in r.json().get('models', [])]
            print(f"✅ Ollama: {models}")
            if model in models:
                print("✅ Recommended model ready")
            else:
                print(f"⚠️ Install: ollama pull {model}")
        else:
            print("❌ Ollama not responding")
    except Exception as e:
        # Catch Exception rather than everything: Ctrl-C and interpreter exit
        # must still propagate, and the cause is shown to aid diagnosis.
        print(f"❌ Ollama not available - Start: ollama serve ({e})")

def quick_investigate(query: str, gallery_path: str = '../../../datasets/images/text_ocr'):
    """Run a single investigation without the GUI.

    Builds a CompactForensicSystem from a copy of CONFIG pointed at
    ``gallery_path``, processes the gallery when the database reports no
    vector-stored images, runs ``query`` and exports the results if anything
    was flagged.

    Args:
        query: Investigation query passed to ``system.investigate``.
        gallery_path: Image folder stored as ``SUSPECTS_GALLERY_PATH``.

    Returns:
        The results returned by ``system.investigate``, or ``None`` when any
        step raised (the error is printed, not propagated).
    """
    print(f"🏠 Quick Investigation: '{query}' | 🔒 100% Local")
    system = None
    try:
        config = CONFIG.copy()
        config['SUSPECTS_GALLERY_PATH'] = gallery_path
        system = CompactForensicSystem(config)

        # Only run the gallery pipeline when nothing has been vector-stored yet.
        stored = system.conn.execute(
            "SELECT COUNT(*) FROM forensic_images WHERE vector_stored = 1"
        ).fetchone()[0]
        if stored == 0:
            print("📁 Processing gallery first...")
            system.process_gallery()

        results = system.investigate(query)
        if results.get('total_flagged', 0) > 0:
            path = system.export_results(results, query)
            print(f"📋 Exported to: {path}")
        return results
    except Exception as e:
        print(f"❌ Error: {e}")
        return None
    finally:
        # close() must also run on the failure paths; otherwise whatever it
        # releases (presumably the database behind system.conn) stays open.
        if system is not None:
            try:
                system.close()
            except Exception as e:
                print(f"⚠️ Error while closing system: {e}")

# Build the interface, render it, then print the usage banner
print("🏠 Creating Compact Local Forensic Interface...")
print("🔧 Debug: OCR models available:", list(OCR_MODELS))

compact_interface = create_compact_gui()

# Reaching this point means create_compact_gui() returned without raising
print("\n".join((
    "✅ GUI created successfully",
    "🔍 OCR selector should be visible at the top",
)))

display(compact_interface)

# Usage banner and OCR engine guide, framed by 50-character rules
print("\n".join((
    "\n🏠 COMPACT LOCAL FORENSIC SYSTEM READY!",
    "=" * 50,
    "🔒 PRIVACY: 100% local, no API calls, offline capable",
    "🤖 COMPONENTS: Multi-OCR + Ollama/Transformers + ChromaDB + SQLite",
    "📝 OCR OPTIONS: PaddleOCR, EasyOCR, Tesseract, TrOCR",
    "🚀 SETUP: ollama serve → ollama pull llama3.2:latest",
    "💡 QUICK: quick_setup() | quick_investigate('find weapons')",
    "=" * 50,
    "\n🔍 OCR SELECTION GUIDE:",
    "• PaddleOCR: Best overall, Asian languages, handwriting",
    "• EasyOCR: European languages, clean text",
    "• Tesseract: Printed documents, very stable",
    "• TrOCR: Handwritten notes, degraded images",
    "=" * 50,
)))

🏠 LOCAL FORENSIC SYSTEM | LLM: ollama-llama3.2:latest | 🔒 100% Private
✅ paddleocr
✅ langchain
✅ chromadb
✅ sentence_transformers
✅ Ollama: ['deepseek-r1:1.5b', 'llama3.2:latest']
🏠 Creating Compact Local Forensic Interface...
🔧 Debug: OCR models available: ['paddleocr', 'easyocr', 'tesseract', 'trocr']
✅ GUI created successfully
🔍 OCR selector should be visible at the top


VBox(children=(VBox(children=(HTML(value='<h3>🏠 Compact Local Forensic System</h3>'), HTML(value='<b>🔍 OCR Eng…


🏠 COMPACT LOCAL FORENSIC SYSTEM READY!
🔒 PRIVACY: 100% local, no API calls, offline capable
🤖 COMPONENTS: Multi-OCR + Ollama/Transformers + ChromaDB + SQLite
📝 OCR OPTIONS: PaddleOCR, EasyOCR, Tesseract, TrOCR
🚀 SETUP: ollama serve → ollama pull llama3.2:latest
💡 QUICK: quick_setup() | quick_investigate('find weapons')

🔍 OCR SELECTION GUIDE:
• PaddleOCR: Best overall, Asian languages, handwriting
• EasyOCR: European languages, clean text
• Tesseract: Printed documents, very stable
• TrOCR: Handwritten notes, degraded images
