In [1]:
# Cell 1: Configuration and Setup
import os
import shutil
import torch
from PIL import Image
import cv2
import numpy as np
from pathlib import Path
import ipywidgets as widgets
from IPython.display import display, clear_output
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Configuration Settings
CONFIG = dict(
    # -- Model ------------------------------------------------------------
    # Hugging Face id of the CLIP checkpoint loaded by default.
    MODEL_NAME='openai/clip-vit-base-patch32',
    # 'auto', 'cpu' or 'cuda'; 'auto' is replaced by the detected device in
    # the dependency-import cell.
    DEVICE='auto',

    # -- Filesystem -------------------------------------------------------
    # Folder scanned for input images.
    SUSPECTS_GALLERY_PATH='../../datasets/images/objects/raw',
    # Folder that receives copies of matched images.
    RESULTS_OUTPUT_PATH='../../datasets/images/objects/detections',

    # -- Matching ---------------------------------------------------------
    # Set from the "Min Confidence" slider; the patch filter in
    # process_image_batch compares against SIMILARITY_THRESHOLD, not this.
    CONFIDENCE_THRESHOLD=0.25,
    # Minimum text/patch score for a patch to be reported.
    SIMILARITY_THRESHOLD=0.2,
    # Upper bound on the number of patches scored per image.
    TOP_K_PATCHES=20,

    # -- Processing / display ---------------------------------------------
    # Default value handed to the batch_size arguments of the search helpers.
    BATCH_SIZE=4,
    # Number of results rendered after an interactive search.
    MAX_RESULTS_DISPLAY=10,
    # Matplotlib figsize used for the results grid.
    FIGURE_SIZE=(12, 8),
    # Side length (px) of each square sliding-window patch.
    PATCH_SIZE=64,
    # Step (px) between consecutive sliding windows.
    STRIDE=32,
)

# Available CLIP model options (GLIP alternative)
# Catalogue of selectable CLIP checkpoints keyed by a short name. Every entry
# holds a display name, the Hugging Face model id and two short blurbs that
# are shown to the user. All listed checkpoints live under the "openai/"
# namespace, so the model id is derived from the short key.
AVAILABLE_MODELS = {
    short_key: {
        'name': display_name,
        'model_id': f'openai/{short_key}',
        'description': description,
        'performance': performance,
    }
    for short_key, display_name, description, performance in (
        (
            'clip-vit-base-patch32',
            'CLIP ViT-Base Patch32',
            'Standard CLIP model - good balance of speed and accuracy',
            'Base performance, good speed',
        ),
        (
            'clip-vit-base-patch16',
            'CLIP ViT-Base Patch16',
            'Higher resolution CLIP - better accuracy, slower',
            'Better accuracy, moderate speed',
        ),
        (
            'clip-vit-large-patch14',
            'CLIP ViT-Large Patch14',
            'Largest CLIP model - best accuracy, slowest',
            'Best accuracy, slowest speed',
        ),
    )
}

# Echo the active settings so the cell output records what was loaded.
print("\n".join([
    "✅ Configuration loaded successfully",
    f"📁 Suspects gallery: {CONFIG['SUSPECTS_GALLERY_PATH']}",
    f"📁 Results output: {CONFIG['RESULTS_OUTPUT_PATH']}",
    f"🔍 Selected model: {CONFIG['MODEL_NAME']}",
    "📝 Note: Using CLIP for GLIP-like functionality (text-image grounding)",
]))

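# Cell 2: Install and Import Dependencies
# Installs the required packages when `transformers` is missing. Note that Cell 1
# already imports torch, cv2, ipywidgets and matplotlib, so on a fresh environment
# this cell has to be run before Cell 1.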
try:
    import transformers
    print("✅ Transformers already installed")
except ImportError:
    print("⚠️ Installing required packages...")
    !pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
    !pip install transformers
    !pip install ipywidgets
    !pip install Pillow
    !pip install matplotlib
    !pip install opencv-python
    print("📦 Installation complete")

# Import required libraries
try:
    import torch
    import torch.nn.functional as F
    from transformers import CLIPProcessor, CLIPModel
    import requests
    from PIL import Image
    print("✅ All dependencies imported successfully")

    # Report the runtime and decide which device 'auto' should resolve to.
    print(f"🔧 PyTorch version: {torch.__version__}")
    default_device = "cuda" if torch.cuda.is_available() else "cpu"
    print(
        f"🚀 CUDA available: {torch.cuda.get_device_name(0)}"
        if default_device == "cuda"
        else "🖥️ Using CPU mode (CUDA not available)"
    )

    # Only an 'auto' setting is overwritten; an explicit 'cpu'/'cuda' is kept.
    if CONFIG['DEVICE'] == 'auto':
        CONFIG.update(DEVICE=default_device)
        print(f"📍 Auto-detected device: {CONFIG['DEVICE']}")

except ImportError as import_error:
    # A missing package is reported but not re-raised, so later cells that
    # need these names will fail until the imports succeed.
    print("\n".join((
        f"❌ Import error: {import_error}",
        "🔧 Troubleshooting steps:",
        "1. Restart kernel and run this cell again",
        "2. Check if all packages installed correctly",
    )))

# Cell 3: Initialize Model and Directories
def setup_directories():
    """Ensure the gallery (input) and results (output) folders exist.

    Both paths are read from ``CONFIG``; folders that already exist are left
    untouched. The confirmation line is printed either way.
    """
    gallery_dir = CONFIG['SUSPECTS_GALLERY_PATH']
    results_dir = CONFIG['RESULTS_OUTPUT_PATH']
    for folder in (gallery_dir, results_dir):
        os.makedirs(folder, exist_ok=True)
    print(f"📁 Created directories: {gallery_dir}, {results_dir}")

def load_clip_model(model_name=None):
    """Load a CLIP checkpoint and its processor from Hugging Face.

    Args:
        model_name: Checkpoint id to load. When ``None`` the value of
            ``CONFIG['MODEL_NAME']`` is used.

    Returns:
        ``(model, processor, device, model_name)`` on success, with the model
        moved to ``CONFIG['DEVICE']`` and put in eval mode. If any step
        raises, troubleshooting hints are printed and
        ``(None, None, None, None)`` is returned instead.
    """
    try:
        model_name = CONFIG['MODEL_NAME'] if model_name is None else model_name
        device = CONFIG['DEVICE']

        for banner in (
            f"📥 Loading CLIP model: {model_name}",
            f"🖥️ Target device: {device}",
            "📝 Using CLIP for grounded language-image understanding",
            "⏳ Loading processor...",
        ):
            print(banner)
        clip_processor = CLIPProcessor.from_pretrained(model_name)

        print("⏳ Loading model weights...")
        clip_model = CLIPModel.from_pretrained(model_name)

        print(f"📍 Moving model to {device}...")
        clip_model = clip_model.to(device)
        clip_model.eval()  # inference only

        for banner in (
            f"✅ Model loaded successfully!",
            f"   🔍 Model: {model_name}",
            f"   🖥️ Device: {device}",
        ):
            print(banner)

        return clip_model, clip_processor, device, model_name

    except Exception as load_error:
        # Deliberate best-effort: report and hand back a tuple of Nones so the
        # notebook keeps running and callers can test for a missing model.
        print(f"❌ Error loading model: {load_error}")
        for hint in (
            "\n🔧 Troubleshooting steps:",
            "1. Check internet connection (models download from Hugging Face)",
            "2. Verify model name is correct",
            "3. Try switching to 'openai/clip-vit-base-patch32' for faster loading",
            "4. Restart kernel if memory issues occur",
        ):
            print(hint)
        return (None,) * 4

def switch_model(model_key):
    """Select another entry of ``AVAILABLE_MODELS`` and load it.

    Args:
        model_key: Short key into ``AVAILABLE_MODELS``.

    Returns:
        The 4-tuple produced by ``load_clip_model()`` for the newly selected
        checkpoint, or ``(None, None, None, None)`` when the key is unknown.
        On a known key ``CONFIG['MODEL_NAME']`` is updated before loading.
    """
    if model_key not in AVAILABLE_MODELS:
        print(f"❌ Unknown model: {model_key}")
        print(f"Available models: {list(AVAILABLE_MODELS.keys())}")
        return None, None, None, None

    chosen = AVAILABLE_MODELS[model_key]
    CONFIG['MODEL_NAME'] = chosen['model_id']
    print(f"🔄 Switched to: {chosen['name']}")
    return load_clip_model()

# Initialize
# Create the input/output folders, then load the default CLIP checkpoint.
# The four module-level names below are read as defaults by
# create_search_interface, run_forensic_batch_analysis and
# quick_forensic_search; all four are None when load_clip_model() fails.
setup_directories()
model, processor, device, model_name = load_clip_model()

# Cell 4: Core Search Functions (GLIP-like with CLIP)
def create_sliding_windows(image, patch_size=64, stride=32):
    """Cut an image into square patches on a regular grid.

    Args:
        image: Object exposing ``size`` as ``(width, height)`` and a
            ``crop(box)`` method (a PIL image in this notebook).
        patch_size: Side length of each square patch, in pixels.
        stride: Step between neighbouring patches, in pixels.

    Returns:
        ``(windows, positions)``: the cropped patches and their
        ``(x1, y1, x2, y2)`` boxes, in row-major order (top to bottom, left to
        right). Only patches that fit entirely inside the image are produced,
        so both lists are empty when the image is smaller than ``patch_size``.
    """
    img_w, img_h = image.size
    boxes = [
        (left, top, left + patch_size, top + patch_size)
        for top in range(0, img_h - patch_size + 1, stride)
        for left in range(0, img_w - patch_size + 1, stride)
    ]
    crops = [image.crop(box) for box in boxes]
    return crops, boxes

def compute_text_image_similarity(model, processor, text, images, device):
    """Score every image patch against ``text`` with CLIP cosine similarity.

    The previous implementation applied a softmax over the text axis of
    ``logits_per_image``. With a single text prompt that axis has length 1, so
    every patch received a score of exactly 1.0 and any threshold was passed.
    The score is now the cosine similarity between each patch embedding and
    the text embedding, which does vary from patch to patch.

    Args:
        model: CLIP model; called as ``model(**inputs)`` and expected to return
            an object with ``image_embeds`` and ``text_embeds``.
        processor: CLIP processor used to build the model inputs.
        text: The query string.
        images: Sequence of image patches.
        device: Device the processed inputs are moved to.

    Returns:
        1-D float32 NumPy array with one cosine similarity (range [-1, 1]) per
        entry of ``images``; an empty array when ``images`` is empty.

    NOTE(review): raw CLIP cosine scores are usually far from 1.0, so
    thresholds tuned against the old always-1.0 output may need revisiting.
    """
    if len(images) == 0:
        return np.empty(0, dtype=np.float32)

    # Tokenise the prompt and preprocess the patches in one call.
    inputs = processor(text=[text], images=images, return_tensors="pt", padding=True)
    inputs = inputs.to(device)

    with torch.no_grad():
        outputs = model(**inputs)

        # Normalise explicitly so the dot product is a cosine similarity
        # regardless of whether the model already normalised its embeddings.
        image_embeds = F.normalize(outputs.image_embeds, dim=-1)
        text_embeds = F.normalize(outputs.text_embeds, dim=-1)
        similarities = (image_embeds @ text_embeds.t())[:, 0]  # single prompt -> column 0

    return similarities.float().cpu().numpy()

def _evenly_spaced_indices(total, limit):
    """Pick ``limit`` distinct indices spread over ``range(total)``, both ends included."""
    if limit <= 0:
        return []
    if limit == 1:
        return [0]
    # Integer arithmetic keeps the result exact; for total > limit the spacing
    # exceeds 1, so no index repeats.
    return [(j * (total - 1)) // (limit - 1) for j in range(limit)]


def process_image_batch(image_paths, model, processor, query, device, batch_size=4):
    """Score the patches of each image against ``query`` and collect matches.

    For every path the image is loaded as RGB and cut into sliding windows
    (``CONFIG['PATCH_SIZE']`` / ``CONFIG['STRIDE']``), or used whole when its
    shorter side is smaller than the patch size. At most
    ``CONFIG['TOP_K_PATCHES']`` patches are scored, and patches scoring at
    least ``CONFIG['SIMILARITY_THRESHOLD']`` are kept.

    When there are more patches than the cap, the kept patches are spread over
    the whole grid. The previous ``range(0, n, step)[:k]`` sampling dropped the
    tail of the grid whenever ``n`` was not a multiple of ``k`` (39 patches
    with k=20 kept only the first 20, i.e. the top of the image).

    Args:
        image_paths: Sequence of ``pathlib.Path`` objects to process.
        model: CLIP model forwarded to ``compute_text_image_similarity``.
        processor: CLIP processor forwarded likewise.
        query: Text query.
        device: Device used for inference.
        batch_size: Accepted for interface compatibility; images are scored
            one at a time and this value is not used.

    Returns:
        A list with one dict per image that has at least one patch over the
        threshold. Keys: ``image_path``, ``image``, ``boxes`` (float32 tensor
        of ``x1, y1, x2, y2`` rows), ``confidence_scores`` (float32 tensor),
        ``labels`` and ``query``. A failure on one image is printed and that
        image is skipped.
    """
    import time

    batch_results = []

    for i, img_path in enumerate(image_paths):
        try:
            # Progress line, refreshed in place every fifth image.
            if i % 5 == 0:
                print(f"Processing {i+1}/{len(image_paths)}: {img_path.name[:30]}...", end='\r')

            # convert() returns an independent, fully loaded image, so the
            # underlying file handle can be released straight away.
            with Image.open(img_path) as raw_image:
                image = raw_image.convert("RGB")

            patch_size = CONFIG['PATCH_SIZE']
            stride = CONFIG['STRIDE']
            max_patches = CONFIG['TOP_K_PATCHES']

            # Images smaller than one patch are scored as a single region.
            if min(image.size) < patch_size:
                patches = [image]
                positions = [(0, 0, image.size[0], image.size[1])]
            else:
                patches, positions = create_sliding_windows(image, patch_size, stride)

            # Cap the number of patches to bound memory and time per image.
            if len(patches) > max_patches:
                keep = _evenly_spaced_indices(len(patches), max_patches)
                patches = [patches[idx] for idx in keep]
                positions = [positions[idx] for idx in keep]

            if patches:
                similarities = compute_text_image_similarity(model, processor, query, patches, device)
                high_sim_indices = np.where(similarities >= CONFIG['SIMILARITY_THRESHOLD'])[0]

                if len(high_sim_indices) > 0:
                    filtered_similarities = similarities[high_sim_indices]
                    filtered_positions = [positions[idx] for idx in high_sim_indices]

                    batch_results.append({
                        'image_path': img_path,
                        'image': image,
                        'boxes': torch.tensor(filtered_positions, dtype=torch.float32),
                        'confidence_scores': torch.tensor(filtered_similarities, dtype=torch.float32),
                        'labels': [query] * len(filtered_similarities),
                        'query': query
                    })

            # Release cached GPU memory between images.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        except Exception as e:
            # Best-effort: report the failing image and move on to the next.
            print(f"\n⚠️ Error processing {img_path.name}: {str(e)[:50]}...")
            continue

        # Small break every 10 images to prevent system overload
        if i % 10 == 0 and i > 0:
            time.sleep(0.1)

    return batch_results

def _format_duration(total_seconds):
    """Render seconds as 'Hh Mm S.Ss', leaving out leading units that are zero."""
    hours = int(total_seconds // 3600)
    minutes = int((total_seconds % 3600) // 60)
    seconds = total_seconds % 60
    if hours > 0:
        return f"{hours}h {minutes}m {seconds:.1f}s"
    if minutes > 0:
        return f"{minutes}m {seconds:.1f}s"
    return f"{seconds:.1f}s"


def search_images_with_query(query, model, processor, device, model_name, gallery_path, batch_size=4):
    """
    Search for objects in images using natural language query with CLIP (GLIP-like)

    Every image file directly inside ``gallery_path`` (by extension, case
    insensitive) is handed to ``process_image_batch``. Files are processed in
    sorted path order so that result order, and the numbering later derived
    from it, is reproducible across runs; ``iterdir()`` alone gives no order
    guarantee. Directories are ignored even if their name ends in an image
    extension.

    Args:
        query: Text to look for.
        model: Loaded CLIP model, or ``None`` if loading failed.
        processor: Matching CLIP processor, or ``None``.
        device: Device used for inference (also printed).
        model_name: Model identifier, used for the printed header only.
        gallery_path: Folder containing the images to scan.
        batch_size: Forwarded to ``process_image_batch``.

    Returns:
        The list of match dicts from ``process_image_batch``. An empty list is
        returned when the folder is missing, the model or processor is
        ``None``, no images are found, or processing raises (the error is
        printed rather than propagated).
    """
    import time
    from datetime import datetime

    start_time = time.time()
    start_datetime = datetime.now()

    results = []
    gallery_path = Path(gallery_path)

    if not gallery_path.exists():
        print(f"❌ Gallery path {gallery_path} does not exist")
        return results

    # Explicit None checks: the objects' own truthiness is not what matters.
    if model is None or processor is None:
        print("❌ Model or processor not loaded. Please check model initialization.")
        return results

    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
    image_files = sorted(
        entry for entry in gallery_path.iterdir()
        if entry.is_file() and entry.suffix.lower() in image_extensions
    )

    if not image_files:
        print(f"⚠️ No images found in {gallery_path}")
        return results

    total_files = len(image_files)

    print(f"🔍 Processing {total_files} images for: '{query}'")
    print(f"🖥️ Device: {device} | Using CLIP for grounding")
    print(f"🔍 Model: {model_name}")
    print(f"📐 Patch size: {CONFIG['PATCH_SIZE']}px, Stride: {CONFIG['STRIDE']}px")
    print(f"⏰ Start time: {start_datetime.strftime('%Y-%m-%d %H:%M:%S')}")

    try:
        print("⏳ Starting image processing...")
        results = process_image_batch(image_files, model, processor, query, device, batch_size)

        end_datetime = datetime.now()
        total_duration = time.time() - start_time
        duration_str = _format_duration(total_duration)

        matches_found = len(results)
        avg_time_per_image = total_duration / total_files  # total_files >= 1 here

        print(f"\n" + "="*60)
        print(f"📊 PROCESSING SUMMARY")
        print(f"="*60)
        print(f"📸 Images processed: {total_files}")
        print(f"✅ Matches found: {matches_found}")
        print(f"⏰ Start time: {start_datetime.strftime('%H:%M:%S')}")
        print(f"🏁 End time: {end_datetime.strftime('%H:%M:%S')}")
        print(f"⏱️ Total time: {duration_str}")
        print(f"📈 Avg per image: {avg_time_per_image:.2f}s")
        print(f"="*60)

    except Exception as e:
        # Best-effort: report the failure and return whatever was collected.
        total_duration = time.time() - start_time

        print(f"❌ Processing error: {e}")
        print(f"⏰ Failed after: {total_duration:.1f}s")
        print("💡 Try reducing patch size or similarity threshold")

    return results

def copy_results_to_folder(results, output_folder):
    """Copy matched images into a fresh timestamped subfolder.

    A ``search_<YYYYmmdd_HHMMSS>`` folder is created under ``output_folder``
    (missing parent directories are created too) and each result's source
    image is copied there as ``<NNN>_<stem>_sim<max score><suffix>``.

    Args:
        results: Iterable of result dicts providing ``image_path`` and
            ``confidence_scores`` (anything with ``len()`` and ``.max()``).
        output_folder: Base folder for the timestamped subfolder.

    Returns:
        ``(search_folder, copied_files)``: the created folder and the list of
        destination paths that were written. A result that cannot be copied is
        reported and skipped.
    """
    from datetime import datetime

    output_path = Path(output_folder)
    # parents=True: nested output locations must not fail just because an
    # intermediate directory has not been created yet.
    output_path.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    search_folder = output_path / f"search_{timestamp}"
    search_folder.mkdir(exist_ok=True)

    copied_files = []

    for i, result in enumerate(results):
        # Reset per iteration so the error message below never reports a stale
        # (or unbound) path from a previous result.
        source_path = None
        try:
            source_path = Path(result['image_path'])
            scores = result['confidence_scores']
            max_conf = float(scores.max()) if len(scores) > 0 else 0.0
            filename = f"{i+1:03d}_{source_path.stem}_sim{max_conf:.2f}{source_path.suffix}"
            dest_path = search_folder / filename

            shutil.copy2(source_path, dest_path)
            copied_files.append(dest_path)

        except Exception as e:
            label = source_path if source_path is not None else f"result #{i+1}"
            print(f"❌ Error copying {label}: {e}")

    print(f"📋 Copied {len(copied_files)} files to {search_folder}")
    return search_folder, copied_files

# Cell 5: Interactive Query Interface
def create_search_interface():
    """Assemble the ipywidgets search panel and return it as one ``VBox``.

    The panel offers a model dropdown, a query box, threshold and patch-size
    sliders, and four buttons (search, copy, clear, switch model). All handler
    output is written into a shared ``Output`` widget placed below the
    controls. The panel starts with the module-level ``model``, ``processor``,
    ``device`` and ``model_name``; switching model replaces them for this
    panel only.
    """

    def wide_label():
        # Fresh style dict per widget so descriptions are not truncated.
        return {'description_width': 'initial'}

    def make_button(caption, kind, **extra):
        # All four action buttons share the same fixed width.
        return widgets.Button(
            description=caption,
            button_style=kind,
            layout=widgets.Layout(width='150px'),
            **extra
        )

    # ---- Inputs ---------------------------------------------------------
    model_selector = widgets.Dropdown(
        options=[
            (f"{info['name']} - {info['description']}", key)
            for key, info in AVAILABLE_MODELS.items()
        ],
        value='clip-vit-base-patch32',
        description='Model:',
        style=wide_label(),
        layout=widgets.Layout(width='500px')
    )

    similarity_threshold_slider = widgets.FloatSlider(
        value=CONFIG['SIMILARITY_THRESHOLD'],
        min=0.1,
        max=0.8,
        step=0.05,
        description='Similarity Threshold:',
        style=wide_label()
    )

    patch_size_slider = widgets.IntSlider(
        value=CONFIG['PATCH_SIZE'],
        min=32,
        max=128,
        step=16,
        description='Patch Size:',
        style=wide_label()
    )

    query_input = widgets.Text(
        value='person with weapon',
        placeholder='Enter search query (e.g., "person with weapon", "suspicious activity")',
        description='Query:',
        style=wide_label(),
        layout=widgets.Layout(width='500px')
    )

    confidence_slider = widgets.FloatSlider(
        value=CONFIG['CONFIDENCE_THRESHOLD'],
        min=0.1,
        max=0.9,
        step=0.05,
        description='Min Confidence:',
        style=wide_label()
    )

    # ---- Buttons --------------------------------------------------------
    search_button = make_button('🔍 Search Gallery', 'primary')
    copy_button = make_button('📋 Copy Results', 'success', disabled=True)
    clear_button = make_button('🗑️ Clear Results', 'warning')
    switch_model_button = make_button('🔄 Switch Model', 'info')

    output_area = widgets.Output()

    # Mutable state shared by the handlers below: the latest results plus the
    # model bundle currently in use by this panel.
    session = {
        'results': [],
        'model': model,
        'processor': processor,
        'device': device,
        'model_name': model_name,
    }

    def handle_switch(_button):
        """Load the checkpoint chosen in the dropdown and adopt it on success."""
        with output_area:
            chosen_key = model_selector.value
            print(f"🔄 Switching to: {AVAILABLE_MODELS[chosen_key]['name']}")
            new_model, new_processor, new_device, new_model_name = switch_model(chosen_key)
            if new_model and new_processor:
                session.update(
                    model=new_model,
                    processor=new_processor,
                    device=new_device,
                    model_name=new_model_name,
                )
                print("✅ Model switched successfully!")
            else:
                print("❌ Failed to switch model")

    def handle_search(_button):
        """Run the query over the gallery and render the first results."""
        with output_area:
            clear_output(wait=True)

            if not session['model'] or not session['processor']:
                print("❌ Model not loaded. Please switch to a valid model first.")
                return

            query = query_input.value.strip()
            if not query:
                print("⚠️ Please enter a search query")
                return

            # Push the slider values into the shared configuration.
            CONFIG.update(
                CONFIDENCE_THRESHOLD=confidence_slider.value,
                SIMILARITY_THRESHOLD=similarity_threshold_slider.value,
                PATCH_SIZE=patch_size_slider.value,
            )

            print(f"🚀 Starting CLIP-based search: '{query}'")
            print(f"📊 Confidence: {CONFIG['CONFIDENCE_THRESHOLD']:.2f} | Similarity: {CONFIG['SIMILARITY_THRESHOLD']:.2f}")
            print(f"📐 Patch size: {CONFIG['PATCH_SIZE']}px")
            print("-" * 60)

            session['results'] = search_images_with_query(
                query,
                session['model'],
                session['processor'],
                session['device'],
                session['model_name'],
                CONFIG['SUSPECTS_GALLERY_PATH'],
                CONFIG['BATCH_SIZE']
            )
            found = session['results']

            if not found:
                print("🔍 No matches found for your query")
                print("💡 Try lowering the similarity threshold or using different search terms")
                copy_button.disabled = True
                return

            copy_button.disabled = False
            display_results(found[:CONFIG['MAX_RESULTS_DISPLAY']])

            if len(found) > CONFIG['MAX_RESULTS_DISPLAY']:
                print(f"\n📝 Showing first {CONFIG['MAX_RESULTS_DISPLAY']} results out of {len(found)} total matches")

    def handle_copy(_button):
        """Copy the images of the latest search into the results folder."""
        with output_area:
            if not session['results']:
                print("⚠️ No results to copy")
                return
            print("\n📋 Copying results to output folder...")
            folder, _copied = copy_results_to_folder(session['results'], CONFIG['RESULTS_OUTPUT_PATH'])
            print(f"✅ Results saved to: {folder}")

    def handle_clear(_button):
        """Forget the latest results and wipe the output area."""
        session['results'] = []
        copy_button.disabled = True
        with output_area:
            clear_output()
            print("🗑️ Results cleared")

    # ---- Wiring ---------------------------------------------------------
    switch_model_button.on_click(handle_switch)
    search_button.on_click(handle_search)
    copy_button.on_click(handle_copy)
    clear_button.on_click(handle_clear)

    # ---- Layout ---------------------------------------------------------
    header = widgets.HTML("<h3>🔍 Forensic Image Search Interface (CLIP-based Grounding)</h3>")
    threshold_row = widgets.HBox([confidence_slider, similarity_threshold_slider])
    patch_row = widgets.HBox([patch_size_slider])
    button_row = widgets.HBox([search_button, copy_button, clear_button, switch_model_button])
    divider = widgets.HTML("<hr>")

    controls = widgets.VBox([
        header,
        model_selector,
        query_input,
        threshold_row,
        patch_row,
        button_row,
        divider,
    ])

    return widgets.VBox([controls, output_area])

def display_results(results):
    """Display search results with bounding boxes

    The images are laid out on a two-column grid. Every matched patch is
    outlined in red and labelled with its score, and each panel is titled with
    the file name and the number of matches.

    Args:
        results: List of result dicts providing ``image`` (with a ``size``
            attribute), ``image_path``, ``boxes`` (rows of ``x1, y1, x2, y2``)
            and ``confidence_scores``. Nothing is drawn for an empty or
            ``None`` value.
    """
    if not results:
        return

    cols = 2
    rows = (len(results) + cols - 1) // cols

    # squeeze=False always yields a 2-D array of Axes, so flattening gives a
    # uniform 1-D sequence whatever the grid shape. The previous code passed
    # the whole Axes array to imshow() when there was exactly one result,
    # which raised AttributeError.
    fig, axes = plt.subplots(rows, cols, figsize=CONFIG['FIGURE_SIZE'], squeeze=False)
    axes = axes.flatten()

    for ax, result in zip(axes, results):
        ax.imshow(result['image'])

        boxes = result['boxes']
        confidences = result['confidence_scores']

        for box, conf in zip(boxes, confidences):
            # Boxes are [x1, y1, x2, y2]; Rectangle wants an origin plus width/height.
            x1, y1, x2, y2 = box.tolist()

            ax.add_patch(patches.Rectangle(
                (x1, y1), x2 - x1, y2 - y1,
                linewidth=2, edgecolor='red', facecolor='none'
            ))

            # Score label just above the box's top-left corner.
            ax.text(x1, y1 - 5, f'{float(conf):.2f}',
                    color='red', fontweight='bold', fontsize=10)

        ax.set_title(f"{result['image_path'].name}\nMatches: {len(boxes)}", fontsize=10)
        ax.axis('off')

    # Blank out grid cells that have no result to show.
    for unused_ax in axes[len(results):]:
        unused_ax.axis('off')

    plt.tight_layout()
    plt.show()

# Display the interface
# Build the widget panel once and render it as this cell's output. The panel
# picks up the module-level model/processor/device/model_name at creation
# time, so the model-loading cell has to run before this one.
interface = create_search_interface()
display(interface)

# Cell 6: Batch Processing Functions (Enhanced)
def batch_search_multiple_queries(queries_list, model, processor, device, model_name, gallery_path, output_base_path, batch_size=4):
    """
    Process multiple queries in batch for comprehensive analysis using CLIP

    Each query is run through ``search_images_with_query``. When it produces
    matches, the matched images are copied (via ``copy_results_to_folder``)
    below ``<output_base_path>/query_<sanitised query>``. Every character of
    the query that is not a word character, dot or hyphen is replaced by an
    underscore, so quotes, colons, backslashes and similar characters cannot
    produce an invalid or nested folder name. ``output_base_path`` is created
    if it does not exist yet.

    Args:
        queries_list: Sequence of query strings.
        model: CLIP model forwarded to the search.
        processor: CLIP processor forwarded likewise.
        device: Device forwarded likewise.
        model_name: Model identifier (forwarded and printed).
        gallery_path: Folder with the images to scan.
        output_base_path: Base folder for the per-query output folders.
        batch_size: Forwarded to ``search_images_with_query``.

    Returns:
        Dict keyed by query string. Each value holds ``results``,
        ``output_folder`` (``None`` without matches), ``file_count``,
        ``match_count`` and ``processing_time`` (seconds spent searching,
        excluding the copy). A query listed twice keeps only its last entry.
    """
    import re
    import time
    from datetime import datetime

    batch_start_time = time.time()
    batch_start_datetime = datetime.now()

    all_results = {}

    print(f"🚀 Starting batch analysis with {len(queries_list)} queries")
    print(f"📁 Gallery: {gallery_path}")
    print(f"🔍 Model: {model_name}")
    print(f"⏰ Batch start: {batch_start_datetime.strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 60)

    for i, query in enumerate(queries_list, 1):
        print(f"\n[{i}/{len(queries_list)}] Query: '{query}'")

        # Time the search itself (the copy below is not included).
        query_start_time = time.time()
        results = search_images_with_query(query, model, processor, device, model_name, gallery_path, batch_size)
        query_duration = time.time() - query_start_time

        folder, files = None, []
        if results:
            # One replacement per character keeps names for plain queries
            # identical to the old space/slash substitution.
            safe_name = re.sub(r'[^\w.\-]', '_', query)
            base_dir = Path(output_base_path)
            base_dir.mkdir(parents=True, exist_ok=True)
            folder, files = copy_results_to_folder(results, base_dir / f"query_{safe_name}")
            print(f"📁 Saved {len(files)} files to: {folder.name}")
        else:
            print("⚪ No matches found")
        print(f"⏱️ Query completed in: {query_duration:.1f}s")

        all_results[query] = {
            'results': results if results else [],
            'output_folder': folder,
            'file_count': len(files),
            'match_count': len(results) if results else 0,
            'processing_time': query_duration
        }

    batch_end_datetime = datetime.now()
    total_batch_duration = time.time() - batch_start_time

    # Human-readable total: hours and minutes only appear when non-zero.
    hours = int(total_batch_duration // 3600)
    minutes = int((total_batch_duration % 3600) // 60)
    seconds = total_batch_duration % 60

    if hours > 0:
        duration_str = f"{hours}h {minutes}m {seconds:.1f}s"
    elif minutes > 0:
        duration_str = f"{minutes}m {seconds:.1f}s"
    else:
        duration_str = f"{seconds:.1f}s"

    print("\n" + "="*60)
    print("📊 BATCH SEARCH SUMMARY")
    print("="*60)

    total_matches = 0
    total_files = 0

    for query, data in all_results.items():
        matches = data['match_count']
        files = data['file_count']
        query_time = data['processing_time']
        total_matches += matches
        total_files += files

        status = "✅" if matches > 0 else "⚪"
        print(f"{status} '{query}': {matches} images, {files} files saved ({query_time:.1f}s)")

    # Guard against an empty query list.
    avg_time_per_query = total_batch_duration / len(queries_list) if queries_list else 0

    print(f"\n🎯 TOTALS:")
    print(f"📊 Queries processed: {len(queries_list)}")
    print(f"🎯 Total matches: {total_matches} images")
    print(f"📋 Total files copied: {total_files}")
    print(f"⏰ Start time: {batch_start_datetime.strftime('%H:%M:%S')}")
    print(f"🏁 End time: {batch_end_datetime.strftime('%H:%M:%S')}")
    print(f"⏱️ Total batch time: {duration_str}")
    print(f"📈 Avg per query: {avg_time_per_query:.1f}s")
    print("="*60)

    return all_results

# Enhanced batch processing with common forensic queries optimized for CLIP
def run_forensic_batch_analysis(custom_queries=None, batch_size=4, model_to_use=None):
    """Run the built-in forensic query list (plus optional extras) over the gallery.

    Args:
        custom_queries: Optional list of extra query strings appended after
            the built-in ones.
        batch_size: Forwarded to ``batch_search_multiple_queries``.
        model_to_use: Optional ``(model, processor, device, model_name)``
            tuple; when omitted or empty the module-level objects are used.

    Returns:
        The dict produced by ``batch_search_multiple_queries``, or ``None``
        when no model/processor is available.
    """
    # Resolve which model bundle to work with.
    bundle = model_to_use if model_to_use else (model, processor, device, model_name)
    active_model, active_processor, active_device, active_name = bundle

    # Built-in queries phrased for CLIP.
    default_queries = [
        "person with weapon", "weapon", "gun", "knife",
        "suspicious person", "person running", "vehicle", "mask",
        "dark clothing", "backpack", "group of people", "mobile phone",
        "suspicious activity",
    ]

    if custom_queries:
        queries = default_queries + custom_queries
        print(f"📋 Using {len(default_queries)} default + {len(custom_queries)} custom queries")
    else:
        queries = default_queries
        print(f"📋 Using {len(default_queries)} default forensic queries")

    if not (active_model and active_processor):
        print("❌ Model or processor not loaded. Cannot run batch analysis.")
        return None

    print("🔍 Starting comprehensive CLIP-based forensic analysis...")
    return batch_search_multiple_queries(
        queries,
        active_model,
        active_processor,
        active_device,
        active_name,
        CONFIG['SUSPECTS_GALLERY_PATH'],
        CONFIG['RESULTS_OUTPUT_PATH'],
        batch_size
    )

# Quick test with reduced output
def quick_forensic_search(query="person with weapon", batch_size=4, model_to_use=None):
    """Run one query over the configured gallery, mainly for quick testing.

    Args:
        query: Text to search for.
        batch_size: Forwarded to ``search_images_with_query``.
        model_to_use: Optional ``(model, processor, device, model_name)``
            tuple; when omitted or empty the module-level objects are used.

    Returns:
        The list of matches, ``[]`` when nothing matched, or ``None`` when no
        model/processor is available.
    """
    bundle = model_to_use if model_to_use else (model, processor, device, model_name)
    active_model, active_processor, active_device, active_name = bundle

    if not (active_model and active_processor):
        print("❌ Model or processor not loaded")
        return None

    print(f"🔍 Quick CLIP search: '{query}'")
    print(f"🔍 Using model: {active_name}")

    matches = search_images_with_query(
        query, active_model, active_processor, active_device, active_name,
        CONFIG['SUSPECTS_GALLERY_PATH'], batch_size
    )

    if not matches:
        print("⚪ No matches found")
        return []

    print(f"📋 Found {len(matches)} matches - ready for detailed analysis")
    return matches

# Model comparison utilities
def list_available_models():
    """Print every entry of ``AVAILABLE_MODELS`` as a short, readable catalogue.

    For each model key the display name, the performance note and the
    description are printed, followed by a blank separator line.
    """
    heading = "🔍 Available CLIP Models (GLIP Alternative):"
    divider = "-" * 50
    print(heading)
    print(divider)

    for model_key, details in AVAILABLE_MODELS.items():
        print(f"🔹 {details['name']} ({model_key})")
        # Detail lines share one layout: three-space indent, icon, value.
        for icon, field in (("📊", 'performance'), ("📝", 'description')):
            print(f"   {icon} {details[field]}")
        print()

# Print the catalogue of CLIP models from AVAILABLE_MODELS when this cell runs.
list_available_models()

# Optional: uncomment to run the multi-query batch analysis over the configured
# suspects gallery. It prints an error and returns None if no model/processor is loaded.
# batch_results = run_forensic_batch_analysis(batch_size=4)

# Optional: uncomment for a single-query smoke test. It returns the matches,
# [] when nothing matched, or None when the model/processor is not loaded.
# quick_results = quick_forensic_search("weapon", batch_size=4)

# Cell 7: Usage Instructions and Tips
# The guide is an f-string so that the folders it names are read from CONFIG
# (SUSPECTS_GALLERY_PATH and RESULTS_OUTPUT_PATH) rather than hard-coded; the
# previous text pointed at './suspects_gallery' and 'search_results/', which do
# not match the configured paths.
# NOTE(review): the per-search sub-folder and file names shown under
# "OUTPUT STRUCTURE" are kept from the original text as an illustration —
# confirm them against the code that writes results.
print(f"""
🎯 FORENSIC IMAGE SEARCH SYSTEM - CLIP-BASED GROUNDING
====================================================

📝 IMPORTANT NOTE:
This implementation uses CLIP (available in transformers) to provide
GLIP-like functionality through text-image similarity and sliding window detection.

🆕 CLIP-BASED FEATURES:
• 🔍 Text-image similarity for grounded understanding
• 📐 Sliding window approach for object localization
• 🎯 Patch-based analysis for detailed detection
• 🔗 Direct similarity scoring between text and image regions

📋 SETUP CHECKLIST:
1. ✅ Place suspect images in the '{CONFIG['SUSPECTS_GALLERY_PATH']}' folder
2. ✅ Run all cells in order (1-6)
3. ✅ CLIP models download automatically on first use
4. ✅ Works with standard transformers library (no special dependencies)

🔍 AVAILABLE CLIP MODELS:
• ViT-Base Patch32: Fastest, good balance (recommended for testing)
• ViT-Base Patch16: Higher resolution, better accuracy
• ViT-Large Patch14: Best accuracy, slowest (recommended for critical analysis)

🔧 DEVICE COMPATIBILITY:
• System automatically detects GPU/CPU availability
• Models work on both CPU and GPU
• CPU mode: Slower but works on all systems
• GPU mode: Significantly faster with CUDA support

🔍 SEARCH METHODOLOGY:
• Creates sliding windows across each image
• Computes text-image similarity for each patch
• Returns high-similarity regions as detections
• Adjustable patch size and stride for different granularity

🔍 SEARCH TIPS FOR CLIP-BASED GROUNDING:
• Use clear, descriptive terms: "weapon", "person", "vehicle"
• CLIP works well with object names and simple descriptions
• Try both specific and general terms
• Lower similarity thresholds (0.15-0.3) often work better
• Effective forensic queries:
  - "weapon" / "gun" / "knife"
  - "person" / "suspicious person"
  - "vehicle" / "car"
  - "mask" / "dark clothing"
  - "backpack" / "bag"

⚙️ INTERFACE FEATURES:
• Model selector for switching between CLIP variants
• Similarity threshold: Controls detection sensitivity
• Patch size: Adjusts detection granularity (32-128px)
• Confidence threshold: Final filtering of results
• Real-time search with progress tracking

🎛️ PARAMETER TUNING:
• Similarity Threshold (0.1-0.8): Text-image matching sensitivity
  - Lower = more detections, higher = more precise
• Patch Size (32-128px): Detection window size
  - Smaller = more detailed, larger = faster processing
• Confidence Threshold (0.1-0.9): Final result filtering
• Start with: Similarity 0.2, Patch 64px, Confidence 0.25

📁 OUTPUT STRUCTURE:
{CONFIG['RESULTS_OUTPUT_PATH']}/
├── search_20240611_143022/
│   ├── 001_suspect1_sim0.32.jpg
│   ├── 002_suspect5_sim0.45.jpg
│   └── ...

🚨 TROUBLESHOOTING:
• "No matches found" → Lower similarity threshold (try 0.15-0.2)
• "Too many false positives" → Raise similarity threshold
• "Missing small objects" → Reduce patch size to 32-48px
• "Slow processing" → Increase patch size or use faster model
• "Memory issues" → Restart kernel, reduce patch count

🔄 MODEL COMPARISON:
• ViT-Base Patch32:
  - Pros: Fast processing, good general performance
  - Cons: Lower resolution features
  - Best for: Quick analysis, large image sets
• ViT-Base Patch16:
  - Pros: Better detail recognition, good balance
  - Cons: Slower than Patch32
  - Best for: Standard forensic analysis
• ViT-Large Patch14:
  - Pros: Best accuracy, finest details
  - Cons: Slowest processing, high memory usage
  - Best for: Critical evidence analysis

⭐ CLIP ADVANTAGES:
• Available in standard transformers library
• No special installation requirements
• Good zero-shot performance on diverse objects
• Robust text-image understanding
• Multiple model sizes for different needs

🔍 VS GLIP COMPARISON:
• CLIP: Widely available, good performance, simpler setup
• True GLIP: Better language grounding, more complex setup
• This implementation: GLIP-like functionality with CLIP convenience

💡 FORENSIC BEST PRACTICES:
• Start with simple object names: "weapon", "person", "vehicle"
• Use multiple patch sizes for comprehensive analysis
• Cross-reference results between different models
• Adjust thresholds based on image quality and lighting
• Document parameter settings for evidence reports

🎯 PERFORMANCE EXPECTATIONS:
• CLIP excels at recognizing common objects and people
• Good performance on weapons, vehicles, and clothing
• Works well with clear, well-lit images
• May require parameter tuning for optimal results
• Best results with objects that fill a significant portion of patches

📊 OPTIMIZATION TIPS:
• For speed: Use Patch32 model, larger patch sizes (96-128px)
• For accuracy: Use Large model, smaller patches (32-48px)
• For balance: Use Patch16 model, medium patches (64px)
• Memory saving: Reduce TOP_K_PATCHES in config
• Quality images: Lower similarity thresholds work better

🔍 QUERY OPTIMIZATION:
• Simple terms often work best: "weapon" vs "person holding weapon"
• Try synonyms: "car" vs "vehicle", "gun" vs "weapon"
• Use single concepts per query for clearer results
• Combine results from multiple related queries

🎓 ADVANCED USAGE:
• Batch processing for systematic investigation
• Multiple model comparison for validation
• Parameter sweeps to find optimal settings
• Integration with other detection models
• Custom patch extraction for specific use cases

📈 EXPECTED RESULTS:
• Similarity scores typically range 0.15-0.6
• Higher scores indicate stronger text-image match
• Results depend heavily on image quality and object visibility
• Fine-tune thresholds based on your specific dataset

For batch processing of multiple queries, use the functions in Cell 6.
For systematic forensic investigations, consider parameter optimization.
This CLIP-based approach provides practical GLIP-like functionality without complex setup requirements.
""")

✅ Configuration loaded successfully
📁 Suspects gallery: ../../datasets/images/objects/raw
📁 Results output: ../../datasets/images/objects/detections
🔍 Selected model: openai/clip-vit-base-patch32
📝 Note: Using CLIP for GLIP-like functionality (text-image grounding)
✅ Transformers already installed
✅ All dependencies imported successfully
🔧 PyTorch version: 2.7.1+cpu
🖥️ Using CPU mode (CUDA not available)
📍 Auto-detected device: cpu
📁 Created directories: ../../datasets/images/objects/raw, ../../datasets/images/objects/detections
📥 Loading CLIP model: openai/clip-vit-base-patch32
🖥️ Target device: cpu
📝 Using CLIP for grounded language-image understanding
⏳ Loading processor...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


⏳ Loading model weights...
📍 Moving model to cpu...
✅ Model loaded successfully!
   🔍 Model: openai/clip-vit-base-patch32
   🖥️ Device: cpu


VBox(children=(VBox(children=(HTML(value='<h3>🔍 Forensic Image Search Interface (CLIP-based Grounding)</h3>'),…

🔍 Available CLIP Models (GLIP Alternative):
--------------------------------------------------
🔹 CLIP ViT-Base Patch32 (clip-vit-base-patch32)
   📊 Base performance, good speed
   📝 Standard CLIP model - good balance of speed and accuracy

🔹 CLIP ViT-Base Patch16 (clip-vit-base-patch16)
   📊 Better accuracy, moderate speed
   📝 Higher resolution CLIP - better accuracy, slower

🔹 CLIP ViT-Large Patch14 (clip-vit-large-patch14)
   📊 Best accuracy, slowest speed
   📝 Largest CLIP model - best accuracy, slowest


🎯 FORENSIC IMAGE SEARCH SYSTEM - CLIP-BASED GROUNDING

📝 IMPORTANT NOTE:
This implementation uses CLIP (available in transformers) to provide
GLIP-like functionality through text-image similarity and sliding window detection.

🆕 CLIP-BASED FEATURES:
• 🔍 Text-image similarity for grounded understanding
• 📐 Sliding window approach for object localization
• 🎯 Patch-based analysis for detailed detection
• 🔗 Direct similarity scoring between text and image regions

📋 SETUP CHECKLIST:
1