# AIC-25 Colab: Textual KIS (Clean Flow)

Choose one path by setting `PATH_CHOICE` in the first code cell, then follow the numbered steps.

- Path A — Quickstart (use provided features): simplest, fastest
- Path B — Recompute (SigLIP2): higher quality, recomputes features and model

Steps overview
1) Clone repo
2) Install deps
3) Upload the dataset link CSV, then download the dataset
4A) Quickstart setup (Path A) — OR — 4B) Recompute SigLIP2 (Path B)
5) Start backend
6) Run a KIS query and export CSV
7) Zip for Codabench

In [None]:
# Environment detection and directory setup
import os
import sys

# Allocator configuration must be exported before PyTorch is imported for the
# first time, so this assignment stays ahead of the `import torch` below.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Resolve BASE_DIR, the directory every later cell works from.
if 'google.colab' not in sys.modules:
    # Local Jupyter/Linux: reuse the project root when the notebook already
    # runs inside it, otherwise fall back to a scratch workspace.
    current_dir = os.path.abspath('.')
    project_markers = ('tools', 'aic-24-BE', 'aic-24-FE')
    absent = [name for name in project_markers
              if not os.path.exists(os.path.join(current_dir, name))]
    if absent:
        BASE_DIR = os.path.join(current_dir, 'aic-25-local')
        os.makedirs(BASE_DIR, exist_ok=True)
        print('Environment: Local Jupyter (created workspace)')
    else:
        BASE_DIR = current_dir
        print('Environment: Local Jupyter (already in project root)')
else:
    # Google Colab: fixed location under /content
    BASE_DIR = '/content/aic-25'
    print('Environment: Google Colab')

print(f'Base directory: {BASE_DIR}')

# Choose your path: 'quickstart' or 'recompute'
PATH_CHOICE = 'recompute'  # <-- set to 'quickstart' for the simplest path
print('Path:', PATH_CHOICE)

# Report GPU memory up front so a kernel restart can happen before heavy work.
try:
    import torch
    if not torch.cuda.is_available():
        print('🔧 CUDA not available, will use CPU')
    else:
        free_bytes, total_bytes = torch.cuda.mem_get_info()
        free_gb = free_bytes / 1024**3
        total_gb = total_bytes / 1024**3
        used_gb = total_gb - free_gb
        used_pct = used_gb/total_gb*100
        free_pct = free_gb/total_gb*100
        print('\n🔧 GPU Memory Status:')
        print(f'  Total: {total_gb:.2f}GB')
        print(f'  Used:  {used_gb:.2f}GB ({used_pct:.1f}%)')
        print(f'  Free:  {free_gb:.2f}GB ({free_pct:.1f}%)')

        if used_gb > total_gb * 0.8:  # more than 80% already in use
            print(f'⚠️  WARNING: GPU memory is {used_pct:.1f}% full!')
            print('💡 Consider restarting your Python kernel to free GPU memory')
            print('   (Kernel → Restart Kernel... in Jupyter)')
        elif free_gb < 4:
            print(f'⚠️  WARNING: Only {free_gb:.2f}GB free - may need smaller batch sizes')
except ImportError:
    print('🔧 PyTorch not yet imported, will check memory later')

In [None]:
# Step 1) Setup repository structure
import os
import shutil
import subprocess
import sys

REPO_URL = 'https://github.com/dnlqvu/hcm-AI-challenge-2024-main.git'


def _fresh_clone(repo_url, dest):
    """Clone repo_url into dest, replacing whatever is currently at dest.

    Later cells chdir into the workspace, so on a re-run the kernel's working
    directory may be dest itself or a folder below it. Deleting that directory
    leaves the process without a valid cwd and `git clone` then aborts, so the
    cwd is moved to dest's parent before anything is removed.

    Raises:
        subprocess.CalledProcessError: if `git clone` exits non-zero.
    """
    dest = os.path.abspath(dest)
    parent = os.path.dirname(dest)
    os.makedirs(parent, exist_ok=True)
    try:
        cwd = os.getcwd()
    except OSError:
        # The cwd was already deleted by an earlier run.
        cwd = None
    if cwd is None or cwd == dest or cwd.startswith(dest + os.sep):
        os.chdir(parent)
    if os.path.exists(dest):
        shutil.rmtree(dest)
    subprocess.run(['git', 'clone', repo_url, dest], check=True, cwd=parent)


if 'google.colab' in sys.modules:
    # Google Colab: always start from a fresh clone
    print('Cloning', REPO_URL, '->', BASE_DIR)
    _fresh_clone(REPO_URL, BASE_DIR)
    print('Repository contents:', os.listdir(BASE_DIR))
else:
    # Local environment: Handle different scenarios
    current_dir = os.path.abspath('.')

    if BASE_DIR == current_dir:
        # We're already in the project root
        print(f'✅ Using existing project structure in {BASE_DIR}')
        print('Contents:', [f for f in os.listdir(BASE_DIR) if not f.startswith('.')])
    else:
        # We're in a different directory, need to copy or clone
        if os.path.exists('tools') and os.path.exists('aic-24-BE'):
            # The sources sit next to the notebook: copy them into the workspace
            print(f'📁 Copying project files to workspace: {BASE_DIR}')
            for item in ['tools', 'aic-24-BE', 'aic-24-FE']:
                src = os.path.join(current_dir, item)
                dst = os.path.join(BASE_DIR, item)
                if os.path.exists(src):
                    if os.path.exists(dst):
                        shutil.rmtree(dst)
                    shutil.copytree(src, dst)
            print('✅ Project structure copied to workspace')
        else:
            # No local sources: clone from the repository
            print(f'🔄 Cloning repository to {BASE_DIR}')
            _fresh_clone(REPO_URL, BASE_DIR)
            print('✅ Repository cloned')

        print('Workspace contents:', os.listdir(BASE_DIR))

# Verify the structure is correct
required_dirs = ['tools', 'aic-24-BE']
missing_dirs = [d for d in required_dirs if not os.path.exists(os.path.join(BASE_DIR, d))]
if missing_dirs:
    raise FileNotFoundError(f'❌ Missing required directories in {BASE_DIR}: {missing_dirs}')
print('✅ Repository structure verified')

In [None]:
# Change to project directory and show current location
os.chdir(BASE_DIR)
print('Current working directory:', os.getcwd())
print('Contents:', [f for f in os.listdir('.') if not f.startswith('.')])

# Step 2) Install backend + extractor dependencies
backend_dir = os.path.join(BASE_DIR, 'aic-24-BE')
os.chdir(backend_dir)  # requirements.txt is referenced relative to the backend folder
print(f'Installing backend dependencies from {backend_dir}')

import subprocess
import sys

# PyTorch wheel indexes. RTX 50-series (Blackwell) cards need the CUDA 12.8
# builds; the CUDA 12.1 builds ship no kernels for that architecture.
TORCH_CU128_INDEX = 'https://download.pytorch.org/whl/cu128'
TORCH_CU121_INDEX = 'https://download.pytorch.org/whl/cu121'

# Check GPU and pick a matching PyTorch build
try:
    import torch
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        print(f"Detected GPU: {gpu_name}")

        if any(tag in gpu_name for tag in ('5070', '5080', '5090')):
            torch_index = TORCH_CU128_INDEX
        elif '4090' in gpu_name:
            torch_index = TORCH_CU121_INDEX
        else:
            torch_index = None

        if torch_index:
            print("⚠️ Detected newer GPU, upgrading PyTorch for compatibility...")

            if 'google.colab' in sys.modules:
                get_ipython().system('pip uninstall -y torch torchvision torchaudio')
                get_ipython().system(f'pip install torch torchvision torchaudio --index-url {torch_index}')
            else:
                # Uninstall is best-effort: the packages may be partly absent.
                subprocess.run([sys.executable, '-m', 'pip', 'uninstall', '-y', 'torch', 'torchvision', 'torchaudio'],
                               capture_output=True)
                subprocess.run([sys.executable, '-m', 'pip', 'install', 'torch', 'torchvision', 'torchaudio',
                                '--index-url', torch_index], check=True)
            print("✅ PyTorch upgraded for newer GPU support")
            # torch was imported above, so this kernel keeps using the old
            # build until it is restarted.
            print("ℹ️ Restart the kernel before continuing so the upgraded PyTorch is the one that gets imported")
            # NOTE(review): requirements.txt (installed next) may pin torch and
            # replace this build again - confirm against that file.
except ImportError:
    print("PyTorch not yet installed, will install with requirements")

# Use appropriate pip installation method
if 'google.colab' in sys.modules:
    get_ipython().system('python -m pip install --quiet --upgrade pip')
    get_ipython().system('pip install --quiet -r requirements.txt')
else:
    # Local environment - use subprocess for better error handling
    try:
        subprocess.run([sys.executable, '-m', 'pip', 'install', '--upgrade', 'pip'], check=True)
        subprocess.run([sys.executable, '-m', 'pip', 'install', '-r', 'requirements.txt'], check=True)
        print('✅ Backend requirements installed successfully')
    except subprocess.CalledProcessError as e:
        # Not fatal here: the import check below decides whether to stop.
        print(f'❌ Failed to install requirements: {e}')
        print('You may need to install requirements manually: pip install -r requirements.txt')

# Check if critical packages installed successfully
try:
    import uvicorn
    import fastapi
    print("✓ Backend dependencies installed")
except ImportError as e:
    print(f"✗ Backend dependency missing: {e}")
    sys.exit(1)

# Return to base directory
os.chdir(BASE_DIR)

# Install extras with compatible versions
print("\nInstalling additional dependencies with compatible versions...")
extra_packages = [
    'opencv-python==4.8.1.78',  # Compatible with numpy 1.26.4
    'tqdm',
    'pillow',
    'open_clip_torch',
]

for pkg in extra_packages:
    print(f"Installing {pkg}...")
    if 'google.colab' in sys.modules:
        get_ipython().system(f'pip install --quiet {pkg}')
    else:
        # No check=True: a failed extra is reported as a warning and the
        # import verification below decides whether the cell must stop.
        result = subprocess.run([sys.executable, '-m', 'pip', 'install', pkg],
                                capture_output=True, text=True)
        if result.returncode != 0 and 'already satisfied' not in result.stdout:
            print(f'⚠️ Warning installing {pkg}: {result.stderr}')

# Verify extraction dependencies and CUDA compatibility
try:
    import cv2
    import open_clip
    import torch
    import numpy as np

    print("\n=== System Status ===")
    print(f"NumPy version: {np.__version__}")
    print(f"OpenCV version: {cv2.__version__}")
    print(f"PyTorch version: {torch.__version__}")

    if torch.cuda.is_available():
        print("CUDA available: Yes")
        print(f"CUDA version: {torch.version.cuda}")
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"GPU compute capability: {torch.cuda.get_device_capability(0)}")

        # Run one tiny matmul on the device: a build without kernels for this
        # GPU raises RuntimeError here.
        try:
            test_tensor = torch.randn(2, 2).cuda()
            _ = test_tensor @ test_tensor.T
            print("✓ CUDA operations working correctly")
            device_info = 'cuda'
        except RuntimeError as e:
            print(f"✗ CUDA operations failed: {e}")
            print("⚠️ Falling back to CPU")
            device_info = 'cpu'
    else:
        print("CUDA available: No")
        device_info = 'cpu'

    print(f"✓ All dependencies installed (torch device: {device_info})")

except ImportError as e:
    print(f"✗ Extraction dependency missing: {e}")
    sys.exit(1)

In [None]:
# Step 2) Install backend + extractor dependencies
%cd /content/aic-25/aic-24-BE
!python -m pip install --quiet --upgrade pip
!pip install --quiet -r requirements.txt
# Check if critical packages installed successfully
import sys
try:
    import uvicorn
    import fastapi
    print("✓ Backend dependencies installed")
except ImportError as e:
    print(f"✗ Backend dependency missing: {e}")
    sys.exit(1)

%cd /content/aic-25
# Extras for local extraction & utilities
!pip install --quiet opencv-python tqdm pillow open_clip_torch
# Verify extraction dependencies
try:
    import cv2
    import open_clip
    import torch
    print(f"✓ Extraction dependencies installed (torch device: {'cuda' if torch.cuda.is_available() else 'cpu'})")
except ImportError as e:
    print(f"✗ Extraction dependency missing: {e}")
    sys.exit(1)

In [None]:
# Step 3) Upload AIC_2025_dataset_download_link.csv
import os
import sys

if 'google.colab' in sys.modules:
    # Google Colab: Use file upload widget
    from google.colab import files
    print('Please upload your AIC_2025_dataset_download_link.csv file:')
    uploaded = files.upload()  # choose AIC_2025_dataset_download_link.csv
    if not uploaded:
        # A cancelled dialog yields an empty mapping; fail with a readable
        # message instead of a bare StopIteration.
        raise FileNotFoundError('No file was uploaded; re-run this cell and choose AIC_2025_dataset_download_link.csv')
    CSV_PATH = next(iter(uploaded))
    print('Using uploaded CSV:', CSV_PATH)
else:
    # Local environment: Look for CSV file or prompt for path
    possible_paths = [
        'AIC_2025_dataset_download_link.csv',
        'dataset/AIC_2025_dataset_download_link.csv',
        '../AIC_2025_dataset_download_link.csv',
        os.path.expanduser('~/Downloads/AIC_2025_dataset_download_link.csv'),
    ]

    CSV_PATH = next((p for p in possible_paths if os.path.exists(p)), None)

    if CSV_PATH:
        print(f'✅ Found dataset CSV at: {CSV_PATH}')
    else:
        print('❌ AIC_2025_dataset_download_link.csv not found in common locations.')
        print('Checked locations:', possible_paths)
        typed = input('Please enter the full path to AIC_2025_dataset_download_link.csv: ').strip()
        CSV_PATH = os.path.expanduser(typed)  # accept ~/... as typed by the user
        if not os.path.exists(CSV_PATH):
            raise FileNotFoundError(f'CSV file not found at: {CSV_PATH}')
        print(f'✅ Using CSV file: {CSV_PATH}')

# Later cells change the working directory before they read CSV_PATH, so a
# relative path found here would stop resolving; store it as absolute.
CSV_PATH = os.path.abspath(CSV_PATH)
print(f'Final CSV path: {CSV_PATH}')

In [None]:
# Step 3) Download dataset assets to example_dataset/ and extract
import shlex

# Resolve the CSV location before leaving the directory it was found in;
# afterwards a relative CSV_PATH would point inside BASE_DIR instead.
CSV_PATH = os.path.abspath(CSV_PATH)

os.chdir(BASE_DIR)
print(f'Working in: {os.getcwd()}')

# Create dataset directory
dataset_dir = os.path.join(BASE_DIR, 'example_dataset')
os.makedirs(dataset_dir, exist_ok=True)

if 'google.colab' in sys.modules:
    # The path is shell-quoted: file names containing spaces or parentheses
    # (e.g. a re-uploaded copy named "... (1).csv") would otherwise be split
    # or rejected by the shell.
    get_ipython().system(f'python tools/aic_cli.py download-dataset --csv {shlex.quote(CSV_PATH)} --outdir example_dataset --extract')
    get_ipython().system('ls -la example_dataset | head -n 50')
else:
    # Local environment - use subprocess (argument list, no shell involved)
    try:
        cmd = [sys.executable, 'tools/aic_cli.py', 'download-dataset',
               '--csv', CSV_PATH, '--outdir', 'example_dataset', '--extract']
        print(f'Running: {" ".join(cmd)}')
        result = subprocess.run(cmd, check=True, capture_output=True, text=True, cwd=BASE_DIR)
        print('✅ Dataset download completed')
        if result.stdout:
            print('Output:', result.stdout[-500:])  # Last 500 chars
    except subprocess.CalledProcessError as e:
        print(f'❌ Dataset download failed: {e}')
        if e.stderr:
            print('Error:', e.stderr[-500:])
        raise

# Sanity checks for required directories (reported only, never fatal)
required_dataset_dirs = [
    'example_dataset/map-keyframes',
    'example_dataset/clip-features-32',
    'example_dataset/media-info',
    'example_dataset/keyframes',
]

for check_dir in required_dataset_dirs:
    if os.path.exists(os.path.join(BASE_DIR, check_dir)):
        print(f'✅ Found: {check_dir}')
    else:
        print(f'⚠️ MISSING: {check_dir}')

print(f'Dataset contents: {os.listdir(dataset_dir)}')

In [None]:
os.chdir(BASE_DIR)
if PATH_CHOICE != 'recompute':
    print('Skipping Recompute (PATH_CHOICE!=recompute)')
    raise SystemExit(0)  # stops this cell only; the recompute path is opt-in

VIDEOS_DIR = os.path.join(BASE_DIR, 'example_dataset', 'Videos_L21_a')
MODEL = 'ViT-L-16-SigLIP-256'  # Changed from 384 to avoid OOM
PRETRAINED = 'webli'  # Correct pretrained tag for SigLIP models
CLIP_LEN = 1.5
DECODE_FPS = 1.5  # Reduced from 2.0 to save memory
TARGET_FPS = 0.8  # Reduced from 1.0 to save memory

print(f'Videos directory: {VIDEOS_DIR}')
print(f'Videos directory exists: {os.path.exists(VIDEOS_DIR)}')
if os.path.exists(VIDEOS_DIR):
    print(f'Videos found: {os.listdir(VIDEOS_DIR)[:5]}')  # Show first 5
print(f'Model: {MODEL} with {PRETRAINED}')

# 4B.1 Smart sampling → extract exact frames (original indices) - with memory-optimized adaptive sampling
frames_dir = os.path.join(BASE_DIR, 'aic-24-BE', 'data', 'video_frames')
os.makedirs(frames_dir, exist_ok=True)
print(f'Frames directory: {frames_dir}')

# Release cached GPU memory held by this kernel before starting
try:
    import torch
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        import gc
        gc.collect()
        free_bytes, total_bytes = torch.cuda.mem_get_info()
        free_gb = free_bytes / 1024**3
        total_gb = total_bytes / 1024**3
        print(f'🔧 GPU Memory cleared: {free_gb:.2f}GB free / {total_gb:.2f}GB total')

        if free_gb < 3:
            print(f'🚨 CRITICAL: Only {free_gb:.2f}GB free - strongly recommend restarting kernel!')
except ImportError:
    # Without torch in this kernel there is nothing to release.
    pass

cmd = [
    sys.executable, 'tools/aic_cli.py', 'sample-smart',
    '--strategy', 'clip-delta',
    '--videos-dir', VIDEOS_DIR,
    '--frames-dir', frames_dir,
    '--decode-fps', str(DECODE_FPS),
    '--target-fps', str(TARGET_FPS),
    '--model', MODEL,
    '--pretrained', PRETRAINED,
    '--adaptive',
    '--batch-size', '2',  # Very conservative batch size to avoid OOM
]

print(f'Running smart sampling: {" ".join(cmd)}')
if 'google.colab' in sys.modules:
    get_ipython().system(f'python tools/aic_cli.py sample-smart --strategy clip-delta --videos-dir "{VIDEOS_DIR}" --frames-dir "{frames_dir}" --decode-fps {DECODE_FPS} --target-fps {TARGET_FPS} --model {MODEL} --pretrained {PRETRAINED} --adaptive --batch-size 2')
else:
    try:
        result = subprocess.run(cmd, cwd=BASE_DIR, check=True, capture_output=True, text=True)
        print('✅ Smart sampling completed')
        if result.stdout:
            print('Output:', result.stdout[-1000:])
    except subprocess.CalledProcessError as e:
        print(f'❌ Smart sampling failed with exit code {e.returncode}')
        print(f'Command: {" ".join(e.cmd)}')
        if e.stdout:
            print('STDOUT:')
            print(e.stdout)
        if e.stderr:
            print('STDERR:')
            print(e.stderr)

        # Deliberately not re-raised: the fallback below gets a chance first.
        print('\n⚠️ Falling back to direct smart_sampling.py execution...')

# Fallback: no frames were written, so run the sampler script directly with
# the smallest batch size and extract the frames it selects.
if not os.path.exists(frames_dir) or not os.listdir(frames_dir):
    print('🔄 Attempting direct smart_sampling.py execution with minimal batch size:')
    # Absolute path so the sampler, the extractor and the existence check all
    # agree on one file regardless of the working directory.
    frame_list = os.path.join(BASE_DIR, 'selected_frames.csv')
    if os.path.exists(frame_list):
        # A list left over from an earlier run must not pass for this run's output.
        os.remove(frame_list)
    direct_cmd = [
        sys.executable, 'tools/smart_sampling.py',
        '--videos-dir', VIDEOS_DIR,
        '--strategy', 'clip-delta',
        '--decode-fps', str(DECODE_FPS),
        '--target-fps', str(TARGET_FPS),
        '--model', MODEL,
        '--pretrained', PRETRAINED,
        '--adaptive',
        '--batch-size', '1',  # Absolute minimum batch size
        '--out-csv', frame_list,
    ]

    try:
        direct_result = subprocess.run(direct_cmd, cwd=BASE_DIR, check=True, capture_output=True, text=True)
        print('✅ Direct smart sampling completed')

        if not os.path.exists(frame_list):
            # Without a frame list nothing can be extracted; stop here rather
            # than let the encoding stage run on an empty frames directory.
            raise FileNotFoundError(f'smart_sampling.py finished but did not write {frame_list}')

        # Now extract frames using crop_frame.py
        crop_cmd = [
            sys.executable, 'aic-24-BE/data_processing/crop_frame.py',
            '--input-dir', VIDEOS_DIR,
            '--output-dir', frames_dir,
            '--frame-list', frame_list,
        ]
        crop_result = subprocess.run(crop_cmd, cwd=BASE_DIR, check=True, capture_output=True, text=True)
        print('✅ Frame extraction completed')
    except subprocess.CalledProcessError as direct_e:
        print(f'❌ Direct smart sampling also failed: {direct_e}')
        if direct_e.stderr:
            print('STDERR:', direct_e.stderr)
        raise

# 4B.2 Encode sampled frames with SigLIP-256 and write shards
backend_dir = os.path.join(BASE_DIR, 'aic-24-BE')
os.chdir(backend_dir)
print(f'Working in backend directory: {os.getcwd()}')

import gc
import os
import pickle
from pathlib import Path

import numpy as np
import open_clip
import torch
from PIL import Image
from tqdm import tqdm

# Release cached allocations before the model is loaded
torch.cuda.empty_cache()
gc.collect()

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Loading {MODEL} with {PRETRAINED} on {device}...")

# Memory report before loading (CUDA only)
if device == 'cuda':
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    free_gb = free_bytes / 1024**3
    total_gb = total_bytes / 1024**3
    print(f"GPU Memory before loading: {free_gb:.2f}GB free / {total_gb:.2f}GB total")

    if free_gb < 2:
        print("🚨 CRITICAL: Less than 2GB free - model loading will likely fail!")
        print("💡 Please restart your Python kernel to free GPU memory")

# Drop a model left over from a previous run of this cell, if there is one
try:
    del model
except NameError:
    pass
torch.cuda.empty_cache()
gc.collect()

model, _, preprocess = open_clip.create_model_and_transforms(MODEL, pretrained=PRETRAINED, device=device)
model.eval()

# Memory report after loading (total_gb comes from the report above)
if device == 'cuda':
    free_gb = torch.cuda.mem_get_info()[0] / 1024**3
    print(f"GPU Memory after loading: {free_gb:.2f}GB free / {total_gb:.2f}GB total")

# Both paths are relative to the backend directory entered above
frames_root = 'data/video_frames'
out_dir = 'data/clip_features'
os.makedirs(out_dir, exist_ok=True)

# Stop early when there is nothing to encode from
if not os.path.exists(frames_root):
    raise FileNotFoundError(f"Frames directory not found: {frames_root}. Smart sampling may have failed.")

def encode_batch(img_paths, target_size=256, batch_size=4):
    """Encode image files into L2-normalised feature vectors.

    Relies on the notebook-level ``model``, ``preprocess`` and ``device``.
    Each image is converted to RGB, resized to ``target_size`` x
    ``target_size`` and passed through ``preprocess``. The tensors are then
    encoded in sub-batches of ``batch_size``. A sub-batch that runs out of
    GPU memory is retried one image at a time.

    Args:
        img_paths: paths of the images to encode, in output order.
        target_size: edge length of the square resize applied before ``preprocess``.
        batch_size: number of images sent to the model per forward pass.

    Returns:
        A float32 NumPy array with one row per input image.

    Raises:
        ValueError: if ``img_paths`` is empty (there is nothing to concatenate).
    """
    def _embed(tensors):
        # One forward pass; the result is normalised and moved to host memory.
        with torch.no_grad():
            batch_tensor = torch.stack(tensors).to(device)
            feats = model.encode_image(batch_tensor)
            feats = feats / feats.norm(dim=-1, keepdim=True)
            return feats.cpu().float().numpy()

    ims = []
    for p in img_paths:
        # The context manager closes the file handle once the pixels have
        # been copied into the converted image.
        with Image.open(p) as raw:
            im = raw.convert('RGB')
        # Resize to target resolution (256x256 for balance of context vs memory)
        im = im.resize((target_size, target_size), Image.LANCZOS)
        ims.append(preprocess(im))

    all_feats = []
    for i in range(0, len(ims), batch_size):
        sub_batch = ims[i:i+batch_size]
        try:
            all_feats.append(_embed(sub_batch))
        except torch.cuda.OutOfMemoryError:
            print(f"🚨 OOM even with batch size {len(sub_batch)}! Processing one by one...")
            # Give back whatever the failed attempt left cached before retrying.
            torch.cuda.empty_cache()
            for single_tensor in sub_batch:
                all_feats.append(_embed([single_tensor]))
                torch.cuda.empty_cache()
        else:
            # Clear GPU cache after every sub-batch
            if device == 'cuda':
                torch.cuda.empty_cache()

    return np.concatenate(all_feats, axis=0)

# Process videos with ultra-conservative memory management
checkpoint_dir = 'data/checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

# With 8 frames per outer batch this writes a checkpoint every 40 frames.
CHECKPOINT_EVERY_BATCHES = 5


def _dump_pickle_atomically(obj, path):
    """Pickle obj to path through a temporary sibling file.

    The data is written to ``<path>.tmp`` and moved into place with
    os.replace, so an interrupted run cannot leave a truncated file under the
    final name. That matters because an existing feature file is treated as
    "already processed" below.
    """
    tmp_path = f'{path}.tmp'
    with open(tmp_path, 'wb') as f:
        pickle.dump(obj, f)
    os.replace(tmp_path, path)


for vid in sorted(os.listdir(frames_root)):
    vid_dir = os.path.join(frames_root, vid)
    if not os.path.isdir(vid_dir):
        continue

    checkpoint_file = os.path.join(checkpoint_dir, f'{vid}.checkpoint')
    final_file = os.path.join(out_dir, f'{vid}.pkl')

    if os.path.exists(final_file):
        print(f"Skipping {vid}: already processed")
        continue

    imgs = [f for f in os.listdir(vid_dir) if f.lower().endswith('.jpg')]
    if not imgs:
        continue
    # Frame files are ordered by the integer in their file name.
    imgs = sorted(imgs, key=lambda x: int(os.path.splitext(x)[0]))
    file_paths = [f'./data/video_frames/{vid}/{name}' for name in imgs]

    feats_list = []
    bs = 8  # Ultra-conservative batch size

    start_idx = 0
    if os.path.exists(checkpoint_file):
        # NOTE: checkpoints are pickles written by this notebook; do not load
        # checkpoint files from an untrusted source.
        try:
            with open(checkpoint_file, 'rb') as f:
                checkpoint = pickle.load(f)
            feats_list = checkpoint['feats_list']
            start_idx = checkpoint['last_idx']
        except (pickle.UnpicklingError, EOFError, KeyError, TypeError) as err:
            # A damaged checkpoint only costs the saved progress for this video.
            print(f"⚠️ Ignoring unreadable checkpoint for {vid} ({err}); re-encoding from the first frame")
            feats_list = []
            start_idx = 0
        else:
            print(f"Resuming {vid} from frame {start_idx}")

    for i in tqdm(range(start_idx, len(imgs), bs), desc=f'Encoding {vid}'):
        batch_paths = [os.path.join(vid_dir, name) for name in imgs[i:i+bs]]
        batch_feats = encode_batch(batch_paths, batch_size=2)  # Ultra-small sub-batch
        feats_list.append(batch_feats)

        # Periodic checkpoint; last_idx is the first frame not yet encoded.
        batches_done = (i - start_idx) // bs + 1
        if batches_done % CHECKPOINT_EVERY_BATCHES == 0:
            _dump_pickle_atomically({
                'feats_list': feats_list,
                'last_idx': i + bs,
            }, checkpoint_file)

        # Force garbage collection and memory cleanup after every batch
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            if i % 20 == 0:
                free_gb = torch.cuda.mem_get_info()[0] / 1024**3
                print(f"  GPU Memory: {free_gb:.2f}GB free")

    # Finalize features: one (paths, feature matrix) tuple per video
    feats_np = np.concatenate(feats_list, axis=0)
    _dump_pickle_atomically((file_paths, feats_np), final_file)

    # The checkpoint is obsolete once the final file is in place
    if os.path.exists(checkpoint_file):
        os.remove(checkpoint_file)

    print(f"✓ Encoded {vid}: {feats_np.shape[0]} features at 256px resolution")
    
# 4B.3 Build model and patch .env
print("Building NitzcheCLIP model...")
from nitzche_clip import NitzcheCLIP
m = NitzcheCLIP(out_dir)
os.makedirs('models', exist_ok=True)
m.save('models/clip_siglip.pkl')

envp = Path('.env')
content = envp.read_text(encoding='utf-8') if envp.exists() else ''

# Settings the backend must pick up: the new feature model plus the matching
# text encoder. Order matters only for keys that have to be appended.
env_overrides = {
    'MODEL_PATH': '"./models/"',
    'MODEL_16': '"clip_siglip.pkl"',
    'CLIP_MODEL_NAME': f'"{MODEL}"',
    'CLIP_PRETRAINED': f'"{PRETRAINED}"',
}

out = []
keys_seen = set()
for line in content.splitlines():
    stripped = line.strip()
    hit = next((k for k in env_overrides if stripped.startswith(f'{k}=')), None)
    if hit is None:
        # Unrelated line: keep it untouched
        out.append(line)
    else:
        # Existing assignment: rewrite it in place
        out.append(f'{hit}={env_overrides[hit]}')
        keys_seen.add(hit)

# Keys that were not present yet go to the end of the file
out.extend(f'{k}={v}' for k, v in env_overrides.items() if k not in keys_seen)

envp.write_text('\n'.join(out)+'\n', encoding='utf-8')
print('✅ Recompute complete. Ultra-memory-optimized smart-sampled frames + SigLIP-256 features with intelligent adaptive sampling. .env updated.')

In [None]:
# Step 5) Start backend API (daemon)
os.chdir(BASE_DIR)
print(f'Starting backend from: {os.getcwd()}')

# Launch the server through aic_cli, then ask it for its status
if 'google.colab' not in sys.modules:
    # Local environment
    try:
        serve_args = [sys.executable, 'tools/aic_cli.py', 'serve', '--port', '8000', '--run', '--daemon', '--no-reload']
        subprocess.Popen(serve_args, cwd=BASE_DIR)
        print('✅ Backend started in daemon mode')

        status_args = [sys.executable, 'tools/aic_cli.py', 'serve-status']
        status = subprocess.run(status_args, cwd=BASE_DIR, capture_output=True, text=True)
        if status.stdout:
            print('Status:', status.stdout.strip())
    except Exception as e:
        print(f'❌ Failed to start backend: {e}')
else:
    get_ipython().system('python tools/aic_cli.py serve --port 8000 --run --daemon --no-reload')
    get_ipython().system('python tools/aic_cli.py serve-status')

# Poll the docs endpoint until the API answers (at most 30 tries)
import time, requests
print('Testing backend connectivity...')
attempt = 0
reachable = False
while attempt < 30 and not reachable:
    try:
        r = requests.get('http://localhost:8000/docs', timeout=2)
    except Exception:
        # Only the first few failures are reported to keep the output short
        if attempt < 5:
            print(f'Attempt {attempt + 1}: Waiting for backend...')
        time.sleep(1)
        attempt += 1
    else:
        print(f'✅ Backend reachable: HTTP {r.status_code}')
        reachable = True

if not reachable:
    print('❌ Backend not reachable after 30 seconds')

In [None]:
# Step 6) Prepare a KIS query
# Work from BASE_DIR rather than a fixed /content path so the cell also runs
# outside Colab.
os.chdir(BASE_DIR)
query_text = 'Cảnh quay bằng flycam một cây cầu ở TP Hồ Chí Minh, tiếp theo đến cảnh quay tòa nhà Bitexco. Một vài cảnh sau đó chuyển qua quay hình ảnh hồ gươm tại Hà Nội.'  # edit your KIS query here
print('Query:', (query_text[:120] + ('...' if len(query_text) > 120 else '')))


In [None]:
# Step 6) Export KIS CSV to submission/
import shlex

os.chdir(BASE_DIR)

# Use the query defined by the preparation cell; the built-in text is only a
# fallback when no query has been set in this kernel. Re-assigning it here
# unconditionally would discard an edited query.
try:
    query_text
except NameError:
    query_text = 'Cảnh quay bằng flycam một cây cầu ở TP Hồ Chí Minh, tiếp theo đến cảnh quay tòa nhà Bitexco. Một vài cảnh sau đó chuyển qua quay hình ảnh hồ gươm tại Hà Nội.'
print('Query:', (query_text[:120] + ('...' if len(query_text) > 120 else '')))

submission_dir = os.path.join(BASE_DIR, 'submission')
os.makedirs(submission_dir, exist_ok=True)

export_cmd = [
    sys.executable, 'tools/aic_cli.py', 'export',
    '--text', query_text,
    '--task', 'kis',
    '--name', 'query-1',
    '--api', 'http://localhost:8000',
    '--outdir', 'submission',
    '--wait-api', '30',
]

print('Exporting KIS query results...')
if 'google.colab' in sys.modules:
    # Build the shell line from the same argument list and quote every
    # argument: a query containing quotes, `$` or backticks would otherwise
    # be mangled or executed by the shell.
    colab_cmd = ' '.join(shlex.quote(arg) for arg in ['python', *export_cmd[1:]])
    get_ipython().system(colab_cmd)
    get_ipython().system('echo "Generated files:" && ls -la submission')
    get_ipython().system('echo "Preview:" && head -n 5 submission/query-1-kis.csv')
else:
    try:
        result = subprocess.run(export_cmd, cwd=BASE_DIR, check=True, capture_output=True, text=True)
        print('✅ Export completed successfully')

        # Show generated files
        files = os.listdir(submission_dir)
        print(f'Generated files: {files}')

        # Preview the first five lines of the CSV
        csv_file = os.path.join(submission_dir, 'query-1-kis.csv')
        if os.path.exists(csv_file):
            # Explicit encoding: the platform default is not UTF-8 everywhere.
            with open(csv_file, 'r', encoding='utf-8') as f:
                print('Preview:')
                for i, line in enumerate(f):
                    if i >= 5:
                        break
                    print(line.strip())

    except subprocess.CalledProcessError as e:
        # Reported but not re-raised, so the notebook can continue.
        print(f'❌ Export failed: {e}')
        if e.stderr:
            print('Error:', e.stderr)

In [None]:
# Step 7) Zip for Codabench
os.chdir(BASE_DIR)
print(f'Creating submission zip from: {os.getcwd()}')

zip_cmd = [
    sys.executable, 'tools/aic_cli.py', 'zip-submission',
    '--outdir', 'submission',
    '--name', 'aic25_submission.zip'
]

if 'google.colab' not in sys.modules:
    # Local run: build the archive and report where it ended up
    try:
        result = subprocess.run(zip_cmd, cwd=BASE_DIR, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f'❌ Zip creation failed: {e}')
        if e.stderr:
            print('Error:', e.stderr)
    else:
        print('✅ Submission zip created successfully')

        zip_path = os.path.join(BASE_DIR, 'aic25_submission.zip')
        if not os.path.exists(zip_path):
            print('❌ Zip file not found after creation')
        else:
            file_size = os.path.getsize(zip_path) / (1024 * 1024)  # MB
            print(f'📦 Submission file: {zip_path} ({file_size:.2f} MB)')
            print('Ready for upload to Codabench!')
else:
    # Colab run: build the archive, then hand it to the browser for download
    get_ipython().system('python tools/aic_cli.py zip-submission --outdir submission --name aic25_submission.zip')
    from google.colab import files as colab_files
    colab_files.download('aic25_submission.zip')