In [6]:
from deepface import DeepFace

import os
import numpy as np

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import umap
import hdbscan
from sklearn.metrics.pairwise import cosine_distances
from PIL import Image
from collections import defaultdict, Counter
import pickle
from tqdm import tqdm
import time
import hashlib
import shutil

In [7]:
def show_cache_status(cache_dir="coba_cache"):
    """
    Print a human-readable report about the face cache stored in ``cache_dir``.

    The report covers which of the three cache files exist (``faces.npy``,
    ``metadata.pkl``, ``cache_info.pkl``), how many embeddings / metadata
    entries are stored, the on-disk sizes, and the creation details recorded
    in ``cache_info.pkl`` when that file is readable.

    Args:
        cache_dir: Directory that holds the cache files.

    Returns:
        None. Everything is reported via ``print``; read errors are reported
        in the output instead of being raised.
    """
    print("🔍 CACHE STATUS REPORT")
    print("=" * 40)

    faces_path = os.path.join(cache_dir, "faces.npy")
    metadata_path = os.path.join(cache_dir, "metadata.pkl")
    cache_info_path = os.path.join(cache_dir, "cache_info.pkl")

    if not os.path.exists(cache_dir):
        print("❌ Cache directory does not exist")
        return

    # Check core cache files
    has_faces = os.path.exists(faces_path)
    has_metadata = os.path.exists(metadata_path)
    has_cache_info = os.path.exists(cache_info_path)

    print(f"📁 Cache Directory: {cache_dir}")
    print(f"📄 Faces file: {'✅ Exists' if has_faces else '❌ Missing'}")
    print(f"📄 Metadata file: {'✅ Exists' if has_metadata else '❌ Missing'}")
    print(f"📄 Cache info: {'✅ Exists' if has_cache_info else '❌ Missing'}")

    if has_faces and has_metadata:
        try:
            # Load and show cache contents.
            # NOTE: pickle.load executes arbitrary code from the file; only
            # point this at cache directories written by this notebook.
            faces = np.load(faces_path)
            with open(metadata_path, "rb") as f:
                metadata = pickle.load(f)

            print(f"\n📊 CACHED DATA:")
            print(f"   • Face embeddings: {len(faces)}")

            # Only show dimensions if faces exist (an empty cache is saved as
            # a 1-D array, which has no second axis to report)
            if len(faces) > 0:
                print(f"   • Face dimensions: {faces.shape[1]}D")
            else:
                print(f"   • Face dimensions: N/A (empty cache)")

            print(f"   • Metadata entries: {len(metadata)}")
            print(f"   • File sizes: faces={os.path.getsize(faces_path)/1024/1024:.1f}MB, "
                  f"metadata={os.path.getsize(metadata_path)/1024/1024:.1f}MB")

            # Show cache info if available
            if has_cache_info:
                try:
                    with open(cache_info_path, "rb") as f:
                        cache_info = pickle.load(f)

                    print(f"\n📅 CACHE DETAILS:")
                    print(f"   • Created: {cache_info.get('created_time', 'Unknown')}")
                    print(f"   • Source files: {cache_info.get('folder_stats', {}).get('total_files', 'Unknown')}")
                    print(f"   • Extraction params: {cache_info.get('extraction_params', {})}")

                except Exception:
                    # Catch Exception rather than using a bare except so that
                    # KeyboardInterrupt / SystemExit still propagate.
                    print("\n⚠️  Cache info file corrupted")

            if len(faces) > 0:
                print(f"\n✅ Cache is ready to use!")
            else:
                print(f"\n⚠️  Cache is empty - no faces found")

        except Exception as e:
            print(f"\n❌ Error reading cache: {e}")
    else:
        print(f"\n❌ Cache is incomplete - missing required files")

    print("\n💡 To force re-extraction, set FORCE_REEXTRACT=True in the script")


In [8]:
def get_folder_stats(folder_path):
    """
    Summarise the image files under ``folder_path`` for cache validation.

    The folder is walked recursively and only files ending in .jpg, .jpeg,
    .png or .bmp (case-insensitive) are counted.

    Args:
        folder_path: Root folder to scan.

    Returns:
        dict with the keys:
            'total_files':      number of image files found
            'total_size':       sum of their sizes in bytes
            'folder_structure': {path relative to folder_path: image count}
            'last_modified':    largest st_mtime among the images (0 if none)

        The scan is best-effort: files that cannot be stat'ed are skipped and
        an unreadable or missing root yields the zeroed dict.
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp')
    stats = {
        'total_files': 0,
        'total_size': 0,
        'folder_structure': {},
        'last_modified': 0
    }

    try:
        for root, _dirs, files in os.walk(folder_path):
            for file in files:
                if not file.lower().endswith(image_extensions):
                    continue

                file_path = os.path.join(root, file)
                try:
                    file_stat = os.stat(file_path)
                    stats['total_files'] += 1
                    stats['total_size'] += file_stat.st_size
                    stats['last_modified'] = max(stats['last_modified'], file_stat.st_mtime)

                    # Track folder structure
                    rel_path = os.path.relpath(root, folder_path)
                    stats['folder_structure'][rel_path] = stats['folder_structure'].get(rel_path, 0) + 1
                except (OSError, ValueError):
                    # File vanished / is unreadable (OSError) or relpath could
                    # not be computed (ValueError): skip just this file.
                    continue
    except Exception:
        # Keep the best-effort contract (return whatever was gathered) but,
        # unlike a bare except, let KeyboardInterrupt / SystemExit through.
        pass

    return stats

In [9]:
_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp')
_FACE_MODEL_NAME = "Facenet512"
_FACE_DETECTOR_BACKEND = "retinaface"
_FACE_PADDING_FACTOR = 0.3  # extra margin added around each detected face box


def _load_cached_faces(event_folder_path, faces_path, metadata_path, cache_info_path,
                       min_face_size, max_face_size):
    """Return (faces, metadata) from the cache if it is still valid, else None."""
    print("🔍 Checking cached face data...")

    # Load cache info for validation.
    # NOTE: pickle.load runs arbitrary code from the file; the cache directory
    # must only contain files written by this notebook.
    cache_info = {}
    if os.path.exists(cache_info_path):
        try:
            with open(cache_info_path, "rb") as f:
                cache_info = pickle.load(f)
        except Exception:
            print("⚠️  Cache info corrupted, will re-extract")
            return None

    try:
        # Check if source folder structure changed. A missing cache_info file
        # leaves cache_info empty, which never matches and forces re-extraction.
        current_stats = get_folder_stats(event_folder_path)
        if current_stats != cache_info.get('folder_stats', {}):
            print("⚠️  Source images changed, cache invalid - will re-extract")
            return None

        # A cache built with other size filters holds a different set of
        # faces. Caches written before max_face_size existed used 800.
        cached_params = cache_info.get('extraction_params', {})
        if (cached_params.get('min_face_size') != min_face_size
                or cached_params.get('max_face_size', 800) != max_face_size):
            print("⚠️  Extraction parameters changed, cache invalid - will re-extract")
            return None

        print("✅ Loading cached face data...")
        faces = np.load(faces_path)
        with open(metadata_path, "rb") as f:
            metadata = pickle.load(f)

        print(f"📂 Loaded {len(faces)} cached face embeddings")
        print(f"📂 Loaded {len(metadata)} cached metadata entries")
        print(f"📂 Cache created: {cache_info.get('created_time', 'Unknown')}")
        return faces, metadata

    except Exception as e:
        print(f"❌ Error loading cache: {e}")
        print("🔄 Will re-extract faces...")
        return None


def _collect_faces_from_image(img_path, img_name, album_name, event_id, cropped_dir,
                              crop_prefix, min_face_size, max_face_size, faces, metadata):
    """Embed every face in one image, save padded crops, and append to faces/metadata in place."""
    result = DeepFace.represent(
        img_path=img_path,
        model_name=_FACE_MODEL_NAME,
        detector_backend=_FACE_DETECTOR_BACKEND,
        align=True,
        enforce_detection=False,
    )

    # Open the source once per image (not twice per face) and release the file
    # handle immediately; convert() returns a fully loaded in-memory copy.
    # NOTE(review): PIL does not apply EXIF orientation here; if DeepFace's
    # loader does, boxes on EXIF-rotated photos may not line up - confirm.
    with Image.open(img_path) as source_img:
        img = source_img.convert("RGB")
    img_width, img_height = img.size
    img_stem = os.path.splitext(img_name)[0]

    for face_idx, face_data in enumerate(result):
        face_vector = face_data['embedding']
        facial_area = face_data['facial_area']
        x, y, w, h = facial_area['x'], facial_area['y'], facial_area['w'], facial_area['h']
        # .get keeps the image usable if the installed DeepFace version does
        # not report a confidence (presumably older releases - TODO confirm).
        face_confidence = face_data.get('face_confidence')

        # With enforce_detection=False an image without a detectable face
        # presumably comes back as one box spanning the whole frame (confirm
        # against the DeepFace docs); such a box is not a real face. The
        # absolute max_face_size cap is the original (800 px) heuristic.
        fills_frame = w >= img_width and h >= img_height
        too_large = max_face_size is not None and (w >= max_face_size or h >= max_face_size)
        if fills_frame or too_large:
            print(f"⚠️ Face size matches full image in {img_path} face {face_idx}, skipping")
            continue

        # Also keep min size check
        if w < min_face_size or h < min_face_size:
            print(f"⚠️ Face too small in {img_path} face {face_idx}, skipping") 
            continue

        # Grow the box by the padding factor on every side, clamped to the image
        padding_x = int(w * _FACE_PADDING_FACTOR)
        padding_y = int(h * _FACE_PADDING_FACTOR)
        crop_box = (
            max(0, x - padding_x),
            max(0, y - padding_y),
            min(img_width, x + w + padding_x),
            min(img_height, y + h + padding_y),
        )
        cropped_img = img.crop(crop_box)

        # Save cropped face with unique name (crop_prefix carries the album
        # name in nested mode so equal filenames in two albums do not clash)
        cropped_name = f"{crop_prefix}{img_stem}_f-{face_idx}.jpg"
        cropped_path = os.path.join(cropped_dir, cropped_name)
        cropped_img.save(cropped_path)

        faces.append(face_vector)
        metadata.append({
            "foto_id": cropped_path,  # Link to the cropped face
            "album": {
                "id": album_name,
                "name": album_name,
                "event": {
                    "id": event_id,
                    "name": event_id
                }
            },
            "embedding": face_vector,
            "cluster_id": None,
            "path": img_path,          # Link to the original image
            "facial_area": facial_area,
            "face_confidence": face_confidence
        })


def extract_faces_from_event_folder(event_folder_path, cache_dir="coba_cache", cropped_dir="cropted_faces", min_face_size=27, force_reextract=False, max_face_size=800):
    """
    Extract face embeddings from an event folder, with an on-disk cache.

    If ``event_folder_path`` contains image files directly, only those are
    processed (album "main"); otherwise every sub-directory is treated as an
    album and its images are processed. Each accepted face is cropped with
    30% padding, saved into ``cropped_dir``, and recorded in the metadata.

    The cache (``faces.npy``, ``metadata.pkl``, ``cache_info.pkl``) is reused
    only when the image files under the source folder and the size filters
    are unchanged since it was written.

    Args:
        event_folder_path: Path to folder containing images (single dir or nested albums)
        cache_dir: Directory to store cached face embeddings
        cropped_dir: Directory to store cropped face images
        min_face_size: Minimum face width/height (pixels) to include
        force_reextract: If True, ignore cache and re-extract all faces
        max_face_size: Faces whose width or height reaches this many pixels
            are skipped; ``None`` disables the cap. Boxes that span the whole
            image are always skipped.

    Returns:
        (faces, metadata): ``faces`` is a numpy array with one embedding per
        row, ``metadata`` a list with one dict per face (keys: foto_id,
        album, embedding, cluster_id, path, facial_area, face_confidence).
        If the source folder is missing or holds neither images nor
        sub-directories, ``(np.array([]), [])`` is returned and nothing is
        cached.
    """
    os.makedirs(cache_dir, exist_ok=True)
    os.makedirs(cropped_dir, exist_ok=True)

    faces_path = os.path.join(cache_dir, "faces.npy")
    metadata_path = os.path.join(cache_dir, "metadata.pkl")
    cache_info_path = os.path.join(cache_dir, "cache_info.pkl")

    # Check if we should use cached data
    if not force_reextract and os.path.exists(faces_path) and os.path.exists(metadata_path):
        cached = _load_cached_faces(
            event_folder_path, faces_path, metadata_path, cache_info_path,
            min_face_size, max_face_size,
        )
        if cached is not None:
            return cached

    faces = []
    metadata = []

    # normpath drops a trailing separator, which would make basename return ""
    event_id = os.path.basename(os.path.normpath(event_folder_path))

    print(f"Processing event folder: {event_folder_path}")

    if not os.path.exists(event_folder_path):
        print(f"❌ Error: Source folder '{event_folder_path}' does not exist!")
        return np.array([]), []

    items_in_folder = os.listdir(event_folder_path)
    print(f"📂 Found {len(items_in_folder)} items in folder")

    # Debug: show what's in the folder
    image_files = [f for f in items_in_folder if f.lower().endswith(_IMAGE_EXTENSIONS)]
    subdirs = [f for f in items_in_folder if os.path.isdir(os.path.join(event_folder_path, f))]

    print(f"   📸 Image files: {len(image_files)}")
    print(f"   📁 Subdirectories: {len(subdirs)}")

    if len(image_files) > 0:
        print(f"   📝 Sample images: {image_files[:3]}{'...' if len(image_files) > 3 else ''}")
    if len(subdirs) > 0:
        print(f"   📝 Sample subdirs: {subdirs[:3]}{'...' if len(subdirs) > 3 else ''}")

    # Build one job list (img_path, img_name, album_name, crop_prefix) so both
    # folder layouts share the same per-image pipeline.
    if image_files:
        # Images directly in the folder take precedence; sub-directories are
        # not descended into in this mode.
        print("📁 Using single directory processing mode")
        progress_desc = "Processing Images"
        jobs = [
            (os.path.join(event_folder_path, img_name), img_name, "main", "")
            for img_name in image_files
        ]
    elif subdirs:
        print("📁 Using nested album processing mode")
        progress_desc = f"Processing Event {event_id}"
        jobs = []
        for album_name in subdirs:
            album_path = os.path.join(event_folder_path, album_name)
            for img_name in os.listdir(album_path):
                if img_name.lower().endswith(_IMAGE_EXTENSIONS):
                    jobs.append(
                        (os.path.join(album_path, img_name), img_name, album_name, f"{album_name}_")
                    )
    else:
        print("❌ No images or subdirectories found in the source folder!")
        print(f"   Please check if '{event_folder_path}' contains image files")
        print(f"   Supported formats: .jpg, .jpeg, .png, .bmp")
        return np.array([]), []

    with tqdm(total=len(jobs), desc=progress_desc) as pbar:
        for img_path, img_name, album_name, crop_prefix in jobs:
            try:
                _collect_faces_from_image(
                    img_path, img_name, album_name, event_id, cropped_dir,
                    crop_prefix, min_face_size, max_face_size, faces, metadata,
                )
            except Exception as e:
                # One unreadable image must not abort the whole event
                print(f"Error processing {img_path}: {e}")
            pbar.update(1)

    # Built outside the try below so the return value exists even if saving fails
    faces_array = np.array(faces)

    # Save extracted faces to cache with validation info
    print(f"\n💾 Saving face data to cache...")
    try:
        # Save faces and metadata
        np.save(faces_path, faces_array)
        with open(metadata_path, "wb") as f:
            pickle.dump(metadata, f)

        # Save cache validation info
        cache_info = {
            'created_time': time.strftime('%Y-%m-%d %H:%M:%S'),
            'num_faces': len(faces),
            'num_metadata': len(metadata),
            'folder_stats': get_folder_stats(event_folder_path),
            'extraction_params': {
                'min_face_size': min_face_size,
                'max_face_size': max_face_size,
                'model_name': _FACE_MODEL_NAME,
                'detector_backend': _FACE_DETECTOR_BACKEND
            }
        }

        with open(cache_info_path, "wb") as f:
            pickle.dump(cache_info, f)

        print(f"✅ Cached {len(faces)} face embeddings")
        print(f"✅ Cached {len(metadata)} metadata entries") 
        print(f"✅ Cache validation info saved")
        print(f"📁 Cache location: {cache_dir}")

    except Exception as e:
        print(f"⚠️  Error saving cache: {e}")
        print("Face extraction completed but cache may be incomplete")

    return faces_array, metadata

In [None]:
# --- Configuration for this run -------------------------------------------
FORCE_REEXTRACT = False  # Set to True to ignore cache and re-extract all faces
SOURCE_FOLDER = "Hi_concer"  # Folder containing images to process
CACHE_DIR = "coba_cache"  # Cache directory

print("🚀 FACE CLUSTERING WITH INTELLIGENT CACHING")
print("=" * 50)

# Show current cache status
show_cache_status(CACHE_DIR)

# Announce a forced re-extraction (useful for testing different parameters)
if FORCE_REEXTRACT:
    print("⚠️  FORCE_REEXTRACT=True: Will ignore cache and re-extract all faces")

print(f"\n🔄 FACE EXTRACTION/LOADING:")
print("-" * 30)

# Extract or load faces using the improved caching system.
# `faces` holds one embedding per row; `foto_data` holds the per-face metadata.
faces, foto_data = extract_faces_from_event_folder(
    event_folder_path=SOURCE_FOLDER,
    cache_dir=CACHE_DIR,
    force_reextract=FORCE_REEXTRACT
)

print(f"\n📊 FACE DATA SUMMARY:")
print(f"   • Total faces: {len(faces)}")
if len(faces) > 0:
    print(f"   • Face dimensions: {faces.shape[1]}D")
print(f"   • Metadata entries: {len(foto_data)}")

# Check if we have faces to cluster
if len(faces) == 0:
    print("❌ No faces found to cluster! Check your source folder and try re-extracting.")
    print("💡 Set FORCE_REEXTRACT=True to re-process images")
    # Raise instead of calling exit(): in a Jupyter kernel exit() requests a
    # kernel shutdown (losing all state), while SystemExit only stops this
    # cell and halts a "Run All".
    raise SystemExit("No faces found to cluster")

print(f"   • Ready for clustering! 🎯")


🚀 FACE CLUSTERING WITH INTELLIGENT CACHING
🔍 CACHE STATUS REPORT
❌ Cache directory does not exist

🔄 FACE EXTRACTION/LOADING:
------------------------------
Processing event folder: Hi_concer
📂 Found 232 items in folder
   📸 Image files: 232
   📁 Subdirectories: 0
   📝 Sample images: ['Hiconcer_1.jpg', 'Hiconcer_10.jpg', 'Hiconcer_100.jpg']...
📁 Using single directory processing mode


Processing Images:   2%|▏         | 4/232 [00:13<12:48,  3.37s/it]

In [None]:
# Spot-check: show the "face_confidence" value stored in the metadata entry at index 2
foto_data[2]["face_confidence"]