In [None]:
import os
import gc
import shutil
import signal
from PIL import Image
import numpy as np
from mtcnn.mtcnn import MTCNN
from tqdm import tqdm
import psutil

# Root directory of the raw images; iter_image_files() enumerates its
# immediate subdirectories in sorted order and walks each one recursively.
input_base = '/kaggle/input/raw-data/images_data'
# Root directory for the cropped faces; each output keeps the input's path
# relative to input_base.
output_base = '/kaggle/working/faces/'
# Number of saved faces between intermediate zip archives (only consulted
# when zip_after_each_batch is True).
batch_size = 1000
# Target (width, height) of the saved face crops; same value as the default
# of extract_face()'s required_size parameter.
required_size = (160, 160)
# Text file with one image path per line, used to skip already-saved images
# when the script is run again.
log_file_path = '/kaggle/working/processed_files.txt'
# When True, the whole output folder is zipped every batch_size saved faces.
zip_after_each_batch = False
# Number of leading (sorted) subdirectories of input_base to skip.
# presumably one subdirectory per person -- confirm against the dataset layout
skip_persons = 500

def extract_face(filename, detector, required_size=(160, 160)):
    """Detect exactly one face in an image file and return it cropped and resized.

    Args:
        filename: Path of the image file, opened with PIL and converted to RGB.
        detector: Object exposing ``detect_faces(pixels)`` that returns a
            sequence of detections, each with a ``'box'`` entry of
            ``(x, y, width, height)``.
        required_size: ``(width, height)`` passed to ``Image.resize``.

    Returns:
        A numpy array holding the resized face crop, or ``None`` when the
        image cannot be processed, when the detector does not report exactly
        one face, or when the box is empty after clipping to the image.

    Raises:
        TimeoutException: Re-raised untouched so the caller's SIGALRM based
            timeout handling can see it instead of it being mistaken for an
            ordinary "no face" result.
    """
    try:
        # convert() returns a new image, so the context manager must wrap the
        # image returned by open() for the source to be closed.
        with Image.open(filename) as source:
            pixels = np.asarray(source.convert('RGB'))

        results = detector.detect_faces(pixels)
        if len(results) != 1:
            return None

        x, y, width, height = results[0]['box']
        # Derive the far corner from the raw origin *before* clamping;
        # otherwise a negative origin would widen the crop by that offset.
        x1, y1 = max(0, x), max(0, y)
        x2 = min(x + width, pixels.shape[1])
        y2 = min(y + height, pixels.shape[0])
        if x2 <= x1 or y2 <= y1:
            # Nothing of the box lies inside the image.
            return None

        face = pixels[y1:y2, x1:x2]
        return np.asarray(Image.fromarray(face).resize(required_size))
    except TimeoutException:
        # TimeoutException (defined at module level in this file) subclasses
        # Exception; without this clause the blanket handler below would
        # swallow the timeout raised by the SIGALRM handler.
        raise
    except Exception:
        # Best effort: any other failure means "no usable face".
        return None

def iter_image_files(input_base, skip_first=0):
    """Yield paths of image files found under the subdirectories of a root.

    The immediate subdirectories of ``input_base`` are sorted by full path,
    the first ``skip_first`` of them are dropped, and each remaining one is
    walked recursively. Only files whose lower-cased name ends in ``.jpg``,
    ``.jpeg`` or ``.png`` are yielded. Files sitting directly in
    ``input_base`` are never visited.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')

    person_dirs = []
    for entry in os.listdir(input_base):
        candidate = os.path.join(input_base, entry)
        if os.path.isdir(candidate):
            person_dirs.append(candidate)
    person_dirs.sort()

    for person_dir in person_dirs[skip_first:]:
        for current_dir, _dirnames, filenames in os.walk(person_dir):
            yield from (
                os.path.join(current_dir, name)
                for name in filenames
                if name.lower().endswith(image_suffixes)
            )

def get_memory_usage_mb():
    """Return the resident set size of the current process in MB (bytes / 1024**2)."""
    bytes_per_mb = 1024 * 1024
    current_process = psutil.Process(os.getpid())
    rss_bytes = current_process.memory_info().rss
    return rss_bytes / bytes_per_mb

def get_folder_size_mb(folder):
    """Return the combined size in MB (bytes / 1024**2) of all files under ``folder``.

    The tree is walked recursively with ``os.walk``; a folder that yields no
    files gives ``0.0``.
    """
    size_bytes = sum(
        os.path.getsize(os.path.join(parent, name))
        for parent, _subdirs, names in os.walk(folder)
        for name in names
    )
    return size_bytes / (1024 * 1024)

def load_processed_log(log_file_path):
    """Read the progress log into a set of whitespace-stripped lines.

    Returns an empty set when ``log_file_path`` does not exist.
    """
    if not os.path.exists(log_file_path):
        return set()
    with open(log_file_path, 'r') as log_handle:
        return {entry.strip() for entry in log_handle}

def append_to_log(filepath, log_path=None):
    """Append one image path to the progress log.

    Args:
        filepath: Path string to record; it is written followed by a newline.
        log_path: Log file to append to. When omitted, the module-level
            ``log_file_path`` is used, so single-argument callers behave as
            before. Accepting the path mirrors ``load_processed_log``.
    """
    target = log_file_path if log_path is None else log_path
    # The file is opened and closed on every call, so each entry is written
    # out before the function returns.
    with open(target, 'a') as log_handle:
        log_handle.write(filepath + '\n')

class TimeoutException(Exception):
    """Raised by ``timeout_handler`` when the SIGALRM alarm goes off."""

def timeout_handler(signum, frame):
    """Signal handler that aborts the interrupted code with TimeoutException.

    Both arguments are supplied by the ``signal`` module and are not used.
    """
    raise TimeoutException()

# Route SIGALRM to timeout_handler so a stuck detection can be interrupted.
signal.signal(signal.SIGALRM, timeout_handler)

detector = MTCNN()
all_image_files = iter_image_files(input_base, skip_first=skip_persons)
processed_files = load_processed_log(log_file_path)

# Create the output root up front: if no face is ever saved it would not
# exist and the final shutil.make_archive call would fail on a missing
# root directory.
os.makedirs(output_base, exist_ok=True)

batch_index = 0
processed_count = 0

for filepath in tqdm(all_image_files, desc="Processing images"):
    # Resume support: skip images already recorded in the log.
    if filepath in processed_files:
        continue

    try:
        # Give each image at most 10 seconds of detection time.
        signal.alarm(10)
        try:
            face_pixels = extract_face(filepath, detector, required_size=required_size)
        finally:
            # Always cancel the pending alarm, even when extract_face raises,
            # so it cannot fire later inside unrelated code.
            signal.alarm(0)

        if face_pixels is not None:
            # Mirror the input's path relative to input_base under output_base.
            relative_path = os.path.relpath(filepath, input_base)
            output_path = os.path.join(output_base, relative_path)

            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            with Image.fromarray(face_pixels) as face_image:
                face_image.save(output_path)

            # NOTE: only images whose face was saved are logged, so images
            # without a usable face are examined again on a rerun.
            append_to_log(filepath)
            processed_count += 1

            if processed_count % 50 == 0:
                print(f"Processed: {processed_count} | RAM: {get_memory_usage_mb():.2f} MB | Output size: {get_folder_size_mb(output_base):.2f} MB")

            if zip_after_each_batch and processed_count % batch_size == 0:
                zip_name = f"/kaggle/working/faces_batch_{batch_index}.zip"
                # make_archive appends the '.zip' suffix itself.
                shutil.make_archive(zip_name.replace('.zip', ''), 'zip', output_base)
                print(f"Zipped batch {batch_index} to {zip_name}")
                batch_index += 1

        gc.collect()

    except TimeoutException:
        print(f"Timeout: {filepath}")
        continue
    except Exception as e:
        print(f"Error processing {filepath}: {e}")
        gc.collect()
        continue

zip_filename = '/kaggle/working/faces_final.zip'
shutil.make_archive(zip_filename.replace('.zip', ''), 'zip', output_base)
print(f"Zipped final folder to: {zip_filename}")
print(f"Total images processed: {processed_count}")