# In [None]:
import os
import numpy as np
import shutil
import face_recognition
from sklearn.cluster import DBSCAN
import time
from tqdm import tqdm
import zipfile

def _encode_largest_face(image_path):
    """Return the encoding of the largest detected face in an image, or None.

    Faces are located with ``model="hog"``; the box with the greatest area is
    encoded with ``num_jitters=5`` and ``model="large"``. None is returned
    when no face is detected or no encoding is produced.
    """
    image = face_recognition.load_image_file(image_path)
    face_locations = face_recognition.face_locations(image, model="hog")
    if not face_locations:
        return None

    def box_area(box):
        # Boxes are unpacked as (top, right, bottom, left).
        top, right, bottom, left = box
        return (right - left) * (bottom - top)

    largest_face = max(face_locations, key=box_area)
    face_encodings = face_recognition.face_encodings(
        image,
        [largest_face],
        num_jitters=5,
        model="large",
    )
    return face_encodings[0] if face_encodings else None


def clean_face_dataset(input_folder, output_folder, min_cluster_ratio=0.1):
    """Copy the images of one person that fall into sufficiently large clusters.

    Every ``.jpg``/``.jpeg``/``.png`` file in ``input_folder`` is reduced to
    the encoding of its largest face. The encodings are clustered through
    ``find_best_clustering_config`` and an image is kept when its cluster
    (label other than -1) holds at least ``min_cluster_ratio`` of all encoded
    images. Kept images are copied to ``output_folder/<person name>``.

    Args:
        input_folder: Folder with the images of a single person; its last
            path component is used as the person name.
        output_folder: Root folder that receives the per-person subfolder.
        min_cluster_ratio: Minimum share of encoded images a cluster must
            contain for its images to be kept.

    Returns:
        The number of images copied (0 when no face could be encoded).
    """
    import logging

    logger = logging.getLogger(__name__)

    # normpath drops a trailing separator; without it basename() would be ""
    # for "train/alice/" and images would land directly in output_folder.
    person_name = os.path.basename(os.path.normpath(input_folder))
    person_output_folder = os.path.join(output_folder, person_name)
    os.makedirs(person_output_folder, exist_ok=True)

    # os.listdir order is arbitrary; sorting keeps the clustering input order
    # stable from run to run.
    image_files = sorted(
        f for f in os.listdir(input_folder)
        if f.lower().endswith(('.jpg', '.jpeg', '.png'))
    )

    embeddings = []
    file_names = []
    for filename in image_files:
        image_path = os.path.join(input_folder, filename)
        try:
            encoding = _encode_largest_face(image_path)
        except Exception as exc:
            # Best effort: one unreadable image must not abort the whole
            # folder, but the failure should leave a trace.
            logger.warning("Skipping %s: %s", image_path, exc)
            continue
        if encoding is not None:
            embeddings.append(encoding)
            file_names.append(filename)

    if not embeddings:
        return 0

    best_config = find_best_clustering_config(np.array(embeddings), person_name)
    labels = best_config['labels']

    unique_labels, counts = np.unique(labels, return_counts=True)
    total_images = len(labels)

    # Label -1 is never kept; the ratio is taken over all encoded images,
    # including those labelled -1.
    valid_clusters = {
        label
        for label, count in zip(unique_labels, counts)
        if label != -1 and count / total_images >= min_cluster_ratio
    }

    valid_count = 0
    for filename, label in zip(file_names, labels):
        if label in valid_clusters:
            shutil.copy(
                os.path.join(input_folder, filename),
                os.path.join(person_output_folder, filename),
            )
            valid_count += 1

    return valid_count

def find_best_clustering_config(embeddings_array, person_name):
    """Grid-search DBSCAN parameters and return the best-scoring run.

    Each (eps, min_samples) pair is scored as the number of clusters minus
    the fraction of points labelled -1. The first pair reaching the highest
    score wins. ``person_name`` is accepted but not used.

    Returns:
        A dict with keys ``'eps'``, ``'min_samples'``, ``'labels'`` (a copy
        of the winning label array) and ``'n_clusters'``.
    """
    winner = {'eps': 0, 'min_samples': 0, 'labels': None, 'n_clusters': 0}
    top_score = float('-inf')

    for eps in (0.4, 0.45, 0.5, 0.55, 0.6):
        for min_samples in (2, 3, 4):
            model = DBSCAN(eps=eps, min_samples=min_samples, metric="euclidean")
            assigned = model.fit(embeddings_array).labels_

            # Count distinct labels, leaving out -1 when it occurs.
            distinct = set(assigned.tolist())
            cluster_total = len(distinct) - (1 if -1 in distinct else 0)

            noise_share = np.sum(assigned == -1) / len(assigned)
            candidate_score = cluster_total - noise_share

            # Strict comparison: on a tie the earlier pair is retained.
            if candidate_score > top_score:
                top_score = candidate_score
                winner = {
                    'eps': eps,
                    'min_samples': min_samples,
                    'labels': assigned.copy(),
                    'n_clusters': cluster_total,
                }

    return winner

def zip_folder(folder_path, output_zip):
    """Write every file under ``folder_path`` into a deflate-compressed zip.

    Archive member names are relative to the parent of ``folder_path``, so
    they all start with the folder's own name.

    Args:
        folder_path: Folder to archive; a trailing path separator is ignored.
        output_zip: Path of the zip file to create (overwritten if present).
    """
    # Without normalization, dirname("data/") is "data" itself and the
    # top-level folder name would be missing from the member names.
    folder_path = os.path.normpath(folder_path)
    archive_root = os.path.dirname(folder_path)
    output_abs = os.path.abspath(output_zip)

    with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                # The archive already exists on disk at this point; if it
                # lives inside folder_path it must not be added to itself.
                if os.path.abspath(file_path) == output_abs:
                    continue
                arcname = os.path.relpath(file_path, archive_root)
                zipf.write(file_path, arcname)

def process_all_folders(base_input_folder, output_folder, min_cluster_ratio=0.1):
    """Clean every person subfolder and zip the combined result.

    Each immediate subdirectory of ``base_input_folder`` is passed to
    ``clean_face_dataset``. Afterwards ``output_folder`` is archived to
    ``<output_folder>.zip`` via ``zip_folder``.

    Args:
        base_input_folder: Folder whose subdirectories each hold one person.
        output_folder: Destination root for the cleaned images; a trailing
            path separator is ignored.
        min_cluster_ratio: Forwarded unchanged to ``clean_face_dataset``.

    Returns:
        The total number of images copied across all person folders.
    """
    # Drop any trailing separator: "out/" + ".zip" would otherwise name a
    # hidden "out/.zip" inside the very folder being archived.
    output_folder = os.path.normpath(output_folder)
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so the processing order does not depend on os.listdir.
    person_folders = sorted(
        d for d in os.listdir(base_input_folder)
        if os.path.isdir(os.path.join(base_input_folder, d))
    )

    total_valid_images = 0
    for person_folder in tqdm(person_folders, desc="Processing", ncols=100):
        total_valid_images += clean_face_dataset(
            input_folder=os.path.join(base_input_folder, person_folder),
            output_folder=output_folder,
            min_cluster_ratio=min_cluster_ratio,
        )

    zip_folder(output_folder, output_folder + ".zip")

    return total_valid_images


# Source and destination locations under /kaggle.
base_input_folder = "/kaggle/input/train-set/train"
output_folder = "/kaggle/working/faces_image_cleaned"

# Passed through to process_all_folders as the cluster-size threshold.
min_cluster_ratio = 0.1

# Run the whole pipeline; the return value is the number of images copied.
total_images = process_all_folders(base_input_folder, output_folder, min_cluster_ratio)

# Summary messages (Vietnamese): total images obtained, then the archive path.
print(f"Tổng số ảnh đã lấy được: {total_images}")
print(f"Đã nén thành công: {output_folder}.zip")