# Get N random samples from the dataset

In [None]:
import os
import random
import shutil

# Define paths
images_path = 'dataset/train/images'
annotations_path = 'dataset/train/labels'
sampled_images_path = 'sampled_data/images'
sampled_annotations_path = 'sampled_data/labels'

# Number of images to draw from the dataset
sample_size = 100


def sample_dataset(src_images, src_labels, dst_images, dst_labels, n):
    """Copy up to ``n`` randomly chosen ``.jpg`` images and their label files.

    For every chosen image ``<stem>.jpg`` the label file ``<stem>.txt`` is
    copied from ``src_labels`` to ``dst_labels`` when it exists.

    Args:
        src_images: Directory holding the source images.
        src_labels: Directory holding the source ``.txt`` label files.
        dst_images: Directory the chosen images are copied into (created
            if missing).
        dst_labels: Directory the matching label files are copied into
            (created if missing).
        n: Maximum number of images to sample.

    Returns:
        A ``(chosen, missing)`` tuple: the list of sampled image file names
        and the sub-list of those that had no label file.
    """
    os.makedirs(dst_images, exist_ok=True)
    os.makedirs(dst_labels, exist_ok=True)

    candidates = [
        name for name in os.listdir(src_images)
        if name.lower().endswith('.jpg')
    ]

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the number of available images.
    chosen = random.sample(candidates, min(n, len(candidates)))

    missing = []
    for image_name in chosen:
        shutil.copy(
            os.path.join(src_images, image_name),
            os.path.join(dst_images, image_name),
        )

        annotation_name = os.path.splitext(image_name)[0] + '.txt'
        src_annotation_path = os.path.join(src_labels, annotation_name)
        if os.path.exists(src_annotation_path):
            shutil.copy(
                src_annotation_path,
                os.path.join(dst_labels, annotation_name),
            )
        else:
            # Keep track of unlabeled images instead of skipping silently.
            missing.append(image_name)

    return chosen, missing


# __name__ is "__main__" both in a notebook cell and when run as a script.
if __name__ == "__main__":
    sampled_images, unlabeled_images = sample_dataset(
        images_path,
        annotations_path,
        sampled_images_path,
        sampled_annotations_path,
        sample_size,
    )
    copied_annotations = len(sampled_images) - len(unlabeled_images)
    print(f"Sampled {len(sampled_images)} images and {copied_annotations} annotations.")
    if unlabeled_images:
        print(f"Warning: {len(unlabeled_images)} sampled images have no annotation file.")


# Crop faces from dataset images

In [None]:
import cv2
import os

# Path to the folder containing images, and where the face crops are written
image_folder = 'big_data/images/train'
output_folder = 'data_cropped_faces/p'

# Haar Cascade definition file used for face detection
cascade_path = 'haarcascade_frontalface_default.xml'

# File extensions treated as images (compared case-insensitively)
image_extensions = ('.png', '.jpg', '.jpeg')  # Add more extensions if needed


def crop_faces(image_folder, output_folder, face_cascade, start_index=0):
    """Detect faces in every image of a folder and save each face as a crop.

    Crops are written to ``output_folder`` as ``face_<index>.jpg`` with a
    running index that starts at ``start_index``.

    Args:
        image_folder: Directory containing the input images.
        output_folder: Directory the crops are written to (created if
            missing).
        face_cascade: A loaded ``cv2.CascadeClassifier``.
        start_index: Index used for the first saved crop.

    Returns:
        The next unused index, i.e. ``start_index`` plus the number of crops
        written.
    """
    os.makedirs(output_folder, exist_ok=True)
    index = start_index

    # Sort so that the face numbering is reproducible between runs.
    for filename in sorted(os.listdir(image_folder)):
        if not filename.lower().endswith(image_extensions):
            continue

        image_path = os.path.join(image_folder, filename)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable/corrupt file by returning None.
            print(f"Warning: could not read {image_path}, skipping.")
            continue

        # The cascade works on grayscale input
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(
            gray, scaleFactor=1.1, minNeighbors=5, minSize=(20, 20)
        )

        for (x, y, w, h) in faces:
            # Crop from the untouched image: drawing detection rectangles
            # first would bake the border pixels into the saved crops. To
            # visualise detections, draw on image.copy() instead.
            face = image[y:y + h, x:x + w]
            output_path = os.path.join(output_folder, f'face_{index}.jpg')
            if cv2.imwrite(output_path, face):
                index += 1
            else:
                print(f"Warning: could not write {output_path}.")

    return index


# __name__ is "__main__" both in a notebook cell and when run as a script.
if __name__ == "__main__":
    # Load the Haar Cascade
    face_cascade = cv2.CascadeClassifier(cascade_path)
    if face_cascade.empty():
        # A missing or invalid XML file yields an empty classifier.
        raise OSError(f"Could not load Haar cascade from '{cascade_path}'")

    i = crop_faces(image_folder, output_folder, face_cascade)

    print(f"Saved {i} face crops.")
    print("Processing complete. Check the output folder for results.")


# Get negative images

In [None]:
import os
import random
import shutil

# Paths to your full dataset and target folder for negatives
full_images_path = 'haarcascade-negatives/images'
negatives_path = 'data/n'

# Number of negative images to select
negatives_count = 425


def copy_random_negatives(src_dir, dst_dir, count, list_file=None):
    """Copy up to ``count`` randomly chosen ``.jpg`` images into ``dst_dir``.

    Args:
        src_dir: Directory holding the candidate negative images.
        dst_dir: Directory the chosen images are copied into (created if
            missing).
        count: Maximum number of images to select.
        list_file: Optional path of a text file to write, containing one
            copied image path per line. Nothing is written when ``None``.

    Returns:
        The list of selected image file names.
    """
    os.makedirs(dst_dir, exist_ok=True)

    candidates = [
        name for name in os.listdir(src_dir)
        if name.lower().endswith('.jpg')
    ]

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the number of available images.
    selected = random.sample(candidates, min(count, len(candidates)))

    for image_name in selected:
        shutil.copy(
            os.path.join(src_dir, image_name),
            os.path.join(dst_dir, image_name),
        )

    if list_file is not None:
        with open(list_file, 'w') as f:
            f.writelines(
                os.path.join(dst_dir, image_name) + '\n'
                for image_name in selected
            )

    return selected


# __name__ is "__main__" both in a notebook cell and when run as a script.
if __name__ == "__main__":
    # Pass list_file='negatives.txt' to also write the list of negatives.
    negative_images = copy_random_negatives(
        full_images_path, negatives_path, negatives_count
    )
    print(f"Selected {len(negative_images)} negative images.")
