# Dataset Pipeline

## Installation

In [1]:
%pip install kagglehub numpy==2.0.0 rembg pymatting opencv-python ipython pillow


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## Dataset Downloads

In [2]:
import kagglehub # we are using an api to import kagglehub's dataset

paths = []
paths.append(kagglehub.dataset_download("jafarhussain786/human-emotionshappy-faces"))
paths.append(kagglehub.dataset_download("jafarhussain786/human-emotionssad-faces"))
paths.append(kagglehub.dataset_download("jafarhussain786/human-emotionsangry-faces"))
paths.append(kagglehub.dataset_download("jafarhussain786/human-emotionsfear-faces"))
paths.append(kagglehub.dataset_download("jafarhussain786/human-emotionssuprise-faces"))

emotions = ["happy", "sad", "angry", "fear", "surprise"]
paths


  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/jafarhussain786/human-emotionssad-faces?dataset_version_number=1...


100%|██████████| 14.2M/14.2M [00:00<00:00, 20.2MB/s]

Extracting files...





Downloading from https://www.kaggle.com/api/v1/datasets/download/jafarhussain786/human-emotionsfear-faces?dataset_version_number=1...


100%|██████████| 86.6M/86.6M [00:04<00:00, 20.7MB/s]

Extracting files...





Downloading from https://www.kaggle.com/api/v1/datasets/download/jafarhussain786/human-emotionssuprise-faces?dataset_version_number=1...


100%|██████████| 8.91M/8.91M [00:00<00:00, 20.7MB/s]

Extracting files...





['/Users/upe/.cache/kagglehub/datasets/jafarhussain786/human-emotionshappy-faces/versions/1',
 '/Users/upe/.cache/kagglehub/datasets/jafarhussain786/human-emotionssad-faces/versions/1',
 '/Users/upe/.cache/kagglehub/datasets/jafarhussain786/human-emotionsangry-faces/versions/1',
 '/Users/upe/.cache/kagglehub/datasets/jafarhussain786/human-emotionsfear-faces/versions/1',
 '/Users/upe/.cache/kagglehub/datasets/jafarhussain786/human-emotionssuprise-faces/versions/1']

## Image Manipulation

Resizing first: Ensures that all subsequent operations work on a consistent image size.
Setting the background to black next: Helps to clean up the image before grayscaling, making it more effective.
Grayscaling last: Keeps it optional, allowing for flexibility based on your needs.

In [3]:
import cv2
import numpy as np
import rembg
from PIL import Image as PILImage

def resize_image(image, target_size=(224, 224)):
    """
    Resize the image to the target size while maintaining aspect ratio.
    Adds padding to ensure the output size is consistent.
    """
    h, w = image.shape[:2]
    scale = min(target_size[0] / h, target_size[1] / w)
    new_w = int(w * scale)
    new_h = int(h * scale)

    resized_image = cv2.resize(image, (new_w, new_h))

    # Add padding to make it the target size
    top = (target_size[0] - new_h) // 2
    left = (target_size[1] - new_w) // 2

    # Create a black canvas and place the resized image on it
    canvas = np.zeros((target_size[0], target_size[1], 3), dtype=np.uint8)
    canvas[top:top+new_h, left:left+new_w] = resized_image

    return canvas

def remove_background(image):
    """
    Remove the background using rembg and return an image with a black background.
    """
    # Convert OpenCV image (BGR) to PIL image (RGB)
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    pil_image = PILImage.fromarray(image_rgb)

    # Use rembg to remove the background
    image_no_bg = rembg.remove(pil_image)

    # Convert back to OpenCV image and replace transparent pixels with black
    image_no_bg_cv = np.array(image_no_bg)
    image_no_bg_cv = cv2.cvtColor(image_no_bg_cv, cv2.COLOR_RGB2BGR)

    # Replace transparent areas with black
    if image_no_bg_cv.shape[2] == 4:  # Check if alpha channel exists
        alpha_channel = image_no_bg_cv[:, :, 3]
        mask = alpha_channel == 0
        image_no_bg_cv = image_no_bg_cv[:, :, :3]  # Remove alpha channel
        image_no_bg_cv[mask] = [0, 0, 0]  # Set background to black

    return image_no_bg_cv

def preprocess_image(image, grayscale=True, target_size=(224, 224)):
    """
    Preprocess the image by resizing, removing the background, and converting to grayscale.
    """
    # Step 1: Resize the image
    image = resize_image(image, target_size)

    # Step 2: Remove the background and replace it with black
    image = remove_background(image)

    # Step 3: Convert to grayscale if specified
    if grayscale:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    return image

# Display and save functions remain the same as before



In [4]:
import os
from IPython.display import display, Image
from PIL import Image as PILImage
import io

image_files = []
for path in paths:
    for f in os.listdir(path):
        if f.endswith(('.png', '.jpg', '.jpeg')):
            image_files.append(os.path.join(path, f))

DISPLAY_COUNT = len(image_files)
image_files = np.array(image_files)

In [5]:
def show_images(path, num_images=1):
    print(path)
    rand_array = np.random.randint(0, len(image_files), num_images)  # random.ra
    for i, image_path in enumerate(image_files[rand_array]):
        print(f"Using image {i+1} for testing: {image_path}")

        image = cv2.imread(image_path)

        original_image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        processed_image = preprocess_image(image)

        processed_image_rgb = cv2.cvtColor(processed_image, cv2.COLOR_BGR2RGB)

        original_img_pil = PILImage.fromarray(original_image_rgb)
        buf_original = io.BytesIO()
        original_img_pil.save(buf_original, format='JPEG')
        buf_original.seek(0)

        processed_img_pil = PILImage.fromarray(processed_image_rgb)
        buf_processed = io.BytesIO()
        processed_img_pil.save(buf_processed, format='JPEG')
        buf_processed.seek(0)

        # print(f"Original Image {i+1}:")
        # display(Image(data=buf_original.getvalue()))

        # print(f"Image {i+1} after ETL:")
        # display(Image(data=buf_processed.getvalue()))

show_images(paths[1], 0)

/Users/upe/.cache/kagglehub/datasets/jafarhussain786/human-emotionssad-faces/versions/1


In [7]:
output_dir = "./output/emotion_dataset"
os.makedirs(output_dir, exist_ok=True)

def save_processed_images(path, num_images=1):
    for i, image_path in enumerate(image_files[:num_images]):
        print(f"Processing image {i+1}: {image_path}")

        image = cv2.imread(image_path)

        processed_image = preprocess_image(image)

        processed_image_rgb = cv2.cvtColor(processed_image, cv2.COLOR_BGR2RGB)

        output_file_path = None
        for emotion in emotions:
            if emotion in image_path:
                image_folder = os.path.join(output_dir, f"{emotion}_{i+1}")
                os.makedirs(image_folder, exist_ok=True)

                output_file_path = os.path.join(image_folder, f"{emotion}_{i+1}_original.jpg")

        PILImage.fromarray(processed_image_rgb).save(output_file_path)

        print(f"Processed image saved to: {output_file_path}")

save_processed_images(paths[0], 10)

Processing image 1: /Users/upe/.cache/kagglehub/datasets/jafarhussain786/human-emotionshappy-faces/versions/1/images26.jpg
Processed image saved to: ./output/emotion_dataset/happy_1/happy_1_original.jpg
Processing image 2: /Users/upe/.cache/kagglehub/datasets/jafarhussain786/human-emotionshappy-faces/versions/1/images32.jpg
Processed image saved to: ./output/emotion_dataset/happy_2/happy_2_original.jpg
Processing image 3: /Users/upe/.cache/kagglehub/datasets/jafarhussain786/human-emotionshappy-faces/versions/1/n-with-happy-face-expression-people-portraits-isolated-in-neutral-bac-W1FJB7.jpg
Processed image saved to: ./output/emotion_dataset/happy_3/happy_3_original.jpg
Processing image 4: /Users/upe/.cache/kagglehub/datasets/jafarhussain786/human-emotionshappy-faces/versions/1/image22.jpeg
Processed image saved to: ./output/emotion_dataset/happy_4/happy_4_original.jpg
Processing image 5: /Users/upe/.cache/kagglehub/datasets/jafarhussain786/human-emotionshappy-faces/versions/1/500_F_2461