**Install dependencies**

In [1]:
# %%capture
# !pip install -U pillow numpy pandas
# !pip install -U cjm_pil_utils cjm_parallel_utils

**Import dependencies**

In [2]:
import random
from pathlib import Path
from functools import partial
import queue

from PIL import Image
import numpy as np

import pandas as pd
pd.set_option('max_colwidth', None)  # Do not truncate the contents of cells in the DataFrame
pd.set_option('display.max_rows', None)  # Display all rows in the DataFrame
pd.set_option('display.max_columns', None)  # Display all columns in the DataFrame

from cjm_pil_utils.core import get_img_files
from cjm_parallel_utils.core import parallel

**Define path to dataset**

In [3]:
# Root directory of the dataset (the 512p variant is kept below for quick switching)
# dataset_path = Path("/mnt/980_1TB_2/Datasets/pexels-110k-512p-min-jpg/")
dataset_path = Path("/mnt/980_1TB_2/Datasets/pexels-110k-768p-min-jpg/")

In [4]:
# The images live in the "images" subfolder of the dataset root
img_dir = dataset_path / "images"

In [5]:
# Collect the paths of the image files in the images directory
# (get_img_files is imported from cjm_pil_utils.core)
img_paths = get_img_files(img_dir)

# Report how many image paths were found
print(len(img_paths))

109971


In [6]:
# Build an {image ID: image path} lookup. The ID is the integer that follows
# the last '-' in the file stem, e.g. "1-art-background-1061142" -> 1061142.
img_path_dict = {
    int(img_path.stem.rsplit('-', 1)[-1]): img_path
    for img_path in img_paths
}

# Preview the first few entries as a Pandas DataFrame
pd.DataFrame.from_dict(img_path_dict, orient='index').head()

Unnamed: 0,0
1061142,/mnt/980_1TB_2/Datasets/pexels-110k-768p-min-jpg/images/1-art-background-1061142.jpg
2586334,/mnt/980_1TB_2/Datasets/pexels-110k-768p-min-jpg/images/1-person-beauty-face-blurred-background-2586334.jpg
1460032,/mnt/980_1TB_2/Datasets/pexels-110k-768p-min-jpg/images/1-wtc-artist-concert-1460032.jpg
1061133,/mnt/980_1TB_2/Datasets/pexels-110k-768p-min-jpg/images/10-art-artistic-1061133.jpg
1339845,/mnt/980_1TB_2/Datasets/pexels-110k-768p-min-jpg/images/10-art-color-1339845.jpg


In [7]:
# All image IDs, in the dictionary's insertion order
img_ids = list(img_path_dict)

**Define function to apply to an array of image IDs**

In [8]:
def array_func(img_id, img_path_dict, img_id_queue, max_size):
    """
    Add the image ID to the queue if the image's largest dimension is greater
    than or equal to the given size threshold.

    The image file is opened only to read its dimensions and is closed again
    before the function returns, so checking many images does not accumulate
    open file handles.

    Parameters:
        img_id (int): ID of the image to be checked.
        img_path_dict (dict): A dictionary that maps image IDs to their file paths.
        img_id_queue (queue): A queue that receives (via `put`) the IDs of images
            that meet the size threshold.
        max_size (int): Threshold for the larger of the image's width and height;
            images at or above this value are queued.

    Returns:
        None. The only effect is the ID optionally being added to `img_id_queue`.
    """
    # Use a context manager so the underlying file is closed deterministically
    # instead of waiting for garbage collection.
    with Image.open(img_path_dict[img_id]) as img:
        largest_dim = max(img.size)

    # Queue the ID only when the largest dimension reaches the threshold
    if largest_dim >= max_size:
        img_id_queue.put(img_id)

In [9]:
# Pick one image ID at random to try the function on
# (no random seed is set in this notebook, so the choice differs between runs)
img_id = random.choice(img_ids)
img_id

2104072

In [10]:
# Queue that collects the IDs of images found to be at or above the size threshold
img_id_queue = queue.Queue()

In [11]:
# Size threshold: images whose largest dimension is >= this value are flagged
max_size = 1500

**Test array function**

In [12]:
# Use the `array_func` function to check the randomly chosen image;
# its ID is added to the queue only if the image meets the size threshold
array_func(img_id, img_path_dict, img_id_queue, max_size)

# Show the queue's current contents
img_id_queue.queue

deque([])

**Find large images in parallel**

In [13]:
# Bind the shared arguments so the resulting function only needs an image ID
partial_func = partial(array_func, img_path_dict=img_path_dict, img_id_queue=img_id_queue, max_size=max_size)

# Use the `parallel` function to apply the check to every image ID
# NOTE(review): if `parallel` runs `func` in worker processes rather than threads,
# a `queue.Queue` is not shared with the workers and would stay empty here —
# confirm against the cjm_parallel_utils documentation.
parallel(func=partial_func, arr=img_ids)

  0%|          | 0/109971 [00:00<?, ?it/s]

In [14]:
# Number of image IDs that were flagged as large
img_id_queue.qsize()

0

In [15]:
# `stop` is not defined anywhere in this notebook, so this cell raises NameError.
# NOTE(review): presumably a deliberate guard that halts "Run All" before the
# cells below, which write a file and delete images — confirm the intent.
stop

NameError: name 'stop' is not defined

**Save list of large images**

In [None]:
# Write the flagged image IDs to a text file, one ID per line.
# `write` is used for the single string; `writelines` expects an iterable of
# lines and would iterate the string character by character.
with open("large_img_ids.txt", "w") as write_file:
    for img_id in list(img_id_queue.queue):
        write_file.write(f"{img_id}\n")

**Delete large images**

In [None]:
# Delete the file of every flagged image.
# A file that is already gone is skipped, so re-running this cell after a
# previous (or partially completed) run does not fail with FileNotFoundError.
for img_id in list(img_id_queue.queue):
    try:
        img_path_dict[img_id].unlink()
    except FileNotFoundError:
        # Already removed earlier; nothing left to delete for this ID
        pass

**Set file path for attributes DataFrame**

In [None]:
# Location of the attributes DataFrame JSON file inside the dataset root
attributes_df_json_file = dataset_path / 'attributes_df.json'

# Show the resulting path
print(attributes_df_json_file)

**Load attributes DataFrame**

In [None]:
# Load the attributes DataFrame from its JSON file
attributes_df = pd.read_json(attributes_df_json_file)

# Report how many entries were loaded
print(f"Number of entries: {len(attributes_df)}")

# Preview the first few rows, transposed so each entry is shown as a column
attributes_df.head().T

**Remove attributes for large images from DataFrame**

In [None]:
# Drop the rows whose index labels are the IDs of the flagged images
# (assumes the DataFrame is indexed by image ID — TODO confirm)
attributes_df = attributes_df.drop(index=list(img_id_queue.queue))

# Number of entries remaining
len(attributes_df)

In [None]:
# Save the filtered DataFrame back to the same JSON file it was loaded from,
# overwriting the original contents
attributes_df.to_json(attributes_df_json_file)