In [6]:
#| hide

from collections.abc import Callable

import daft


In [7]:
GLOB_PATH = "../tests/test-data/butterflies_baseline/data/originals/*.jpg"
MIN_BYTES = 100
MIN_ASPECT = 4.0

In [14]:
df_img = daft.from_glob_path(GLOB_PATH).with_column_renamed("path", "img_path")

# drop any files too small to be images
df_img = df_img.filter(df_img["size"] > MIN_BYTES)

# create image name column
df_img = df_img.with_column("image_name",
                   df_img["img_path"].str.split("/").list.get(
                       -1).cast(str))


In [15]:

def split_on_condition(df: daft.DataFrame, condition: Callable[[daft.DataFrame], daft.DataFrame]):
    """Splits a DataFrame into accepted and dropped rows based on a filtering condition.

    Args:
        df (daft.DataFrame): The input DataFrame.
        condition (Callable[[daft.DataFrame], daft.DataFrame]): A function that filters the DataFrame.

    Returns:
        Tuple[daft.DataFrame, daft.DataFrame]: (accepted_df, dropped_df)
    """
    filtered_df = condition(df)
    dropped_df = df.join(filtered_df, on="id", how="anti")  # Ensure dropped_df has the same schema
    return filtered_df, dropped_df

In [16]:

# Define filtering functions
def size_nontrivial(df: daft.DataFrame) -> daft.DataFrame:
    """Keeps images that are at least MIN_BYTES in size on disk."""
    return df.filter(df["size"] > MIN_BYTES)


In [17]:
def img_not_oblong(df: daft.DataFrame) -> daft.DataFrame:
    """Keeps images with an aspect ratio between 1:4 and 4:1 using Daft's `image_decode`."""
    decoded = df.with_column("decoded_img", daft.col("img_path").image.decode())
    checked = decoded.filter(
        (decoded["decoded_img"].image.width() / decoded["decoded_img"].image.height() < 4) &
        (decoded["decoded_img"].image.height() / decoded["decoded_img"].image.width() < 4)
    ).drop("decoded_img")  # Drop transient column
    return checked

In [18]:
img_not_oblong(df_img)

DaftTypeError: ImageDecode can only decode BinaryArrays, got img_path#Utf8

In [None]:

def img_name_distinct(df: daft.DataFrame) -> daft.DataFrame:
    """Keeps images with unique filenames."""
    unique_filenames = df.group_by("filename").agg([("id", "count")]).filter(df["id_count"] == 1).select("filename")
    return df.join(unique_filenames, on="filename", how="inner")  # Keep schema intact


In [None]:

# Define pipeline of conditions
pipeline: list[Callable[[daft.DataFrame], daft.DataFrame]] = [
    size_nontrivial,
    img_not_oblong,
    img_name_distinct
]

In [None]:

# Example Daft DataFrame
df = df_img

# Process the pipeline
for condition in pipeline:
    df, dropped = split_on_condition(df, condition)

    if dropped.num_rows() > 0:
        print(f"Dropped {dropped.num_rows()} rows due to {condition.__name__}:")
        print(dropped.head(1))  # Print first dropped row as an example