In [1]:
#| hide

from collections.abc import Callable

import daft
import numpy as np

In [2]:
# Glob pattern (relative path) selecting the JPEG files to load into the DataFrame.
GLOB_PATH: str = "../../tests/test-data/butterflies_baseline/data/originals/*.jpg"
# Byte threshold used by size_nontrivial: rows are kept only when "size" is strictly greater.
MIN_BYTES: int = 300
# NOTE(review): not referenced by any visible cell; array_not_oblong hard-codes the same
# 4.0 as its `max_oblongness` default -- confirm whether this constant should be passed in.
MIN_ASPECT: float = 4.0

In [3]:
# Build the image table in one fluent chain:
#   1. list the files matching GLOB_PATH and rename "path" -> "img_path",
#   2. derive "img_name" as the last "/"-separated component of the path,
#   3. download and decode each file as RGB (failures become nulls instead of raising),
#   4. collect() so the decoded images are materialized once and reused by later cells.
df_img = (
    daft.from_glob_path(GLOB_PATH)
    .with_column_renamed("path", "img_path")
    .with_column(
        "img_name",
        daft.col("img_path").str.split("/").list.get(-1).cast(str),
    )
    .with_column(
        "img",
        daft.col("img_path")
        .url.download(on_error="null")
        .image.decode(on_error="null", mode="RGB"),
    )
    .collect()
)

  from .autonotebook import tqdm as notebook_tqdm


                                                      d

In [4]:
# Sanity check: number of rows (one per matched image file) in df_img.
df_img.count_rows()

12

In [5]:

def split_on_condition(df: daft.DataFrame, condition: Callable[[daft.DataFrame], daft.DataFrame]):
    """Splits a DataFrame into accepted and dropped rows based on a filtering condition.

    Args:
        df (daft.DataFrame): The input DataFrame.
        condition (Callable[[daft.DataFrame], daft.DataFrame]): A function that
            returns the subset of ``df`` that passes the check.

    Returns:
        Tuple[daft.DataFrame, Optional[daft.DataFrame]]: ``(accepted_df, dropped_df)``.
        When the condition removes no rows, the original ``df`` is returned
        unchanged and ``dropped_df`` is ``None``.
    """
    filtered_df = condition(df)

    # Nothing was removed: hand back the input untouched and signal "no drops".
    if filtered_df.count_rows() >= df.count_rows():
        return df, None

    # Dropped rows are those present in the input but absent from the filtered
    # result, i.e. df minus filtered_df. (The reverse difference is always empty
    # because filtered_df is a subset of df.)
    dropped_df = df.except_distinct(filtered_df)
    return filtered_df, dropped_df

In [6]:
# Define filtering functions
def size_nontrivial(df: daft.DataFrame) -> daft.DataFrame:
    """Keeps rows whose ``size`` value is strictly greater than ``MIN_BYTES``.

    Args:
        df (daft.DataFrame): Input frame with a ``size`` column.

    Returns:
        daft.DataFrame: The rows of ``df`` that pass the size threshold.
    """
    exceeds_minimum = df["size"] > MIN_BYTES
    return df.filter(exceeds_minimum)


In [7]:
@daft.udf(return_dtype=daft.DataType.bool())
def array_not_oblong(arrs: daft.Series, max_oblongness: float = 4.0) -> np.ndarray:
    """Flags, per image, whether its aspect ratio is below ``max_oblongness``.

    Args:
        arrs (daft.Series): Batch of decoded images; each non-null element must
            expose a ``.shape`` whose first two entries are height and width.
        max_oblongness (float): Exclusive upper bound on ``max(h/w, w/h)``.

    Returns:
        np.ndarray: Boolean array with one entry per input element. ``True``
        means the image is not oblong. Null images (e.g. failed decodes) are
        reported as ``False`` because their shape cannot be verified.
    """
    images = arrs.to_pylist()
    not_oblong = np.zeros(len(images), dtype=bool)

    # Only elements that actually hold an image can be measured.
    valid_idx = [i for i, img in enumerate(images) if img is not None]
    if not valid_idx:
        # Empty batch or all-null batch: nothing to measure, and the 2-D
        # indexing below would fail on an empty shapes array.
        return not_oblong

    # (n_valid, 2) array of (height, width); float so the division is explicit.
    shapes = np.array([images[i].shape[:2] for i in valid_idx], dtype=float)
    # Dividing by the column-reversed array yields (h/w, w/h); take the larger.
    max_aspects = np.max(shapes / shapes[:, ::-1], axis=1)
    not_oblong[valid_idx] = max_aspects < max_oblongness
    return not_oblong

In [8]:
def img_not_oblong(df: daft.DataFrame) -> daft.DataFrame:
    """Keeps rows whose decoded image passes the ``array_not_oblong`` check.

    With the UDF's default threshold this keeps images whose aspect ratio is
    strictly between 1:4 and 4:1.

    Args:
        df (daft.DataFrame): Input frame with an ``img`` column of decoded images.

    Returns:
        daft.DataFrame: The passing rows, with the same columns as ``df``.
    """
    # Evaluate the check on the frame that was passed in, not on the notebook
    # global, so the function works on any DataFrame and at any pipeline stage.
    flagged = df.with_column("is_not_oblong", array_not_oblong(df["img"]))
    # Filter on the transient flag, then drop it so the schema is unchanged.
    return flagged.filter(flagged["is_not_oblong"]).exclude("is_not_oblong")

In [9]:
def img_name_distinct(df: daft.DataFrame, name_col: str="img_name") -> daft.DataFrame:
    """Keeps one row per distinct value of ``name_col``.

    Rows sharing a name are collapsed into one; every other column takes an
    arbitrary value from the group (``any_value``).

    Args:
        df (daft.DataFrame): Input frame containing ``name_col``.
        name_col (str): Column whose values must be unique in the result.

    Returns:
        daft.DataFrame: De-duplicated frame with the same columns, in the same
        order, as ``df``.
    """
    # Derive the columns from the frame that was passed in (not a notebook
    # global) and iterate in schema order so the result is deterministic.
    other_cols = [c for c in df.column_names if c != name_col]
    aggs = [daft.col(c).any_value() for c in other_cols]
    deduped = df.groupby(name_col).agg(*aggs)
    # groupby/agg puts the key column first; restore the input column order.
    return deduped.select(*df.column_names)

In [10]:

# Define pipeline of conditions.
# Each entry takes a DataFrame and returns the rows that pass its check;
# the entries are listed in the order they are meant to be applied.
pipeline: list[Callable[[daft.DataFrame], daft.DataFrame]] = [
    size_nontrivial,
    img_not_oblong,
    img_name_distinct
]

In [None]:
# Process the pipeline: apply each check in turn, carrying the accepted rows
# forward in df_img and reporting what each check removed.
for check in pipeline:
    print(f"Checking {check.__qualname__}")
    df_img, dropped = split_on_condition(df_img, check)

    # split_on_condition returns None when nothing was dropped; compare against
    # None explicitly rather than relying on the DataFrame's truthiness.
    if dropped is not None:
        print(f"{dropped.count_rows()} images failed check {check.__name__} and will be dropped.")
        # show() executes the query and renders the row; printing head(1)
        # would only print the lazy, unmaterialized frame.
        dropped.show(1)  # Display first dropped row as an example

NameError: name 'condition' is not defined