In [0]:
import os
import base64
from datetime import datetime
from PIL import Image
from io import BytesIO
from pyspark.sql import Row

# Root directory of the volume holding the crop images. It is the starting
# point of the JPEG scan and the prefix removed from each file's directory to
# produce the `folder` value.
volume_root = "/Volumes/autobricks/agriculture/crop_images"

# Recursively list all JPEG files in all folders
def list_jpeg_files(root_path):
    """Return the paths of all JPEG files found under ``root_path``.

    The directory tree is walked recursively with ``os.walk``. A file counts
    as a JPEG when its name ends in ``.jpg`` or ``.jpeg`` (compared
    case-insensitively); file contents are not inspected.

    Args:
        root_path: Directory to search.

    Returns:
        A lexicographically sorted list of paths, each formed by joining the
        directory reported by ``os.walk`` with the file name. The list is
        empty when nothing matches. ``os.walk`` ignores directory-listing
        errors by default, so a missing or unreadable ``root_path`` also
        yields an empty list rather than raising.
    """
    jpeg_suffixes = (".jpg", ".jpeg")
    matches = [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(root_path)
        for filename in filenames
        if filename.lower().endswith(jpeg_suffixes)
    ]
    # os.walk yields entries in arbitrary, filesystem-dependent order; sorting
    # keeps the result (and everything built from it) reproducible across runs.
    return sorted(matches)

# Scan the volume root once and report how many JPEG paths were found.
jpeg_files = list_jpeg_files(volume_root)
print(f"Found {len(jpeg_files)} JPEG images.")

# Collect metadata and base64-encoded image for each file
def get_image_metadata(file_path):
    """Build a Spark ``Row`` describing one image file.

    Args:
        file_path: Path of the image file to read.

    Returns:
        A ``Row`` with the fields ``file_path``, ``file_name``, ``folder``,
        ``size_bytes``, ``created_at`` and ``image_base64``, or ``None`` if
        the file could not be processed. ``folder`` is the file's directory
        with the module-level ``volume_root`` prefix and any leading ``/``
        removed (``""`` for files directly in the root). ``image_base64`` is
        the full file content, base64-encoded as text.

    Any exception is caught, printed and turned into ``None`` so that one bad
    file does not abort the processing of the remaining files.
    """
    try:
        # Get file stats
        stat = os.stat(file_path)
        file_name = os.path.basename(file_path)

        # Strip the root only when it is the leading prefix. A plain
        # str.replace would also delete the root string if it happened to
        # occur again deeper in the path, corrupting the folder value.
        folder = os.path.dirname(file_path)
        if folder.startswith(volume_root):
            folder = folder[len(volume_root):]
        folder = folder.lstrip("/")

        size_bytes = stat.st_size
        # NOTE: st_ctime is a creation time only on Windows; on Unix it is the
        # time of the last metadata change, so `created_at` is an
        # approximation there. Interpreted in the local timezone (naive).
        created_at = datetime.fromtimestamp(stat.st_ctime)

        # Read the raw bytes, then encode after the file handle is closed.
        with open(file_path, "rb") as f:
            img_bytes = f.read()
        img_b64 = base64.b64encode(img_bytes).decode("ascii")

        return Row(
            file_path=file_path,
            file_name=file_name,
            folder=folder,
            size_bytes=size_bytes,
            created_at=created_at,
            image_base64=img_b64
        )
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Build one Row per image in a single pass. get_image_metadata returns None
# for files it could not process (it prints the error itself), so those
# entries are dropped here.
rows = [row for row in map(get_image_metadata, jpeg_files) if row is not None]

# spark.createDataFrame cannot infer a schema from an empty list; fail with an
# actionable message instead of an opaque schema-inference error.
if not rows:
    raise ValueError(
        f"No JPEG images could be loaded from {volume_root}; "
        "cannot build the images DataFrame."
    )

# Create Spark DataFrame
images_df = spark.createDataFrame(rows)
display(images_df)

In [0]:
# Persist the image directory as a Delta table in Unity Catalog
# (autobricks.agriculture.crop_images_directory). "overwrite" mode replaces
# whatever the table previously contained.
(
    images_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("autobricks.agriculture.crop_images_directory")
)
print("✅ Table saved to autobricks.agriculture.crop_images_directory in Unity Catalog.")