<h3 align="center"></h3>

<h1 align="center">Use case: Hugging Face dataset of an Omeka S collection's items</h1>

---

<h2 align="center">Example : <a href="https://huggingface.co/datasets/Geraldine/Humazur-collection-Chine">https://huggingface.co/datasets/Geraldine/Humazur-collection-Chine</a></h2>

# Requirements

In [1]:
import sys
import os
# Add the parent directory to the module search path (presumably so the local
# omeka_s_client module imported below can be found — confirm repo layout).
sys.path.append('..')

In [None]:
import pandas as pd
from PIL import Image
import requests
import io
import json
import math
import logging
from urllib.parse import urlparse, parse_qs
from huggingface_hub import login, whoami
from omeka_s_client import OmekaSClient,OmekaSClientError
from datasets import Dataset

# Configuration

In [None]:
# Base URL of the Omeka S instance to harvest.
OMEKA_URL = "https://humazur.univ-cotedazur.fr"
# Omeka S API key pair handed to the client; replace the placeholders with your own values.
API_KEY_IDENTITY = "..."
API_KEY_CREDENTIAL = "..."

# Hugging Face access token passed to login(); replace the placeholder with your own.
HF_TOKEN = "..."
# Set the root logger to INFO so INFO-level log messages are displayed.
logging.getLogger().setLevel(logging.INFO)

In [4]:
# Build the shared API client for the configured instance, with a default page size of 50.
client = OmekaSClient(OMEKA_URL, API_KEY_IDENTITY, API_KEY_CREDENTIAL, default_per_page=50)
# Default value for the `prefixes` argument of generate_dataset.
# NOTE(review): this reads a private class attribute; presumably it is the client's
# default metadata-parsing configuration — confirm against omeka_s_client.
prefixes = OmekaSClient._DEFAULT_PARSE_METADATA

2025-04-22 11:02:34,795 - INFO - OmekaClient initialized for API: https://humazur.univ-cotedazur.fr/api
2025-04-22 11:02:34,795 - INFO - Using API Key Identity.


In [None]:
# Log in to Hugging Face with your token (it needs write permission to push a dataset)
login(token = HF_TOKEN)
# Show the account the token resolves to, as a quick check that the login worked.
whoami()

# Helpers

In [5]:
def image_url_to_pil(url: str, max_size=(512, 512), timeout=5) -> "Image.Image":
    """
    Download an image and return it as an RGB PIL image that fits in ``max_size``.

    Ex usage : image_blobs = df["image_url"].apply(image_url_to_pil).tolist()

    Args:
        url: Address of the image to download.
        max_size: Maximum ``(width, height)`` of the returned image. The image
            is shrunk to fit while keeping its aspect ratio.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The downloaded image, converted to RGB and resized in place.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    # The whole body is read through ``response.content`` right away, so a
    # streamed response would bring nothing here.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    image = Image.open(io.BytesIO(response.content)).convert("RGB")
    # thumbnail() modifies the image in place and returns None.
    image.thumbnail(max_size, Image.Resampling.LANCZOS)
    return image

def reorder_columns(df,cols_list,position='first'):
    """
    Return ``df`` with the columns named in ``cols_list`` moved to one end.

    Names in ``cols_list`` that are not columns of ``df`` are ignored. The
    moved columns keep the order given in ``cols_list``; all other columns
    keep their current relative order.

    Args:
        df: DataFrame whose columns are to be reordered.
        cols_list: Column names to move.
        position: ``'first'`` to put them at the front, ``'last'`` to put
            them at the end.

    Returns:
        The DataFrame indexed with the new column order.

    Raises:
        ValueError: If ``position`` is neither ``'first'`` nor ``'last'``.
    """
    available = df.columns
    # Requested names that really exist, in the order they were requested.
    moved = [name for name in cols_list if name in available]
    # Everything else, in the order it currently appears.
    kept = [name for name in available if name not in moved]

    if position == "first":
        ordering = moved + kept
    elif position == "last":
        ordering = kept + moved
    else:
        raise ValueError("position must be 'first' or 'last'")
    return df[ordering]

# Harvest & generate dataset

The dataset will contain all image media URLs for each item.

In [6]:
def _collect_image_urls(item_raw):
    """Return the original URLs of the image media referenced by one raw item."""
    image_urls = []
    for media_ref in item_raw.get("o:media") or []:
        media = client.get_media(media_ref["o:id"])
        # Guard against a missing or null media type: the substring test would
        # otherwise raise and abort the whole harvest.
        media_type = media.get("o:media_type") or ""
        original_url = media.get("o:original_url")
        # Skip image media without a usable URL rather than storing None.
        if "image" in media_type and original_url:
            image_urls.append(original_url)
    return image_urls


def generate_dataset(item_set_id=None,prefixes=prefixes,per_page=50):
    """
    Harvest the items of an item set and build a DataFrame of their metadata.

    Every item that has media is parsed with ``OmekaSClient.digest_item_data``;
    items are kept only when parsing succeeds and at least one of their media
    is an image. The image URLs are stored in an ``images_urls`` column.

    Args:
        item_set_id: Identifier forwarded to ``client.list_all_items``.
        prefixes: Parsing configuration forwarded to ``digest_item_data``.
        per_page: Page size forwarded to ``client.list_all_items``.

    Returns:
        A ``pandas.DataFrame`` with one row per retained item, or ``None`` when
        fetching or parsing fails (the error is printed, not raised).
    """
    print("\n--- Fetching and Parsing Multiple Items by colection---")
    try:
        # Fetch every item of the requested item set
        items_list = client.list_all_items(item_set_id=item_set_id, per_page=per_page)
        print(f"Fetched {len(items_list)} items.")

        parsed_items_list = []
        for item_raw in items_list:
            if 'o:media' not in item_raw:
                continue
            parsed = OmekaSClient.digest_item_data(item_raw, prefixes=prefixes)
            if not parsed:  # Only keep items whose parsing was successful
                continue
            image_urls = _collect_image_urls(item_raw)
            if not image_urls:  # Only keep items that have image URLs
                continue
            parsed["images_urls"] = image_urls
            parsed_items_list.append(parsed)

        # Report the total once, after the loop, instead of twice per item.
        print(f"Successfully parsed {len(parsed_items_list)} items.")
        # Note: List columns (like dcterms:title) might need further handling in Pandas
        print("\nDataFrame from parsed items:")
        return pd.DataFrame(parsed_items_list)

    except OmekaSClientError as e:
        print(f"Error fetching/parsing multiple items: {e}")
    except Exception as e:
        print(f"An unexpected error occurred during multi-item parsing: {e}")
    # Best-effort harvest: failures are reported above and signalled with None.
    return None

In [None]:
# Harvest item set 8599 (presumably the collection behind the example dataset — confirm).
df = generate_dataset(item_set_id=8599)
# Preview the first rows of the result.
df.head()

In [None]:
# Add a 1-based sequential row identifier.
df["id"] = range(1, len(df) + 1)
# Move the key columns to the front. reorder_columns ignores names that are not
# columns of df, so a misspelled name is skipped silently.
df = reorder_columns(df,["id","item_id", "Identifier", "images_urls","Title"])

# Push to Hub

In [None]:
# Convert the DataFrame into a Hugging Face `datasets.Dataset`.
dataset = Dataset.from_pandas(df)
# Display the dataset object.
dataset

In [None]:
# Upload the dataset to the Hub (private=False); replace the placeholder with
# your own repository id.
dataset.push_to_hub("<YOUR_HF_REPO>", private=False)