In [1]:
import pandas as pd
from omeka_tools.client import OmekaClient
from omeka_tools.utils import extract_file_urls, get_public_url

import os
from dotenv import load_dotenv

# Load .env variables
load_dotenv()

API_KEY = os.getenv("OMEKA_API_KEY")
API_URL = os.getenv("OMEKA_API_URL")

In [2]:
omeka_client = OmekaClient(base_url=API_URL, api_key=API_KEY)

In [3]:
files = omeka_client.get_files_by_item(item_id=1448)
file_urls = extract_file_urls(files, 'original')

### Omeka items - Load

In [4]:
omeka_items = pd.read_csv('../../data/omeka_data_last.csv', index_col=0)

In [5]:
omeka_items = omeka_items[['id', 'files_count', 'item_type_name', 'element_text_element_set_name', 'element_text_element_name', 'element_text_text']]

In [6]:
omeka_items

Unnamed: 0,id,files_count,item_type_name,element_text_element_set_name,element_text_element_name,element_text_text
0,839,1,Contextual Item,Item Type Metadata,FormerDB-ID,C-11
1,839,1,Contextual Item,Dublin Core,Title,Hut 211
2,839,1,Contextual Item,Dublin Core,Creator,Gedenkstätte Bergen-Belsen
3,839,1,Contextual Item,Item Type Metadata,Translated Title (German),Baracke 211
4,839,1,Contextual Item,Item Type Metadata,Translated Title (English),Hut 211
...,...,...,...,...,...,...
8042,2154,1,Still Image,Item Type Metadata,Additional Caption 1 (English),Translated from the Hungarian
8043,2154,1,Still Image,Item Type Metadata,Additional Caption 1 (German),Übersetzung aus dem Ungarischen
8044,2154,1,Still Image,Item Type Metadata,Additional Caption 1 (Dutch),Vertaald uit het Hongaars
8045,2154,1,Still Image,Item Type Metadata,Earliest Start of Reference Period,1944-12-14


We have the following item types in Omeka: 'Contextual Item', 'Landscape Item', 'Still Image', 'Physical Object', 'Text Item', 'Oral History'

In [7]:
# Count number of unique item IDs per metadata field
field_counts = omeka_items.groupby('element_text_element_name')['id'].nunique().reset_index()
field_counts.columns = ['field_name', 'unique_item_count']
field_counts.sort_values(by='unique_item_count', ascending=False)

Unnamed: 0,field_name,unique_item_count
51,Title,309
7,Creator,291
60,Translated Title (English),290
0,Access Rights,249
50,Source Reference,236
...,...,...
22,Interviewee,1
52,Transcription from depicted text,1
47,Rights,1
42,PositionScreen,1


Now we want to collapse items into one row per id, choosing the appropiate metadata we will embbed.
The idea is to compilate text metadata into one field, while keeping geospatial and temporal metadata separate.

To start we will use english metadata

In [8]:

# Define priority mapping for fields
field_priority = {
    "title": ["Translated Title (English)", "Title"],
    "text": ["Translated Full Text Fragment (English)", "Full Text Fragment (Original Language)"],
    "creator": ["Creator"],
    "latitude": [
        "Viewpoint Latitude", "Origin Place Latitude", "Reference Place Latitude", "Label Latitude"
    ],
    "longitude": [
        "Viewpoint Longitude", "Origin Place Longitude", "Reference Place Longitude", "Label Longitude"
    ],
    "geo_metadata": [
        "Viewpoint Latitude", "Viewpoint Longitude", "Viewpoint Elevation", "Viewpoint Altitude",
        "Origin Place Latitude", "Origin Place Longitude", "Origin Place Altitude",
        "Reference Place Latitude", "Reference Place Longitude", "Reference Place Elevation",
        "Label Latitude", "Label Longitude", "Label Elevation Correction", "Horizon Height Estimate"
    ],
    "time_metadata": [
        "Dates of Creation", "Dates of Reference", "Date Available",
        "Earliest Start of Reference Period", "Latest End of Reference Period"
    ]
}

text_fields = [
    "Translated Full Text Fragment (English)",
    "Main Caption (English)",
    "Additional Caption 1 (English)",
    "Additional Caption 2 (English)",
    "Transcription from depicted text",
    "Display Label (English)",
    "Historical Caption",
]

# Helper function to extract first available field by priority
def extract_field(group, priorities):
    for field in priorities:
        match = group[group['element_text_element_name'] == field]
        if not match.empty:
            return match.iloc[0]['element_text_text']
    return None

# New helper to concatenate text fields in order
def concatenate_fields(group, fields):
    texts = []
    for field in fields:
        matches = group[group['element_text_element_name'] == field]
        texts.extend(matches['element_text_text'].tolist())
    return "\n\n".join(texts) if texts else None

# Group by item ID
records = []

for item_id, group in omeka_items.groupby('id'):
    title = extract_field(group, field_priority["title"])
    text = concatenate_fields(group, field_priority["text"])
    creator = extract_field(group, field_priority["creator"])

    # Build list of lat/lon pairs
    lat_fields = group[group['element_text_element_name'].isin(field_priority["latitude"])]
    lon_fields = group[group['element_text_element_name'].isin(field_priority["longitude"])]

    latitudes = lat_fields['element_text_text'].astype(float).tolist()
    longitudes = lon_fields['element_text_text'].astype(float).tolist()
    locations = [{"lat": lat, "lon": lon} for lat, lon in zip(latitudes, longitudes)]

    # Build geo and time metadata JSON from raw fields
    geo_meta = group[group['element_text_element_name'].isin(field_priority["geo_metadata"])]
    geo_dict = dict(zip(geo_meta['element_text_element_name'], geo_meta['element_text_text']))

    time_meta = group[group['element_text_element_name'].isin(field_priority["time_metadata"])]
    time_dict = dict(zip(time_meta['element_text_element_name'], time_meta['element_text_text']))

    records.append({
        "id": item_id,
        "title": title,
        "text": text,
        "creator": creator,
        "locations": locations if locations else None,
        "geo_metadata": geo_dict if geo_dict else None,
        "time_metadata": time_dict if time_dict else None
    })

# Convert to DataFrame
wide_df = pd.DataFrame(records)


In [9]:
wide_df

Unnamed: 0,id,title,text,creator,locations,geo_metadata,time_metadata
0,839,Hut 211,Hut 211 in the former women's camp. Here child...,Gedenkstätte Bergen-Belsen,,,
1,840,Huts 9 and 10,"In 1941, the Wehrmacht was planning to build 2...",Gedenkstätte Bergen-Belsen,,,
2,841,The Men's Camp,"The first transport with 1,000 sick prisoners ...",Gedenkstätte Bergen-Belsen,,,
3,842,The Star Camp,"More than 5,400 prisoners were imprisoned in t...",Gedenkstätte Bergen-Belsen,,,
4,843,The Gate to the Concentration Camp Prisoners'...,A fence with a simple gate on the camp’s main ...,Gedenkstätte Bergen-Belsen,,,
...,...,...,...,...,...,...,...
306,2145,"No food, no water","“Nothing at all: no food, no water. There was ...",Mala Birnbaum,,,{'Dates of Reference': '1945-04-xx - 1945-4-15'}
307,2149,"Arrival at the prisoners’ compound, June 1944","""[...] Through a shaky double gate strung ever...",Pierre Petit,"[{'lat': 52.76301546, 'lon': 9.914206557}, {'l...","{'Origin Place Latitude': '52.76301546', 'Orig...",{'Dates of Creation': 'xx/xx/1965 - xx/xx/1965...
308,2151,Aimé Blanc's prisoner tag,,unknown,,,{'Earliest Start of Reference Period': '1944-0...
309,2152,Status report for Bergen-Belsen concentration ...,,unbekannt/unknown,"[{'lat': 52.765300269101, 'lon': 9.91832613945...",{'Reference Place Latitude': '52.765300269101'...,{'Earliest Start of Reference Period': '1945-0...


Add urls to public url, and actual images

In [10]:
wide_df['public_url'] = wide_df['id'].apply(get_public_url)
wide_df['files_url'] = wide_df['id'].apply(
    lambda item_id: extract_file_urls(omeka_client.get_files_by_item(item_id))
)

In [79]:
wide_df.to_parquet('../../data/omeka_items_wide.parquet', index=False)

In [81]:
wide_df.head(1)

Unnamed: 0,id,title,text,creator,locations,geo_metadata,time_metadata,public_url,files_url
0,839,Hut 211,Hut 211 in the former women's camp. Here child...,Gedenkstätte Bergen-Belsen,,,,https://bb-g.futurememoryfoundation.org/items/...,[https://bb-g.futurememoryfoundation.org/files...


In [23]:
time_col = wide_df[~wide_df['time_metadata'].isna()][['id','time_metadata']]

In [40]:
time_col

Unnamed: 0,id,time_metadata
37,1136,{'Dates of Creation': 'xx/04/1945 - xx/12/1945...
38,1137,"{'Dates of Creation': '1945', 'Dates of Refere..."
39,1138,{'Dates of Creation': 'xx/06/1945 - xx/06/1945...
40,1140,"{'Dates of Creation': '28/04/1945', 'Dates of ..."
41,1141,"{'Dates of Creation': '28/04/1945', 'Dates of ..."
...,...,...
306,2145,{'Dates of Reference': '1945-04-xx - 1945-4-15'}
307,2149,{'Dates of Creation': 'xx/xx/1965 - xx/xx/1965...
308,2151,{'Earliest Start of Reference Period': '1944-0...
309,2152,{'Earliest Start of Reference Period': '1945-0...


In [70]:
times = time_col.groupby('id')['time_metadata'].apply(list).to_dict()

In [71]:
import json
with open('../../data/omeka_times.json', 'w', encoding='utf-8') as f:
    json.dump(times, f, indent=2, ensure_ascii=False)

Date normalization

In [91]:
import re
import json
import dateparser
import pandas as pd

def extract_all_date_strings(text):
    """
    Extracts date-like substrings from a raw string using strict patterns.
    """
    if not isinstance(text, str):
        return []

    patterns = [
        r'\b\d{4}-\d{2}-\d{2}\b',              # Full ISO: 1945-04-10
        r'\b\d{4}-\d{1,2}-\d{1,2}\b',          # Unpadded ISO: 1945-4-1
        r'\b\d{1,2}/\d{1,2}/\d{4}\b',          # Slashes: 10/04/1945
        r'\b\d{1,2}/\d{4}\b',                  # Month/Year: 04/1945
        r'\b\d{4}-\d{2}\b',                    # Year-Month: 1945-04
        r'\b\d{4}-\d{2}-xx\b',                 # Fuzzy ISO day
        r'\b\d{4}-xx-xx\b',                    # Fuzzy ISO month+day
        r'\[?xx/xx/\d{4}\]?',                  # Fuzzy slashes: xx/xx/1945
        r'\[?\d{1,2}/\d{1,2}/\d{4}\]?',        # Bracketed slashes
        r'\[?\d{1,2}\.\d{1,2}\.\d{4}\]?',      # Dot format: 10.04.1945
        r'\b\d{4}\b',                          # Year only
    ]
    combined_pattern = '|'.join(patterns)
    return re.findall(combined_pattern, text)

def normalize_date_str(date_str):
    if not date_str or not isinstance(date_str, str):
        return None

    clean = date_str.strip('[]').strip()

    # Already full ISO date
    if re.match(r"^\d{4}-\d{2}-\d{2}$", clean):
        return clean + "T00:00:00Z"

    # Handle year only
    if re.match(r"^\d{4}$", clean):
        return f"{clean}-01-01T00:00:00Z"

    # Handle year-month only
    if re.match(r"^\d{4}-\d{2}$", clean):
        return f"{clean}-01T00:00:00Z"

    # Fuzzy formats like xx/04/1945
    if 'xx' in clean:
        year_match = re.search(r'(\d{4})', clean)
        month_match = re.search(r'xx/(\d{2})/(\d{4})', clean)
        if year_match and not month_match:
            return f"{year_match.group(1)}-01-01T00:00:00Z"
        elif month_match:
            return f"{month_match.group(2)}-{month_match.group(1)}-01T00:00:00Z"
        clean = clean.replace('xx', '01')

    if '.' in clean:
        clean = clean.replace('.', '/')

    if re.match(r'^\d{1,2}/\d{4}$', clean):
        clean = f"01/{clean}"

    dt = dateparser.parse(clean, settings={
        'PREFER_DAY_OF_MONTH': 'first',
        'DATE_ORDER': 'DMY',
        'RETURN_AS_TIMEZONE_AWARE': False
    })

    return dt.isoformat() + "Z" if dt else None

def normalize_metadata_dict(metadata):
    """
    Normalize all date-like values in a metadata dict.
    Returns a new dict with same keys, each containing list of normalized RFC3339 dates.
    """
    result = {}
    if not isinstance(metadata, dict):
        return result  # Safely return empty if input is None or invalid

    for key, value in metadata.items():
        if not isinstance(value, str):
            continue

        # Only extract known date-like strings
        raw_dates = extract_all_date_strings(value)

        # Normalize extracted date strings
        normalized = [normalize_date_str(d) for d in raw_dates]

        # Remove duplicates and failed parses
        filtered = sorted(set(d for d in normalized if d))

        if filtered:
            result[key] = filtered

    return result


In [63]:
time_col["normalized_dates"] = time_col["time_metadata"].apply(normalize_metadata_dict)

In [None]:
def build_combined_dict(df):
    result = {}

    grouped = df.groupby('id')['time_metadata'].apply(list).to_dict()

    for doc_id, metadata_list in grouped.items():
        enriched = []
        for metadata in metadata_list:
            normalized = normalize_metadata_dict(metadata)
            enriched.append({
                "original": metadata,
                "normalized_dates": normalized
            })
        result[str(doc_id)] = enriched

    return result

# --- Apply on your DataFrame ---
# Assuming time_col is a DataFrame with 'id' and 'time_metadata'
combined_dict = build_combined_dict(time_col)

# (Optional) Save to JSON
import json
with open("omeka_times_fixed.json", "w", encoding="utf-8") as f:
    json.dump(combined_dict, f, indent=2, ensure_ascii=False)

In [95]:
wide_df["time_metadata"] = wide_df["time_metadata"].apply(normalize_metadata_dict)

In [97]:
wide_df.columns

Index(['id', 'title', 'text', 'creator', 'locations', 'geo_metadata',
       'time_metadata', 'public_url', 'files_url'],
      dtype='object')

snake case metadata items

In [158]:
import numpy as np

def to_snake_case(text):
    return re.sub(r'[\s\-]+', '_', text.strip()).lower()

def normalize_dict_keys(d):
    if isinstance(d, dict):
        return {to_snake_case(k): v for k, v in d.items()}
    return d

def empty_to_nan(val):
    if isinstance(val, (list, dict)) and not val:
        return np.nan
    return val

In [159]:
wide_df["geo_metadata"] = wide_df["geo_metadata"].apply(normalize_dict_keys)
wide_df["time_metadata"] = wide_df["time_metadata"].apply(normalize_dict_keys)

wide_df["geo_metadata"] = wide_df["geo_metadata"].apply(empty_to_nan)
wide_df["time_metadata"] = wide_df["time_metadata"].apply(empty_to_nan)
wide_df["files_url"] = wide_df["files_url"].apply(empty_to_nan)

## Embedding and Metadata

To start we should decide whether what fields do we embedd for each item.
Additionally which metadata payload to input to Qdrant, the latter should compromise time, space, images at least.



In [125]:
def safe_first_file_url(files_url):
    if isinstance(files_url, list) and files_url:
        return files_url[0]
    elif isinstance(files_url, str):
        return files_url
    return None

In [148]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct

client = QdrantClient(
    url="https://8b82ea46-4590-4c10-ab6e-5cffadbf67df.europe-west3-0.gcp.cloud.qdrant.io",
    api_key="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.XPp3QvP2ktvjUPa48GZT9Vk6ax-KdXZg10p5HeZEDQc"
)

In [161]:
collection_name = "omeka_items"

client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=len(vectors[0]), distance=Distance.COSINE)
)

  client.recreate_collection(


True

In [162]:
from sentence_transformers import SentenceTransformer
from uuid import uuid4

# 1. Load the embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")  # 384-dim vector

# 2. Prepare vectors and payloads
points = []

for _, row in wide_df.iterrows():
    combined_text = f"{row['title']}\n{row['text']}" if pd.notnull(row['text']) else row['title']
    embedding = model.encode(combined_text)

    # Collect metadata
    payload = {
        "id": str(row['id']),
        "title": row['title'],
        "text": row.get('text'),
        "creator": row.get('creator'),
        "locations": row.get('locations'),
        "geo_metadata": row.get('geo_metadata'),
        "time_metadata": row.get('time_metadata'),
        "public_url": row.get('public_url'),
        "image_url": safe_first_file_url(row.get('files_url')),
        "files_url": row.get('files_url'),
    }

    points.append(
        PointStruct(
            id=str(uuid4()),  # or use row["id"] if it's unique
            vector=embedding,
            payload=payload
        )
    )

### Qdrant Ingest

In [163]:
client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [196]:
from qdrant_client.models import Filter, FieldCondition, MatchValue, PayloadSchemaType

client.create_payload_index(
    collection_name="omeka_items",
    field_name="time_metadata.dates_of_creation",
    field_type="datetime"
)

UpdateResult(operation_id=10, status=<UpdateStatus.COMPLETED: 'completed'>)

## Test Querying

In [186]:
from qdrant_client import models
query = "Hunger"
query_vector = model.encode(query)

response = client.query_points(
    collection_name=collection_name,
    query=query_vector,
    limit=5
)

response.points[1].payload

{'id': '2140',
 'title': 'Food rations',
 'text': '“The food has grown significantly worse since the move. At midday we now receive a soup made of cabbage or spinach, but which has almost no fat. For our bread there is margarine instead of butter, and instead of the good jam there is an acidic compote. We haven\'t received our evening soup for a whole month. We all feel bad, we are all constantly hungry and irritated. We’ve never quarrelled so much before. Acquaintances fight, impatient mothers hit their children, married couples have on-going wars of words. If we receive a few days’ worth of bread rations, we use a knife to mark how much of the bread we can eat each day. Anyone who can’t control themselves eats their entire ration in the first days and has to go without bread in the last days. Then you just lie in bed. It’s easier to bear the hunger lying down."\n\n“The food has grown significantly worse since the move. At midday we now receive a soup made of cabbage or spinach, but w

In [215]:
from datetime import datetime, timedelta

def shift_iso_datetime(iso_str, hours=0, days=0):
    dt = datetime.strptime(iso_str, "%Y-%m-%dT%H:%M:%SZ")
    shifted = dt + timedelta(hours=hours, days=days)
    return shifted.strftime("%Y-%m-%dT%H:%M:%SZ")


In [221]:
from qdrant_client.models import Filter, FieldCondition, DatetimeRange

filter_datetime = "1944-01-31T10:14:31Z"

search_filter = Filter(
    must=[
        FieldCondition(
            key="time_metadata.dates_of_creation",
            range=DatetimeRange(
                gt=filter_datetime,
                lte=shift_iso_datetime(filter_datetime, days=1000)
                )  # 1 hour after
        )
    ]
)

results = client.query_points(
    collection_name="omeka_items",
    query=model.encode("your query text"),
    limit=10,
    query_filter=search_filter,
    with_payload=True
)

In [222]:
print(results.model_dump(exclude_none=True, exclude_unset=True))

{'points': [{'id': 'cce533d6-03e7-4a2e-9423-4d062037e14e', 'version': 0, 'score': 0.13541843, 'payload': {'id': '1167', 'title': 'Incinerator', 'text': None, 'creator': 'Fotograf unbekannt /  Photographer unknown', 'locations': [{'lat': 52.75933271, 'lon': 9.903840874}, {'lat': 52.75949922, 'lon': 9.903894835}, {'lat': 52.759346864, 'lon': 9.9039709568024}], 'geo_metadata': {'origin_place_latitude': '52.75933271', 'origin_place_longitude': '9.903840874', 'reference_place_latitude': '52.75949922', 'reference_place_longitude': '9.903894835', 'viewpoint_latitude': '52.759346864', 'viewpoint_longitude': '9.9039709568024'}, 'time_metadata': {'dates_of_creation': ['1945-04-15T00:00:00Z', '1945-10-01T00:00:00Z'], 'dates_of_reference': ['1945-04-15T00:00:00Z', '1945-10-01T00:00:00Z'], 'earliest_start_of_reference_period': ['1945-04-15T00:00:00Z'], 'latest_end_of_reference_period': ['1945-10-31T00:00:00Z']}, 'public_url': 'https://bb-g.futurememoryfoundation.org/items/show/1167', 'image_url': '

In [223]:
with open("omeka_search_results.json", "w", encoding="utf-8") as f:
    f.write(results.model_dump_json(indent=2, exclude_none=True))

In [None]:
import folium

results, _ = client.scroll(
    collection_name="omeka_items",
    with_payload=True,
    limit=1000  # Adjust to number of points you want
)

# Create a base map centered over Europe
m = folium.Map(location=[51.0, 10.0], zoom_start=5)

for point in results:
    locations = payload.get("locations")

    payload = point.payload
    geo = locations[0] if isinstance(locations, list) and locations else None
    title = payload.get("title", "No Title")

    if geo and isinstance(geo, dict):
        try:
            lat = float(geo["lat"])
            lon = float(geo["lon"])
            folium.Marker(
                location=[lat, lon],
                popup=title,
                tooltip=title
            ).add_to(m)
        except (KeyError, ValueError):
            continue

# Show the map in Jupyter notebook
m