In [1]:
import pandas as pd
import omeka_tools as ot
import omeka_tools.omeka_extractor as oe
from omeka_tools.standarize.dates import extract_and_standardize_dates

In [2]:
omeka_client = ot.client.OmekaClient()

In [3]:
# retrieve items
omeka_items = oe.fetch_all_items()

# clean
items = [oe.filter_json(item) for item in omeka_items]

# metadata df
metadata_df = oe.extract_metadata(items)

Retrieved 740 items.


In [5]:
# metadata items
corpus = metadata_df[["id", "element__name", "text"]].rename({"element__name": "field"}, axis=1)

In [6]:
# Define priority mapping for fields
text_fields = [
    "Translated Full Text Fragment (English)",
    "Main Caption (English)",
    "Additional Caption 1 (English)",
    "Additional Caption 2 (English)",
    "Transcription from depicted text",
    "Display Label (English)",
    "Historical Caption",
]

field_priority = {
    "title": ["Translated Title (English)", "Title"],
    "text": text_fields,
    "creator": ["Creator"],
    "latitude": [
        "Viewpoint Latitude", "Origin Place Latitude", "Reference Place Latitude", "Label Latitude"
    ],
    "longitude": [
        "Viewpoint Longitude", "Origin Place Longitude", "Reference Place Longitude", "Label Longitude"
    ],
    "geo_metadata": [
        "Viewpoint Latitude", "Viewpoint Longitude", "Viewpoint Elevation", "Viewpoint Altitude",
        "Origin Place Latitude", "Origin Place Longitude", "Origin Place Altitude",
        "Reference Place Latitude", "Reference Place Longitude", "Reference Place Elevation",
        "Label Latitude", "Label Longitude", "Label Elevation Correction", "Horizon Height Estimate"
    ],
    "time_metadata": [
        "Dates of Creation", "Dates of Reference", "Date Available",
        "Earliest Start of Reference Period", "Latest End of Reference Period"
    ]
}

# extract first available field by priority
def extract_field(group, priorities):
    for field in priorities:
        match = group[group['field'] == field]
        if not match.empty:
            return match.iloc[0]['text']
    return None

# concatenate text fields in order
def concatenate_fields(group, fields):
    texts = []
    for field in fields:
        matches = group[group['field'] == field]
        texts.extend(matches['text'].tolist())
    return "\n\n".join(texts) if texts else None

# Group by item ID
records = []

for item_id, group in corpus.groupby('id'):
    title = extract_field(group, field_priority["title"])
    text = concatenate_fields(group, field_priority["text"])
    creator = extract_field(group, field_priority["creator"])

    # Build list of lat/lon pairs
    lat_fields = group[group['field'].isin(field_priority["latitude"])]
    lon_fields = group[group['field'].isin(field_priority["longitude"])]

    latitudes = lat_fields['text'].astype(float).tolist()
    longitudes = lon_fields['text'].astype(float).tolist()
    locations = [{"lat": lat, "lon": lon} for lat, lon in zip(latitudes, longitudes)]

    # Build geo and time metadata JSON from raw fields
    geo_meta = group[group['field'].isin(field_priority["geo_metadata"])]
    geo_dict = dict(zip(geo_meta['field'], geo_meta['text']))

    time_meta = group[group['field'].isin(field_priority["time_metadata"])]
    time_dict = dict(zip(time_meta['field'], time_meta['text']))

    records.append({
        "id": item_id,
        "title": title,
        "text": text,
        "creator": creator,
        "locations": locations if locations else None,
        "geo_metadata": geo_dict if geo_dict else None,
        "time_metadata": time_dict if time_dict else None
    })

# Convert to DataFrame
corpus_wide = pd.DataFrame(records)

In [7]:
# date normalization
corpus_wide["time_metadata"] = corpus_wide["time_metadata"].apply(extract_and_standardize_dates)



In [17]:
import numpy as np
import re

def to_snake_case(text):
    return re.sub(r'[\s\-]+', '_', text.strip()).lower()

def normalize_dict_keys(d):
    if isinstance(d, dict):
        return {to_snake_case(k): v for k, v in d.items()}
    return d

def empty_to_nan(val):
    if isinstance(val, (list, dict)) and not val:
        return np.nan
    return val

def strip_str(val):
    if not val:
        return np.nan
    return val.strip()

corpus_wide["geo_metadata"] = corpus_wide["geo_metadata"].apply(normalize_dict_keys)
corpus_wide["time_metadata"] = corpus_wide["time_metadata"].apply(normalize_dict_keys)
corpus_wide["geo_metadata"] = corpus_wide["geo_metadata"].apply(empty_to_nan)
corpus_wide["time_metadata"] = corpus_wide["time_metadata"].apply(empty_to_nan)
corpus_wide["text"] = corpus_wide["text"].apply(strip_str)

AttributeError: 'float' object has no attribute 'strip'

In [9]:
corpus_wide.to_csv("./data/corpus_bb.csv", index=False)

In [10]:
corpus_wide = pd.read_csv("./data/corpus_bb.csv")

In [11]:
display(corpus_wide)

Unnamed: 0,id,title,text,creator,locations,geo_metadata,time_metadata
0,839,Hut 211,Hut 211 in the former women's camp. Here child...,Gedenkstätte Bergen-Belsen,,,
1,840,Huts 9 and 10,"In 1941, the Wehrmacht was planning to build 2...",Gedenkstätte Bergen-Belsen,,,
2,841,The Men's Camp,"The first transport with 1,000 sick prisoners ...",Gedenkstätte Bergen-Belsen,,,
3,842,The Star Camp,"More than 5,400 prisoners were imprisoned in t...",Gedenkstätte Bergen-Belsen,,,
4,843,The Gate to the Concentration Camp Prisoners'...,A fence with a simple gate on the camp’s main ...,Gedenkstätte Bergen-Belsen,,,
...,...,...,...,...,...,...,...
735,2151,Aimé Blanc's prisoner tag,The prisoners on the first transport to Bergen...,unknown,,,{'earliest_start_of_reference_period': ['1944-...
736,2152,Status report for Bergen-Belsen concentration ...,This is one of the few surviving documents fro...,unbekannt/unknown,"[{'lat': 52.765300269101, 'lon': 9.91832613945...",{'reference_place_latitude': '52.765300269101'...,{'earliest_start_of_reference_period': ['1945-...
737,2153,Bible and sermon sketch,The renowned Dutch Rabbi Simon Philip de Vries...,unknown,,,{'latest_end_of_reference_period': ['1944-03-2...
738,2154,Funeral in the Hungarian 'Sonder-Lager'. Peopl...,Watercolour drawing\n\nTranslated from the Hun...,Ervin Abádi,,,{'earliest_start_of_reference_period': ['1944-...


In [12]:
corpus_wide['text_length_chars'] = corpus_wide['text'].str.len()
corpus_wide['text_length_words'] = corpus_wide['text'].str.split().str.len()

In [13]:
# filter by length
dataset = corpus_wide[~corpus_wide['text'].isna()]

In [15]:
dataset = dataset[dataset['text_length_chars'] > 100]

In [47]:
filter_ids = [1554]

In [48]:
dataset = dataset[~dataset['id'].isin(filter_ids)]

In [49]:
dataset.to_csv("./data/bergen-belsen-omeka.csv")

In [50]:
dataset

Unnamed: 0,id,title,text,creator,locations,geo_metadata,time_metadata,text_length_chars,text_length_words,id_link
0,839,Hut 211,Hut 211 in the former women's camp. Here child...,Gedenkstätte Bergen-Belsen,,,,114.0,19.0,"<a href=""https://bb-g.futurememoryfoundation.o..."
1,840,Huts 9 and 10,"In 1941, the Wehrmacht was planning to build 2...",Gedenkstätte Bergen-Belsen,,,,1099.0,185.0,"<a href=""https://bb-g.futurememoryfoundation.o..."
2,841,The Men's Camp,"The first transport with 1,000 sick prisoners ...",Gedenkstätte Bergen-Belsen,,,,459.0,69.0,"<a href=""https://bb-g.futurememoryfoundation.o..."
3,842,The Star Camp,"More than 5,400 prisoners were imprisoned in t...",Gedenkstätte Bergen-Belsen,,,,480.0,80.0,"<a href=""https://bb-g.futurememoryfoundation.o..."
4,843,The Gate to the Concentration Camp Prisoners'...,A fence with a simple gate on the camp’s main ...,Gedenkstätte Bergen-Belsen,,,,390.0,67.0,"<a href=""https://bb-g.futurememoryfoundation.o..."
...,...,...,...,...,...,...,...,...,...,...
734,2150,View from the watchtower,View from the watchtower on the entrance area ...,unknown,,,{'earliest_start_of_reference_period': ['1941-...,233.0,36.0,"<a href=""https://bb-g.futurememoryfoundation.o..."
735,2151,Aimé Blanc's prisoner tag,The prisoners on the first transport to Bergen...,unknown,,,{'earliest_start_of_reference_period': ['1944-...,489.0,82.0,"<a href=""https://bb-g.futurememoryfoundation.o..."
736,2152,Status report for Bergen-Belsen concentration ...,This is one of the few surviving documents fro...,unbekannt/unknown,"[{'lat': 52.765300269101, 'lon': 9.91832613945...",{'reference_place_latitude': '52.765300269101'...,{'earliest_start_of_reference_period': ['1945-...,274.0,43.0,"<a href=""https://bb-g.futurememoryfoundation.o..."
737,2153,Bible and sermon sketch,The renowned Dutch Rabbi Simon Philip de Vries...,unknown,,,{'latest_end_of_reference_period': ['1944-03-2...,396.0,70.0,"<a href=""https://bb-g.futurememoryfoundation.o..."


In [65]:
BASE_URL = "https://bb-g.futurememoryfoundation.org/items/show/"

# 1. Prepare Data and Hyperlink Column
# Note: You need to have the 'dataset' DataFrame defined before this block.
dataset['id_link'] = dataset['id'].apply(lambda x: f'<a href="{BASE_URL}{x}" target="_blank">{x}</a>')

# 2. Select columns and start the Styler object
report_cols = ['id_link', 'title', 'text', 'creator']
styled_df = dataset[report_cols].style

# --- Tooltip Setup (Correct as fixed in previous steps) ---
df_index = styled_df.data.index
tooltip_column_name = 'id_link'
tooltip_text = "Click to view the source item."
tooltip_series = pd.Series([tooltip_text] * len(df_index), index=df_index)
tooltip_df = pd.DataFrame({tooltip_column_name: tooltip_series})
styled_df = styled_df.set_tooltips(tooltip_df)

# 3. Apply Styling, Formatting, and Attributes

styled_df = styled_df.set_caption("Bergen-Belsen Metadata Sharing")

# ⭐️ CORRECTION 1: CONSOLIDATE TABLE ATTRIBUTES
# This line ensures the critical 'id="reportTable"' attribute is present.
styled_df = styled_df.set_table_attributes('id="reportTable" class="styled-report-table"')

# ⭐️ CORRECTION 2: Use set_properties ONCE for CSS
# You can define ALL cell-specific CSS properties here.
styled_df = styled_df.set_properties(**{
    'font-size': '10pt', 
    'border-color': 'lightgray'
})

# Apply table styles (th/table width CSS)
styled_df = styled_df.set_table_styles([
    # We remove the .styled-report-table selector here since width is set by the CSS sheet, 
    # but keeping it for completeness if you have custom inline styles you want to enforce.
    {'selector': '.styled-report-table', 'props': [('width', '90%')]}, 
    {'selector': 'th', 'props': [('text-align', 'left')]}
])

# Apply data formatting (thousand separators)
# styled_df = styled_df.format({
#     'text_length_chars': '{:,.0f}',
#     'text_length_words': '{:,.0f}'
# })

# Apply background gradient
# styled_df = styled_df.background_gradient(
#     subset=['text_length_chars', 'text_length_words'],
#     cmap='Blues'
# )

# 4. Generate the HTML string
html_table_string = styled_df.to_html(index=False)

In [66]:
full_html_template = f"""
<!DOCTYPE html>
<html>
<head>
    <title>Metadata Report</title>
    <style>
        .styled-report-table {{
            border-collapse: collapse; width: 90%; margin: 25px auto;
            font-family: Arial, sans-serif; box-shadow: 0 0 20px rgba(0, 0, 0, 0.1);
            text-align: left;
        }}
        .styled-report-table caption {{ font-size: 1.5em; font-weight: bold; padding-bottom: 10px; }}
        
        .styled-report-table th, .styled-report-table td {{ 
            padding: 12px 15px; 
            border: 1px solid #ddd; 
            vertical-align: top; 
        }}
        
        .styled-report-table th {{ background-color: #f2f2f2; color: #333; cursor: pointer; }}
        .styled-report-table th:hover {{ background-color: #e0e0e0; }}
        
        .styled-report-table th.sorted-asc::after {{ content: ' ▲'; }}
        .styled-report-table th.sorted-desc::after {{ content: ' ▼'; }}
    </style>
</head>
<body>
    {html_table_string}

    <script>
        document.addEventListener('DOMContentLoaded', () => {{
            const table = document.getElementById('reportTable');
            if (!table) return; // Exit if table not found

            const getCellValue = (tr, idx) => {{
                const cell = tr.children[idx];
                
                // Handle Hyperlinks (gets the visible ID number)
                const link = cell.querySelector('a');
                if (link) {{
                    return link.innerText || link.textContent;
                }}
                
                // Fallback for regular cells
                return cell.innerText || cell.textContent;
            }};

            const comparer = (idx, asc) => (a, b) => {{
                let va = getCellValue(asc ? a : b, idx);
                let vb = getCellValue(asc ? b : a, idx);
                
                // Robust Numeric Comparison (handles formatted numbers like "150,000")
                const numeric_va = parseFloat(va.replace(/[^0-9.-]+/g, ""));
                const numeric_vb = parseFloat(vb.replace(/[^0-9.-]+/g, ""));

                const isNumeric = !isNaN(numeric_va) && !isNaN(numeric_vb);
                
                if (isNumeric) {{
                    // Sort numerically
                    return numeric_va - numeric_vb;
                }}

                // Default to String comparison
                return va.toString().localeCompare(vb.toString());
            }};

            table.querySelectorAll('th').forEach(th => th.addEventListener('click', (() => {{
                const tableBody = table.querySelector('tbody');
                const index = Array.from(th.parentNode.children).indexOf(th);
                const asc = th.asc = !th.asc;

                Array.from(tableBody.querySelectorAll('tr'))
                    .sort(comparer(index, asc))
                    .forEach(tr => tableBody.appendChild(tr));

                // Clear and set sort indicator
                table.querySelectorAll('th').forEach(h => h.classList.remove('sorted-asc', 'sorted-desc'));
                th.classList.add(asc ? 'sorted-asc' : 'sorted-desc');
            }})));
        }});
    </script>
</body>
</html>
"""

In [67]:
# 6. Save the HTML file with Sorting (JavaScript)
output_file_path = './data/omeka_bb_dataset.html'

with open(output_file_path, 'w') as f:
    f.write(full_html_template)

print(f"✅ Report saved successfully to: {output_file_path}")
print("The 'ID' column is now clickable, and the table headers (including length columns) are sortable.")

✅ Report saved successfully to: ./data/omeka_bb_dataset.html
The 'ID' column is now clickable, and the table headers (including length columns) are sortable.


In [68]:
import pdfkit
from pathlib import Path
import os

# -------------------------------------------------------------
# PDF CONVERSION LOGIC
# -------------------------------------------------------------

# 1. Define the output path for the PDF
pdf_output_path = Path('./data/omeka_bb_dataset.pdf')

# Ensure the output directory exists
pdf_output_path.parent.mkdir(parents=True, exist_ok=True)

try:
    # 2. Configure wkhtmltopdf options (Optional but Recommended)
    # The default behavior is often fine, but these settings improve table layout and printing.
    options = {
        'page-size': 'A4',
        'orientation': 'Landscape', # Use Landscape if your table has many columns
        'margin-top': '1in',
        'margin-right': '0.5in',
        'margin-bottom': '1in',
        'margin-left': '0.5in',
        'encoding': "UTF-8",
        'no-outline': None, # Don't print outline (table of contents)
        # Disable smart shrinking/scaling, often leads to better layout for data tables
        'disable-smart-shrinking': '', 
        # Crucial for multi-page tables: allows tables to break across pages
        'enable-local-file-access': True
    }

    # 3. Use pdfkit to convert the HTML string to a PDF file
    pdfkit.from_string(
        html_table_string, 
        str(pdf_output_path), 
        options=options
    )

    print(f"✅ Report saved successfully to: {pdf_output_path.resolve()}")

except IOError as e:
    print("❌ ERROR: PDF conversion failed.")
    print("This usually means wkhtmltopdf is not installed or its path is not configured.")
    print("Please install wkhtmltopdf or check the path if running in a non-standard environment.")
    print(f"Details: {e}")

# Note: You still need to run your HTML saving code if you want both files.

✅ Report saved successfully to: /workspace/MEMORISE/omeka-tools/notebooks/data/omeka_bb_dataset.pdf


#### Validation of Date Normalization

In [None]:
time_col = corpus_wide[~corpus_wide['time_metadata'].isna()][['id','time_metadata']]
time_col["normalized_dates"] = time_col["time_metadata"].apply(extract_and_standardize_dates)

In [None]:
# debug time normalization
def build_combined_dict(df):
    result = {}

    grouped = df.groupby('id')['time_metadata'].apply(list).to_dict()

    for doc_id, metadata_list in grouped.items():
        enriched = []
        for metadata in metadata_list:
            normalized = extract_and_standardize_dates(metadata)
            enriched.append({
                "original": metadata,
                "normalized_dates": normalized
            })
        result[str(doc_id)] = enriched

    return result

# --- Apply on your DataFrame ---
combined_dict = build_combined_dict(time_col)

# (Optional) Save to JSON
import json
with open("./data/omeka_times_fixed.json", "w", encoding="utf-8") as f:
    json.dump(combined_dict, f, indent=2, ensure_ascii=False)