In [None]:
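# Packages that must be installed into the conda environment before running this notebook
# (uncomment to install; %pip targets the kernel's environment, unlike !pip):
# %pip install loguru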

In [None]:
import json
import boto3
import base64
import re
import os
from PIL import Image
from pathlib import Path
import glob
import time
from importlib import reload
import pandas as pd
from IPython.display import display
from io import BytesIO
from botocore.config import Config
import logging

# Send INFO-level log records to the notebook output; force=True removes any handlers already
# attached to the root logger, so re-running this cell does not stack duplicate handlers.
logging.basicConfig(level=logging.INFO, force=True)  # Resets handlers

# Reload edited modules automatically before each cell executes
%load_ext autoreload
%autoreload 2

# Show DataFrames in full: no truncation of cell text, rows or columns
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# os.chdir('..')
# The relative paths used by later cells are resolved against this directory
print("CWD:", os.getcwd())
# AWS clients built with boto3 defaults (no explicit region or credentials are passed here)
bedrock_runtime = boto3.client("bedrock-runtime")
s3 = boto3.client("s3")

In [None]:
# Import library-specific modules. The package is imported from its source tree by temporarily
# changing into lib/src; the starting working directory is restored afterwards whether or not
# the imports succeed.
_cwd_before_lib_import = os.getcwd()
try:
    os.chdir("../../lib/src")
    import image_captioning_assistant.generate.prompts as p
    import image_captioning_assistant.data.data_classes as dc

    import image_captioning_assistant.generate.generate_structured_metadata as gsm
    import image_captioning_assistant.generate.utils as gen_utils

    import image_captioning_assistant.aws.s3 as s3_util
finally:
    # Go back to the directory recorded above instead of a hard-coded relative path: a relative
    # path would be resolved against the wrong location if the chdir in the try block failed,
    # and would drift further on every re-run of this cell.
    os.chdir(_cwd_before_lib_import)

In [None]:
# Helper functions for notebook
def show_base64_image(encoded_str):
    """Decode a base64-encoded image and render it inline in the notebook.

    Args:
        encoded_str: Base64 text of an image; trailing ``=`` padding may be missing.
    """
    # A base64 payload must be a multiple of 4 characters long; top it up when it is not
    pad_len = -len(encoded_str) % 4
    if pad_len:
        encoded_str = encoded_str + "=" * pad_len

    raw_bytes = base64.b64decode(encoded_str)
    display(Image.open(BytesIO(raw_bytes)))

    
def download_s3_directory(bucket, s3_prefix, local_dir):
    """Download every object under an S3 prefix into a local directory tree.

    Args:
        bucket: Name of the S3 bucket to read from.
        s3_prefix: Key prefix to download. It is removed from each key to form the path
            relative to ``local_dir``; a trailing slash is optional.
        local_dir: Destination directory (``str`` or ``pathlib.Path``). Parent directories
            of each downloaded file are created as needed.
    """
    local_root = Path(local_dir)  # accept plain strings as well as Path objects
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")

    for page in paginator.paginate(Bucket=bucket, Prefix=s3_prefix):
        for obj in page.get("Contents", []):
            key = obj["Key"]
            # Skip directory markers
            if key.endswith("/"):
                continue

            # Remove the prefix, then any "/" left over when the prefix was given without a
            # trailing slash: joining a path that starts with "/" onto local_root would discard
            # local_root and write relative to the filesystem root instead.
            relative_path = key.replace(s3_prefix, "", 1).lstrip("/")
            if not relative_path:
                # The key is exactly the prefix (a single object); keep its file name.
                relative_path = key.rsplit("/", 1)[-1]
            local_path = local_root / relative_path

            # Create parent directories and download
            local_path.parent.mkdir(parents=True, exist_ok=True)
            s3.download_file(bucket, key, str(local_path))


def display_work_id_images(work_id):
    """
    Render every page image of a work inline, reading from S3 without saving to disk.

    Uses the module-level ``ground_truth_df`` (page SHA1s per work) and ``bucket_name``.
    """
    # One S3 URI per page SHA1 recorded for this work
    page_shas = ground_truth_df.loc[ground_truth_df["work_id"] == work_id, "page_sha1"]
    uris = [f"s3://{bucket_name}/ground_truth_images/{sha}" for sha in page_shas]

    # Empty resize_kwargs: no resize options are handed to the loader (intended as "no resizing")
    loaded = gen_utils.load_and_resize_images(
        image_s3_uris=uris,
        s3_kwargs={},
        resize_kwargs={},
    )

    # The loader may yield None for an image; report it and move on
    for payload in loaded:
        if payload is None:
            print("Failed to load image")
            continue
        display(Image.open(BytesIO(payload)))


def print_output(output):
    """Display a model output dict as a table.

    Args:
        output: Either a metadata dict (recognised by its "description" key), shown as one
            row per item, or a dict with a "bias_analysis" list of dicts, shown with a
            (Bias ID, Bias Item) multi-index where Bias ID counts from 1.
    """
    if "description" in output:
        fields = pd.Series(output)
        display(pd.DataFrame({"Metadata Item": fields.index, "Output from AI Model": fields.values}))
        return

    # Flatten the list of bias dicts into parallel index-key / value lists
    index_keys = []
    cell_values = []
    for bias_id, bias_dict in enumerate(output["bias_analysis"], start=1):
        for item_name, item_value in bias_dict.items():
            index_keys.append((bias_id, item_name))
            cell_values.append(item_value)

    bias_index = pd.MultiIndex.from_tuples(index_keys, names=["Bias ID", "Bias Item"])
    display(pd.DataFrame({"Output from AI Model": cell_values}, index=bias_index))

def gen_metadata_for_wid(
    work_id,
    page_title="Front",
    model_id="us.anthropic.claude-3-5-sonnet-20241022-v2:0",
    bucket="gaiic-emory-dev",
):
    """Generate structured metadata for one work in the ground-truth set.

    Looks the work's pages up in the module-level ``ground_truth_df``, builds their S3 URIs
    and passes them to ``gsm.generate_work_structured_metadata``.

    Args:
        work_id: Value of the ``work_id`` column identifying the work.
        page_title: Page to describe, matched case-insensitively against the ``page_title``
            column. When it is "front" and the work has more than one page, the "back" page
            is sent along with it. Defaults to "Front".
        model_id: Model ID forwarded to the generator in ``llm_kwargs``.
        bucket: S3 bucket whose ``ground_truth_images/`` prefix holds the page images.

    Returns:
        Whatever ``gsm.generate_work_structured_metadata`` returns.

    Raises:
        IndexError: If the work has no page with the requested title (or no "back" page
            when front and back are both needed).
    """
    pages = ground_truth_df.loc[ground_truth_df["work_id"] == work_id, ["page_sha1", "page_title"]]
    titles = pages["page_title"].astype(str).str.lower()

    def uri_for(title):
        # Case-insensitive in every branch, so "front" and "Front" resolve to the same page.
        matches = pages.loc[titles == title.lower(), "page_sha1"]
        if matches.empty:
            # Same exception type the bare ``.values[0]`` lookup raised, with a usable message.
            raise IndexError(f"work_id {work_id!r} has no page titled {title!r} in ground_truth_df")
        return f"s3://{bucket}/ground_truth_images/{matches.iloc[0]}"

    # A front page is sent together with its back whenever the work has more than one page.
    requested_titles = [page_title]
    if page_title.lower() == "front" and len(pages) > 1:
        requested_titles.append("back")
    image_s3_uris = [uri_for(title) for title in requested_titles]

    s3_kwargs = {
        "config": Config(
            s3={"addressing_style": "virtual"},
            signature_version="s3v4",
        ),
        "region_name": "us-east-1",
    }

    return gsm.generate_work_structured_metadata(
        image_s3_uris=image_s3_uris,
        llm_kwargs={"model_id": model_id},
        s3_kwargs=s3_kwargs,
        resize_kwargs={},
    )

In [None]:
# Download the ground truth set to local disk
# Configuration
bucket_name = "gaiic-emory-dev"
local_base = Path("ground_truth")

# Download the single ground-truth CSV with the shared S3 client, then load it from the same
# path it was written to (so changing local_base cannot make the two disagree)
csv_path = local_base / "ground_truth.csv"
csv_path.parent.mkdir(parents=True, exist_ok=True)
s3.download_file(bucket_name, "ground_truth.csv", str(csv_path))
ground_truth_df = pd.read_csv(csv_path)
# Download entire images directory - optional, takes a while
# download_s3_directory(
#     bucket=bucket_name,
#     s3_prefix='ground_truth_images/',
#     local_dir=local_base / 'images'
# )

In [None]:
# Visualize a work ID: show every page image recorded for it in the ground truth
display_work_id_images('880ht76hj7-cor')

In [None]:
# OPTIONAL for diagnostics, dump out the metadata model schema
# formatted_schema = json.dumps(dc.Metadata.model_json_schema(), indent=4, ensure_ascii=True)
# print(formatted_schema)

In [None]:
# Generate metadata for one example work with the default model and tabulate the result.
# reload(p) re-imports the prompts module first so edits to it are picked up.
reload(p)
results = gen_metadata_for_wid('880ht76hj7-cor', 'Front') # cotton in sunny south 689d51c5f7-cor
# results = gen_metadata_for_wid("989r2280h5-cor", "Front")  # Haitian mother with offspring
# results = gen_metadata_for_wid("689d51c5f7-cor", "Front")  # AA boy pointing at possum in tree
# results = gen_metadata_for_wid('24298sf7s0-cor', "Front")  # Burning of AA man
print_output(results.model_dump())

In [None]:
# Same example work, but generated with the Amazon Nova Pro model instead of the default
reload(p)
results = gen_metadata_for_wid('880ht76hj7-cor', 'Front', model_id="amazon.nova-pro-v1:0") # cotton in sunny south 689d51c5f7-cor
# results = gen_metadata_for_wid("989r2280h5-cor", "Front")  # Haitian mother with offspring
# results = gen_metadata_for_wid("689d51c5f7-cor", "Front")  # AA boy pointing at possum in tree
# results = gen_metadata_for_wid('24298sf7s0-cor', "Front")  # Burning of AA man
print_output(results.model_dump())

In [None]:
# Second example work, generated with the default model
reload(p)
# results = gen_metadata_for_wid('880ht76hj7-cor', 'Front') # cotton in sunny south 689d51c5f7-cor
results = gen_metadata_for_wid("989r2280h5-cor", "Front")  # Haitian mother with offspring
# results = gen_metadata_for_wid("689d51c5f7-cor", "Front")  # AA boy pointing at possum in tree
# results = gen_metadata_for_wid('24298sf7s0-cor', "Front")  # Burning of AA man
print_output(results.model_dump())

In [None]:
# Third example work, generated with the default model
reload(p)
# results = gen_metadata_for_wid('880ht76hj7-cor', 'Front') # cotton in sunny south 689d51c5f7-cor
# results = gen_metadata_for_wid("989r2280h5-cor", "Front")  # Haitian mother with offspring
results = gen_metadata_for_wid("689d51c5f7-cor", "Front")  # AA boy pointing at possum in tree
# results = gen_metadata_for_wid('24298sf7s0-cor', "Front")  # Burning of AA man
print_output(results.model_dump())

# Run Evaluation

**TODO:** This section needs fixing to deal with differences between the ground truth and the generated metadata.

In [None]:
# Import the evaluation modules from the library source tree. The starting working directory is
# recorded first and restored afterwards whether or not the imports succeed.
_cwd_before_eval_import = os.getcwd()
try:
    os.chdir("../../lib/src")
    import image_captioning_assistant.evaluate.evaluate_bias_analysis as eba
    import image_captioning_assistant.evaluate.evaluate_structured_metadata as esm
    import image_captioning_assistant.evaluate.evaluate_freeform_description as efd
    from image_captioning_assistant.data.constants import BiasLevel, BiasType, LibraryFormat
finally:
    # Restore the recorded directory rather than a hard-coded relative path, which would be
    # resolved against the wrong location if the chdir in the try block itself failed.
    os.chdir(_cwd_before_eval_import)

### Metadata Eval

In [None]:
# First ground-truth row, transposed so that each column is shown as its own line
ground_truth_df.iloc[:1].T

In [None]:
# Ground-truth metadata columns to inspect
metadata_items = [
    "title",
    "abstract",
    "content_genres",
    "content_type",
    "date_created",
    "subject_geo",
    "subject_names",
    "subject_topics",
]
# Last of the first 17 rows, restricted to work_id plus the columns above, one column per line
ground_truth_df.head(17).tail(1)[["work_id", *metadata_items]].T

In [None]:
def gt_row_to_metadata_obj(gt_row):
    """Convert one ground-truth row into a ``dc.Metadata`` object.

    Missing cells (NaN/None) are treated as empty strings for every column read here, which
    is the same result a caller gets after ``fillna("")``.

    Args:
        gt_row: Row of the ground-truth table (or any mapping) providing ``title``,
            ``abstract``, ``date_created``, ``subject_geo``, ``content_type``,
            ``content_genres`` and ``subject_topics``.

    Returns:
        ``dc.Metadata`` built from the row; fields the ground truth does not supply
        (publication/contextual info, objects, actions, people, handwriting) are empty lists.

    Raises:
        KeyError: If ``content_type`` is not one of the two mapped resource-type URIs.
    """
    gt_content_type_mapping = {
        "http://id.loc.gov/vocabulary/resourceTypes/img": LibraryFormat.still_image,
        "http://id.loc.gov/vocabulary/resourceTypes/txt": LibraryFormat.text,
    }

    def cell(column):
        # Without this, a missing value leaks through as float NaN or the literal string "nan".
        value = gt_row[column]
        return "" if pd.isna(value) else value

    # Drop the side markers so only the transcribed text remains
    cleaned_abstract = str(cell("abstract")).replace("Verso:", "").replace("Recto:", "")

    topics = []
    for col in ["subject_topics"]:
        raw_topics = cell(col)
        if raw_topics:
            # Multiple topics are stored in one cell separated by "|"
            topics.extend(str(raw_topics).split("|"))

    return dc.Metadata(
        description=cell("title"),
        transcription=dc.Transcription(printed_text=[cleaned_abstract], handwriting=[]),
        date=str(cell("date_created")),
        location=str(cell("subject_geo")),
        publication_info=[],
        contextual_info=[],
        format=gt_content_type_mapping[gt_row["content_type"]],
        genre=[cell("content_genres")],
        topics=topics,
        objects=[],
        actions=[],
        people=[],
    )


# Try the conversion on the ground-truth row at position 16 and show the resulting object
gt_row_to_metadata_obj(ground_truth_df.iloc[16, :])

In [None]:
import aiohttp
import asyncio
import nest_asyncio

# Patch asyncio so an event loop can be re-entered; presumably needed because the evaluation
# code runs async calls inside the notebook's already-running loop - TODO confirm
nest_asyncio.apply()


def get_human_and_llm_metadata(gt_row):
    """Build the (human, LLM) metadata pair for one ground-truth row.

    Args:
        gt_row: Ground-truth row; its ``work_id`` and ``page_title`` select the images
            sent to the model.

    Returns:
        Tuple of the metadata converted from the row and the metadata generated by the model.
    """
    return (
        gt_row_to_metadata_obj(gt_row),
        gen_metadata_for_wid(gt_row["work_id"], gt_row["page_title"]),
    )


def run_gt_metadata_eval(gt_row):
    """Evaluate the model's metadata for one ground-truth row against the human metadata.

    Args:
        gt_row: Ground-truth row to generate and evaluate metadata for.

    Returns:
        The result of ``esm.evaluate_structured_metadata`` for this row.
    """
    # Model used by the evaluation call itself
    judge_kwargs = {"model": "us.anthropic.claude-3-5-sonnet-20241022-v2:0"}
    human_side, llm_side = get_human_and_llm_metadata(gt_row)
    return esm.evaluate_structured_metadata(llm_side, human_side, judge_kwargs)

In [None]:
# Run the evaluation only for rows whose page is a "front" (items with front and back); the
# other rows are individual pages, which are used for bias only.
from tqdm.auto import tqdm  # replaces the deprecated tqdm.tqdm_notebook

# Keep results from an earlier, interrupted run so that re-running this cell resumes instead of
# repeating the model calls for rows that already finished. Run `del eval_results` first to
# force a completely fresh evaluation.
try:
    eval_results
except NameError:
    eval_results = {}

# Boolean indexing and fillna each return a new frame, so ground_truth_df itself is untouched
gt_dedup = ground_truth_df[ground_truth_df["page_title"].str.lower() == "front"].fillna("")
for i, gt_row in tqdm(gt_dedup.iterrows(), total=len(gt_dedup)):
    if i not in eval_results:
        eval_results[i] = run_gt_metadata_eval(gt_row)

In [None]:
# Inspect the evaluation of the last row the loop processed (i keeps its final loop value)
eval_results[i]

In [None]:
# Same lookup as the previous cell: evaluation of the last row the loop processed
eval_results[i]

In [None]:
# Re-import the evaluation module, then combine all per-row evaluations and dump the result
reload(esm)
esm.combine_structured_metadata_evaluations(list(eval_results.values())).model_dump()

In [None]:
# Page SHA1s and titles recorded for the work of the last row the evaluation loop handled
shas = ground_truth_df.loc[ground_truth_df["work_id"] == gt_row["work_id"], ["page_sha1", "page_title"]]
shas

In [None]:
# Same lookup, but ignoring leading/trailing whitespace in the stored work_id values
ground_truth_df.loc[ground_truth_df["work_id"].str.strip() == gt_row["work_id"], ["page_sha1", "page_title"]]

In [None]:
# Pass the page title explicitly (front page, like the other example cells); the call
# previously omitted it.
results = gen_metadata_for_wid("423612jmc0-cor", "Front")
# print(results.cot)
print_output(results.model_dump())

In [None]:
# Self-contained check of the evaluator on hand-written examples: one LLM-style record that
# describes an unrelated item and one that paraphrases the human record, both scored against
# the same human-curated metadata.
reload(esm)
# LLM-generated example
# Deliberately different from the human record below (different item, date, place and format)
llm_metadata_different = dc.MetadataCOT(
    description="A digitized manuscript showing agricultural practices from early modern Europe",
    transcription=dc.Transcription(
        printed_text=["Treatise on Farming Methods", "Printed in Venice 1592"],
        handwriting=["Marginal notes regarding crop rotation", "Ownership signature: G. Agricola"],
    ),
    date="1590-1600",
    location="Northern Italy",
    publication_info=["Venetian Printing House"],
    contextual_info=["Demonstrates pre-Enlightenment farming techniques"],
    format=LibraryFormat.text,
    genre=["Manuscript", "Agricultural"],
    objects=["Quill annotations", "Illustrations of plows"],
    actions=["Harvesting", "Irrigating fields"],
    people=["Male figures in peasant attire"],
    cot="Generated through analysis of visual patterns and textual correlations in historical documents",
)
# Reworded version of the human record below (same item, date, place and format)
llm_metadata_similar = dc.MetadataCOT(
    description="Colorized lithographic print showing Victorian-era metropolitan peddlers",
    transcription=dc.Transcription(
        printed_text=["London Street Scenes", "Issued by Smith & Sons 1883"],
        handwriting=["Collection note: Uncommon version with azure coloring", "Previous owner: J. Smith"],
    ),
    date="1883",
    location="London, England",
    publication_info=["Smith & Sons Publishers"],
    contextual_info=["Chronicles vanishing professions during the Industrial Revolution"],
    format=LibraryFormat.still_image,
    genre=["Lithograph", "Social Documentation"],
    objects=["Vendor carts", "Coal containers", "Work garments"],
    actions=["Merchant trading", "Price negotiation"],
    people=["Market traders (both genders)", "Young workers"],
    cot="Documented through direct artifact examination and archival source verification",
)


# Human-curated example
# Reference record that both LLM-style records are evaluated against
human_metadata = dc.Metadata(
    description="Hand-colored lithograph depicting 19th century urban street vendors",
    transcription=dc.Transcription(
        printed_text=["Street Life of London", "Published by Smith & Sons 1883"],
        handwriting=["Curator's note: Rare variant with blue tint", "Ex collection: J. Smith"],
    ),
    date="1883",
    location="London, England",
    publication_info=["Smith & Sons Publishers"],
    contextual_info=["Documents disappearing trades during industrialization"],
    format=LibraryFormat.still_image,
    genre=["Lithograph", "Social History"],
    objects=["Push carts", "Coal buckets", "Aprons"],
    actions=["Selling goods", "Haggling prices"],
    people=["Street vendors (male and female)", "Child apprentices"],
)

# Model used by the evaluation calls
chat_bedrock_kwargs = {"model": "us.anthropic.claude-3-5-sonnet-20241022-v2:0"}

# Call the function
# Argument order is (LLM metadata, human metadata, evaluator kwargs)
result_different = esm.evaluate_structured_metadata(llm_metadata_different, human_metadata, chat_bedrock_kwargs)
result_similar = esm.evaluate_structured_metadata(llm_metadata_similar, human_metadata, chat_bedrock_kwargs)
display(result_different)
display(result_similar)