In [1]:
# INSTALLS THAT NEED TO BE RUN ON CONDA
# !pip install langchain-aws
# !pip install loguru

In [51]:
import json
import boto3
import base64
import re
import os
from PIL import Image
from pathlib import Path
import glob
import time
from importlib import reload
import pandas as pd
from IPython.display import display
from io import BytesIO
from botocore.config import Config
import logging

logging.basicConfig(level=logging.INFO, force=True)  # Resets handlers

%load_ext autoreload
%autoreload 2

# Lift pandas' display limits so long text cells and wide/long frames render in full.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# os.chdir('..')
# Relative paths used by later cells are resolved against this directory.
print("CWD:", os.getcwd())
# Module-level AWS clients; no region or credentials are passed explicitly here.
bedrock_runtime = boto3.client("bedrock-runtime")
s3 = boto3.client("s3")


def show_base64_image(encoded_str):
    """Decode a base64-encoded image string and render it inline.

    Args:
        encoded_str: Base64 text of an image. Trailing ``=`` padding may be
            missing; it is restored before decoding.
    """
    # Base64 text must be a multiple of four characters long.
    remainder = len(encoded_str) % 4
    padded = encoded_str + "=" * (4 - remainder) if remainder else encoded_str

    raw_bytes = base64.b64decode(padded)
    display(Image.open(BytesIO(raw_bytes)))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
CWD: /home/ec2-user/SageMaker/ai-description/projects/research


In [42]:
# The project package is imported straight from the repository's lib/src folder by
# temporarily switching the working directory. The starting directory is remembered
# and restored by absolute path, so the kernel ends up where it began even if the
# chdir or one of the imports fails.
_notebook_cwd = os.getcwd()
try:
    os.chdir("../../lib/src")
    import image_captioning_assistant.generate.prompts as p
    import image_captioning_assistant.data.data_classes as dc

    # legacy
    # import image_captioning_assistant.generate.generate_bias_analysis as gba
    # import image_captioning_assistant.generate.generate_structured_metadata as gsm

    # current
    import image_captioning_assistant.generate.bias_analysis.find_biases_in_short_work as gbsw
    import image_captioning_assistant.generate.bias_analysis.find_biases_in_long_work as gblw
    import image_captioning_assistant.generate.utils as gen_utils

    # import image_captioning_assistant.data.data_classes as dc
    import image_captioning_assistant.aws.s3 as s3_util
finally:
    os.chdir(_notebook_cwd)

In [43]:
# download ground truth set to local
def download_s3_directory(bucket, s3_prefix, local_dir):
    """Download every object under an S3 prefix into a local directory tree.

    Args:
        bucket: Name of the S3 bucket to read from.
        s3_prefix: Key prefix to mirror locally; a trailing "/" is optional.
        local_dir: Destination directory, as a ``str`` or ``pathlib.Path``.
            Sub-directories are created as needed.
    """
    local_root = Path(local_dir)
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")

    for page in paginator.paginate(Bucket=bucket, Prefix=s3_prefix):
        for obj in page.get("Contents", []):
            key = obj["Key"]

            # Skip directory markers
            if key.endswith("/"):
                continue

            # Build the path relative to the prefix. The leading "/" must be
            # dropped: joining a Path with an absolute segment would discard
            # local_root and write at the filesystem root instead.
            remainder = key[len(s3_prefix):] if key.startswith(s3_prefix) else key
            relative_path = remainder.lstrip("/")
            if not relative_path:
                # The prefix names a single object; keep its file name.
                relative_path = key.rsplit("/", 1)[-1]
            local_path = local_root / relative_path

            # Create parent directories and download
            local_path.parent.mkdir(parents=True, exist_ok=True)
            s3.download_file(bucket, key, str(local_path))


# Configuration: source bucket and the local folder that holds the ground-truth files.
bucket_name = "gaiic-emory-dev"
local_base = Path("ground_truth")

# Download single CSV file
# The download runs unconditionally every time this cell is executed.
csv_path = local_base / "ground_truth.csv"
csv_path.parent.mkdir(parents=True, exist_ok=True)
boto3.client("s3").download_file(bucket_name, "ground_truth.csv", str(csv_path))

# Download entire images directory
# download_s3_directory(
#     bucket=bucket_name,
#     s3_prefix='ground_truth_images/',
#     local_dir=local_base / 'images'
# )

In [44]:
# Load the ground-truth table downloaded above. Later cells read its work_id,
# page_sha1 and page_title columns (it appears to hold one row per page - TODO confirm).
ground_truth_df = pd.read_csv("ground_truth/ground_truth.csv")

In [45]:
def display_work_id_images(work_id, image_dir="ground_truth/images"):
    """Display every locally available page image that belongs to a work.

    Args:
        work_id: Value of the ``work_id`` column to look up in ``ground_truth_df``.
        image_dir: Directory holding the downloaded images, one file per page
            named by its ``page_sha1``. The default is relative to the
            notebook's working directory and matches ``local_base / "images"``.

    Pages whose image file is missing are skipped with a warning rather than
    silently ignored.
    """
    shas = ground_truth_df[ground_truth_df["work_id"] == work_id]["page_sha1"]
    image_root = Path(image_dir)

    for sha in shas:
        img_path = image_root / str(sha)
        if img_path.exists():
            display(Image.open(img_path))
        else:
            logging.warning("Image not found locally, skipping: %s", img_path)


# display_work_id_images('7203xsj45j-cor')

In [46]:
def display_bias(bias_list, attribute):
    """Render a list of bias dicts as one two-level-indexed table.

    Args:
        bias_list: Iterable of dicts; each dict is one bias entry whose keys
            become the "Bias Item" index level.
        attribute: Label appended to the value column's header.
    """
    index_tuples = []
    cell_values = []
    # Bias IDs are 1-based so the table reads naturally.
    for bias_id, bias_dict in enumerate(bias_list, start=1):
        for item_name, item_value in bias_dict.items():
            index_tuples.append((bias_id, item_name))
            cell_values.append(item_value)

    bias_index = pd.MultiIndex.from_tuples(index_tuples, names=["Bias ID", "Bias Item"])
    column_name = f"Output from AI Model for {attribute}"
    display(pd.DataFrame({column_name: cell_values}, index=bias_index))


def print_output(output):
    """Display a model result dict as one or more tables.

    A dict containing a "description" key is shown as a single item/value
    table. Any other dict is treated as a bias result: its
    ``metadata_biases`` entries are shown first, then one table per element
    of ``page_biases``.
    """
    if "description" not in output:
        display_bias(output["metadata_biases"]["biases"], "Metadata")
        for page_number, page_entry in enumerate(output["page_biases"], start=1):
            display_bias(page_entry["biases"], f"Page {page_number}")
        return

    items = pd.Series(output)
    display(pd.DataFrame({"Metadata Item": items.index, "Output from AI Model": items.values}))

Be less specific about objects that don't matter: say "house" rather than describing its windows, and say that it's a poster rather than describing the parts of the poster.

Make sure to characterize the object itself, e.g. that it's a black-and-white photo.

In [47]:
# # s3://gaiic-emory-dev/ground_truth_images/3420d30e9b3c03a19105b4d1c92ff2b8880905c8
# S3 client keyword arguments for the commented-out s3_util.load_image_bytes example below.
s3_kwargs = {
    "config": Config(
        s3={"addressing_style": "virtual"},
        signature_version="s3v4",
    ),
    "region_name": "us-east-1",
}
# image_path = "ground_truth/images"
# work_id = "880ht76hj7-cor"
# shas = ground_truth_df[ground_truth_df["work_id"] == work_id][["page_sha1", "page_title"]]
# front_sha = shas[shas["page_title"].str.lower() == "front"]["page_sha1"].values[0]
# back_sha = shas[shas["page_title"].str.lower() == "back"]["page_sha1"].values[0]
# front_bytes = s3_util.load_image_bytes(bucket_name, f"ground_truth_images/{front_sha}", s3_kwargs)
# len(front_bytes)

In [48]:
# from PIL import Image
# import base64
# from io import BytesIO

# with open(image_full_path, "rb") as image_file:
#     # Open image and convert to RGB (removes alpha channel if present)
#     image = Image.open(image_file).convert('RGB')

#     # Set maximum dimensions while maintaining aspect ratio
#     max_dimension = 2048  # Adjust this based on your size requirements
#     image.thumbnail((max_dimension, max_dimension), Image.LANCZOS)

#     # Optimize JPEG quality and save to buffer
#     buffer = BytesIO()
#     image.save(buffer,
#               format='JPEG',
#               quality=85,  # Adjust between 75-95 for quality/size balance
#               optimize=True)

#     buffer.seek(0)
#     image_data = base64.b64encode(buffer.read()).decode("utf-8")

# # Verify size constraint
# if len(image_data) >= 10000000:
#     raise ValueError("Resized image still exceeds size limit - try reducing max_dimension or quality")
# print(len(image_data))


# show_base64_image(image_data)

In [49]:
def gen_bias_for_wid(work_id, page_title):
    """Run the short-work bias analysis for one ground-truth work.

    When ``page_title`` is "front" (any casing) and the work has more than one
    page, both the front and the back image are analysed together. Otherwise
    only the page whose title matches ``page_title`` is analysed. Titles are
    compared case-insensitively in both cases.

    Args:
        work_id: Value of the ``work_id`` column in ``ground_truth_df``.
        page_title: Title of the page to analyse, e.g. "Front".

    Returns:
        Whatever ``gbsw.find_biases_in_short_work`` returns for the selected
        image URIs.

    Raises:
        IndexError: If the work has no page with a required title.
    """
    pages = ground_truth_df[ground_truth_df["work_id"] == work_id][["page_sha1", "page_title"]]
    titles_lower = pages["page_title"].str.lower()

    def sha_for(title):
        # First page whose title matches, ignoring case.
        matches = pages.loc[titles_lower == title.lower(), "page_sha1"].values
        if len(matches) == 0:
            raise IndexError(f"No page titled {title!r} found for work_id {work_id!r}")
        return matches[0]

    if page_title.lower() == "front" and len(pages) > 1:
        wanted_titles = ["front", "back"]
    else:
        wanted_titles = [page_title]
    image_s3_uris = [f"s3://gaiic-emory-dev/ground_truth_images/{sha_for(title)}" for title in wanted_titles]

    s3_kwargs = {
        "config": Config(
            s3={"addressing_style": "virtual"},
            signature_version="s3v4",
        ),
        "region_name": "us-east-1",
    }

    llm_kwargs = {
        # "model": "anthropic.claude-3-5-sonnet-20240620-v1:0",
        "model_id": "us.anthropic.claude-3-5-sonnet-20241022-v2:0",
    }

    return gbsw.find_biases_in_short_work(
        image_s3_uris,
        s3_kwargs,
        llm_kwargs,
        {},
        # work_context: str | None = None,
        # original_metadata: str | None = None,
    )


def gen_bias_for_wid_long(work_id):
    """Run the long-work bias analysis over every page of a ground-truth work.

    Args:
        work_id: Value of the ``work_id`` column in ``ground_truth_df``.

    Returns:
        Whatever ``gblw.find_biases_in_long_work`` returns for the work's
        image URIs, which are passed in the order the pages appear in
        ``ground_truth_df``.
    """
    work_rows = ground_truth_df[ground_truth_df["work_id"] == work_id]
    image_s3_uris = [
        f"s3://gaiic-emory-dev/ground_truth_images/{sha}" for sha in work_rows["page_sha1"].values
    ]

    s3_kwargs = {
        "config": Config(
            s3={"addressing_style": "virtual"},
            signature_version="s3v4",
        ),
    }

    llm_kwargs = {
        # "model": "anthropic.claude-3-5-sonnet-20240620-v1:0",
        "model_id": "us.anthropic.claude-3-5-sonnet-20241022-v2:0",
        "region_name": "us-east-1",
    }

    return gblw.find_biases_in_long_work(
        image_s3_uris,
        s3_kwargs,
        llm_kwargs,
        {},
        # work_context: str | None = None,
        # original_metadata: str | None = None,
    )

In [63]:
# Uncomment exactly one call to choose the work to analyse; the trailing note describes the item.
# results = gen_bias_for_wid('880ht76hj7-cor', 'Front') # cotton in sunny south 689d51c5f7-cor
results = gen_bias_for_wid("989r2280h5-cor", "Front")  # Haitian mother with offspring
# results = gen_bias_for_wid("689d51c5f7-cor", "Front")  # AA boy pointing at possum in tree
# results = gen_bias_for_wid('24298sf7s0-cor', "Front")  # Burning of AA man
# print(results.cot)
print_output(results.model_dump())

INFO:botocore.httpchecksum:Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
INFO:botocore.httpchecksum:Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
INFO:image_captioning_assistant.generate.bias_analysis.find_biases_in_short_work:

********** CHAIN OF THOUGHT **********
 

1. Transcription Analysis:
Front of postcard text: None visible
Back of postcard text:
Print text possibilities:
1. "Haitian mother with offspring (Furcy) - Haiti.
Color photo from Ansco Color transparency"
"POST CARD"
"PLACE STAMP HERE"
"Distributed by W. E. Lemke, Port-au-Prince, Haiti"
"72582"

There are no alternative transcriptions possible as the text is clearly printed and legible.

2. Object Breakdown:
- Photograph shows a person wearing a red polka-dotted headscarf and white clothing
- Person is smoking a pipe
- Background shows a red building structure
- The image appear

Unnamed: 0_level_0,Unnamed: 1_level_0,Output from AI Model for Metadata
Bias ID,Bias Item,Unnamed: 2_level_1


Unnamed: 0_level_0,Unnamed: 1_level_0,Output from AI Model for Page 1
Bias ID,Bias Item,Unnamed: 2_level_1
1,level,BiasLevel.medium
1,type,BiasType.cultural
1,explanation,"The use of the term 'offspring' to describe a human child is dehumanizing language more commonly used for animals. This type of clinical terminology has historical context in the dehumanization of Black individuals and communities. While it may reflect period-typical language or translation issues, the impact remains problematic."
2,level,BiasLevel.low
2,type,BiasType.cultural
2,explanation,"The postcard format and composition suggests potential exoticization of Haitian daily life for tourist consumption. However, the local distribution and straightforward documentation style somewhat mitigates this concern."


Unnamed: 0_level_0,Unnamed: 1_level_0,Output from AI Model for Page 2
Bias ID,Bias Item,Unnamed: 2_level_1
1,level,BiasLevel.low
1,type,BiasType.cultural
1,explanation,The postcard reverse contains standard postcard formatting and distribution information. The low bias flag is carried over from the front image context of tourist-oriented presentation of cultural content.


## Run for longer works

In [65]:
# Uncomment exactly one call to choose the multi-page work to analyse.
# The parenthesised list presumably gives the expected bias level per page - TODO confirm.
# response = gen_bias_for_wid_long('648ffbg7pg-cor') # 3 pages (high, none, low)
response = gen_bias_for_wid_long("26663xsjkv-cor")  # 4 pages (medium, high, none, none)
print_output(response.model_dump())

INFO:image_captioning_assistant.generate.bias_analysis.find_biases_in_long_work:Analyzing 4 images
INFO:botocore.httpchecksum:Skipping checksum validation. Response did not contain one of the following algorithms: ['crc32', 'sha1', 'sha256'].
INFO:image_captioning_assistant.generate.bias_analysis.find_biases_in_short_work:

********** CHAIN OF THOUGHT **********
 

1. Transcription Analysis:

Print text:
"George Marshall Jackson, B.S.
LaFayette, Georgia

Entered College Fall, '06.
Chi Phi Fraternity; "Owls" Social Club; "Susie Dam;" Few Literary Society; Honor Roll, '06, '07, '08, '09; Commencement Speaker, '10; President of Class, '08, '09; Athletic Editor Emory Phoenix; President Pan-Hellenic Council; Baseball Team, '07, '08, '10, '11; Manager Baseball Team, '08; Basketball Team, '08, '09, '10, '11; Captain of Basketball Team, '09, '10; Relay Team, '06.

There are tides in the affairs of men. If you don't believe it ask "Big Jack." He was once a politician; now he is not. He was once

Unnamed: 0_level_0,Unnamed: 1_level_0,Output from AI Model for Metadata
Bias ID,Bias Item,Unnamed: 2_level_1


Unnamed: 0_level_0,Unnamed: 1_level_0,Output from AI Model for Page 1
Bias ID,Bias Item,Unnamed: 2_level_1
1,level,BiasLevel.low
1,type,BiasType.gender
1,explanation,"The use of the term 'lady killer' in the biographical text, while meant to be humorous, subtly objectifies women by portraying them as conquests rather than equals. While this was common vernacular for the time period, it still represents a mild form of gender bias through casual objectification. The low level designation was chosen as the term appears in a lighthearted context without malicious intent or explicit degradation."


Unnamed: 0_level_0,Unnamed: 1_level_0,Output from AI Model for Page 2
Bias ID,Bias Item,Unnamed: 2_level_1
1,level,BiasLevel.high
1,type,BiasType.cultural
1,explanation,"The page contains multiple instances of cultural appropriation and derogatory references to Native American culture, including a stereotypical illustration of a Native American in headdress labeled 'BIG INJUN', use of terms like 'Wig Wam' and 'Big Chief', and the casual appropriation of Native American imagery and terminology for entertainment purposes. While this was common in 1911, these elements represent harmful stereotyping and disrespect toward Native American cultures and communities. The high level is warranted due to the combination of derogatory terminology and stereotypical imagery."


Unnamed: 0_level_0,Unnamed: 1_level_0,Output from AI Model for Page 3
Bias ID,Bias Item,Unnamed: 2_level_1
1,level,BiasLevel.low
1,type,BiasType.gender
1,explanation,"The content exclusively focuses on male-dominated activities and institutions (fraternity, men's sports teams) without any representation of women's activities or achievements. However, given the historical context of being from an early 1900s male college yearbook, this reflects the period's educational segregation rather than intentional exclusion. The bias is rated as low since it represents historical reality rather than active discrimination."


Unnamed: 0_level_0,Unnamed: 1_level_0,Output from AI Model for Page 4
Bias ID,Bias Item,Unnamed: 2_level_1
1,level,BiasLevel.low
1,type,BiasType.gender
1,explanation,"The image and text exclusively feature and reference male participants, which represents a gender imbalance. However, given the historical context and specific nature of the documented event, this appears to be a factual representation rather than intentional exclusion. The level is set as low because while there is an imbalance in representation, it doesn't appear to promote harmful stereotypes or explicitly exclude women."


# Run Evaluation

In [None]:
# Import the evaluation modules from the repository's lib/src folder by temporarily
# switching the working directory. The starting directory is remembered and restored
# by absolute path, so the kernel ends up where it began even if the chdir or one of
# the imports fails.
_eval_import_cwd = os.getcwd()
try:
    os.chdir("../../lib/src")
    import image_captioning_assistant.evaluate.evaluate_bias_analysis as eba
    import image_captioning_assistant.evaluate.evaluate_structured_metadata as esm
    import image_captioning_assistant.evaluate.evaluate_freeform_description as efd
    from image_captioning_assistant.data.constants import BiasLevel, BiasType, LibraryFormat
finally:
    os.chdir(_eval_import_cwd)

### Metadata Eval

In [None]:
# First ground-truth row, shown vertically (one line per column).
ground_truth_df.head(1).T

In [None]:
# Ground-truth columns that carry descriptive metadata.
metadata_items = [
    "title",
    "abstract",
    "content_genres",
    "content_type",
    "date_created",
    "subject_geo",
    "subject_names",
    "subject_topics",
]
# Show the 17th row (when the frame has at least 17 rows) vertically, limited to those columns.
ground_truth_df.head(17).tail(1)[["work_id", *metadata_items]].T

In [None]:
def _is_missing_value(value):
    """Return True when a ground-truth cell is None/NaN/NA (i.e. empty in the CSV)."""
    return value is None or bool(pd.isna(value))


def gt_row_to_metadata_obj(gt_row):
    """Convert one ground-truth row into a ``dc.Metadata`` object.

    Missing cells (NaN as read by pandas) are treated the same as empty
    strings, so a raw row and a ``fillna("")`` row produce the same result:
    an empty abstract, no topics, and ""-valued date/location.

    Args:
        gt_row: Mapping-like row (e.g. a ``pandas.Series``) with the keys
            title, abstract, subject_topics, date_created, subject_geo,
            content_type and content_genres.

    Returns:
        ``dc.Metadata`` populated from the row; fields with no ground-truth
        counterpart are left as empty lists.

    Raises:
        KeyError: If ``content_type`` is not one of the mapped resource-type URIs.
    """
    gt_content_type_mapping = {
        "http://id.loc.gov/vocabulary/resourceTypes/img": LibraryFormat.still_image,
        "http://id.loc.gov/vocabulary/resourceTypes/txt": LibraryFormat.text,
    }

    def text_or_empty(value):
        return "" if _is_missing_value(value) else str(value)

    abstract = gt_row["abstract"]
    # Drop the side markers so only the transcribed text remains.
    cleaned_abstract = (
        "" if _is_missing_value(abstract) else abstract.replace("Verso:", "").replace("Recto:", "")
    )

    # Topics are stored as one "|"-separated string. NaN is truthy, so it must be
    # excluded explicitly or it would turn into the bogus topic "nan".
    raw_topics = gt_row["subject_topics"]
    topics = []
    if not _is_missing_value(raw_topics) and raw_topics:
        topics.extend(str(raw_topics).split("|"))

    return dc.Metadata(
        description=gt_row["title"],
        transcription=dc.Transcription(printed_text=[cleaned_abstract], handwriting=[]),
        date=text_or_empty(gt_row["date_created"]),
        location=text_or_empty(gt_row["subject_geo"]),
        publication_info=[],
        contextual_info=[],
        format=gt_content_type_mapping[gt_row["content_type"]],
        genre=[gt_row["content_genres"]],
        topics=topics,
        objects=[],
        actions=[],
        people=[],
    )


gt_row_to_metadata_obj(ground_truth_df.iloc[16, :])

In [None]:
import aiohttp
import asyncio
import nest_asyncio

nest_asyncio.apply()


def get_human_and_llm_metadata(gt_row):
    """Build the human and the model-generated metadata for one ground-truth row.

    NOTE(review): ``gen_metadata_for_wid`` is not defined in the visible part
    of this notebook; it must be provided before this function is called.

    Returns:
        ``(human_metadata, llm_metadata)``.
    """
    return (
        gt_row_to_metadata_obj(gt_row),
        gen_metadata_for_wid(gt_row["work_id"], gt_row["page_title"]),
    )


def run_gt_metadata_eval(gt_row):
    """Evaluate the model-generated metadata for one row against the human metadata.

    Returns:
        The result of ``esm.evaluate_structured_metadata`` called with the
        LLM metadata first and the human metadata second.
    """
    human_metadata, llm_metadata = get_human_and_llm_metadata(gt_row)
    return esm.evaluate_structured_metadata(
        llm_metadata,
        human_metadata,
        {"model": "us.anthropic.claude-3-5-sonnet-20241022-v2:0"},
    )

In [None]:
# run eval for only items with front and back because others are pages which are bias only
# tqdm.auto replaces the deprecated tqdm.tqdm_notebook and picks a suitable progress-bar frontend.
from tqdm.auto import tqdm

# Keyed by the ground-truth row index. The dict is recreated on every run of this
# cell, so the membership check below only guards against repeated index labels.
eval_results = {}
# Missing cells become "" so downstream string handling never sees NaN.
gt_dedup = ground_truth_df[ground_truth_df["page_title"].str.lower() == "front"].copy(deep=True).fillna("")
for i, gt_row in tqdm(gt_dedup.iterrows(), total=len(gt_dedup)):
    if i not in eval_results:
        eval_results[i] = run_gt_metadata_eval(gt_row)

In [None]:
eval_results[i]

In [None]:
eval_results[i]

In [None]:
reload(esm)
esm.combine_structured_metadata_evaluations(list(eval_results.values())).model_dump()

In [None]:
shas = ground_truth_df[ground_truth_df["work_id"] == gt_row["work_id"]][["page_sha1", "page_title"]]
shas

In [None]:
ground_truth_df[ground_truth_df["work_id"].str.strip() == gt_row["work_id"]][["page_sha1", "page_title"]]

In [None]:
# NOTE(review): gen_metadata_for_wid is not defined in the visible part of this notebook,
# and it is called here with one argument but with (work_id, page_title) in
# get_human_and_llm_metadata - confirm the intended signature.
results = gen_metadata_for_wid("423612jmc0-cor")
# print(results.cot)
print_output(results.model_dump())

In [None]:
# Synthetic check of esm.evaluate_structured_metadata: one human-curated record is
# compared against two hand-written "LLM" records, one describing an unrelated item
# and one paraphrasing the human record.
reload(esm)
# LLM-generated example
# Describes a different object (a 1590s farming treatise) than the human record below.
llm_metadata_different = dc.MetadataCOT(
    description="A digitized manuscript showing agricultural practices from early modern Europe",
    transcription=dc.Transcription(
        printed_text=["Treatise on Farming Methods", "Printed in Venice 1592"],
        handwriting=["Marginal notes regarding crop rotation", "Ownership signature: G. Agricola"],
    ),
    date="1590-1600",
    location="Northern Italy",
    publication_info=["Venetian Printing House"],
    contextual_info=["Demonstrates pre-Enlightenment farming techniques"],
    format=LibraryFormat.text,
    genre=["Manuscript", "Agricultural"],
    objects=["Quill annotations", "Illustrations of plows"],
    actions=["Harvesting", "Irrigating fields"],
    people=["Male figures in peasant attire"],
    cot="Generated through analysis of visual patterns and textual correlations in historical documents",
)
# Same item as the human record below, reworded field by field.
llm_metadata_similar = dc.MetadataCOT(
    description="Colorized lithographic print showing Victorian-era metropolitan peddlers",
    transcription=dc.Transcription(
        printed_text=["London Street Scenes", "Issued by Smith & Sons 1883"],
        handwriting=["Collection note: Uncommon version with azure coloring", "Previous owner: J. Smith"],
    ),
    date="1883",
    location="London, England",
    publication_info=["Smith & Sons Publishers"],
    contextual_info=["Chronicles vanishing professions during the Industrial Revolution"],
    format=LibraryFormat.still_image,
    genre=["Lithograph", "Social Documentation"],
    objects=["Vendor carts", "Coal containers", "Work garments"],
    actions=["Merchant trading", "Price negotiation"],
    people=["Market traders (both genders)", "Young workers"],
    cot="Documented through direct artifact examination and archival source verification",
)


# Human-curated example
human_metadata = dc.Metadata(
    description="Hand-colored lithograph depicting 19th century urban street vendors",
    transcription=dc.Transcription(
        printed_text=["Street Life of London", "Published by Smith & Sons 1883"],
        handwriting=["Curator's note: Rare variant with blue tint", "Ex collection: J. Smith"],
    ),
    date="1883",
    location="London, England",
    publication_info=["Smith & Sons Publishers"],
    contextual_info=["Documents disappearing trades during industrialization"],
    format=LibraryFormat.still_image,
    genre=["Lithograph", "Social History"],
    objects=["Push carts", "Coal buckets", "Aprons"],
    actions=["Selling goods", "Haggling prices"],
    people=["Street vendors (male and female)", "Child apprentices"],
)

chat_bedrock_kwargs = {"model": "us.anthropic.claude-3-5-sonnet-20241022-v2:0"}

# Call the function
# Argument order is (LLM metadata, human metadata, model kwargs).
result_different = esm.evaluate_structured_metadata(llm_metadata_different, human_metadata, chat_bedrock_kwargs)
result_similar = esm.evaluate_structured_metadata(llm_metadata_similar, human_metadata, chat_bedrock_kwargs)
display(result_different)
display(result_similar)

### Bias Eval

In [None]:
# Matching case: the LLM and human entries share bias type and level and have
# near-identical explanations.
llm_bias_analysis = dc.BiasAnalysisEntry(
    bias_type=BiasType.racial,
    bias_level=BiasLevel.high,
    explanation="Water fountain and a sign above it that says 'whites'",
)
human_bias_analysis = dc.BiasAnalysisEntry(
    bias_type=BiasType.racial,
    bias_level=BiasLevel.high,
    explanation="A water fountain and a sign above it tha reads 'whites'",
)

# temperature 0.0 is passed through to the evaluator's chat model settings.
potential_bias_evaluation = eba.evaluate_potential_biases(
    llm_potential_biases=[llm_bias_analysis],
    human_potential_biases=[human_bias_analysis],
    chat_bedrock_converse_kwargs={
        "model": "anthropic.claude-3-5-sonnet-20240620-v1:0",
        "temperature": 0.0,
    },
)
potential_bias_evaluation

In [None]:
# Mismatching case: the LLM and human entries differ in bias type, level and explanation.
llm_bias_analysis = dc.BiasAnalysisEntry(
    bias_type=BiasType.racial,
    bias_level=BiasLevel.high,
    explanation="Water fountain and a sign above it that says 'whites'",
)
human_bias_analysis = dc.BiasAnalysisEntry(
    bias_type=BiasType.age,
    bias_level=BiasLevel.low,
    explanation="Child Laborers are working in the fields.",
)

# temperature 0.0 is passed through to the evaluator's chat model settings.
potential_bias_evaluation = eba.evaluate_potential_biases(
    llm_potential_biases=[llm_bias_analysis],
    human_potential_biases=[human_bias_analysis],
    chat_bedrock_converse_kwargs={
        "model": "anthropic.claude-3-5-sonnet-20240620-v1:0",
        "temperature": 0.0,
    },
)
potential_bias_evaluation