In [2]:
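# Installs required in the conda environment (run once per environment).
# Prefer %pip over !pip so the packages are installed into the active kernel.
# %pip install langchain-aws
# %pip install loguru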

In [1]:
import json
import boto3
import base64
import re
import os
from PIL import Image
from pathlib import Path
import glob
import time
from importlib import reload
import pandas as pd
from IPython.display import display
from io import BytesIO
from botocore.config import Config

# Lift the pandas display limits (None = no limit) so long model outputs render in full.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# os.chdir('..')
# Report the working directory: later cells use paths relative to it.
print("CWD:", os.getcwd())
# Module-level AWS clients, created without an explicit region or config.
bedrock_runtime = boto3.client("bedrock-runtime")
s3 = boto3.client("s3")


def show_base64_image(encoded_str):
    """Decode a base64-encoded image and render it inline in the notebook.

    Args:
        encoded_str: Base64 text of an image file. Trailing "=" padding may be
            missing; it is restored before decoding.
    """
    # Base64 payloads must be a multiple of 4 characters long; top up with "=".
    remainder = len(encoded_str) % 4
    padded = encoded_str + "=" * (4 - remainder) if remainder else encoded_str

    # Decode to raw bytes, wrap in a file-like buffer and hand the image to IPython.
    raw_bytes = base64.b64decode(padded)
    display(Image.open(BytesIO(raw_bytes)))

CWD: /home/ec2-user/SageMaker/ai-description/projects/research


In [2]:
# Import the project package by temporarily switching the working directory to
# lib/src (presumably so the package resolves from the current directory —
# confirm it is not also installed in the environment).
try:
    os.chdir("../../lib/src")
    import image_captioning_assistant.generate.prompts as p
    import image_captioning_assistant.generate.generate_bias_analysis as gba
    import image_captioning_assistant.generate.generate_structured_metadata as gsm
    import image_captioning_assistant.generate.utils as gen_utils

    # import image_captioning_assistant.data.data_classes as dc
    import image_captioning_assistant.aws.s3 as s3_util
finally:
    # Runs even when an import fails. NOTE(review): both chdir calls are
    # relative, so this cell assumes it starts in projects/research.
    os.chdir("../../projects/research")

In [5]:
# download ground truth set to local
def download_s3_directory(bucket, s3_prefix, local_dir):
    """Download every object under an S3 prefix into a local directory tree.

    Args:
        bucket: Name of the S3 bucket to read from.
        s3_prefix: Key prefix to list. It is stripped from each key to form the
            path below ``local_dir``; a trailing "/" is optional.
        local_dir: Destination directory, given as a ``str`` or ``pathlib.Path``.

    Returns:
        None. Files are written below ``local_dir``; parent directories are
        created as needed.
    """
    # Accept plain strings as well as Path objects.
    local_root = Path(local_dir)
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")

    for page in paginator.paginate(Bucket=bucket, Prefix=s3_prefix):
        for obj in page.get("Contents", []):
            key = obj["Key"]

            # Skip directory markers
            if key.endswith("/"):
                continue

            # Build local path. The leading "/" left behind when the prefix has
            # no trailing slash must be removed: joining a Path with an absolute
            # segment would discard local_root and write to the filesystem root.
            remainder = key[len(s3_prefix):] if key.startswith(s3_prefix) else key
            relative_path = remainder.lstrip("/")
            if not relative_path:
                # The prefix names the object itself; keep just its file name.
                relative_path = key.rsplit("/", 1)[-1]
            local_path = local_root / relative_path

            # Create parent directories and download
            local_path.parent.mkdir(parents=True, exist_ok=True)
            s3.download_file(bucket, key, str(local_path))


# Configuration
bucket_name = "gaiic-emory-dev"  # S3 bucket the ground-truth CSV and images are read from
local_base = Path("ground_truth")  # local download root, relative to the working directory

# Download single CSV file
csv_path = local_base / "ground_truth.csv"
csv_path.parent.mkdir(parents=True, exist_ok=True)
# A fresh S3 client is created here; the object key has no prefix in the bucket.
boto3.client("s3").download_file(bucket_name, "ground_truth.csv", str(csv_path))

# Download entire images directory
# download_s3_directory(
#     bucket=bucket_name,
#     s3_prefix='ground_truth_images/',
#     local_dir=local_base / 'images'
# )

In [6]:
# Load the ground-truth table from the location the download cell writes to.
# Later cells read its work_id, page_sha1 and page_title columns.
ground_truth_df = pd.read_csv("ground_truth/ground_truth.csv")

In [7]:
def display_work_id_images(work_id, image_dir="ground_truth/images"):
    """Display every locally downloaded page image that belongs to a work.

    Args:
        work_id: Value to match against the ``work_id`` column of
            ``ground_truth_df``.
        image_dir: Directory holding the images, one file per ``page_sha1``.
            Defaults to ``ground_truth/images`` relative to the working
            directory, the location the other cells of this notebook use.

    Pages whose image file is missing are reported and skipped.
    """
    image_root = Path(image_dir)
    shas = ground_truth_df.loc[ground_truth_df["work_id"] == work_id, "page_sha1"]

    for sha in shas:
        img_path = image_root / str(sha)
        if img_path.exists():
            display(Image.open(img_path))
        else:
            # Say so instead of silently showing nothing, which hides path mistakes.
            print(f"Image not found, skipping: {img_path}")


# display_work_id_images('7203xsj45j-cor')

In [9]:
def print_output(output):
    """Render a model result dictionary as notebook tables.

    Args:
        output: Mapping with a required ``"bias_analysis"`` entry (a list of
            dicts, one per detected bias) and an optional ``"metadata"`` entry
            (a dict of metadata items).

    When ``"metadata"`` is present it is shown first as a two-column table. The
    bias entries are then shown as one table indexed by (Bias ID, Bias Item),
    with Bias ID counting from 1.
    """
    value_column = "Output from AI Model"

    if "metadata" in output:
        metadata_items = pd.Series(output["metadata"])
        metadata_table = pd.DataFrame(
            {"Metadata Item": metadata_items.index, value_column: metadata_items.values}
        )
        display(metadata_table)

    # Flatten the list of bias dicts into parallel lists of index tuples and values.
    index_tuples = []
    cell_values = []
    for bias_id, bias_entry in enumerate(output["bias_analysis"], start=1):
        for item_name, item_value in bias_entry.items():
            index_tuples.append((bias_id, item_name))
            cell_values.append(item_value)

    bias_index = pd.MultiIndex.from_tuples(index_tuples, names=["Bias ID", "Bias Item"])
    display(pd.DataFrame({value_column: cell_values}, index=bias_index))

Be less specific about objects that don't matter: mention the house, not its windows; say that the item is a poster rather than describing parts of the poster.

Make sure to characterize the object itself, e.g. that it is a black-and-white photo.

In [10]:
# # s3://gaiic-emory-dev/ground_truth_images/3420d30e9b3c03a19105b4d1c92ff2b8880905c8
s3_kwargs = {
    "config": Config(
        s3={"addressing_style": "virtual"},
        signature_version="s3v4",
    ),
    "region_name": "us-east-1",
}
image_path = "ground_truth/images"
work_id = "880ht76hj7-cor"
shas = ground_truth_df[ground_truth_df["work_id"] == work_id][["page_sha1", "page_title"]]
front_sha = shas[shas["page_title"].str.lower() == "front"]["page_sha1"].values[0]
back_sha = shas[shas["page_title"].str.lower() == "back"]["page_sha1"].values[0]
front_bytes = s3_util.load_image_bytes(bucket_name, f"ground_truth_images/{front_sha}", s3_kwargs)
len(front_bytes)

27971190

In [11]:
# from PIL import Image
# import base64
# from io import BytesIO

# with open(image_full_path, "rb") as image_file:
#     # Open image and convert to RGB (removes alpha channel if present)
#     image = Image.open(image_file).convert('RGB')

#     # Set maximum dimensions while maintaining aspect ratio
#     max_dimension = 2048  # Adjust this based on your size requirements
#     image.thumbnail((max_dimension, max_dimension), Image.LANCZOS)

#     # Optimize JPEG quality and save to buffer
#     buffer = BytesIO()
#     image.save(buffer,
#               format='JPEG',
#               quality=85,  # Adjust between 75-95 for quality/size balance
#               optimize=True)

#     buffer.seek(0)
#     image_data = base64.b64encode(buffer.read()).decode("utf-8")

# # Verify size constraint
# if len(image_data) >= 10000000:
#     raise ValueError("Resized image still exceeds size limit - try reducing max_dimension or quality")
# print(len(image_data))


# show_base64_image(image_data)

In [13]:
# Generate structured metadata for the front and back pages of one work.
print("With Sonnet 3.5 v2")
# Re-import the prompt and generation modules so edits made on disk take effect.
reload(p)
reload(gsm)
return_all = False
return_cot = False
image_path = "ground_truth/images"
work_id = "880ht76hj7-cor"
shas = ground_truth_df[ground_truth_df["work_id"] == work_id][["page_sha1", "page_title"]]
# .values[0] raises IndexError when the work has no page with that title.
front_sha = shas[shas["page_title"].str.lower() == "front"]["page_sha1"].values[0]
back_sha = shas[shas["page_title"].str.lower() == "back"]["page_sha1"].values[0]

# Read and encode the images
image_data = s3_util.load_image_bytes(bucket_name, f"ground_truth_images/{front_sha}", s3_kwargs)
image_data_back = s3_util.load_image_bytes(bucket_name, f"ground_truth_images/{back_sha}", s3_kwargs)
# Front page first, then back.
img_bytes_list = [image_data, image_data_back]

llm_kwargs = {
    # "model": "anthropic.claude-3-5-sonnet-20240620-v1:0",
    "model": "us.anthropic.claude-3-5-sonnet-20241022-v2:0",
    "region_name": "us-east-1",
}
# NOTE(review): the third argument is a single-space string; its meaning is not
# visible in this notebook — confirm against generate_structured_metadata.
results = gsm.generate_structured_metadata(img_bytes_list, llm_kwargs, " ")

With Sonnet 3.5 v2


In [10]:
# Print the model's "cot" text (presumably its chain-of-thought reasoning), then
# render the dumped metadata model as tables via print_output.
print(results["cot"])
print_output(results["metadata"].model_dump())


1. Object Breakdown:
- Historical black and white photograph on a postcard showing cotton picking in the American South
- Multiple people working in a cotton field, including adults and children
- Cotton plants with white bolls visible throughout the field
- Wooded area visible in background
- Woven baskets visible for collecting cotton
- People wearing period work clothing including hats and long garments
- The image appears to be from 1909 based on copyright text

2. Text Analysis:
Front of postcard:
- Printed text: "COTTON PICKING IN THE SUNNY SOUTH"
- Copyright text: "COPYRIGHT 1909 BY H.TEES"

Back of postcard:
- Standard postcard template text: "POST CARD", "CORRESPONDENCE", "NAME AND ADDRESS"
- Handwritten text: Several lines of handwritten text that appears to be a message
- Name "Maßward" (or similar)
- Question "How are you?"
- Additional handwritten text that is difficult to decipher

3. Translation/Transcription Analysis:
The handwritten message on the back is partially le

Unnamed: 0,Metadata Item,Output from AI Model
0,description,"Black and white photograph showing multiple people, including adults and children, picking cotton in a large field. The scene depicts agricultural workers harvesting cotton bolls from mature plants, with wooded area visible in the background. Workers are using woven baskets to collect cotton, and are dressed in period work clothing including hats and long garments."
1,transcription,"{'print': ['COTTON PICKING IN THE SUNNY SOUTH', 'COPYRIGHT 1909 BY H.TEES', 'POST CARD', 'CORRESPONDENCE', 'NAME AND ADDRESS', 'PLACE POSTAGE STAMP HERE'], 'handwriting': ['Maßward', 'How are you?', '[remainder of message unclear]']}"
2,date,1909
3,location,Southern United States
4,publication_info,[Copyright 1909 by H.Tees]
5,contextual_info,"[Created during Jim Crow era, Period of widespread agricultural labor exploitation, Child labor was common in agricultural settings, Cotton was a primary crop in Southern economy]"
6,format,Mixed Material
7,genre,"[Postcards, Black-and-white photographs, Documentary photographs]"
8,objects,"[cotton plants, cotton bolls, woven baskets, work clothes, hats]"
9,actions,"[picking cotton, harvesting, working, standing]"


Unnamed: 0_level_0,Unnamed: 1_level_0,Output from AI Model
Bias ID,Bias Item,Unnamed: 2_level_1
1,bias_level,High
1,bias_type,racial
1,explanation,The image presents cotton picking in the post-Civil War South without acknowledging the complex racial and social justice issues of the era. The phrase 'Sunny South' romanticizes and minimizes the harsh realities of agricultural labor conditions.
2,bias_level,High
2,bias_type,ageism
2,explanation,"The photograph shows children engaged in agricultural labor, normalizing child labor practices of the era."
3,bias_level,Medium
3,bias_type,cultural
3,explanation,"The postcard format treats a serious subject of agricultural labor conditions as a casual souvenir, potentially trivializing the experiences of the workers shown."


In [14]:
# Same display as the cell above, executed again after a later generation run
# (note the higher execution count).
print(results["cot"])
print_output(results["metadata"].model_dump())


1. Object Breakdown:
- Format: Postcard with black and white photograph
- Front shows multiple people picking cotton in a field
- Back shows standard postcard format with handwritten message
- Copyright text indicates 1909 date
- Scene depicts cotton harvesting in the American South
- Multiple people visible including adults and children
- Cotton field with trees in background
- Woven baskets visible for collecting cotton

2. Text Analysis:
Front:
- Printed text: "COTTON PICKING IN THE SUNNY SOUTH"
- Copyright notice: "COPYRIGHT 1909 BY H.TEES"

Back:
- Standard postcard template text: "POST CARD", "CORRESPONDENCE", "NAME AND ADDRESS"
- Handwritten text appears to be a personal message (partially visible)
- Name appears to be "M. Rivard"
- Question "How are you?"
- Rest of handwritten text is difficult to decipher clearly

3. Translation/Transcription Analysis:
The front text is straightforward with no alternative interpretations needed. The back contains handwriting that is partially

Unnamed: 0,Metadata Item,Output from AI Model
0,description,"Black and white photograph showing multiple people harvesting cotton in a field. Adults and children are visible among cotton plants, with some carrying woven baskets. A tree line is visible in the background. The image is formatted as a postcard."
1,transcription,"{'print': ['COTTON PICKING IN THE SUNNY SOUTH', 'COPYRIGHT 1909 BY H.TEES', 'POST CARD', 'CORRESPONDENCE', 'NAME AND ADDRESS', 'PLACE POSTAGE STAMP HERE'], 'handwriting': ['M. Rivard', 'How are you?']}"
2,date,1909
3,location,Southern United States
4,publication_info,"[Copyright 1909 by H.Tees, Postcard format]"
5,contextual_info,"[Created during Jim Crow era, Documents agricultural labor practices, Part of cotton industry documentation, Postcard format suggests commercial distribution]"
6,format,Still Image
7,genre,"[black-and-white photograph, postcard]"
8,objects,"[cotton plants, woven baskets, trees]"
9,actions,"[cotton picking, harvesting, standing, working]"


Unnamed: 0_level_0,Unnamed: 1_level_0,Output from AI Model
Bias ID,Bias Item,Unnamed: 2_level_1
1,bias_level,High
1,bias_type,racial
1,explanation,The image romanticizes cotton picking in the post-slavery South with phrases like 'Sunny South' while documenting exploitative labor practices primarily affecting Black Americans. The postcard format commodifies and normalizes these conditions.
2,bias_level,High
2,bias_type,ageism
2,explanation,The presence of children engaged in cotton picking documents child labor practices without acknowledgment of their exploitation.
3,bias_level,Medium
3,bias_type,cultural
3,explanation,The portrayal of cotton picking as a picturesque scene minimizes the historical trauma and ongoing economic exploitation associated with cotton farming in the American South.


In [15]:
# Generate a bias analysis for the front and back pages of the same work.
print("With Sonnet 3.5 v2")
# Re-import the prompt and bias-analysis modules so edits made on disk take effect.
reload(p)
reload(gba)
return_all = False
return_cot = False
image_path = "ground_truth/images"
work_id = "880ht76hj7-cor"
shas = ground_truth_df[ground_truth_df["work_id"] == work_id][["page_sha1", "page_title"]]
# .values[0] raises IndexError when the work has no page with that title.
front_sha = shas[shas["page_title"].str.lower() == "front"]["page_sha1"].values[0]
back_sha = shas[shas["page_title"].str.lower() == "back"]["page_sha1"].values[0]

# Read and encode the images
image_data = s3_util.load_image_bytes(bucket_name, f"ground_truth_images/{front_sha}", s3_kwargs)
image_data_back = s3_util.load_image_bytes(bucket_name, f"ground_truth_images/{back_sha}", s3_kwargs)
# Front page first, then back.
img_bytes_list = [image_data, image_data_back]

llm_kwargs = {
    # "model": "anthropic.claude-3-5-sonnet-20240620-v1:0",
    "model": "us.anthropic.claude-3-5-sonnet-20241022-v2:0",
    "region_name": "us-east-1",
}
# Overwrites `results` from the structured-metadata cell; cells that read
# `results` therefore depend on which generation cell ran last.
results = gba.generate_bias_analysis(img_bytes_list, llm_kwargs, " ")

With Sonnet 3.5 v2


In [12]:
# Print the model's "cot" text, then the dumped result model.
# NOTE(review): this cell reads results["metadata"], while the later re-run of
# the same display reads results["bias_analysis"] — confirm which key
# generate_bias_analysis returns and whether this cell is stale.
print(results["cot"])
print(results["metadata"].model_dump())


1. Object Breakdown:
- Historical photograph from 1909 showing cotton picking in the American South
- Multiple individuals working in a cotton field
- Adults and children present in the scene
- Large cotton field with mature cotton plants
- Woven baskets visible for collecting cotton
- Printed text at bottom of image
- Postcard format with correspondence on reverse

2. Text Analysis:
Front:
- Printed text: "COTTON PICKING IN THE SUNNY SOUTH"
- Copyright notice: "COPYRIGHT 1909 BY H.TEES"

Back:
- Standard postcard template text
- Handwritten address: "M Pakward"
- Handwritten message: "How are you?"
- Additional handwritten text appears faded/illegible

3. Historical Context and Bias Considerations:
- Image depicts agricultural labor in post-Civil War South
- Shows African American workers, including children, in cotton fields
- The phrase "Sunny South" presents a romanticized view of what was often exploitative labor
- Children working suggests child labor issues
- The casual present

In [16]:
# Print the model's "cot" text, then the dumped bias-analysis model as a plain dict.
print(results["cot"])
print(results["bias_analysis"].model_dump())


1. Object Breakdown:
- Historical photograph postcard from 1909 showing cotton picking in the American South
- Shows multiple individuals working in a cotton field, including adults and children
- Forest/tree line visible in background
- Large cotton field with mature cotton plants ready for harvesting
- Woven baskets visible for collecting cotton
- Printed title and copyright information on front
- Standard postcard format on reverse with handwritten message

2. Text Analysis:
Front of card:
- Printed text: "COTTON PICKING IN THE SUNNY SOUTH"
- Copyright text: "COPYRIGHT 1909 BY H.TEES"

Back of card:
- Standard postcard template text: "POST CARD", "CORRESPONDENCE", "NAME AND ADDRESS"
- Handwritten text: "M. Rivard" and "How are you?"
- Additional handwritten text appears to be present but is too faint/unclear to transcribe accurately

3. Historical Context and Bias Considerations:
- Image depicts cotton picking in post-slavery but Jim Crow era South
- The phrase "Sunny South" appear