In [2]:
# Packages to install into the conda kernel environment before running this notebook (uncomment and run once)
# !pip install langchain-aws
# !pip install loguru

In [1]:
import json
import boto3
import base64
import re
import os
from PIL import Image
from pathlib import Path
import glob
import time
from importlib import reload
import pandas as pd
from IPython.display import display
from io import BytesIO
from botocore.config import Config

# Lift pandas' display truncation limits (None = no limit) so the long model
# outputs rendered in later cells are shown in full.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# os.chdir('..')
# Print the working directory: later cells use paths relative to it.
print("CWD:", os.getcwd())
# Module-level AWS clients. No region or credentials are passed here, so both
# come from the ambient boto3 configuration.
bedrock_runtime = boto3.client("bedrock-runtime")
s3 = boto3.client("s3")


def show_base64_image(encoded_str):
    """Decode a base64-encoded image and render it inline in the notebook.

    Args:
        encoded_str: Base64 text of an image file. Trailing ``=`` padding may
            be missing; it is restored before decoding.
    """
    # Base64 text must be a multiple of four characters long; top it up with
    # "=" only when it falls short.
    remainder = len(encoded_str) % 4
    padded = encoded_str + "=" * (4 - remainder) if remainder else encoded_str

    # Decode to raw bytes, wrap them in a file-like object and display.
    raw_bytes = base64.b64decode(padded)
    display(Image.open(BytesIO(raw_bytes)))

CWD: /home/ec2-user/SageMaker/ai-description/projects/research


In [2]:
# Import the project's `image_captioning_assistant` package straight from the
# source tree by temporarily switching into lib/src (presumably the package is
# found because the kernel's working directory is on the module search path).
# Remember the starting directory as an absolute path so it can be restored
# exactly, even if the chdir below fails or the notebook was started elsewhere.
_notebook_cwd = os.getcwd()
try:
    os.chdir("../../lib/src")
    import image_captioning_assistant.generate.prompts as p
    import image_captioning_assistant.generate.generate_bias_analysis as gba
    import image_captioning_assistant.generate.generate_structured_metadata as gsm
    import image_captioning_assistant.generate.utils as gen_utils

    # import image_captioning_assistant.data.data_classes as dc
    import image_captioning_assistant.aws.s3 as s3_util
finally:
    # A relative "../../projects/research" here would land in the wrong place
    # whenever the first chdir did not succeed; go back to where we started.
    os.chdir(_notebook_cwd)

In [3]:
# download ground truth set to local
def download_s3_directory(bucket, s3_prefix, local_dir, s3_client=None):
    """Mirror every object under an S3 prefix into a local directory.

    Args:
        bucket: Name of the S3 bucket to read from.
        s3_prefix: Key prefix to download. It is stripped from each key to
            form the path below ``local_dir``; a trailing slash is optional.
        local_dir: Destination directory (``str`` or ``pathlib.Path``).
        s3_client: Optional pre-built S3 client. When omitted, a new
            ``boto3.client("s3")`` is created.

    Returns:
        None. Files are written below ``local_dir``; parent directories are
        created as needed.
    """
    client = s3_client if s3_client is not None else boto3.client("s3")
    target_root = Path(local_dir)
    paginator = client.get_paginator("list_objects_v2")

    for page in paginator.paginate(Bucket=bucket, Prefix=s3_prefix):
        for obj in page.get("Contents", []):
            key = obj["Key"]

            # Skip directory markers
            if key.endswith("/"):
                continue

            # Build the path relative to the prefix. The leading "/" must be
            # removed: joining a Path with an absolute segment discards the
            # left-hand side and would write outside of local_dir.
            relative_path = key[len(s3_prefix):].lstrip("/")
            if not relative_path:
                # The key is exactly the prefix; store it under its base name.
                relative_path = Path(key).name
            local_path = target_root / relative_path

            # Create parent directories and download
            local_path.parent.mkdir(parents=True, exist_ok=True)
            client.download_file(bucket, key, str(local_path))


# Configuration: source bucket and the local directory that mirrors it.
# Both names are read again by later cells.
bucket_name = "gaiic-emory-dev"
local_base = Path("ground_truth")

# Download the single ground-truth CSV from the bucket root into local_base.
# This runs on every execution of the cell and overwrites any existing copy.
csv_path = local_base / "ground_truth.csv"
csv_path.parent.mkdir(parents=True, exist_ok=True)
boto3.client("s3").download_file(bucket_name, "ground_truth.csv", str(csv_path))

# Download entire images directory (left disabled; uncomment to mirror the
# images under local_base / 'images')
# download_s3_directory(
#     bucket=bucket_name,
#     s3_prefix='ground_truth_images/',
#     local_dir=local_base / 'images'
# )

In [4]:
# Load the ground-truth table. Later cells read its "work_id", "page_sha1"
# and "page_title" columns. The path is the same file the download cell
# writes to (local_base / "ground_truth.csv").
ground_truth_df = pd.read_csv("ground_truth/ground_truth.csv")

In [5]:
def display_work_id_images(work_id, image_dir="ground_truth/images"):
    """Display every locally available page image belonging to a work.

    Looks up the ``page_sha1`` values of all rows in the notebook-level
    ``ground_truth_df`` whose ``work_id`` matches, and shows the image file
    named after each SHA inside ``image_dir``.

    Args:
        work_id: Value to match against the ``work_id`` column.
        image_dir: Directory holding the image files, each named by its
            ``page_sha1`` (``str`` or ``pathlib.Path``). The default is the
            location the download cell mirrors the images to, relative to the
            notebook's working directory.
    """
    image_root = Path(image_dir)
    shas = ground_truth_df.loc[ground_truth_df["work_id"] == work_id, "page_sha1"]

    for sha in shas:
        img_path = image_root / str(sha)
        if img_path.exists():
            display(Image.open(img_path))
        else:
            # Say so instead of silently showing nothing, which makes a wrong
            # directory indistinguishable from a work without images.
            print(f"Image not found locally, skipping: {img_path}")


# display_work_id_images('7203xsj45j-cor')

In [19]:
def print_output(output):
    """Render a model result dictionary as notebook tables.

    Args:
        output: Mapping that may contain

            * ``"metadata"``: a mapping of metadata item -> value, shown as a
              two-column table when present and non-empty;
            * ``"bias_analysis"``: a list of mappings, one per detected bias,
              shown as a table indexed by (``Bias ID``, ``Bias Item``) where
              ``Bias ID`` counts from 1.

        Either key may be absent; the corresponding table is then skipped.
    """
    metadata = output.get("metadata")
    if metadata:
        s = pd.Series(metadata)
        display(pd.DataFrame({"Metadata Item": s.index, "Output from AI Model": s.values}))

    # Treat a missing bias section like the missing metadata section above
    # instead of failing with KeyError on metadata-only results.
    bias_list = output.get("bias_analysis")
    if bias_list is None:
        return

    # Flatten the list of dicts into parallel (bias id, item name) / value lists.
    index_tuples = []
    values = []
    for bias_id, bias_dict in enumerate(bias_list, start=1):
        for key, value in bias_dict.items():
            index_tuples.append((bias_id, key))
            values.append(value)

    multi_index = pd.MultiIndex.from_tuples(index_tuples, names=["Bias ID", "Bias Item"])
    display(pd.DataFrame({"Output from AI Model": values}, index=multi_index))

Be less specific about objects that don't matter: say that it's a house, not that it has windows; say that it's a poster, not what the parts of the poster are.

Make sure to characterize the object itself, e.g. that it's a black-and-white photo.

In [7]:
# # s3://gaiic-emory-dev/ground_truth_images/3420d30e9b3c03a19105b4d1c92ff2b8880905c8
# Keyword arguments handed to s3_util.load_image_bytes below and in later cells
# (presumably forwarded to the boto3 S3 client — TODO confirm against s3_util).
s3_kwargs = {
    "config": Config(
        s3={"addressing_style": "virtual"},
        signature_version="s3v4",
    ),
    "region_name": "us-east-1",
}
# NOTE(review): image_path is not read in this cell; the image is loaded from S3.
image_path = "ground_truth/images"
work_id = "880ht76hj7-cor"
# Rows of the ground-truth table for this work, then the SHA-1 of the pages
# titled "front" and "back" (case-insensitive). `.values[0]` raises IndexError
# when the work has no page with that title.
shas = ground_truth_df[ground_truth_df["work_id"] == work_id][["page_sha1", "page_title"]]
front_sha = shas[shas["page_title"].str.lower() == "front"]["page_sha1"].values[0]
back_sha = shas[shas["page_title"].str.lower() == "back"]["page_sha1"].values[0]
# Fetch the front page from the bucket and show its size as the cell output.
front_bytes = s3_util.load_image_bytes(bucket_name, f"ground_truth_images/{front_sha}", s3_kwargs)
len(front_bytes)

27971190

In [8]:
# from PIL import Image
# import base64
# from io import BytesIO

# with open(image_full_path, "rb") as image_file:
#     # Open image and convert to RGB (removes alpha channel if present)
#     image = Image.open(image_file).convert('RGB')

#     # Set maximum dimensions while maintaining aspect ratio
#     max_dimension = 2048  # Adjust this based on your size requirements
#     image.thumbnail((max_dimension, max_dimension), Image.LANCZOS)

#     # Optimize JPEG quality and save to buffer
#     buffer = BytesIO()
#     image.save(buffer,
#               format='JPEG',
#               quality=85,  # Adjust between 75-95 for quality/size balance
#               optimize=True)

#     buffer.seek(0)
#     image_data = base64.b64encode(buffer.read()).decode("utf-8")

# # Verify size constraint
# if len(image_data) >= 10000000:
#     raise ValueError("Resized image still exceeds size limit - try reducing max_dimension or quality")
# print(len(image_data))


# show_base64_image(image_data)

In [9]:
# Run structured-metadata generation for one work, using its front and back page images.
print("With Sonnet 3.5 v2")
# Re-execute the prompt and generation modules so edits made on disk take
# effect without restarting the kernel.
reload(p)
reload(gsm)
# NOTE(review): return_all, return_cot and image_path are assigned but not read
# in this cell — confirm they are still needed.
return_all = False
return_cot = False
image_path = "ground_truth/images"
work_id = "880ht76hj7-cor"
# Same front/back SHA-1 lookup as in the In [7] cell, repeated here.
shas = ground_truth_df[ground_truth_df["work_id"] == work_id][["page_sha1", "page_title"]]
front_sha = shas[shas["page_title"].str.lower() == "front"]["page_sha1"].values[0]
back_sha = shas[shas["page_title"].str.lower() == "back"]["page_sha1"].values[0]

# Read and encode the images
image_data = s3_util.load_image_bytes(bucket_name, f"ground_truth_images/{front_sha}", s3_kwargs)
image_data_back = s3_util.load_image_bytes(bucket_name, f"ground_truth_images/{back_sha}", s3_kwargs)
# Order matters for the reader of the results: front first, back second.
img_bytes_list = [image_data, image_data_back]

llm_kwargs = {
    # "model": "anthropic.claude-3-5-sonnet-20240620-v1:0",
    "model": "us.anthropic.claude-3-5-sonnet-20241022-v2:0",
    "region_name": "us-east-1",
}
# NOTE(review): the third positional argument is a single space; its meaning is
# defined by generate_structured_metadata, which is not visible here — confirm.
results = gsm.generate_structured_metadata(img_bytes_list, llm_kwargs, " ")

With Sonnet 3.5 v2


In [10]:
# Print the "cot" text of the result, then render the structured part as tables.
# NOTE(review): this cell uses dict-style access (results["cot"]) while the
# In [16] cell below reads `results.cot` from the same variable; only one of
# them can match the current return type — confirm and remove the stale cell.
print(results["cot"])
print_output(results["metadata"].model_dump())


1. Breaking down the object:
- This is a black and white postcard showing cotton picking in the American South
- Multiple people are working in a cotton field
- Two children are visible in the foreground
- Workers are wearing various types of clothing including hats and long garments
- Forest/tree line visible in background
- Cotton plants with white bolls fill the field
- Woven baskets visible for collecting cotton
- The image appears to be from 1909 based on copyright text

2. Text identification:
Front of postcard:
- Printed caption: "COTTON PICKING IN THE SUNNY SOUTH"
- Copyright notice: "COPYRIGHT 1909 BY H.TEES"

Back of postcard:
- Standard postcard template text
- Handwritten message (partially visible but difficult to read completely)
- Name "M. Rivard"
- Message "How are you?"

3. Transcription analysis:
The printed text is clear and straightforward. The handwritten text on the reverse is partially legible but some portions are too faint or unclear to transcribe with confide

Unnamed: 0,Metadata Item,Output from AI Model
0,description,"Black and white photograph on a postcard showing multiple people picking cotton in a large field. The field is full of cotton plants with white cotton bolls ready for harvesting. Workers are spread across the field, and two children are visible in the foreground near cotton baskets. A tree line is visible in the background. The workers are wearing various types of work clothing including hats and long garments suitable for field work."
1,transcription,"{'printed_text': ['COTTON PICKING IN THE SUNNY SOUTH', 'COPYRIGHT 1909 BY H.TEES', 'POST CARD', 'CORRESPONDENCE', 'NAME AND ADDRESS', 'PLACE POSTAGE STAMP HERE'], 'handwriting': ['M. Rivard', 'How are you?']}"
2,date,1909
3,location,American South
4,publication_info,[Copyright 1909 by H.Tees]
5,contextual_info,"[Cotton was a major agricultural product in the American South, Agricultural labor often involved entire families including children, Postcard format suggests this image was meant for commercial distribution]"
6,format,Mixed Material
7,genre,"[black-and-white photographs, postcards]"
8,objects,"[cotton plants, cotton bolls, woven baskets, work clothes, hats]"
9,actions,"[picking cotton, harvesting, working]"


Unnamed: 0_level_0,Unnamed: 1_level_0,Output from AI Model
Bias ID,Bias Item,Unnamed: 2_level_1
1,bias_level,BiasLevel.high
1,bias_type,BiasType.racial
1,explanation,"The image presents a romanticized view of cotton picking in the American South, minimizing the historical context of exploitation and forced labor. The phrase 'Sunny South' particularly diminishes the harsh realities of agricultural labor conditions faced by Black Americans."
2,bias_level,BiasLevel.high
2,bias_type,BiasType.age
2,explanation,"The presence of children working in the cotton field normalizes child labor, presenting it as an acceptable practice without acknowledging its problematic nature."
3,bias_level,BiasLevel.medium
3,bias_type,BiasType.cultural
3,explanation,The postcard format and caption present a romanticized view of Southern agricultural practices without acknowledging the complex social and economic systems that underpinned them.


In [16]:
# Print the "cot" text of the result, then render the whole dumped result as tables.
# NOTE(review): attribute-style variant of the In [10] cell above; it has the
# higher execution count, so it presumably reflects the newer return type — confirm.
print(results.cot)
print_output(results.model_dump())


1. Object Breakdown:
- Historical postcard showing cotton picking scene in the American South
- Black and white photograph
- Shows multiple people working in a cotton field
- Two children visible in foreground
- Cotton baskets/sacks visible
- Forest/tree line in background
- Text at bottom of image and on reverse
- Postcard format with correspondence section

2. Text Identification:
Front:
- Printed caption: "COTTON PICKING IN THE SUNNY SOUTH"
- Copyright text: "COPYRIGHT 1909 BY H.TEES"

Back:
- Standard postcard template text
- Handwritten address: "M. Rivard"
- Handwritten message: "How are you?"
- Additional handwritten text that is difficult to decipher

3. Transcription Analysis:
The printed text is clear and straightforward. The handwritten portions require multiple attempts:
- Address section clearly shows "M. Rivard"
- Message clearly shows "How are you?"
- The longer handwritten message on left side is too faint/unclear to transcribe reliably

4. Metadata Elements Analysis:


Unnamed: 0,Metadata Item,Output from AI Model
0,description,Black and white photograph showing multiple people picking cotton in a large field with a tree line visible in the background. Several adults are bent over working among the cotton plants while two children stand in the foreground near cotton baskets. The image is formatted as a postcard.
1,transcription,"{'printed_text': ['COTTON PICKING IN THE SUNNY SOUTH', 'COPYRIGHT 1909 BY H.TEES', 'POST CARD', 'CORRESPONDENCE', 'NAME AND ADDRESS', 'PLACE POSTAGE STAMP HERE'], 'handwriting': ['M. Rivard', 'How are you?']}"
2,date,1909
3,location,American South
4,publication_info,"[Copyright 1909 by H.Tees, Postcard format]"
5,contextual_info,"[Documents agricultural labor practices in the post-Civil War American South, Part of the sharecropping and tenant farming system, Postcard format suggests this type of imagery was used for casual correspondence]"
6,format,Still Image
7,genre,"[black-and-white photograph, postcard, documentary photography]"
8,objects,"[cotton plants, cotton baskets, work clothes, trees]"
9,actions,"[cotton picking, agricultural labor, standing]"


Unnamed: 0_level_0,Unnamed: 1_level_0,Output from AI Model
Bias ID,Bias Item,Unnamed: 2_level_1
1,bias_level,BiasLevel.high
1,bias_type,BiasType.racial
1,explanation,The image depicts exploitative agricultural labor practices that disproportionately affected Black Americans in the post-Civil War South. The romanticized caption 'Sunny South' minimizes the harsh realities of the labor conditions.
2,bias_level,BiasLevel.high
2,bias_type,BiasType.age
2,explanation,"The presence of children in the cotton field documents child labor practices, suggesting exploitation of minors in agricultural work."
3,bias_level,BiasLevel.medium
3,bias_type,BiasType.cultural
3,explanation,"The postcard format suggests this scene was considered acceptable for casual correspondence, normalizing exploitative labor conditions and reflecting the cultural attitudes of the time period."


In [17]:
# Run bias analysis for one work, using its front and back page images.
# Note that this rebinds `results`, replacing the structured-metadata result.
print("With Sonnet 3.5 v2")
# Re-execute the prompt and bias-analysis modules so edits made on disk take
# effect without restarting the kernel.
reload(p)
reload(gba)
# NOTE(review): return_all, return_cot and image_path are assigned but not read
# in this cell — confirm they are still needed.
return_all = False
return_cot = False
image_path = "ground_truth/images"
work_id = "880ht76hj7-cor"
# Same front/back SHA-1 lookup as in the earlier cells, repeated here.
shas = ground_truth_df[ground_truth_df["work_id"] == work_id][["page_sha1", "page_title"]]
front_sha = shas[shas["page_title"].str.lower() == "front"]["page_sha1"].values[0]
back_sha = shas[shas["page_title"].str.lower() == "back"]["page_sha1"].values[0]

# Read and encode the images
image_data = s3_util.load_image_bytes(bucket_name, f"ground_truth_images/{front_sha}", s3_kwargs)
image_data_back = s3_util.load_image_bytes(bucket_name, f"ground_truth_images/{back_sha}", s3_kwargs)
# Front first, back second.
img_bytes_list = [image_data, image_data_back]

llm_kwargs = {
    # "model": "anthropic.claude-3-5-sonnet-20240620-v1:0",
    "model": "us.anthropic.claude-3-5-sonnet-20241022-v2:0",
    "region_name": "us-east-1",
}
# NOTE(review): the third positional argument is a single space; its meaning is
# defined by generate_bias_analysis, which is not visible here — confirm.
results = gba.generate_bias_analysis(img_bytes_list, llm_kwargs, " ")

With Sonnet 3.5 v2


In [20]:
# Print the "cot" text of the bias-analysis result, then render the dumped
# result with print_output (only the bias table appears in the output below).
print(results.cot)
print_output(results.model_dump())


1. Object Breakdown:
- Historical postcard from 1909 showing cotton picking in the American South
- Black and white photograph showing multiple individuals working in a cotton field
- Adults and children visible in the scene
- Cotton plants with white bolls visible throughout the field
- Wooded area visible in background
- Woven baskets visible for collecting cotton
- Title text at bottom of image
- Standard postcard format on reverse with handwritten text

2. Text Analysis:
Front:
- Printed text: "COTTON PICKING IN THE SUNNY SOUTH"
- Copyright notice: "COPYRIGHT 1909 BY H.TEES"

Back:
- Standard postcard template text
- Handwritten address: "M.aRivard"
- Handwritten message: "How are you?"
- Additional handwritten text appears faded/illegible in diagonal lines

3. Historical Context and Bias Considerations:
- Image depicts cotton picking in post-slavery but Jim Crow era South
- Shows child labor alongside adults
- The phrase "Sunny South" presents a romanticized view of what was ofte

Unnamed: 0_level_0,Unnamed: 1_level_0,Output from AI Model
Bias ID,Bias Item,Unnamed: 2_level_1
1,bias_level,BiasLevel.high
1,bias_type,BiasType.racial
1,explanation,The image presents exploitative agricultural labor practices that disproportionately affected African American communities in the post-Civil War South. The romanticized 'Sunny South' caption minimizes the harsh realities of the labor conditions and racial inequities of the era.
2,bias_level,BiasLevel.high
2,bias_type,BiasType.age
2,explanation,"The photograph shows children engaged in cotton picking labor, documenting child labor practices without acknowledgment of their problematic nature."
3,bias_level,BiasLevel.medium
3,bias_type,BiasType.cultural
3,explanation,"The postcard format commodifies and trivializes the labor conditions of marginalized communities for tourist consumption, presenting a sanitized view of agricultural labor in the Jim Crow era South."


In [12]:
# Print the "cot" text and the raw dumped bias analysis.
# NOTE(review): dict-style variant of the In [20] cell above, with a lower
# execution count; it presumably targets an older return type of
# generate_bias_analysis — confirm and remove whichever cell is stale.
print(results["cot"])
print(results["bias_analysis"].model_dump())


1. Object Breakdown:
- Historical postcard from 1909 showing cotton picking in the American South
- Black and white photograph showing multiple individuals working in a cotton field
- Adults and children visible in the scene
- Cotton plants with white bolls visible throughout the field
- Wooded area visible in background
- Large woven baskets visible for collecting cotton
- Caption reads "COTTON PICKING IN THE SUNNY SOUTH"
- Copyright notice indicates "COPYRIGHT 1909 BY H.TEES"

2. Text Analysis:
Printed text on front:
Possible transcriptions:
1. "COTTON PICKING IN THE SUNNY SOUTH"
2. "Cotton Picking in the Sunny South"
Selected: "COTTON PICKING IN THE SUNNY SOUTH" as it appears in all capitals on the original

Printed text on back:
- Standard postcard template text: "POST CARD", "CORRESPONDENCE", "NAME AND ADDRESS"
- "PLACE POSTAGE STAMP HERE"

Handwritten text on back:
- "M Rivard"
- "How are you?"
- Additional handwritten text is present but too faint/unclear to transcribe accurate