In [2]:
# INSTALLS THAT NEED TO BE RUN ON CONDA
# !pip install langchain-aws
# !pip install loguru

In [1]:
import json
import boto3
import base64
import re
import os
from PIL import Image
from pathlib import Path
import glob
import time
from importlib import reload
import pandas as pd
from IPython.display import display
from io import BytesIO
from botocore.config import Config

# Lift pandas' display limits (column width, row count, column count) so the
# frames shown later in this notebook are rendered without truncation.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# os.chdir('..')
# Print the working directory: the relative paths used in later cells
# (e.g. '../../lib/src', 'ground_truth') are resolved against it.
print("CWD:", os.getcwd())
# Module-level AWS clients, created without any explicit configuration.
bedrock_runtime = boto3.client("bedrock-runtime")
s3 = boto3.client('s3')

def show_base64_image(encoded_str):
    """Decode a base64 image string and render the image inline.

    Args:
        encoded_str: Base64 text of an image file. Trailing ``=`` padding may
            be absent; it is restored before decoding.
    """
    # Base64 text must have a length that is a multiple of four; work out how
    # many '=' characters are needed to get there (0 when already aligned).
    pad_count = -len(encoded_str) % 4
    padded = encoded_str + '=' * pad_count if pad_count else encoded_str

    # Decode to raw bytes, wrap them in a file-like buffer for PIL, and show.
    raw_bytes = base64.b64decode(padded)
    display(Image.open(BytesIO(raw_bytes)))

CWD: /home/ec2-user/SageMaker/ai-description/projects/research


In [2]:
# Import the project's `image_captioning_assistant` modules from the source
# tree by temporarily making `lib/src` the working directory.
# Remember where we started so the directory can be restored exactly: a
# relative "go back" path would move the kernel to the wrong place whenever the
# first chdir fails (the `finally` clause runs in that case too).
_notebook_cwd = os.getcwd()
try:
    os.chdir('../../lib/src')
    import image_captioning_assistant.generate.prompts as p
    import image_captioning_assistant.generate.generate_bias_analysis as gba
    import image_captioning_assistant.generate.generate_structured_metadata as gsm
    import image_captioning_assistant.generate.utils as gen_utils
    # import image_captioning_assistant.data.data_classes as dc
    import image_captioning_assistant.aws.s3 as s3_util
finally:
    # Always return to the starting directory, whether or not the imports
    # (or the chdir itself) succeeded.
    os.chdir(_notebook_cwd)

In [3]:
# download ground truth set to local
def download_s3_directory(bucket, s3_prefix, local_dir):
    """Download every object under ``s3_prefix`` in ``bucket`` into ``local_dir``.

    The part of each object key that follows ``s3_prefix`` is reproduced as a
    relative path below ``local_dir``; parent directories are created as
    needed. Keys ending in ``/`` (directory markers) are skipped.

    Args:
        bucket: Name of the S3 bucket to list and download from.
        s3_prefix: Key prefix to mirror. Works with or without a trailing
            slash.
        local_dir: Destination directory, as a ``str`` or ``pathlib.Path``.

    Returns:
        None. Files are written to disk as a side effect.
    """
    # Accept plain strings as well as Path objects.
    local_dir = Path(local_dir)
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')

    for page in paginator.paginate(Bucket=bucket, Prefix=s3_prefix):
        # Pages without any matching object carry no 'Contents' entry.
        for obj in page.get('Contents', []):
            key = obj['Key']

            # Skip directory markers
            if key.endswith('/'):
                continue

            # Build the path relative to the prefix. The leading slash left
            # behind by a prefix without a trailing '/' must be removed:
            # joining a Path with an absolute segment discards the left side,
            # which would place the file outside local_dir.
            relative_path = key[len(s3_prefix):] if key.startswith(s3_prefix) else key
            relative_path = relative_path.lstrip('/')
            if not relative_path:
                # The key is exactly the prefix (a single object): fall back
                # to its base name so the target is a file, not local_dir.
                relative_path = key.rsplit('/', 1)[-1]
            local_path = local_dir / relative_path

            # Create parent directories and download
            local_path.parent.mkdir(parents=True, exist_ok=True)
            s3.download_file(bucket, key, str(local_path))

# Configuration: source bucket and the local directory (relative to the
# current working directory) that receives the ground-truth files.
bucket_name = 'gaiic-emory-dev'
local_base = Path('ground_truth')

# Download the single ground-truth CSV from the bucket root to
# ground_truth/ground_truth.csv, creating the target directory first.
csv_path = local_base / 'ground_truth.csv'
csv_path.parent.mkdir(parents=True, exist_ok=True)
boto3.client('s3').download_file(bucket_name, 'ground_truth.csv', str(csv_path))

# Download entire images directory (currently disabled; when enabled the
# images land in ground_truth/images)
# download_s3_directory(
#     bucket=bucket_name,
#     s3_prefix='ground_truth_images/',
#     local_dir=local_base / 'images'
# )

In [4]:
# Load the ground-truth CSV downloaded above; later cells read its
# 'work_id', 'page_sha1' and 'page_title' columns.
ground_truth_df = pd.read_csv('ground_truth/ground_truth.csv')

In [5]:
def display_work_id_images(work_id, image_dir='ground_truth/images'):
    """Display every locally available page image belonging to ``work_id``.

    Selects the ``page_sha1`` values of the rows in the module-level
    ``ground_truth_df`` whose ``work_id`` column equals ``work_id`` and shows
    the file named after each sha inside ``image_dir``.

    Args:
        work_id: Value to match against the ``work_id`` column.
        image_dir: Directory containing one image file per sha, as a ``str``
            or ``pathlib.Path``. The default is relative to the notebook's
            working directory (``projects/research``) and matches the
            ``ground_truth/images`` location used by the other cells.

    Returns:
        None. Images are rendered with ``display``; shas without a local file
        are reported with a printed message instead of being skipped silently.
    """
    image_dir = Path(image_dir)
    shas = ground_truth_df.loc[ground_truth_df['work_id'] == work_id, 'page_sha1']

    for sha in shas:
        img_path = image_dir / str(sha)
        if img_path.exists():
            display(Image.open(img_path))
        else:
            # Make a wrong directory or a missing download visible.
            print(f"Image not found locally, skipping: {img_path}")
# display_work_id_images('7203xsj45j-cor')

In [6]:
def print_output(output):
    """Render a model output dictionary as tables.

    Args:
        output: Dictionary with a required ``bias_analysis`` entry (a list of
            dictionaries) and an optional ``metadata`` entry (a mapping).

    Displays a two-column table for ``metadata`` when it is present and
    non-empty, then a table of all bias entries indexed by
    (``Bias ID``, ``Bias Item``), where the id is the 1-based position of the
    entry in the list.
    """
    metadata = output.get("metadata")
    if metadata:
        meta_series = pd.Series(metadata)
        meta_table = pd.DataFrame(
            {
                "Metadata Item": meta_series.index,
                "Output from AI Model": meta_series.values,
            }
        )
        display(meta_table)

    # Flatten the list of bias dictionaries into parallel lists of
    # (bias id, field name) index tuples and their values.
    index_tuples = []
    cell_values = []
    for bias_id, bias_entry in enumerate(output['bias_analysis'], start=1):
        for field_name, field_value in bias_entry.items():
            index_tuples.append((bias_id, field_name))
            cell_values.append(field_value)

    bias_table = pd.DataFrame(
        {"Output from AI Model": cell_values},
        index=pd.MultiIndex.from_tuples(index_tuples, names=["Bias ID", "Bias Item"]),
    )
    display(bias_table)

Be less specific about objects that don't matter: for example, mention that there is a house rather than describing its windows, and say that the item is a poster rather than listing the individual parts of the poster.

Make sure to characterize the object itself, e.g. state that it is a black-and-white photo.

In [7]:
# # s3://gaiic-emory-dev/ground_truth_images/3420d30e9b3c03a19105b4d1c92ff2b8880905c8
# Keyword arguments handed to s3_util.load_image_bytes below: virtual-hosted
# addressing, SigV4 signing and the us-east-1 region.
s3_kwargs = {
        "config": Config(
            s3={"addressing_style": "virtual"},
            signature_version="s3v4",
        ),
        "region_name": 'us-east-1',
    }
# NOTE(review): image_path is assigned but not used by the visible cells.
image_path = 'ground_truth/images'
work_id = '880ht76hj7-cor'
# Rows of the ground truth belonging to this work, reduced to sha + page title.
shas = ground_truth_df[ground_truth_df['work_id']==work_id][['page_sha1','page_title']]
# First sha whose title is 'front' / 'back' (case-insensitive). `.values[0]`
# raises IndexError when the work has no such page.
front_sha = shas[shas['page_title'].str.lower()=='front']['page_sha1'].values[0]
back_sha = shas[shas['page_title'].str.lower()=='back']['page_sha1'].values[0]
# Load the front image from the 'ground_truth_images/' prefix of the bucket.
front_bytes = s3_util.load_image_bytes(bucket_name, f'ground_truth_images/{front_sha}', s3_kwargs)
# Cell output: size of the loaded object.
len(front_bytes)

27971190

In [8]:
# from PIL import Image
# import base64
# from io import BytesIO

# with open(image_full_path, "rb") as image_file:
#     # Open image and convert to RGB (removes alpha channel if present)
#     image = Image.open(image_file).convert('RGB')
    
#     # Set maximum dimensions while maintaining aspect ratio
#     max_dimension = 2048  # Adjust this based on your size requirements
#     image.thumbnail((max_dimension, max_dimension), Image.LANCZOS)
    
#     # Optimize JPEG quality and save to buffer
#     buffer = BytesIO()
#     image.save(buffer, 
#               format='JPEG', 
#               quality=85,  # Adjust between 75-95 for quality/size balance
#               optimize=True)
    
#     buffer.seek(0)
#     image_data = base64.b64encode(buffer.read()).decode("utf-8")

# # Verify size constraint
# if len(image_data) >= 10000000:
#     raise ValueError("Resized image still exceeds size limit - try reducing max_dimension or quality")
# print(len(image_data))



# show_base64_image(image_data)

In [9]:
# Generate structured metadata for the front and back images of one work.
print("With Sonnet 3.5 v2")
# Re-import the prompt and generation modules so that edits made to them on
# disk are picked up without restarting the kernel.
reload(p)
reload(gsm)
# NOTE(review): return_all, return_cot and image_path are assigned here but
# not used by the visible cells.
return_all = False
return_cot = False
image_path = 'ground_truth/images'
work_id = '880ht76hj7-cor'
# Shas of the 'front' and 'back' pages of the work (title match is
# case-insensitive; `.values[0]` raises IndexError if a page is missing).
shas = ground_truth_df[ground_truth_df['work_id']==work_id][['page_sha1','page_title']]
front_sha = shas[shas['page_title'].str.lower()=='front']['page_sha1'].values[0]
back_sha = shas[shas['page_title'].str.lower()=='back']['page_sha1'].values[0]

# Load both images from S3 (no encoding is done in this cell)
image_data = s3_util.load_image_bytes(bucket_name, f'ground_truth_images/{front_sha}', s3_kwargs)
image_data_back = s3_util.load_image_bytes(bucket_name, f'ground_truth_images/{back_sha}', s3_kwargs)
# Order matters for the reader of the output: front first, then back.
img_bytes_list = [image_data, image_data_back]

# Model selection passed through to the generation helper.
llm_kwargs = {
                # "model": "anthropic.claude-3-5-sonnet-20240620-v1:0",
                "model": "us.anthropic.claude-3-5-sonnet-20241022-v2:0",
                "region_name": 'us-east-1',
            }
# NOTE(review): the third positional argument is a single-space string; its
# meaning is defined by gsm.generate_structured_metadata - confirm there.
results = gsm.generate_structured_metadata(
    img_bytes_list,
    llm_kwargs,
    " "
)

With Sonnet 3.5 v2


In [10]:
# Print the `cot` attribute of the result (presumably the model's reasoning
# text - see the output below), then render the dumped result as tables.
print(results.cot)
print_output(results.model_dump())


1. Object Breakdown:
- Historical postcard showing cotton picking scene in the American South
- Multiple people working in a cotton field
- Two children visible in foreground
- Cotton baskets/sacks present
- Forest/tree line in background
- Sepia-toned photograph
- Copyright text and handwritten message on reverse

2. Text Analysis:
Front:
- Printed text: "COTTON PICKING IN THE SUNNY SOUTH"
- Copyright notice: "COPYRIGHT 1909 BY H.TEES"

Back:
- Standard postcard template text
- Handwritten message appears to be personal correspondence
- Name "Markward" written
- Message "How are you?"
- Additional handwritten text that is difficult to decipher due to cursive writing style and age of document

3. Transcription Analysis:
Multiple possible interpretations of the handwritten text on reverse were considered, but much is illegible. The clearly legible portions are:
- "Markward"
- "How are you?"
The rest of the handwritten text requires too much speculation to transcribe accurately.

4. Met

Unnamed: 0,Metadata Item,Output from AI Model
0,description,"Black and white photograph showing multiple individuals picking cotton in a large field. Several adults are bent over working among cotton plants, while two children stand near cotton baskets in the foreground. A tree line is visible in the background. The image captures agricultural labor conditions in the early 20th century American South."
1,transcription,"{'printed_text': ['COTTON PICKING IN THE SUNNY SOUTH', 'COPYRIGHT 1909 BY H.TEES', 'POST CARD', 'CORRESPONDENCE', 'NAME AND ADDRESS', 'PLACE POSTAGE STAMP HERE'], 'handwriting': ['Markward', 'How are you?']}"
2,date,1909
3,location,Southern United States
4,publication_info,"[Copyright 1909 by H.Tees, Postcard format]"
5,contextual_info,"[Documents cotton farming practices in early 20th century, Shows agricultural labor conditions, Represents sharecropping/tenant farming era, Produced as souvenir postcard]"
6,format,LibraryFormat.still_image
7,genre,"[black-and-white photograph, postcard]"
8,objects,"[cotton plants, cotton baskets, trees]"
9,actions,"[picking cotton, harvesting, standing]"


Unnamed: 0_level_0,Unnamed: 1_level_0,Output from AI Model
Bias ID,Bias Item,Unnamed: 2_level_1
1,bias_level,BiasLevel.high
1,bias_type,BiasType.racial
1,explanation,The image depicts and was produced during a period of systematic racial exploitation in agricultural labor. The postcard format suggests this serious subject was treated as entertainment or souvenir material.
2,bias_level,BiasLevel.high
2,bias_type,BiasType.age
2,explanation,The presence of children in a labor setting documents child labor practices that would now be considered exploitative and illegal.
3,bias_level,BiasLevel.medium
3,bias_type,BiasType.cultural
3,explanation,The phrase 'Sunny South' minimizes the harsh realities of agricultural labor conditions and perpetuates a romanticized view of the American South during a period of significant social injustice.


In [11]:
# Generate the bias analysis for the front and back images of one work.
print("With Sonnet 3.5 v2")
# Re-import the prompt and bias-analysis modules so that edits made to them
# on disk are picked up without restarting the kernel.
reload(p)
reload(gba)
# NOTE(review): return_all, return_cot and image_path are assigned here but
# not used by the visible cells.
return_all = False
return_cot = False
image_path = 'ground_truth/images'
work_id = '880ht76hj7-cor'
# Shas of the 'front' and 'back' pages of the work (title match is
# case-insensitive; `.values[0]` raises IndexError if a page is missing).
shas = ground_truth_df[ground_truth_df['work_id']==work_id][['page_sha1','page_title']]
front_sha = shas[shas['page_title'].str.lower()=='front']['page_sha1'].values[0]
back_sha = shas[shas['page_title'].str.lower()=='back']['page_sha1'].values[0]

# Load both images from S3 (no encoding is done in this cell)
image_data = s3_util.load_image_bytes(bucket_name, f'ground_truth_images/{front_sha}', s3_kwargs)
image_data_back = s3_util.load_image_bytes(bucket_name, f'ground_truth_images/{back_sha}', s3_kwargs)
# Order matters for the reader of the output: front first, then back.
img_bytes_list = [image_data, image_data_back]

# Model selection passed through to the generation helper.
llm_kwargs = {
                # "model": "anthropic.claude-3-5-sonnet-20240620-v1:0",
                "model": "us.anthropic.claude-3-5-sonnet-20241022-v2:0",
                "region_name": 'us-east-1',
            }
# NOTE(review): the third positional argument is a single-space string; its
# meaning is defined by gba.generate_bias_analysis - confirm there.
results = gba.generate_bias_analysis(
    img_bytes_list,
    llm_kwargs,
    " "
)

With Sonnet 3.5 v2


In [12]:
# Print the `cot` attribute of the result (presumably the model's reasoning
# text - see the output below), then render the dumped result as tables.
print(results.cot)
print_output(results.model_dump())


1. Object Breakdown:
- This is a 1909 postcard showing cotton picking in the American South
- The image shows multiple individuals, including adults and children, working in a cotton field
- The workers appear to be African American, wearing period work clothing
- The scene depicts manual labor in what appears to be a post-slavery but likely sharecropping context
- The landscape shows a cotton field with trees in the background
- The postcard format suggests this was meant as a souvenir or communication item

2. Text Analysis:
Printed text on front:
- "COTTON PICKING IN THE SUNNY SOUTH"
- "COPYRIGHT 1909 BY H.TEES"

Postcard reverse side printed text:
- "POST CARD"
- "CORRESPONDENCE"
- "NAME AND ADDRESS"
- "PLACE POSTAGE STAMP HERE"

Handwritten text on reverse:
- "M. Rivard"
- "How are you?"
- Additional handwritten text that is difficult to fully decipher due to cursive writing style and fading

3. Bias Analysis Considerations:
High level concerns:
- The image romanticizes and norma

Unnamed: 0_level_0,Unnamed: 1_level_0,Output from AI Model
Bias ID,Bias Item,Unnamed: 2_level_1
1,bias_level,BiasLevel.high
1,bias_type,BiasType.racial
1,explanation,"The image romanticizes and commodifies the exploitation of African American agricultural workers in the post-Civil War South, turning their labor into a tourist souvenir while minimizing the harsh realities of their working conditions"
2,bias_level,BiasLevel.medium
2,bias_type,BiasType.age
2,explanation,"The presence of children working in the cotton field normalizes child labor, presenting it as an acceptable practice"
3,bias_level,BiasLevel.high
3,bias_type,BiasType.cultural
3,explanation,"The phrase 'Sunny South' and the postcard format trivialize the serious social and economic oppression of the sharecropping system, presenting it as a picturesque scene rather than acknowledging its exploitative nature"
