In [48]:
import os
from anthropic import Anthropic
import base64

client = Anthropic(
    api_key=os.environ.get("ANTHROPIC_API_KEY"), # make sure you create your .env file first
)

# The data directory of png files to transcribe
# Download national archive files here:
# https://evidence-hou.se/events/big-llm-hack-24/data/correspondence.html
dir = './data/Sample Images - PREM 19/1'

# It seems that the files from national archieve are actually jpeg despite being named png? According to anthropic anyway
file_extension_to_consider = '.png'
image_media_type = "image/jpeg"

transcripts_dir_name = 'transcripts'


## Define the fields you want to extract from each letter


In [49]:
fields = [
  {
    'title': "Date",
    'tag': 'date',
    'description': 'The date the letter was written in YYYY-MM-DD format, or N/A if unknown. If the year and month are known but not the day, use the first day of the month.'
  },
  {
    'title': "Sender",
    'tag': 'sender',
    'description': 'The person or department which sent the letter, or N/A if unknown'
  },
  {
    'title': "Recipient",
    'tag': 'recipient',
    'description': 'The person or department which received the letter, or N/A if unknown'
  },
  {
    'title': "Subject",
    'tag': 'subject',
    'description': 'A one-line subject of the letter if present, otherwise infer this yourself from the context'
  }
]

In [50]:
def prompt():
    fields_string = "\n".join(f"- {field['title']}: {field['description']}. Wrap this field in <{field['tag']}> tags." for field in fields)
    
    return f"""
Transcribe the text in this image in full, wrapped in <transcription> tags.

Please also extract the following fields:

{fields_string}

""".strip()

print(prompt())

Transcribe the text in this image in full, wrapped in <transcription> tags.

Please also extract the following fields:

- Date: The date the letter was written in YYYY-MM-DD format, or N/A if unknown. If the year and month are known but not the day, use the first day of the month.. Wrap this field in <date> tags.
- Sender: The person or department which sent the letter, or N/A if unknown. Wrap this field in <sender> tags.
- Recipient: The person or department which received the letter, or N/A if unknown. Wrap this field in <recipient> tags.
- Subject: A one-line subject of the letter if present, otherwise infer this yourself from the context. Wrap this field in <subject> tags.


## Transcribe all the files in `dir`


In [51]:
os.makedirs(os.path.join(dir, transcripts_dir_name), exist_ok=True)

i = 0
files = sorted(os.listdir(dir))
for filename in files:
    if not filename.endswith(file_extension_to_consider):
        continue
    
    i += 1
    print(f"{i}/{len(files)} - Processing {filename}")

    filepath = os.path.join(dir, filename)

    with open(filepath, "rb") as f:
        image_data = base64.b64encode(f.read()).decode("utf-8")

    message = client.messages.create(
        model="claude-3-haiku-20240307",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": image_media_type,
                            "data": image_data,
                        },
                    },
                    {"type": "text", "text": prompt()}
                ],
            }
        ],
    )
    parsed_doc = message.content[0].text

    with open(os.path.join(dir, transcripts_dir_name, filename.replace(file_extension_to_consider, '.txt')), 'w') as f:
        f.write(parsed_doc)

1/32 - Processing PREM19_0_0001_10_Im0.png
2/32 - Processing PREM19_0_0001_1_Im0.png
3/32 - Processing PREM19_0_0001_2_Im0.png
4/32 - Processing PREM19_0_0001_31_Im0.png
5/32 - Processing PREM19_0_0001_32_Im0.png
6/32 - Processing PREM19_0_0001_33_Im0.png
7/32 - Processing PREM19_0_0001_34_Im0.png
8/32 - Processing PREM19_0_0001_35_Im0.png
9/32 - Processing PREM19_0_0001_3_Im0.png
10/32 - Processing PREM19_0_0001_4_Im0.png
11/32 - Processing PREM19_0_0001_5_Im0.png
12/32 - Processing PREM19_0_0001_6_Im0.png
13/32 - Processing PREM19_0_0001_7_Im0.png
14/32 - Processing PREM19_0_0001_8_Im0.png
15/32 - Processing PREM19_0_0001_9_Im0.png
