### Load chunks as we did before

In [2]:
from chonkie import SemanticChunker
from pathlib import Path

md_filepath = Path("data/parsed/hai_ai-index-report-2025_chapter2_excerpts-parsed-w-imgs.md")
md_txt = md_filepath.read_text()

chunker = SemanticChunker(
    embedding_model="minishlab/potion-base-8M",  # Default model
    threshold=0.5,                               # Similarity threshold (0-1) or (1-100) or "auto"
    chunk_size=2048,                              # Maximum tokens per chunk
    min_sentences=1                              # Initial sentences per chunk
)
chunk_texts = chunker.chunk(md_txt)

### Set up Weaviate Collection

In [3]:
import weaviate

%store -r WEAVIATE_IP
%store -r AWS_ACCESS_KEY
%store -r AWS_SECRET_KEY
%store -r AWS_SESSION_TOKEN

client = weaviate.connect_to_local(
    WEAVIATE_IP,
    headers = {
        "X-AWS-Access-Key": AWS_ACCESS_KEY,
        "X-AWS-Secret-Key": AWS_SECRET_KEY,
        "X-AWS-Session-Token": AWS_SESSION_TOKEN,
    }        
)

client.is_ready()

True

In [4]:
client.collections.delete("Chunks")

In [5]:
from weaviate.classes.config import Property, DataType, Configure, Tokenization

client.collections.create(
    name="Chunks",
    properties=[
        Property(
            name="document_title",
            data_type=DataType.TEXT,
        ),
        Property(
            name="chunk",
            data_type=DataType.TEXT,
        ),
        Property(
            name="chunk_number",
            data_type=DataType.INT,
        ),
        Property(
            name="filename",
            data_type=DataType.TEXT,
            tokenization=Tokenization.FIELD
        ),
    ],
    vector_config=[
        Configure.Vectors.text2vec_aws(
            name="default",
            source_properties=["document_title", "chunk"],
            region="us-west-2",
            service="bedrock",
            model="amazon.titan-embed-text-v2:0"
        )
    ]
)

<weaviate.collections.collection.sync.Collection at 0x7f2d301e74a0>

In [6]:
chunks = client.collections.use("Chunks")

### Import data

In [7]:
from tqdm import tqdm

with chunks.batch.fixed_size(batch_size=100) as batch:
    for i, chunk_text in tqdm(enumerate(chunk_texts)):
        obj = {
            "document_title": "HAI AI Index Report 2025",
            "filename": "data/pdfs/hai_ai-index-report-2025_chapter2_excerpts.pdf",
            "chunk": chunk_text.text,
            "chunk_number": i + 1,
        }

        # Add object to batch for import with (batch.add_object())
        # BEGIN_SOLUTION
        batch.add_object(
            properties=obj
        )
        # END_SOLUTION

33it [00:00, 21657.34it/s]


### RAG queries

How do we perform RAG in this scenario? 

This is a bit different, because we haven't embedded the images (or stored them in Weaviate).

In this scenario, let's:

- Retrieve text chunks
- Get images referred to in the text
- Convert the images to base64
- Send (retrieved text + images + prompt) to LLM for RAG

In [21]:
response = chunks.query.hybrid(
    query="Latest in self-driving cars",
    limit=3
)

for o in response.objects:
    print(f"\n" + "=" * 40)
    print(o.properties["chunk"][:1000] + "...")



Source: Wijk et al., 2024 | Chart: 2025 AI Index report

![Image](data/parsed/hai_ai-index-report-2025_chapter2_excerpts-parsed-w-imgs_artifacts/image_000012_90c7c87f35758c0ce8c4a95f97089830098242f6ae4df85ab58cd0aa4f31fa8b.png)

## Chapter 2: Technical Performance

## Self-Driving Cars

Self-driving vehicles have long been a goal for AI researchers and technologists. However, their widespread adoption has been  slower  than  anticipated.  Despite  many  predictions that  fully  autonomous  driving  is  imminent,  widespread  use of  self-driving  vehicles  has yet  to  become  a  reality.  Still,  in recent  years,  signi fi cant  progress  has  been  made.  In  cities like  San  Francisco  and  Phoenix, fl eets  of  self-driving  taxis are  now  operating  commercially.  This  section  examines recent advancements in autonomous driving, focusing on deployment, technological breakthroughs and new benchmarks, safety performance, and policy challenges.

## Deployment

Self-driving cars

In [22]:
import re

def extract_image_paths(text):
    """Extract image paths from markdown-style image references."""
    pattern = r'!\[.*?\]\((.*?)\)'
    return re.findall(pattern, text)

In [23]:
def get_image_base64s(image_paths, base_path=None):
    import base64
    base64_images = []
    for img_path in image_paths:
        full_path = Path(base_path) / img_path if base_path else Path(img_path)
        image_bytes = full_path.read_bytes()
        base64_string = base64.b64encode(image_bytes).decode("utf-8")
        base64_images.append(base64_string)

    return base64_images

In [24]:
all_chunks = ""
all_images = []

for o in response.objects:
    chunk_text = o.properties["chunk"]
    image_paths = extract_image_paths(chunk_text)
    print(f"Adding image paths: {image_paths}")
    all_images.extend(get_image_base64s(image_paths, base_path="data/parsed"))

    all_chunks += "\n\n" + chunk_text

Adding image paths: ['data/parsed/hai_ai-index-report-2025_chapter2_excerpts-parsed-w-imgs_artifacts/image_000012_90c7c87f35758c0ce8c4a95f97089830098242f6ae4df85ab58cd0aa4f31fa8b.png']
Adding image paths: []
Adding image paths: []


In [25]:
message_list = [
    {
        "role": "user",
        "content": []
    }
]

for img in all_images:
    content = {
        "image": {
            "format": "png",
            "source": {
                "bytes": img
            },
        }
    },
    # Append `content`` to message["content"]
    # BEGIN_SOLUTION
    message["content"].append(content)
    # END_SOLUTION
    
task_text = """
What does this tell us about the latest in self-driving cars

Describe the details from the figures as well, if necessary.
""" + "\n\n" + all_chunks    
message["content"].append({"text": task_text})

In [29]:
import base64
import boto3
import json

client = boto3.client(
    "bedrock-runtime",
    region_name="us-west-2",
)

# MODEL_ID = "us.amazon.nova-lite-v1:0"
MODEL_ID = "us.amazon.nova-pro-v1:0"
# Define your system prompt(s).
system_list = [    {
        "text": "You are an expert. Read the provided text and content of these images and answer the questions thoughtfully but succinctly if possible."
    }
]

# Configure the inference parameters.
inf_params = {"maxTokens": 300, "topP": 0.1, "topK": 20, "temperature": 0.3}

native_request = {
    "schemaVersion": "messages-v1",
    "messages": message_list,
    "system": system_list,
    "inferenceConfig": inf_params,
}
# Invoke the model and extract the response body.
response = client.invoke_model(modelId=MODEL_ID, body=json.dumps(native_request))
model_response = json.loads(response["body"].read())

# Print the text content for easy readability.
content_text = model_response["output"]["message"]["content"][0]["text"]
print("\n[Response Content Text]")
print(content_text)

  datetime_now = datetime.datetime.utcnow()



[Response Content Text]


## Text

The following is a list of the top 10 most common types of malware:

1. **Adware**: This type of malware displays unwanted advertisements on a user's device. It often comes bundled with free software and can significantly slow down the device.
2. **Browser Hijackers**: These malware programs modify a user's browser settings without their consent, redirecting them to unwanted websites or changing their homepage.
3. **Ransomware**: This malicious software encrypts a user's files and demands payment (usually in cryptocurrency) to decrypt them. It can cause significant data loss if the ransom is not paid.
4. **Trojans**: These are disguised as legitimate software but, once executed, perform malicious actions such as stealing data or installing additional malware.
5. **Worms**: These self-replicating malware programs spread across networks without user intervention. They can consume network resources and cause significant damage.
6. **Spyware**: This type

In [None]:
import anthropic

anthropic_response = anthropic.Anthropic().messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    # Add [message] as the messages to pass to Claude
    # BEGIN_SOLUTION
    messages=[message]
    # END_SOLUTION
)

In [None]:
print(anthropic_response.content[0].text)

In [None]:
client.close()