# Overview



In [1]:
!pip install -q -r requirements.txt

# !pip install openai==1.93.0      # Only for testing
# ! pip install --upgrade docling openai torch


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os
import boto3
from botocore.config import Config
from docling.document_converter import DocumentConverter
from pathlib import Path
from pymilvus import MilvusClient
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.chat import SystemMessagePromptTemplate, HumanMessagePromptTemplate
import httpx

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# === Configuration ===
endpoint = os.getenv("AWS_S3_ENDPOINT")           # MinIO service DNS name (e.g. minio.minio.svc.cluster.local)
access_key = os.getenv("AWS_ACCESS_KEY_ID")       # MinIO access key
secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")   # MinIO secret key
region = os.getenv("AWS_DEFAULT_REGION")          # Dummy value; boto3 still expects one
bucket_name = os.getenv("AWS_S3_BUCKET")          # Default bucket to use for the Workspace data connection 
object_key = "2502.07835v1.pdf"                   # The name of the PDF in the S3 bucket
download_dir = "downloads"                        # Location to download the dowuments to

# RAG demo server URL
inference_server_url = "https://llama-32-3b-instruct-quantizedw8a8.rag-demo.svc.cluster.local/v1"

# Document Ingestion

In [4]:
# === Initialise S3 client ===
s3 = boto3.client(
    "s3",
    endpoint_url=f"http://{endpoint}",
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
    region_name=region,
    config=Config(signature_version="s3v4"),
)

# === Ensure download directory exists ===
os.makedirs(download_dir, exist_ok=True)
local_path = os.path.join(download_dir, object_key)
print(f"Downloading from {bucket_name}::{object_key} to: {local_path}")

# === Download the file ===
try:
    print(f"🟢 INFO: Downloading documents for chunking and ingestion...")
    s3.download_file(bucket_name, object_key, local_path)
    print(f"✅ Downloaded '{object_key}' to '{local_path}'")
except s3.exceptions.NoSuchKey:
    print(f"❌ File '{object_key}' not found in bucket '{bucket_name}'")
except Exception as e:
    print(f"❌ Error downloading file: {e}")


Downloading from rag-docs::2502.07835v1.pdf to: downloads/2502.07835v1.pdf
🟢 INFO: Downloading documents for chunking and ingestion...
✅ Downloaded '2502.07835v1.pdf' to 'downloads/2502.07835v1.pdf'


# Embedding Generation

In [5]:
# SentenceTransformer for generating text embeddings
from sentence_transformers import SentenceTransformer

"""
Text Embedding Module  
This module initialises a SentenceTransformer model using the ‘all-MiniLM-L6-v2’ embedding model and provides a function to generate text embeddings. (M.S. 0.98)

Global Variables:
    embedding_model (str): Name of the Hugging Face embedding model to load. (M.S. 0.98)
    model (SentenceTransformer): Instance of SentenceTransformer initialised with the specified embedding model. (M.S. 0.98)

Functions:
    emb_text(text: str) -> list[float]:
        Encode the input text and return its embedding vector as a list of floats. (M.S. 0.98)
"""
embedding_model="all-MiniLM-L6-v2"
model = SentenceTransformer(embedding_model)

def emb_text(text: str) -> list[float]:
    return model.encode(text)

In [6]:
# Use this to find the default number of dimensions this embedding model generates. We will use that later when we create the Milvus database schema.
test_embedding = emb_text("This is a test")
embedding_dim = len(test_embedding)

print(f"🟢 INFO: Embedding dimensions for the {embedding_model} model are: {embedding_dim}")
print(f"👀 DEBUG: Embedding snippet: {test_embedding[:10]}")

🟢 INFO: Embedding dimensions for the all-MiniLM-L6-v2 model are: 384
👀 DEBUG: Embedding snippet: [ 0.0306124   0.01383137 -0.02084381  0.01632793 -0.01023149 -0.04798423
 -0.01731336  0.03728744  0.04588732  0.034405  ]


In [7]:
from utils import project_root

# Assemble a complete path to the file so the document import can properly and reliably always find the document.
doc_source = project_root() / local_path

if not doc_source.is_file():
    raise FileNotFoundError(f"{DOC_SOURCE} does not exist.")

print(f"🟢 INFO: Found document at: {doc_source}")

🟢 INFO: Found document at: /opt/app-root/src/rhoai-roadshow-v2/docs/2-rag/notebook/downloads/2502.07835v1.pdf


In [8]:
"""
Parse and chunk a PDF using Docling v2.x
"""
doc = DocumentConverter().convert(source=doc_source).document

In [9]:
print(f"🟢 INFO: {doc.pages}")

🟢 INFO: {1: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=1), 2: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=2), 3: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=3), 4: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=4), 5: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=5), 6: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=6), 7: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=7), 8: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=8), 9: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=9), 10: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=10), 11: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=11), 12: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=12), 13: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=13)}


# Connect to Milvus

In [10]:
collection_name = "my_rag_collection"

milvus_client = MilvusClient(
    uri="http://milvus-service.milvus.svc.cluster.local:19530",
    db_name="default"
)

In [11]:
if milvus_client.has_collection(collection_name):
    milvus_client.drop_collection(collection_name)

In [12]:
milvus_client.create_collection(
    collection_name=collection_name,
    dimension=embedding_dim,
    metric_type="IP",  # Inner product distance
    consistency_level="Strong",  # Supported values are (`"Strong"`, `"Session"`, `"Bounded"`, `"Eventually"`). See https://milvus.io/docs/consistency.md#Consistency-Level for more details.
)

In [13]:
from docling_core.transforms.chunker import HierarchicalChunker

from docling.document_converter import DocumentConverter

converter = DocumentConverter()
chunker = HierarchicalChunker()

# Convert the input file to Docling Document
source = doc_source
doc = converter.convert(source).document

# Perform hierarchical chunking. This is faster than Hybrid chunking, but not as good.
texts = [chunk.text for chunk in chunker.chunk(doc)]

# Vector Storage and Search

In [14]:
from tqdm import tqdm

data = []

for i, chunk in enumerate(tqdm(texts, desc="Processing chunks")):
    embedding = emb_text(chunk)
    data.append({"id": i, "vector": embedding, "text": chunk})

milvus_client.insert(collection_name=collection_name, data=data)

Processing chunks: 100%|██████████| 70/70 [00:00<00:00, 197.46it/s]


{'insert_count': 70, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], 'cost': 0}

# Visualising how embeddings are stored in a vector database

<Describe how this visualises how the text is stored in the vector database.

https://projector.tensorflow.org/

# Query-Time Retrieval

In [15]:
question = (
    "What are the challenges of assessing assessing the quality of AI-generated code? What are some strategies for doing this?"
)

In [16]:
search_res = milvus_client.search(
    collection_name=collection_name,
    data=[emb_text(question)],
    limit=3,
    search_params={"metric_type": "IP", "params": {}},
    output_fields=["text"],
)

In [17]:
import json

retrieved_lines_with_distances = [
    (res["entity"]["text"], res["distance"]) for res in search_res[0]
]
print(f"🟢 INFO: Raw format of document chunks retrieved from the database: \n{json.dumps(retrieved_lines_with_distances, indent=4)}")

🟢 INFO: Raw format of document chunks retrieved from the database: 
[
    [
        "The rise of Large Language Models (LLMs) in software engineering, particularly in code generation, has garnered significant attention. However, assessing the quality of AI-generated code remains a challenge due to the inherent complexity of programming tasks and the lack of robust evaluation metrics that align well with human judgment. Traditional token-based metrics such as BLEU and ROUGE, while commonly used in natural language processing, exhibit weak correlations with human assessments in code intelligence and verification tasks. Furthermore, these metrics are primarily research focused and are not designed for seamless integration into the software development lifecycle, limiting their practical utility for developers seeking to improve code quality and security.",
        0.7107064723968506
    ],
    [
        "Inspired by G-EVAL , this paper proposes ICE-Score , an evaluation metric that levera

# Augmented Generation

In [18]:
# Create the context that we will pass to the LLM along with the question so that it can generate a response.
context = "\n".join(
    [line_with_distance[0] for line_with_distance in retrieved_lines_with_distances]
)
print(f"🟢 INFO: Documents assembled into a single context for the LLM: : \"{context}\"")

🟢 INFO: Documents assembled into a single context for the LLM: : "The rise of Large Language Models (LLMs) in software engineering, particularly in code generation, has garnered significant attention. However, assessing the quality of AI-generated code remains a challenge due to the inherent complexity of programming tasks and the lack of robust evaluation metrics that align well with human judgment. Traditional token-based metrics such as BLEU and ROUGE, while commonly used in natural language processing, exhibit weak correlations with human assessments in code intelligence and verification tasks. Furthermore, these metrics are primarily research focused and are not designed for seamless integration into the software development lifecycle, limiting their practical utility for developers seeking to improve code quality and security.
Inspired by G-EVAL , this paper proposes ICE-Score , an evaluation metric that leverages LLMs for code assessment across multiple programming languages, in

In [19]:
SYSTEM_PROMPT = (
  "You are an AI assistant that answers questions based solely on the provided context. "
  "If the answer cannot be found in context, reply truthfully that you don’t know."
)

USER_PROMPT = (
  "Context:\n"
  "{context}\n"
  "Question:\n"
  "{question}\n"
  "Answer concisely:"
)

In [20]:
llm = ChatOpenAI(
    model="llama-32-3b-instruct-quantizedw8a8",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    api_key="EMPTY",  # if you prefer to pass api key in directly instaed of using env vars
    base_url=inference_server_url,
    http_client=httpx.Client(verify=False)    # Because we are using an internal API endpoint (service) we need to disable SSL certificate checking.
)

# Define system and human templates
SYSTEM_PROMPT = SystemMessagePromptTemplate.from_template(
    "You are an AI assistant that answers questions based solely on the provided context. "
    "If the answer cannot be found in context, reply truthfully that you don’t know."
)

HumanMessagePromptTemplate = HumanMessagePromptTemplate.from_template(
    "Context:\n"
    "{context}\n"
    "Question:\n"
    "{question}\n"
    "Answer concisely:"
)

# Combine into a chat prompt
chat_prompt = ChatPromptTemplate.from_messages(
    [SYSTEM_PROMPT, HumanMessagePromptTemplate]
)

prompt = chat_prompt.format_prompt(context=context, question=question)

ai_msg = llm.invoke(prompt)


In [21]:
print(f"Original queston: {question}")

Original queston: What are the challenges of assessing assessing the quality of AI-generated code? What are some strategies for doing this?


In [22]:
print(f"Original context: \"{context}\"")

Original context: "The rise of Large Language Models (LLMs) in software engineering, particularly in code generation, has garnered significant attention. However, assessing the quality of AI-generated code remains a challenge due to the inherent complexity of programming tasks and the lack of robust evaluation metrics that align well with human judgment. Traditional token-based metrics such as BLEU and ROUGE, while commonly used in natural language processing, exhibit weak correlations with human assessments in code intelligence and verification tasks. Furthermore, these metrics are primarily research focused and are not designed for seamless integration into the software development lifecycle, limiting their practical utility for developers seeking to improve code quality and security.
Inspired by G-EVAL , this paper proposes ICE-Score , an evaluation metric that leverages LLMs for code assessment across multiple programming languages, including Java, Python, C, C++, and JavaScript . 

In [23]:
print(f"Response from LLM: {ai_msg.content}")

Response from LLM: The challenges of assessing the quality of AI-generated code include:

1. Complexity of programming tasks
2. Lack of robust evaluation metrics that align with human judgment
3. Weak correlations between traditional token-based metrics (e.g., BLEU, ROUGE) and human assessments

Strategies for assessing AI-generated code include:

1. Using human-centered evaluation metrics (e.g., ICE-Score) that incorporate both usefulness and functional correctness.
2. Leveraging execution-based evaluation to assess the code's functionality.
3. Providing actionable insights through reverse-generated requirements and SBC scores.
4. Addressing syntactic variations and alternative solutions in generated code.
