In [None]:
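# INSTALLS (run once, in the kernel's environment)
#
# LlamaIndex section:
#   %pip install llama-index-readers-file pymupdf
#   %pip install llama-index-llms-azure-openai
#   %pip install ipykernel
#   %pip install llama-index-llms-ollama

# NOT YET
#   %pip install llama-index-embeddings-huggingface

# Reference: https://docs.llamaindex.ai/en/stable/examples/low_level/oss_ingestion_retrieval/

########

# LangChain section:
#   %pip install langchain langchain-chroma "unstructured[all-docs]" pydantic lxml langchain-community
#   (langchain_ollama is imported below as well; install langchain-ollama if it is missing)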

In [None]:
from unstructured.partition.pdf import partition_pdf
from langchain_ollama import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import uuid
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_core.runnables import RunnablePassthrough
from typing import Any
from backend.util.eval_util import ensure_model_available
from pydantic import BaseModel
from langchain_ollama import OllamaEmbeddings
import csv

In [None]:
# Pull models
# Chat model used by the summarization chain further down.
# ensure_model_available is a project helper (backend.util.eval_util); presumably
# it makes the model available to the local Ollama instance - verify against its
# implementation.
# NOTE(review): confirm that "llama3.1:8b-chat" is a tag that exists in the
# Ollama model library.
llm_model = "llama3.1:8b-chat"
ensure_model_available(llm_model)

In [None]:
# Embedding model handed to OllamaEmbeddings when the Chroma vector store is built.
# ensure_model_available comes from backend.util.eval_util; presumably it makes
# the model available locally - TODO confirm.
embedding_model = "mxbai-embed-large"
ensure_model_available(embedding_model)

## Data Loading

In [None]:
# Location of the source document. The directory and the file are kept apart
# because partition_pdf needs the PDF itself for `filename` and a directory for
# `image_output_dir_path`.
RAG_DATA_DIR = "/workspaces/llm-testing/backend/rag_data"
PATH = f"{RAG_DATA_DIR}/llama2.pdf"

# Get elements
raw_pdf_elements = partition_pdf(
    # PATH already points at the PDF file, so it is passed as-is; appending a
    # second file name to it would produce a path that does not exist.
    filename=PATH,
    # Unstructured first finds embedded image blocks
    extract_images_in_pdf=False,
    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
    # Titles are any sub-section of the document
    infer_table_structure=True,
    # Post processing to aggregate text once we have the title
    chunking_strategy="by_title",
    # Chunking params to aggregate text blocks
    # Attempt to create a new chunk 3800 chars
    # Attempt to keep chunks > 2000 chars
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    # Must be a directory, not the PDF file.
    image_output_dir_path=RAG_DATA_DIR,
)

In [None]:
# Tally how many elements of each concrete class the partitioner returned.
category_counts = {}
for pdf_element in raw_pdf_elements:
    type_label = str(type(pdf_element))
    category_counts[type_label] = category_counts.get(type_label, 0) + 1

# The distinct element classes that were seen.
# A TableChunk appears when a Table exceeds the max chars set during partitioning.
unique_categories = set(category_counts)
category_counts

In [None]:
class Element(BaseModel):
    """A partitioned PDF element reduced to a coarse kind plus its content."""

    # Coarse kind assigned by the categorization loop: "table" or "text".
    type: str
    # The element's content; the loop below stores str(element) here.
    text: Any


# Class-name fragments checked in order; the first fragment found in the
# element's type string decides the kind. Elements matching neither are dropped.
_TYPE_MARKERS = (
    ("unstructured.documents.elements.Table", "table"),
    ("unstructured.documents.elements.CompositeElement", "text"),
)

# Categorize by type
categorized_elements = []
for raw_element in raw_pdf_elements:
    class_name = str(type(raw_element))
    kind = next(
        (label for marker, label in _TYPE_MARKERS if marker in class_name),
        None,
    )
    if kind is not None:
        categorized_elements.append(Element(type=kind, text=str(raw_element)))

# Split into tables and text, then report how many of each were found.
table_elements = [item for item in categorized_elements if item.type == "table"]
text_elements = [item for item in categorized_elements if item.type == "text"]
print(len(table_elements))
print(len(text_elements))

## Multi-vector retriever

In [None]:
# Prompt: a single template is used for both table and text chunks.
prompt_text = (
    "You are an assistant tasked with summarizing tables and text. "
    "Give a concise summary of the table or text. Table or text chunk: {element} "
)
prompt = ChatPromptTemplate.from_template(prompt_text)

# Summary chain: raw chunk -> prompt -> chat model -> plain string.
model = ChatOllama(model=llm_model)
summarize_chain = (
    {"element": lambda chunk: chunk}
    | prompt
    | model
    | StrOutputParser()
)

In [None]:
# Summarize every non-empty text chunk, with at most five requests in flight.
texts = [chunk.text for chunk in text_elements if chunk.text != ""]
text_summaries = summarize_chain.batch(texts, config={"max_concurrency": 5})

In [None]:
# Summarize every table chunk, with at most five requests in flight.
tables = [chunk.text for chunk in table_elements]
table_summaries = summarize_chain.batch(tables, config={"max_concurrency": 5})

## Add to vectorstore

In [None]:
# The vectorstore to use to index the child chunks
vectorstore = Chroma(
    collection_name="summaries",
    embedding_function=OllamaEmbeddings(model=embedding_model),
)

# The storage layer for the parent documents
store = InMemoryStore()  # <- Can we extend this to images
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)


def _index_with_summaries(originals, summaries):
    """Index summaries for search and store the originals they stand for.

    Every original chunk gets a fresh UUID. Its summary is added to the
    retriever's vector store with that UUID under ``id_key`` in the metadata,
    and the original chunk is written to the retriever's docstore under the
    same UUID.

    Args:
        originals: The raw chunks (text or table strings) to keep in the docstore.
        summaries: One summary per original, in the same order.

    Returns:
        A ``(ids, summary_docs)`` tuple: the generated ids and the ``Document``
        objects that were added to the vector store. Both are empty when there
        is nothing to index.

    Raises:
        ValueError: If ``originals`` and ``summaries`` differ in length, since
            pairing them by position would then attach summaries to the wrong
            chunks.
    """
    if len(originals) != len(summaries):
        raise ValueError(
            f"Got {len(originals)} chunks but {len(summaries)} summaries; "
            "they must line up one-to-one."
        )
    # Nothing to index (e.g. a PDF without tables): skip the store calls
    # entirely rather than handing the vector store an empty batch.
    if not originals:
        return [], []

    ids = [str(uuid.uuid4()) for _ in originals]
    summary_docs = [
        Document(page_content=summary, metadata={id_key: chunk_id})
        for chunk_id, summary in zip(ids, summaries)
    ]
    retriever.vectorstore.add_documents(summary_docs)
    retriever.docstore.mset(list(zip(ids, originals)))
    return ids, summary_docs


# Add texts
doc_ids, summary_texts = _index_with_summaries(texts, text_summaries)

# Add tables
table_ids, summary_tables = _index_with_summaries(tables, table_summaries)

## RAG

In [None]:
# Prompt template
template = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# LLM: reuse llm_model, the model that was passed to ensure_model_available in
# the "Pull models" cell, instead of a separate hard-coded tag that no cell in
# this notebook checks for availability.
model = ChatOllama(model=llm_model)

# RAG pipeline: the retriever supplies {context}, the incoming question is
# passed through unchanged as {question}, and the model reply is reduced to a
# plain string.
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [None]:
# Smoke-test the RAG chain with one question; the chain ends in StrOutputParser,
# and the value of this last expression is what the cell displays.
chain.invoke("What is the number of training tokens for LLaMA2?")

## RAG Evaluation

In [None]:
# Evaluation set: hand-written questions about the Llama 2 paper, each paired
# with the answer the RAG chain is expected to produce. Every entry is a dict
# with exactly two keys, "question" and "expected_answer", which the evaluation
# loop reads by name.
questions_and_answers = [
    {"question": "What is Llama 2?", "expected_answer": "Llama 2 is a collection of pretrained and fine-tuned large language models developed by Meta AI, ranging from 7B to 70B parameters."},
    {"question": "What datasets were used to train Llama 2?", "expected_answer": "Publicly available online data excluding Meta's own products or services."},
    {"question": "What size models are included in the Llama 2 release?", "expected_answer": "7B, 13B, and 70B parameter models."},
    {"question": "What is RLHF and how was it used in Llama 2-Chat?", "expected_answer": "Reinforcement Learning with Human Feedback was used after supervised fine-tuning to better align responses with human preferences."},
    {"question": "How does Llama 2 improve over Llama 1?", "expected_answer": "Better data cleaning, 40% more tokens, doubled context length, and Grouped-Query Attention."},
    {"question": "What special technique was introduced to maintain dialogue consistency over multiple turns?", "expected_answer": "Ghost Attention (GAtt)."},
    {"question": "What steps were taken to ensure the safety of Llama 2?", "expected_answer": "Safety tuning, red teaming, safety-specific annotations, and safety reward models."},
    {"question": "What optimizer and learning schedule were used during pretraining?", "expected_answer": "AdamW optimizer with cosine learning rate schedule."},
    {"question": "What benchmarks does Llama 2 outperform its predecessors on?", "expected_answer": "MMLU, BBH, AGI Eval, and other academic benchmarks."},
    {"question": "What was the carbon footprint of training Llama 2 models?", "expected_answer": "539 tCO2eq emissions, 100% offset by Meta."},
    {"question": "Why is open-sourcing Llama 2 significant?", "expected_answer": "It promotes responsible, transparent AI development."},
    {"question": "What type of tokenization was used for Llama 2?", "expected_answer": "Byte-Pair Encoding (BPE) with 32k tokens."},
    {"question": "What are the two main objectives Llama 2-Chat was optimized for during RLHF?", "expected_answer": "Helpfulness and Safety."},
    {"question": "What is the Responsible Use Guide for Llama 2?", "expected_answer": "Guidelines provided to safely deploy Llama 2 and Llama 2-Chat."},
    {"question": "What is rejection sampling in Llama 2 fine-tuning?", "expected_answer": "Sampling multiple outputs and choosing the best using a reward model."},
    {"question": "What major safety evaluation metric was used for Llama 2-Chat?", "expected_answer": "Evaluation on ~2,000 adversarial prompts for safety violations."},
    {"question": "How does Llama 2-Chat compare to GPT-3.5 and GPT-4 on benchmarks?", "expected_answer": "Competitive with GPT-3.5 but still behind GPT-4."},
    {"question": "What is the purpose of training separate helpfulness and safety reward models?", "expected_answer": "To better specialize alignment without trade-offs between objectives."},
    {"question": "How did the authors address dataset contamination?", "expected_answer": "Through analyses to detect and limit overlap between training data and benchmarks."},
    {"question": "What is GQA and why is it important for Llama 2?", "expected_answer": "Grouped-Query Attention improves inference scalability in large models."},
]

In [None]:
# Eval: run every question through the RAG chain and write one CSV row per
# question. The four scoring columns are written empty on every row.
csv_filename = "rag_evaluation_results.csv"
result_columns = [
    "Question",
    "Expected Answer",
    "Generated Answer",
    "Correctness",
    "Relevance",
    "Groundedness",
    "Retrieval relevance",
]

with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=result_columns)
    writer.writeheader()

    # Iterate through questions
    for qa in questions_and_answers:
        question = qa["question"]
        expected_answer = qa["expected_answer"]

        # A failing question must not abort the whole run; its error text is
        # recorded in place of an answer.
        try:
            generated_answer = chain.invoke(question)
        except Exception as exc:
            generated_answer = f"Error: {exc!s}"

        # Start from all-empty columns, then fill in the three known values.
        row = dict.fromkeys(result_columns, "")
        row["Question"] = question
        row["Expected Answer"] = expected_answer
        row["Generated Answer"] = generated_answer
        writer.writerow(row)

print(f"CSV file '{csv_filename}' has been created!")

In [None]:
# Imports
from pathlib import Path
from llama_index.readers.file import PyMuPDFReader
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.ollama import Ollama
from llama_index.core.schema import TextNode

In [None]:
# Azure OpenAI configuration, kept for reference but disabled: fill in the
# placeholders and uncomment to use it in place of the local Ollama model.
# api_key = "<api-key>"
# azure_endpoint = "https://<your-resource-name>.openai.azure.com/"
# api_version = "2023-07-01-preview"

# llm = AzureOpenAI(
#     model="gpt-35-turbo-16k",
#     deployment_name="my-custom-llm",
#     api_key=api_key,
#     azure_endpoint=azure_endpoint,
#     api_version=api_version,
# )
# Local LLM; it is the `llm` passed to index.as_query_engine further down.
llm = Ollama(model="llama2", request_timeout=60.0)
# NOTE(review): Ollama here is the class imported from llama_index.llms.ollama,
# i.e. an LLM wrapper rather than an embedding class, and "BAAI/bge-small-en"
# looks like a Hugging Face model id rather than an Ollama tag. embed_model is
# also not passed to VectorStoreIndex in the cells visible below. Confirm which
# embedding model is intended and wire it in.
embed_model = Ollama(model="BAAI/bge-small-en")

In [4]:
# Data load
# Read the PDF with PyMuPDFReader (imported from llama_index.readers.file).
# NOTE(review): this path is relative, so it resolves against the kernel's
# working directory, whereas the unstructured section above uses an absolute
# path to what appears to be the same file - confirm the working directory.
loader = PyMuPDFReader()
documents = loader.load(file_path="backend/rag_data/llama2.pdf")

In [None]:
# Split the loaded documents into nodes (chunk_size=1024), build a vector index
# over them, and expose the index as a query engine that answers with `llm`.
# NOTE(review): no embed_model is passed to VectorStoreIndex, so whichever
# embedding model llama-index is configured with by default is used - confirm
# this is intended, given that embed_model is defined in an earlier cell.
node_parser = SentenceSplitter(chunk_size=1024)
nodes = node_parser.get_nodes_from_documents(documents)
index = VectorStoreIndex(nodes)
query_engine = index.as_query_engine(llm=llm)

In [None]:
# Test: ask the query engine a single question and show its reply.
query_str = (
    "What is the specific name given to the fine-tuned LLMs optimized for"
    " dialogue use cases?"
)

# str() reduces the query engine's response object to its text form.
generated_answer = str(query_engine.query(query_str))

# A bare trailing expression makes the notebook display the answer; without it
# this test cell would produce no visible output.
generated_answer


In [None]:
# DATASET GENERATION