# Part 1 Building RAG AI Tutor - Using other LLMs and Embedding Models
## Using LLama 3.1 70B on Together.ai

In [69]:
from llama_index.core import Document, VectorStoreIndex
from llama_index.core.evaluation import generate_question_context_pairs
from llama_index.core.evaluation import RetrieverEvaluator, RelevancyEvaluator, FaithfulnessEvaluator, BatchEvalRunner
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.llms.utils import LLM
from llama_index.core.prompts import PromptTemplate
from llama_index.core.schema import MetadataMode, TextNode
from llama_index.core.text_splitter import TokenTextSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
from llama_index.llms.openai import OpenAI
from llama_index.llms.together import TogetherLLM
from llama_index.vector_stores.chroma import ChromaVectorStore
from tqdm import tqdm
from typing import Dict, List, Tuple

import asyncio
import chromadb
import csv
import json
import nest_asyncio
import os
import pprint
import re
import time
import uuid
import warnings

In [22]:
TogetherApiKeyEnvVar='TOGETHER_AI_API_KEY'
assert TogetherApiKeyEnvVar in os.environ

In [3]:
nest_asyncio.apply()

<b>Create a Vector Store</b>

In [6]:
vectorStoreName = "mini-llama-articles"
chromaClient = chromadb.PersistentClient( path=vectorStoreName)
chromaCollection = chromaClient.get_or_create_collection( vectorStoreName)
vectorStore = ChromaVectorStore( chroma_collection=chromaCollection)

<b>Ingest documents into the Vector storage.</b>

In [8]:
!curl -o ./mini-llama-articles.csv "https://raw.githubusercontent.com/AlaFalaki/tutorial_notebooks/main/data/mini-llama-articles.csv"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  169k  100  169k    0     0   363k      0 --:--:-- --:--:-- --:--:--  363k


In [10]:
rows = []

with open("mini-llama-articles.csv", mode="r", encoding="utf-8") as file: 
    csvReader = csv.reader(file)
    for index, row in enumerate( csvReader):
        if 0==index: 
            continue
        rows.append(row)
len(rows)        

14

In [13]:
documents = [Document(text=row[1], 
                      metadata={"title": row[0], "url": row[2], "source_name": row[3]}) 
             for row in rows]
len(documents)

14

In [14]:
textSplitter = TokenTextSplitter( separator=" ", chunk_size=512, chunk_overlap=128)

In [16]:
# Create the pipeline to apply the transformation to the documents i.e. the chunks.
# Stores the nodes into the vector store.
pipeline = IngestionPipeline(
    transformations=[
        textSplitter,
        HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
    ],
    vector_store=vectorStore
)

# Run the pipeline to produce the nodes.
nodes = pipeline.run(documents=documents, show_progress=True)

Parsing nodes:   0%|          | 0/14 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/108 [00:00<?, ?it/s]

In [19]:
type(nodes[0]) # Each node is a TextNode

llama_index.core.schema.TextNode

<b>Creates the query engine, using the LLM and the Vector Store</B>

In [29]:
llm = TogetherLLM( model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", api_key=os.environ[TogetherApiKeyEnvVar])
index = VectorStoreIndex.from_vector_store(vectorStore, 
                                           embed_model="local:BAAI/bge-small-en-v1.5", 
                                           similarity_top_k=2)

queryEngine = index.as_query_engine(llm=llm)

In [34]:
# Test the Query engine
res = queryEngine.query("How many parameters LLaMA2 has?")
print(res.response)

Llama 2 is available in four different model sizes: 7 billion, 13 billion, 34 billion, and 70 billion parameters.


In [35]:
for src in res.source_nodes:
    print("Node ID\\t", src.node_id)
    print("Title\\t", src.metadata['title'])
    print("Score\\t", src.score)
    print("-_"*20)
len(res.source_nodes)    

Node ID\t 07f05b18-77c0-4122-aa1c-7a43d7bf772b
Title\t Meta's Llama 2: Revolutionizing Open Source Language Models for Commercial Use
Score\t 0.6191229753131267
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_


1

<b>Evaluates the retriever.</b>

In [82]:
DEFAULT_QA_GENERATE_PROMPT_TMPL = """
You are a Teacher/Professor. 
Your task is to setup questions for an upcoming examination. 
The questions should be diverse in nature across the document. 
Restrict the questions to the context information provided.
The questions must be about the provided context.
Do not use any other knowledge about the subject.
Questions should be easy to understand.

Context information is below:

{context_str}
"""
print(DEFAULT_QA_GENERATE_PROMPT_TMPL)


You are a Teacher/Professor. 
Your task is to setup questions for an upcoming examination. 
The questions should be diverse in nature across the document. 
Restrict the questions to the context information provided.
The questions must be about the provided context.
Do not use any other knowledge about the subject.
Questions should be easy to understand.

Context information is below:

{context_str}



In [83]:
def myGenerateQuestionContextPairs(
    nodes: List[TextNode],
    llm: LLM,
    qa_generate_prompt_tmpl: str = DEFAULT_QA_GENERATE_PROMPT_TMPL,
    num_questions_per_chunk: int = 2,
    request_delay: float = 2.0
) -> EmbeddingQAFinetuneDataset:
    """Generate examples given a set of nodes with delays between requests."""
    node_dict = {
        node.node_id: node.get_content(metadata_mode=MetadataMode.NONE)
        for node in nodes
    }

    queries = {}
    relevant_docs = {}

    for node_id, text in tqdm(node_dict.items()):
        query = qa_generate_prompt_tmpl.format(
            context_str=text, num_questions_per_chunk=num_questions_per_chunk
        )
        response = llm.complete(query)

        result = str(response).strip().split("\n")
        questions = [
            re.sub(r"^\d+[\).\s]", "", question).strip() for question in result
        ]
        questions = [question for question in questions if len(question) > 0][
            :num_questions_per_chunk
        ]

        num_questions_generated = len(questions)
        if num_questions_generated < num_questions_per_chunk:
            warnings.warn(
                f"Fewer questions generated ({num_questions_generated}) "
                f"than requested ({num_questions_per_chunk})."
            )

        for question in questions:
            question_id = str(uuid.uuid4())
            queries[question_id] = question
            relevant_docs[question_id] = [node_id]

        time.sleep(request_delay)

    return EmbeddingQAFinetuneDataset(
        queries=queries, corpus=node_dict, relevant_docs=relevant_docs
    )


In [84]:
evalDatasetFile='rag_eval_dataset_new.json'
llmOai = OpenAI(model="gpt-4o-mini")
if os.path.exists(evalDatasetFile):
    print(f'Eval dataset {evalDatasetFile} exists')
    ragEvalDataset = EmbeddingQAFinetuneDataset.from_json( evalDatasetFile)
else:
    print(f'Eval dataset {evalDatasetFile} does not exist: creating it.')
    ragEvalDataset = myGenerateQuestionContextPairs(nodes[:10],
                                                    llm=llmOai,
                                                    num_questions_per_chunk=1)

    # We can save the evaluation dataset as a json file for later use.
    ragEvalDataset.save_json(evalDatasetFile)
    print(f'Created {evalDatasetFile}.')

Eval dataset rag_eval_dataset_new.json does not exist: creating it.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [01:32<00:00,  9.28s/it]

Created rag_eval_dataset_new.json.





In [66]:
async def run_evaluation(index, rag_eval_dataset, top_k_values, llm_judge, llm, n_queries_to_evaluate=20,num_work=1):
    evaluation_results = {}

    # ------------------- MRR and Hit Rate -------------------

    for top_k in top_k_values:
        # Get MRR and Hit Rate
        retriever = index.as_retriever(similarity_top_k=top_k)
        retriever_evaluator = RetrieverEvaluator.from_metric_names(
            ["mrr", "hit_rate"], retriever=retriever
        )
        eval_results = await retriever_evaluator.aevaluate_dataset(rag_eval_dataset)
        avg_mrr = sum(res.metric_vals_dict["mrr"] for res in eval_results) / len(eval_results)
        avg_hit_rate = sum(res.metric_vals_dict["hit_rate"] for res in eval_results) / len(eval_results)

        # Collect the evaluation results
        evaluation_results[f"mrr_@_{top_k}"] = avg_mrr
        evaluation_results[f"hit_rate_@_{top_k}"] = avg_hit_rate

    # ------------------- Faithfulness and Relevancy -------------------

    # Extract the questions from the dataset
    queries = list(rag_eval_dataset.queries.values())
    batch_eval_queries = queries[:n_queries_to_evaluate]

    # Initiate the faithfulnes and relevancy evaluator objects
    faithfulness_evaluator = FaithfulnessEvaluator(llm=llm_judge)
    relevancy_evaluator = RelevancyEvaluator(llm=llm_judge)

    # The batch evaluator runs the evaluation in batches
    runner = BatchEvalRunner(
        {
            "faithfulness": faithfulness_evaluator,
            "relevancy": relevancy_evaluator
        },
        workers=num_work,
        show_progress=True,
    )

    # Get faithfulness and relevancy scores
    query_engine = index.as_query_engine(llm=llm)
    eval_results = await runner.aevaluate_queries(
        query_engine, queries=batch_eval_queries
    )
    faithfulness_score = sum(result.passing for result in eval_results['faithfulness']) / len(eval_results['faithfulness'])
    relevancy_score = sum(result.passing for result in eval_results['relevancy']) / len(eval_results['relevancy'])
    evaluation_results["faithfulness"] = faithfulness_score
    evaluation_results["relevancy"] = relevancy_score

    return evaluation_results

In [67]:
numQueriesToEvaluate=10

In [73]:
# top_k_values = [2, 4, 6, 8, 10]
top_k_values = [4]

llmJudge = OpenAI(temperature=0, model="gpt-4o")

evaluation_results = await run_evaluation(index, 
                                          ragEvalDataset, 
                                          top_k_values, 
                                          llmJudge,                                          
                                          llm=llm,
                                          n_queries_to_evaluate=numQueriesToEvaluate,
                                          num_work=1)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:58<00:00,  5.85s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:15<00:00,  1.32it/s]


In [74]:
pprint.pprint(evaluation_results, width=20)

{'faithfulness': 1.0,
 'hit_rate_@_4': 0.0,
 'mrr_@_4': 0.0,
 'relevancy': 0.9}


<pre>
When k=2
{'faithfulness': 0.9,
 'hit_rate_@_2': 0.0,
 'mrr_@_2': 0.0,
 'relevancy': 0.9}
</pre> 