In [52]:
# prompt: code for ingesting data from google drive

from google.colab import drive

# Mount your Google Drive to access files
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [53]:
import json
from pathlib import Path
from pprint import pprint


file_path = '/content/drive/MyDrive/News Article/random_articles.json'
data = json.loads(Path(file_path).read_text())

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [2]:
pip install langchain_community langchain-huggingface datasets langchain_cohere

Collecting langchain_community
  Downloading langchain_community-0.2.5-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-huggingface
  Downloading langchain_huggingface-0.0.3-py3-none-any.whl (17 kB)
Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain_cohere
  Downloading langchain_cohere-0.1.8-py3-none-any.whl (31 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl (28 kB)
Collecting langchain<0.3.0,>=0.2.5 (from langchain_community)
  Downloading langchain-0.2.5-py3-none-any.whl (974 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.6/974.6 kB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollect

# Preparing Dataset

In [89]:
# Importing All Libraries
import re
import pandas as pd
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEndpoint
from langchain.docstore.document import Document as LangchainDocument
import datasets
import random
from tqdm.auto import tqdm


In [6]:
keywords = ["israel","hamas","gaza"]

def clean_text(text):
  text=re.sub(r'\W+'," ",text)
  text=text.lower()
  return text

def is_relevant(text):
  return any(keyword in text for keyword in keywords)

In [7]:
filtered_articles=[]
y_train=[]
news=[]

for item in data:
  article_body = clean_text(item['articleBody'])
  article_title = clean_text(item['title'])

  if is_relevant(article_body) or is_relevant(article_title):
    filtered_articles.append(item)
    news.append(article_title)
    y_train.append(1)
  else:
    news.append(article_title)
    y_train.append(0)

In [9]:
type(filtered_articles)

list

In [None]:
# prompt: randomly picking 2500 articles from filtered_articles list

import random

random_articles = random.sample(filtered_articles, 2500)

# Optional: Print the randomly selected articles
for article in random_articles:
  print(article['title'])


In [None]:
# prompt: how to save these random articles

import json

with open('/content/drive/MyDrive/News Article/random_articles.json', 'w') as f:
  json.dump(random_articles, f)


In [None]:
# prompt: loading this randomarticle.json

import json

with open('/content/random_articles.json', 'r') as f:
  random_articles = json.load(f)

# Optional: Print the loaded articles
for article in random_articles:
  print(article['title'])


In [13]:
langchain_docs = [LangchainDocument(page_content=doc['articleBody'],metadata={"source":doc['source'],"title":doc["title"]}) for doc in random_articles]

# Preparing eval dataset

In [11]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500,chunk_overlap=200,add_start_index=True,separators=["\n","\n\n",".",""," "])

In [14]:
docs_processed=[]

for doc in langchain_docs:
  docs_processed += text_splitter.split_documents([doc])

In [15]:
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    max_new_tokens=1000,
    do_sample=False,
)

In [16]:
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""

In [17]:
def generate_response(llm,prompt):
  return llm(prompt)

In [88]:
import random
from tqdm.auto import tqdm

N_GENERATIONS = 100

print(f"Generating {N_GENERATIONS} factoid questions...")
outputs=[]

for sampled_context in tqdm(random.sample(docs_processed, N_GENERATIONS)):
  prompt = QA_generation_prompt.format(context=sampled_context.page_content)
  response = llm(prompt)
  question = response.split("Factoid question:")[1].split("Answer:")[0].strip()
  answer = response.split("Answer:")[1].strip()
  outputs.append({
      "question":question,
      "answer":answer,
      "context":sampled_context.page_content,
      "source":sampled_context.metadata["source"],
      "title":sampled_context.metadata["title"]
      })




Generating 100 factoid questions...


NameError: name 'docs_processed' is not defined

In [19]:
display(pd.DataFrame(outputs))

Unnamed: 0,question,answer,context,source,title
0,What kind of vessel was hit by a projectile ne...,"The Number 9, a 4,253-teu container ship.",At least two commercial vessels and a US warsh...,https://www.tradewindsnews.com/,"Houthis step up Red Sea attacks, targeting str..."
1,What is the new border strategy that the Biden...,The new border strategy that the Biden adminis...,Biden considering expelling migrants without a...,https://www.dailymail.co.uk/,Biden considering expelling migrants without a...
2,How many hours of research could be saved by u...,The context does not provide a specific number...,How well do you really know your competitors? ...,https://www.verdict.co.uk/,Authors accuse Meta of ‘knowingly’ training AI...
3,What is Michael Douglas' religious identity?,Michael Douglas identifies as a Reform Jew.,Anu Kuruvilla By\n\nExpress News Service\n\nKO...,https://www.newindianexpress.com/,"Hollywood stars Michael Douglas, Catherine Zet..."
4,What is the size of the biggest tunnel found i...,The biggest tunnel found in Gaza is twice the ...,While the military was aware that Hamas had an...,https://thefederal.com/,Israel finds large tunnel adjacent to Gaza bor...
...,...,...,...,...,...
95,What is the registration process to read full ...,The registration process to read full articles...,"Dear Reader,\n\nThis section is about Living i...",https://gulfnews.com/,UN expert says Israel has committed genocide i...
96,What did Pope Francis tell President Isaac Her...,Pope Francis told President Isaac Herzog durin...,Pope Francis held a tense phone call with Pres...,https://www.timesofisrael.com/,Report: Pope told Herzog last month Israel can...
97,Who reclaimed and reanimated the Hairpin websi...,"Nebojša Vujinović Vujo, a Serbian DJ-turned-en...",The platforms carry fabricated reports about p...,https://www.hindustantimes.com/,The rise of merchants of menace: What will the...
98,What action is the US considering against the ...,The US is considering striking the Houthis.,Top officials in Washington are actively weigh...,https://www.wionews.com/,US weighs options to retaliate against Houthi ...


## Setting up critique agents

In [20]:
question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

In [21]:
question_standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independant this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.

For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independant from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

In [22]:
print("Generating critique for each QA couple")

for output in tqdm(outputs):
  evaluations = {
      "question_groundedness":generate_response(
          llm,question_groundedness_critique_prompt.format(context=output["context"],question=output["question"])
          ),
      "question_standalone":generate_response(
          llm,question_standalone_critique_prompt.format(question=output["question"])
      )
  }
  try:
    for criterion, evaluation in evaluations.items():
      score,eval = (
          int(evaluation.split("Total rating: ")[-1].strip()),
          evaluation.split("Total rating: ")[-2].split("Evaluation: ")[1]
      )
      output.update(
          {
              f"{criterion}_score":score,
              f"{criterion}_evaluation":eval
          }
      )
  except Exception as e:
    print(e)
    continue


Generating critique for each QA couple


  0%|          | 0/100 [00:00<?, ?it/s]

invalid literal for int() with base 10: "5\nEvaluation: The question is clearly and unambiguously answerable with the provided context. The context states that the Biden administration is signaling to Congress that they are open to supporti
invalid literal for int() with base 10: 'The question refers to a specific document written by a specific entity (Israeli Ministry of Intelligence) about a specific event (the transfer of Palestinian civilians at the end of the war in Gaza)
list index out of range
invalid literal for int() with base 10: '3.5 (Since the question is relatively context-independent, but it does depend on some specific information about the person making the statement.)'
invalid literal for int() with base 10: '2.5'
invalid literal for int() with base 10: '5.0'
invalid literal for int() with base 10: '3\n\nEvaluation: While the context does mention that the high school will be remodeled and enlarged, it does not provide a specific amount for what taxpayers should expect 

In [23]:
df_outputs=pd.DataFrame(outputs)

In [24]:
df_outputs = df_outputs.dropna()

In [25]:
df_outputs.shape

(92, 9)

In [26]:
filtered_df_outputs = df_outputs.loc[
    (df_outputs["question_groundedness_score"] >= 4)
    & (df_outputs["question_standalone_score"] >= 4)
]

In [27]:
filtered_df_outputs.shape

(35, 9)

In [None]:
filtered_df_outputs.head()

In [30]:
df = pd.read_csv('qa_data.csv')

In [32]:
df.drop(columns=['Unnamed: 0'],inplace=True)

In [34]:
# load qa_data.csv and merge the currently constructede df to it

merged_df = pd.concat([df, filtered_df_outputs], ignore_index=True)


In [35]:
# save the merged data
merged_df.to_csv('qa_data.csv')

In [58]:
qa_data = pd.read_csv('qa_data.csv')

In [59]:
qa_data.drop(columns=["Unnamed: 0"],inplace=True)

In [60]:
eval_dataset = datasets.Dataset.from_pandas(qa_data, split="train", preserve_index=False)

In [61]:
eval_dataset

Dataset({
    features: ['question', 'answer', 'context', 'source', 'title', 'question_groundedness_score', 'question_groundedness_evaluation', 'question_standalone_score', 'question_standalone_evaluation'],
    num_rows: 91
})

In [62]:
from google.colab import userdata
import os
cohere_key=userdata.get('COHERE_API_KEY')
os.environ['COHERE_API_KEY']=cohere_key

In [76]:
from langchain_cohere import CohereEmbeddings
# embeddings = CohereEmbeddings(model="embed-english-light-v3.0")

In [11]:
metadata = [
    {key: value for key, value in item.items() if key not in ['articleBody','scrapedDate','dateModified']}
    for item in random_articles
]

In [12]:
article_bodies = [item['articleBody'] for item in random_articles]

In [12]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500,chunk_overlap=200,add_start_index=True,separators=["\n","\n\n",".",""," "])

In [18]:
texts = text_splitter.create_documents(texts=article_bodies,metadatas=metadata)

In [34]:
question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

In [35]:
question_standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independant this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.

For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independant from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

In [None]:
print("Generating critique for each QA couple")

for output in tqdm(outputs):
  evaluations = {
      "question_groundedness":generate_response(
          llm,question_groundedness_critique_prompt.format(context=output["context"],question=output["question"])
          ),
      "question_standalone":generate_response(
          llm,question_standalone_critique_prompt.format(question=output["question"])
      )
  }
  try:
    for criterion, evaluation in evaluations.items():
      score,eval = (
          int(evaluation.split("Total rating: ")[-1].strip()),
          evaluation.split("Total rating: ")[-2].split("Evaluation: ")[1]
      )
      output.update(
          {
              f"{criterion}_score":score,
              f"{criterion}_evaluation":eval
          }
      )
  except Exception as e:
    print(e)
    continue


# RAG SYSTEM

In [63]:
from typing import Optional,List,Tuple

In [64]:
RAW_KNOWLEDGE_BASE = [
    LangchainDocument(page_content=doc['articleBody'],metadata={"source":doc['source'],"title":doc["title"]}) for doc in random_articles
]

In [82]:
def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name:str
):
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=chunk_size,chunk_overlap=int(chunk_size/10),add_start_index=True,separators=["\n","\n\n",".",""," "]
      )

  docs_processed=[]

  for doc in knowledge_base:
    docs_processed += text_splitter.split_documents([doc])

  unique_texts = {}
  docs_processed_unique = []

  for doc in docs_processed:
    if doc.page_content not in unique_texts:
      unique_texts[doc.page_content]=True
      docs_processed_unique.append(doc)

  return docs_processed_unique



# Retriever-Embedding

In [84]:
pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [66]:
from langchain.vectorstores import FAISS
from langchain.embeddings import CohereEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_cohere import CohereEmbeddings
import os

In [67]:
def load_embeddings(
    langchain_docs: List[LangchainDocument],
    chunk_size:int,
    embeddings_model: Optional[str]="thenlper/get-small",
)->FAISS:

  embedding_model = CohereEmbeddings(model=embeddings_model)
  index_name = f"index_chunk:{chunk_size}_embeddings:{embeddings_model.replace('/','~')}"
  index_folder_path = f"./data/indexes/{index_name}"

  if os.path.isdir(index_folder_path):
    return FAISS.load_local(
        index_folder_path,
        embedding_model,
        distance_strategy=DistanceStrategy.COSINE
    )
  else:
    docs_processed = split_documents(
        chunk_size,
        langchain_docs,
        embeddings_model
    )

    knowledge_index = FAISS.from_documents(
        docs_processed,
        embedding_model,
        distance_strategy=DistanceStrategy.COSINE
    )
    knowledge_index.save_local(index_folder_path)
    return knowledge_index

# READER_LLM

In [68]:
READER_LLM = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    max_new_tokens=1000,
    do_sample=False,
)

In [69]:
RAG_PROMPT_TEMPLATE = """
<|system|>
Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.</s>
<|user|>
Context:
{context}
---
Now here is the question you need to answer.

Question: {question}
</s>
<|assistant|>
"""

In [70]:
def answer_with_rag(
    question:str,
    llm,
    knowledge_index,
    reranker=None,
    num_retrieved_docs:int=10,
    num_docs_final:int=3
)->Tuple[str,List[LangchainDocument]]:

  relevant_docs = knowledge_index.similarity_search(question,k=num_retrieved_docs)
  relevant_docs = [doc.page_content for doc in relevant_docs]

  if reranker:
    pass

  relevant_docs = relevant_docs[:num_docs_final]

  context = "\nExtracted documents:\n"
  context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs)])

  final_prompt = RAG_PROMPT_TEMPLATE.format(context=context,question=question)

  answer = llm(final_prompt)

  return answer,relevant_docs





# Benchmarking RAG

In [94]:
def run_rag_tests(
    eval_dataset: datasets.Dataset,
    llm,
    knowledge_index,
    output_file:str,
    reranker=None,
    verbose: Optional[bool]=True,
    test_settings:Optional[str]=None
):
  try:
    with open(output_file,"r") as f:
      outputs = json.load(f)
  except:
    outputs=[]

  for example in tqdm(eval_dataset):
    question = example["question"]
    if question in [output["question"] for output in outputs]:
      continue

    answer, relevant_docs = answer_with_rag(
        question,
        llm,
        knowledge_index,
        reranker=reranker
    )
    if verbose:
      print("=================================================================================")
      print(f"Question: {question}")
      print(f"Answer: {answer}")
      print(f"True answer: {example['answer']}")

    result = {
        "question":question,
        "true_answer":example["answer"],
        "source_doc":example["source"],
        "generated_answer":answer,
        "retrieved_docs":[doc for doc in relevant_docs]
    }

    if test_settings:
      result["test_settings"] = test_settings

    outputs.append(result)

    with open(output_file,"w") as f:
      json.dump(outputs,f)

In [72]:
EVALUATION_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:"""

In [73]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate
)

from langchain.schema import SystemMessage

In [97]:
evaluation_prompt_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(content="You are a fair evaluator language model."),
        HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT)
    ]
)

In [75]:
%pip install -qU langchain-groq

In [46]:
from langchain_groq import ChatGroq

In [50]:
from google.colab import userdata
import os
groq_key=userdata.get('GROQ_API_KEY')
os.environ['GROQ_API_KEY']=groq_key

In [77]:
eval_chat_model=ChatGroq(
    temperature=0,
    model="llama3-70b-8192"
)
evaluator_name="Groq_llama"

In [95]:
def evaluate_answers(
    answer_path:str,
    eval_chat_model,
    evaluator_name,
    evaluation_prompt_template:ChatPromptTemplate
)->None:
  answers = []

  if os.path.isfile(answer_path):
    answers = json.load(open(answer_path,"r"))

  for experiment in tqdm(answers):
    if f"eval_score_{evaluator_name}" in experiment:
      continue
    eval_prompt = evaluation_prompt_template.format_messages(
        instruction = experiment["question"],
        response = experiment["generated_answer"],
        reference_answer = experiment["true_answer"]
    )
    eval_result = eval_chat_model(eval_prompt)
    feedback,score = [item.strip() for item in eval_result.content.split("[RESULT]")]
    experiment[f"eval_score_{evaluator_name}"] = score
    experiment[f"eval_feedback_{evaluator_name}"] = feedback


    with open(answer_path,"w") as f:
      json.dump(answers,f)

In [96]:
knowledge_index = load_embeddings(
    RAW_KNOWLEDGE_BASE,
    chunk_size=2500,
    embeddings_model="embed-english-light-v3.0"
)

In [99]:
if not os.path.isdir("./output"):
  os.mkdir("./output")

for chunk_size in [2500]:
  for embeddings in ["embed-english-light-v3.0"]:
    for rerank in [False]:
      settings_name = f"chunk_size:{chunk_size}_embeddings:{embeddings.replace('/','~')}_rerank:{rerank}_readerModel:{READER_LLM}"
      output_file_name = f"./output/rag_{settings_name}.json"

      print(f"Running RAG with settings: {settings_name}")

      print("Loading knowledge base emeddings...")

      # knowledge_index = load_embeddings(
      #     RAW_KNOWLEDGE_BASE,
      #     chunk_size=chunk_size,
      #     embeddings_model=embeddings
      # )

      print("Running RAG...")

      reranker = None

      run_rag_tests(
          eval_dataset=eval_dataset,
          llm=READER_LLM,
          knowledge_index=knowledge_index,
          output_file=output_file_name,
          reranker=reranker,
          verbose=False,
          test_settings=settings_name
      )

      print("Running evaluation...")
      evaluate_answers(
          output_file_name,
          eval_chat_model,
          evaluator_name,
          evaluation_prompt_template
      )




Running RAG with settings: chunk_size:2500_embeddings:embed-english-light-v3.0_rerank:False_readerModel:[1mHuggingFaceEndpoint[0m
Params: {'endpoint_url': None, 'task': 'text-generation', 'model_kwargs': {}}
Loading knowledge base emeddings...
Running RAG...


  0%|          | 0/91 [00:00<?, ?it/s]

Running evaluation...


  0%|          | 0/90 [00:00<?, ?it/s]

  warn_deprecated(


# inspect result

In [100]:
import glob

In [110]:
outputs = []
for file in glob.glob("./output/*.json"):
  output = pd.DataFrame(json.load(open(file,"r")))
  output["settings"] = file
  outputs.append(output)

result = pd.concat(outputs)

In [111]:
result.head()

Unnamed: 0,question,true_answer,source_doc,generated_answer,retrieved_docs,test_settings,eval_score_Groq_llama,eval_feedback_Groq_llama,settings
0,What type of guidance does the PAC-3 missile u...,The PAC-3 missile uses Track-Via-Missile (TVM)...,https://www.washingtonpost.com/,The PAC-3 missile uses Track-Via-Missile (TVM)...,[. Some newer models of these systems have dif...,chunk_size:2500_embeddings:embed-english-light...,5,Feedback: The response accurately states that ...,./output/rag_chunk_size:2500_embeddings:embed-...
1,"Who does Francesca admire for taking tangible,...",Francesca admires artists who not only speak o...,https://thefortyfive.com/,"Francesca Albanese admires Ancel Langwa, who i...",[To the tens of thousands of people who follow...,chunk_size:2500_embeddings:embed-english-light...,1,Feedback: The response is not accurate and fac...,./output/rag_chunk_size:2500_embeddings:embed-...
2,What company provides the most comprehensive C...,GlobalData,https://www.energymonitor.ai/,The company that provides the most comprehensi...,[How well do you really know your competitors?...,chunk_size:2500_embeddings:embed-english-light...,4,"Feedback: The response is mostly correct, accu...",./output/rag_chunk_size:2500_embeddings:embed-...
3,What has the Gaza Health Ministry stated about...,Hospitals in southern Gaza have collapsed and ...,https://www.democracynow.org/,The Gaza Health Ministry stated that the two ...,"[Health care, including attacks (Gaza Strip)\n...",chunk_size:2500_embeddings:embed-english-light...,4,Feedback: The response provides specific detai...,./output/rag_chunk_size:2500_embeddings:embed-...
4,How many people have been driven from their ho...,Nearly 85% of the territory’s 2.3 million peop...,https://nagalandpost.com/,"According to the context, 1.87 million people ...",[The assault into the south is triggering a ne...,chunk_size:2500_embeddings:embed-english-light...,4,Feedback: The response provides a specific num...,./output/rag_chunk_size:2500_embeddings:embed-...


In [112]:
result["eval_score_Groq_llama"] = result["eval_score_Groq_llama"].apply(lambda x: int(x) if isinstance(x,str) else 1)
result["eval_score_Groq_llama"] = (result["eval_score_Groq_llama"]-1)/4

In [113]:
result.head()

Unnamed: 0,question,true_answer,source_doc,generated_answer,retrieved_docs,test_settings,eval_score_Groq_llama,eval_feedback_Groq_llama,settings
0,What type of guidance does the PAC-3 missile u...,The PAC-3 missile uses Track-Via-Missile (TVM)...,https://www.washingtonpost.com/,The PAC-3 missile uses Track-Via-Missile (TVM)...,[. Some newer models of these systems have dif...,chunk_size:2500_embeddings:embed-english-light...,1.0,Feedback: The response accurately states that ...,./output/rag_chunk_size:2500_embeddings:embed-...
1,"Who does Francesca admire for taking tangible,...",Francesca admires artists who not only speak o...,https://thefortyfive.com/,"Francesca Albanese admires Ancel Langwa, who i...",[To the tens of thousands of people who follow...,chunk_size:2500_embeddings:embed-english-light...,0.0,Feedback: The response is not accurate and fac...,./output/rag_chunk_size:2500_embeddings:embed-...
2,What company provides the most comprehensive C...,GlobalData,https://www.energymonitor.ai/,The company that provides the most comprehensi...,[How well do you really know your competitors?...,chunk_size:2500_embeddings:embed-english-light...,0.75,"Feedback: The response is mostly correct, accu...",./output/rag_chunk_size:2500_embeddings:embed-...
3,What has the Gaza Health Ministry stated about...,Hospitals in southern Gaza have collapsed and ...,https://www.democracynow.org/,The Gaza Health Ministry stated that the two ...,"[Health care, including attacks (Gaza Strip)\n...",chunk_size:2500_embeddings:embed-english-light...,0.75,Feedback: The response provides specific detai...,./output/rag_chunk_size:2500_embeddings:embed-...
4,How many people have been driven from their ho...,Nearly 85% of the territory’s 2.3 million peop...,https://nagalandpost.com/,"According to the context, 1.87 million people ...",[The assault into the south is triggering a ne...,chunk_size:2500_embeddings:embed-english-light...,0.75,Feedback: The response provides a specific num...,./output/rag_chunk_size:2500_embeddings:embed-...


In [105]:
average_scores = result.groupby("settings")["eval_score_Groq_llama"].mean()
average_scores.sort_values()

settings
./output/rag_chunk_size:2500_embeddings:embed-english-light-v3.0_rerank:False_readerModel:[1mHuggingFaceEndpoint[0m\nParams: {'endpoint_url': None, 'task': 'text-generation', 'model_kwargs': {}}.json    0.694444
Name: eval_score_Groq_llama, dtype: float64

In [109]:
average_scores.head()

settings
./output/rag_chunk_size:2500_embeddings:embed-english-light-v3.0_rerank:False_readerModel:[1mHuggingFaceEndpoint[0m\nParams: {'endpoint_url': None, 'task': 'text-generation', 'model_kwargs': {}}.json    0.694444
Name: eval_score_Groq_llama, dtype: float64