Author: Ghana Ahmada
---

---



# Initialization

## Install Dependencies

In [123]:
!pip install colbert-ai ragas gdown langchain huggingface-hub langchain-community langchain-huggingface -q

## Import Library

In [124]:
import os
import gdown
import zipfile
import logging
from tqdm import tqdm
from genericpath import isdir

import pandas as pd

import colbert
from colbert import Indexer, Searcher
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection

from openai import OpenAI
from openai import ChatCompletion
from google.colab import userdata

from typing import Any, List, Optional, Sequence

from datasets import Dataset
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever
from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun, Callbacks

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFaceEndpoint

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

import warnings
warnings.filterwarnings("ignore")

## Download Dataset & Index

In [126]:
def check_dir(dir_name: str) -> bool:
    return os.path.isdir(dir_name)

def download_data(url, filename, dir_name: str = "data", is_zipped=True) -> None:
    if not check_dir(dir_name):
        os.mkdir(dir_name)
    os.chdir(dir_name)
    logging.info("Downloading data....")
    gdown.download(
        url, quiet=False
    )
    if is_zipped:
        logging.info("Extracting zip file....")
        with zipfile.ZipFile(f"{filename}.zip", 'r') as zip_ref:
            zip_ref.extractall(filename)
        os.remove(f"{filename}.zip")
    os.chdir("..")


# download question and context dataset
download_data(url="https://drive.google.com/uc?&id=16EN4yCMdwJ1i1l3bZQFkry6CixMGFj68",
              filename="bio_dataset",
              dir_name="data",
              is_zipped=False)

# download question, context, and gpt answer
download_data(url="https://drive.google.com/uc?&id=1ENbYP-gr0LvfYAuGWF-wgMta7zKIk0lE",
              filename="bio_dataset_with_ground_truth",
              dir_name="data",
              is_zipped=False)

# download indexed KB
download_data(url="https://drive.google.com/uc?&id=1n-KkrUpfVncfP_vbsyYMgCzGhclVTu8i",
              filename="experiments",
              dir_name="data",
              is_zipped=True)

Downloading...
From: https://drive.google.com/uc?&id=16EN4yCMdwJ1i1l3bZQFkry6CixMGFj68
To: /content/data/bio_dataset.csv
100%|██████████| 1.42M/1.42M [00:00<00:00, 61.0MB/s]
Downloading...
From: https://drive.google.com/uc?&id=1ENbYP-gr0LvfYAuGWF-wgMta7zKIk0lE
To: /content/data/bio_dataset_with_ground_truth.csv
100%|██████████| 170k/170k [00:00<00:00, 49.8MB/s]
Downloading...
From: https://drive.google.com/uc?&id=1n-KkrUpfVncfP_vbsyYMgCzGhclVTu8i
To: /content/data/experiments.zip
100%|██████████| 1.17M/1.17M [00:00<00:00, 105MB/s]


In [161]:
df = pd.read_csv("/content/data/bio_dataset_with_ground_truth.csv")
df["grount_truth"] = df["grount_truth"].apply(process_text)
df.rename(columns={"grount_truth": "ground_truth"}, inplace=True)

# ColBERT

In [127]:
with Run().context(RunConfig(experiment='/content/data/experiments/experiments/msmarco')):
    searcher = Searcher(index="heyhi", checkpoint="ghanahmada/biology-mcolbert")

[Sep 08, 21:16:57] #> Loading codec...
[Sep 08, 21:16:57] #> Loading IVF...
[Sep 08, 21:16:57] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 577.89it/s]

[Sep 08, 21:16:57] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 271.27it/s]


In [128]:
query_en = "What is the optimum temperature for most human enzymes to be most active?"
query_id = "Berapa suhu optimum untuk sebagian besar enzim manusia agar paling aktif?"
print(f"#> {query_id}")

results = searcher.search(query_id, k=3)
for passage_id, passage_rank, passage_score in zip(*results):
    print(f"\t [{passage_rank}] \t\t {passage_score:.1f} \t\t {searcher.collection[passage_id]}")

#> Berapa suhu optimum untuk sebagian besar enzim manusia agar paling aktif?

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: Berapa suhu optimum untuk sebagian besar enzim manusia agar paling aktif?, 		 True, 		 None
#> Output IDs: torch.Size([64]), tensor([  101,   100, 14321, 53750, 10113, 84390, 10303, 28184, 10465, 10782,
        39564, 15096, 36894, 27777, 36769, 19958, 45861,   136,   102,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103])
#> Output Mask: torch.Size([64]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 

## 1.1. Generate relevance context

In [163]:
contexts = []

for query in tqdm(df["question"].tolist(), total=len(df)):
    results = searcher.search(query, k=1)
    for passage_id, passage_rank, passage_score in zip(*results):
        contexts.append(searcher.collection[passage_id])

100%|██████████| 192/192 [01:52<00:00,  1.71it/s]


# 2. Evaluation

In [None]:
qna_data = pd.read_csv("/content/data/bio_dataset.csv")
truth_context_question = qna_data[qna_data["label"] == 1]
truth_context_question["header"].values[2]

'What is the optimum temperature for most human enzymes to be most active?'

## 2.1. Creating ground truth with GPT4o

In [None]:
def ask(question: str, context: str):
    client = OpenAI(
        api_key=userdata.get('openai_key')
    )
    response: ChatCompletion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
          {"role": "system",
            "content": """You are a professional annotator who is answering a biology question given a context.
                          Your task is to answer educational questions based on the given material that can be answered directly using information from the text.
                          The answer should be clear, concise, and suitable for primary school students.
                          """},
          {"role": "user",
            "content": f"""Understand this question:
                          {question}

                          Then, think step by step about this context to find the answer for previous question:
                          {context}

                          Then, answer the question using this exactly format.

                            [A]: answer
                    """},
        ]
        )
    return [answer.message.content for answer in response.choices]

In [None]:
dct = {"question": [], "context": [], "grount_truth": []}

for _, row in tqdm(truth_context_question.iterrows()):
    dct["question"].append(row["header"])
    dct["context"].append(row["content"])
    dct["grount_truth"].append(ask(row["header"], row["content"]))

df = pd.DataFrame(dct)

192it [06:59,  2.19s/it]


In [140]:
def process_text(text: str):
    return text.replace("[A]: ", "").replace("[", "").replace("]", "").strip()

# df.to_csv("bio_dataset_with_ground_truth.csv", index=False)

In [141]:
df_ground_truth = df.copy()
df_ground_truth

Unnamed: 0,question,context,ground_truth
0,Why is an enzyme less active at low temperatures?,Optimum:mostfavourableAn enzyme is less active...,'An enzyme is less active at low temperatures ...
1,What happens to the kinetic energy of molecule...,Optimum:mostfavourableAn enzyme is less active...,"'As the temperature increases, the kinetic ene..."
2,What is the optimum temperature for most human...,Optimum:mostfavourableAn enzyme is less active...,"'For most human enzymes, the optimum temperatu..."
3,"What are enzymes, and where are they usually f...",Biological catalysts are large biological mole...,'Enzymes are large biological molecules that a...
4,What happens when potassium chlorate(VII) is h...,Biological catalysts are large biological mole...,'When potassium chlorate(VII) is heated with m...
...,...,...,...
187,In which direction do water molecules move dur...,• The term ‘water potential’ is always used in...,'Water molecules move from a dilute solution t...
188,What separates the two solutions of different ...,• The term ‘water potential’ is always used in...,'A partially permeable membrane separates the ...
189,What are the substances called that enzymes ac...,The substances on which enzymes act are called...,'The substances that enzymes act on are called...
190,"In the 'lock-and-key' hypothesis, what part of...",The substances on which enzymes act are called...,"""In the 'lock-and-key' hypothesis, the substra..."


## 2.2. RAG Pipeline to generate answer

In [145]:
llm = HuggingFaceEndpoint(
                repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
                huggingfacehub_api_token="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",
                max_new_tokens=512,
                top_k=10,
                top_p=0.95,
                typical_p=0.95,
                temperature=0.01,
                repetition_penalty=1.03,
                streaming=True,
                return_full_text=True,
            )


prompt = ChatPromptTemplate.from_template(
      """
      Jawab pertanyaan berikut sebagai guru sesuai dengan bahasa yang digunakan dalam pertanyaan.

      <context>
      {context}
      </context>

      Question: {input}

      Note: Answer in the same language as the question.
      """

      )

class ColBERTLangChainRetriever(BaseRetriever):
    model: Any
    kwargs: dict = {}

    def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun,
    ) -> List[Document]:
        """Get documents relevant to a query using the ColBERT searcher."""

        results = self.model.search(query, **self.kwargs)

        documents = [
            Document(
                page_content=self.model.collection[passage_id],
                metadata={"passage_id": passage_id, "rank": passage_rank, "score": passage_score}
            )
            for passage_id, passage_rank, passage_score in zip(*results)
        ]

        return documents


def as_langchain_retriever(model, **kwargs: Any) -> BaseRetriever:
    return ColBERTLangChainRetriever(model=model, kwargs=kwargs)

def clean_text(text: str) -> str:
    return text.replace("<|eot_id|>", "").strip()

def generate_answer(query: str, llm, prompt):
    searcher_langchain = as_langchain_retriever(model=searcher, k=1)
    document_chain = create_stuff_documents_chain(llm, prompt)
    retrieval_chain = create_retrieval_chain(searcher_langchain, document_chain)

    answer = retrieval_chain.invoke({"input": query})["answer"]
    cleaned_answer = clean_text(answer)
    return cleaned_answer

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [146]:
generated_answers = []

for question in tqdm(df_ground_truth["question"], total=len(df_ground_truth)):
    answer = generate_answer(question, llm, prompt)
    generated_answers.append(answer)

df_ground_truth["answer"] = generated_answers

100%|██████████| 192/192 [01:52<00:00,  1.71it/s]


## 2.3. Evaluating using RAGAS

In [166]:
def clean_text_again(text: str) -> str:
    return text.replace("Jawaban:", "").replace("Answer:", "").strip()

eval_data_for_ragas = {
    "question": df_ground_truth["question"].tolist(),
    "answer": [clean_text_again(text) for text in df_ground_truth["answer"].tolist()],
    "contexts": [[c] for c in contexts],
    "ground_truth": df_ground_truth["ground_truth"].tolist()
}

In [167]:
eval_data_for_ragas["answer"]

['Because the kinetic energy of molecules is low, and the rate of substrate molecules colliding with the enzyme is very low.',
 'As the temperature increases, the kinetic energy of molecules increases.',
 'The optimum temperature for most human enzymes to be most active is about 40-45°C.',
 'Enzymes are proteins, and they are usually found in living organisms, such as animals, plants, and microorganisms.',
 'When potassium chlorate(VII) is heated with manganese(IV) oxide, oxygen is rapidly given off.',
 'Inorganic catalysts seperti manganese(IV) oxide berguna dalam reaksi kimia karena mereka dapat mempercepat reaksi tanpa mengubah diri mereka sendiri. Mereka juga dapat digunakan untuk mengurangi suhu reaksi dan membuat reaksi dapat berlangsung pada suhu yang lebih rendah. Contohnya, manganese(IV) oxide dapat mempercepat reaksi pembakaran potassium chlorate(VII) menjadi oxygen dan potassium chloride tanpa mengubah diri sendiri. Oleh karena itu, inorganic catalysts seperti manganese(IV) 

In [168]:
dataset = Dataset.from_dict(eval_data_for_ragas)

In [169]:
result = evaluate(
    dataset = dataset,
    metrics=[
        context_precision,
        context_recall,
        faithfulness,
        answer_relevancy,
    ],
)

result_df = result.to_pandas()

Evaluating:   0%|          | 0/768 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[644]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[127]: TimeoutError()


In [170]:
result_df

Unnamed: 0,question,answer,contexts,ground_truth,context_precision,context_recall,faithfulness,answer_relevancy
0,Why is an enzyme less active at low temperatures?,Because the kinetic energy of molecules is low...,[Optimum:mostfavourableAn enzyme is less activ...,'An enzyme is less active at low temperatures ...,1.0,1.0,1.000000,1.000000
1,What happens to the kinetic energy of molecule...,"As the temperature increases, the kinetic ener...",[Optimum:mostfavourableAn enzyme is less activ...,"'As the temperature increases, the kinetic ene...",1.0,1.0,1.000000,1.000000
2,What is the optimum temperature for most human...,The optimum temperature for most human enzymes...,[Optimum:mostfavourableAn enzyme is less activ...,"'For most human enzymes, the optimum temperatu...",1.0,1.0,1.000000,1.000000
3,"What are enzymes, and where are they usually f...","Enzymes are proteins, and they are usually fou...","[Enzymes are proteins,and hence are affectedby...",'Enzymes are large biological molecules that a...,0.0,0.0,0.333333,0.989525
4,What happens when potassium chlorate(VII) is h...,When potassium chlorate(VII) is heated with ma...,[Biological catalysts are large biological mol...,'When potassium chlorate(VII) is heated with m...,1.0,1.0,0.500000,1.000000
...,...,...,...,...,...,...,...,...
187,In which direction do water molecules move dur...,Water molecules move from a dilute solution to...,[• The term ‘water potential’ is always used i...,'Water molecules move from a dilute solution t...,1.0,1.0,1.000000,0.934232
188,What separates the two solutions of different ...,A partially permeable membrane.,[• The term ‘water potential’ is always used i...,'A partially permeable membrane separates the ...,1.0,1.0,1.000000,0.888083
189,What are the substances called that enzymes ac...,The substances called that enzymes act on are ...,[Enzymes are classified according to the chemi...,'The substances that enzymes act on are called...,1.0,0.0,0.000000,0.983794
190,"In the 'lock-and-key' hypothesis, what part of...",The substrate fits into the active site of the...,[The substances on which enzymes act are calle...,"""In the 'lock-and-key' hypothesis, the substra...",1.0,1.0,1.000000,0.889844


In [171]:
result_df.to_csv("ragas_report_retrieved_context.csv", index=False)

In [172]:
print(f"context_precision: {result_df['context_precision'].mean()}")
print(f"context_recall: {result_df['context_recall'].mean()}")
print(f"faithfulness: {result_df['faithfulness'].mean()}")
print(f"answer_relevancy: {result_df['answer_relevancy'].mean()}")

context_precision: 0.8743455496507854
context_recall: 0.7972324346405228
faithfulness: 0.8300685425685425
answer_relevancy: 0.9247891386626608
