In [1]:
!pip install unidecode gdown pymupdf huggingface-hub langchain langchain-community langchain-huggingface openai faiss-gpu -qq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m379.1 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m367.8/367.8 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!git -C ColBERT/ pull || git clone https://github.com/stanford-futuredata/ColBERT.git

fatal: cannot change to 'ColBERT/': No such file or directory
Cloning into 'ColBERT'...
remote: Enumerating objects: 2813, done.[K
remote: Counting objects: 100% (1324/1324), done.[K
remote: Compressing objects: 100% (428/428), done.[K
remote: Total 2813 (delta 1025), reused 985 (delta 892), pack-reused 1489 (from 1)[K
Receiving objects: 100% (2813/2813), 2.06 MiB | 7.12 MiB/s, done.
Resolving deltas: 100% (1779/1779), done.


In [3]:
import os
import gdown
import zipfile
import logging
from genericpath import isdir

import re
import sys
import json
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from unidecode import unidecode
from concurrent.futures import ThreadPoolExecutor, as_completed

# pdf parser; pip install pymupdf
import fitz

# rag
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEndpoint

# for synthetic dataset
from openai import OpenAI
from openai import ChatCompletion

# retrieval
sys.path.insert(0, 'ColBERT/')
import colbert
from colbert import Trainer
from colbert.data import Queries
from colbert import Indexer, Searcher
from colbert.data import Queries, Collection
from colbert.utils.utils import print_message
from colbert.data.collection import Collection
from colbert.modeling.checkpoint import Checkpoint
from colbert.indexing.index_saver import IndexSaver
from colbert.search.index_storage import IndexScorer
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.infra.launcher import Launcher, print_memory_stats
from colbert.indexing.collection_encoder import CollectionEncoder
from colbert.indexing.collection_indexer import CollectionIndexer

import torch

import faiss
assert faiss.get_num_gpus() > 0

import warnings
from kaggle_secrets import UserSecretsClient
warnings.filterwarnings("ignore")
OPENAI_KEY = UserSecretsClient().get_secret("openai_key")

ModuleNotFoundError: No module named 'ujson'

In [None]:
try:
    import google.colab
    !pip install -U pip
    !pip install -e ColBERT/['faiss-gpu','torch']
except Exception:
  import sys; sys.path.insert(0, 'ColBERT/')
  try:
    from colbert import Indexer, Searcher
  except Exception:
    print("If you're running outside Colab, please make sure you install ColBERT in conda following the instructions in our README. You can also install (as above) with pip but it may install slower or less stable faiss or torch dependencies. Conda is recommended.")
    assert False

In [None]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed = 42
set_seed(seed)

In [None]:
def check_dir(dir_name: str) -> bool:
    return os.path.isdir(dir_name)

def download_data(url, filename, dir_name: str = "data", is_zipped=True) -> None:
    if not check_dir(dir_name):
        os.mkdir(dir_name)
    os.chdir(dir_name)
    logging.info("Downloading data....")
    gdown.download(
        url, quiet=False
    )
    if is_zipped:
        logging.info("Extracting zip file....")
        with zipfile.ZipFile(f"{filename}.zip", 'r') as zip_ref:
            zip_ref.extractall(filename)
        os.remove(f"{filename}.zip")
    os.chdir("..")

download_data(url="https://drive.google.com/uc?&id=1YonKKtavO1qjUGEgGZaofFQy4kJVFsPc",
              filename="Cells and Chemistry of Life",
              dir_name="data",
              is_zipped=False)

download_data(url="https://drive.google.com/uc?&id=1qrl8WVRTdsXuQScCRxj-CMv7KaBcyjPV",
              filename="synthetic_qna_biology",
              dir_name="data",
              is_zipped=False)

# 1. Knowledge Base Construction

In [None]:
def calculate_median(sizes):
    sizes = sorted(sizes)
    n = len(sizes)
    if n == 0:
        return None

    middle = n // 2
    if n % 2 == 1:
        return sizes[middle]
    else:
        return (sizes[middle - 1] + sizes[middle]) / 2

def get_blocks(doc):
    all_pages = []

    for page in tqdm(doc):
        page_to_blocks = {}
        dct = page.get_text("dict")
        blocks = dct["blocks"]
        unique_sizes = set()

        for num, block in enumerate(blocks):
            if "lines" in block.keys():
                block_id = f"B{num}"
                page_to_blocks[block_id] = []
                lines = block["lines"]
                for line in lines:
                    for obj in line["spans"]:
                        page_to_blocks[block_id].append({"text": obj["text"], "size": obj["size"]})
                        unique_sizes.add(obj["size"])

        median_size = calculate_median(unique_sizes)

        all_pages.append({
            "blocks": page_to_blocks,
            "median_size": median_size
        })

    return all_pages


def segment_text(all_blocks, debug=False, alpha=1):
    result_dct = {"header": [], "content": []}
    for block_dct in tqdm(all_blocks):
        blocks = block_dct["blocks"]
        threshold = block_dct["median_size"]
        header = ""
        segmented_content = ""
        for _, block in blocks.items():
            content = ""
            for obj in block:
                if obj["size"] < threshold + alpha:
                    content += obj["text"].strip()
                    segmented_content += obj["text"].strip()
                else:
                    header = ""
                    segmented_content = ""
                    header += obj["text"].strip()

            if header != "" and len(header) > 2:
                result_dct["header"].append(header.strip().replace("\xa0", " "))
                result_dct["content"].append(segmented_content.strip().replace("\xa0", " "))

            if debug:
                print(f"header: {header}")
                print(f"content: {content}")
                print(f"segmented_content: {segmented_content}")
                print("==================================================")

    result_df = pd.DataFrame(result_dct)
    return result_df


def process_segmented_df(df):
    df = df.groupby('header', as_index=False)['content'].last()
    df = df[df["content"] != ""].reset_index(drop=True)

    df["qid"] = [i for i in range(len(df))]
    df["docno"] = [i for i in range(len(df))]
    return df

In [None]:
PDF_PATH = r"/kaggle/working/data/Cells and Chemistry of Life.pdf"

doc = fitz.open(PDF_PATH)
all_blocks = get_blocks(doc)

result_df = segment_text(all_blocks, debug=False, alpha=0.5)
result_df = process_segmented_df(result_df)
result_df

FileNotFoundError: no such file: '/kaggle/working/data/Cells and Chemistry of Life.pdf'

In [None]:
# result_df.to_csv("ground_truth.csv", index=False)

# 2. Retrieval Engine

## 2.1. Creating Retrieval Dataset

In [None]:
def clean_text(text: str):
    text = re.sub(r'\b(\w+?)(es|s)\b', r'\1', text)
    text = re.sub(r'[^\w\s,.]', '', text)
    tokens = re.findall(r'\w+', text)
    return " ".join(tokens)

def filter_qa_data(qa_data, raw=True):
    topics = ["learning", "enzyme","protein", "golgi", "diffusion", "turgor", "cytoplasm",
             "nucleus", "chromosome", "fat", "protein", "carbohydrate", "vacuoles", "mitochondria",
             "chloroplast", "endoplasmic", "ribosome", "membrane", "surface area", "glucose", "glycogen",
             "starch", ]
    list_of_query = []
    list_of_document = []
    indexes = []

    for id_num, (idx, row) in enumerate(qa_data.iterrows()):
        for topic in topics:
            cleaned_text = clean_text(text=row["content"].lower())
            if topic in cleaned_text:
                if raw:
                    list_of_query.append([id_num, row["header"]])
                else:
                    list_of_query.append([id_num, row["header"], row["is_train"]])
                list_of_document.append([id_num, row["content"], topic])
                indexes.append(idx)
                break

    if raw:
        query_df = pd.DataFrame(list_of_query, columns=["qid", "header"], index=indexes)
    else:
        query_df = pd.DataFrame(list_of_query, columns=["qid", "header", "is_train"], index=indexes)
    selected_document_df = pd.DataFrame(list_of_document, columns=["docno", "content","name"], index=indexes)

    document_df = qa_data[["content"]]
    document_df["docno"] = [i for i in range(len(qa_data)) ]
    qna = pd.concat([query_df, selected_document_df], axis=1)
    return qna, document_df


def create_qrels(qna: pd.DataFrame,
                 master_doc_df: pd.DataFrame,
                 neg_count: int=10,):
    list_of_qrels = []
    selected_qid = []

    for _, row in tqdm(qna.iterrows(), total=qna.shape[0]):
        if row["qid"] not in selected_qid:
            list_of_qrels.append([row["qid"], row["header"], row["docno"], row["content"], 1, row["is_train"]])
            selected_qid.append(row["qid"])

            selected_topic = row["name"]
            selected_idx = [row["docno"]]
            for _ in range(neg_count):
                rand_idx = np.random.randint(0, master_doc_df.shape[0])
                rand_doc = master_doc_df.iloc[rand_idx]["content"]

                while rand_idx in selected_idx and selected_topic in rand_doc.lower():
                    rand_idx = np.random.randint(0, master_doc_df.shape[0])
                    rand_doc = master_doc_df.iloc[rand_idx]["content"]

                list_of_qrels.append([row["qid"],
                                      row["header"],
                                      master_doc_df.iloc[rand_idx]["docno"],
                                      master_doc_df.iloc[rand_idx]["content"],
                                      0,
                                     row["is_train"]])
                selected_idx.append(rand_doc)

    qrels_df = pd.DataFrame(list_of_qrels, columns=["qid", "header", "docno", "content", "label", "is_train"])
    return qrels_df

In [None]:
class BuildSyntheticQNA:
    def __init__(self, qna: pd.DataFrame, openai_key: str=None):
        self.qna = qna
        self.openai_key = openai_key

    def _ask(self, material: str):
        client = OpenAI(
            api_key=self.openai_key
        )
        response: ChatCompletion = client.chat.completions.create(
            model="gpt-4o",
            messages=[
              {"role": "system",
               "content": """You are a professional annotator who is creating retrieval-style questions for a dataset.
                             Your task is to generate three educational questions based on the given material that can be answered directly using information from the text.
                             The questions should be clear, concise, and suitable for primary school students.
                             Each question should test their understanding of the biology concepts presented in the material."""},
              {"role": "user",
               "content": f"""Think step by step about this material:
                              {material}

                              Then, create three questions using this exactly format.

                               [Q1]: first question
                               [Q2]: second question
                               [Q3]: third question

                               REMEMBER, form a question that would be ask by primary student so use simple word but still related to the material
                        """},
            ]
            )
        return [answer.message.content for answer in response.choices]

    def _generate_gpt_questions(self, qna: pd.DataFrame):
        raw_synthetic_qna = {'header': [], 'content': [], 'generated_question': []}

        for i, (_,row) in tqdm(enumerate(qna.iterrows()), total=len(qna)):
            generated_question = self._ask(row['content'])[0]

            raw_synthetic_qna["header"].append(row['header'])
            raw_synthetic_qna["content"].append(row['content'])
            raw_synthetic_qna["generated_question"].append(generated_question)

        raw_synthetic_qna_df = pd.DataFrame(raw_synthetic_qna)
        return raw_synthetic_qna_df

    def _split_generated_question(self, raw_synthetic_qna_df: pd.DataFrame):
        synthetic_qna = {'raw_header': [], 'content': [], 'header': [], 'is_train': []}

        for _, row in raw_synthetic_qna_df.iterrows():
            gen_question = row["generated_question"]
            split_at_first_q = re.split(r'\[Q1\]:', gen_question, maxsplit=1)

            if len(split_at_first_q) > 1:
                questions = re.split(r'\[Q\d+\]: ', split_at_first_q[1])
                parsed_questions = [question.strip() for question in questions if question.strip()]
                for idx, q in enumerate(parsed_questions):
                    synthetic_qna["raw_header"].append(row["header"])
                    synthetic_qna["content"].append(row["content"])
                    synthetic_qna["header"].append(q)
                    if idx == 2:
                        synthetic_qna["is_train"].append(False)
                    elif 0 <= idx <= 1:
                        synthetic_qna["is_train"].append(True)
                    else:
                        raise ValueError()

                if len(parsed_questions) != 3:
                    print(f"NUM LEN: {len(parsed_questions)}")
                    print(gen_question)
                    print(parsed_questions)
                    print("=======================")

        synthetic_qna_df = pd.DataFrame(synthetic_qna)
        synthetic_qna_df = synthetic_qna_df[~synthetic_qna_df["content"].str.contains("Learning", na=False)]
        return synthetic_qna_df

    def build_qna(self, loaded_qna_path: str=None):
        if loaded_qna_path is not None:
            return pd.read_csv(loaded_qna_path)
        else:
            print("Building qna dataset using gpt...")
            assert self.openai_key is not None

            raw_synthetic_qna_df = self._generate_gpt_questions(self.qna)
            synthetic_qna_df = self._split_generated_question(raw_synthetic_qna_df)
            return synthetic_qna_df


In [None]:
RANDOM_STATE = 2024
FALSE_EXAMPLE_COUNT = 9
np.random.seed(RANDOM_STATE)


qna, master_doc_df = filter_qa_data(result_df, raw=True)
qna_builder = BuildSyntheticQNA(qna=qna)

synthetic_qna_df = qna_builder.build_qna(loaded_qna_path="/kaggle/working/data/synthetic_qna_biology.csv")
qna, master_doc_df = filter_qa_data(synthetic_qna_df, raw=False)

qrels_df = create_qrels(qna=qna,
                        neg_count=FALSE_EXAMPLE_COUNT,
                        master_doc_df=master_doc_df)
qrels_df

100%|██████████| 192/192 [00:00<00:00, 534.86it/s]


Unnamed: 0,qid,header,docno,content,label,is_train
0,0,Why is an enzyme less active at low temperatures?,0,Optimum:mostfavourableAn enzyme is less active...,1,True
1,0,Why is an enzyme less active at low temperatures?,136,Rough endoplasmic reticulum (RER)consistsof a ...,0,True
2,0,Why is an enzyme less active at low temperatures?,96,1Describe how you would test for reducing suga...,0,True
3,0,Why is an enzyme less active at low temperatures?,128,"Figure 3.20Eggs contain protein,healthy fats a...",0,True
4,0,Why is an enzyme less active at low temperatures?,27,Figure 4.10 shows the effect of temperature on...,0,True
...,...,...,...,...,...,...
1915,191,What happens to the enzyme after it converts t...,12,•Chloroplastsare oval structures found in plan...,0,False
1916,191,What happens to the enzyme after it converts t...,29,Figure 4.10 shows the effect of temperature on...,0,False
1917,191,What happens to the enzyme after it converts t...,19,Table 1.1 shows the key differences between pl...,0,False
1918,191,What happens to the enzyme after it converts t...,25,Figure 2.5 illustrates the process of diffusio...,0,False


In [None]:
synthetic_qna_df.to_csv("synthetic_qna_biology.csv", index=False)

## 2.2. Finetuning Retrieval (ColBERT)

In [None]:
BACKBONE_MODEL = "google-bert/bert-base-multilingual-cased"

train_config = {
    'triples_path': '/kaggle/working/triples.train.small.jsonl',
    'queries_path': '/kaggle/working/queries.train.small.tsv',
    'collection_path':  '/kaggle/working/collection.tsv',
    'root_path': '/kaggle/working/experiments',
    'experiment_name': 'msmarco',
    'model_checkpoint': BACKBONE_MODEL,
    'checkpoint_path': None,
    'nranks': 1
}

In [None]:
def escape_tsv(value):
    """Escape tabs and newlines in a string for safe TSV writing."""
    return value.replace('\t', '\\t').replace('\n', '\\n')

def unescape_tsv(value):
    """Unescape tabs and newlines in a string after reading from TSV."""
    return value.replace('\\t', '\t').replace('\\n', '\n')

def get_unique_ordered_list(original_list):
    seen = set()
    unique_list = []
    for item in original_list:
        if item not in seen:
            unique_list.append(item)
            seen.add(item)
    return unique_list

def get_example_data(qrels: pd.DataFrame):
    sampled_data = qrels[qrels["is_train"] == True][["content", "header", "label"]].dropna().copy()
    sampled_data["content"] = sampled_data["content"].apply(lambda row: " ".join(row.split()[:500]))
    print(f"loaded data with {len(sampled_data)} rows")

    questions = [{"qid":i, "query": item} for i, item in
                 enumerate(get_unique_ordered_list(sampled_data["header"].tolist()))]
    inv_questions = {item:i for i, item in
                 enumerate(get_unique_ordered_list(sampled_data["header"].tolist()))}
    passages = [{"pid":i, "passage": item} for i, item in
                 enumerate(get_unique_ordered_list(sampled_data["content"].tolist()))]
    inv_passages = {item:i for i, item in
                 enumerate(get_unique_ordered_list(sampled_data["content"].tolist()))}

    triples = []
    for q in sampled_data["header"].unique():
        curr_df = sampled_data[sampled_data["header"] == q]

        for idx in range(1, 10):
            tup = [inv_questions[q],
                   inv_passages[curr_df["content"].iloc[0]],
                   inv_passages[curr_df["content"].iloc[idx]]]
            triples.append(tup)

    labels = sampled_data["label"].tolist()

    return questions, passages, triples


def setup_training(qrels, triples_path, queries_path, collection_path, root_path):
    questions, passages, triples = get_example_data(qrels=qrels)

    with open('/kaggle/working/triples.train.small.jsonl', 'w') as f:
        for item in triples:
            f.write(json.dumps(item) + '\n')

    with open('/kaggle/working/queries.train.small.tsv', 'w') as f:
        for item in questions:
            f.write(f"{item['qid']}\t{escape_tsv(item['query'])}\n")

    with open('/kaggle/working/collection.tsv', 'w') as f:
        for item in passages:
            f.write(f"{item['pid']}\t{escape_tsv(item['passage'])}\n")

In [None]:
!mkdir -p {root_path}/checkpoint

# Define the training function
def train_colbert(qrels,
                  triples_path,
                  queries_path,
                  collection_path,
                  root_path,
                  experiment_name,
                  model_checkpoint=BACKBONE_MODEL):

    setup_training(qrels,
                   triples_path,
                   queries_path,
                   collection_path,
                   root_path)

    with Run().context(RunConfig(nranks=1, experiment=experiment_name)):

        colbert_config = ColBERTConfig(
            bsize=16,
            query_maxlen=64,
            doc_maxlen=512,
            dim=256,
            root=root_path,
        )

        trainer = Trainer(
            triples=triples_path,
            queries=queries_path,
            collection=collection_path,
            config=colbert_config,
        )

        checkpoint_path = trainer.train(checkpoint=model_checkpoint)

        print(f"Saved checkpoint to {checkpoint_path}...")


train_colbert(qrels=qrels_df,
              triples_path=train_config['triples_path'],
              queries_path=train_config['queries_path'],
              collection_path=train_config['collection_path'],
              root_path=train_config['root_path'],
              experiment_name=train_config['experiment_name'],
              model_checkpoint=train_config['model_checkpoint']
             )

loaded data with 1280 rows
#> Starting...




nranks = 1 	 num_gpus = 1 	 device=0
{
    "query_token_id": "[unused0]",
    "doc_token_id": "[unused1]",
    "query_token": "[Q]",
    "doc_token": "[D]",
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "load_index_with_mmap": false,
    "index_path": null,
    "index_bsize": 64,
    "nbits": 1,
    "kmeans_niters": 4,
    "resume": false,
    "pool_factor": 1,
    "clustering_mode": "hierarchical",
    "protected_tokens": 0,
    "similarity": "cosine",
    "bsize": 16,
    "accumsteps": 1,
    "lr": 3e-6,
    "maxsteps": 500000,
    "save_every": null,
    "warmup": null,
    "warmup_bert": null,
    "relu": false,
    "nway": 2,
    "use_ib_negatives": false,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "model_name": null,
    "query_maxlen": 64,
    "attend_to_mask_tokens": false,
    "interaction": "colbert",
    "dim": 256,
    "doc_maxlen": 512,
    "mask_punctuation": true,
    "checkpoint": "google-bert

Some weights of HF_ColBERT were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['linear.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler()
  return torch.cuda.amp.autocast() if self.activated else NullContextManager()




#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: Why is an enzyme less active at low temperatures?, 		 True, 		 None
#> Output IDs: torch.Size([64]), tensor([  101,   100, 24781, 10124, 10151, 53138, 15306, 14459, 10160, 15626,
        45091,   136,   102,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103], device='cuda:0')
#> Output Mask: torch.Size([64]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')

#>>>    30.82 27.08 		

In [None]:
config_med = {
    'triples_path': '/kaggle/working/triples.train.small.jsonl',
    'queries_path': '/kaggle/working/queries.train.small.tsv',
    'collection_path':  '/kaggle/working/collection.tsv',
    'root_path': '/kaggle/working/experiments/msmarco',
    'experiment_name': 'msmarco',
    'model_checkpoint': BACKBONE_MODEL,
    'checkpoint_path': None, # trained colbert
    'index_name':'heyhi',
    'nranks': 1
}

config_xm = {
    'triples_path': '/kaggle/working/triples.train.small.jsonl',
    'queries_path': '/kaggle/working/queries.train.small.tsv',
    'collection_path':  '/kaggle/working/collection.tsv',
    'root_path': '/kaggle/working/experiments',
    'experiment_name': 'colbert',
    'model_checkpoint': 'indobenchmark/indobert-base-p1',
    'checkpoint_path': "antoinelouis/colbert-xm", # trained colbert
    'index_name':'xm',
    'nranks': 1
}

In [None]:
MMARCO_LANGUAGES = {
    'ar': ('arabic', 'ar_AR'),
    'de': ('german', 'de_DE'),
    'en': ('english', 'en_XX'),
    'es': ('spanish', 'es_XX'),
    'fr': ('french', 'fr_XX'),
    'hi': ('hindi', 'hi_IN'),
    'id': ('indonesian', 'id_ID'),
    'it': ('italian', 'it_IT'),
    'ja': ('japanese', 'ja_XX'),
    'nl': ('dutch', 'nl_XX'),
    'pt': ('portuguese', 'pt_XX'),
    'ru': ('russian', 'ru_RU'),
    'vi': ('vietnamese', 'vi_VN'),
    'zh': ('chinese', 'zh_CN'),
}
MRTYDI_LANGUAGES = {
    'ar': ('arabic', 'ar_AR'),
    'bn': ('bengali', 'bn_IN'),
    'en': ('english', 'en_XX'),
    'fi': ('finnish', 'fi_FI'),
    'id': ('indonesian', 'id_ID'),
    'ja': ('japanese', 'ja_XX'),
    'ko': ('korean', 'ko_KR'),
    'ru': ('russian', 'ru_RU'),
    'sw': ('swahili', 'sw_KE'),
    'te': ('telugu', 'te_IN'),
    'th': ('thai', 'th_TH'),
}
MIRACL_LANGUAGES = {
    'ar': ('arabic', 'ar_AR'),
    'bn': ('bengali', 'bn_IN'),
    'en': ('english', 'en_XX'),
    'es': ('spanish', 'es_XX'),
    'fa': ('persian', 'fa_IR'),
    'fi': ('finnish', 'fi_FI'),
    'fr': ('french', 'fr_XX'),
    'hi': ('hindi', 'hi_IN'),
    'id': ('indonesian', 'id_ID'),
    'ja': ('japanese', 'ja_XX'),
    'ko': ('korean', 'ko_KR'),
    'ru': ('russian', 'ru_RU'),
    'sw': ('swahili', 'sw_KE'),
    'te': ('telugu', 'te_IN'),
    'th': ('thai', 'th_TH'),
    'zh': ('chinese', 'zh_CN'),
}
ALL_LANGUAGES = {**MMARCO_LANGUAGES, **MRTYDI_LANGUAGES, **MIRACL_LANGUAGES}


def set_xmod_language(model, lang:str):
    """
    Set the default language code for the model. This is used when the language is not specified in the input.
    Source: https://github.com/huggingface/transformers/blob/v4.34.1/src/transformers/models/xmod/modeling_xmod.py#L687
    """
    lang = lang.split('-')[0]
    if (value := ALL_LANGUAGES.get(lang)) is not None:
        model.set_default_language(value[1])
    else:
        raise KeyError(f"Language {lang} not supported.")

#-----------------------------------------------------------------------------------------------------------------#
#                                               INDEXER
#-----------------------------------------------------------------------------------------------------------------#
class CustomIndexer(Indexer):
    def __launch(self, collection):
        manager = mp.Manager()
        shared_lists = [manager.list() for _ in range(self.config.nranks)]
        shared_queues = [manager.Queue(maxsize=1) for _ in range(self.config.nranks)]
        launcher = Launcher(custom_encode)
        launcher.launch(self.config, collection, shared_lists, shared_queues, self.verbose)

def custom_encode(config, collection, shared_lists, shared_queues, verbose: int = 3):
    encoder = CustomCollectionIndexer(config=config, collection=collection, verbose=verbose)
    encoder.run(shared_lists)

class CustomCollectionIndexer(CollectionIndexer):
    def __init__(self, config: ColBERTConfig, collection, verbose=2):
        self.verbose = verbose
        self.config = config
        self.rank, self.nranks = self.config.rank, self.config.nranks
        self.use_gpu = self.config.total_visible_gpus > 0
        if self.config.rank == 0 and self.verbose > 1:
            self.config.help()
        self.collection = Collection.cast(collection)
        self.checkpoint = Checkpoint(self.config.checkpoint, colbert_config=self.config)
        if self.checkpoint.bert.__class__.__name__.lower().startswith("xmod"):
            language = detect(self.collection.__getitem__(0))
            Run().print_main(f"#> Setting X-MOD language adapters to {language}.")
            set_xmod_language(self.checkpoint.bert, lang=language)
        if self.use_gpu:
            self.checkpoint = self.checkpoint.cuda()
        self.encoder = CollectionEncoder(config, self.checkpoint)
        self.saver = IndexSaver(config)
        print_memory_stats(f'RANK:{self.rank}')

#-----------------------------------------------------------------------------------------------------------------#
#                                               SEARCHER
#-----------------------------------------------------------------------------------------------------------------#
class CustomSearcher(Searcher):
    def __init__(self, index, checkpoint=None, collection=None, config=None, index_root=None, verbose:int = 3):
        self.verbose = verbose
        if self.verbose > 1:
            print_memory_stats()

        initial_config = ColBERTConfig.from_existing(config, Run().config)

        default_index_root = initial_config.index_root_
        index_root = index_root if index_root else default_index_root
        self.index = os.path.join(index_root, index)
        self.index_config = ColBERTConfig.load_from_index(self.index)

        self.checkpoint = checkpoint or self.index_config.checkpoint
        self.checkpoint_config = ColBERTConfig.load_from_checkpoint(self.checkpoint)
        self.config = ColBERTConfig.from_existing(self.checkpoint_config, self.index_config, initial_config)

        self.collection = Collection.cast(collection or self.config.collection)
        self.configure(checkpoint=self.checkpoint, collection=self.collection)

        self.checkpoint = Checkpoint(self.checkpoint, colbert_config=self.config, verbose=self.verbose)
        if self.checkpoint.bert.__class__.__name__.lower().startswith("xmod"):
            language = detect(self.collection.__getitem__(0))
            print_message(f"#> Setting X-MOD language adapters to {language}.")
            set_xmod_language(self.checkpoint.bert, lang=language)
        use_gpu = self.config.total_visible_gpus > 0
        if use_gpu:
            self.checkpoint = self.checkpoint.cuda()
        load_index_with_mmap = self.config.load_index_with_mmap
        if load_index_with_mmap and use_gpu:
            raise ValueError(f"Memory-mapped index can only be used with CPU!")
        self.ranker = IndexScorer(self.index, use_gpu, load_index_with_mmap)
        print_memory_stats()

In [None]:
def escape_tsv(value):
    """Escape tabs and newlines in a string for safe TSV writing."""
    return value.replace('\t', '\\t').replace('\n', '\\n')

def unescape_tsv(value):
    """Unescape tabs and newlines in a string after reading from TSV."""
    return value.replace('\\t', '\t').replace('\\n', '\n')

def get_unique_ordered_list(original_list):
    seen = set()
    unique_list = []
    for item in original_list:
        if item not in seen:
            unique_list.append(item)
            seen.add(item)
    return unique_list

def get_data(qrels: pd.DataFrame, mode: str="train"):
    assert mode in ["train", "eval"], f"mode expected to be 'train' or 'eval', got {mode}"

    if mode == "train":
        sampled_data = qrels[qrels["is_train"] == True][["content", "header", "label"]].copy()
    elif mode == "eval":
        sampled_data = qrels[qrels["is_train"] == False][["content", "header", "label"]].copy()

    sampled_data["content"] = sampled_data["content"].apply(lambda row: " ".join(row.split()[:500]))

    questions = [{"qid":i, "query": item} for i, item in
                 enumerate(get_unique_ordered_list(sampled_data["header"].tolist()))]
    inv_questions = {item:i for i, item in
                 enumerate(get_unique_ordered_list(sampled_data["header"].tolist()))}
    passages = [{"pid":i, "passage": item} for i, item in
                 enumerate(get_unique_ordered_list(sampled_data["content"].tolist()))]
    inv_passages = {item:i for i, item in
                 enumerate(get_unique_ordered_list(sampled_data["content"].tolist()))}

    # create triples
    triples = []
    for q in sampled_data["header"].unique():
        curr_df = sampled_data[sampled_data["header"] == q]

        for idx in range(1, 10):
            tup = [inv_questions[q],
                   inv_passages[curr_df["content"].iloc[0]],
                   inv_passages[curr_df["content"].iloc[idx]]]
            triples.append(tup)

    labels = sampled_data["label"].tolist()


    return questions, passages, triples

def get_collection(qrels):
    qna = qrels
    passages = [{"pid":i, "passage": item} for i, item in
                 enumerate(get_unique_ordered_list(qna["content"].tolist()))]
    return passages

def load_collection(qrels, n_sample=2000, use_all=False):
    if use_all:
        qna = qrels
        passages = qna["content"].tolist()

    else:
        sampled_data = qrels[qrels["is_train"] == False][["content", "header", "label"]].copy()
        sampled_data["content"] = sampled_data["content"].apply(lambda row: " ".join(row.split()[:500]))
        passages = get_unique_ordered_list(sampled_data["content"].tolist())
    return passages

In [None]:
def setup(qrels, triples_path, queries_path, collection_path, root_path):
    questions, passages, triples = get_data(qrels=qrels, mode="eval")

    with open('/kaggle/working/triples.train.small.jsonl', 'w') as f:
        for item in triples:
            f.write(json.dumps(item) + '\n')

    with open('/kaggle/working/queries.train.small.tsv', 'w') as f:
        for item in questions:
            f.write(f"{item['qid']}\t{escape_tsv(item['query'])}\n")

    with open('/kaggle/working/collection.tsv', 'w') as f:
        for item in passages:
            f.write(f"{item['pid']}\t{escape_tsv(item['passage'])}\n")


def init_index(indexer, collection, config, verbose=3):
    with Run().context(RunConfig(nranks=config["nranks"], experiment=config["experiment_name"])):
        if config["checkpoint_path"] is None:
            checkpoint_path = get_experiment_folder(config["root_path"], config["experiment_name"])
        else:
            checkpoint_path = config["checkpoint_path"]

        colbert_config = ColBERTConfig(
            nbits=2,
            root=config["root_path"],
        )
        indexer = Indexer(checkpoint=checkpoint_path, config=colbert_config, verbose=verbose)
        indexer.index(name=f"{config['index_name']}",
                      collection=collection, overwrite=True)

class ColBERTSearcher:
    def __init__(self, qrels, config, verbose, searcher=None):
        self.qrels = qrels
        self.config = config
        self.verbose = verbose
        self.collection = self.load_collection()
        self.searcher = self.init_searcher(searcher)

    def load_collection(self, use_all=False):
        if use_all:
            qna = pd.read_csv('/kaggle/working/data/med_dataset_subsection/qna_med.csv')
            passages = qna["passage"].tolist()
        else:
            qrels = self.qrels
            sampled_data = qrels[qrels["is_train"] == False][["content", "header", "label"]].copy()
            passages = get_unique_ordered_list(sampled_data["content"].tolist())

        return passages

    def init_searcher(self, searcher=None):
        if searcher is None:
            searcher = Searcher

        with Run().context(RunConfig(experiment=self.config["experiment_name"])):
            searcher = searcher(index=f"{self.config['index_name']}",
                                collection=self.collection,
                                verbose=self.verbose)
        return searcher

    def infer(self, query, k=3):
        results = self.searcher.search(query, k=k)

        data = []
        for passage_id, passage_rank, passage_score in zip(*results):
            passage_text = self.searcher.collection[passage_id]
            if self.verbose > 0:
                print(f"{passage_rank:<5} {passage_score:<10.4f} {passage_id:<15} {passage_text}")

            data.append([passage_rank, passage_score, passage_id, passage_text, passage_score])

        df = pd.DataFrame(data, columns=["rank", "score", "passage_id", "passage_text", "passage_score"])
        return df


def add_inference(_eval_df, searcher, k=10):
    K = k
    eval_df = _eval_df.copy()

    query_df = eval_df.iloc[[i for i in range(len(eval_df)) if i%10 == 0]]

    for idx, row in tqdm(query_df.iterrows(), total=len(query_df)):
        query = row["header"]
        result = searcher.infer(query, k=K)
        passage_text = result["passage_text"].tolist()
        passage_id = result["passage_id"].tolist()
        passage_score = result["passage_score"].tolist()

        for i in range(K):
            if i == 0:
                eval_df.at[idx, f"context_{i+1}"] = passage_text[i]
                eval_df.at[idx, f"id_context_{i+1}"] = passage_id[i]
                eval_df.at[idx, f"score_context_{i+1}"] = passage_score[i]
            else:
                if i < len(passage_text):
                    eval_df.at[idx, f"context_{i+1}"] = passage_text[i]
                    eval_df.at[idx, f"id_context_{i+1}"] = passage_id[i]
                    eval_df.at[idx, f"score_context_{i+1}"] = passage_score[i]
                else:
                    eval_df.at[idx, f"context_{i+1}"] = "-"
                    eval_df.at[idx, f"id_context_{i+1}"] = -1
                    eval_df.at[idx, f"score_context_{i+1}"] = -1

    eval_df = eval_df.fillna(method="ffill")
    return eval_df

In [None]:
setup(qrels=qrels_df,
      triples_path=config_med['triples_path'],
      queries_path=config_med['queries_path'],
      collection_path=config_med['collection_path'],
      root_path=config_med['root_path'])

eval_df = qrels_df
eval_df = eval_df[eval_df["is_train"] == False]
eval_df

Unnamed: 0,qid,header,docno,content,label,is_train
20,2,What is the optimum temperature for most human...,2,Optimum:mostfavourableAn enzyme is less active...,1,False
21,2,What is the optimum temperature for most human...,11,"Now, look at Figure 3.27. Do you agree with wh...",0,False
22,2,What is the optimum temperature for most human...,28,Figure 4.10 shows the effect of temperature on...,0,False
23,2,What is the optimum temperature for most human...,91,Calculating Surface Area-to-volume Ratios1Cons...,0,False
24,2,What is the optimum temperature for most human...,39,Fat in a peanut seed can be broken down by oxi...,0,False
...,...,...,...,...,...,...
1915,191,What happens to the enzyme after it converts t...,12,•Chloroplastsare oval structures found in plan...,0,False
1916,191,What happens to the enzyme after it converts t...,29,Figure 4.10 shows the effect of temperature on...,0,False
1917,191,What happens to the enzyme after it converts t...,19,Table 1.1 shows the key differences between pl...,0,False
1918,191,What happens to the enzyme after it converts t...,25,Figure 2.5 illustrates the process of diffusio...,0,False


In [None]:
def get_experiment_folder(path, experiment_name):
    while experiment_name != "colbert":
        path = os.path.join(path, experiment_name)
        experiment_name = os.listdir(path)[0]
    else:
        path = os.path.join(path, experiment_name)

    return path


# config_med["checkpoint_path"] = get_experiment_folder(config_med["root_path"],
#                                                   config_med["experiment_name"])

init_index(indexer=Indexer,
           collection=load_collection(qrels=qrels_df, use_all=False),
           config=config_med,
           verbose=-1)



[Sep 07, 05:11:20] #> Creating directory /kaggle/working/experiments/msmarco/indexes/heyhi 


#> Starting...


  self.scaler = torch.cuda.amp.GradScaler()
  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


nranks = 1 	 num_gpus = 1 	 device=0
[Sep 07, 05:11:25] [0] 		 #> Encoding 64 passages..
[Sep 07, 05:11:27] [0] 		 avg_doclen_est = 154.875 	 len(local_sample) = 64
[Sep 07, 05:11:27] [0] 		 #> Saving the indexing plan to /kaggle/working/experiments/msmarco/indexes/heyhi/plan.json ..


  sub_sample = torch.load(sub_sample_path)


Clustering 9417 points in 256D to 1024 clusters, redo 1 times, 4 iterations
  Preprocessing in 0.00 s
[Sep 07, 05:11:27] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...


If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


[Sep 07, 05:12:29] Loading packbits_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...


If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
  centroids = torch.load(centroids_path, map_location='cpu')
  avg_residual = torch.load(avgresidual_path, map_location='cpu')
  bucket_cutoffs, bucket_weights = torch.load(buckets_path, map_location='cpu')
  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


[0.034, 0.032, 0.033, 0.036, 0.03, 0.032, 0.031, 0.038, 0.032, 0.034, 0.031, 0.032, 0.035, 0.034, 0.034, 0.033, 0.031, 0.03, 0.029, 0.034, 0.034, 0.032, 0.035, 0.036, 0.03, 0.037, 0.036, 0.038, 0.034, 0.032, 0.032, 0.035, 0.033, 0.033, 0.034, 0.033, 0.032, 0.035, 0.03, 0.038, 0.038, 0.037, 0.033, 0.037, 0.035, 0.036, 0.037, 0.032, 0.034, 0.036, 0.036, 0.03, 0.035, 0.034, 0.035, 0.039, 0.035, 0.035, 0.033, 0.03, 0.033, 0.034, 0.034, 0.031, 0.032, 0.032, 0.034, 0.031, 0.036, 0.037, 0.032, 0.035, 0.033, 0.031, 0.033, 0.036, 0.034, 0.035, 0.032, 0.038, 0.034, 0.031, 0.033, 0.032, 0.032, 0.035, 0.032, 0.034, 0.033, 0.034, 0.031, 0.036, 0.035, 0.035, 0.037, 0.035, 0.036, 0.035, 0.032, 0.036, 0.034, 0.033, 0.034, 0.036, 0.035, 0.039, 0.033, 0.033, 0.034, 0.039, 0.038, 0.034, 0.032, 0.035, 0.031, 0.032, 0.031, 0.038, 0.034, 0.035, 0.032, 0.034, 0.034, 0.033, 0.03, 0.033, 0.035, 0.037, 0.03, 0.033, 0.034, 0.034, 0.031, 0.033, 0.033, 0.03, 0.032, 0.032, 0.031, 0.037, 0.034, 0.034, 0.032, 0.034, 

1it [00:01,  1.21s/it]
  return torch.load(codes_path, map_location='cpu')
100%|██████████| 1/1 [00:00<00:00, 1065.90it/s]
100%|██████████| 1024/1024 [00:00<00:00, 46953.39it/s]


[Sep 07, 05:13:32] #> Optimizing IVF to store map from centroids to list of pids..
[Sep 07, 05:13:32] #> Building the emb2pid mapping..
[Sep 07, 05:13:32] len(emb2pid) = 9912
[Sep 07, 05:13:32] #> Saved optimized IVF to /kaggle/working/experiments/msmarco/indexes/heyhi/ivf.pid.pt

#> Joined...


In [None]:
init_index(indexer=CustomIndexer,
           collection=load_collection(qrels_df, use_all=False),
           config=config_xm,
           verbose=-1)


artifact.metadata:   0%|          | 0.00/2.14k [00:00<?, ?B/s]



[Sep 07, 05:13:33] #> Creating directory /kaggle/working/experiments/colbert/indexes/xm 


#> Starting...


  self.scaler = torch.cuda.amp.GradScaler()
  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


nranks = 1 	 num_gpus = 1 	 device=0
[Sep 07, 05:14:05] [0] 		 #> Encoding 64 passages..
[Sep 07, 05:14:06] [0] 		 avg_doclen_est = 157.96875 	 len(local_sample) = 64
[Sep 07, 05:14:06] [0] 		 #> Saving the indexing plan to /kaggle/working/experiments/colbert/indexes/xm/plan.json ..


  sub_sample = torch.load(sub_sample_path)
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


Clustering 9605 points in 128D to 1024 clusters, redo 1 times, 4 iterations
  Preprocessing in 0.00 s
[Sep 07, 05:14:06] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Sep 07, 05:14:07] Loading packbits_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...


If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
  centroids = torch.load(centroids_path, map_location='cpu')
  avg_residual = torch.load(avgresidual_path, map_location='cpu')
  bucket_cutoffs, bucket_weights = torch.load(buckets_path, map_location='cpu')
  return torch.cuda.amp.autocast() if self.activated else NullContextManager()


[0.04, 0.034, 0.035, 0.039, 0.038, 0.035, 0.037, 0.035, 0.035, 0.035, 0.032, 0.034, 0.034, 0.035, 0.04, 0.036, 0.034, 0.036, 0.034, 0.036, 0.037, 0.03, 0.031, 0.033, 0.035, 0.038, 0.038, 0.035, 0.036, 0.036, 0.034, 0.038, 0.034, 0.03, 0.04, 0.03, 0.035, 0.033, 0.037, 0.034, 0.034, 0.031, 0.037, 0.037, 0.036, 0.034, 0.032, 0.034, 0.036, 0.031, 0.033, 0.035, 0.036, 0.038, 0.034, 0.036, 0.035, 0.032, 0.035, 0.033, 0.036, 0.034, 0.033, 0.032, 0.035, 0.036, 0.032, 0.034, 0.033, 0.038, 0.034, 0.034, 0.031, 0.035, 0.034, 0.036, 0.03, 0.037, 0.035, 0.035, 0.034, 0.035, 0.034, 0.035, 0.041, 0.037, 0.032, 0.035, 0.033, 0.035, 0.032, 0.033, 0.037, 0.034, 0.035, 0.032, 0.034, 0.035, 0.035, 0.035, 0.033, 0.037, 0.03, 0.033, 0.035, 0.036, 0.032, 0.033, 0.033, 0.038, 0.031, 0.033, 0.035, 0.033, 0.031, 0.034, 0.033, 0.034, 0.03, 0.029, 0.03, 0.032, 0.034, 0.037, 0.033, 0.036, 0.032, 0.035]
[Sep 07, 05:14:07] [0] 		 #> Encoding 64 passages..


1it [00:00,  1.62it/s]
  return torch.load(codes_path, map_location='cpu')
100%|██████████| 1/1 [00:00<00:00, 1235.80it/s]
100%|██████████| 1024/1024 [00:00<00:00, 37803.15it/s]


[Sep 07, 05:14:08] #> Optimizing IVF to store map from centroids to list of pids..
[Sep 07, 05:14:08] #> Building the emb2pid mapping..
[Sep 07, 05:14:08] len(emb2pid) = 10110
[Sep 07, 05:14:08] #> Saved optimized IVF to /kaggle/working/experiments/colbert/indexes/xm/ivf.pid.pt

#> Joined...


In [None]:
searcher_med = ColBERTSearcher(qrels=qrels_df, config=config_med, verbose=-1)
eval_df_med = add_inference(eval_df, searcher_med, k=10)

[Sep 07, 05:14:10] #> Loading codec...
[Sep 07, 05:14:10] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Sep 07, 05:14:10] Loading packbits_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Sep 07, 05:14:11] #> Loading IVF...
[Sep 07, 05:14:11] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 5216.80it/s]

[Sep 07, 05:14:11] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 459.50it/s]
100%|██████████| 64/64 [00:02<00:00, 26.47it/s]


In [None]:
searcher_xm = ColBERTSearcher(qrels=qrels_df, config=config_xm, verbose=-1)
eval_df_xm = add_inference(eval_df, searcher_xm, k=10)

[Sep 07, 05:14:27] #> Loading codec...
[Sep 07, 05:14:27] #> Loading IVF...
[Sep 07, 05:14:27] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 2003.01it/s]

[Sep 07, 05:14:27] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 281.88it/s]
100%|██████████| 64/64 [00:02<00:00, 31.28it/s]


In [None]:
def mean_reciprocal_rank(true_labels, predicted_lists):
    def reciprocal_rank(true_label, predicted_list):
        for i, predicted_id in enumerate(predicted_list, start=1):
            if predicted_id == true_label:
                return 1 / i
        return 0

    total_queries = len(true_labels)
    if total_queries == 0:
        return 0

    rr_sum = sum(reciprocal_rank(true_label, predicted_list)
                 for true_label, predicted_list in zip(true_labels, predicted_lists))

    return rr_sum / total_queries


def load_prediction(_eval_df: pd.DataFrame, collection):
    # raw: processed
    eval_df = _eval_df.copy()
    mapping = {}

    for _, row in eval_df.iterrows():
        raw_idx = collection[row["content"]]
        processed_idx = row["docno"]

        mapping[processed_idx] = raw_idx
        mapping[processed_idx] = raw_idx

    true_labels = [mapping[i] for i in eval_df["docno"].values]

    prediction = eval_df[[f"id_context_{i}" for i in range(1, 11)]].astype(int).values[::10].tolist()
    eval_df["label"] = true_labels

    return true_labels, prediction

In [None]:
sampled_data = qrels_df[qrels_df["is_train"] == False].reset_index(drop=True)
passages = get_unique_ordered_list(sampled_data["content"].tolist())

collection = {item:i for i, item in enumerate(passages)}

true_label_med, pred_med = load_prediction(eval_df_med, collection)

mrr_med = mean_reciprocal_rank(np.array(true_label_med)[::10], pred_med)
print(f'Mean Reciprocal Rank (MRR): {mrr_med}')

Mean Reciprocal Rank (MRR): 0.846875


In [None]:
true_label_xm, pred_xm = load_prediction(eval_df_xm, collection)
mrr_xm = mean_reciprocal_rank(np.array(true_label_xm)[::10], pred_xm)
print(f'Mean Reciprocal Rank (MRR): {mrr_xm}')

Mean Reciprocal Rank (MRR): 0.8165550595238095


In [None]:
eval_df_xm[eval_df_xm["label"] == 1]

Unnamed: 0,qid,header,docno,content,label,is_train,context_1,id_context_1,score_context_1,context_2,...,score_context_7,context_8,id_context_8,score_context_8,context_9,id_context_9,score_context_9,context_10,id_context_10,score_context_10
20,2,What is the optimum temperature for most human...,2,Optimum:mostfavourableAn enzyme is less active...,1,False,Optimum:mostfavourableAn enzyme is less active...,0.0,27.109375,Optimum pH values ofsome enzymes:• amylase (a ...,...,15.664062,The substances on which enzymes act are called...,47.0,15.375000,Figure 4.10 shows the effect of temperature on...,2.0,15.218750,"Enzymes are proteins,and hence are affectedby ...",28.0,15.031250
50,5,Why are inorganic catalysts like manganese(IV)...,5,Biological catalysts are large biological mole...,1,False,Biological catalysts are large biological mole...,10.0,22.953125,Enzymes are made up of protein molecules that ...,...,11.851562,Enzymes alter the rates of chemical reactions ...,38.0,11.351562,All living things are made up of billions of t...,60.0,11.234375,Carbohydrates are needed:• as a substrate for ...,32.0,11.156250
80,8,What enzyme is needed to break down maltose in...,8,Starch in the foods we eat may be digested in ...,1,False,Starch in the foods we eat may be digested in ...,18.0,24.187500,Some food molecules are large and insoluble in...,...,14.593750,Enzymes are classified according to the chemic...,19.0,14.031250,Enzymes are very efficient molecules. Since th...,35.0,13.898438,Enzymes are highly specific in their action. F...,61.0,12.726562
110,11,What do you need to add to food pieces in a te...,11,"Now, look at Figure 3.27. Do you agree with wh...",1,False,"Now, look at Figure 3.27. Do you agree with wh...",1.0,24.812500,"Glucose, fructose and maltose are also known a...",...,18.375000,Biological catalysts are large biological mole...,10.0,18.000000,1 (a)List the chemical elements that make up p...,41.0,17.234375,Optimum pH values ofsome enzymes:• amylase (a ...,25.0,17.125000
140,14,Why is chlorophyll important for plants?,14,•Chloroplastsare oval structures found in plan...,1,False,•Chloroplastsare oval structures found in plan...,29.0,22.875000,Avacuoleis a fluid-filled space enclosed by ap...,...,-1.000000,-,-1.0,-1.000000,-,-1.0,-1.000000,-,-1.0,-1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1790,179,Is milk a plant or animal protein source?,179,Proteins can be found in both animal and plant...,1,False,Proteins can be found in both animal and plant...,48.0,22.453125,Proteins are used in the:• synthesis of new cy...,...,12.343750,Proteins can be detected by the biuret test. T...,8.0,11.632812,"A cell is surrounded by a living, partially pe...",63.0,11.476562,Avacuoleis a fluid-filled space enclosed by ap...,34.0,11.195312
1820,182,"Which digestive enzyme breaks down proteins, a...",182,Some food molecules are large and insoluble in...,1,False,Some food molecules are large and insoluble in...,15.0,21.625000,Starch in the foods we eat may be digested in ...,...,15.382812,Proteins are used in the:• synthesis of new cy...,26.0,15.312500,Biological catalysts are large biological mole...,10.0,14.546875,The substances on which enzymes act are called...,47.0,14.257812
1850,185,Why is it not a good idea to add too much fert...,185,"Turgor, or how turgid cells are, plays an impo...",1,False,"Turgor, or how turgid cells are, plays an impo...",7.0,21.625000,The rate of diffusion can be affected by facto...,...,9.164062,Theethanol emulsion testis a test for the pres...,27.0,8.523438,Enzymes are classified according to the chemic...,19.0,7.636719,1Describe how you would test for reducing suga...,12.0,7.199219
1880,188,What separates the two solutions of different ...,188,• The term ‘water potential’ is always used in...,1,False,• The term ‘water potential’ is always used in...,62.0,24.421875,When a cell is placed in a solution with a low...,...,11.843750,Optimum pH values ofsome enzymes:• amylase (a ...,25.0,9.906250,The substances on which enzymes act are called...,47.0,9.460938,All living things are made up of billions of t...,60.0,8.960938


# 3. RAG

## 3.1. Prompt LLM

In [None]:
llm = HuggingFaceEndpoint(
                repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
                huggingfacehub_api_token="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",
                max_new_tokens=512,
                top_k=10,
                top_p=0.95,
                typical_p=0.95,
                temperature=0.01,
                repetition_penalty=1.03,
                return_full_text=True,
            )

question = """Learning Outcomes• Identify and state the functions of the following cell structures (including organelles) of typicalplant and animal
 cells from diagrams, light micrographs and as seen under the light microscopeusing prepared slides and fresh material treated with an appropriate temporary
 stainingtechnique: cell wall, cell membrane, cytoplasm, nucleus, cell vacuoles (large, sap-filled in plantcells, small, temporary in animal cells), chloroplasts.
 • Identify and state the functions of the following membrane systems and organellesfrom diagrams and electron micrographs: endoplasmic reticulum, Golgi body,mitochondria,
 ribosomes.• Compare the structure of typical animal and plant cells.Figure 1.1RobertHooke observed thatcork is made up oftiny cells.3

 Generate a multi choice question without giving the answer too"""

template = """

Question: {question}

Answer: Let's think step by step. Answer briefly but easy to understand by primary student."""

prompt = PromptTemplate.from_template(template)

llm_chain = prompt | llm
print(llm_chain.invoke({"question": question}))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Question: Learning Outcomes• Identify and state the functions of the following cell structures (including organelles) of typicalplant and animal
 cells from diagrams, light micrographs and as seen under the light microscopeusing prepared slides and fresh material treated with an appropriate temporary
 stainingtechnique: cell wall, cell membrane, cytoplasm, nucleus, cell vacuoles (large, sap-filled in plantcells, small, temporary in animal cells), chloroplasts.
 • Identify and state the functions of the following membrane systems and organellesfrom diagrams and electron micrographs: endoplasmic reticulum, Golgi body,mitochondria,
 ribosomes.• Compare t