# Libraries


### Remember to restart the env after installing ( CTRL + M + .)

In [None]:
# Install the third-party packages used by this notebook (shell commands run by IPython).
# None of the packages is version-pinned, so the resolved versions depend on install time.
!pip install tqdm
!pip install --upgrade pymilvus
!pip install "pymilvus[model]"
!pip install -U einops flash_attn
!pip install beir
!pip install unsloth
!pip install triton xformers
# --no-deps: install these packages without resolving their own dependencies.
!pip install --no-deps trl peft accelerate bitsandbytes
!pip install rouge

### Importing libraries

Flash attention not supported by Turing or older architecture GPUs

In [None]:
# Global switch forwarded as `use_flash_attn` when tokenizers and Jina models are loaded below.
flashAttentionEnable = False

In [None]:
import os
import re
import requests
import torch
import time
import nltk
import pandas                         as pd
import numpy                          as np
import torch.nn.functional            as F
from datasets                         import load_dataset
from beir.datasets.data_loader        import GenericDataLoader
from abc                              import abstractmethod, ABC
from pymilvus                         import MilvusClient, DataType, CollectionSchema, FieldSchema, Collection, db, connections, utility, model
from tqdm                             import tqdm, trange
from transformers                     import AutoModel, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,AutoConfig
from beir                             import util, LoggingHandler
from unsloth                          import FastLanguageModel
from unsloth.chat_templates           import get_chat_template
from transformers                     import GenerationConfig
from beir.retrieval.evaluation        import EvaluateRetrieval
from collections                      import defaultdict
from nltk.translate.bleu_score        import sentence_bleu, SmoothingFunction
from rouge                            import Rouge
from nltk.translate.meteor_score      import meteor_score
from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.metrics.pairwise         import cosine_similarity

# Dataset

Downloads Dictionary for Nltk

In [None]:
# Download the WordNet corpus for nltk (presumably needed by the METEOR metric imported above - confirm).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

## NFCorpus

NFCorpus dataset for benchmarking retrieval

In [None]:
# Download NFCorpus dataset
# The archive is fetched from the BEIR mirror and unpacked under `out_dir`.
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/nfcorpus.zip"
out_dir = "datasets"
data_path = util.download_and_unzip(url, out_dir)

# Load the NFCorpus dataset
# Only the test split is loaded; the loader returns the corpus, the queries and the relevance judgements.
corpus, queries, qrels = GenericDataLoader(data_path).load(split='test')

datasets/nfcorpus.zip:   0%|          | 0.00/2.34M [00:00<?, ?iB/s]

  0%|          | 0/3633 [00:00<?, ?it/s]

## MsMarco

MS MARCO dataset for benchmarking retrieval + generation

In [None]:
# Load the "v1.1" configuration of microsoft/ms_marco; `ds` is indexed by split name further below.
ds = load_dataset("microsoft/ms_marco", "v1.1")

README.md:   0%|          | 0.00/9.48k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/21.4M [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/175M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/10047 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/82326 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/9650 [00:00<?, ? examples/s]

In [None]:
def getMsMarcoDataset(ds, ds_cup=None):
  """Convert the first rows of the MS MARCO validation split into BEIR-style dictionaries.

  Args:
    ds: Mapping with a "validation" entry exposing the columns "passages"
      (rows holding "passage_text" and "is_selected" lists), "query" and "answers".
    ds_cup: Maximum number of validation rows to convert. When omitted, the
      module-level variable `ds_cup` is used, which is how earlier callers
      configured this function.

  Returns:
    A tuple `(corpus, queries, qrels, answers)`:
      corpus  - {"DOC-<n>": {"text": passage}} with passages numbered globally,
      queries - {"PLAIN-<i>": query},
      qrels   - {"PLAIN-<i>": {"DOC-<n>": is_selected}} for passages with is_selected > 0,
      answers - {"PLAIN-<i>": answers}.

  Raises:
    NameError: if `ds_cup` is neither passed nor defined at module level.
  """
  if ds_cup is None:
    # Backward compatibility with callers that set a global before calling.
    try:
      ds_cup = globals()["ds_cup"]
    except KeyError:
      raise NameError("name 'ds_cup' is not defined; pass ds_cup explicitly") from None

  # Read each column once; slicing also caps the work at the rows that actually exist,
  # so a cap larger than the split no longer raises IndexError.
  validation = ds["validation"]
  passages = validation["passages"][:ds_cup]
  queries = validation["query"][:ds_cup]
  answers = validation["answers"][:ds_cup]

  msMarcoCorpus, msMarcoQueries, msMarcoQrels, msMarcoAnswers = {}, {}, {}, {}
  docCnt = 0  # global index of the first passage belonging to the current query
  for i, (row, query, answer) in enumerate(zip(passages, queries, answers)):
    queryId = f"PLAIN-{i}"
    texts = row["passage_text"]
    weights = row["is_selected"]  # assumed to be aligned one-to-one with `texts`
    for j, text in enumerate(texts):
      msMarcoCorpus[f"DOC-{docCnt + j}"] = {"text": text}
    msMarcoQrels[queryId] = {
        f"DOC-{docCnt + j}": weight for j, weight in enumerate(weights) if weight > 0
    }
    msMarcoQueries[queryId] = query
    msMarcoAnswers[queryId] = answer
    docCnt += len(texts)
  return msMarcoCorpus, msMarcoQueries, msMarcoQrels, msMarcoAnswers

In [None]:
# Number of validation rows to convert; getMsMarcoDataset reads this module-level name.
ds_cup = 1000
msMarcoCorpus, msMarcoQueries, msMarcoQrels, msMarcoAnswers = getMsMarcoDataset(ds)

# Milvus database

Milvus class implementation to access, upload to, and delete from the cloud vector database

Parent Milvus Manager - Children for Dense or Sparse Embeddings

In [None]:
class BaseMilvus:
  """Shared Milvus helper: owns a `MilvusClient` and wraps search, insert, drop and list calls."""

  def __init__(self, cluster_endpoint, token):
    """Store the connection settings and open a client against `cluster_endpoint`."""
    self.cluster_endpoint = cluster_endpoint
    self.token = token
    self.client = MilvusClient(uri=cluster_endpoint, token=token)

  def query(self, query_vector,top_k, collection ):
    """Search `collection` with one vector and return the client's raw result.

    The search runs on the "embedding" field, is limited to `top_k` hits and
    asks for the "text" field of each hit.
    """
    search_args = dict(
      collection_name=collection,
      anns_field="embedding",
      data=[query_vector],
      limit=top_k,
      output_fields=["text"],
    )
    return self.client.search(**search_args)

  def add_elements(self, collection_name, batch_ids, batch_embeddings, batch_texts):
    """Insert rows built from the three parallel sequences; only prints a notice if the collection is missing."""
    if not self.client.has_collection(collection_name):
      print(f"Collection '{collection_name}' does not exist.")
      return
    rows = []
    for row_id, vector, passage in zip(batch_ids, batch_embeddings, batch_texts):
      rows.append({'id': row_id, 'embedding': vector, 'text': passage})
    self.client.insert(collection_name=collection_name, data=rows)

  def drop_collection(self, collection_name):
    """Drop the collection when it exists and print what happened either way."""
    if not self.client.has_collection(collection_name):
      print(f"Collection '{collection_name}' does not exist.")
      return
    self.client.drop_collection(collection_name)
    print(f"Collection '{collection_name}' dropped successfully.")

  def list_collections(self):
    """Return the client's list of collections."""
    return self.client.list_collections()

In [None]:
class DenseMilvusManager(BaseMilvus):
  """Milvus manager for collections whose "embedding" field is a dense float vector."""

  def __init__(self, cluster_endpoint, token):
    """Connect through the base class; no extra state is kept."""
    super().__init__(cluster_endpoint, token)

  def create_collection(self, collection_name, vec_dim, max_text_length=1000):
    """Create a collection with id / embedding / text fields and a COSINE AUTOINDEX.

    Prints a notice and returns None without creating anything when a
    collection named `collection_name` already exists.
    """
    # Schema: string primary key, dense vector of `vec_dim` dimensions, and the chunk text.
    id_field = FieldSchema(name="id", dtype=DataType.VARCHAR, max_length=64, is_primary=True, auto_id=False)
    vector_field = FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=vec_dim)
    text_field = FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=max_text_length)
    schema = CollectionSchema(
        [id_field, vector_field, text_field],
        description=f"Collection for {collection_name}",
    )

    already_there = self.client.has_collection(collection_name)
    if already_there:
        print(f"Collection '{collection_name}' already exists.")
        return None

    # Index definition for the vector field.
    index_params = self.client.prepare_index_params()
    index_params.add_index(field_name="embedding", index_type="AUTOINDEX", metric_type="COSINE")

    creation_args = dict(
      collection_name=collection_name,
      dimension=vec_dim,
      schema=schema,
      index_params=index_params,
      consistency_level="Strong",
    )
    self.client.create_collection(**creation_args)
    print(f"Collection '{collection_name}' successfully created.")

In [None]:
class SparseMilvusManager(BaseMilvus):
  """Milvus manager for collections whose "embedding" field is a sparse float vector."""

  def __init__(self, cluster_endpoint, token):
    """Connect through the base class; no extra state is kept."""
    super().__init__(cluster_endpoint, token)

  def create_collection(self, collection_name, max_text_length=1000):
    """Create a collection with id / sparse embedding / text fields and an inverted index.

    Args:
      collection_name: Name of the collection to create.
      max_text_length: Maximum length of the VARCHAR "text" field.

    Returns:
      None. Prints a notice and creates nothing when the collection already exists.
    """
    fields = [
        FieldSchema(name="id", dtype=DataType.VARCHAR,max_length=64, is_primary=True, auto_id=False),
        FieldSchema(name="embedding", dtype=DataType.SPARSE_FLOAT_VECTOR),
        FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=max_text_length),
    ]
    # The collection name is passed as the second positional argument of CollectionSchema
    # (presumably its description - confirm against the pymilvus signature).
    schema = CollectionSchema(fields, collection_name)

    # Check if the collection already exists
    if self.client.has_collection(collection_name):
        print(f"Collection '{collection_name}' already exists.")
        return None

    # Create index on embedding field
    index_params = self.client.prepare_index_params()
    index_params.add_index(
        field_name="embedding",
        index_type="SPARSE_INVERTED_INDEX",
        metric_type="IP"                      # only available one for bm25
    )

    # Create a collection according to Schema.
    # Must go through self.client: a bare `client` is not defined in this scope.
    self.client.create_collection(
        collection_name=collection_name,
        schema=schema,
        index_params=index_params,
        consistency_level="Strong"
    )
    print(f"Collection '{collection_name}' successfully created.")

# In the following code we will test/benchmark the following:
- Late vs Early chunking: We will evaluate the performance of two chunking techniques: early chunking and late chunking. The early chunking method, which is the standard approach, involves dividing long documents into chunks before embedding them. In contrast, the late chunking method is a novel approach that embeds the document at the token level first, then chunks it, and finally pools the tokens within each chunk into a single embedding.
- The above techniques will be tested with different embedding models: StellaV5, JinaV3, JinaV2 and BGE-M3
- It will be tested with different text segmenter/chunker(Fixed-size, Semantic and Dynamic)
- The Dynamic segmenting model will use Small Language Models presented by JinaAI


Links:
-
[LateVsEarlyPart1](https://jina.ai/news/what-late-chunking-really-is-and-what-its-not-part-ii/)
-
[LateVsEarlyPart2](https://jina.ai/news/late-chunking-in-long-context-embedding-models/)
-
[Dynamic segmenter](https://jina.ai/news/finding-optimal-breakpoints-in-long-documents-using-small-language-models/)





# Segmenter-Tokenizer-Embedding Models:

## Segmenters

Segmenters to chunk the documents before embedding

### Jina-AI  SEMANTIC-Chunking



The following segmenter uses the jina segmenter API to segment the text, in particular it's regex-based

In [None]:
class JinaSegmenter():
  """Client for the Jina segmenter HTTP API, which splits a text into chunks.

  The API key is no longer embedded in the source. It is taken from the
  `api_key` argument or, when that is omitted, from the `JINA_API_KEY`
  environment variable. The key that used to be committed here should be
  treated as leaked and revoked.
  """

  def __init__(self, size=100, api_key=None):
      """Prepare the endpoint URL and request headers.

      Args:
        size: Unused; kept so existing callers that pass it keep working.
        api_key: Jina API key. Falls back to the JINA_API_KEY environment
          variable. Without a key no Authorization header is sent; if the
          service rejects the request, `segment` raises with its status code.
      """
      self.base_url = 'https://tokenize.jina.ai/'
      self.headers = {'Content-Type': 'application/json'}
      key = api_key if api_key is not None else os.environ.get("JINA_API_KEY")
      if key:
          self.headers['Authorization'] = f'Bearer {key}'

  def segment(self, input_text, max_chunk_length=512):
      """Send `input_text` to the API and return the list of chunks from the response.

      Args:
        input_text: Text to split.
        max_chunk_length: Forwarded as "max_chunk_length" in the request payload.

      Returns:
        The "chunks" entry of the JSON response, or an empty list when absent.

      Raises:
        Exception: if the response status code is not 200.
        requests.exceptions.RequestException: on connection problems or timeout.
      """
      payload = {
          "content": input_text,
          "return_chunks": True,
          "max_chunk_length": max_chunk_length,
          "return_tokens": False,
      }

      # A timeout keeps a stalled connection from blocking the whole upload loop forever.
      response = requests.post(self.base_url, json=payload, headers=self.headers, timeout=60)

      # Handle errors
      if response.status_code != 200:
          raise Exception(f"API request failed with status code {response.status_code}: {response.text}")

      return response.json().get("chunks", [])


### FIXED-SIZE Chunking


The following chunks the text with fixed length

In [None]:
class FixedChunking():
  """Character-based chunker: cuts a text into consecutive windows of `size` characters."""

  def __init__(self, size=100):
    """Remember the window width, in characters."""
    self.size = size

  def segment(self, text ):
    """Return the windows of `text` in order; the last one may be shorter than `size`."""
    width = self.size
    pieces = []
    for start in range(0, len(text), width):
      pieces.append(text[start:start + width])
    return pieces

### Dynamic window chunking


The following models segment the text using Small Language Models. In particular, each model:
- 1) Is designed to identify boundaries based on the structural elements of the document.
- 2) Identifies topics within the text, such as "the beginning of the Second World War," and uses these topics to define segment boundaries.
- 3) Not only identifies text boundaries but also generates a summary for each segment.

In [None]:
class QuenChunker():
  """Language-model-driven chunker.

  The model is prompted to list the beginning ("head") of every chunk; the
  heads are then located in the text to cut it. Whenever no head is produced
  or any head cannot be found, a fixed 512-character split is used instead.
  """

  def __init__(self, model_name, prompt_template,
               regex, max_chunk_length=512,
               max_seq_length = 8192,
               max_new_tokens = 1024,
               size=100):
    """Load the 4-bit model and tokenizer and prepare the generation settings.

    `max_chunk_length` and `size` are accepted but not used by this class.
    """
    self.prompt_template = prompt_template
    self.regex = regex
    loaded = FastLanguageModel.from_pretrained(
        model_name = model_name,
        max_seq_length = max_seq_length,
        dtype = None,
        load_in_4bit = True,
    )
    self.model, self.tokenizer = loaded
    FastLanguageModel.for_inference(self.model)
    generation_kwargs = dict(max_length=max_seq_length, max_new_tokens=max_new_tokens)
    self.gen_config = GenerationConfig.from_pretrained(
        "unsloth/Qwen2-0.5B-Instruct-bnb-4bit",
        **generation_kwargs,
    )
    # Fallback splitter used when the generated heads cannot be mapped back onto the text.
    self.fixChunk = FixedChunking(size=512)

  def extract_chunks(self, text, chunk_headers_raw):
    """Cut `text` at the chunk heads found in the model output `chunk_headers_raw`.

    Every regex capture loses its last 10 characters before being used as a head.
    Returns the fixed-size split of `text` if there are no heads or if any head
    (or consecutive pair of heads) cannot be located.
    """
    heads = [captured[:-10] for captured in re.findall(self.regex, chunk_headers_raw)]
    if not heads:
      return self.fixChunk.segment(text)

    pieces = []
    every_head_found = True
    # Text between two consecutive heads forms one chunk (case-insensitive lookup).
    for head, following in zip(heads, heads[1:]):
      between = re.escape(head) + "(.*?)" + re.escape(following)
      hit = re.search(between, text, re.DOTALL | re.IGNORECASE)
      if hit is None:
        every_head_found = False
      else:
        pieces.append(head + hit.group(1).strip())

    # The final head owns everything up to the end of the text (case-sensitive lookup).
    final_head = heads[-1]
    tail = re.search(re.escape(final_head) + "(.*)", text, re.DOTALL)
    if tail is None or not every_head_found:
      return self.fixChunk.segment(text)
    pieces.append(final_head + tail.group(1).strip())
    return pieces

  def segment(self, text):
    """Collapse whitespace in `text`, ask the model for chunk heads and return the chunks."""
    text = re.sub(r'\s+', " ", text.replace("\n", " ")).strip()
    encoded = self.tokenizer(self.prompt_template.format(text), return_tensors='pt')
    input_ids = encoded['input_ids'].cuda()
    attention_mask = encoded['attention_mask'].cuda()
    with torch.inference_mode():
      generated = self.model.generate(
          input_ids=input_ids,
          attention_mask=attention_mask,
          generation_config=self.gen_config,
      )
    # Decode only the newly generated tokens, i.e. everything after the prompt.
    prompt_length = len(input_ids[0])
    answer = self.tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)
    return self.extract_chunks(text, answer)

#### Simple-qwen-0.5

In [None]:
class SimpleQwenChunker(QuenChunker):
  """Chunker backed by jinaai/text-seg-lm-qwen2-0.5b, prompted to list only chunk heads."""

  def __init__(self, size=100):
    """Configure the base chunker with the simple-chunking prompt and the "CHUNK n:" regex."""
    checkpoint = "jinaai/text-seg-lm-qwen2-0.5b"
    # Instruction prompt; `{}` is replaced by the text to split.
    prompt = """
    Below is an instruction that describes a task, paired with an input. Write a response that appropriately completes the request.
    ### Instruction:
    Split the given text into chunks. Use the format "CHUNK [index]: [head]" to respond, where "[index]" is the index of each chunk and "[head]" is the beginning of each chunk (up to 50 characters).
    ### Input:
    {}
    ### Response:
    """.lstrip()
    # Captures the head that follows each "CHUNK <n>:" marker in the model output.
    head_pattern = r'CHUNK \d+:\s*(.*)'
    super().__init__(model_name=checkpoint, prompt_template=prompt, regex=head_pattern, size=size)

#### Topic-qwen-0.5

In [None]:
class TopicQwenChunker(QuenChunker):
  """Chunker backed by the topic-chunking Qwen2 0.5B model: topics are listed first, then chunk heads."""

  def __init__(self, size=100):
    """Configure the base chunker with the topic-chunking prompt and the "CHUNK n:" regex."""
    checkpoint = "jinaai/text-seg-lm-qwen2-0.5b-cot-topic-chunking"
    # Instruction prompt; `{}` is replaced by the text to split.
    prompt = """
    Below is an instruction that describes a task, paired with an input. Write a response that appropriately completes the request.
    ### Instruction:
    Identify the topics in the given text in this format:
    ### TOPICS:
    TOPIC [index]: [topic]
    Then, split the text into chunks in this format:
    ### CHUNKS:
    CHUNK [index]: [chunk_head]
    ...
    Pay attention to these details:
    1. Topics should short and concise.
    2. Chunk heads should be the begining text of each chunk, up to 50 characters long.

    The topics and chunks should be in the same order they appear in the original text.
    ### Input:
    {}
    ### Response:
    """.lstrip()
    # Only the "CHUNK <n>:" lines are extracted; the "TOPIC" lines are ignored by this pattern.
    head_pattern = r'CHUNK \d+:\s*(.*)'
    super().__init__(model_name=checkpoint, prompt_template=prompt, regex=head_pattern, size=size)

#### Summary-qwen-0.5

In [None]:
class SummaryQwenChunker(QuenChunker):
  """Chunker backed by the summary-chunking Qwen2 0.5B model: each chunk gets a summary and a head."""

  def __init__(self, size=100):
    """Configure the base chunker with the summary-chunking prompt and the "HEAD:" regex."""
    checkpoint = "jinaai/text-seg-lm-qwen2-0.5b-summary-chunking"
    # Instruction prompt; `{}` is replaced by the text to split.
    prompt = """
    Below is an instruction that describes a task, paired with an input. Write a response that appropriately completes the request.
    ### Instruction:
    Split the given text into chunks and generate a summary for each chunk.
    Respond in the following format:
    CHUNK 0
    SUMMARY: [chunk_summary]
    HEAD: [chunk_head]
    CHUNK 1
    SUMMARY: [chunk_summary]
    HEAD: [chunk_head]
    ... and so on ...
    Pay attention to these details:
    1. Summaries should be one sentence long.
    2. Chunk heads should be the begining text of each chunk, up to 50 characters long.
    ### Input:
    {}
    ### Response:
    """.lstrip()
    # Only the "HEAD:" lines are extracted; the generated summaries are not used for splitting.
    head_pattern = r'HEAD: (.*)'
    super().__init__(model_name=checkpoint, prompt_template=prompt, regex=head_pattern, size=size)

## Tokenizer

The following are Tokenizer used to tokenize the text before embedding.

### StellaV5


In [None]:
class StellaTokenizerV5():
  """Tokenizer wrapper for dunzhang/stella_en_1.5B_v5 that prepares late-chunking inputs."""

  def __init__(self):
    """Load the pretrained tokenizer for the Stella v5 checkpoint."""
    self.model_id = "dunzhang/stella_en_1.5B_v5"
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, use_flash_attn=flashAttentionEnable)

  def get_tokenizer(self):
    """Return the underlying tokenizer object."""
    return self.tokenizer

  def tokenize(self, text):
    """Tokenize chunks into one concatenated sequence plus the token span of each chunk.

    Args:
      text: A list of chunk strings, or a single string (handled as a one-chunk list).

    Returns:
      A tuple `(tokens, annotations)`: `tokens` is a LongTensor of shape
      (1, total_tokens) and `annotations` is `[[(start, end), ...]]` with one
      half-open span per chunk, indexing into `tokens`.

    Raises:
      ValueError: if `text` is neither a string nor a list.
    """
    if isinstance(text, str):
      # Previously this branch returned a nested Python list and the empty span (1, 1),
      # which the late-chunking code cannot consume; a string is now one chunk.
      text = [text]
    elif not isinstance(text, list):
      raise ValueError("Input must be a string or a list of strings")

    tokens_conc = []
    annotations = []  # store positions of texts within the sequence
    for txt in text:
      # The first and last ids of every chunk are dropped; this assumes the tokenizer
      # wraps each input in exactly one leading and one trailing special token - TODO confirm.
      tokens = self.tokenizer(txt)['input_ids'][1:-1]
      start = len(tokens_conc)
      tokens_conc += tokens
      annotations.append((start, start + len(tokens)))
    return torch.tensor([tokens_conc], dtype=torch.long), [annotations]

### Jina-AI-V3

In [None]:
class JinaTokenizerV3():
  """Tokenizer wrapper for jinaai/jina-embeddings-v3 that prepares late-chunking inputs."""

  def __init__(self):
    """Load the pretrained tokenizer for the Jina embeddings v3 checkpoint."""
    self.model_id = "jinaai/jina-embeddings-v3"
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, use_flash_attn=flashAttentionEnable)

  def get_tokenizer(self):
    """Return the underlying tokenizer object."""
    return self.tokenizer

  def tokenize(self, text):
    """Tokenize chunks into one concatenated sequence plus the token span of each chunk.

    Args:
      text: A list of chunk strings, or a single string (handled as a one-chunk list).

    Returns:
      A tuple `(tokens, annotations)`: `tokens` is a LongTensor of shape
      (1, total_tokens) and `annotations` is `[[(start, end), ...]]` with one
      half-open span per chunk, indexing into `tokens`.

    Raises:
      ValueError: if `text` is neither a string nor a list.
    """
    if isinstance(text, str):
      # Previously this branch returned a nested Python list and the empty span (1, 1),
      # which the late-chunking code cannot consume; a string is now one chunk.
      text = [text]
    elif not isinstance(text, list):
      raise ValueError("Input must be a string or a list of strings")

    tokens_conc = []
    annotations = []  # store positions of texts within the sequence
    for txt in text:
      # The first and last ids of every chunk are dropped; this assumes the tokenizer
      # wraps each input in exactly one leading and one trailing special token - TODO confirm.
      tokens = self.tokenizer(txt)['input_ids'][1:-1]
      start = len(tokens_conc)
      tokens_conc += tokens
      annotations.append((start, start + len(tokens)))
    return torch.tensor([tokens_conc], dtype=torch.long), [annotations]

### Jina-AI-V2-Colbert

In [None]:
class JinaTokenizerV2():
  """Tokenizer wrapper for jinaai/jina-colbert-v2 that prepares late-chunking inputs."""

  def __init__(self):
    """Load the pretrained tokenizer for the Jina ColBERT v2 checkpoint."""
    self.model_id = "jinaai/jina-colbert-v2"
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, use_flash_attn=flashAttentionEnable)

  def get_tokenizer(self):
    """Return the underlying tokenizer object."""
    return self.tokenizer

  def tokenize(self, text):
    """Tokenize chunks into one concatenated sequence plus the token span of each chunk.

    Args:
      text: A list of chunk strings, or a single string (handled as a one-chunk list).

    Returns:
      A tuple `(tokens, annotations)`: `tokens` is a LongTensor of shape
      (1, total_tokens) and `annotations` is `[[(start, end), ...]]` with one
      half-open span per chunk, indexing into `tokens`.

    Raises:
      ValueError: if `text` is neither a string nor a list.
    """
    if isinstance(text, str):
      # Previously this branch returned a nested Python list and the empty span (1, 1),
      # which the late-chunking code cannot consume; a string is now one chunk.
      text = [text]
    elif not isinstance(text, list):
      raise ValueError("Input must be a string or a list of strings")

    tokens_conc = []
    annotations = []  # store positions of texts within the sequence
    for txt in text:
      # The first and last ids of every chunk are dropped; this assumes the tokenizer
      # wraps each input in exactly one leading and one trailing special token - TODO confirm.
      tokens = self.tokenizer(txt)['input_ids'][1:-1]
      start = len(tokens_conc)
      tokens_conc += tokens
      annotations.append((start, start + len(tokens)))
    return torch.tensor([tokens_conc], dtype=torch.long), [annotations]

### BGE-M3


In [None]:
class BgeTokenizer():
  """Tokenizer wrapper for BAAI/bge-m3 that prepares late-chunking inputs."""

  def __init__(self):
    """Load the pretrained tokenizer for the BGE-M3 checkpoint."""
    self.model_id = 'BAAI/bge-m3'
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, use_flash_attn=flashAttentionEnable)

  def get_tokenizer(self):
    """Return the underlying tokenizer object."""
    return self.tokenizer

  def tokenize(self, text):
    """Tokenize chunks into one concatenated sequence plus the token span of each chunk.

    Args:
      text: A list of chunk strings, or a single string (handled as a one-chunk list).

    Returns:
      A tuple `(tokens, annotations)`: `tokens` is a LongTensor of shape
      (1, total_tokens) and `annotations` is `[[(start, end), ...]]` with one
      half-open span per chunk, indexing into `tokens`.

    Raises:
      ValueError: if `text` is neither a string nor a list.
    """
    if isinstance(text, str):
      # Previously this branch returned a nested Python list and the empty span (1, 1),
      # which the late-chunking code cannot consume; a string is now one chunk.
      text = [text]
    elif not isinstance(text, list):
      raise ValueError("Input must be a string or a list of strings")

    tokens_conc = []
    annotations = []  # store positions of texts within the sequence
    for txt in text:
      # The first and last ids of every chunk are dropped; this assumes the tokenizer
      # wraps each input in exactly one leading and one trailing special token - TODO confirm.
      tokens = self.tokenizer(txt)['input_ids'][1:-1]
      start = len(tokens_conc)
      tokens_conc += tokens
      annotations.append((start, start + len(tokens)))
    return torch.tensor([tokens_conc], dtype=torch.long), [annotations]

## Embedder

The following embedders are ranked on the MTEB benchmark leaderboard (retrieval subcategory) as:
- StellaV5 as 5th
- jina-embeddings-v3 as 62nd
- jina-embeddings-v2 as 122nd
- BGE-M3 as 207th

### Stella

In [None]:
class StellaEmbedderV5():
  """Stella v5 (1.5B) embedder that applies the checkpoint's 1024-dimensional dense head.

  The dense head lives in a sub-folder of the model repository and is fetched
  with a sparse git checkout before the model itself is loaded.
  """

  # Where the projection head is fetched from and stored.
  _REPO_URL = "https://huggingface.co/dunzhang/stella_en_1.5B_v5"
  _REPO_DIR = "/content/stella_en_1.5B_v5"
  _HEAD_SUBDIR = "2_Dense_1024"

  def __init__(self):
    """Fetch the dense head, load model and tokenizer, and move everything to the GPU."""
    head_weights = self._fetch_projection_head()
    self.model_id = 'dunzhang/stella_en_1.5B_v5'
    self.model = AutoModel.from_pretrained(self.model_id, trust_remote_code=True).cuda()
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
    self.vector_linear = torch.nn.Linear(in_features=self.model.config.hidden_size, out_features=1024)
    # The stored keys carry a "linear." prefix that torch.nn.Linear does not expect.
    vector_linear_dict = {
            k.replace("linear.", ""): v for k, v in
            torch.load(head_weights).items()}
    self.vector_linear.load_state_dict(vector_linear_dict)
    self.vector_linear.cuda()
    self.model.eval()

  def _fetch_projection_head(self):
    """Sparse-checkout the dense-head folder of the model repo and return the weights path.

    Runs the same git commands the notebook used to run through `!`/`%cd` magics,
    but as plain subprocess calls: the class is valid Python, nothing is cloned
    again when the weights are already on disk, the working directory of the
    kernel is left untouched, and a failing git command raises
    subprocess.CalledProcessError instead of being ignored.
    """
    import subprocess  # local import: only needed for this one-off download

    weights_path = os.path.join(self._REPO_DIR, self._HEAD_SUBDIR, "pytorch_model.bin")
    if os.path.exists(weights_path):
      return weights_path
    if not os.path.isdir(os.path.join(self._REPO_DIR, ".git")):
      subprocess.run(
          ["git", "clone", "--filter=blob:none", "--no-checkout", self._REPO_URL, self._REPO_DIR],
          check=True,
      )
    for git_args in (["sparse-checkout", "init"],
                     ["sparse-checkout", "set", self._HEAD_SUBDIR],
                     ["checkout"]):
      subprocess.run(["git", *git_args], cwd=self._REPO_DIR, check=True)
    return weights_path

  def get_model(self):
    """Return the underlying transformer model."""
    return self.model

  def encode(self, chunks, task, max_length=512):
    """Embed every chunk on its own (early chunking).

    Args:
      chunks: A list of strings, or a single string. A single string is
        prefixed with the web-search query instruction before encoding.
      task: When equal to 'retrieval.query' the flat list of vectors is
        returned; otherwise the list is wrapped in an outer list.
      max_length: Accepted for interface compatibility; not used here.

    Returns:
      float32 numpy vectors, one per chunk, L2-normalised after the dense head.
    """
    if type(chunks) == str:
      query_prompt = "Instruct: Given a web search query, retrieve relevant passages that answer the query.\nQuery: "
      chunks = [query_prompt + chunks]
    chunks_embeddings = []
    for chunk in chunks:
      tokens = self.tokenizer(chunk, return_tensors='pt')['input_ids'].to("cuda")
      with torch.no_grad():
        # One sequence per call, so there is no padding: the mask is all ones.
        attention_mask = torch.ones_like(tokens)
        token_emb = self.model(input_ids=tokens,attention_mask=attention_mask)[0]
        token_embeddings = token_emb.masked_fill(~attention_mask[..., None].bool(), 0.0)
        # Mean over the token axis.
        pooled_embeddings = token_embeddings.sum(dim=1) / token_embeddings.size(1)
        pooled_embeddings = F.normalize(self.vector_linear(pooled_embeddings), p=2, dim=1).squeeze(0)
      chunks_embeddings.append(pooled_embeddings.to(dtype=torch.float32).detach().cpu().numpy())
    if task == 'retrieval.query':
      return chunks_embeddings
    else:
      return [chunks_embeddings]

  def encodeLateChunking(self, text_token, span_annotation, task, max_length = None ):
    """Embed the full token sequence once, then mean-pool each annotated span (late chunking).

    Args:
      text_token: Token id tensor fed to the model as `input_ids`.
      span_annotation: For every sequence in `text_token`, a list of
        half-open (start, end) token spans.
      task: Accepted for interface compatibility; not used here.
      max_length: When given, spans starting at or after `max_length - 1` are
        dropped and span ends are clipped to `max_length - 1`.

    Returns:
      One list per sequence holding a float32 numpy vector for every span of
      at least one token; empty spans are skipped.
    """
    with torch.no_grad():
      attention_mask = torch.ones_like(text_token)
      token_emb1 = self.model(input_ids=text_token,attention_mask=attention_mask)[0]
      token_embeddings = token_emb1.masked_fill(~attention_mask[..., None].bool(), 0.0)
      outputs = []
      for embeddings, annotations in zip(token_embeddings, span_annotation):
        if max_length is not None:  # remove annotations which go beyond the max-length of the model
          annotations = [
              (start, min(end, max_length - 1))
              for (start, end) in annotations
              if start < (max_length - 1)
          ]
        pooled_embeddings = []
        for (start, end) in annotations:
          if (end - start) < 1:
            continue
          span_mean = embeddings[start:end].sum(dim=0) / (end - start)
          projected = F.normalize(self.vector_linear(span_mean), p=2, dim=0)
          pooled_embeddings.append(projected.to(dtype=torch.float32).detach().cpu().numpy())
        outputs.append(pooled_embeddings)
    return outputs

### JINA-AI-V3

In [None]:
class JinaEmbedderV3():
  """Embedder around jinaai/jina-embeddings-v3 supporting early and late chunking."""

  def __init__(self):
    """Load the model with its remote code and move it to the GPU."""
    self.model_id = "jinaai/jina-embeddings-v3"
    self.model = AutoModel.from_pretrained(self.model_id, trust_remote_code=True, use_flash_attn=flashAttentionEnable).cuda()

  def get_model(self):
    """Return the underlying model."""
    return self.model

  def encode(self, text, task, device="cuda", normalize_embeddings=True, max_length=512):
    """Embed `text` through the model's own `encode` method (early chunking).

    All arguments are forwarded to `self.model.encode`; its result is
    returned wrapped in a one-element list.
    """
    with torch.no_grad():
      return [self.model.encode(sentences=text,
                               device=device,
                               normalize_embeddings=normalize_embeddings,
                               task=task,
                               max_length=max_length)]

  def encodeLateChunking(self, text_token, span_annotation, task, max_length = None ):
    """Embed the full token sequence once, then mean-pool each annotated span (late chunking).

    Args:
      text_token: Token id tensor fed to the model as `input_ids`.
      span_annotation: For every sequence in `text_token`, a list of
        half-open (start, end) token spans.
      task: Key into the model's `_adaptation_map`, selecting the adapter.
      max_length: When given, spans starting at or after `max_length - 1` are
        dropped and span ends are clipped to `max_length - 1`.

    Returns:
      One list per sequence holding an L2-normalised float32 numpy vector for
      every span of at least one token; empty spans are skipped.
    """
    task_id = self.model._adaptation_map[task]
    # Build the adapter mask on the same device as the token ids rather than
    # always on the CPU, so model inputs never live on two devices.
    adapter_mask = torch.full((len(text_token),), task_id, dtype=torch.int32,
                              device=text_token.device)
    with torch.no_grad():
      token_embeddings = self.model(input_ids=text_token,
                                    adapter_mask=adapter_mask)[0]
    outputs = []
    for embeddings, annotations in zip(token_embeddings, span_annotation):
      if max_length is not None:  # remove annotations which go beyond the max-length of the model
        annotations = [
            (start, min(end, max_length - 1))
            for (start, end) in annotations
            if start < (max_length - 1)
        ]
      pooled_embeddings = []
      for (start, end) in annotations:
        if (end - start) < 1:
          continue
        span_mean = embeddings[start:end].sum(dim=0) / (end - start)
        unit = F.normalize(span_mean.unsqueeze(-1), p=2, dim=0).squeeze(-1)
        pooled_embeddings.append(unit.to(dtype=torch.float32).detach().cpu().numpy())
      outputs.append(pooled_embeddings)
    return outputs

### Jina-AI-V2-Colbert

In [None]:
class JinaEmbedderV2():
  """Embedder around jinaai/jina-colbert-v2 supporting early and late chunking."""

  def __init__(self):
    """Load the model with its remote code and move it to the GPU."""
    self.model_id = "jinaai/jina-colbert-v2"
    self.model = AutoModel.from_pretrained(self.model_id, trust_remote_code=True, use_flash_attn=flashAttentionEnable).cuda()

  def get_model(self):
    """Return the underlying model."""
    return self.model

  def encode(self, text, task, device="cuda", normalize_embeddings=True, max_length=512):
    """Forward all arguments to `self.model.encode` and wrap its result in a one-element list."""
    encode_kwargs = dict(
        sentences=text,
        device=device,
        normalize_embeddings=normalize_embeddings,
        task=task,
        max_length=max_length,
    )
    with torch.no_grad():
      return [self.model.encode(**encode_kwargs)]

  def encodeLateChunking(self, text_token, span_annotation, task, max_length = None ):
    """Embed the full token sequence once, then mean-pool and L2-normalise each span.

    `task` is accepted but not used. With `max_length` set, spans starting at or
    after `max_length - 1` are dropped and the others are clipped to that bound.
    Returns one list of float32 numpy vectors per input sequence.
    """
    with torch.no_grad():
      token_embeddings = self.model(input_ids=text_token)[0]

    results = []
    for sequence_embeddings, spans in zip(token_embeddings, span_annotation):
      if max_length is not None:
        limit = max_length - 1
        spans = [(first, min(last, limit)) for (first, last) in spans if first < limit]

      span_vectors = []
      for (first, last) in spans:
        width = last - first
        if width < 1:
          continue  # nothing to pool for an empty span
        mean_vector = sequence_embeddings[first:last].sum(dim=0) / width
        unit_vector = F.normalize(mean_vector.unsqueeze(-1), p=2, dim=0).squeeze(-1)
        span_vectors.append(unit_vector.to(dtype=torch.float32).detach().cpu().numpy())
      results.append(span_vectors)
    return results

### BGE-m3

In [None]:
class BgeEmbedder():
  """Embedder around BAAI/bge-m3 supporting early and late chunking with mean pooling."""

  def __init__(self):
    """Load model and tokenizer, move the model to the GPU and switch it to eval mode."""
    self.model_id = 'BAAI/bge-m3'
    self.model = AutoModel.from_pretrained(self.model_id, trust_remote_code=True).cuda()
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
    self.model.eval()

  def get_model(self):
    """Return the underlying model."""
    return self.model

  def encode(self, chunks, task, max_length=512):
    """Embed every chunk on its own (early chunking).

    Args:
      chunks: A list of strings, or a single string (treated as one chunk).
      task: When equal to 'retrieval.query' the flat list of vectors is
        returned; otherwise the list is wrapped in an outer list.
      max_length: Accepted for interface compatibility; not used here.

    Returns:
      L2-normalised float32 numpy vectors, one per chunk.
    """
    if type( chunks) == str:
      chunks = [chunks]
    # Follow the model instead of assuming "cuda"; __init__ puts the model on the GPU.
    device = self.model.device
    chunks_embeddings = []
    for chunk in chunks:
      tokens = self.tokenizer(chunk, return_tensors='pt')['input_ids'].to(device)
      with torch.no_grad():
        token_embeddings = self.model(input_ids=tokens)[0]
        # Mean over the token axis. The divisor used to be len(token_embeddings),
        # i.e. the batch size (always 1), which made this a sum rather than a mean.
        pooled_embeddings = token_embeddings.sum(dim=1) / token_embeddings.size(1)
        pooled_embeddings = F.normalize(pooled_embeddings, p=2, dim=1).squeeze(0)
      chunks_embeddings.append(pooled_embeddings.to(dtype=torch.float32).detach().cpu().numpy())
    if task == 'retrieval.query':
      return chunks_embeddings
    else:
      return [chunks_embeddings]

  def encodeLateChunking(self, text_token, span_annotation, task, max_length = None ):
    """Embed the full token sequence once, then mean-pool each annotated span (late chunking).

    Args:
      text_token: Token id tensor fed to the model as `input_ids`.
      span_annotation: For every sequence in `text_token`, a list of
        half-open (start, end) token spans.
      task: Accepted for interface compatibility; not used here.
      max_length: When given, spans starting at or after `max_length - 1` are
        dropped and span ends are clipped to `max_length - 1`.

    Returns:
      One list per sequence holding an L2-normalised float32 numpy vector for
      every span of at least one token; empty spans are skipped.
    """
    with torch.no_grad():
      token_embeddings = self.model(input_ids=text_token)[0]
    outputs = []
    for embeddings, annotations in zip(token_embeddings, span_annotation):
      if max_length is not None:  # remove annotations which go beyond the max-length of the model
        annotations = [
            (start, min(end, max_length - 1))
            for (start, end) in annotations
            if start < (max_length - 1)
        ]
      pooled_embeddings = []
      for (start, end) in annotations:
        if (end - start) < 1:
          continue
        span_mean = embeddings[start:end].sum(dim=0) / (end - start)
        unit = F.normalize(span_mean.unsqueeze(-1), p=2, dim=0).squeeze(-1)
        pooled_embeddings.append(unit.to(dtype=torch.float32).detach().cpu().numpy())
      outputs.append(pooled_embeddings)
    return outputs

# Upload Pipeline

This is the cluster information needed to access the cloud vector database

In [None]:
# Connection settings for the cloud vector database.
# The API token is a secret and must not be stored in the notebook: it is read from the
# MILVUS_TOKEN environment variable, or asked for interactively when that is not set.
# The token that was previously committed here should be revoked and replaced.
from getpass import getpass

CLUSTER_ENDPOINT = os.environ.get(
    "MILVUS_CLUSTER_ENDPOINT",
    "https://in05-490089cedbf3c2b.serverless.gcp-us-west1.cloud.zilliz.com",
)
TOKEN = os.environ.get("MILVUS_TOKEN") or getpass("Milvus / Zilliz Cloud API token: ")
# Passed to UploadPipeline as `max_text_length` further below.
MAX_TEXT = 10000

Pipeline to upload the documents to the cloud database cluster

In [None]:
class UploadPipeline():
  """Segment, embed and upload a corpus to a collection of the vector DB.

  The pipeline wires together a segmenter (text -> chunks), a tokenizer
  (chunks -> tokens + span annotations, used only for late chunking), an
  embedder and a ``DenseMilvusManager`` that performs the actual inserts.
  """

  def __init__(self, segmenter, tokenizer, embedder, cluster_endpoint, token, max_text_length):
    """Store the components and open the Milvus manager.

    Args:
      segmenter: object exposing ``segment(text)``.
      tokenizer: object exposing ``tokenize(chunks)``; only used with late chunking.
      embedder: object exposing ``encode`` / ``encodeLateChunking`` / ``get_model``.
      cluster_endpoint: endpoint forwarded to ``DenseMilvusManager``.
      token: access token forwarded to ``DenseMilvusManager``.
      max_text_length: forwarded as ``max_length`` to ``encodeLateChunking``.
    """
    self.max_text_length  = max_text_length
    self.Segmenter        = segmenter
    self.Tokenizer        = tokenizer
    self.Embedder         = embedder
    self.milvusManager    = DenseMilvusManager(cluster_endpoint, token)

  def drop_collection(self, collection_name):
    """Drop ``collection_name`` through the Milvus manager."""
    self.milvusManager.drop_collection(collection_name)

  def get_embedder(self):
    """Return the model wrapped by the embedder."""
    return self.Embedder.get_model()

  def create_collection(self, collection_name, vec_dim, max_milvus_text_length = 10000):
    """Create ``collection_name`` with the given vector dimension and max text length."""
    self.milvusManager.create_collection(collection_name, vec_dim, max_milvus_text_length)

  def _embed_document(self, text, lateChunking):
    """Segment one document and return ``(chunks, per-chunk embeddings)``."""
    chunks = self.Segmenter.segment(text)
    if lateChunking:
      chunks_tokens, annotations = self.Tokenizer.tokenize(chunks)
      embedding = self.Embedder.encodeLateChunking(chunks_tokens.to("cuda"),
                                                   annotations,
                                                   'retrieval.passage',
                                                   max_length=self.max_text_length)
    else:
      embedding = self.Embedder.encode(chunks, 'retrieval.passage')
    # Both encoders are indexed with [0] to obtain the embeddings of this document.
    # NOTE(review): assumes one embedding per chunk, in chunk order - confirm for
    # late chunking when spans are cut by max_length.
    return chunks, embedding[0]

  def embed_and_insert(self, corpus, collection, lateChunking=True, batch_size=32):
    """Embed every document of ``corpus`` and insert its chunks into ``collection``.

    Documents are processed in batches of ``batch_size``; each batch triggers one
    ``add_elements`` call. Chunk ids are ``"<doc_id>_<chunk_index>"``.

    Every document is processed exactly once. The previous two-loop version
    re-inserted the last document of the final full batch, and failed on an
    unbound index whenever the corpus had at most ``batch_size`` documents.

    Args:
      corpus: mapping ``doc_id -> {"text": ...}``.
      collection: name of the target collection.
      lateChunking: use ``encodeLateChunking`` when True, ``encode`` otherwise.
      batch_size: number of documents per insert call.
    """
    ids   = list(corpus.keys())
    texts = [value['text'] for value in corpus.values()]
    if not texts:
      return  # nothing to upload
    pbar = tqdm(total=len(texts), desc= f"{collection} Embedding and inserting", leave=True)
    try:
      for start in range(0, len(texts), batch_size):
        chunk_ids, chunk_texts, chunk_embeddings = [], [], []
        for j in range(start, min(start + batch_size, len(texts))):
          chunks, embeddings = self._embed_document(texts[j], lateChunking)
          chunk_ids.extend(f"{ids[j]}_{v}" for v in range(len(chunks)))
          chunk_texts.extend(chunks)
          chunk_embeddings.extend(embeddings)
          pbar.update(1)
          time.sleep(0.1)  # short pause between documents, kept from the original batch loop
        self.milvusManager.add_elements(collection, chunk_ids, chunk_embeddings, chunk_texts)
    finally:
      pbar.close()

MS MARCO collections on the vector DB  
The commented-out ones are not available on the cloud  
To generate a commented-out collection, just remove the comment and run the pipeline (ONLY ON THE INDEXES OF THE ONES NOT ALREADY PRESENT)

In [None]:
# MS MARCO collections used in this notebook.
# Entries left as comments have not been uploaded to the cloud cluster.
MsCollections = [
    # --- Stella ---
    "Stella_earlyChunk_fixSeg",
    "Stella_lateChunk_fixSeg",
    # --- Jina v3 ---
    # "Jinav3_earlyChunk_fixSeg",
    "Jinav3_lateChunk_fixSeg",
    # "Jinav3_earlyChunk_semSeg",
    # "Jinav3_lateChunk_dynSegSt",
    # "Jinav3_lateChunk_dynSegTop",
    # "Jinav3_lateChunk_dynSegSum",
    # --- Jina v2 ---
    # "Jinav2_earlyChunk_fixSeg",
    # "Jinav2_lateChunk_fixSeg",
    # "Jinav2_earlyChunk_semSeg",
    # "Jinav2_lateChunk_dynSegSt",
    # "Jinav2_lateChunk_dynSegTop",
    # "Jinav2_lateChunk_dynSegSum",
    # --- BGE ---
    # "BGE_earlyChunk_fixSeg",
    # "BGE_lateChunk_fixSeg",
    # "BGE_earlyChunk_semSeg",
    # "BGE_lateChunk_dynSegSt",
    # "BGE_lateChunk_dynSegTop",
    # "BGE_lateChunk_dynSegSum",
]

NfCorpus collections on the vector DB  
The commented-out ones are not available on the cloud  
To generate a commented-out collection, just remove the comment and run the pipeline (ONLY ON THE INDEXES OF THE ONES NOT ALREADY PRESENT)

In [None]:
# NfCorpus collections used in this notebook.
# Entries left as comments have not been uploaded to the cloud cluster.
NfCollections = [
    # --- Stella ---
    "Stella_earlyChunk_fixSeg",
    "Stella_lateChunk_fixSeg",
    # --- Jina v3 ---
    "Jinav3_earlyChunk_fixSeg",
    "Jinav3_lateChunk_fixSeg",
    "Jinav3_earlyChunk_semSeg",
    "Jinav3_lateChunk_dynSegSt",
    "Jinav3_lateChunk_dynSegTop",
    # "Jinav3_lateChunk_dynSegSum",
    # --- Jina v2 ---
    "Jinav2_earlyChunk_fixSeg",
    "Jinav2_lateChunk_fixSeg",
    "Jinav2_earlyChunk_semSeg",
    "Jinav2_lateChunk_dynSegSt",
    "Jinav2_lateChunk_dynSegTop",
    # "Jinav2_lateChunk_dynSegSum",
    # --- BGE ---
    "BGE_earlyChunk_fixSeg",
    "BGE_lateChunk_fixSeg",
    "BGE_earlyChunk_semSeg",
    "BGE_lateChunk_dynSegSt",
    "BGE_lateChunk_dynSegTop",
    # "BGE_lateChunk_dynSegSum",
]

The following dicts map each collection name to its associated segmenter, tokenizer, and embedder

In [None]:
# Collection name -> segmenter class (the class itself, not an instance).
segmenterDict = {
    # Stella
    "Stella_earlyChunk_fixSeg": FixedChunking,
    "Stella_lateChunk_fixSeg": FixedChunking,
    # Jina v3
    "Jinav3_earlyChunk_fixSeg": FixedChunking,
    "Jinav3_lateChunk_fixSeg": FixedChunking,
    "Jinav3_earlyChunk_semSeg": JinaSegmenter,
    "Jinav3_lateChunk_dynSegSt": SimpleQwenChunker,
    "Jinav3_lateChunk_dynSegTop": TopicQwenChunker,
    "Jinav3_lateChunk_dynSegSum": SummaryQwenChunker,
    # Jina v2
    "Jinav2_earlyChunk_fixSeg": FixedChunking,
    "Jinav2_lateChunk_fixSeg": FixedChunking,
    "Jinav2_earlyChunk_semSeg": JinaSegmenter,
    "Jinav2_lateChunk_dynSegSt": SimpleQwenChunker,
    "Jinav2_lateChunk_dynSegTop": TopicQwenChunker,
    "Jinav2_lateChunk_dynSegSum": SummaryQwenChunker,
    # BGE
    "BGE_earlyChunk_fixSeg": FixedChunking,
    "BGE_lateChunk_fixSeg": FixedChunking,
    "BGE_earlyChunk_semSeg": JinaSegmenter,
    "BGE_lateChunk_dynSegSt": SimpleQwenChunker,
    "BGE_lateChunk_dynSegTop": TopicQwenChunker,
    "BGE_lateChunk_dynSegSum": SummaryQwenChunker,
}

In [None]:
# Collection name -> tokenizer class (the class itself, not an instance).
tokenizerDict = {
    # Stella
    "Stella_earlyChunk_fixSeg": StellaTokenizerV5,
    "Stella_lateChunk_fixSeg": StellaTokenizerV5,
    # Jina v3
    "Jinav3_earlyChunk_fixSeg": JinaTokenizerV3,
    "Jinav3_lateChunk_fixSeg": JinaTokenizerV3,
    "Jinav3_earlyChunk_semSeg": JinaTokenizerV3,
    "Jinav3_lateChunk_dynSegSt": JinaTokenizerV3,
    "Jinav3_lateChunk_dynSegTop": JinaTokenizerV3,
    "Jinav3_lateChunk_dynSegSum": JinaTokenizerV3,
    # Jina v2
    "Jinav2_earlyChunk_fixSeg": JinaTokenizerV2,
    "Jinav2_lateChunk_fixSeg": JinaTokenizerV2,
    "Jinav2_earlyChunk_semSeg": JinaTokenizerV2,
    "Jinav2_lateChunk_dynSegSt": JinaTokenizerV2,
    "Jinav2_lateChunk_dynSegTop": JinaTokenizerV2,
    "Jinav2_lateChunk_dynSegSum": JinaTokenizerV2,
    # BGE
    "BGE_earlyChunk_fixSeg": BgeTokenizer,
    "BGE_lateChunk_fixSeg": BgeTokenizer,
    "BGE_earlyChunk_semSeg": BgeTokenizer,
    "BGE_lateChunk_dynSegSt": BgeTokenizer,
    "BGE_lateChunk_dynSegTop": BgeTokenizer,
    "BGE_lateChunk_dynSegSum": BgeTokenizer,
}

In [None]:
# Collection name -> embedder class (the class itself; call it to build an embedder).
embedderDict = {
    # Stella
    "Stella_earlyChunk_fixSeg": StellaEmbedderV5,
    "Stella_lateChunk_fixSeg": StellaEmbedderV5,
    # Jina v3
    "Jinav3_earlyChunk_fixSeg": JinaEmbedderV3,
    "Jinav3_lateChunk_fixSeg": JinaEmbedderV3,
    "Jinav3_earlyChunk_semSeg": JinaEmbedderV3,
    "Jinav3_lateChunk_dynSegSt": JinaEmbedderV3,
    "Jinav3_lateChunk_dynSegTop": JinaEmbedderV3,
    "Jinav3_lateChunk_dynSegSum": JinaEmbedderV3,
    # Jina v2
    "Jinav2_earlyChunk_fixSeg": JinaEmbedderV2,
    "Jinav2_lateChunk_fixSeg": JinaEmbedderV2,
    "Jinav2_earlyChunk_semSeg": JinaEmbedderV2,
    "Jinav2_lateChunk_dynSegSt": JinaEmbedderV2,
    "Jinav2_lateChunk_dynSegTop": JinaEmbedderV2,
    "Jinav2_lateChunk_dynSegSum": JinaEmbedderV2,
    # BGE
    "BGE_earlyChunk_fixSeg": BgeEmbedder,
    "BGE_lateChunk_fixSeg": BgeEmbedder,
    "BGE_earlyChunk_semSeg": BgeEmbedder,
    "BGE_lateChunk_dynSegSt": BgeEmbedder,
    "BGE_lateChunk_dynSegTop": BgeEmbedder,
    "BGE_lateChunk_dynSegSum": BgeEmbedder,
}

In [None]:
# Collection name -> chunking mode flag: True for the "lateChunk" collections,
# False for the "earlyChunk" ones.
earlyLateDict = {
    # Stella
    "Stella_earlyChunk_fixSeg": False,
    "Stella_lateChunk_fixSeg": True,
    # Jina v3
    "Jinav3_earlyChunk_fixSeg": False,
    "Jinav3_lateChunk_fixSeg": True,
    "Jinav3_earlyChunk_semSeg": False,
    "Jinav3_lateChunk_dynSegSt": True,
    "Jinav3_lateChunk_dynSegTop": True,
    "Jinav3_lateChunk_dynSegSum": True,
    # Jina v2
    "Jinav2_earlyChunk_fixSeg": False,
    "Jinav2_lateChunk_fixSeg": True,
    "Jinav2_earlyChunk_semSeg": False,
    "Jinav2_lateChunk_dynSegSt": True,
    "Jinav2_lateChunk_dynSegTop": True,
    "Jinav2_lateChunk_dynSegSum": True,
    # BGE
    "BGE_earlyChunk_fixSeg": False,
    "BGE_lateChunk_fixSeg": True,
    "BGE_earlyChunk_semSeg": False,
    "BGE_lateChunk_dynSegSt": True,
    "BGE_lateChunk_dynSegTop": True,
    "BGE_lateChunk_dynSegSum": True,
}

In [None]:
# Collection name -> vector dimension of the stored embeddings.
vecDimDict = {
    # Stella
    "Stella_earlyChunk_fixSeg": 1024,
    "Stella_lateChunk_fixSeg": 1024,
    # Jina v3
    "Jinav3_earlyChunk_fixSeg": 1024,
    "Jinav3_lateChunk_fixSeg": 1024,
    "Jinav3_earlyChunk_semSeg": 1024,
    "Jinav3_lateChunk_dynSegSt": 1024,
    "Jinav3_lateChunk_dynSegTop": 1024,
    "Jinav3_lateChunk_dynSegSum": 1024,
    # Jina v2
    "Jinav2_earlyChunk_fixSeg": 1024,
    "Jinav2_lateChunk_fixSeg": 1024,
    "Jinav2_earlyChunk_semSeg": 1024,
    "Jinav2_lateChunk_dynSegSt": 1024,
    "Jinav2_lateChunk_dynSegTop": 1024,
    "Jinav2_lateChunk_dynSegSum": 1024,
    # BGE
    "BGE_earlyChunk_fixSeg": 1024,
    "BGE_lateChunk_fixSeg": 1024,
    "BGE_earlyChunk_semSeg": 1024,
    "BGE_lateChunk_dynSegSt": 1024,
    "BGE_lateChunk_dynSegTop": 1024,
    "BGE_lateChunk_dynSegSum": 1024,
}

The following code uploads the documents to the cloud.  
It is commented out to avoid re-uploading the embeddings by mistake.

In [None]:
# for col in NfCollections:
#   segmenter     = segmenterDict[col](size=512)
#   tokenizer     = tokenizerDict[col]()
#   embedder      = embedderDict[col]()
#   lateChunking  = earlyLateDict[col]
#   VEC_DIM       = vecDimDict[col]

#   uploadPipeline = UploadPipeline(segmenter, tokenizer, embedder, CLUSTER_ENDPOINT, TOKEN, MAX_TEXT)
#   uploadPipeline.create_collection(col, VEC_DIM, MAX_TEXT)
#   uploadPipeline.embed_and_insert(corpus,col,lateChunking=lateChunking)

In [None]:
# for col in MsCollections:
#   segmenter     = segmenterDict[col](size=512)
#   tokenizer     = tokenizerDict[col]()
#   embedder      = embedderDict[col]()
#   lateChunking  = earlyLateDict[col]
#   VEC_DIM       = vecDimDict[col]

#   uploadPipeline = UploadPipeline(segmenter, tokenizer, embedder, CLUSTER_ENDPOINT, TOKEN, MAX_TEXT)
#   uploadPipeline.create_collection("Ms_"+col, VEC_DIM, MAX_TEXT)
#   uploadPipeline.embed_and_insert(msMarcoCorpus,"Ms_"+col,lateChunking=lateChunking)

To drop the collection

In [None]:
# uploadPipeline.drop_collection(Collections[12])                     #!!!!!!!!!!!! UNCOMMENT ONLY TO DROP THE COLLECTION

# Query Pipeline

The following code can be used to query the vector DB + LLM.  
In particular, for the LLM we used Phi-3.5, loaded with 4-bit quantization using Unsloth for fast inference

In [None]:
class QueryPipeline():
  """Retrieve chunks from the vector DB and answer a question with the LLM."""

  def __init__(self,
               llm_model,
               llm_tokenizer,
               retrieverEmbedder,
               phi3_template,
               cluster_endpoint,
               token):
    """Store the LLM, its tokenizer and the query embedder, and open the Milvus manager.

    Args:
      llm_model: model exposing ``generate``.
      llm_tokenizer: tokenizer exposing ``apply_chat_template`` / ``batch_decode``.
      retrieverEmbedder: object exposing ``encode(text, task)``.
      phi3_template: chat template string (stored; not read by the methods of this class).
      cluster_endpoint: endpoint forwarded to ``DenseMilvusManager``.
      token: access token forwarded to ``DenseMilvusManager``.
    """
    self.llm_model          = llm_model
    self.llm_tokenizer      = llm_tokenizer
    self.retrieverEmbedder  = retrieverEmbedder
    self.milvusManager      = DenseMilvusManager(cluster_endpoint, token)
    self.phi3_template      = phi3_template

  def retriever(self, query, top_k, collection):
    """Embed ``query`` and return ``(chunk_texts, chunk_ids, scores)`` of the top_k hits."""
    query_vector = self.retrieverEmbedder.encode(query, "retrieval.query")
    docs = self.milvusManager.query(query_vector[0], top_k, collection)
    hits = docs[0]  # result list of the single query vector
    retrieved_chunks = [hit["entity"]["text"] for hit in hits]
    ids = [hit["id"] for hit in hits]
    score = [hit["distance"] for hit in hits]

    return retrieved_chunks, ids, score

  def extract_llm_responses(self, conversation: str):
    """Return the first assistant reply contained in a decoded conversation.

    The reply is the text after ``<|assistant|>`` up to the next ``<|end|>``.
    If generation stopped before ``<|end|>`` was produced, the text up to the
    end of the string is returned; if there is no assistant marker at all,
    an empty string is returned instead of raising ``IndexError``.
    """
    match = re.search(r"<\|assistant\|>(.*?)(?:<\|end\|>|\Z)", conversation, re.DOTALL)
    return match.group(1) if match else ""

  def query(self, query, top_k, collection):
    """Answer ``query`` using the top_k chunks retrieved from ``collection``.

    Returns:
      ``(answer, doc_chunks)``: the extracted assistant reply and the chunks
      that were placed in the prompt.
    """
    doc_chunks,_,_ = self.retriever(query, top_k, collection)
    docs_content = "[<<<"+">>>,<<<".join(doc_chunks)+">>>]"
    query = f"{query}. Give me a short answer using only the following informations: {docs_content}. If the answer is not present just say <<No Answer>>"
    messages = [{"from": "human", "value": query}]
    inputs = self.llm_tokenizer.apply_chat_template(
                        messages,
                        tokenize = True,
                        # Appends the assistant header so the model starts a reply
                        # instead of continuing the user turn.
                        add_generation_prompt = True,
                        return_tensors = "pt" ).to("cuda")

    generation_config = GenerationConfig(
             eos_token_id=self.llm_tokenizer.convert_tokens_to_ids("<|end|>")
            )
    outputs = self.llm_model.generate(input_ids = inputs,
                                 max_new_tokens = 1000,
                                 use_cache = True,
                                 generation_config=generation_config)
    # Decoded text keeps the special tokens, which extract_llm_responses relies on.
    response_raw = self.llm_tokenizer.batch_decode(outputs)
    return self.extract_llm_responses(response_raw[0]), doc_chunks

In [None]:
# Sequence length handed to the model loader
# (original note: no limitations, kaiokendev's incorporated).
max_s_length = 8192

# Reference list of pre-quantized 4-bit models (faster download, no OOMs).
# More models at https://huggingface.co/unsloth
fourbit_models = [
    # Llama 3.1 (15 trillion tokens model, 2x faster)
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",
    # Mistral Nemo 12b (2x faster)
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit",
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    # Mistral v3 (2x faster)
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    # Phi (2x faster)
    "unsloth/Phi-3.5-mini-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    # Gemma 2 (2x faster)
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",
]

In [None]:
# Jinja chat template: every message is rendered as "<|role|>\n<content><|end|>\n";
# when add_generation_prompt is set, an opening "<|assistant|>\n" is appended.
phi3_template = "".join([
    "{% for message in messages %}",
        "{% if message['role'] == 'user' %}",
            "{{'<|user|>\n' + message['content'] + '<|end|>\n'}}",
        "{% elif message['role'] == 'assistant' %}",
            "{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}",
        "{% else %}",
            "{{'<|' + message['role'] + '|>\n' + message['content'] + '<|end|>\n'}}",
        "{% endif %}",
    "{% endfor %}",
    "{% if add_generation_prompt %}",
        "{{ '<|assistant|>\n' }}",
    "{% endif %}",
])

In [None]:
def get_model_tokenizer(model, max_s_length, load_in_4bit = True):
  """Load a model with Unsloth and attach the custom chat template to its tokenizer.

  Args:
    model: model name passed to ``FastLanguageModel.from_pretrained``.
    max_s_length: maximum sequence length passed to the loader.
    load_in_4bit: use 4-bit quantization to reduce memory usage (can be False).

  Returns:
    ``(model, tokenizer)`` with the model switched to inference mode.
  """
  loader_options = dict(
      model_name = model,
      max_seq_length = max_s_length,
      dtype = None,  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
      load_in_4bit = load_in_4bit,
      # token = "hf_...",  # needed only for gated models like meta-llama/Llama-2-7b-hf
  )
  phi_model, phi_tokenizer = FastLanguageModel.from_pretrained(**loader_options)

  # ShareGPT-style field names used by the messages this notebook builds.
  sharegpt_mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}
  phi_tokenizer = get_chat_template(
      phi_tokenizer,
      chat_template = (phi3_template, "<|end|>"),  # custom template plus its end-of-turn token
      mapping = sharegpt_mapping,
  )

  FastLanguageModel.for_inference(phi_model)  # enable Unsloth's faster inference mode
  return phi_model, phi_tokenizer

In [None]:
# Load the Phi-3.5 mini instruct model and its tokenizer (4-bit by default).
phi_model, phi_tokenizer = get_model_tokenizer(
    model="unsloth/Phi-3.5-mini-instruct",
    max_s_length=max_s_length,
)

==((====))==  Unsloth 2024.12.8: Fast Llama patching. Transformers: 4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


## To query

The following code can be used to query

In [None]:
# queryPipeline = QueryPipeline(phi_model,
#                                   phi_tokenizer,
#                                   embedderDict[Collections[10]](),
#                                   phi3_template,
#                                   CLUSTER_ENDPOINT,
#                                   TOKEN)

In [None]:
# answer, docs = queryPipeline.query("Cos'è la glicemia, rispondimi in italiano?", 5, Collections[10])

In [None]:
# print(answer)

In [None]:
# docs

# Retriever Evaluator

The following two methods are employed to retrieve the documents, which are then evaluated against the query relevance judgments (qrels) using specific performance metrics.

In [None]:
def evaluate(model_name, qrels, results, metrics_k_values):
    """Print and return retrieval metrics for one model.

    Calls ``EvaluateRetrieval.evaluate(qrels, results, metrics_k_values)`` and,
    for every ``k``, reports NDCG, MAP, Recall, Precision and the F1 score
    derived from precision and recall.

    Args:
        model_name: label printed with the results.
        qrels: relevance judgments passed to the evaluator.
        results: retrieval run passed to the evaluator.
        metrics_k_values: list of cut-off values ``k``.

    Returns:
        Flat dict with keys ``NDCG@k``, ``MAP@k``, ``Recall@k``,
        ``Precision@k`` and ``F1@k`` for each ``k``.
    """
    def _f1(p, r):
        # F1 is defined as 0 when both precision and recall are 0
        # (the plain formula would divide by zero).
        denominator = p + r
        return 2 * p * r / denominator if denominator else 0.0

    print("Evaluating retriever on questions against qrels")
    ndcg, map_, recall, precision = EvaluateRetrieval.evaluate(
        qrels, results, metrics_k_values
    )
    print("Results for:", model_name)
    file_output_data = {}
    for k in metrics_k_values:
        p_at_k = precision[f"P@{k}"]
        r_at_k = recall[f"Recall@{k}"]
        row = {
            f"NDCG@{k}": ndcg[f"NDCG@{k}"],
            f"MAP@{k}": map_[f"MAP@{k}"],
            f"Recall@{k}": r_at_k,
            f"Precision@{k}": p_at_k,
            f"F1@{k}": _f1(p_at_k, r_at_k),
        }
        print(row)
        file_output_data.update(row)
    print("-------------------------------------")
    return file_output_data

In [None]:
def evalRetrival(corpus, queries, qrels, top_k, collection, queryPipeline):
  """Run every query through the retriever and build a document-level run.

  Retrieved hits are chunks whose ids have the form ``"<doc_id>_<chunk_index>"``.
  Chunk ids are mapped back to their document id, and each document keeps the
  highest score among its retrieved chunks.

  Args:
    corpus: unused; kept for interface compatibility.
    queries: mapping ``query_id -> query text``.
    qrels: unused; kept for interface compatibility.
    top_k: number of chunks to retrieve per query.
    collection: name of the collection to search.
    queryPipeline: object exposing ``retriever(query, top_k, collection)``.

  Returns:
    ``{query_id: {doc_id: best_score}}``.
  """
  pbar = tqdm(list(queries.items()), desc="evalRetrival", leave=True)
  run = {}
  for query_id, query_text in pbar:
    _, chunk_ids, scores = queryPipeline.retriever(query_text, top_k, collection)
    best_scores = {}
    for chunk_id, score in zip(chunk_ids, scores):
      # Strip only the trailing "_<chunk_index>" so that document ids which
      # themselves contain underscores are not truncated.
      doc_id = chunk_id.rsplit("_", 1)[0]
      best_scores[doc_id] = max(score, best_scores.get(doc_id, score))
    run[query_id] = best_scores
  return run

In [None]:
# Retrieval evaluation on the NfCorpus collections: one metrics dict per
# collection, appended in NfCollections order.
metrics = []
for col in NfCollections:
  queryPipeline = QueryPipeline(
      llm_model=phi_model,
      llm_tokenizer=phi_tokenizer,
      retrieverEmbedder=embedderDict[col](),
      phi3_template=phi3_template,
      cluster_endpoint=CLUSTER_ENDPOINT,
      token=TOKEN,
  )
  print(col)
  # Retrieve 40 chunks per query, then score at the listed cut-offs.
  pred = evalRetrival(corpus, queries, qrels, top_k=40, collection=col, queryPipeline=queryPipeline)
  metrics.append(evaluate(col, qrels, pred, metrics_k_values=[3, 5, 7, 10]))

In [None]:
# Retrieval evaluation on the MS MARCO collections (stored under the "Ms_" prefix):
# one metrics dict per collection, appended in MsCollections order.
Msmetrics = []
for col in MsCollections:
  queryPipeline = QueryPipeline(
      llm_model=phi_model,
      llm_tokenizer=phi_tokenizer,
      retrieverEmbedder=embedderDict[col](),
      phi3_template=phi3_template,
      cluster_endpoint=CLUSTER_ENDPOINT,
      token=TOKEN,
  )
  print(col)
  # Retrieve 40 chunks per query, then score at the listed cut-offs.
  pred = evalRetrival(msMarcoCorpus, msMarcoQueries, msMarcoQrels,
                      top_k=40, collection="Ms_"+col, queryPipeline=queryPipeline)
  Msmetrics.append(evaluate("Ms_"+col, msMarcoQrels, pred, metrics_k_values=[3, 5, 7, 10]))

In [None]:
# One row per NfCorpus collection, one column per metric@k.
pd.DataFrame(data=metrics, index=NfCollections)

Unnamed: 0,NDCG@3,MAP@3,Recall@3,Precision@3,F1@3,NDCG@5,MAP@5,Recall@5,Precision@5,F1@5,NDCG@7,MAP@7,Recall@7,Precision@7,F1@7,NDCG@10,MAP@10,Recall@10,Precision@10,F1@10
Stella_earlyChunk_fixSeg,0.47085,0.11889,0.1337,0.44376,0.205489,0.44353,0.13715,0.16061,0.38328,0.226364,0.42976,0.1499,0.18304,0.34852,0.240022,0.41424,0.16175,0.20631,0.31053,0.247912
Stella_lateChunk_fixSeg,0.46549,0.11327,0.12587,0.43653,0.195398,0.44502,0.1339,0.15898,0.387,0.225376,0.42794,0.14634,0.17866,0.34808,0.236124,0.41064,0.15822,0.20024,0.30743,0.242519
Jinav3_earlyChunk_fixSeg,0.38637,0.09077,0.10303,0.37049,0.161225,0.37423,0.10739,0.12995,0.33127,0.186672,0.35976,0.1168,0.14678,0.29943,0.196994,0.34667,0.12703,0.16666,0.26563,0.204816
Jinav3_lateChunk_fixSeg,0.39594,0.08483,0.09548,0.37977,0.152595,0.38003,0.1037,0.12804,0.33932,0.185923,0.36942,0.11422,0.14815,0.31092,0.200679,0.35429,0.12556,0.16971,0.27585,0.210138
Jinav3_earlyChunk_semSeg,0.39551,0.09619,0.11249,0.37461,0.173024,0.37752,0.11094,0.13574,0.32941,0.192257,0.36674,0.12069,0.15225,0.30164,0.20236,0.3537,0.13088,0.17293,0.26811,0.21025
Jinav3_lateChunk_dynSegSt,0.39971,0.08785,0.09648,0.3808,0.153954,0.38473,0.10592,0.12758,0.34118,0.185714,0.37288,0.11721,0.15003,0.31137,0.202492,0.35662,0.12669,0.16566,0.27399,0.206479
Jinav3_lateChunk_dynSegTop,0.39632,0.08385,0.09164,0.37564,0.147336,0.3833,0.10208,0.12112,0.34303,0.179027,0.36794,0.11302,0.14427,0.30739,0.196374,0.35161,0.12294,0.16264,0.27214,0.203601
Jinav2_earlyChunk_fixSeg,0.28047,0.05473,0.06622,0.26729,0.106143,0.26153,0.06434,0.08597,0.22848,0.124932,0.25102,0.07077,0.10131,0.20522,0.135653,0.23719,0.0755,0.11229,0.17709,0.137435
Jinav2_lateChunk_fixSeg,0.3077,0.05953,0.06753,0.29205,0.109695,0.28016,0.06869,0.08412,0.24334,0.125021,0.26557,0.07441,0.09731,0.21451,0.133885,0.2554,0.08164,0.11878,0.19102,0.146477
Jinav2_earlyChunk_semSeg,0.30384,0.06702,0.07679,0.28586,0.12106,0.29409,0.07976,0.10049,0.26006,0.144964,0.27938,0.08592,0.1153,0.22733,0.153,0.26906,0.09254,0.13061,0.20217,0.158696


- The Stella_earlyChunk_fixSeg RAG model stands out as the top-performing approach, demonstrating superior results across key evaluation metrics.
- While late chunking introduces a novel methodology, it does not consistently surpass early chunking in performance. Notably, for certain models like BGE in our case, early chunking remains more effective, highlighting the importance of model-specific considerations when choosing a chunking strategy.



The top three models will now be further evaluated using the MS MARCO dataset to assess their performance on a widely recognized benchmark

In [None]:
# One row per MS MARCO collection, one column per metric@k.
pd.DataFrame(data=Msmetrics, index=MsCollections)

Unnamed: 0,NDCG@3,MAP@3,Recall@3,Precision@3,F1@3,NDCG@5,MAP@5,Recall@5,Precision@5,F1@5,NDCG@7,MAP@7,Recall@7,Precision@7,F1@7,NDCG@10,MAP@10,Recall@10,Precision@10,F1@10
Stella_earlyChunk_fixSeg,0.62723,0.4995,0.987,0.01267,0.025019,0.63052,0.5015,0.9945,0.0096,0.019016,0.63174,0.5021,0.998,0.00743,0.01475,0.63203,0.5022,0.999,0.0053,0.010544
Stella_lateChunk_fixSeg,0.4992,0.33842,0.98,0.01233,0.024354,0.50308,0.34082,0.989,0.0094,0.018623,0.50533,0.34186,0.9955,0.00771,0.015301,0.50563,0.34198,0.9965,0.0055,0.01094
Jinav3_lateChunk_fixSeg,0.62511,0.49758,0.984,0.01167,0.023066,0.62811,0.49943,0.991,0.0086,0.017052,0.63011,0.50034,0.9965,0.00714,0.014178,0.63041,0.50045,0.9975,0.0051,0.010148


On the MS MARCO dataset, we observe that the performance of Stella_lateChunk_fixSeg decreases compared to its early chunking version, further demonstrating that late chunking is not always the optimal choice across different scenarios.

# Generator evaluator

Here we evaluate the generation part of the RAG pipeline on the three best models previously tested

In [None]:
def evaluate_generated_answer(original_answer, generated_answer):
    """Score a generated answer against a reference answer with lexical metrics.

    Args:
        original_answer (str): Reference (ground-truth) answer text.
        generated_answer (str): Answer text produced by the generator.

    Returns:
        dict: One score per metric, keyed by 'BLEU', 'ROUGE-1', 'ROUGE-2',
        'ROUGE-L', 'METEOR' and 'COSINE'. The ROUGE entries are the F-scores.
        Every score is 0.0 when either text contains no whitespace-separated
        token, because no overlap can exist in that case.
    """
    metric_names = ('BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'METEOR', 'COSINE')
    reference_tokens = original_answer.split()
    candidate_tokens = generated_answer.split()

    # An empty generation (or reference) would make Rouge raise ValueError and
    # abort the whole evaluation loop; score it as zero overlap instead.
    if not reference_tokens or not candidate_tokens:
        return {name: 0.0 for name in metric_names}

    scores = {}
    # BLEU Score (smoothed, so short answers without higher-order n-gram matches are not forced to 0)
    smooth = SmoothingFunction().method1
    scores['BLEU'] = sentence_bleu(
        [reference_tokens], candidate_tokens, smoothing_function=smooth
    )
    # ROUGE Scores
    try:
        rouge_scores = Rouge().get_scores(generated_answer, original_answer)[0]
        scores['ROUGE-1'] = rouge_scores['rouge-1']['f']
        scores['ROUGE-2'] = rouge_scores['rouge-2']['f']
        scores['ROUGE-L'] = rouge_scores['rouge-l']['f']
    except ValueError:
        # Rouge rejects texts without sentence content (e.g. only periods);
        # fall back to 0 like the cosine-similarity branch below.
        scores['ROUGE-1'] = 0.0
        scores['ROUGE-2'] = 0.0
        scores['ROUGE-L'] = 0.0
    # METEOR Score
    scores['METEOR'] = meteor_score([reference_tokens], candidate_tokens)
    # Cosine Similarity using TF-IDF
    try:
        # Keep every word as a feature: no stop-word removal, single-character tokens allowed.
        vectorizer = TfidfVectorizer(stop_words=None, token_pattern=r'(?u)\b\w+\b')
        tfidf_matrix = vectorizer.fit_transform([original_answer, generated_answer])
        cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
        scores['COSINE'] = cosine_sim[0][0]
    except ValueError:
        # If vocabulary is empty, set cosine similarity to 0
        scores['COSINE'] = 0.0
    return scores

In [None]:
# Instantiate the embedder registered in embedderDict under the first MS MARCO collection name.
embedder = embedderDict[MsCollections[0]]()

In [None]:
# Build the RAG query pipeline from the Phi model/tokenizer, the current embedder,
# the prompt template and the vector-store connection settings.
queryPipeline = QueryPipeline(phi_model,
                              phi_tokenizer,
                              embedder,
                              phi3_template,
                              CLUSTER_ENDPOINT,
                              TOKEN)
# Running sum of each generation metric; the next cell divides by the query count.
scores_tot = {
              'BLEU'    : 0,
              'ROUGE-1' : 0,
              'ROUGE-2' : 0,
              'ROUGE-L' : 0,
              'METEOR'  : 0,
              'COSINE'  : 0
             }
# Materialise the reference answers once: rebuilding this list on every
# iteration made the loop quadratic in the number of queries.
# Queries and answers are paired by position — assumes both dicts share the same ordering.
msMarcoAnswerList = list(msMarcoAnswers.values())
# The context manager closes the progress bar even if a query fails midway.
with tqdm(total=len(msMarcoQueries), desc="evalGeneration", leave=True) as pbar:
  for i,quer in enumerate(msMarcoQueries.values()):
    # 3 is presumably the number of retrieved chunks handed to the generator — TODO confirm.
    generated_answer = queryPipeline.query(quer,3,"Ms_"+MsCollections[0])
    origAnswer = msMarcoAnswerList[i]
    # Queries without a reference answer are scored against a fixed placeholder.
    if origAnswer == []:
      origAnswer = "No Answer"
    else:
      origAnswer = origAnswer[0]
    scor = evaluate_generated_answer(origAnswer, generated_answer[0])
    for metric in scores_tot:
      scores_tot[metric] += scor[metric]
    pbar.update(1)

DEBUG:pymilvus.milvus_client.milvus_client:Created new connection using: 16f3580c3bb44a01b66893f1b1d26570
evalGeneration:   1%|          | 6/1000 [00:56<2:35:53,  9.41s/it]
evalGeneration: 100%|██████████| 1000/1000 [1:00:10<00:00,  2.50s/it]

In [None]:
# Average each accumulated metric over the number of evaluated queries.
pd.DataFrame(scores_tot, index=[0]).div(len(msMarcoQueries))

Unnamed: 0,BLEU,ROUGE-1,ROUGE-2,ROUGE-L,METEOR,COSINE
0,0.051764,0.222865,0.100998,0.205905,0.226742,0.262254


In [None]:
# Build the RAG query pipeline from the Phi model/tokenizer, the current embedder,
# the prompt template and the vector-store connection settings.
# NOTE(review): `embedder` is not re-created before this cell, so the instance built
# for MsCollections[0] is reused while querying MsCollections[1] — confirm this is intended.
queryPipeline = QueryPipeline(phi_model,
                              phi_tokenizer,
                              embedder,
                              phi3_template,
                              CLUSTER_ENDPOINT,
                              TOKEN)
# Running sum of each generation metric; the next cell divides by the query count.
scores_tot = {
              'BLEU'    : 0,
              'ROUGE-1' : 0,
              'ROUGE-2' : 0,
              'ROUGE-L' : 0,
              'METEOR'  : 0,
              'COSINE'  : 0
             }
# Materialise the reference answers once: rebuilding this list on every
# iteration made the loop quadratic in the number of queries.
# Queries and answers are paired by position — assumes both dicts share the same ordering.
msMarcoAnswerList = list(msMarcoAnswers.values())
# The context manager closes the progress bar even if a query fails midway.
with tqdm(total=len(msMarcoQueries), desc="evalGeneration", leave=True) as pbar:
  for i,quer in enumerate(msMarcoQueries.values()):
    # 3 is presumably the number of retrieved chunks handed to the generator — TODO confirm.
    generated_answer = queryPipeline.query(quer,3,"Ms_"+MsCollections[1])
    origAnswer = msMarcoAnswerList[i]
    # Queries without a reference answer are scored against a fixed placeholder.
    if origAnswer == []:
      origAnswer = "No Answer"
    else:
      origAnswer = origAnswer[0]
    scor = evaluate_generated_answer(origAnswer, generated_answer[0])
    for metric in scores_tot:
      scores_tot[metric] += scor[metric]
    pbar.update(1)

In [None]:
# Average each accumulated metric over the number of evaluated queries.
pd.DataFrame(scores_tot, index=[0]).div(len(msMarcoQueries))

Unnamed: 0,BLEU,ROUGE-1,ROUGE-2,ROUGE-L,METEOR,COSINE
0,0.043017,0.206348,0.087346,0.188846,0.207389,0.244524


In [None]:
# Instantiate the embedder registered in embedderDict under the third MS MARCO collection name.
embedder = embedderDict[MsCollections[2]]()

In [None]:
# Build the RAG query pipeline from the Phi model/tokenizer, the current embedder,
# the prompt template and the vector-store connection settings.
queryPipeline = QueryPipeline(phi_model,
                              phi_tokenizer,
                              embedder,
                              phi3_template,
                              CLUSTER_ENDPOINT,
                              TOKEN)
# Running sum of each generation metric; the next cell divides by the query count.
scores_tot = {
              'BLEU'    : 0,
              'ROUGE-1' : 0,
              'ROUGE-2' : 0,
              'ROUGE-L' : 0,
              'METEOR'  : 0,
              'COSINE'  : 0
             }
# Materialise the reference answers once: rebuilding this list on every
# iteration made the loop quadratic in the number of queries.
# Queries and answers are paired by position — assumes both dicts share the same ordering.
msMarcoAnswerList = list(msMarcoAnswers.values())
# The context manager closes the progress bar even if a query fails midway.
with tqdm(total=len(msMarcoQueries), desc="evalGeneration", leave=True) as pbar:
  for i,quer in enumerate(msMarcoQueries.values()):
    # 3 is presumably the number of retrieved chunks handed to the generator — TODO confirm.
    generated_answer = queryPipeline.query(quer,3,"Ms_"+MsCollections[2])
    origAnswer = msMarcoAnswerList[i]
    # Queries without a reference answer are scored against a fixed placeholder.
    if origAnswer == []:
      origAnswer = "No Answer"
    else:
      origAnswer = origAnswer[0]
    scor = evaluate_generated_answer(origAnswer, generated_answer[0])
    for metric in scores_tot:
      scores_tot[metric] += scor[metric]
    pbar.update(1)

In [None]:
# Average each accumulated metric over the number of evaluated queries.
pd.DataFrame(scores_tot, index=[0]).div(len(msMarcoQueries))

Unnamed: 0,BLEU,ROUGE-1,ROUGE-2,ROUGE-L,METEOR,COSINE
0,0.048387,0.205317,0.090989,0.189956,0.213013,0.242102


As a direct consequence of the retriever component's performance, the generation phase achieves the highest effectiveness in the first of the three models: Stella, utilizing early chunking combined with fixed segmenting. This approach ensures optimal alignment between the retrieval and generation processes, leading to superior results in generating coherent and contextually relevant outputs.