In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this runtime (google.colab is only importable inside Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work from the project folder on Drive; the relative paths used in later
# cells (saved datasets, "test_model", "results/", "./data") resolve against it.
%cd /content/drive/MyDrive/NLP/Project

/content/drive/MyDrive/NLP/Project


In [None]:
# Install the LlamaIndex integration packages imported below.
# NOTE(review): versions are not pinned; the run captured in this notebook
# resolved llama-index-llms-openai 0.1.16 and llama-index-core 0.10.30.
# Pin them if the results need to be reproducible.
%pip install llama-index-llms-openai
%pip install llama-index-embeddings-openai
%pip install llama-index-finetuning
%pip install llama-index-readers-file
%pip install llama-index-embeddings-huggingface

Collecting llama-index-llms-openai
  Downloading llama_index_llms_openai-0.1.16-py3-none-any.whl (10 kB)
Collecting llama-index-core<0.11.0,>=0.10.24 (from llama-index-llms-openai)
  Downloading llama_index_core-0.10.30-py3-none-any.whl (15.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.4/15.4 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json (from llama-index-core<0.11.0,>=0.10.24->llama-index-llms-openai)
  Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)
Collecting deprecated>=1.2.9.3 (from llama-index-core<0.11.0,>=0.10.24->llama-index-llms-openai)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting dirtyjson<2.0.0,>=1.0.8 (from llama-index-core<0.11.0,>=0.10.24->llama-index-llms-openai)
  Downloading dirtyjson-1.0.8-py3-none-any.whl (25 kB)
Collecting httpx (from llama-index-core<0.11.0,>=0.10.24->llama-index-llms-openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━

In [None]:
import os
import json
import pandas as pd
from tqdm.notebook import tqdm
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import MetadataMode
from llama_index.finetuning import generate_qa_embedding_pairs
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
from llama_index.llms.openai import OpenAI
from llama_index.finetuning import SentenceTransformersFinetuneEngine

# Finetuning the embedding model on the Lyft dataset


In [None]:
def load_corpus(files, verbose=False):
    """Read the given files and split them into chunked nodes.

    Args:
        files: Paths handed to ``SimpleDirectoryReader`` as ``input_files``.
        verbose: When true, print progress messages and show the splitter's
            progress bar.

    Returns:
        The nodes produced by ``SentenceSplitter`` for the loaded documents.
    """
    if verbose:
        print(f"Loading files {files}")

    # Read every input file into documents.
    documents = SimpleDirectoryReader(input_files=files).load_data()
    if verbose:
        print(f"Loaded {len(documents)} docs")

    # Chunk the documents using the splitter's default settings.
    splitter = SentenceSplitter()
    chunks = splitter.get_nodes_from_documents(documents, show_progress=verbose)
    if verbose:
        print(f"Parsed {len(chunks)} nodes")

    return chunks


In [None]:

# PDFs used to build the training (Lyft) and validation (Uber) corpora.
# NOTE(review): these paths point at /content, not at the Drive project
# directory selected with %cd — presumably the PDFs are uploaded to the
# runtime by hand; confirm, since /content does not persist across sessions.
TRAIN_FILES = ["/content/lyft_annual_report.pdf"]
VAL_FILES = ["/content/uber_annual_report.pdf"]

# Currently unused corpus cache locations, kept for reference.
# TRAIN_CORPUS_FPATH = "./data/train_corpus.json"
# VAL_CORPUS_FPATH = "./data/val_corpus.json"

In [None]:
# Parse each report into chunked nodes; verbose=True prints document and
# node counts for a quick sanity check.
train_nodes = load_corpus(TRAIN_FILES, verbose=True)
val_nodes = load_corpus(VAL_FILES, verbose=True)

Loading files ['/content/lyft_annual_report.pdf']
Loaded 136 docs


Parsing nodes:   0%|          | 0/136 [00:00<?, ?it/s]

Parsed 204 nodes
Loading files ['/content/uber_annual_report.pdf']
Loaded 153 docs


Parsing nodes:   0%|          | 0/153 [00:00<?, ?it/s]

Parsed 236 nodes


# Generate synthetic queries
Now, we use an LLM (gpt-3.5-turbo) to generate questions using each text chunk in the corpus as context.

Each pair of (generated question, text chunk used as context) becomes a datapoint in the finetuning dataset (either for training or evaluation).

In [None]:
# Take the OpenAI key from the environment and prompt for it only when it is
# missing, so the key is never typed into (or saved with) the notebook source
# and an already-configured key is not overwritten with an empty string.
from getpass import getpass

OPENAI_API_TOKEN = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = OPENAI_API_TOKEN

In [None]:
# Ask gpt-3.5-turbo to write questions for every node; each
# (question, source node) pair becomes one entry of the dataset.
# These calls use the OpenAI API; in the recorded run each took roughly
# eight minutes.
train_dataset = generate_qa_embedding_pairs(
    llm=OpenAI(model="gpt-3.5-turbo"), nodes=train_nodes
)
val_dataset = generate_qa_embedding_pairs(
    llm=OpenAI(model="gpt-3.5-turbo"), nodes=val_nodes
)

100%|██████████| 204/204 [07:38<00:00,  2.25s/it]
100%|██████████| 236/236 [08:29<00:00,  2.16s/it]


In [None]:
# Persist both datasets as JSON in the working directory so the slow
# generation step does not have to be repeated.
train_dataset.save_json("train_dataset.json")
val_dataset.save_json("val_dataset.json")

In [None]:
# [Optional] Reload previously saved datasets instead of regenerating them.
train_dataset = EmbeddingQAFinetuneDataset.from_json("train_dataset.json")
val_dataset = EmbeddingQAFinetuneDataset.from_json("val_dataset.json")

# Finetuning

In [None]:
# Configure fine-tuning of the BAAI/bge-small-en base model on the synthetic
# training pairs. The tuned model is written to "test_model" (relative to the
# working directory); val_dataset is passed as the engine's validation set.
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="BAAI/bge-small-en",
    model_output_path="test_model",
    val_dataset=val_dataset,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Run the fine-tuning job (the recorded output shows 2 epochs).
finetune_engine.finetune()

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/41 [00:00<?, ?it/s]

Iteration:   0%|          | 0/41 [00:00<?, ?it/s]

In [None]:
# Obtain the fine-tuned model from the engine as an embedding-model object.
embed_model = finetune_engine.get_finetuned_model()

In [None]:
# Display the embedding model's configuration.
embed_model

HuggingFaceEmbedding(model_name='test_model', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7c6025e54520>, max_length=512, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None)

# Evaluate Finetuned Model
In this section, we evaluate 3 different embedding models:

1. proprietary OpenAI embedding,

2. open source BAAI/bge-small-en, and

3. our finetuned embedding model.

We consider 2 evaluation approaches:

1. a simple custom hit rate metric

2. using InformationRetrievalEvaluator from sentence_transformers

We show that finetuning on a synthetic (LLM-generated) dataset significantly improves upon an open-source embedding model.

# Define eval function
Option 1: We use a simple hit rate metric for evaluation:

1. for each (query, relevant_doc) pair,
2. we retrieve top-k documents with the query, and
3. it's a hit if the results contain the relevant_doc.

This approach is very simple and intuitive, and we can apply it to both the proprietary OpenAI embedding as well as our open source and fine-tuned embedding models.

In [None]:
def evaluate(dataset, embed_model, top_k=5, verbose=False):
    """Compute per-query top-k retrieval hits for an embedding model.

    Every corpus entry of ``dataset`` is indexed with ``embed_model``; each
    query is then run against that index and counted as a hit when any of
    its relevant documents appears among the ``top_k`` retrieved nodes.

    Args:
        dataset: Object exposing ``corpus`` (id -> text), ``queries``
            (id -> query text) and ``relevant_docs`` (query id -> sequence
            of corpus ids).
        embed_model: Embedding model handed to ``VectorStoreIndex``.
        top_k: Number of nodes retrieved per query.
        verbose: Accepted for interface compatibility but not consulted;
            progress bars are always shown.
            NOTE(review): consider wiring this to ``show_progress``.

    Returns:
        A list with one dict per query holding ``is_hit``, ``retrieved``
        (retrieved node ids), ``expected`` (the first relevant id) and
        ``query`` (the query id).
    """
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    # Index the whole corpus, keeping the dataset's ids as node ids so that
    # retrieved ids can be compared against relevant_docs directly.
    nodes = [TextNode(id_=id_, text=text) for id_, text in corpus.items()]
    index = VectorStoreIndex(nodes, embed_model=embed_model, show_progress=True)
    retriever = index.as_retriever(similarity_top_k=top_k)

    eval_results = []
    for query_id, query in tqdm(queries.items()):
        retrieved_nodes = retriever.retrieve(query)
        retrieved_ids = [node.node.node_id for node in retrieved_nodes]

        # A query may list several relevant documents; retrieving any one of
        # them is a hit. With a single relevant document this is the same
        # check as comparing against the first id only.
        expected_ids = relevant_docs[query_id]
        expected_id = expected_ids[0]
        is_hit = any(doc_id in retrieved_ids for doc_id in expected_ids)

        eval_results.append(
            {
                "is_hit": is_hit,  # Whether a relevant document was retrieved
                "retrieved": retrieved_ids,  # Ids of the retrieved nodes
                "expected": expected_id,  # First relevant document id
                "query": query_id,  # Id of the query
            }
        )

    return eval_results


Option 2: We use the InformationRetrievalEvaluator from sentence_transformers.

This provides a more comprehensive suite of metrics, but we can only run it against sentence-transformers-compatible models (the open-source model and our finetuned model, not the OpenAI embedding model).

In [None]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import SentenceTransformer
from pathlib import Path


def evaluate_st(dataset, model_id, name, output_path="results/"):
    """Score a sentence-transformers model with ``InformationRetrievalEvaluator``.

    Args:
        dataset: Object exposing ``queries``, ``corpus`` and ``relevant_docs``.
        model_id: Model name or local path passed to ``SentenceTransformer``.
        name: Label given to the evaluator.
        output_path: Directory handed to the evaluator for its results; it is
            created (including parents) when missing. Defaults to
            ``"results/"``, the location that was previously hard-coded.

    Returns:
        The value returned by calling the evaluator on the loaded model.
    """
    evaluator = InformationRetrievalEvaluator(
        dataset.queries, dataset.corpus, dataset.relevant_docs, name=name
    )

    model = SentenceTransformer(model_id)

    # Make sure the results directory exists before the evaluator uses it.
    Path(output_path).mkdir(exist_ok=True, parents=True)

    return evaluator(model, output_path=output_path)


# OpenAI

In [None]:
# Baseline 1: the OpenAI embedding model (constructed with its defaults),
# scored with the hit-rate evaluator on the validation set.
ada = OpenAIEmbedding()
ada_val_results = evaluate(val_dataset, ada)
df_ada = pd.DataFrame(ada_val_results)

Generating embeddings:   0%|          | 0/236 [00:00<?, ?it/s]

  0%|          | 0/472 [00:00<?, ?it/s]

In [None]:
# Hit rate = share of validation queries with a hit among the top-k results.
hit_rate_ada = df_ada["is_hit"].mean()
hit_rate_ada

0.9152542372881356

# BAAI/bge-small-en

In [None]:
# Baseline 2: the stock BAAI/bge-small-en model, passed to the index as a
# "local:" model string rather than as an embedding object.
bge = "local:BAAI/bge-small-en"
bge_val_results = evaluate(val_dataset, bge)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating embeddings:   0%|          | 0/236 [00:00<?, ?it/s]

  0%|          | 0/472 [00:00<?, ?it/s]

In [None]:
# Tabulate the per-query results for the stock model.
df_bge = pd.DataFrame(bge_val_results)

In [None]:
# Hit rate of the stock model on the validation queries.
hit_rate_bge = df_bge["is_hit"].mean()
hit_rate_bge

0.864406779661017

In [None]:
# Run the sentence-transformers evaluator on the stock model; its results
# are stored under results/ with the "bge" label.
evaluate_st(val_dataset, "BAAI/bge-small-en", name="bge")

0.692192770892109

# Finetuned

In [None]:
# Fine-tuned model: loaded from the "test_model" directory written by the
# fine-tuning engine, again via a "local:" model string.
finetuned = "local:test_model"
val_results_finetuned = evaluate(val_dataset, finetuned)
df_finetuned = pd.DataFrame(val_results_finetuned)

Generating embeddings:   0%|          | 0/236 [00:00<?, ?it/s]

  0%|          | 0/472 [00:00<?, ?it/s]

In [None]:
# Hit rate of the fine-tuned model on the validation queries.
hit_rate_finetuned = df_finetuned["is_hit"].mean()
hit_rate_finetuned

0.8898305084745762

In [None]:
# Run the sentence-transformers evaluator on the fine-tuned model; its
# results are stored under results/ with the "finetuned" label.
evaluate_st(val_dataset, "test_model", name="finetuned")

0.7406313641717485

In [None]:
# Per-query results for the OpenAI embedding model.
df_ada

Unnamed: 0,is_hit,retrieved,expected,query
0,True,"[74ea78f3-1bd4-4fef-a045-1a4cb88a1a74, 0ec2046...",74ea78f3-1bd4-4fef-a045-1a4cb88a1a74,8cfbde7f-e22d-4bbf-809d-a358e15f32c5
1,True,"[74ea78f3-1bd4-4fef-a045-1a4cb88a1a74, cdbe1b1...",74ea78f3-1bd4-4fef-a045-1a4cb88a1a74,db5362aa-f9df-4226-8a25-34409cce655d
2,True,"[8a849ec8-106f-4353-b4d1-596963f69264, cdbe1b1...",8a849ec8-106f-4353-b4d1-596963f69264,f6b71a01-9269-42c9-a536-c5072842d353
3,True,"[70dc4eea-f724-4784-a659-bf53be0abd49, 8a849ec...",8a849ec8-106f-4353-b4d1-596963f69264,2ffe9bbb-f815-4818-a0b4-6499616c4fbe
4,True,"[bf8c6017-a221-4895-a3e7-9ce3abdf90c7, c0940f4...",4e6cfcf9-b462-4a24-9702-c0a9d17fafe5,e3df5046-1f96-4e0f-82fc-e54aea3011b9
...,...,...,...,...
467,True,"[9f29dac4-3217-4028-ac87-daf6e11194eb, 28a404b...",9f29dac4-3217-4028-ac87-daf6e11194eb,d75a26f2-4d7e-4b2f-b760-635b0cd5e21a
468,True,"[c0940f4f-eb23-4835-8084-0b03dd6ed92c, 5a2edb5...",c0940f4f-eb23-4835-8084-0b03dd6ed92c,9382c168-c5da-40a4-8123-fa0de793720e
469,True,"[4e6cfcf9-b462-4a24-9702-c0a9d17fafe5, c0940f4...",c0940f4f-eb23-4835-8084-0b03dd6ed92c,9fe5235a-8909-4608-8556-542ed387715a
470,True,"[bf8c6017-a221-4895-a3e7-9ce3abdf90c7, 74ea78f...",bf8c6017-a221-4895-a3e7-9ce3abdf90c7,c76cd541-8209-438e-8fc0-ce916065d638


In [None]:
# Per-query results for the stock bge-small-en model.
df_bge

Unnamed: 0,is_hit,retrieved,expected,query
0,True,"[74ea78f3-1bd4-4fef-a045-1a4cb88a1a74, eb89a80...",74ea78f3-1bd4-4fef-a045-1a4cb88a1a74,8cfbde7f-e22d-4bbf-809d-a358e15f32c5
1,True,"[74ea78f3-1bd4-4fef-a045-1a4cb88a1a74, 933b651...",74ea78f3-1bd4-4fef-a045-1a4cb88a1a74,db5362aa-f9df-4226-8a25-34409cce655d
2,True,"[8a849ec8-106f-4353-b4d1-596963f69264, 5a2edb5...",8a849ec8-106f-4353-b4d1-596963f69264,f6b71a01-9269-42c9-a536-c5072842d353
3,True,"[8a849ec8-106f-4353-b4d1-596963f69264, 5a2edb5...",8a849ec8-106f-4353-b4d1-596963f69264,2ffe9bbb-f815-4818-a0b4-6499616c4fbe
4,True,"[c0940f4f-eb23-4835-8084-0b03dd6ed92c, bf8c601...",4e6cfcf9-b462-4a24-9702-c0a9d17fafe5,e3df5046-1f96-4e0f-82fc-e54aea3011b9
...,...,...,...,...
467,True,"[9f29dac4-3217-4028-ac87-daf6e11194eb, 28a404b...",9f29dac4-3217-4028-ac87-daf6e11194eb,d75a26f2-4d7e-4b2f-b760-635b0cd5e21a
468,True,"[c0940f4f-eb23-4835-8084-0b03dd6ed92c, 5a2edb5...",c0940f4f-eb23-4835-8084-0b03dd6ed92c,9382c168-c5da-40a4-8123-fa0de793720e
469,True,"[c0940f4f-eb23-4835-8084-0b03dd6ed92c, 4e6cfcf...",c0940f4f-eb23-4835-8084-0b03dd6ed92c,9fe5235a-8909-4608-8556-542ed387715a
470,True,"[bf8c6017-a221-4895-a3e7-9ce3abdf90c7, 74ea78f...",bf8c6017-a221-4895-a3e7-9ce3abdf90c7,c76cd541-8209-438e-8fc0-ce916065d638


In [None]:
# Per-query results for the fine-tuned model.
df_finetuned

Unnamed: 0,is_hit,retrieved,expected,query
0,True,"[74ea78f3-1bd4-4fef-a045-1a4cb88a1a74, 933b651...",74ea78f3-1bd4-4fef-a045-1a4cb88a1a74,8cfbde7f-e22d-4bbf-809d-a358e15f32c5
1,True,"[74ea78f3-1bd4-4fef-a045-1a4cb88a1a74, 933b651...",74ea78f3-1bd4-4fef-a045-1a4cb88a1a74,db5362aa-f9df-4226-8a25-34409cce655d
2,True,"[8a849ec8-106f-4353-b4d1-596963f69264, 5a2edb5...",8a849ec8-106f-4353-b4d1-596963f69264,f6b71a01-9269-42c9-a536-c5072842d353
3,True,"[8a849ec8-106f-4353-b4d1-596963f69264, 74ea78f...",8a849ec8-106f-4353-b4d1-596963f69264,2ffe9bbb-f815-4818-a0b4-6499616c4fbe
4,False,"[c0940f4f-eb23-4835-8084-0b03dd6ed92c, bf8c601...",4e6cfcf9-b462-4a24-9702-c0a9d17fafe5,e3df5046-1f96-4e0f-82fc-e54aea3011b9
...,...,...,...,...
467,True,"[9f29dac4-3217-4028-ac87-daf6e11194eb, 28a404b...",9f29dac4-3217-4028-ac87-daf6e11194eb,d75a26f2-4d7e-4b2f-b760-635b0cd5e21a
468,True,"[c0940f4f-eb23-4835-8084-0b03dd6ed92c, 5a2edb5...",c0940f4f-eb23-4835-8084-0b03dd6ed92c,9382c168-c5da-40a4-8123-fa0de793720e
469,True,"[4e6cfcf9-b462-4a24-9702-c0a9d17fafe5, c0940f4...",c0940f4f-eb23-4835-8084-0b03dd6ed92c,9fe5235a-8909-4608-8556-542ed387715a
470,True,"[bf8c6017-a221-4895-a3e7-9ce3abdf90c7, 74ea78f...",bf8c6017-a221-4895-a3e7-9ce3abdf90c7,c76cd541-8209-438e-8fc0-ce916065d638


In [None]:
# Tag every result frame with its model name so the frames can be stacked
# and grouped. Note that this adds the column to each frame in place.
df_ada['model'] = 'ada'
df_bge['model'] = 'bge'
df_finetuned['model'] = 'fine_tuned'

In [None]:
# Stack the per-model results and report each model's hit rate.
df_all = pd.concat([df_ada, df_bge, df_finetuned])
# Select the column explicitly: the first positional argument of
# GroupBy.mean() is `numeric_only`, not a column name, so mean('is_hit')
# only produced this table because the string happened to be truthy.
df_all.groupby('model')[['is_hit']].mean()

Unnamed: 0_level_0,is_hit
model,Unnamed: 1_level_1
ada,0.915254
bge,0.864407
fine_tuned,0.889831


In [None]:
# Load the metric tables saved under results/ during the two evaluate_st
# runs (labels "bge" and "finetuned").
df_st_bge = pd.read_csv('results/Information-Retrieval_evaluation_bge_results.csv')
df_st_finetuned = pd.read_csv('results/Information-Retrieval_evaluation_finetuned_results.csv')

In [None]:
# Label each metrics table, stack them, and index by model so the two
# models can be compared row by row.
df_st_bge['model'] = 'bge'
df_st_finetuned['model'] = 'fine_tuned'
df_st_all = pd.concat([df_st_bge, df_st_finetuned])
df_st_all = df_st_all.set_index('model')
df_st_all

Unnamed: 0_level_0,epoch,steps,cos_sim-Accuracy@1,cos_sim-Accuracy@3,cos_sim-Accuracy@5,cos_sim-Accuracy@10,cos_sim-Precision@1,cos_sim-Recall@1,cos_sim-Precision@3,cos_sim-Recall@3,...,dot_score-Recall@1,dot_score-Precision@3,dot_score-Recall@3,dot_score-Precision@5,dot_score-Recall@5,dot_score-Precision@10,dot_score-Recall@10,dot_score-MRR@10,dot_score-NDCG@10,dot_score-MAP@100
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bge,-1,-1,0.57839,0.766949,0.832627,0.887712,0.57839,0.57839,0.25565,0.766949,...,0.57839,0.25565,0.766949,0.166525,0.832627,0.088771,0.887712,0.687092,0.735929,0.692193
fine_tuned,-1,-1,0.627119,0.824153,0.889831,0.934322,0.627119,0.627119,0.274718,0.824153,...,0.627119,0.274718,0.824153,0.177966,0.889831,0.093432,0.934322,0.738322,0.78633,0.740631


In [None]:
# Save the combined comparison table in the working directory.
df_st_all.to_csv("InformationRetrievalEvaluator.csv")

# Evaluation on Generation

In [None]:
from llama_index.core.llama_dataset import download_llama_dataset
from llama_index.core.llama_pack import download_llama_pack
from llama_index.core import VectorStoreIndex

# Download the PatronusAIFinanceBenchDataset benchmark into ./data; the call
# returns the labelled RAG dataset together with its source documents.
rag_dataset, documents = download_llama_dataset(
    "PatronusAIFinanceBenchDataset", "./data"
)

In [None]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.llama_dataset import LabelledRagDataset


# Reload the labelled RAG dataset from the JSON file under ./data
# (this rebinds rag_dataset without downloading again).
rag_dataset = LabelledRagDataset.from_json("./data/rag_dataset.json")


In [None]:
# Load the benchmark's source documents from ./data/source_files.
documents = SimpleDirectoryReader(input_dir="./data/source_files").load_data()

In [None]:
# Number of document objects loaded from the source files.
len(documents)

4306

In [None]:
# Load the documents set aside for fine-tuning from ./data/files_for_finetuning.
documents_for_finetuning = SimpleDirectoryReader(input_dir="./data/files_for_finetuning").load_data()

In [None]:
def load_corpus(documents_for_finetuning, verbose=False):
    """Split documents into chunked nodes.

    This definition replaces the earlier path-based ``load_corpus(files, ...)``
    in the notebook namespace. So that calls written for that signature keep
    working after this cell has run, a non-empty list or tuple consisting
    solely of file paths is first read with ``SimpleDirectoryReader``.

    Args:
        documents_for_finetuning: Already-loaded documents, or a list/tuple
            of file paths (``str`` or ``os.PathLike``) to read.
        verbose: When true, print progress messages and show the splitter's
            progress bar.

    Returns:
        The nodes produced by ``SentenceSplitter`` for the documents.
    """
    docs = documents_for_finetuning

    # Backward compatibility with the path-based signature: load the files.
    # An empty input is left alone and simply yields no nodes, as before.
    if (
        isinstance(docs, (list, tuple))
        and docs
        and all(isinstance(item, (str, os.PathLike)) for item in docs)
    ):
        if verbose:
            print(f"Loading files {docs}")
        docs = SimpleDirectoryReader(input_files=docs).load_data()

    if verbose:
        print(f"Loaded {len(docs)} docs")

    # Chunk the documents using the splitter's default settings.
    parser = SentenceSplitter()
    nodes = parser.get_nodes_from_documents(docs, show_progress=verbose)

    if verbose:
        print(f"Parsed {len(nodes)} nodes")

    return nodes


In [None]:
# Chunk the fine-tuning documents; this rebinds train_nodes, replacing the
# nodes built earlier from the Lyft report.
train_nodes = load_corpus(documents_for_finetuning, verbose=True)

Loaded 121 docs


Parsing nodes:   0%|          | 0/121 [00:00<?, ?it/s]

Parsed 138 nodes


In [None]:
# Preview the benchmark examples (query, reference contexts, reference answer).
rag_dataset.to_pandas()

Unnamed: 0,query,reference_contexts,reference_answer,reference_answer_by,query_by
0,What is the FY2018 capital expenditure amount ...,[Table of Contents \n3M Company and Subsidiari...,$1577.00,human,human
1,Assume that you are a public equities analyst....,[Table of Contents \n3M Company and Subsidiari...,$8.70,human,human
2,Is 3M a capital-intensive business based on FY...,[3M Company and Subsidiaries\n Consolidated St...,"No, the company is managing its CAPEX and Fixe...",human,human
3,What drove operating margin change as of FY202...,"[SG&A, measured as a percent of sales, increas...",Operating Margin for 3M in FY2022 has decrease...,human,human
4,"If we exclude the impact of M&A, which segment...",[Worldwide Sales Change\nBy Business Segment O...,The consumer segment shrunk by 0.9% organically.,human,human
...,...,...,...,...,...
93,Among all of the derivative instruments that V...,[Derivative Instruments \nWe enter into deriva...,Cross currency swaps. Its notional value was $...,human,human
94,"As of FY 2021, how much did Verizon expect to ...",[Pension and postretirement health care and li...,The estimated pension benefits were $1097 mill...,human,human
95,Does Verizon have a reasonably healthy liquidi...,[Consolidated Balance Sheets \nVerizon Communi...,No. The quick ratio was approximately 0.54 for...,human,human
96,Is Verizon a capital intensive business based ...,[Consolidated Balance Sheets \nVerizon Communi...,Yes. Verizon's capital intensity ratio was app...,human,human


In [None]:
# SECURITY: an API key used to be hard-coded in this cell. Treat that key as
# compromised and revoke it. The key is now taken from the environment, with
# an interactive prompt as fallback, so it never appears in the notebook.
from getpass import getpass

OPENAI_API_TOKEN = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = OPENAI_API_TOKEN

In [None]:
# Generate synthetic question/context pairs for the fine-tuning nodes with
# gpt-3.5-turbo. This rebinds train_dataset (previously the Lyft-based set).
train_dataset = generate_qa_embedding_pairs(
    llm=OpenAI(model="gpt-3.5-turbo"), nodes=train_nodes
)

100%|██████████| 138/138 [03:13<00:00,  1.40s/it]


In [None]:
# Persist the generated pairs so the API calls need not be repeated.
train_dataset.save_json("train_dataset_finetuning.json")

In [None]:
# Reload the saved pairs (lets this section start from the JSON file).
train_dataset = EmbeddingQAFinetuneDataset.from_json("train_dataset_finetuning.json")

In [None]:
# Fine-tune a fresh copy of BAAI/bge-small-en on the pairs generated from
# ./data/files_for_finetuning (no validation set is supplied here).
# A separate output directory keeps this run from overwriting the
# "test_model" directory written by the earlier fine-tuning run, which the
# retrieval-evaluation cells load by path.
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="BAAI/bge-small-en",
    model_output_path="test_model_finetuning",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Run the second fine-tuning job (the recorded output shows 2 epochs).
finetune_engine.finetune()

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/28 [00:00<?, ?it/s]

Iteration:   0%|          | 0/28 [00:00<?, ?it/s]

In [None]:
# Obtain the newly fine-tuned model; this rebinds embed_model, which the RAG
# index built further down uses.
embed_model = finetune_engine.get_finetuned_model()

In [None]:
# Display the embedding model's configuration.
embed_model

HuggingFaceEmbedding(model_name='test_model', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7c5b06ada350>, max_length=512, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None)

# Generation Evaluation

In [None]:
from llama_index.core.llama_dataset import download_llama_dataset
from llama_index.core.llama_pack import download_llama_pack
from llama_index.core import VectorStoreIndex

In [None]:
import nest_asyncio

# Patch asyncio to permit nested event loops inside the notebook —
# presumably required by the evaluator pack's run(); confirm.
nest_asyncio.apply()

In [None]:
# Download the RagEvaluatorPack into ./rag_evaluator_pack; the returned
# class is used below to run the generation benchmark.
RagEvaluatorPack = download_llama_pack(
    "RagEvaluatorPack", "./rag_evaluator_pack"
)

In [None]:
# Build a basic RAG pipeline over the benchmark's source documents,
# embedding them with the fine-tuned model held in embed_model.
index = VectorStoreIndex.from_documents(embed_model = embed_model, documents=documents)
query_engine = index.as_query_engine()

In [None]:
# Construction requires a query_engine and a rag_dataset; judge_llm is
# optional and is set here to gpt-3.5-turbo.
rag_evaluator_pack = RagEvaluatorPack(
    query_engine=query_engine, rag_dataset=rag_dataset, judge_llm = OpenAI(model="gpt-3.5-turbo")
)

In [None]:
# Run the benchmark for the fine-tuned pipeline and show the mean
# correctness, relevancy, faithfulness and context-similarity scores.
benchmark_df = rag_evaluator_pack.run()  # async arun() also supported
print(benchmark_df)

100%|██████████| 10/10 [00:22<00:00,  2.25s/it]
100%|██████████| 10/10 [00:19<00:00,  1.99s/it]
100%|██████████| 10/10 [00:22<00:00,  2.29s/it]
100%|██████████| 10/10 [00:22<00:00,  2.23s/it]
100%|██████████| 10/10 [00:19<00:00,  1.99s/it]
100%|██████████| 10/10 [00:21<00:00,  2.15s/it]
100%|██████████| 10/10 [00:25<00:00,  2.58s/it]
100%|██████████| 10/10 [00:21<00:00,  2.19s/it]
100%|██████████| 10/10 [00:19<00:00,  1.99s/it]
100%|██████████| 8/8 [00:17<00:00,  2.25s/it]
2it [00:04,  2.39s/it]
2it [00:05,  2.82s/it]
2it [00:04,  2.29s/it]
2it [00:05,  2.51s/it]
2it [00:04,  2.06s/it]
2it [00:04,  2.31s/it]
2it [00:04,  2.44s/it]
2it [00:04,  2.23s/it]
2it [00:04,  2.36s/it]
2it [00:04,  2.01s/it]
2it [00:05,  2.60s/it]
2it [00:04,  2.34s/it]
2it [00:04,  2.50s/it]
2it [00:04,  2.23s/it]
2it [00:03,  1.74s/it]
2it [00:03,  1.91s/it]
2it [00:05,  2.62s/it]
2it [00:04,  2.11s/it]
2it [00:03,  1.88s/it]
2it [00:03,  1.83s/it]
2it [00:04,  2.11s/it]
2it [00:05,  2.92s/it]
2it [00:04,  2.0

rag                            base_rag
metrics                                
mean_correctness_score         3.592391
mean_relevancy_score           0.346939
mean_faithfulness_score        0.622449
mean_context_similarity_score  0.849334


# Naive Embedding Model

In [None]:
# Load the stock BAAI/bge-small-en model with sentence-transformers.
# NOTE(review): `model` is not referenced by the cells visible below — the
# index is built from the "local:BAAI/bge-small-en" string instead; confirm
# whether this cell is still needed.
from sentence_transformers import SentenceTransformer
model_id="BAAI/bge-small-en"
model = SentenceTransformer(model_id)

In [None]:
# Build the same RAG pipeline over the source documents, this time embedding
# them with the stock BAAI/bge-small-en model as the comparison baseline.
# This rebinds index and query_engine.
index = VectorStoreIndex.from_documents(embed_model = "local:BAAI/bge-small-en", documents=documents)
query_engine = index.as_query_engine()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Recreate the evaluator around the baseline query engine, with the same
# dataset and judge LLM as the fine-tuned run.
rag_evaluator_pack = RagEvaluatorPack(
    query_engine=query_engine, rag_dataset=rag_dataset, judge_llm = OpenAI(model="gpt-3.5-turbo")
)

In [None]:
# Run the benchmark for the baseline pipeline and show its mean scores.
naive_df = rag_evaluator_pack.run()  # async arun() also supported
print(naive_df)

2it [00:06,  3.19s/it]
2it [00:05,  2.75s/it]
2it [00:05,  2.52s/it]
2it [00:04,  2.11s/it]
2it [00:04,  2.18s/it]
2it [00:04,  2.32s/it]
2it [00:04,  2.16s/it]
2it [00:04,  2.03s/it]
2it [00:03,  1.78s/it]
2it [00:04,  2.09s/it]
2it [00:04,  2.38s/it]
2it [00:04,  2.45s/it]
2it [00:05,  2.67s/it]
2it [00:05,  2.86s/it]
2it [00:03,  1.88s/it]
2it [00:04,  2.23s/it]
2it [00:05,  2.88s/it]
2it [00:04,  2.45s/it]
2it [00:04,  2.11s/it]
2it [00:03,  1.88s/it]
2it [00:04,  2.02s/it]
2it [00:04,  2.25s/it]
2it [00:04,  2.15s/it]
2it [00:04,  2.33s/it]
2it [00:03,  1.66s/it]
2it [00:04,  2.20s/it]
2it [00:03,  1.74s/it]
2it [00:05,  2.56s/it]
2it [00:03,  1.79s/it]
2it [00:06,  3.36s/it]
2it [00:05,  2.76s/it]
2it [00:04,  2.16s/it]
2it [00:05,  2.83s/it]
2it [00:04,  2.41s/it]
2it [00:04,  2.04s/it]
2it [00:04,  2.22s/it]
2it [00:04,  2.28s/it]
2it [00:03,  1.90s/it]
2it [00:05,  2.71s/it]
2it [00:05,  2.50s/it]
2it [00:04,  2.00s/it]
2it [00:03,  1.97s/it]
2it [00:03,  1.96s/it]
2it [00:05,

rag                            base_rag
metrics                                
mean_correctness_score         3.532609
mean_relevancy_score           0.306122
mean_faithfulness_score        0.602041
mean_context_similarity_score  0.849325
