## Testing different embedding models

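This notebook compares text-embedding models from OpenAI, Google Gemini, and Cohere, plus a small open model from Hugging Face, inside a retrieval-augmented question-answering pipeline. Each model is evaluated on relevancy, faithfulness, and correctness. A final section repeats the evaluation with different OpenAI LLMs for a fixed embedding model.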

Install and import the required libraries

In [None]:
%%capture
!pip install llama-index
!pip install langchain
!pip install pypdf
!pip install openai

In [None]:
%%capture
!pip install llama-index-embeddings-gemini
!pip install llama-index-embeddings-cohere
!pip install llama-index-embeddings-huggingface

In [None]:
%%capture
!pip install openai
!pip install cohere
!pip install transformers


In [None]:
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    Response,
)
from llama_index.core.evaluation import (
    FaithfulnessEvaluator,
    RelevancyEvaluator,
    CorrectnessEvaluator,
    RetrieverEvaluator,
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset)
from llama_index.core.evaluation import DatasetGenerator

Get API keys ready

In [None]:
import os
from google.colab import userdata
# Read the OpenAI key from Colab's secret store, export it through the process
# environment, and keep a module-level copy for explicit api_key= arguments.
OPENAI_KEY = userdata.get('OPENAI_API_KEY')
os.environ.update({'OPENAI_API_KEY': OPENAI_KEY})
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [None]:
# Gemini key: stored in the environment under GEMINI_API_KEY, while the
# Python-side copy passed to the embedding constructor is named GOOGLE_API_KEY.
GEMINI_KEY = userdata.get('GEMINI_API_KEY')
os.environ.update({'GEMINI_API_KEY': GEMINI_KEY})
GOOGLE_API_KEY = os.environ.get('GEMINI_API_KEY')

In [None]:
# Cohere key: read from Colab secrets, exported to the environment, and kept
# as COHERE_API_KEY for the embedding constructor.
COHERE_KEY = userdata.get('COHERE_API_KEY')
os.environ.update({'COHERE_API_KEY': COHERE_KEY})
COHERE_API_KEY = os.environ.get('COHERE_API_KEY')

In [None]:
# Hugging Face key: read from Colab secrets and exported as HUGGINGFACE_API_KEY.
# NOTE(review): the output of the HuggingFaceEmbedding cell reports that the
# `HF_TOKEN` secret does not exist, so this variable name may not be what the
# Hub client looks for - confirm whether the token is actually picked up.
HUGGINGFACE_KEY = userdata.get('HUGGINGFACE_API_KEY')
os.environ.update({'HUGGINGFACE_API_KEY': HUGGINGFACE_KEY})
HUGGINGFACE_API_KEY = os.environ.get('HUGGINGFACE_API_KEY')

Load and parse document

In [None]:
!mkdir data
!wget "https://licindia.in/documents/20121/62876/Final+Policy+document_LICs+New+Jeevan+Shanti_V05_logo.pdf/f7950a05-1136-1bfb-bf4a-f997d85fc30f?t=1707288544327" -O 'data/doc.pdf'

--2024-10-22 21:39:35--  https://licindia.in/documents/20121/62876/Final+Policy+document_LICs+New+Jeevan+Shanti_V05_logo.pdf/f7950a05-1136-1bfb-bf4a-f997d85fc30f?t=1707288544327
Resolving licindia.in (licindia.in)... 64.185.181.238
Connecting to licindia.in (licindia.in)|64.185.181.238|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 588928 (575K) [application/pdf]
Saving to: ‘data/doc.pdf’


2024-10-22 21:39:36 (800 KB/s) - ‘data/doc.pdf’ saved [588928/588928]



In [None]:
from pathlib import Path
from llama_index.readers.file import PDFReader

In [None]:
# Read the downloaded policy PDF into `documents`, the corpus every index
# below is built from.
loader = PDFReader()
documents = loader.load_data(file=Path('data') / 'doc.pdf')

In [None]:
from llama_index.core import VectorStoreIndex
# Build a vector index over the parsed PDF; no embed_model is passed, so
# whatever llama-index is configured to use by default applies here.
# NOTE(review): in the cells visible here this index is only used to create
# `retriever`; evaluate_responses builds its own index per embedding model.
index = VectorStoreIndex.from_documents(documents)

In [None]:
# Retriever over the default `index`.
# NOTE(review): not referenced by any later cell visible here - confirm it is
# still needed.
retriever = index.as_retriever()

Get embedding models ready

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding
# OpenAI embedding model under test.
embed_model_openai = OpenAIEmbedding(
    api_key=OPENAI_API_KEY,
    model='text-embedding-3-small',
)

In [None]:
from llama_index.embeddings.gemini import GeminiEmbedding
# Gemini embedding model under test; the model id is kept in a variable so it
# is easy to swap.
model_name = "models/text-embedding-004"
embed_model_gemini = GeminiEmbedding(
    api_key=GOOGLE_API_KEY,
    model_name=model_name,
)

In [None]:
from llama_index.embeddings.cohere import CohereEmbedding

In [None]:
# Cohere embedding model under test.
# NOTE(review): input_type is set to "search_query", yet this same object is
# also used to embed the documents when the index is built - confirm how the
# installed llama-index Cohere integration applies input_type to documents.
embed_model_cohere = CohereEmbedding(
    model_name="embed-english-v3.0",
    input_type="search_query",
    cohere_api_key=COHERE_API_KEY,
)

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [None]:
# Small open embedding model under test; its files are downloaded when the
# object is created (see the progress bars in this cell's output).
embed_model_huggingface = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
from llama_index.llms.openai import OpenAI
from llama_index.core.llms import ChatMessage

In [None]:
# LLM held fixed while the embedding models are compared. No api_key is
# passed here - presumably the client falls back to the OPENAI_API_KEY
# environment variable exported earlier.
llm = OpenAI(
    model='gpt-4o-mini',
)

In [None]:
import warnings

# Suppress every warning category for the rest of the session. This also
# hides deprecation notices emitted by the libraries used below.
warnings.simplefilter("ignore")

In [None]:
# The notebook kernel already runs an asyncio event loop; patch asyncio so
# library code can attach to that same loop instead of failing on a nested one.
import nest_asyncio

nest_asyncio.apply()

import logging
import sys

# Handler first: INFO-and-above records are written to sys.stdout, i.e. the
# cell output in Colab.
handler = logging.StreamHandler(stream=sys.stdout)
handler.setLevel(logging.INFO)

# Root logger: discard any handlers installed earlier so ours is the only
# one, and let INFO-and-above records through.
logger = logging.getLogger()
logger.handlers = [handler]
logger.setLevel(logging.INFO)

Generate questions and answers

In [None]:
# Synthesize the evaluation set from the parsed PDF; the resulting dataset
# exposes (question, answer) pairs via `qr_pairs`, used in the next cell.
# NOTE(review): no llm is passed, so generation presumably uses llama-index's
# default LLM rather than `llm` - confirm. The output is not seeded, so the
# questions may differ between runs.
data_generator = DatasetGenerator.from_documents(documents)
eval_dataset = data_generator.generate_dataset_from_nodes(num = 20)

  return cls(


HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.co

  return QueryResponseDataset(queries=queries, responses=responses_dict)


In [None]:
# Split the generated (question, answer) pairs into two parallel lists.
qr_pairs = eval_dataset.qr_pairs
eval_questions = [pair[0] for pair in qr_pairs]
eval_answers = [pair[1] for pair in qr_pairs]

The function below takes an embedding model and an LLM and returns, for every evaluation question, a Pass/Fail verdict for relevancy, faithfulness, and correctness.

In [None]:
def evaluate_responses(embedding_model, llm_model, questions=None, reference_answers=None):
    """Build a RAG pipeline for one embedding/LLM pair and grade its answers.

    The notebook-level ``documents`` are indexed with ``embedding_model``.
    Every question is answered by a top-3 query engine driven by
    ``llm_model``, and the answer is then judged by evaluators that also use
    ``llm_model``.

    Args:
        embedding_model: Embedding model used to build the vector index.
        llm_model: LLM used both to synthesise answers and to run the
            relevancy, faithfulness and correctness evaluators.
        questions: Optional list of query strings. Defaults to the
            notebook-level ``eval_questions``.
        reference_answers: Optional list of reference answers aligned with
            ``questions``. Defaults to the notebook-level ``eval_answers``
            when ``questions`` is also left at its default; otherwise no
            reference is supplied to the correctness evaluator.

    Returns:
        list[dict]: One dict per question with the keys ``"relevancy"``,
        ``"faithfulness"`` and ``"correctness"``, each ``"Pass"`` or
        ``"Fail"``.

    Raises:
        ValueError: If ``questions`` and ``reference_answers`` differ in length.
    """
    if questions is None:
        # Default to the evaluation set generated earlier in the notebook.
        questions = eval_questions
        if reference_answers is None:
            reference_answers = eval_answers
    if reference_answers is None:
        reference_answers = [None] * len(questions)
    if len(reference_answers) != len(questions):
        raise ValueError("questions and reference_answers must have the same length")

    vector_store_index = VectorStoreIndex.from_documents(
        documents, embed_model=embedding_model, show_progress=False
    )
    # Hand the LLM to the query engine explicitly; without it the engine falls
    # back to the library default and `llm_model` would only act as the judge.
    query_engine = vector_store_index.as_query_engine(similarity_top_k=3, llm=llm_model)

    relevancy_evaluator = RelevancyEvaluator(llm=llm_model)
    faithfulness_evaluator = FaithfulnessEvaluator(llm=llm_model)
    correctness_evaluator = CorrectnessEvaluator(llm=llm_model)

    eval_results = []

    for eval_query, reference in zip(questions, reference_answers):
        response_vector = query_engine.query(eval_query)
        answer = response_vector.response
        # One single-element context list per retrieved node.
        node_contexts = [[node.get_content()] for node in response_vector.source_nodes]

        # Relevancy / faithfulness pass when at least one retrieved node
        # supports the answer; any() stops at the first passing node.
        relevancy_ok = any(
            relevancy_evaluator.evaluate(
                query=eval_query, response=answer, contexts=contexts
            ).passing
            for contexts in node_contexts
        )
        faithfulness_ok = any(
            faithfulness_evaluator.evaluate(
                query=eval_query, response=answer, contexts=contexts
            ).passing
            for contexts in node_contexts
        )

        # NOTE(review): correctness is judged against the reference answer
        # rather than the retrieved context, so it is evaluated once per
        # question instead of once per node - confirm against the installed
        # llama-index version.
        correctness_ok = bool(
            correctness_evaluator.evaluate(
                query=eval_query, response=answer, reference=reference
            ).passing
        )

        eval_results.append(
            {
                "relevancy": "Pass" if relevancy_ok else "Fail",
                "faithfulness": "Pass" if faithfulness_ok else "Fail",
                "correctness": "Pass" if correctness_ok else "Fail",
            }
        )

    return eval_results




Test different embedding models with the same LLM

In [None]:
%%capture
# Run the full evaluation once per embedding model, holding the LLM fixed.
# Each call rebuilds the index and queries the LLM for every question, so the
# runs stay as separate statements: results of completed runs remain
# available if a later one fails.
res_openai = evaluate_responses(embed_model_openai,llm)
res_gemini = evaluate_responses(embed_model_gemini,llm)
res_cohere = evaluate_responses(embed_model_cohere,llm)
res_bge = evaluate_responses(embed_model_huggingface,llm)

HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddi

In [None]:
import pandas as pd
import numpy as np

def compute_pass_rate(eval_results):
    """Return the fraction of "Pass" verdicts for each metric.

    Args:
        eval_results: List of dicts mapping a metric name to "Pass" or
            "Fail", one dict per evaluated question.

    Returns:
        pandas.Series: Indexed by metric name; each value is the share of
        rows whose verdict equals "Pass".
    """
    df = pd.DataFrame(eval_results)
    # DataFrame.eq is vectorised and avoids DataFrame.applymap, which is
    # deprecated in recent pandas releases.
    return df.eq("Pass").mean()

# Pass rate per metric for each embedding model.
results_dict = {
    label: compute_pass_rate(run)
    for label, run in (
        ('OpenAI', res_openai),
        ('Gemini', res_gemini),
        ('Cohere', res_cohere),
        ('HuggingFace', res_bge),
    )
}

# A dict of Series puts the models in columns; transpose so that each model
# becomes a row and each metric a column.
summary_df = pd.DataFrame(results_dict).T

print(summary_df)



             relevancy  faithfulness  correctness
OpenAI            0.95          0.90         0.95
Gemini            1.00          0.90         0.80
Cohere            0.95          0.95         0.90
HuggingFace       1.00          0.95         0.90


Evaluate different LLMs with the same embedding model

In [None]:
# OpenAI chat models to compare while the embedding model is held fixed.
llm_openai_4mini = OpenAI(model='gpt-4o-mini', api_key=OPENAI_API_KEY)
llm_openai_4 = OpenAI(model='gpt-4o', api_key=OPENAI_API_KEY)
llm_openai_35 = OpenAI(model='gpt-3.5-turbo', api_key=OPENAI_API_KEY)


In [None]:
# Run the full evaluation once per LLM, holding the OpenAI embedding model
# fixed. The runs stay as separate statements so results of completed runs
# remain available if a later one fails.
# NOTE(review): llm_openai_4mini uses the same model name ('gpt-4o-mini') as
# `llm`, so the first run appears to repeat the configuration behind
# res_openai - confirm whether it can be reused instead.
res_openai_4mini = evaluate_responses(embed_model_openai,llm_openai_4mini)
res_openai_4 = evaluate_responses(embed_model_openai,llm_openai_4)
res_openai_35 = evaluate_responses(embed_model_openai,llm_openai_35)


HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddi

In [None]:
# Pass rate per metric for each LLM. This rebinds results_dict and summary_df
# from the embedding comparison above.
results_dict = {
    label: compute_pass_rate(run)
    for label, run in (
        ('OpenAI_4', res_openai_4),
        ('OpenAI_4mini', res_openai_4mini),
        ('OpenAI_35', res_openai_35),
    )
}

# Transpose so that each LLM becomes a row and each metric a column.
summary_df = pd.DataFrame(results_dict).T

print(summary_df)

              relevancy  faithfulness  correctness
OpenAI_4           0.90          0.85         0.65
OpenAI_4mini       0.95          0.90         0.95
OpenAI_35          0.95          1.00         0.00
