
evaluate function #1421

@amin-kh96

Description


I created a subclass of BaseRagasEmbeddings because I already have all the embeddings for the context, query, and answer. I did this to avoid using the OpenAI API key, since it is costly, and because I also want to use other models such as Mistral. The model I used to create the embeddings is 'text-embedding-ada-002', but I keep running into this error:
Exception has occurred: OpenAIError
The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable
File "C:\Users\Amin\OneDrive - unige.it\Desktop\tirocini\code\version01rageva.py", line 165, in
evaluation_report = evaluate(ragas_data, metrics=metrics, embeddings=custom_embeddings)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
openai.OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable.

As far as I know, I should not need an API key for this, so please help me fix the error. I have also included my code below so you can take a look. Note that I am using a model from Hugging Face, and the model name is:
model_name = 'distilbert-base-uncased'
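From the traceback, I suspect the failure comes from the metric's judge LLM rather than from my embeddings: metrics such as context_utilization call an LLM, and evaluate() seems to build a default OpenAI client when no llm argument is passed. Something like the sketch below might avoid that, though I have not verified it; the llm parameter, LangchainLLMWrapper, and HuggingFacePipeline.from_model_id are assumptions based on the ragas and LangChain docs:

# Unverified sketch: pass a local Hugging Face model as the judge LLM so that
# evaluate() never constructs the default OpenAI client.
from langchain_community.llms import HuggingFacePipeline
from ragas.llms import LangchainLLMWrapper

hf_llm = HuggingFacePipeline.from_model_id(
    model_id='google/flan-t5-base',   # any local model would do here
    task='text2text-generation',
)
evaluation_report = evaluate(
    dataset,                          # a datasets.Dataset
    metrics=metrics,
    llm=LangchainLLMWrapper(hf_llm),  # assumed: evaluate() accepts an llm argument
    embeddings=custom_embeddings,
)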

import json
from transformers import AutoModel, AutoTokenizer
import torch
import numpy as np
from datasets import Dataset
from ragas.embeddings import BaseRagasEmbeddings
from ragas.metrics import context_utilization, ContextUtilization
from ragas import evaluate

# Load the ground truth data

file_path = 'assets/GT.json'
with open(file_path) as f:
    ground_truth_data = json.load(f)

# Load the questions, answers, and chunks

file_path = 'assets/user_llm_interaction_embeddings_c1521dd5_b819_4241_b3a4_3e5c1388037c.json'
with open(file_path) as f:
    llm = json.load(f)

# Initialize an empty list to hold the new dataset

data_set = []

# Iterate through the list and combine every two dictionaries

for i in range(0, len(llm), 2):
    combined_dict = {
        "text_vector_1": llm[i].get("text_vector", []),
        "text_vector_2": llm[i + 1].get("text_vector", []),
        "chunks": llm[i + 1].get("chunks", [])
    }
    data_set.append(combined_dict)
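For clarity, the interaction file alternates query records and answer records, so every pair is merged into one entry. A hypothetical illustration of the shape my code expects (values invented, not my real data):

# Hypothetical input shape, for illustration only:
# llm = [
#     {"text_vector": [0.12, -0.03]},                           # query embedding
#     {"text_vector": [0.08, 0.41], "chunks": ["chunk-id-1"]},  # answer embedding + chunk ids
# ]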

def map_chunks(data_set, ground_truth_data):
    for item in data_set:  # Iterate over each dictionary in data_set
        c = []  # Reset c for each item
        for chunk_id in item['chunks']:  # Loop through 'chunks' in the current dictionary
            for element in ground_truth_data:  # Loop through ground_truth_data
                if element['id'] == chunk_id:  # Match chunk_id with the element's id
                    c.append(element['text_vector'])  # Append the matching text_vector to c
        item['chunks'] = c  # Replace the original chunk ids with the mapped text_vector values
    return data_set  # Return the updated data_set

data_set = map_chunks(data_set, ground_truth_data)

# Assuming data_set is a list of dictionaries

ragas_data = [
    {
        "question": entry["text_vector_1"],  # Assuming this is a list of strings
        "answer": entry["text_vector_2"],  # Assuming this is a list of strings
        "contexts": entry["chunks"]  # Assuming this is a list of lists of strings
    }
    for entry in data_set
]

# Create the required structure for Dataset

formatted_data = {
    "question": [entry["question"] for entry in ragas_data],
    "contexts": [entry["contexts"] for entry in ragas_data],
    "answer": [entry["answer"] for entry in ragas_data]
}
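Since Dataset is imported above and evaluate() expects a datasets.Dataset rather than a plain list, I believe the formatted dict is meant to be converted next (a one-line sketch of what I think the intended step is):

# Presumed next step: turn the formatted dict into a datasets.Dataset
dataset = Dataset.from_dict(formatted_data)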

model_name = 'distilbert-base-uncased'

class CustomHuggingFaceRagasEmbeddings(BaseRagasEmbeddings):
    def __init__(self, model_name: str, custom_embeddings: list = None):
        """
        Initialize the custom Hugging Face Ragas embeddings with the specified model
        and optional pre-computed embeddings.

        Parameters:
            model_name (str): The name of the Hugging Face model to use (e.g., 'distilbert-base-uncased').
            custom_embeddings (list): A list of pre-computed custom embeddings (optional).
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.custom_embeddings = custom_embeddings  # Store the pre-computed embeddings

    def embed_documents(self, texts: list) -> np.ndarray:
        """
        Generate embeddings for a list of documents.

        Parameters:
            texts (list): A list of documents to embed.

        Returns:
            np.ndarray: An array of embeddings for the documents.
        """
        if self.custom_embeddings is not None:
            # If pre-computed embeddings were provided, return those instead
            return np.array(self.custom_embeddings)

        # Otherwise generate new embeddings with the model
        inputs = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Use the CLS token as the sentence embedding
        embeddings = outputs.last_hidden_state[:, 0, :]
        return embeddings.numpy()  # Convert to a NumPy array

    def embed_query(self, query: str) -> np.ndarray:
        """
        Generate an embedding for a single query.

        Parameters:
            query (str): The query to embed.

        Returns:
            np.ndarray: The embedding for the query.
        """
        if self.custom_embeddings is not None:
            # Relating a new query to the pre-computed embeddings is not handled yet
            raise NotImplementedError("Custom query embeddings are not supported with provided custom embeddings.")

        # Generate a new embedding using the model
        inputs = self.tokenizer(query, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Use the CLS token as the query embedding
        embedding = outputs.last_hidden_state[:, 0, :]
        return embedding.numpy()  # Convert to a NumPy array
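As a quick sanity check of the subclass on its own (a throwaway snippet, not part of my pipeline):

# Sanity check: embed two short texts with a freshly created instance
emb = CustomHuggingFaceRagasEmbeddings(model_name='distilbert-base-uncased')
vectors = emb.embed_documents(["What is RAG?", "Retrieval-augmented generation."])
print(vectors.shape)  # expected (2, 768) for distilbert-base-uncased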

# Initialize the custom embeddings class with the pre-computed embeddings
custom_embeddings = CustomHuggingFaceRagasEmbeddings(model_name=model_name, custom_embeddings=ragas_data)

# Define the evaluation metrics

metrics = [context_utilization]

# A sketch of a custom evaluate function that avoids the OpenAI API (currently unused):
# def custom_evaluate(ragas_data, metrics, embeddings: BaseRagasEmbeddings):
#     """
#     Custom evaluation function that avoids using the OpenAI API.
#
#     Parameters:
#         ragas_data: The dataset to evaluate.
#         metrics: A list of metrics to evaluate.
#         embeddings: A custom embedding model (subclass of BaseRagasEmbeddings).
#
#     Returns:
#         A dictionary of evaluation results.
#     """
#     results = {}
#     for metric in metrics:
#         try:
#             # Make sure the metric is compatible with the custom embeddings
#             if isinstance(metric, ContextUtilization):  # Check for the specific metric type
#                 result = evaluate(ragas_data, metrics=[metric], embeddings=embeddings)
#                 results[metric.name] = result
#         except Exception as e:
#             print(f"Error while evaluating metric {metric.name}: {e}")
#     return results

# Run the evaluation

evaluation_report = evaluate(ragas_data, metrics=metrics, embeddings=custom_embeddings)

# Print the evaluation results

print("RAGAS Evaluation Report:")
print(evaluation_report)
