In [1]:
import ragas

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import os

# Read the Together API key from the process environment so the secret is
# never hardcoded in the notebook. `os.environ` is the environment mapping;
# indexing it raises KeyError when the variable is not set, which surfaces a
# missing key here rather than as an authentication failure further down.
TOGETHER_API_KEY = os.environ['TOGETHER_API_KEY']

In [18]:
import json
from ragas import SingleTurnSample, EvaluationDataset

# Read the evaluation records from disk.
with open("../dataset.json") as dataset_file:
    data = json.load(dataset_file)

# Each field is looked up by sample position when the samples are built below.
questions, ground_truths = data['question'], data['ground_truths']
answers, contexts = data['answer'], data['contexts']

# Only the first two records are turned into samples for now;
# switch to range(len(questions)) to cover the whole file.
samples = [
    SingleTurnSample(
        user_input=questions[idx],
        retrieved_contexts=contexts[idx],
        response=answers[idx],
        # The ground-truth entries of a record are merged into a single
        # newline-separated reference string.
        reference="\n".join(ground_truths[idx]),
    )
    for idx in range(2)
]

eval_dataset = EvaluationDataset(samples=samples)
eval_dataset

#from datasets import load_dataset
# eval_dataset = load_dataset("json", data_files="../dataset.json")
# eval_dataset

EvaluationDataset(features=['user_input', 'retrieved_contexts', 'response', 'reference'], len=2)

In [58]:
from llama_index.embeddings.openai import OpenAIEmbedding
from ragas.embeddings import LlamaIndexEmbeddingsWrapper  # LangchainEmbeddingsWrapper

# Settings that point the OpenAI embedding client at Together's API base URL
# and select the embedding model to use.
embedding_model_params = dict(
    api_key=TOGETHER_API_KEY,
    api_base="https://api.together.xyz/v1",
    model_name="togethercomputer/m2-bert-80M-32k-retrieval",
)

# Build the LlamaIndex embedding model, then wrap it for use by ragas.
embeddings = OpenAIEmbedding(**embedding_model_params)
evaluator_embeddings = LlamaIndexEmbeddingsWrapper(embeddings)

# Display the wrapper as the cell output.
evaluator_embeddings

<ragas.embeddings.base.LlamaIndexEmbeddingsWrapper at 0x788d2e72c950>

In [59]:
# Smoke test: embed a single query through the wrapped embedding model and
# show the resulting vector as the cell output.
evaluator_embeddings.embed_query(
    "What's the most famous tower of Paris?"
)

[-0.024072325,
 0.060781237,
 -0.016816378,
 -0.004188854,
 0.003114574,
 0.037006628,
 -0.014296099,
 0.05881617,
 0.08166336,
 -0.00041788627,
 0.014675082,
 -0.010115089,
 0.027907567,
 0.02533795,
 -0.009160925,
 0.044618413,
 -0.040329587,
 -0.07533443,
 -0.03465597,
 0.024921114,
 -0.046954054,
 -0.010807617,
 -0.027965117,
 0.036141243,
 -0.021553403,
 0.14104816,
 0.004555829,
 0.07522436,
 0.09299631,
 -0.042492636,
 -0.03587672,
 -0.054819778,
 0.0010217387,
 0.020009708,
 0.023855839,
 -0.006927454,
 0.07019608,
 -0.014631169,
 -0.032102644,
 -0.04982573,
 0.059244923,
 -0.05556083,
 0.0013084703,
 0.021227159,
 -0.06888124,
 -0.050158534,
 -0.04116523,
 0.007368303,
 -0.05171302,
 0.06660133,
 0.089541465,
 0.02946038,
 0.09827351,
 0.03586011,
 0.0678252,
 -0.05280285,
 -0.06435847,
 -0.014457261,
 -0.10958488,
 -0.016454905,
 0.028531194,
 0.0055424212,
 -0.035888635,
 -0.05260759,
 0.04106157,
 -0.1250776,
 0.06574742,
 0.026016878,
 -0.08859429,
 0.039683428,
 0.0725348

In [70]:
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness
from ragas.metrics import ResponseRelevancy
from ragas.metrics import SemanticSimilarity
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import TopicAdherenceScore

from ragas import evaluate

from llama_index.llms.together import TogetherLLM
from ragas.llms import LlamaIndexLLMWrapper

# Connection settings and model id for the LLM used as the evaluator.
llm_model_params = dict(
    api_key=TOGETHER_API_KEY,
    base_url="https://api.together.xyz/v1",
    model="Qwen/Qwen2.5-7B-Instruct-Turbo",
)

# Build the LlamaIndex LLM and wrap it so ragas can call it.
chat_model = TogetherLLM(**llm_model_params)
evaluator_llm = LlamaIndexLLMWrapper(chat_model)

# Metrics that are given the embedding model explicitly.
rr = ResponseRelevancy(embeddings=evaluator_embeddings)
ss = SemanticSimilarity(embeddings=evaluator_embeddings)

# Metrics constructed without arguments; the evaluator LLM is supplied
# through the evaluate() call below.
fc = FactualCorrectness()
ff = Faithfulness()

# Currently disabled:
#rc = LLMContextRecall()
# needs multiturn samples dataset
# tas = TopicAdherenceScore(mode="precision", llm=evaluator_llm)

metrics = [fc, ff, rr, ss]
results = evaluate(
    dataset=eval_dataset,
    metrics=metrics,
    llm=evaluator_llm,
)

Evaluating: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:17<00:00,  2.14s/it]


In [71]:
# Convert the evaluation results to a DataFrame (one row per sample, as in
# the output below) and preview the first rows.
df = results.to_pandas()
df.head(n=5)

Unnamed: 0,user_input,retrieved_contexts,response,reference,factual_correctness,faithfulness,answer_relevancy,semantic_similarity
0,What is ECMWF’s main focus in weather prediction?,[Seasonal Forecasting at ECMWF | CMEMS\n\r\nSk...,ECMWF's main focus in weather prediction is on...,ECMWF's main focus in weather prediction is on...,1.0,0.6,0.540814,0.692752
1,How does ECMWF use satellite data to improve w...,[Seasonal Forecasting at ECMWF | CMEMS\n\r\nSk...,ECMWF uses satellite data to improve weather f...,ECMWF uses satellite data to improve weather f...,1.0,0.0,0.76358,0.386332
