# Single response evaluation using RAGAS

https://docs.ragas.io/en/stable/concepts/metrics/available_metrics

In [1]:
# !python -m pip install -U ragas dataset
import ragas
print(f"ragas: {ragas.__version__}")

ragas: 0.2.14


In [2]:
import os, time
import pandas as pd
import numpy as np

# Libraries to customize ragas embedding model.
from langchain_openai import OpenAIEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper

# Change the default llm-as-critic LLM.
LLM_NAME = "gpt-4o-mini" #OpenAI
ragas_llm = ragas.llms.llm_factory(model=LLM_NAME)
print(f"llm: {ragas_llm}")

# Change the default embeddings
# Initialize OpenAIEmbeddings with the specified model and dimensions
lc_embed_model = OpenAIEmbeddings(
    model="text-embedding-3-small",
    dimensions=512)
# Wrap the Langchain embeddings model for RAGAS
ragas_emb = LangchainEmbeddingsWrapper(embeddings=lc_embed_model)
print(f"embeddings: {ragas_emb}")

llm: LangchainLLMWrapper(langchain_llm=ChatOpenAI(...))
embeddings: LangchainEmbeddingsWrapper(embeddings=OpenAIEmbeddings(...))


In [3]:
## READ IN THE ORIGINAL DOCS

# Define the path to the data folder
data_folder = 'data'

# Read the contents of doc1.txt and doc2.txt
with open(os.path.join(data_folder, 'doc1.txt'), 'r', encoding='utf-8') as file:
    doc1 = file.read()

with open(os.path.join(data_folder, 'doc2.txt'), 'r', encoding='utf-8') as file:
    doc2 = file.read()

In [4]:
## GET THE ORIGINAL PROMPT TEMPLATE

prompt_template = """You are a legal analyst tasked with performing a detailed comparative analysis of two lengthy legal agreements:
Doc1: {doc1}
Doc2: {doc2}

Analyze what each document says about each topic carefully to identify and present the contextual differences 
in how each document addresses each topic. The topics to compare are:
- Definition of Confidential Information
- Permitted Use & Restrictions
- Data Security

Output JSON summarizing the contextual differences between Doc1 and Doc2 for each topic.
JSON keys: 
topic, summary, doc1_context, doc2_context

summary: 
Provide a concise summary of the key contextual difference between Doc1 and Doc2 
for the specific feature. Focus on the *meaningful distinction* and its *practical implications*.

doc1_context: 
Quote the *relevant text excerpt* from Doc1 that pertains to the feature. 
**Within this quoted text, use bold markdown formatting to highlight the specific words 
or phrases that are different or absent compared to the corresponding clause in Doc2.**

doc2_context: 
Quote the *corresponding text excerpt* from Doc2 that addresses the same feature. 
**Within this quoted text, use bold markdown formatting to highlight the specific words 
or phrases that are different or absent compared to the clause in Doc1.**

Use bold markdown to highlight the differing text within the "doc1_context" and "doc2_context" columns as described above.
Make sure you highlight in bold for each row, only text differences, to make it easier for the user to see the differences.
"""

In [5]:
# Get the current working directory.
cwd = os.getcwd()
relative_path = '/evals/sambanova_example_deepseekr1_context10k.csv'
file_path = cwd + relative_path
# print(f"file_path: {file_path}")

# Read LLM answers to evaluate from a CSV file.
eval_df = pd.read_csv(file_path, header=0, skip_blank_lines=True)
eval_df = eval_df.iloc[:, 0:5].copy()  # keep only 1st 5 columns
eval_df

Unnamed: 0,model,context length,time to response,correct structure,answer
0,Meta-Llama-3.3-70B-Instruct,128K,3s 484ms,Yes,topic='Definition of Confidential Information'...
1,DeepSeek-R1 w/2K chunk size,8K,1m 46s 770ms,Yes,**Confidential Information**: Document A uses ...
2,DeepSeek-R1 w/4K chunk size,8K,49s 450ms,Yes,"""topic"": ""Definition of Confidential Informati..."
3,DeepSeek-R1 w/7K chunk size,8K,12s 975ms,Yes,"""topic"": ""Definition of Confidential Informati..."
4,DeepSeek-R1-Distill-Llama-70B,128K,5s 910ms,Yes,"“topic"": ""Definition of Confidential Informati..."
5,Together.ai’s deepseek-ai/DeepSeek-R1-Distill-...,128K,12s 940ms,Yes,**Confidential Information** | Document A def...


# Run Summarization scorer for every model and append to eval df

[doc link](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/summarization_score/#summarization-score)

In [6]:
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import SummarizationScore

# https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/summarization_score/#summarization-score

scores = []

# Loop through each answer in eval_df.answer
for answer in eval_df.answer:
    # Assemble a single turn summarization data point
    sample = SingleTurnSample(
        response=answer,  # Use the current answer in the loop
        reference_contexts=[doc1, doc2]
    )
    scorer = SummarizationScore(llm=ragas_llm)
    score = await scorer.single_turn_ascore(sample)  # Get the score for the current sample
    scores.append(score)  # Append the score to the list

scores

[0.47197409790326283,
 0.47293189247714834,
 0.47327544922647685,
 0.4696628979532347,
 0.4695587898473776,
 0.4759926707893477]

In [7]:
# Append scores to eval_df
eval_df['summarization_score'] = np.round(scores, 3)
eval_df

Unnamed: 0,model,context length,time to response,correct structure,answer,summarization_score
0,Meta-Llama-3.3-70B-Instruct,128K,3s 484ms,Yes,topic='Definition of Confidential Information'...,0.472
1,DeepSeek-R1 w/2K chunk size,8K,1m 46s 770ms,Yes,**Confidential Information**: Document A uses ...,0.473
2,DeepSeek-R1 w/4K chunk size,8K,49s 450ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.473
3,DeepSeek-R1 w/7K chunk size,8K,12s 975ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.47
4,DeepSeek-R1-Distill-Llama-70B,128K,5s 910ms,Yes,"“topic"": ""Definition of Confidential Informati...",0.47
5,Together.ai’s deepseek-ai/DeepSeek-R1-Distill-...,128K,12s 940ms,Yes,**Confidential Information** | Document A def...,0.476


In [8]:
from ragas.metrics._factual_correctness import FactualCorrectness
# https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/factual_correctness

scores = []

# Loop through each answer in eval_df.answer
for answer in eval_df.answer:
    # Assemble a single turn summarization data point
    sample = SingleTurnSample(
        response=answer,  # Use the current answer in the loop
        reference=doc1 + " " + doc2
    )
    scorer = FactualCorrectness(
        llm=ragas_llm, 
        mode="precision", atomicity="low")
    score = await scorer.single_turn_ascore(sample)  # Get the score for the current sample
    scores.append(score)  # Append the score to the list

scores

[0.36, 0.34, 0.5, 0.0, 0.68, 0.84]

In [9]:
# Append scores to eval_df
eval_df['correctness_score'] = scores
eval_df

Unnamed: 0,model,context length,time to response,correct structure,answer,summarization_score,correctness_score
0,Meta-Llama-3.3-70B-Instruct,128K,3s 484ms,Yes,topic='Definition of Confidential Information'...,0.472,0.36
1,DeepSeek-R1 w/2K chunk size,8K,1m 46s 770ms,Yes,**Confidential Information**: Document A uses ...,0.473,0.34
2,DeepSeek-R1 w/4K chunk size,8K,49s 450ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.473,0.5
3,DeepSeek-R1 w/7K chunk size,8K,12s 975ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.47,0.0
4,DeepSeek-R1-Distill-Llama-70B,128K,5s 910ms,Yes,"“topic"": ""Definition of Confidential Informati...",0.47,0.68
5,Together.ai’s deepseek-ai/DeepSeek-R1-Distill-...,128K,12s 940ms,Yes,**Confidential Information** | Document A def...,0.476,0.84


In [10]:
# from ragas.metrics import AnswerAccuracy
# # NVIDIA contribution
# # https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/nvidia_metrics/#answer-accuracy

# scores = []

# # Loop through each answer in eval_df.answer
# for answer in eval_df.answer:
#     # Assemble a single turn summarization data point
#     sample = SingleTurnSample(
#         user_input=prompt_template,
#         response=answer,
#         reference=doc1 + " " + doc2
#     )
#     scorer = AnswerAccuracy(llm=ragas_llm)
#     score = await scorer.single_turn_ascore(sample)  # Get the score for the current sample
#     scores.append(score)  # Append the score to the list

#     # Free account rate limit is 3 requests per minute
#     # Wait for 1 minute
#     time.sleep(60)

# scores

In [11]:
# # Append scores to eval_df
# eval_df['nvidia_accuracy_score'] = scores
# eval_df

In [12]:
# Calculate a mean score
eval_df['mean_score'] = \
    eval_df[['summarization_score', 'correctness_score']]\
    .mean(axis=1)
eval_df

Unnamed: 0,model,context length,time to response,correct structure,answer,summarization_score,correctness_score,mean_score
0,Meta-Llama-3.3-70B-Instruct,128K,3s 484ms,Yes,topic='Definition of Confidential Information'...,0.472,0.36,0.416
1,DeepSeek-R1 w/2K chunk size,8K,1m 46s 770ms,Yes,**Confidential Information**: Document A uses ...,0.473,0.34,0.4065
2,DeepSeek-R1 w/4K chunk size,8K,49s 450ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.473,0.5,0.4865
3,DeepSeek-R1 w/7K chunk size,8K,12s 975ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.47,0.0,0.235
4,DeepSeek-R1-Distill-Llama-70B,128K,5s 910ms,Yes,"“topic"": ""Definition of Confidential Informati...",0.47,0.68,0.575
5,Together.ai’s deepseek-ai/DeepSeek-R1-Distill-...,128K,12s 940ms,Yes,**Confidential Information** | Document A def...,0.476,0.84,0.658


In [16]:
## REPORTING

def calculate_percent_lift(scores):
    # Calculate how much better each score is than worst score
    percent_better = np.abs(scores - scores[5]) \
                         / scores * 100
    return percent_better

## ACCURACY
# Sort from highest to lowest mean accuracy score
sorted_df = eval_df.sort_values(by=eval_df.columns[-1], ascending=False).reset_index(drop=True)
# Just the sorted mean scores
scores = sorted_df.mean_score
percent_better = calculate_percent_lift(scores)
# Add percents to eval_df
sorted_df['percent_improvement'] = percent_better
# Reorder scores columns
sorted_df = sorted_df.iloc[:, [0,1,2,7,8,3,4,5,6]]

sorted_df

Unnamed: 0,model,context length,time to response,mean_score,percent_improvement,correct structure,answer,summarization_score,correctness_score
0,Together.ai’s deepseek-ai/DeepSeek-R1-Distill-...,128K,12s 940ms,0.658,64.285714,Yes,**Confidential Information** | Document A def...,0.476,0.84
1,DeepSeek-R1-Distill-Llama-70B,128K,5s 910ms,0.575,59.130435,Yes,"“topic"": ""Definition of Confidential Informati...",0.47,0.68
2,DeepSeek-R1 w/4K chunk size,8K,49s 450ms,0.4865,51.695786,Yes,"""topic"": ""Definition of Confidential Informati...",0.473,0.5
3,Meta-Llama-3.3-70B-Instruct,128K,3s 484ms,0.416,43.509615,Yes,topic='Definition of Confidential Information'...,0.472,0.36
4,DeepSeek-R1 w/2K chunk size,8K,1m 46s 770ms,0.4065,42.189422,Yes,**Confidential Information**: Document A uses ...,0.473,0.34
5,DeepSeek-R1 w/7K chunk size,8K,12s 975ms,0.235,0.0,Yes,"""topic"": ""Definition of Confidential Informati...",0.47,0.0


RESULTS MOST ACCURATE  (all models hosted by SambaNova, except 1 hosted by Together.ai)

|rank<br> model	| context length | TTR | mean_score | percent more<br> accurate |
|--|----------------|-----|------------|---------------------|
|1 | Together.ai’s DeepSeek-R1-Distill-Llama-70B | 128K| 13s | 0.6580 | 64.285714 |
|2 | DeepSeek-R1-Distill-Llama-70B | 128K | 6s | 0.5750 | 59.130435 |
|3 | DeepSeek-R1 w/4K chunk size | 8K | 49s | 0.4865 | 51.695786 |
|4 | Meta-Llama-3.3-70B-Instruct | 128K | 3s | 0.4160 | 43.509615 |
5 | DeepSeek-R1 w/2K chunk size | 8K | 1m 47s	| 0.4065 | 42.189422 |
6 | DeepSeek-R1 w/7K chunk size | 8K | 13s | 0.2350 | 0.000000 |

RESULTS FASTEST (all models hosted by SambaNova, except 1 hosted by Together.ai)

|rank <br>(accuracy) <br>model | context length | TTR | mean_score | percent <br>faster |
|-----------|----------------|-----|------------|---------------------|
|2 | DeepSeek-R1-Distill-Llama-70B | 128K | 910ms | 0.5750 | 98% |
|1 | Together.ai’s DeepSeek-R1-Distill-Llama-70B | 128K | 1300ms | 0.6580 | 98% |
|4 | Meta-Llama-3.3-70B-Instruct | 128K | 3484ms | 0.4160 | 94% |
|6 | DeepSeek-R1 w/7K chunk size | 8K | 12975ms | 0.2350 | 78% |
|3 | DeepSeek-R1 w/4K chunk size | 8K | 49045ms | 0.4865 | 18% |
|5 | DeepSeek-R1 w/2K chunk size | 8K | 60046ms	| 0.4065 | 0% |

In [22]:
(49045-60046)/60046

-0.1832095393531626