# Single response evaluation using RAGAS

https://docs.ragas.io/en/stable/concepts/metrics/available_metrics

In [1]:
# !python -m pip install -U ragas dataset
import ragas
print(f"ragas: {ragas.__version__}")

ragas: 0.2.14


In [2]:
import os, time
import pandas as pd
import numpy as np

# Libraries to customize ragas critic model.
from ragas.llms import LangchainLLMWrapper
from langchain_community.chat_models import ChatOllama

# Libraries to customize ragas embedding model.
from langchain_openai import OpenAIEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper

# Change the default llm-as-critic LLM to local llama3.2 
LLM_NAME = 'llama3.2:1b'
ragas_llm = LangchainLLMWrapper(langchain_llm=ChatOllama(model=LLM_NAME))
print(f"llm: {ragas_llm}")

# Change the default llm-as-critic LLM.
LLM_NAME = "gpt-4o-mini" #OpenAI
ragas_llm = ragas.llms.llm_factory(model=LLM_NAME)
print(f"llm: {ragas_llm}")

# Change the default embeddings
# Initialize OpenAIEmbeddings with the specified model and dimensions
lc_embed_model = OpenAIEmbeddings(
    model="text-embedding-3-small",
    dimensions=512)
# Wrap the Langchain embeddings model for RAGAS
ragas_emb = LangchainEmbeddingsWrapper(embeddings=lc_embed_model)
print(f"embeddings: {ragas_emb}")

# TODO:  class:`~langchain-ollama package and should be used instead. 
# To use it run `pip install -U :class:`~langchain-ollama` and import as 
# `from :class:`~langchain_ollama import ChatOllama

llm: LangchainLLMWrapper(langchain_llm=ChatOllama(...))
llm: LangchainLLMWrapper(langchain_llm=ChatOpenAI(...))
embeddings: LangchainEmbeddingsWrapper(embeddings=OpenAIEmbeddings(...))


  ragas_llm = LangchainLLMWrapper(langchain_llm=ChatOllama(model=LLM_NAME))


In [3]:
## READ IN THE ORIGINAL DOCS

# Define the path to the data folder
data_folder = 'data'

# Read the contents of doc1.txt and doc2.txt
with open(os.path.join(data_folder, 'doc1.txt'), 'r', encoding='utf-8') as file:
    doc1 = file.read()

with open(os.path.join(data_folder, 'doc2.txt'), 'r', encoding='utf-8') as file:
    doc2 = file.read()

In [4]:
## GET THE ORIGINAL PROMPT TEMPLATE

prompt_template = """You are a legal analyst tasked with performing a detailed comparative analysis of two lengthy legal agreements:
Doc1: {doc1}
Doc2: {doc2}

Analyze what each document says about each topic carefully to identify and present the contextual differences 
in how each document addresses each topic. The topics to compare are:
- Definition of Confidential Information
- Permitted Use & Restrictions
- Data Security

Output JSON summarizing the contextual differences between Doc1 and Doc2 for each topic.
JSON keys: 
topic, summary, doc1_context, doc2_context

summary: 
Provide a concise summary of the key contextual difference between Doc1 and Doc2 
for the specific feature. Focus on the *meaningful distinction* and its *practical implications*.

doc1_context: 
Quote the *relevant text excerpt* from Doc1 that pertains to the feature. 
**Within this quoted text, use bold markdown formatting to highlight the specific words 
or phrases that are different or absent compared to the corresponding clause in Doc2.**

doc2_context: 
Quote the *corresponding text excerpt* from Doc2 that addresses the same feature. 
**Within this quoted text, use bold markdown formatting to highlight the specific words 
or phrases that are different or absent compared to the clause in Doc1.**

Use bold markdown to highlight the differing text within the "doc1_context" and "doc2_context" columns as described above.
Make sure you highlight in bold for each row, only text differences, to make it easier for the user to see the differences.
"""

In [5]:
# Get the current working directory.
cwd = os.getcwd()
relative_path = '/evals/sambanova_example_deepseekr1_context10k.csv'
file_path = cwd + relative_path
# print(f"file_path: {file_path}")

# Read LLM answers to evaluate from a CSV file.
eval_df = pd.read_csv(file_path, header=0, skip_blank_lines=True)
eval_df = eval_df.iloc[:, 0:5].copy()  # keep only 1st 5 columns
eval_df

Unnamed: 0,model,context length,time_to_response,correct structure,answer
0,Meta-Llama-3.3-70B-Instruct,128K,3s 765ms,Yes,Doc1 defines Confidential Information as all i...
1,DeepSeek-R1,128K,30s 465ms,Yes,"""topic"": ""Definition of Confidential Informati..."
2,Together DeepSeek-R1,128K,20s 377ms,Yes,"""topic"": ""Definition of Confidential Informati..."
3,DeepSeek-R1 w/2K chunk size,8K,1m 46s 770ms,Yes,**Confidential Information**: Document A uses ...
4,DeepSeek-R1 w/4K chunk size,8K,49s 450ms,Yes,"""topic"": ""Definition of Confidential Informati..."
5,DeepSeek-R1 w/7K chunk size,8K,41s 860ms,Yes,"""topic"": ""Definition of Confidential Informati..."
6,DeepSeek-R1-Distill-Llama-70B,128K,30s 496ms,Yes,"""topic"": ""Definition of Confidential Informati..."
7,Together.ai’s deepseek-ai/DeepSeek-R1-Distill-...,128K,26s 140ms,Yes,"""topic"": ""Definition of Confidential Informati..."


# Run Summarization scorer for every model and append to eval df

[doc link](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/summarization_score/#summarization-score)

In [6]:
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import SummarizationScore

# https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/summarization_score/#summarization-score

scores = []

# Loop through each answer in eval_df.answer
for answer in eval_df.answer:
    # Assemble a single turn summarization data point
    sample = SingleTurnSample(
        response=answer,  # Use the current answer in the loop
        reference_contexts=[doc1, doc2]
    )
    scorer = SummarizationScore(llm=ragas_llm)
    score = await scorer.single_turn_ascore(sample)  # Get the score for the current sample
    scores.append(score)  # Append the score to the list

scores

[0.4725779249172341,
 0.45380723343119506,
 0.46920482228746335,
 0.47293189247714834,
 0.47327544922647685,
 0.47309846544651973,
 0.4695692006579633,
 0.43843046619609816]

In [7]:
# Append scores to eval_df
eval_df['summarization_score'] = np.round(scores, 3)
eval_df

Unnamed: 0,model,context length,time_to_response,correct structure,answer,summarization_score
0,Meta-Llama-3.3-70B-Instruct,128K,3s 765ms,Yes,Doc1 defines Confidential Information as all i...,0.473
1,DeepSeek-R1,128K,30s 465ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.454
2,Together DeepSeek-R1,128K,20s 377ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.469
3,DeepSeek-R1 w/2K chunk size,8K,1m 46s 770ms,Yes,**Confidential Information**: Document A uses ...,0.473
4,DeepSeek-R1 w/4K chunk size,8K,49s 450ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.473
5,DeepSeek-R1 w/7K chunk size,8K,41s 860ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.473
6,DeepSeek-R1-Distill-Llama-70B,128K,30s 496ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.47
7,Together.ai’s deepseek-ai/DeepSeek-R1-Distill-...,128K,26s 140ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.438


In [8]:
from ragas.metrics._factual_correctness import FactualCorrectness
# https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/factual_correctness

scores = []

# Loop through each answer in eval_df.answer
for answer in eval_df.answer:
    # Assemble a single turn summarization data point
    sample = SingleTurnSample(
        response=answer,  # Use the current answer in the loop
        reference=doc1 + " " + doc2
    )
    scorer = FactualCorrectness(
        llm=ragas_llm, 
        mode="precision", atomicity="low")
    score = await scorer.single_turn_ascore(sample)  # Get the score for the current sample
    scores.append(score)  # Append the score to the list

scores

[0.36, 1.0, 1.0, 0.53, 0.43, 0.53, 0.94, 0.64]

In [9]:
# Append scores to eval_df
eval_df['correctness_score'] = scores
eval_df

Unnamed: 0,model,context length,time_to_response,correct structure,answer,summarization_score,correctness_score
0,Meta-Llama-3.3-70B-Instruct,128K,3s 765ms,Yes,Doc1 defines Confidential Information as all i...,0.473,0.36
1,DeepSeek-R1,128K,30s 465ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.454,1.0
2,Together DeepSeek-R1,128K,20s 377ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.469,1.0
3,DeepSeek-R1 w/2K chunk size,8K,1m 46s 770ms,Yes,**Confidential Information**: Document A uses ...,0.473,0.53
4,DeepSeek-R1 w/4K chunk size,8K,49s 450ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.473,0.43
5,DeepSeek-R1 w/7K chunk size,8K,41s 860ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.473,0.53
6,DeepSeek-R1-Distill-Llama-70B,128K,30s 496ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.47,0.94
7,Together.ai’s deepseek-ai/DeepSeek-R1-Distill-...,128K,26s 140ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.438,0.64


In [10]:
from ragas.metrics import AnswerAccuracy
# NVIDIA contribution
# https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/nvidia_metrics/#answer-accuracy

scores = []

# Loop through each answer in eval_df.answer
for answer in eval_df.answer:
    # Assemble a single turn summarization data point
    sample = SingleTurnSample(
        user_input=prompt_template,
        response=answer,
        reference=doc1 + " " + doc2
    )
    scorer = AnswerAccuracy(llm=ragas_llm)
    score = await scorer.single_turn_ascore(sample)  # Get the score for the current sample
    scores.append(score)  # Append the score to the list

In [11]:
# Append scores to eval_df
eval_df['nvidia_accuracy_score'] = scores
eval_df

Unnamed: 0,model,context length,time_to_response,correct structure,answer,summarization_score,correctness_score,nvidia_accuracy_score
0,Meta-Llama-3.3-70B-Instruct,128K,3s 765ms,Yes,Doc1 defines Confidential Information as all i...,0.473,0.36,1.0
1,DeepSeek-R1,128K,30s 465ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.454,1.0,1.0
2,Together DeepSeek-R1,128K,20s 377ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.469,1.0,1.0
3,DeepSeek-R1 w/2K chunk size,8K,1m 46s 770ms,Yes,**Confidential Information**: Document A uses ...,0.473,0.53,0.75
4,DeepSeek-R1 w/4K chunk size,8K,49s 450ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.473,0.43,1.0
5,DeepSeek-R1 w/7K chunk size,8K,41s 860ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.473,0.53,1.0
6,DeepSeek-R1-Distill-Llama-70B,128K,30s 496ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.47,0.94,1.0
7,Together.ai’s deepseek-ai/DeepSeek-R1-Distill-...,128K,26s 140ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.438,0.64,1.0


In [12]:
# # Drop Ragas correctness score column
# eval_df.drop(columns=['correctness_score'], inplace=True)

In [13]:
# Calculate a mean score
    # eval_df[['summarization_score', 'correctness_score']]\
eval_df['mean_score'] = \
    eval_df[['summarization_score', 'correctness_score', 'nvidia_accuracy_score']]\
    .mean(axis=1)
eval_df

Unnamed: 0,model,context length,time_to_response,correct structure,answer,summarization_score,correctness_score,nvidia_accuracy_score,mean_score
0,Meta-Llama-3.3-70B-Instruct,128K,3s 765ms,Yes,Doc1 defines Confidential Information as all i...,0.473,0.36,1.0,0.611
1,DeepSeek-R1,128K,30s 465ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.454,1.0,1.0,0.818
2,Together DeepSeek-R1,128K,20s 377ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.469,1.0,1.0,0.823
3,DeepSeek-R1 w/2K chunk size,8K,1m 46s 770ms,Yes,**Confidential Information**: Document A uses ...,0.473,0.53,0.75,0.584333
4,DeepSeek-R1 w/4K chunk size,8K,49s 450ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.473,0.43,1.0,0.634333
5,DeepSeek-R1 w/7K chunk size,8K,41s 860ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.473,0.53,1.0,0.667667
6,DeepSeek-R1-Distill-Llama-70B,128K,30s 496ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.47,0.94,1.0,0.803333
7,Together.ai’s deepseek-ai/DeepSeek-R1-Distill-...,128K,26s 140ms,Yes,"""topic"": ""Definition of Confidential Informati...",0.438,0.64,1.0,0.692667


In [14]:
## REPORTING

def calculate_percent_lift(scores):
    # Calculate how much better each score is than worst score
    # Assume scores are sorted best to worst.
    percent_better = np.abs(scores - scores[len(scores)-1]) \
                         / scores * 100
    return percent_better

## ACCURACY
# Sort from highest to lowest mean accuracy score
sorted_df = eval_df.sort_values(by=eval_df.columns[-1], ascending=False).reset_index(drop=True)
# Just the sorted mean scores
scores = sorted_df.mean_score
# Calculate percent improvement
percent_better = calculate_percent_lift(scores)
# Add percents to eval_df
sorted_df['percent_improvement'] = percent_better

# Reorder scores columns
columns = sorted_df.columns.tolist()
new_order = columns[:3] + columns[-2:] + columns[3:-2]
sorted_df = sorted_df.reindex(columns=new_order)

sorted_df

Unnamed: 0,model,context length,time_to_response,mean_score,percent_improvement,correct structure,answer,summarization_score,correctness_score,nvidia_accuracy_score
0,Together DeepSeek-R1,128K,20s 377ms,0.823,28.999595,Yes,"""topic"": ""Definition of Confidential Informati...",0.469,1.0,1.0
1,DeepSeek-R1,128K,30s 465ms,0.818,28.565607,Yes,"""topic"": ""Definition of Confidential Informati...",0.454,1.0,1.0
2,DeepSeek-R1-Distill-Llama-70B,128K,30s 496ms,0.803333,27.261411,Yes,"""topic"": ""Definition of Confidential Informati...",0.47,0.94,1.0
3,Together.ai’s deepseek-ai/DeepSeek-R1-Distill-...,128K,26s 140ms,0.692667,15.640038,Yes,"""topic"": ""Definition of Confidential Informati...",0.438,0.64,1.0
4,DeepSeek-R1 w/7K chunk size,8K,41s 860ms,0.667667,12.481278,Yes,"""topic"": ""Definition of Confidential Informati...",0.473,0.53,1.0
5,DeepSeek-R1 w/4K chunk size,8K,49s 450ms,0.634333,7.882291,Yes,"""topic"": ""Definition of Confidential Informati...",0.473,0.43,1.0
6,Meta-Llama-3.3-70B-Instruct,128K,3s 765ms,0.611,4.36443,Yes,Doc1 defines Confidential Information as all i...,0.473,0.36,1.0
7,DeepSeek-R1 w/2K chunk size,8K,1m 46s 770ms,0.584333,0.0,Yes,**Confidential Information**: Document A uses ...,0.473,0.53,0.75


RESULTS MOST ACCURATE  (all models hosted by SambaNova, except 2 accuracy comparisons by Together.ai)

|rank | model	| context<br> length | TTR | mean <br>score | percent<br> more<br> accurate |
|-----|---------|----------------|-----|------------|---------------------------|
|1 | Together DeepSeek-R1 | 128K | 20s | 0.823000 | 29 |
|1 | SN DeepSeek-R1 | 128K | 30s | 0.818000 | 29 |
|2 | SN DeepSeek-R1-Distill-Llama-70B | 128K | 30s | 0.803333 | 27 |
|3 | Together DeepSeek-R1-Distill-Llama-70B | 128K| 26s | 0.692667 | 16 |
|4 | SN DeepSeek-R1 w/7K chunk size | 8K | 42s | 0.667667 | 12 |
|5 | SN DeepSeek-R1 w/4K chunk size | 8K | 49s | 0.657667 | 8|
|6 | SN Meta-Llama-3.3-70B-Instruct | 128K | 3s | 0.611000 | 4 |
|7 | SN DeepSeek-R1 w/2K chunk size | 8K | 107s	| 0.584333 | 0 |

RESULTS FASTEST (all models hosted by SambaNova, except 2 accuracy comparisons by Together.ai)

|rank <br>(speed) | model | context<br> length | TTR | mean <br>score | percent <br>faster |
|--|---------------------------|----------------|-----|------------|--------------------|
|6 | SN Meta-Llama-3.3-70B-Instruct | 128K | 3s | 0.611000 | 97 |
|1 | Together DeepSeek-R1 | 128K | 20s | 0.823000 | 81 |
|3 | Together.ai’s DeepSeek-R1-Distill-Llama-70B | 128K| 26s | 0.692667 | 76 |
|1 | SN DeepSeek-R1 | 128K | 30s | 0.818000 | 72 |
|2 | SN DeepSeek-R1-Distill-Llama-70B | 128K | 30s | 0.803333 | 72 |
|4 | SN DeepSeek-R1 w/7K chunk size | 8K | 42s | 0.667667 | 61 |
|5 | SN DeepSeek-R1 w/4K chunk size | 8K | 49s | 0.657667 | 54|
|7 | SN DeepSeek-R1 w/2K chunk size | 8K | 107s	| 0.584333 | 0 |

In [21]:
(107-49)/107 * 100

54.20560747663551