In [19]:
import os
from dotenv import load_dotenv
# Load variables from a .env file (read as UTF-8) into the process environment;
# later cells fetch their API keys and the bearer token through os.getenv.
load_dotenv(encoding='utf-8')

True

# Reading test data from JSON files

In [2]:
import pandas as pd
import json
def json_to_dataframe(json_file_path):
  """Read a JSON file and convert it to a pandas DataFrame.

  The file is always decoded as UTF-8 instead of the platform's locale
  encoding, so non-ASCII text loads the same way on every operating system.

  Args:
    json_file_path (str): The path to the JSON file.

  Returns:
    pandas.DataFrame: Built directly from the data when the top-level JSON
      value is a list; a single-row frame when it is an object.

  Raises:
    ValueError: If the top-level JSON value is neither a list nor an object.
  """
  # Explicit encoding: without it open() falls back to the locale encoding,
  # which is not UTF-8 on every platform.
  with open(json_file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

  if isinstance(data, list):
    # A list (typically of dictionaries) maps straight onto rows.
    return pd.DataFrame(data)
  if isinstance(data, dict):
    # A single object is wrapped in a list so it becomes exactly one row.
    return pd.DataFrame([data])
  raise ValueError("Unsupported JSON structure")

In [5]:
from from_root import from_root
# Location of the HR test set, relative to whatever directory from_root()
# returns (presumably the project root — confirm against the from_root package).
test_file = "data/test_dataset/test_dataset_hr.json"
test_data_path = os.path.join(from_root(), test_file)
# Load the JSON test set into a DataFrame; later cells read its columns.
data_to_upload = json_to_dataframe(test_data_path)
# Bare expression so the notebook renders the loaded frame.
data_to_upload

Unnamed: 0,question,contexts,ground_truth,response
0,What is Tech Innovators Inc.'s approach to wor...,[Q12: What is the process for handling workpla...,Tech Innovators Inc. has a zero-tolerance poli...,Tech Innovators Inc. has a zero-tolerance poli...
1,What resources should be added for new hires i...,[:check_mark:\natlassian-check_mark\n#FFFAE6\n...,Add resources for new hires in the onboarding ...,Onboarding new hires is a critical process in ...
2,What training programs are offered in data sci...,"[ learn about SEO, social media marketing, and...","Courses covering financial analysis, budgeting...",Tech Innovators Inc. offers several training p...
3,What services does the Employee Assistance Pro...,[Q12: What is the process for handling workpla...,Employees can contact the Employee Assistance ...,The Employee Assistance Program (EAP) provides...
4,What is the significance of identifying growth...,[ Self-assessment\nStart by thinking through y...,Identifying growth areas in self-assessment is...,Identifying growth areas through self-assessme...
5,How do employee engagement and disengagement d...,"[Introduction\nAt Tech Innovators Inc., we bel...",Employee engagement and disengagement differ i...,Employee engagement and disengagement represen...
6,What is the purpose of adding a header image t...,[Create a stellar overview\nThe overview is th...,Adding a header image to a Confluence space en...,Adding a header image to a Confluence space se...


# Upload test dataset to LangSmith

In [11]:
from langsmith import Client
# Point the LangChain/LangSmith tooling at the hosted LangSmith API and enable tracing.
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# Fail fast with a readable message. Re-assigning os.environ[key] = os.getenv(key)
# changes nothing when the key is set and raises an opaque TypeError when it is not.
if not os.getenv("LANGSMITH_API_KEY"):
    raise EnvironmentError(
        "LANGSMITH_API_KEY is not set; add it to .env or export it before running this cell."
    )
client = Client()

In [12]:
from datasets import Dataset

# Column lists pulled out of the loaded test frame. The frame's `response`
# column is published under the key `answer` in the dataset built below.
question = list(data_to_upload["question"])
answer = list(data_to_upload["response"])
contexts = list(data_to_upload["contexts"])
ground_truth = list(data_to_upload["ground_truth"])

# Column-oriented mapping: one key per dataset column, one list entry per sample.
data_samples = dict(
    question=question,
    answer=answer,
    contexts=contexts,
    ground_truth=ground_truth,
)

dataset = Dataset.from_dict(data_samples)

In [13]:
from ragas.integrations.langsmith import upload_dataset
# Name and description passed to upload_dataset for the LangSmith dataset.
dataset_name = "hr test"
dataset_desc = "HR department test dataset"

# NOTE(review): the recorded output reports that a *new* dataset was created;
# re-running this cell with the same name may be rejected if the dataset
# already exists — confirm against the ragas upload_dataset documentation.
dataset_uploaded = upload_dataset(dataset, dataset_name, dataset_desc)

Created a new dataset 'hr test'. Dataset is accessible at https://smith.langchain.com/o/08bc9556-81b3-56d7-98aa-4f87d6cdfca5/datasets/e67c3bea-1852-4ea0-8793-e8fc10fe7fa8


# Evaluation

## With RAGAS

In [14]:
from ragas import evaluate
# from ragas.integrations.langsmith import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)
# Score the pre-computed answers stored in `dataset` (taken from the test
# file's `response` column) with the four RAGAS metrics imported above.
result = evaluate(
    dataset,
    metrics=[
        answer_relevancy,
        faithfulness,
        context_recall,
        context_precision,
    ],
)

# Bare expression so the notebook renders the per-metric summary.
result

Evaluating:   0%|          | 0/28 [00:00<?, ?it/s]

{'answer_relevancy': 0.9666, 'faithfulness': 0.5693, 'context_recall': 0.7619, 'context_precision': 0.8571}

In [15]:
# Per-sample view of the evaluation: the question, answer, contexts and
# ground truth alongside one column per metric score.
df = result.to_pandas()
df.head()

Unnamed: 0,question,answer,contexts,ground_truth,answer_relevancy,faithfulness,context_recall,context_precision
0,What is Tech Innovators Inc.'s approach to wor...,Tech Innovators Inc. has a zero-tolerance poli...,[Q12: What is the process for handling workpla...,Tech Innovators Inc. has a zero-tolerance poli...,0.974083,0.318182,1.0,1.0
1,What resources should be added for new hires i...,Onboarding new hires is a critical process in ...,[:check_mark:\natlassian-check_mark\n#FFFAE6\n...,Add resources for new hires in the onboarding ...,0.971124,0.794118,1.0,1.0
2,What training programs are offered in data sci...,Tech Innovators Inc. offers several training p...,"[ learn about SEO, social media marketing, and...","Courses covering financial analysis, budgeting...",0.987744,0.04,0.0,0.0
3,What services does the Employee Assistance Pro...,The Employee Assistance Program (EAP) provides...,[Q12: What is the process for handling workpla...,Employees can contact the Employee Assistance ...,0.994292,0.25641,1.0,1.0
4,What is the significance of identifying growth...,Identifying growth areas through self-assessme...,[ Self-assessment\nStart by thinking through y...,Identifying growth areas in self-assessment is...,0.980172,1.0,1.0,1.0


## With LangSmith

In [20]:
def cohere_rag_get_answers(message):
    """Ask the locally running RAG chat endpoint a question.

    Sends ``message`` as a JSON POST to ``http://localhost:8000/v1/chat``,
    authenticated with the ``BEARER_SECRET_KEY`` environment variable, and
    parses the first non-empty line of the response body as JSON.

    Args:
        message: Value sent under the ``"message"`` key of the request payload.

    Returns:
        dict: ``{'answer': ..., 'contexts': ...}``. On HTTP 200, ``answer`` is
        the payload's ``"text"`` field and ``contexts`` its ``"documents"``
        field (an empty list when that field is missing or null). On any other
        status, or when the body contains no non-empty line, ``answer`` is an
        ``"Error: ..."`` string and ``contexts`` is ``[]``.
    """
    import json
    import os

    import requests

    # Define the API endpoint
    url = "http://localhost:8000/v1/chat"
    bearer = os.getenv('BEARER_SECRET_KEY')

    # Set headers
    headers = {
        "User-Id": "me",
        "Content-Type": "application/json",
        "Authorization": f"Bearer {bearer}",
        "Cohere-Stream": "true",  # Enable streaming for chatbot responses
    }

    # Send the question as a JSON payload
    response = requests.post(url, headers=headers, json={"message": message})

    if response.status_code != 200:
        # Interpolate the real status code so the evaluator sees e.g. "Error: 422".
        error_text = f"Error: {response.status_code}"
        print(error_text)
        return {'answer': error_text, 'contexts': []}

    for line in response.iter_lines():
        # iter_lines() can yield empty keep-alive lines; json.loads would choke on them.
        if not line:
            continue
        response_data = json.loads(line.decode("utf-8"))
        # Only the first non-empty line is used.
        return {
            'answer': response_data.get("text"),
            # Keep the contexts type stable: always a list, matching the error branches.
            'contexts': response_data.get("documents") or [],
        }

    # A 200 with an empty body: return the same shape as other failures, not None.
    error_text = "Error: empty response body"
    print(error_text)
    return {'answer': error_text, 'contexts': []}

In [21]:
from langchain.smith import RunEvalConfig
from ragas.integrations.langchain import EvaluatorChain
from ragas.metrics import (
    answer_correctness,
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness,
)

# Adapt every RAGAS metric to LangChain's evaluator interface by wrapping it
# in an EvaluatorChain.
evaluators = list(
    map(
        EvaluatorChain,
        (
            answer_correctness,
            answer_relevancy,
            context_precision,
            context_recall,
            faithfulness,
        ),
    )
)
# NOTE(review): eval_config is not referenced by any other visible cell —
# confirm whether it is still needed.
eval_config = RunEvalConfig(custom_evaluators=evaluators)

In [22]:
from ragas.integrations.langsmith import evaluate
# Run the RAG endpoint on every example of the uploaded LangSmith dataset and
# score the fresh answers. `evaluate` here is the LangSmith-integrated variant
# imported in this cell; it shadows the `ragas.evaluate` imported by the
# earlier RAGAS cell.
results = evaluate(
    # Reuse the name defined in the upload cell so that upload and evaluation
    # cannot silently target different datasets.
    dataset_name=dataset_name,
    llm_or_chain_factory=cohere_rag_get_answers,
    metrics=[
        answer_correctness,
        answer_relevancy,
        context_precision,
        context_recall,
        faithfulness,
    ],  # More RAGAS metrics can be appended here
    verbose=True,
)

View the evaluation results for project 'respectful-blossom-87' at:
https://smith.langchain.com/o/08bc9556-81b3-56d7-98aa-4f87d6cdfca5/datasets/e67c3bea-1852-4ea0-8793-e8fc10fe7fa8/compare?selectedSessions=fab48859-bd9c-47ff-a667-8d09ec3b2471

View all tests for Dataset hr test at:
https://smith.langchain.com/o/08bc9556-81b3-56d7-98aa-4f87d6cdfca5/datasets/e67c3bea-1852-4ea0-8793-e8fc10fe7fa8
[>                                                 ] 0/7Error: 422
Error: 422
Error: 422
Error: 422
Error: 422


No statements were generated from the answer.


[------>                                           ] 1/7Error: 422


No statements were generated from the answer.


[------------->                                    ] 2/7Error: 422


No statements were generated from the answer.


[-------------------->                             ] 3/7

No statements were generated from the answer.


[---------------------------->                     ] 4/7

No statements were generated from the answer.


[----------------------------------->              ] 5/7

No statements were generated from the answer.


[------------------------------------------>       ] 6/7

No statements were generated from the answer.


[------------------------------------------------->] 7/7

Unnamed: 0,feedback.answer_correctness,feedback.answer_relevancy,feedback.context_precision,feedback.context_recall,feedback.faithfulness,error,execution_time,run_id
count,7.0,7.0,7.0,7.0,0.0,0.0,7.0,7
unique,,,,,,0.0,,7
top,,,,,,,,9f702d91-4609-4854-b807-61347b3053b5
freq,,,,,,,,1
mean,0.163483,0.0,0.0,0.0,,,2.708194,
std,0.006237,0.0,0.0,0.0,,,1.353991,
min,0.155291,0.0,0.0,0.0,,,0.658583,
25%,0.159312,0.0,0.0,0.0,,,2.143982,
50%,0.163524,0.0,0.0,0.0,,,3.49998,
75%,0.166678,0.0,0.0,0.0,,,3.50168,
