In [21]:
from dotenv import load_dotenv
import os, sys
import streamlit as st

# Load environment variables from a specific .env file before the project
# imports below run.
# NOTE(review): this is an absolute, machine-specific path; on any other
# machine the file will not exist — consider making it configurable.
load_dotenv('/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/.env')

# Append the parent directory ('..', resolved against the current working
# directory) to sys.path. This must happen before the `utils` imports below;
# presumably `utils` lives one level above this notebook — verify if the
# notebook is moved or launched from a different directory.
sys.path.append(os.path.abspath('..'))

from utils import rag
from utils.rag import CONFIG
from langsmith.evaluation import evaluate
from langsmith import Client
from langchain import hub
from langchain_openai import ChatOpenAI
from langsmith import traceable

In [22]:
# Optional LangSmith tracing setup: copy the API key out of Streamlit secrets,
# switch tracing on, and name the project that runs from this notebook are
# recorded under. Keys are written in the same order as three separate
# assignments would write them.
os.environ.update({
    "LANGCHAIN_API_KEY": st.secrets["LANGCHAIN_API_KEY"],
    "LANGCHAIN_TRACING_V2": "true",
    "LANGCHAIN_PROJECT": "langchain_evaluator.ipynb on ASK main/local",
})

In [23]:
# Name of the chat model handed to the grader LLM.
eval_model = "gpt-4o-mini"

# LangSmith client built with default arguments.
client = Client()

In [29]:
import json
from typing import Dict, Any


def validate_and_fix_json(raw_output: Any, required_fields: Dict[str, Any]) -> Dict[str, Any]:
    """
    Validates and fixes JSON output, ensuring required fields are present.

    Parsing is best-effort: whatever goes wrong with ``raw_output``, the
    function falls back to the defaults in ``required_fields`` instead of
    raising. Diagnostics are printed so failures stay visible in the notebook.

    Args:
        raw_output (Any): The raw JSON string or dictionary from the LLM.
            Dictionaries are used as-is. Strings are parsed as JSON, with one
            repair attempt (appending a missing closing brace) if parsing
            fails. Anything else that ``json.loads`` rejects yields defaults.
        required_fields (dict): A dictionary of required fields with their default values.

    Returns:
        dict: A new dictionary containing exactly the keys of
        ``required_fields``; each value comes from the parsed output when
        present, otherwise from the supplied default.
    """
    if isinstance(raw_output, dict):
        # Already structured; nothing to parse.
        parsed_response = raw_output
    else:
        try:
            parsed_response = json.loads(raw_output)
        except json.JSONDecodeError as e:
            print(f"JSONDecodeError: {e}")
            print(f"Raw output: {raw_output}")

            parsed_response = {}
            if isinstance(raw_output, str):
                # Common failure: the object was cut off before its closing
                # brace. Work on the stripped text so trailing whitespace does
                # not defeat the check, and append only the brace so a
                # trailing empty string value ("") is left intact.
                candidate = raw_output.strip()
                if not candidate.endswith('}'):
                    candidate += '}'

                # Retry parsing
                try:
                    parsed_response = json.loads(candidate)
                except json.JSONDecodeError as final_e:
                    print(f"Failed to fix JSON: {final_e}")
        except (TypeError, UnicodeDecodeError) as e:
            # json.loads only accepts str/bytes/bytearray; None, numbers or
            # arbitrary objects raise TypeError, undecodable bytes raise
            # UnicodeDecodeError. Treat both as "no usable output".
            print(f"Cannot parse output of type {type(raw_output).__name__}: {e}")
            parsed_response = {}

    # Valid JSON is not necessarily an object (e.g. a list or bare string);
    # only a dict supports the field lookup below.
    if not isinstance(parsed_response, dict):
        print(f"Expected a JSON object but got {type(parsed_response).__name__}; using defaults.")
        parsed_response = {}

    # Ensure required fields are present with default values
    validated_response = {
        key: parsed_response.get(key, default) for key, default in required_fields.items()
    }

    return validated_response

## Grader using dummy data

In [30]:
# Fetch the grading prompt from the LangChain Hub.
grade_prompt_accuracy = hub.pull("drew-wks/cot_qa")

# Judge model: temperature 0, tagged so its runs can be told apart in traces.
llm = ChatOpenAI(tags=["accuracy_evaluator"], temperature=0, model=eval_model)

# Prompt piped into the model forms the grader chain.
answer_grader = grade_prompt_accuracy | llm

# Dummy question, reference answer and prediction for a smoke test.
query, ground_truth_answer, prediction = "What is the capital of France?", "Paris", "Paris"

# Variables passed to the grader chain.
grader_input = dict(
    query=query,
    ground_truth_answer=ground_truth_answer,
    student_answer=prediction,
)

# Run the grader once on the dummy example.
grader_response = answer_grader.invoke(grader_input)

# Fields the grader output must contain, with fallbacks used when absent.
required_fields = dict(
    correctness=None,
    explanation="No explanation provided.",
)

# Normalise the grader output to exactly the required fields.
validated_response = validate_and_fix_json(grader_response, required_fields)

# Pull out the two graded fields.
correctness, explanation = validated_response["correctness"], validated_response["explanation"]

# Evaluation results object.
# NOTE(review): when correctness falls back to None (grader output unusable),
# the label below still reads "Incorrect" — confirm that is intended.
evaluation_result = dict(
    key="Accuracy",
    score=correctness,
    value="Correct" if correctness == 1 else "Incorrect",
    comment=explanation,
)

# Show every intermediate object for inspection.
for _label, _shown in (
    ("Grader Input:", grader_input),
    ("Grader Response:", grader_response),
    ("Validated Response:", validated_response),
    ("Evaluation Result:", evaluation_result),
):
    print(_label, _shown)

Grader Input: {'query': 'What is the capital of France?', 'ground_truth_answer': 'Paris', 'student_answer': 'Paris'}
Grader Response: {'correctness': 1, 'explanation': "The student's answer is 'Paris', which is the same as the context provided. Since the context states that the capital of France is Paris, the student's answer is factually accurate and correct."}
Validated Response: {'correctness': 1, 'explanation': "The student's answer is 'Paris', which is the same as the context provided. Since the context states that the capital of France is Paris, the student's answer is factually accurate and correct."}
Evaluation Result: {'key': 'Accuracy', 'score': 1, 'value': 'Correct', 'comment': "The student's answer is 'Paris', which is the same as the context provided. Since the context states that the capital of France is Paris, the student's answer is factually accurate and correct."}
