In [None]:
!pip install langchain_openai

Collecting langchain_openai
  Downloading langchain_openai-0.3.30-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-core<1.0.0,>=0.3.74 (from langchain_openai)
  Downloading langchain_core-0.3.74-py3-none-any.whl.metadata (5.8 kB)
Collecting openai<2.0.0,>=1.99.9 (from langchain_openai)
  Downloading openai-1.99.9-py3-none-any.whl.metadata (29 kB)
Downloading langchain_openai-0.3.30-py3-none-any.whl (74 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.4/74.4 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading langchain_core-0.3.74-py3-none-any.whl (443 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m443.5/443.5 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading openai-1.99.9-py3-none-any.whl (786 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m786.8/786.8 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai, langchain-core, langchain_openai
  Attempting uninstal

In [None]:
import os
import json
import re
from typing import List
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from google.colab import userdata

# Read the key from Colab's secret store (google.colab.userdata) rather than
# hard-coding it, and publish it as an environment variable.
# NOTE(review): presumably this is where the ChatOpenAI client below looks it up - confirm.
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

class EvaluationResult(BaseModel):
    """Scores reported by the LLM grader for one student answer.

    Every field is required and must lie in the closed range [0, 100].
    Pydantic rejects a reply with a missing, non-numeric or out-of-range
    score, so a malformed model response fails loudly instead of skewing
    the averages computed from these values.
    """

    relevance: float = Field(..., ge=0, le=100, description="Relevance score between 0 and 100")
    factual_accuracy: float = Field(..., ge=0, le=100, description="Accuracy score between 0 and 100")
    completeness: float = Field(..., ge=0, le=100, description="Completeness score between 0 and 100")
    overall_score: float = Field(..., ge=0, le=100, description="Weighted score between 0 and 100")

# temperature=0 makes grading as repeatable as the API allows, so the same
# answer is not scored differently from one run to the next.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Grading prompt sent to the model. The three {placeholders} are filled by
# evaluate(); the doubled braces around the JSON skeleton are escapes so that
# str.format-style templating leaves literal "{" and "}" in the final prompt.
_EVALUATION_TEMPLATE = """
You are an evaluator. Compare the STUDENT ANSWER to the REFERENCE if reference is available or else evaluate yourself.

Question: {question}

Reference Answers:
{references}

Student Answer:
{student_answer}

Scoring rules:
- relevance: between 0 and 100
- factual_accuracy: between 0 and 100
- completeness: between 0 and 100
Formula: overall_score = 0.35 * factual_accuracy + 0.45 * relevance + 0.2 * completeness

Respond ONLY with valid JSON in this format:
{{
  "relevance": <float>,
  "factual_accuracy": <float>,
  "completeness": <float>,
  "overall_score": <float>
}}
"""

prompt_template = PromptTemplate(
    template=_EVALUATION_TEMPLATE,
    input_variables=["question", "references", "student_answer"],
)

def extract_json(text: str) -> dict:
    """Return the first JSON object embedded in ``text``.

    Model replies often wrap the JSON in prose or code fences. Rather than
    grabbing everything between the first "{" and the last "}" (which breaks
    as soon as the surrounding text contains another brace), each "{" is tried
    in turn as the start of an object and the first one that decodes wins.

    Args:
        text: Raw model output.

    Returns:
        The decoded JSON object as a dict.

    Raises:
        ValueError: If ``text`` contains no "{" at all.
        json.JSONDecodeError: (a ``ValueError`` subclass) if braces are present
            but none of them starts a valid JSON object; the error from the
            first attempt is re-raised.
    """
    decoder = json.JSONDecoder()
    first_error = None
    for brace in re.finditer(r"\{", text):
        try:
            # raw_decode parses exactly one value starting at the given index
            # and ignores whatever follows it.
            parsed, _end = decoder.raw_decode(text, brace.start())
        except json.JSONDecodeError as exc:
            if first_error is None:
                first_error = exc
            continue
        return parsed
    if first_error is not None:
        raise first_error
    raise ValueError(f"No JSON found in model output: {text}")

# Weights of the overall-score formula. They mirror the "Formula:" line of the
# grading prompt and must be kept in sync with it.
_OVERALL_SCORE_WEIGHTS = {
    "factual_accuracy": 0.35,
    "relevance": 0.45,
    "completeness": 0.2,
}


def evaluate(question: str, references: List[str], student_answer: str) -> dict:
    """Grade one student answer with the LLM and return the scores as a dict.

    The prompt is filled with the question, the reference answers (one per
    line) and the student answer; the model's reply is parsed as JSON and
    validated against ``EvaluationResult``.

    ``overall_score`` is recomputed locally from the three validated component
    scores instead of being taken from the reply: the formula is deterministic
    and language models are unreliable at arithmetic.

    Args:
        question: The question that was asked.
        references: Reference answers; may be empty, in which case the prompt
            tells the model to judge on its own.
        student_answer: The answer to grade.

    Returns:
        A dict with ``relevance``, ``factual_accuracy``, ``completeness`` and
        ``overall_score``.

    Raises:
        ValueError: If no JSON object can be extracted from the reply.
        Exception: Whatever the LLM client or the pydantic validation raises.
    """
    prompt = prompt_template.format(
        question=question,
        references="\n".join(references),
        student_answer=student_answer,
    )
    response = llm.invoke(prompt)
    # Chat models return a message object; fall back to str() for plain-text LLMs.
    response_text = response.content if hasattr(response, "content") else str(response)

    scores = EvaluationResult(**extract_json(response_text)).model_dump()
    scores["overall_score"] = round(
        sum(weight * scores[name] for name, weight in _OVERALL_SCORE_WEIGHTS.items()),
        2,
    )
    return scores

In [None]:
q = "What is p-value in statistics?"

# Three reference paragraphs. The commas matter: without them Python silently
# concatenates adjacent string literals into one run-on string with no separator.
refs = [
    "The p-value is a fundamental concept in statistical hypothesis testing, used to quantify the strength of evidence against a null hypothesis. It is defined as the probability of obtaining a test statistic at least as extreme as the one actually observed, assuming the null hypothesis is true. In other words, it answers the question: “If the null hypothesis were correct, how likely is it that we would see results like this (or more extreme) purely by random chance?",
    "A small p-value (commonly less than a chosen significance level, such as 0.05) suggests that the observed data would be quite rare if the null hypothesis were true. This provides evidence in favor of the alternative hypothesis, leading researchers to reject the null. Conversely, a large p-value indicates that the observed data is consistent with the null hypothesis, and there is insufficient evidence to reject it. Importantly, the p-value does not measure the probability that the null hypothesis itself is true, nor does it indicate the size or practical importance of an effect — it only assesses the compatibility between the data and the null model.",
    "In practice, p-values are used alongside effect sizes, confidence intervals, and subject-matter knowledge to make informed conclusions. While widely used, p-values can be misinterpreted and should not be the sole basis for decision-making; they are best understood as one piece of the broader statistical inference process.",
]

# Student answers ordered from weakest to strongest, to check that the scores rise accordingly.
s0 = "p-value is probability"
s1 = "Area under the distribution curve outside the confidence interval is the p-value"
s2 = "Probability of an event that is atleast as extreme as the observed event is known as p-value"
s3 = "p-value measures the probability of an event that is atleast as extreme as the observed event, assuming null hypothesis is true"
s4 = "The p-value is a measure of how compatible your observed data is with the assumption that the null hypothesis is correct — it represents the likelihood of seeing the observed effect or something more extreme purely due to random chance if the null hypothesis holds."

student = [s0, s1, s2, s3, s4]

for answer in student:
    print(json.dumps(evaluate(q, refs, answer), indent=2))

/tmp/ipython-input-43586821.py:66: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  return validated.dict()


{
  "relevance": 30.0,
  "factual_accuracy": 40.0,
  "completeness": 5.0,
  "overall_score": 28.5
}


/tmp/ipython-input-43586821.py:66: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  return validated.dict()


{
  "relevance": 40.0,
  "factual_accuracy": 10.0,
  "completeness": 10.0,
  "overall_score": 23.5
}


/tmp/ipython-input-43586821.py:66: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  return validated.dict()


{
  "relevance": 90.0,
  "factual_accuracy": 80.0,
  "completeness": 20.0,
  "overall_score": 72.5
}


/tmp/ipython-input-43586821.py:66: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  return validated.dict()


{
  "relevance": 100.0,
  "factual_accuracy": 95.0,
  "completeness": 40.0,
  "overall_score": 86.25
}
{
  "relevance": 100.0,
  "factual_accuracy": 95.0,
  "completeness": 75.0,
  "overall_score": 93.25
}


/tmp/ipython-input-43586821.py:66: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  return validated.dict()


In [None]:
# Comprehensive Test of Automated Evaluation - First 10 Questions
# Relies on `json` and `evaluate` from the setup cell earlier in the notebook.

RESPONSE_FILE = "response_1755348961176.json"  # input file with questions, references and candidates
MAX_QUESTIONS = 10  # number of leading questions to grade
TOP_N = 5  # length of the best/worst listings in the report


def _reference_texts(result: dict) -> list:
    """Return the 'answer' text of every dict-shaped reference answer of a question."""
    return [
        ref['answer']
        for ref in (result.get('reference_answers') or [])
        if isinstance(ref, dict) and 'answer' in ref
    ]


def _candidate_text_and_type(candidate) -> tuple:
    """Return (answer text, answer type) for a candidate given as a dict or a bare value."""
    if isinstance(candidate, dict):
        text, kind = candidate.get('answer', ''), candidate.get('type', 'unknown')
    else:
        text, kind = candidate, 'unknown'
    # The preview slice and the prompt both need a string; tolerate null/non-string JSON values.
    return ('' if text is None else str(text)), kind


def _mean(values) -> float:
    """Arithmetic mean of a non-empty iterable of numbers."""
    values = list(values)
    return sum(values) / len(values)


def _scores_by(results: list, key: str) -> dict:
    """Group overall scores by the value each result holds under `key`."""
    grouped = {}
    for entry in results:
        grouped.setdefault(entry.get(key, 'unknown'), []).append(entry['overall_score'])
    return grouped


def _print_breakdown(title: str, grouped: dict, ranked: bool = False) -> None:
    """Print the mean score and sample count per group, optionally best group first."""
    print(f"\n{title}:")
    items = grouped.items()
    if ranked:
        items = sorted(items, key=lambda pair: _mean(pair[1]), reverse=True)
    for label, scores in items:
        print(f"  {label}: {_mean(scores):.2f} (n={len(scores)})")


def _print_ranking(title: str, entries) -> None:
    """Print a numbered list of evaluated answers in the order given."""
    print(f"\n{title}:")
    for rank, entry in enumerate(entries, 1):
        print(f"  {rank}. Q{entry['question_index']}-C{entry['candidate_index']} ({entry['domain']}): {entry['overall_score']:.1f} - {entry['answer_type']}")


# Load the response file
with open(RESPONSE_FILE, 'r', encoding='utf-8') as file:
    data = json.load(file)

print(f"Loaded {len(data['results'])} questions from the response file")
print(f"Testing evaluation on first {MAX_QUESTIONS} questions with ALL candidate answers")

test_results = []
questions_processed = 0
total_answers_evaluated = 0
failed_evaluations = 0

for idx, result in enumerate(data['results'][:MAX_QUESTIONS]):
    question_number = idx + 1
    question = result['question']
    domain = result.get('domain', 'Unknown')
    difficulty = result.get('difficulty', 'Unknown')

    reference_answers = _reference_texts(result)
    if not reference_answers:
        print(f"Skipping question {question_number}: No reference answers")
        continue

    print(f"\n{'='*60}")
    print(f"Processing Question {question_number}:")
    print(f"Domain: {domain}, Difficulty: {difficulty}")
    print(f"Question: {question[:100]}...")
    print(f"Reference answers available: {len(reference_answers)}")

    candidates = result.get('candidate_answers') or []
    if not candidates:
        print("No candidate answers to evaluate")
        continue

    print(f"Candidate answers to evaluate: {len(candidates)}")

    question_scores = []
    for candidate_number, candidate in enumerate(candidates, 1):
        answer_text, answer_type = _candidate_text_and_type(candidate)

        print(f"\n  Candidate {candidate_number}/{len(candidates)}:")
        print(f"  Type: {answer_type}")
        print(f"  Answer preview: {answer_text[:120]}...")

        try:
            evaluation = evaluate(question, reference_answers, answer_text)
        except Exception as e:
            # Deliberately broad: one bad reply (network error, unparsable or
            # invalid JSON) should not abort the whole run. Failures are
            # counted so the final verdict cannot hide them.
            failed_evaluations += 1
            print(f"  ❌ Evaluation failed: {e}")
            continue

        evaluation['question_index'] = question_number
        evaluation['domain'] = domain
        evaluation['difficulty'] = difficulty
        evaluation['answer_type'] = answer_type
        evaluation['candidate_index'] = candidate_number

        test_results.append(evaluation)
        question_scores.append(evaluation['overall_score'])
        total_answers_evaluated += 1

        print("  ✅ Evaluation completed:")
        print(f"     Overall Score: {evaluation['overall_score']:.1f}")
        print(f"     Relevance: {evaluation['relevance']:.1f}")
        print(f"     Accuracy: {evaluation['factual_accuracy']:.1f}")
        print(f"     Completeness: {evaluation['completeness']:.1f}")

    # Show question-level summary
    if question_scores:
        print(f"\n  Question {question_number} Summary:")
        print(f"    Answers evaluated: {len(question_scores)}")
        print(f"    Average score: {_mean(question_scores):.1f}")
        print(f"    Best score: {max(question_scores):.1f}")
        print(f"    Worst score: {min(question_scores):.1f}")

    questions_processed += 1

print(f"\n{'='*60}")
print("COMPREHENSIVE TEST EVALUATION SUMMARY")
print(f"{'='*60}")
print(f"Questions processed: {questions_processed}")
print(f"Total answers evaluated: {total_answers_evaluated}")
print(f"Failed evaluations: {failed_evaluations}")

if test_results:
    overall_scores = [entry['overall_score'] for entry in test_results]

    print("\nOverall Performance:")
    print(f"  Average Overall Score: {_mean(overall_scores):.2f}")
    print(f"  Average Relevance: {_mean(entry['relevance'] for entry in test_results):.2f}")
    print(f"  Average Factual Accuracy: {_mean(entry['factual_accuracy'] for entry in test_results):.2f}")
    print(f"  Average Completeness: {_mean(entry['completeness'] for entry in test_results):.2f}")
    print(f"  Score Range: {min(overall_scores):.1f} - {max(overall_scores):.1f}")

    _print_breakdown("Performance by Domain", _scores_by(test_results, 'domain'))
    _print_breakdown("Performance by Difficulty", _scores_by(test_results, 'difficulty'))
    _print_breakdown("Performance by Answer Type", _scores_by(test_results, 'answer_type'), ranked=True)

    # Top and bottom performers
    sorted_results = sorted(test_results, key=lambda entry: entry['overall_score'], reverse=True)
    _print_ranking(f"Top {TOP_N} Performers", sorted_results[:TOP_N])
    # Reverse the tail so that rank 1 of the "bottom" list is the worst answer.
    _print_ranking(f"Bottom {TOP_N} Performers", reversed(sorted_results[-TOP_N:]))

# Only claim success when answers were actually graded and none of them failed.
if test_results and not failed_evaluations:
    print("\n✅ Comprehensive automated evaluation system is working!")
    print("💡 This demonstrates the full evaluation capability")
    print("🚀 Ready to run on the complete dataset using evaluation_runner.py")
else:
    print(f"\n⚠️ Evaluation incomplete: {total_answers_evaluated} answers evaluated, {failed_evaluations} failed - review the messages above")

## ✅ Pydantic V2 Compatibility Update

**Fixed Deprecation Warning**: Updated the evaluation functions to use `model_dump()` instead of the deprecated `dict()` method for Pydantic V2 compatibility.

### Changes Made:
- **Cell 2**: Updated `evaluate()` function to use `validated.model_dump()`
- **automated_evaluation.py**: Updated `evaluate_single_answer()` method
- **evaluation_runner.py**: Already compatible with Pydantic V2

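This keeps the code compatible with Pydantic V2 and removes the dependency on `dict()`, which is deprecated since V2.0 and scheduled for removal in V3.0, while maintaining the same functionality. Note that the saved cell outputs above still show the old `PydanticDeprecatedSince20` warning; re-run the notebook to refresh them.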