# LLM-as-Judge Library Demo

This notebook demonstrates the key features of the LLM-as-Judge library.

## 1. Installation

First, install the library with provider support:

In [None]:
# Install the library together with its optional provider extras.
# Uncomment the line below to install from inside the notebook; the %pip magic
# installs into the environment of the running kernel.
# %pip install "llm-judge[providers]"

## 2. Basic Setup

In [None]:
import asyncio
from llm_judge import (
    Judge,
    CategoryRegistry,
    CategoryDefinition,
    CharacteristicProperty,
    PropertyType,
    Range,
)

# For Jupyter notebooks
import nest_asyncio
nest_asyncio.apply()

## 3. Using Built-in Categories

In [None]:
# Instantiate the registry used to look up category definitions
registry = CategoryRegistry()

# Print one bullet per category returned by the registry
print("Available categories:")
for category in registry.list_categories():
    print(f"  - {category}")

# Fetch a single category by name and show its name and description
academic_category = registry.get("academic_writing")
print(
    f"\nCategory: {academic_category.name}",
    f"Description: {academic_category.description}",
    sep="\n",
)

## 4. Evaluating Content

In [None]:
# Initialize judge with mock provider (no API key needed)
judge = Judge(provider="mock", temperature=0.1)

# Sample academic content
academic_content = """
This study examines the impact of climate change on biodiversity in tropical ecosystems.
According to Smith et al. (2023), rising temperatures have led to a 30% decline in 
species diversity over the past decade [1]. The methodology employed systematic review
of peer-reviewed literature from 2010-2023, analyzing 150 studies across 45 countries.
Results indicate significant correlation (p<0.001) between temperature increase and
habitat loss [2].
"""

# Evaluate the content
result = await judge.evaluate(academic_content, academic_category)

print(f"Matches category: {result.matches_category}")
print(f"Confidence: {result.confidence:.2f}")
print(f"\nFeedback: {result.feedback}")

## 5. Creating Custom Categories

In [None]:
def _has_docstrings(content):
    """Return True when the text contains a triple-quote delimiter (double or single)."""
    return '"""' in content or "'''" in content


def _comment_ratio(content):
    """Return the fraction of lines whose first non-blank character is '#'.

    The denominator is the total number of lines (blank lines included),
    not only the lines that hold code.
    """
    lines = content.split('\n')
    comment_lines = sum(1 for line in lines if line.strip().startswith('#'))
    # str.split always returns at least one element, so this never divides by zero.
    return comment_lines / len(lines)


# Define a custom category for code documentation
code_doc_category = CategoryDefinition(
    name="code_documentation",
    description="Well-documented code with clear explanations",
    characteristic_properties=[
        CharacteristicProperty(
            name="has_docstrings",
            property_type=PropertyType.NECESSARY,
            formal_definition="Contains docstrings for functions/classes",
            measurement_function=_has_docstrings,
            threshold=True,
            weight=2.0,
        ),
        CharacteristicProperty(
            name="comment_ratio",
            property_type=PropertyType.TYPICAL,
            # Wording matches what _comment_ratio measures: comments over ALL lines.
            formal_definition="Ratio of comment lines to total lines",
            measurement_function=_comment_ratio,
            threshold=0.1,  # At least 10% of all lines are comments
            weight=1.0,
        ),
    ],
)

# Register the custom category
registry.register(code_doc_category)

print(f"Registered category: {code_doc_category.name}")

In [None]:
# Test with code sample
code_sample = '''
def calculate_fibonacci(n: int) -> int:
    """
    Calculate the nth Fibonacci number.
    
    Args:
        n: The position in the Fibonacci sequence
        
    Returns:
        The nth Fibonacci number
    """
    # Base cases
    if n <= 0:
        return 0
    elif n == 1:
        return 1
    
    # Recursive calculation
    return calculate_fibonacci(n-1) + calculate_fibonacci(n-2)
'''

result = await judge.evaluate(code_sample, code_doc_category)
print(f"Is well-documented code: {result.matches_category}")
print(f"Confidence: {result.confidence:.2f}")

## 6. Content Comparison

In [None]:
# Original stub article
original = "Python is a programming language."

# Enhanced version
enhanced = """
Python is a high-level, interpreted programming language known for its
simplicity and readability. Created by Guido van Rossum and first released
in 1991, Python emphasizes code readability with its use of significant
indentation. It supports multiple programming paradigms including procedural,
object-oriented, and functional programming. Python's extensive standard
library and vast ecosystem of third-party packages make it suitable for
web development, data science, artificial intelligence, and automation.
"""

# Create a wiki stub category
wiki_stub = CategoryDefinition(
    name="wiki_stub",
    description="Short Wikipedia article that needs expansion",
    characteristic_properties=[
        CharacteristicProperty(
            name="word_count",
            property_type=PropertyType.NECESSARY,
            formal_definition="Word count between 50 and 500",
            measurement_function=lambda c: len(c.split()),
            threshold=Range(min=50, max=500),
            weight=2.0,
        ),
    ],
)

# Compare versions
comparison = await judge.compare(original, enhanced, wiki_stub)

print(f"Original score: {comparison.original_score:.2f}")
print(f"Enhanced score: {comparison.modified_score:.2f}")
print(f"Improvement: {comparison.progress_percentage:.1f}%")
print(f"\nRecommendation: {comparison.recommendation}")

## 7. Batch Evaluation

In [None]:
# Multiple contents to evaluate
contents = [
    "The quantum mechanical model describes electron behavior in atoms.",
    "According to recent studies [1], the hypothesis was confirmed (p<0.05).",
    "LOL this is so cool! Check it out!!!",
    "The systematic review analyzed 50 peer-reviewed papers from 2020-2023.",
]

# Batch evaluation
results = await judge.batch_evaluate(contents, academic_category)

# Display results
for i, (content, result) in enumerate(zip(contents, results), 1):
    print(f"Content {i}: {content[:50]}...")
    print(f"  Academic: {result.matches_category}")
    print(f"  Confidence: {result.confidence:.2f}")
    print()

## 8. Using Real LLM Providers

To use real LLM providers, set your API keys as environment variables (ideally in your shell before launching Jupyter, so the keys are never saved in the notebook):

In [None]:
# Every example below is commented out because it needs a real API key.
# If you set a key here rather than in your shell, do not save or commit the
# notebook with the key still in it.

# Example with OpenAI (requires API key)
# import os
# os.environ["OPENAI_API_KEY"] = "your-api-key-here"
#
# judge_openai = Judge(provider="openai", model="gpt-4-turbo-preview")
# result = await judge_openai.evaluate(academic_content, academic_category)

# Example with Anthropic (also needs the `import os` from the OpenAI example)
# os.environ["ANTHROPIC_API_KEY"] = "your-api-key-here"
# judge_claude = Judge(provider="anthropic", model="claude-3-opus-20240229")

# Example with Google Gemini (also needs the `import os` from the OpenAI example)
# os.environ["GOOGLE_API_KEY"] = "your-api-key-here"
# judge_gemini = Judge(provider="gemini", model="gemini-1.5-pro")

print("To use real providers, uncomment the code above and add your API keys.")

## 9. Multi-Provider Consensus

In [None]:
from llm_judge import ConsensusMode

# Use multiple mock providers for demonstration
# In production, you'd use different real providers
from llm_judge.providers.mock import MockProvider

providers = [
    MockProvider(always_match=True),
    MockProvider(always_match=True),
    MockProvider(always_match=False),  # Dissenting judge
]

# Create judge with consensus
consensus_judge = Judge(
    provider=providers,
    consensus_mode=ConsensusMode.MAJORITY
)

# Evaluate with consensus
result = await consensus_judge.evaluate(academic_content, academic_category)

print(f"Consensus result: {result.matches_category}")
print(f"Agreement details: {result.metadata}")

## 10. Analytics and Monitoring

In [None]:
# Read the judge's cache statistics; the keys used below are
# 'size', 'hits' and 'misses'.
stats = judge.get_cache_stats()
# Plain string: there is nothing to interpolate, so no f-prefix is needed.
print("Cache statistics:")
print(f"  Size: {stats['size']}")
print(f"  Hits: {stats['hits']}")
print(f"  Misses: {stats['misses']}")

# Clear cache if needed
judge.clear_cache()
print("\nCache cleared!")

## Summary

This notebook demonstrated:
- Using built-in categories
- Creating custom categories with properties
- Evaluating content against categories
- Comparing content versions
- Batch evaluation
- Multi-provider consensus
- Cache management

For more examples and documentation, see the [GitHub repository](https://github.com/gmelli/llm-judge).