In [3]:
from dataclasses import dataclass, field
from typing import Optional

from llm_needle_haystack_tester import LLMNeedleHaystackTester
from llm_multi_needle_haystack_tester import LLMMultiNeedleHaystackTester
from evaluators import Evaluator, LangSmithEvaluator, OpenAIEvaluator
from providers import Anthropic, ModelProvider, OpenAI, Databricks

In [2]:
import os 


# Each pair is (environment variable to set, name of the secret holding its value).
# `dbutils` is not imported anywhere in this notebook -- presumably the global that the
# Databricks notebook runtime provides; confirm before running elsewhere.
for _env_var, _secret_name in (
    ("LANGCHAIN_API_KEY", "<your_langchain_api_key>"),
    ("NIAH_MODEL_API_KEY", "<your_databricks_pat_token>"),
    ("NIAH_EVALUATOR_API_KEY", "<your_open_ai_api_key>"),
):
    os.environ[_env_var] = dbutils.secrets.get(scope="<your_scope>", secret=_secret_name)

In [20]:
@dataclass
class CommandArgs:
    """Run configuration for the needle-in-a-haystack test driven by this notebook.

    Fields read directly in this notebook:
        provider, model_name: select and configure the model provider under test.
        evaluator_label, evaluator_model_name: select and configure the evaluator.
        needle, retrieval_question: also given to the OpenAI evaluator as the true
            answer and the question asked.
        multi_needle: chooses between the multi-needle and single-needle tester.

    Every other field is only carried here and handed to the tester constructor as a
    keyword argument; its exact meaning is defined by the tester classes.
    """

    # --- Model under test / evaluator selection ---
    provider: str = "databricks"
    evaluator_label: str = "openai"
    model_name: str = "<model_name>"
    evaluator_model_name: Optional[str] = "gpt-3.5-turbo-0125"

    # --- Needle, haystack and question ---
    needle: Optional[str] = (
        "\nThe best thing to do in San Francisco is eat a sandwich "
        "and sit in Dolores Park on a sunny day.\n"
    )
    haystack_dir: Optional[str] = "PaulGrahamEssays"
    retrieval_question: Optional[str] = "What is the best thing to do in San Francisco?"
    results_version: Optional[int] = 1

    # --- Context-length settings ---
    context_lengths_min: Optional[int] = 800
    context_lengths_max: Optional[int] = 8100
    context_lengths_num_intervals: Optional[int] = 10
    context_lengths: Optional[list[int]] = None

    # --- Document-depth settings ---
    document_depth_percent_min: Optional[int] = 0
    document_depth_percent_max: Optional[int] = 100
    document_depth_percent_intervals: Optional[int] = 4
    document_depth_percents: Optional[list[int]] = None
    document_depth_percent_interval_type: Optional[str] = "linear"

    # --- Execution and output settings ---
    num_concurrent_requests: Optional[int] = 1
    save_results: Optional[bool] = True
    results_dir: Optional[str] = "results"
    save_contexts: Optional[bool] = True
    contexts_dir: Optional[str] = "contexts"
    final_context_length_buffer: Optional[int] = 200
    seconds_to_sleep_between_completions: Optional[float] = None
    print_ongoing_status: Optional[bool] = True

    # --- LangSmith parameters ---
    eval_set: Optional[str] = "multi-needle-eval-pizza-3"

    # --- Multi-needle parameters ---
    multi_needle: Optional[bool] = False
    # A factory is required so that each instance gets its own list object.
    needles: list[str] = field(
        default_factory=lambda: [
            " Figs are one of the secret ingredients needed to build the perfect pizza. ",
            " Prosciutto is one of the secret ingredients needed to build the perfect pizza. ",
            " Goat cheese is one of the secret ingredients needed to build the perfect pizza. ",
        ]
    )

In [21]:
def get_model_to_test(args: dict) -> ModelProvider:
    """
    Determines and returns the appropriate model provider based on the provided command dictionnary.
    
    Args:
        args (dict): The command line arguments parsed into a CommandArgs dataclass instance.
        
    Returns:
        ModelProvider: An instance of the specified model provider class.
    
    Raises:
        ValueError: If the specified provider is not supported.
    """
    match args.provider.lower():
        case "openai":
            return OpenAI(model_name=args.model_name)
        case "anthropic":
            return Anthropic(model_name=args.model_name)
        case "databricks":
            return Databricks(model_name=args.model_name)
        case _:
            raise ValueError(f"Invalid provider: {args.provider}")

def get_evaluator(args: dict) -> Evaluator:
    """
    Selects and returns the appropriate evaluator based on the provided command arguments.
    
    Args:
        args (CommandArgs): The command line arguments parsed into a CommandArgs dataclass instance.
        
    Returns:
        Evaluator: An instance of the specified evaluator class.
        
    Raises:
        ValueError: If the specified evaluator is not supported.
    """
    match args.evaluator_label.lower():
        case "openai":
            return OpenAIEvaluator(model_name=args.evaluator_model_name,
                                   question_asked=args.retrieval_question,
                                   true_answer=args.needle)
        case "langsmith":
            return LangSmithEvaluator()
        case _:
            raise ValueError(f"Invalid evaluator: {args.evaluator}")

In [26]:
# Create an *instance* of the configuration. Binding the class itself would make the
# assignments below mutate the class, and its `__dict__` would be the class namespace:
# dunder entries included, `needles` (a default_factory field) missing.
args = CommandArgs()
args.model_to_test = get_model_to_test(args)
args.evaluator = get_evaluator(args)

# Every configuration field, plus the two objects attached above, is forwarded as a
# keyword argument. NOTE(review): this assumes the tester constructors tolerate keywords
# they do not use (e.g. `provider`, `evaluator_label`) -- confirm against their signatures.
if args.multi_needle:
    print("Testing multi-needle")
    tester = LLMMultiNeedleHaystackTester(**vars(args))
else:
    print("Testing single-needle")
    tester = LLMNeedleHaystackTester(**vars(args))
tester.start_test()

Testing single-needle


ValueError: Needle, haystack, and retrieval_question must be provided.

In [None]:
# Runs the test again on the `tester` object built in the previous cell; that cell
# already calls start_test() once, so this cell is only needed for a repeat run.
tester.start_test()