In [8]:
import asyncio
import os
import time
import traceback
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple

import nest_asyncio
import torch
from dotenv import load_dotenv
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.output_parsers import JsonOutputParser

# LangChain Local Providers
# from langchain_community.llms import Ollama
from langchain_community.chat_models import ChatOllama

In [9]:
# Patch asyncio so an event loop can be re-entered while another is running.
# The notebook kernel already runs a loop; the author notes this patch is
# required to run LangGraph's async loops in a Jupyter Notebook.
# NOTE(review): must run before any async workflow call in later cells.
nest_asyncio.apply()

In [10]:
@dataclass
class EvalCase:
    """One benchmark scenario: a user query plus the routing expected for it."""
    # Question text; sent to the workflow as the content of the initial HumanMessage.
    query: str
    # Node names that must all appear in the executed path for the case to pass.
    # The first analyst node listed here ("Filing Analyst" / "Price Analyst") is
    # also compared against the first analyst node the workflow actually visits.
    expected_path: List[str]
    # Keywords attached to the case.
    # NOTE(review): nothing visible in this notebook reads this field —
    # presumably meant for checking the final answer text; confirm or remove.
    required_keywords: List[str]

@dataclass
class PerformanceMetrics:
    """Per-case measurements filled in while a benchmark case is executed."""
    # Query text of the case these metrics belong to.
    query: str
    # True when no orchestration error was flagged and every expected node was visited.
    success: bool = False
    # Number of node updates received from the workflow stream.
    total_steps: int = 0
    # Node names in the order they were streamed; default_factory gives each
    # instance its own list instead of a shared mutable default.
    path_taken: List[str] = field(default_factory=list)
    # Count of "X -> Supervisor -> X" patterns found in path_taken.
    loops_detected: int = 0
    # Visits to an analyst node beyond its first visit, summed over analyst nodes.
    redundant_calls: int = 0
    # Wall-clock seconds spent on the case, rounded to two decimals.
    latency: float = 0.0
    # True when the first analyst node visited differs from the first one expected.
    orchestration_error: bool = False
    # NOTE(review): never assigned by the code visible in this notebook, so it
    # stays at its default of 0.
    tokens_estimate: int = 0

class NotebookEvaluator:
    """
    Benchmark harness that runs a set of evaluation cases through an agent
    workflow once per local (Ollama) chat model and reports routing quality.

    For every model/case pair it streams the workflow, records which nodes
    ran, scores the path against the case's expected path, and stores one
    PerformanceMetrics per case in ``self.all_results[model_name]``.
    """

    # Analyst nodes whose first visit and repeat visits are scored.
    WORKER_NODES = ("Filing Analyst", "Price Analyst")
    # Routing node that sits between two visits of the same node in a "loop".
    SUPERVISOR_NODE = "Supervisor"

    def __init__(self, workflow_factory, llm_list: Dict[str, Any]):
        """
        Args:
            workflow_factory: Zero-argument callable returning a workflow
                object that exposes an async ``astream`` method.
            llm_list: Mapping of display name -> chat model instance to test.
        """
        self.workflow_factory = workflow_factory
        self.llms = llm_list
        # model name -> list of PerformanceMetrics, filled by run_benchmark().
        self.all_results = {}

    @staticmethod
    def get_ollama_model(model_name: str):
        """
        Build a ChatOllama chat model for ``model_name`` with deterministic
        sampling (temperature=0) and JSON output mode requested.

        NOTE(review): whether constructing ChatOllama actually contacts the
        Ollama server is not evident from this notebook; connection problems
        may only surface on first use.
        """
        print(f"Connecting to local Ollama model: {model_name}...")
        # format="json" tells Ollama to strictly follow JSON if the model supports it
        return ChatOllama(model=model_name, temperature=0, format="json")

    def _detect_loops_and_redundancy(self, path: List[str]) -> Tuple[int, int]:
        """
        Analyse an executed node path.

        Returns:
            (loops, redundancy) where ``loops`` counts every position at which
            the same node appears on both sides of a Supervisor step
            (X -> Supervisor -> X), and ``redundancy`` is the number of visits
            to analyst nodes beyond the first visit of each.
        """
        loops = sum(
            1
            for before, middle, after in zip(path, path[1:], path[2:])
            if middle == self.SUPERVISOR_NODE and before == after
        )
        # For each analyst node, every visit after the first one is redundant.
        redundancy = sum(max(path.count(worker) - 1, 0) for worker in self.WORKER_NODES)
        return loops, redundancy

    def _score_case(self, metrics, case) -> None:
        """Fill loop/redundancy counts, orchestration_error and success on ``metrics``."""
        loops, redundancy = self._detect_loops_and_redundancy(metrics.path_taken)
        metrics.loops_detected = loops
        metrics.redundant_calls = redundancy

        first_worker = next((n for n in metrics.path_taken if n in self.WORKER_NODES), None)
        expected_worker = next((n for n in case.expected_path if n in self.WORKER_NODES), None)

        # Routing to the wrong analyst first counts as an orchestration error,
        # even if the expected analyst is reached later.
        if expected_worker and first_worker != expected_worker:
            metrics.orchestration_error = True

        metrics.success = not metrics.orchestration_error and all(
            n in metrics.path_taken for n in case.expected_path
        )

    async def run_benchmark(self, cases: "List[EvalCase]") -> None:
        """
        Run every case against every configured model and store the metrics.

        A case that raises during execution is still recorded (as a failure,
        with the path and latency observed up to the error) so that crashes
        lower the pass rate instead of disappearing from the summary.

        Raises:
            ImportError: if neither ``crew`` nor ``stockanalyzer.crew`` can be
                imported.
        """
        try:
            # We assume these are available in your environment
            from crew import ContextSchema, AgentState
        except ImportError:
            from stockanalyzer.crew import ContextSchema, AgentState

        for model_name, llm in self.llms.items():
            print(f"\n{'='*60}\n🤖 BENCHMARKING LOCAL MODEL: {model_name}\n{'='*60}")
            model_results = []

            # Note: If your workflow_factory (create_agent) uses .with_structured_output internally,
            # you may need to modify crew.py to handle a simple chain + JsonOutputParser
            # as a fallback for Ollama models.
            workflow = self.workflow_factory()

            for case in cases:
                metrics = PerformanceMetrics(query=case.query)
                start_time = time.time()
                print(f"\n📝 Case: {case.query}")

                try:
                    context_obj = ContextSchema(model=llm)
                    initial_state = AgentState(
                        messages=[HumanMessage(content=case.query)],
                        iteration_count=0,
                    )

                    async for chunk in workflow.astream(
                        initial_state,
                        context=context_obj,
                        stream_mode="updates",
                    ):
                        for node_name in chunk.keys():
                            # Keep only the part after the last dot of the node name.
                            node_str = str(node_name).rsplit(".", 1)[-1]
                            metrics.path_taken.append(node_str)
                            metrics.total_steps += 1
                            print(f"  ➜ {node_str}", end="", flush=True)

                    self._score_case(metrics, case)

                except Exception as e:
                    metrics.success = False
                    # Provide clearer guidance for the common Ollama error
                    if "with_structured_output" in str(e):
                        print(f"\n  ❌ Config Error: Local model '{model_name}' does not support with_structured_output.")
                        print("     TIP: Update your supervisor_node in crew.py to use a JsonOutputParser fallback.")
                    else:
                        print(f"\n  ❌ Execution Error for {model_name}: {e}")

                # Recorded for both outcomes so failed cases are counted.
                metrics.latency = round(time.time() - start_time, 2)
                model_results.append(metrics)
                status = "✅ PASS" if metrics.success else "❌ FAIL"
                print(f"\n  Result: {status} | Time: {metrics.latency}s | Steps: {metrics.total_steps}")

            self.all_results[model_name] = model_results

    def print_comparative_summary(self) -> None:
        """
        Print one table row per benchmarked model: pass rate, average steps,
        total loops and average latency. Models with no recorded results are
        skipped.
        """
        print(f"\n\n{'#'*60}\n📊 FINAL COMPARATIVE SUMMARY\n{'#'*60}")
        header = f"{'Model':<30} | {'Pass%':<8} | {'Avg Steps':<10} | {'Loops':<6} | {'Avg Latency':<11}"
        print(header)
        print("-" * len(header))

        for model, results in self.all_results.items():
            if not results:
                continue
            total = len(results)
            pass_rate = (sum(1 for r in results if r.success) / total) * 100
            avg_steps = sum(r.total_steps for r in results) / total
            total_loops = sum(r.loops_detected for r in results)
            avg_latency = sum(r.latency for r in results) / total

            # Column widths (30, 8, 10, 6, 11) mirror the header so the '|' separators line up.
            print(f"{model:<30} | {pass_rate:>7.1f}% | {avg_steps:>10.1f} | {total_loops:>6} | {avg_latency:>10.1f}s")
        print("#" * 60)

async def run_local_benchmark(
    models_to_test: Optional[List[str]] = None,
    cases: "Optional[List[EvalCase]]" = None,
):
    """
    Benchmark local Ollama models on a set of evaluation cases and print a
    comparative summary.

    NOTE(review): this function is defined again in the next notebook cell;
    on "Run All" that later definition shadows this one. Keep only one copy.

    Args:
        models_to_test: Ollama model tags to benchmark. Defaults to
            ["qwen2.5:1.5b", "phi3.5:latest"].
        cases: Evaluation cases to run. Defaults to a single NVIDIA 10-Q
            revenue question expected to route through the Filing Analyst.

    Raises:
        ImportError: if ``create_agent`` cannot be imported from either
            ``stockanalyzer.crew`` or ``crew``.
    """
    # NotebookEvaluator.run_benchmark accepts both `crew` and
    # `stockanalyzer.crew`, so accept both here too; otherwise a setup with
    # only a top-level crew.py fails even though the module is importable.
    try:
        from stockanalyzer.crew import create_agent
    except ImportError:
        try:
            from crew import create_agent
        except ImportError as exc:
            # Fail before building any model rather than deferring the error
            # to the first workflow construction.
            raise ImportError("Please ensure crew.py is in your PYTHONPATH") from exc

    if models_to_test is None:
        models_to_test = ["qwen2.5:1.5b", "phi3.5:latest"]
    if cases is None:
        cases = [
            EvalCase(
                query="What was NVIDIA's revenue in their latest 10-Q?",
                expected_path=["Filing Analyst", "Synthesizer"],
                required_keywords=["NVIDIA", "Revenue"],
            )
        ]

    llm_bench = {}
    for model_id in models_to_test:
        # Best effort: a model that cannot be set up is reported and skipped.
        try:
            llm_bench[model_id] = NotebookEvaluator.get_ollama_model(model_id)
        except Exception as e:
            print(f"Failed to connect to Ollama for {model_id}: {e}")

    evaluator = NotebookEvaluator(create_agent, llm_bench)
    await evaluator.run_benchmark(cases)
    evaluator.print_comparative_summary()

In [11]:
async def run_local_benchmark(
    models_to_test: Optional[List[str]] = None,
    cases: "Optional[List[EvalCase]]" = None,
):
    """
    Benchmark local Ollama models on a set of evaluation cases and print a
    comparative summary.

    NOTE(review): an earlier cell also defines a function with this name;
    this later definition is the one in effect after "Run All". Keep only
    one copy.

    Args:
        models_to_test: Ollama model tags to benchmark. Defaults to
            ["qwen2.5:1.5b", "phi3.5:latest"].
        cases: Evaluation cases to run. Defaults to a single NVIDIA 10-Q
            revenue question expected to route through the Filing Analyst.

    Raises:
        ImportError: if ``create_agent`` cannot be imported from either
            ``stockanalyzer.crew`` or ``crew``.
    """
    # NotebookEvaluator.run_benchmark accepts both `crew` and
    # `stockanalyzer.crew`, so accept both here too; otherwise a setup with
    # only a top-level crew.py fails even though the module is importable.
    try:
        from stockanalyzer.crew import create_agent
    except ImportError:
        try:
            from crew import create_agent
        except ImportError as exc:
            # Fail before building any model rather than deferring the error
            # to the first workflow construction.
            raise ImportError("Please ensure crew.py is in your PYTHONPATH") from exc

    if models_to_test is None:
        models_to_test = ["qwen2.5:1.5b", "phi3.5:latest"]
    if cases is None:
        cases = [
            EvalCase(
                query="What was NVIDIA's revenue in their latest 10-Q?",
                expected_path=["Filing Analyst", "Synthesizer"],
                required_keywords=["NVIDIA", "Revenue"],
            )
        ]

    llm_bench = {}
    for model_id in models_to_test:
        # Best effort: a model that cannot be set up is reported and skipped.
        try:
            llm_bench[model_id] = NotebookEvaluator.get_ollama_model(model_id)
        except Exception as e:
            print(f"Failed to connect to Ollama for {model_id}: {e}")

    evaluator = NotebookEvaluator(create_agent, llm_bench)
    await evaluator.run_benchmark(cases)
    evaluator.print_comparative_summary()

In [12]:
# Kick off the benchmark with its default models and cases. This relies on the
# notebook's support for top-level `await`; in a plain script this call would
# have to be wrapped (e.g. in asyncio.run).
await run_local_benchmark()

Connecting to local Ollama model: qwen2.5:1.5b...
Connecting to local Ollama model: phi3.5:latest...

🤖 BENCHMARKING LOCAL MODEL: qwen2.5:1.5b

📝 Case: What was NVIDIA's revenue in their latest 10-Q?
  ➜ Supervisor  ➜ Synthesizer
  Result: ❌ FAIL | Time: 27.87s | Steps: 2

🤖 BENCHMARKING LOCAL MODEL: phi3.5:latest

📝 Case: What was NVIDIA's revenue in their latest 10-Q?
  ➜ Supervisor  ➜ Synthesizer
  Result: ❌ FAIL | Time: 60.48s | Steps: 2


############################################################
📊 FINAL COMPARATIVE SUMMARY
############################################################
Model                          | Pass%    | Avg Steps  | Loops  | Avg Latency
-----------------------------------------------------------------------------
qwen2.5:1.5b                   |    0.0% |        2.0 |      0 |       27.9s
phi3.5:latest                  |    0.0% |        2.0 |      0 |       60.5s
############################################################
