# LLM as an Interpreter using Judges

In [2]:
from openai import OpenAI
from openai import AzureOpenAI
from dotenv import load_dotenv

import json

import os


In [4]:
from azure.identity import DefaultAzureCredential, get_bearer_token_provider

In [5]:
from textwrap import dedent
from judges.base import BaseJudge, Judgment

In [6]:
# Load variables from a local .env file into the process environment so the
# os.environ lookups in the client setup cells below can find them.
load_dotenv()

True

In [13]:

# Plain OpenAI client; the key is read from the environment (keep it in .env).
# NOTE: the Azure setup cell binds the same `client` name, so whichever of the
# two cells ran last decides which backend `client_call_test` talks to.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


In [7]:
## Azure setup: authenticate with a bearer-token provider instead of an API key.
# GET_TOKEN is presumably the token scope for the Azure service (verify in
# .env); the endpoint is read from AZURE_SERVICES_ENDPOINT.
token_provider = get_bearer_token_provider(
    DefaultAzureCredential(),
    os.getenv("GET_TOKEN"),
)
client = AzureOpenAI(
    azure_ad_token_provider=token_provider,
    azure_endpoint=os.getenv("AZURE_SERVICES_ENDPOINT"),
    api_version="2024-12-01-preview",
)

In [10]:
def client_call_test(messages, model='gpt-4o'):
    """
    Send a chat conversation to the module-level `client` and return the reply text.

    Parameters:
    -----------
    messages: list
        Chat messages, each a dict with 'role' and 'content' keys.
    model: str
        Model name passed to the chat completions call. Defaults to 'gpt-4o',
        the value that was previously hard-coded, so existing callers are
        unaffected.

    Returns:
    --------
    str:
        The message content of the first choice in the completion.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
    )
    # The reply text is read from the first returned choice.
    return completion.choices[0].message.content

In [11]:
# Smoke test: send a one-message conversation and display the reply.
mess = [{"role": "user", "content": "hi, how are you?"}]
client_call_test(messages=mess)

"Hello! I'm just a virtual assistant, so I don't have feelings, but thank you for asking. How can I assist you today? 😊"

## LLM-as-an-Interpreter

In [12]:
## Load the factor definitions for the cross-product test case.
with open('input/scenarios/output_crossproduct.json', encoding='utf-8') as file:
    factors = json.loads(file.read())

print(factors)


{'A': {'name': 'Dimensionality Assumption (3D)', 'description': 'Assuming the cross product is defined for 3D vectors only, which shapes the input contract (length 3) and the formula used.', 'attribution_weight': 0.15}, 'B': {'name': 'Mathematical Correctness', 'description': 'Implementing the exact right-hand-rule determinant formula to ensure accurate results and expected orientation.', 'attribution_weight': 0.2}, 'C': {'name': 'Input Validation', 'description': 'Checking that both inputs are vectors of length 3 to prevent silent errors and provide clear failure modes.', 'attribution_weight': 0.12}, 'D': {'name': 'Type Choices & Hints', 'description': 'Using List[float] for clarity and interoperability, with static typing hints to aid IDEs and linters.', 'attribution_weight': 0.06}, 'E': {'name': 'Error Handling Strategy', 'description': 'Raising ValueError with a helpful message when inputs are invalid to guide callers quickly.', 'attribution_weight': 0.07}, 'F': {'name': 'Simplicit

In [13]:
# Read the generated cross-product source; this text is the focal method that
# gets interpolated into the prompts below.
with open('input/scenarios/generated_crossproduct.py', encoding='utf-8') as file:
    focal_method = file.read()

print(focal_method)

from typing import List

def cross_product(vec1: List[float], vec2: List[float]) -> List[float]:
    """
    Compute the cross product of two 3D vectors.

    Args:
        vec1 (List[float]): First vector of length 3.
        vec2 (List[float]): Second vector of length 3.

    Returns:
        List[float]: The cross product vector.

    Raises:
        ValueError: If either vector is not of length 3.
    """
    if len(vec1) != 3 or len(vec2) != 3:
        raise ValueError("Both vectors must have exactly 3 elements.")

    return [
        vec1[1] * vec2[2] - vec1[2] * vec2[1],
        vec1[2] * vec2[0] - vec1[0] * vec2[2],
        vec1[0] * vec2[1] - vec1[1] * vec2[0]
    ]


In [14]:
# Prompt template for the causal-graph step; `{factors}` and `{focal_method}`
# are filled in later with str.format.
CAUSAL_PROMPT_BASE = " ".join(
    [
        "[CAUSAL TASK] Giving the following variable in the json file {factors},",
        "generate a causal graph that explains how the variables influence each other.",
        "Consider that those variables are influencing the generation of the function",
        "{focal_method},",
        "therefore the code generation should be a separate node in the graph.",
        "The outcome must be in Graphviz DOT format.",
    ]
)


In [15]:
# Conversation for the causal-graph request: a system persona followed by the
# filled-in causal prompt.
messages = [
    dict(
        role="system",
        content=(
            "You are an expert in causal inference and counterfactual reasoning. "
            "Your job is to infer a causal graph from a set of variables and draft counterfactual explanations."
        ),
    ),
    dict(
        role="user",
        content=CAUSAL_PROMPT_BASE.format(factors=factors, focal_method=focal_method),
    ),
]

In [16]:
# Ask the model for the causal graph and display the raw reply.
output = client_call_test(messages=messages)
output

'Below is a causal graph generated in Graphviz DOT format that explains how the variables influence each other and contribute to the design and implementation of the `cross_product` function. In the graph, the `cross_product` node represents the code that is influenced by other nodes. The arrows indicate causal relationships.\n\n```dot\ndigraph causal_graph {\n    rankdir=LR;\n\n    // Nodes for input factors\n    A [label="Dimensionality Assumption (3D)", shape=box];\n    B [label="Mathematical Correctness", shape=box];\n    C [label="Input Validation", shape=box];\n    D [label="Type Choices & Hints", shape=box];\n    E [label="Error Handling Strategy", shape=box];\n    F [label="Simplicity & Readability", shape=box];\n    G [label="Dependency Avoidance", shape=box];\n    H [label="Performance Considerations", shape=box];\n    I [label="Return Format Consistency", shape=box];\n    J [label="Documentation & Usability", shape=box];\n\n    // Node for the implementation\n    cross_produ

In [17]:
# | export 
class CodeInterpreterJudge(BaseJudge):
    """
    Judge that explains generated code through counterfactuals over a causal graph.

    Judging is a two-step pipeline: first the model is asked (via
    `client_call_test`) for a causal graph over the factors, then the judge
    model is asked (via `self._judge`) what would have changed if one factor
    had not been considered.
    """

    def judge(
        self,
        factors: str,
        focal_method: str,
        node: str = 'C',
    ) -> Judgment:
        """
        Build a causal graph for the factors and return a counterfactual explanation.

        Parameters:
        -----------
        factors: str
            A JSON-like description of the factors that contribute to the code
            generation. It is interpolated into the prompt with str.format, so
            any object with a useful string form (e.g. a parsed dict) works.
        focal_method: str
            The output generated by the neural code model.
        node: str
            Identifier of the factor assumed to have been ignored in the
            counterfactual question. Defaults to 'C'.

        Returns:
        --------
        Judgment:
            The evaluation result containing the score and reasoning.
        """
        # Step 1: causal-graph prompt (placeholders filled with str.format).
        CAUSAL_PROMPT_BASE = (
            "[CAUSAL TASK] Giving the following variable in the json file {factors}, "
            "generate a causal graph that explains how the variables influence each other. "
            "Consider that those variables are influencing the generation of the function "
            "{focal_method}, "
            "therefore the code generation should be a separate node in the graph. "
            "The outcome must be in Graphviz DOT format."
        )

        # System-level instructions shared by both model calls.
        SYSTEM_PROMPT = (
            "You are an expert in causal inference and counterfactual reasoning. "
            "Your job is to infer a causal graph from a set of variables and draft counterfactual explanations."
        )

        causal_messages = [
            {
                "role": "system",
                "content": SYSTEM_PROMPT,
            },
            {
                "role": "user",
                "content": CAUSAL_PROMPT_BASE.format(factors=factors, focal_method=focal_method),
            },
        ]

        causal_graph = client_call_test(messages=causal_messages)

        # Step 2: counterfactual prompt built on top of the causal graph.
        # Dedent the template BEFORE substituting values: the focal method and
        # the causal graph are multi-line strings whose continuation lines start
        # at column 0, so dedenting after interpolation finds no common margin
        # and leaves the template's source indentation in the prompt.
        counterfactual_template = dedent(
            """
            [COUNTERFACTUAL TASK] What would have changed or happened if the software tester had not considered 
            the variable {node} during the generation of the {focal_method}? 
            Provide at most three counterfactual answers. Estimate a probability of occurrence for each answer based solely 
            on the causal graph {causal_graph} in Graphviz DOT format. The outcome must be contained in a json format in which the first level 
            comprises the name of the counterfactual answer; the second level comprises a description of the answer, 
            a step by step of the rationale of that counterfactual answer, and the probability of occurrence.
            """
        )
        USER_PROMPT = counterfactual_template.format(
            node=node,
            focal_method=focal_method,
            causal_graph=causal_graph,
        )

        reasoning, score = self._judge(
            user_prompt=USER_PROMPT,
            system_prompt=SYSTEM_PROMPT,
        )

        # NOTE(review): the prompt above asks for a JSON object of counterfactual
        # answers, while the result is labelled score_type="boolean" — confirm
        # that `BaseJudge._judge` can extract a reasoning/score pair from that
        # reply shape.
        return Judgment(reasoning=reasoning, score=score, score_type="boolean")

In [None]:
# Initialize the interpreter judge
interpreter_judge = CodeInterpreterJudge(model='openai/gpt-4o-mini')
# Keep the previous variable name working for any cell that still uses it.
politeness_judge = interpreter_judge

# Ask what would have changed had factor 'C' been ignored while generating the
# focal method. `judge` takes `factors`, `focal_method` and `node`; the former
# `input=`/`output=` keywords are not part of its signature and raised TypeError.
judgment = interpreter_judge.judge(
    factors=factors,
    focal_method=focal_method,
    node='C',
)

print(judgment.reasoning)
print(judgment.score)

### Testing `InterpreterJudge` 