# LLM-as-Judge: Built-in evaluator

In [1]:
import sys
sys.path.append('/opt/project/src/evaluate_llm/')
from api_key_config import settings
import os

# Copy each credential/configuration value from the local settings object into
# the process environment under the same name.
for env_name in (
    "LANGCHAIN_TRACING_V2",
    "LANGCHAIN_API_KEY",
    "OPENAI_API_VERSION",
    "OPENAI_API_KEY",
    "AZURE_OPENAI_ENDPOINT",
):
    os.environ[env_name] = getattr(settings, env_name)

# Project name is fixed for this notebook rather than read from settings.
os.environ["LANGCHAIN_PROJECT"] = "DBRX"

In [2]:
# Load blog post

import requests
from bs4 import BeautifulSoup

url = "https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm"
# Bound the request so a stalled connection cannot hang the notebook indefinitely.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of silently using an error page
# as the context that is later handed to the model.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
# Keep only the text of <p> elements, one paragraph per line.
text = [p.text for p in soup.find_all("p")]
full_text = "\n".join(text)

In [3]:
# Show the scraped text to verify what will be supplied to the model as context
print(full_text)

by The Mosaic Research Team
March 27, 2024 in Mosaic AI Research
Today, we are excited to introduce DBRX, an open, general-purpose LLM created by Databricks. Across a range of standard benchmarks, DBRX sets a new state-of-the-art for established open LLMs. Moreover, it provides the open community and enterprises building their own LLMs with capabilities that were previously limited to closed model APIs; according to our measurements, it surpasses GPT-3.5, and it is competitive with Gemini 1.0 Pro. It is an especially capable code model, surpassing specialized models like CodeLLaMA-70B on programming, in addition to its strength as a general-purpose LLM.
This state-of-the-art quality comes with marked improvements in training and inference performance. DBRX advances the state-of-the-art in efficiency among open models thanks to its fine-grained mixture-of-experts (MoE) architecture. Inference is up to 2x faster than LLaMA2-70B, and DBRX is about 40% of the size of Grok-1 in terms of bot

In [11]:
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import SystemMessage
from langchain_core.prompts import HumanMessagePromptTemplate

# Generator under test: produces the answers that get evaluated.
model_gen = AzureChatOpenAI(azure_deployment="gpt-35-turbo")
# Judge used by the evaluators. temperature=0 keeps grading as repeatable as
# possible, so score differences between experiments reflect the answers
# rather than sampling noise in the judge.
model_eval = AzureChatOpenAI(azure_deployment="gpt-35-turbo", temperature=0)

def answer_dbrx_question_llm(inputs: dict) -> dict:
    """
    Answer a user question about the scraped blog post using the Azure OpenAI chat model.

    The scraped text (module-level ``full_text``) is embedded in the system
    message as context, the question is sent as the human message, and the
    conversation is passed to ``model_gen``.

    Parameters:
    inputs (dict): A dictionary with a 'question' key holding the user's question as a string.

    Returns:
    dict: A dictionary with a single key 'answer', containing the generated answer as a string.

    Raises:
    KeyError: If 'question' is missing from ``inputs``.
    """
    # The context is interpolated here with an f-string, so the system message
    # is already a finished message; only the human turn is a template.
    system_message = SystemMessage(
        content=f"Answer user questions in 2-3 sentences about this context: \n\n\n {full_text}"
    )
    chat_template = ChatPromptTemplate.from_messages(
        [
            system_message,
            HumanMessagePromptTemplate.from_template("{text}"),
        ]
    )
    messages = chat_template.format_messages(text=inputs["question"])

    # Call the Azure OpenAI deployment configured in model_gen
    response = model_gen.invoke(messages)

    # Wrap the model's text under the 'answer' key
    return {"answer": response.content}

In [9]:
# Try the generator on a single question before running a full evaluation
sample_question = "What are the main differences in training efficiency between MPT-7B vs DBRX?"
answer_dbrx_question_llm({"question": sample_question})

{'answer': 'The main difference in training efficiency between MPT-7B and DBRX is that DBRX is nearly 4x more compute-efficient than MPT-7B. This is due to various improvements, including using an MoE architecture, other architecture changes to the network, better optimization strategies, better tokenization, and better pretraining data. These improvements have significantly increased the training efficiency of DBRX.'}

In [10]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator

# Dataset to run the experiment against
dataset_name = "DBRX"

# Single built-in "cot_qa" string evaluator, with model_eval acting as the judging LLM
cot_qa_judge = LangChainStringEvaluator("cot_qa", config={"llm": model_eval})
qa_evalulator = [cot_qa_judge]

# Free-form metadata attached to the experiment
run_metadata = {"variant": "stuff website context into gpt-3.5-turbo"}

experiment_results = evaluate(
    answer_dbrx_question_llm,
    data=dataset_name,
    evaluators=qa_evalulator,
    experiment_prefix="test-dbrx-qa-oai-invoke",
    metadata=run_metadata,
)


View the evaluation results for experiment: 'test-dbrx-qa-oai-invoke-317e63ff' at:
https://smith.langchain.com/o/4da9684a-c78b-54bf-a119-2e143c6c11df/datasets/3caf3425-c29e-4dfd-9054-b069fbc2a174/compare?selectedSessions=733e16a8-1609-40aa-a381-461373740d8d




0it [00:00, ?it/s]

![Alt text](evaluate_one_metric.png)

Note:
- In a real evaluation, `model_eval` should be switched to a stronger judge such as GPT-4 or Claude, which outperform gpt-3.5.

# Evaluate multiple metrics

In [12]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator

# Evaluators
cot_qa_evalulator = LangChainStringEvaluator("cot_qa", config={"llm": model_eval})
qa_evalulator = LangChainStringEvaluator("qa", config={"llm": model_eval})

# Custom criteria, each phrased as a yes/no question for the judge.
custom_criteria = {
    "harmfulness": "Is this response harmful?",
    "helpfulness": "Is this response helpful?",
}

# Build one criteria evaluator per criterion. Harmfulness and helpfulness pull
# in opposite directions, so judging them together in a single evaluator would
# collapse them into one combined verdict that cannot be interpreted; separate
# evaluators report a separate score for each.
criteria_evaluators = [
    LangChainStringEvaluator(
        "criteria",
        config={"criteria": {criterion: description}, "llm": model_eval},
    )
    for criterion, description in custom_criteria.items()
]

dataset_name = "DBRX"

experiment_results = evaluate(
    answer_dbrx_question_llm,
    data=dataset_name,
    evaluators=[cot_qa_evalulator, qa_evalulator, *criteria_evaluators],
    experiment_prefix="test-dbrx-qa-oai-invoke",
    # Any experiment metadata can be specified here
    metadata={"variant": "stuff website context into gpt-3.5-turbo"}
)


View the evaluation results for experiment: 'test-dbrx-qa-oai-invoke-3e22f877' at:
https://smith.langchain.com/o/4da9684a-c78b-54bf-a119-2e143c6c11df/datasets/3caf3425-c29e-4dfd-9054-b069fbc2a174/compare?selectedSessions=5b1a7009-b0f2-48a2-b7cd-c37cb8cbe849




0it [00:00, ?it/s]

![Alt text](evaluate_multiple_metric.png)

For other built-in evaluators, see https://docs.smith.langchain.com/old/evaluation/faq/evaluator-implementations