In [1]:
import os 
from langchain_openai import AzureChatOpenAI
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams
from deepeval.test_case import LLMTestCase
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import SystemMessage
from langchain_core.prompts import HumanMessagePromptTemplate
from langsmith.schemas import Run, Example
from langsmith.evaluation import evaluate

import sys
sys.path.append('/opt/project/src/evaluate_llm/')
from api_key_config import settings

# Export the LangChain tracing and (Azure) OpenAI settings as environment
# variables; each variable takes the value of the same-named attribute on
# `settings`.
os.environ.update(
    {
        env_name: getattr(settings, env_name)
        for env_name in (
            "LANGCHAIN_TRACING_V2",
            "LANGCHAIN_API_KEY",
            "OPENAI_API_VERSION",
            "OPENAI_API_KEY",
            "AZURE_OPENAI_ENDPOINT",
        )
    }
)



In [2]:
# Load data for question-answer

import requests
from bs4 import BeautifulSoup

url = "https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm"
# A timeout keeps the cell from hanging forever on an unresponsive server
# (requests waits indefinitely when none is given).
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of silently using an error page as
# the question-answering context.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
# The context is the text of every <p> element, one paragraph per line.
text = [p.text for p in soup.find_all("p")]
full_text = "\n".join(text)

In [3]:
#Deepeval section
class AzureOpenAI(DeepEvalBaseLLM):
    """DeepEval model wrapper around an already-constructed chat model.

    The wrapped object is used through `invoke` and `ainvoke`; the text of
    each reply is read from the returned object's `.content` attribute.
    """

    def __init__(self, model):
        # Keep the caller-supplied chat model; nothing is created here.
        self.model = model

    def load_model(self):
        """Return the chat model handed to the constructor."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Send `prompt` to the chat model synchronously and return the reply text."""
        reply = self.load_model().invoke(prompt)
        return reply.content

    async def a_generate(self, prompt: str) -> str:
        """Send `prompt` to the chat model asynchronously and return the reply text."""
        llm = self.load_model()
        return (await llm.ainvoke(prompt)).content

    def get_model_name(self):
        """Return the fixed display name of this wrapper."""
        return "Custom Azure OpenAI Model"
    
# Evaluator (judge) LLM. Replace the deployment name with the name of a chat
# deployment in your own Azure OpenAI resource.
# temperature=0.0 keeps the judge as repeatable as possible, so metric scores
# do not drift between runs of the same experiment.
custom_model = AzureChatOpenAI(
    deployment_name="gpt-35-turbo",
    temperature=0.0,
)
model_eval = AzureOpenAI(model=custom_model)

In [4]:
# Gen answer section
# Chat model that produces the answers being evaluated.
model_gen = AzureChatOpenAI(azure_deployment="gpt-35-turbo")


def answer_question_llm(inputs: dict) -> dict:
    """
    Answer a user question using the scraped page text (`full_text`) as context.

    Parameters:
    inputs (dict): Must contain the key 'question' holding the user's question.

    Returns:
    dict: A dictionary with the single key 'answer' holding the model's reply text.
    """
    # The page text is baked straight into the system message; only the human
    # turn is a template, filled with the question below.
    system_turn = SystemMessage(
        content=f"Answer user questions in 2-3 sentences about this context: \n\n\n {full_text}"
    )
    human_turn = HumanMessagePromptTemplate.from_template("{text}")
    prompt = ChatPromptTemplate.from_messages([system_turn, human_turn])

    # Ask the generation model and unwrap the text of its reply.
    reply = model_gen.invoke(prompt.format_messages(text=inputs["question"]))
    return {"answer": reply.content}

# G-eval

In [5]:
# Correctness judge: compares the generated answer with the reference answer
# stored in the dataset.
metric = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually correct based on the expected output.",
    # NOTE: you can only provide either criteria or evaluation_steps, and not both
    # TODO(review): both are passed here -- confirm which one the installed
    # DeepEval version actually uses.
    evaluation_steps=[
        "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
        "You should also heavily penalize omission of detail",
        "Vague language, or contradicting OPINIONS, are OK"
    ],
    # The steps above compare against the 'expected output', so it has to be
    # listed here; without it the judge is never shown the reference answer.
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
    ],
    model=model_eval,
)

def g_eval(run: Run, example: Example) -> dict:
    """Score one run with the module-level `metric` and report score and reason.

    The question and the reference answer are read from `example`; the
    generated answer is read from the run's "answer" output.
    """
    case = LLMTestCase(
        input=example.inputs.get("question"),
        actual_output=run.outputs.get("answer"),
        expected_output=example.outputs.get("answer"),
    )

    # The score and reason are read back from the metric object after measuring.
    metric.measure(case)
    return dict(score=metric.score, comment=metric.reason)


# Evaluators
qa_evalulator = [g_eval]
dataset_name = "DBRX"

# Run
experiment_results = evaluate(
    answer_question_llm,
    data=dataset_name,
    evaluators=qa_evalulator,
    experiment_prefix="test-dbrx-qa-custom-eval-g-eval",
    # Any experiment metadata can be specified here
    metadata={
        # Names the model that actually generates the answers: the target
        # function calls the "gpt-35-turbo" deployment, not gpt-4o.
        "variant": "stuff website context into gpt-3.5-turbo",
    },
)

View the evaluation results for experiment: 'test-dbrx-qa-custom-eval-g-eval-e15db94f' at:
https://smith.langchain.com/o/4da9684a-c78b-54bf-a119-2e143c6c11df/datasets/3caf3425-c29e-4dfd-9054-b069fbc2a174/compare?selectedSessions=db53f139-d5c6-4128-a440-692263d5aaea




0it [00:00, ?it/s]

Output()

Output()

Output()

Output()

# Summarization 

### Build the dataset for the summarization evaluation

In [17]:
from langsmith import Client

client = Client()
dataset_name = "Summary deepeval"

# Source passage(s) to be summarized.
inputs = ["""
The 'coverage score' is calculated as the percentage of assessment questions
for which both the summary and the original document provide a 'yes' answer. This
method ensures that the summary not only includes key information from the original
text but also accurately represents it. A higher coverage score indicates a
more comprehensive and faithful summary, signifying that the summary effectively
encapsulates the crucial points and details from the original content.
"""]

# Reference summary for each passage, in the same order as `inputs`.
outputs = ["""
The coverage score quantifies how well a summary captures and
accurately represents key information from the original text,
with a higher score indicating greater comprehensiveness.
"""]

# Store
# Only create and populate the dataset when it does not exist yet, so the cell
# can be re-run (e.g. Restart & Run All) without failing on the existing name
# or adding the same examples twice.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Source passages paired with reference summaries for the summarization evaluation.",
    )
    client.create_examples(
        inputs=[{"question": q} for q in inputs],
        outputs=[{"answer": a} for a in outputs],
        dataset_id=dataset.id,
    )

### Run the summarization evaluation

In [18]:
from deepeval.metrics import SummarizationMetric
# Bound to its own name so the module-level `metric` that `g_eval` reads is
# not replaced by a different kind of metric when this cell runs.
summarization_metric = SummarizationMetric(
    threshold=0.5,
    model=model_eval,
    assessment_questions=[
        "Is the coverage score based on a percentage of 'yes' answers?",
        "Does the score ensure the summary's accuracy with the source?",
        "Does a higher score mean a more comprehensive summary?"
    ]
)


def summarize(run: Run, example: Example) -> dict:
    """Score one run with the summarization metric.

    The example's "question" input is passed as the test case input and the
    run's "answer" output as the text to judge.

    Returns:
        dict: ``{"score": ..., "comment": ...}`` read from the metric after
        measuring.
    """
    source_text = example.inputs.get("question")
    summary = run.outputs.get("answer")

    test_case = LLMTestCase(input=source_text, actual_output=summary)

    summarization_metric.measure(test_case)
    return {
        "score": summarization_metric.score,
        "comment": summarization_metric.reason,
    }


# Dataset and evaluator(s) for the summarization experiment.
dataset_name = "Summary deepeval"  # See detail in build_dataset.ipynb
qa_evalulator = [summarize]

# Launch the experiment; the metadata dict is free-form.
# NOTE(review): the target is the question-answering function, not a
# summarizer -- confirm this is the intended system under test.
experiment_results = evaluate(
    answer_question_llm,
    evaluators=qa_evalulator,
    data=dataset_name,
    metadata={"variant": "stuff website context into gpt-3.5-turbo"},
    experiment_prefix="test-qa-custom-eval-sum-1",
)

View the evaluation results for experiment: 'test-qa-custom-eval-sum-1-35b8bb5c' at:
https://smith.langchain.com/o/4da9684a-c78b-54bf-a119-2e143c6c11df/datasets/766d5bd1-ebf7-4a0e-97c5-65aff8bfcbf2/compare?selectedSessions=af4729e3-6c2d-45d3-94ae-a91eabf0294d




0it [00:00, ?it/s]

Output()

# RAG evaluate 

# Answer Relevancy + Faithfulness + Contextual Recall + Contextual Precision + Contextual Relevancy + Hallucination

In [2]:
### INDEX

from bs4 import BeautifulSoup as Soup
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings

# Embedding model used to index the documentation.
embeddings = AzureOpenAIEmbeddings(azure_deployment="text-embedding-ada-002")

# Load: crawl from the start URL (max_depth=20) and keep each page's text as
# extracted with BeautifulSoup's "html.parser".
url = "https://python.langchain.com/docs/expression_language/"
loader = RecursiveUrlLoader(
    url=url,
    max_depth=20,
    extractor=lambda html: Soup(html, "html.parser").text,
)
docs = loader.load()

# Split the loaded pages (chunk_size=1500, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1500)
splits = text_splitter.split_documents(docs)

# Embed the chunks into a Chroma vector store and expose it as a retriever.
vectorstore = Chroma.from_documents(embedding=embeddings, documents=splits)
retriever = vectorstore.as_retriever()

In [3]:
#Deepeval section
class AzureOpenAI(DeepEvalBaseLLM):
    """Adapter that lets DeepEval metrics call an existing chat model.

    The model is used through `invoke` / `ainvoke`, and the generated text is
    taken from the `.content` attribute of what they return.
    """

    def __init__(self, model):
        # Store the ready-made chat model supplied by the caller.
        self.model = model

    def load_model(self):
        """Return the stored chat model."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Synchronously run `prompt` through the chat model and return its text."""
        answer = self.load_model().invoke(prompt)
        return answer.content

    async def a_generate(self, prompt: str) -> str:
        """Asynchronously run `prompt` through the chat model and return its text."""
        backend = self.load_model()
        return (await backend.ainvoke(prompt)).content

    def get_model_name(self):
        """Return the fixed label used for this model."""
        return "Custom Azure OpenAI Model"


# Evaluator (judge) model; replace the deployment name with your own.
custom_model = AzureChatOpenAI(deployment_name="gpt-35-turbo", temperature=0.0)
model_eval = AzureOpenAI(model=custom_model)

In [4]:
### RAG
from langsmith import traceable
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import SystemMessage
from langchain_core.prompts import HumanMessagePromptTemplate

class RagBot:
    """Minimal retrieve-then-answer bot used as the evaluation target."""

    def __init__(self, retriever):
        self._retriever = retriever
        # Chat model that writes the answers (temperature fixed at 0.0).
        self._model_gen = AzureChatOpenAI(azure_deployment="gpt-35-turbo", temperature=0.0)

    @traceable()
    def retrieve_docs(self, question):
        """Return the retriever's results for `question`."""
        return self._retriever.invoke(question)

    @traceable()
    def get_answer(self, question: str):
        """Retrieve docs for `question`, ask the model, and return answer plus contexts.

        Returns:
            dict: ``{"answer": <reply text>, "contexts": [str(doc), ...]}``.
        """
        similar = self.retrieve_docs(question)
        # Adjacent string literals are concatenated and the last one is an
        # f-string, so the retrieved docs really end up in the prompt. Written
        # as one plain triple-quoted string, the quote characters, the `f`
        # prefix and `{similar}` would all reach the model as literal text
        # and no docs would be included.
        system_prompt = (
            "You are a helpful AI code assistant with expertise in LCEL."
            " Use the following docs to produce a concise code solution to the user question.\n\n"
            f"## Docs\n\n{similar}"
        )
        chat_template = ChatPromptTemplate.from_messages(
            [
                SystemMessage(content=system_prompt),
                HumanMessagePromptTemplate.from_template("{text}"),
            ]
        )
        messages = chat_template.format_messages(text=question)
        # Evaluators will expect "answer" and "contexts"
        return {
            "answer": self._model_gen.invoke(messages).content,
            "contexts": [str(doc) for doc in similar],
        }


rag_bot = RagBot(retriever)

In [5]:
# RAG chain
def predict_rag_answer(example: dict):
    """Evaluation target that returns only the bot's answer.

    Use this when evaluating the answer alone.
    """
    question = example["question"]
    bot_output = rag_bot.get_answer(question)
    return dict(answer=bot_output["answer"])

def predict_rag_answer_with_context(example: dict):
    """Evaluation target that returns the answer together with the retrieved contexts.

    Use this for evaluating retrieved documents and hallucinations.
    """
    bot_output = rag_bot.get_answer(example["question"])
    return {field: bot_output[field] for field in ("answer", "contexts")}

In [6]:
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric, ContextualPrecisionMetric, ContextualRecallMetric, ContextualRelevancyMetric

def answerrelevancy(run: Run, example: Example) -> dict:
    """Evaluator: DeepEval answer relevancy for one run.

    Uses the example's "question" input and the run's "answer" output.
    """
    case = LLMTestCase(
        input=example.inputs.get("question"),
        actual_output=run.outputs.get("answer"),
    )

    # A fresh metric is built for every call.
    scorer = AnswerRelevancyMetric(threshold=0.7, model=model_eval, include_reason=True)
    scorer.measure(case)
    return {
        "key": "answer relevancy",
        "score": scorer.score,
        "comment": scorer.reason,
    }

def faithfulness(run: Run, example: Example) -> dict:
    """Evaluator: DeepEval faithfulness for one run.

    Uses the example's "question" input plus the run's "answer" and
    "contexts" outputs (the latter as the retrieval context).
    """
    case = LLMTestCase(
        input=example.inputs.get("question"),
        actual_output=run.outputs.get("answer"),
        retrieval_context=run.outputs.get("contexts"),
    )

    # A fresh metric is built for every call.
    scorer = FaithfulnessMetric(threshold=0.7, model=model_eval, include_reason=True)
    scorer.measure(case)
    return {
        "key": "faithfulness",
        "score": scorer.score,
        "comment": scorer.reason,
    }

def contextualprecision(run: Run, example: Example) -> dict:
    """Evaluator: DeepEval contextual precision for one run.

    Uses the example's "question" input and reference "answer", plus the
    run's "answer" and "contexts" outputs.
    """
    case = LLMTestCase(
        input=example.inputs.get("question"),
        actual_output=run.outputs.get("answer"),
        expected_output=example.outputs.get("answer"),
        retrieval_context=run.outputs.get("contexts"),
    )

    # A fresh metric is built for every call.
    scorer = ContextualPrecisionMetric(threshold=0.7, model=model_eval, include_reason=True)
    scorer.measure(case)
    return {
        "key": "contextual precision",
        "score": scorer.score,
        "comment": scorer.reason,
    }

def contextualrecall(run: Run, example: Example) -> dict:
    """Evaluator: DeepEval contextual recall for one run.

    Uses the example's "question" input and reference "answer", plus the
    run's "answer" and "contexts" outputs.

    Returns:
        dict: ``{"key": "contextual recall", "score": ..., "comment": ...}``.
    """
    question = example.inputs.get("question")
    prediction = run.outputs.get("answer")
    # No trailing comma here: with one, `reference` becomes a 1-tuple and the
    # metric receives a tuple instead of the reference answer itself.
    reference = example.outputs.get("answer")
    retrieval_context = run.outputs.get("contexts")
    test_case = LLMTestCase(
        input=question,
        actual_output=prediction,
        expected_output=reference,
        retrieval_context=retrieval_context
    )

    contextualrecall_metric = ContextualRecallMetric(
        threshold=0.7,
        model=model_eval,
        include_reason=True
    )
    contextualrecall_metric.measure(test_case)
    return {"key": "contextual recall", "score": contextualrecall_metric.score, "comment": contextualrecall_metric.reason}

def contextualrelevancy(run: Run, example: Example) -> dict:
    """Evaluator: DeepEval contextual relevancy for one run.

    Uses the example's "question" input plus the run's "answer" and
    "contexts" outputs (the latter as the retrieval context).
    """
    case = LLMTestCase(
        input=example.inputs.get("question"),
        actual_output=run.outputs.get("answer"),
        retrieval_context=run.outputs.get("contexts"),
    )

    # A fresh metric is built for every call.
    scorer = ContextualRelevancyMetric(threshold=0.7, model=model_eval, include_reason=True)
    scorer.measure(case)
    return {
        "key": "contextual relevancy",
        "score": scorer.score,
        "comment": scorer.reason,
    }

# Dataset and the evaluators applied to every run of the RAG experiment.
dataset_name = "RAG_test_LCEL" 
qa_evalulator = [
    answerrelevancy,
    faithfulness,
    contextualprecision,
    contextualrecall,
    contextualrelevancy,
]

# The target returns both the answer and the retrieved contexts, which the
# context-based evaluators read from the run outputs.
experiment_results = evaluate(
    predict_rag_answer_with_context,
    evaluators=qa_evalulator,
    data=dataset_name,
    metadata={"variant": "stuff website context into gpt-3.5-turbo"},
    experiment_prefix="test-RAG-test-LCEL-RAG-Deepeval",
)

View the evaluation results for experiment: 'test-RAG-test-LCEL-RAG-Deepeval-2d8d3cef' at:
https://smith.langchain.com/o/4da9684a-c78b-54bf-a119-2e143c6c11df/datasets/e9f3f26b-8d24-4a6d-b39d-433836c8b3f7/compare?selectedSessions=d342a771-e37c-4edb-b956-d84106a3fabd




0it [00:00, ?it/s]

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()