In [8]:
import sys
sys.path.append('/opt/project/src/evaluate_llm/')

from api_key_config import settings
import os

# Export the LangSmith tracing settings and the Azure OpenAI credentials from
# the project's settings object into the process environment. os.environ only
# accepts string values, so each settings attribute must already be a str.
os.environ.update(
    {
        'LANGCHAIN_TRACING_V2': settings.LANGCHAIN_TRACING_V2,
        'LANGCHAIN_API_KEY': settings.LANGCHAIN_API_KEY,
        "OPENAI_API_VERSION": settings.OPENAI_API_VERSION,
        "OPENAI_API_KEY": settings.OPENAI_API_KEY,
        "AZURE_OPENAI_ENDPOINT": settings.AZURE_OPENAI_ENDPOINT,
    }
)

Build data for evaluation

In [8]:
import pandas as pd

# QA
# Questions sent to the model and the reference answers they are scored
# against. The two lists are paired by position.
inputs = [
    "Could you say: The quick brown fox jumps over the lazy dog",
]

outputs = [
    "The quick brown dog jumps over the lazy fox",
]

# A length mismatch would silently mis-pair questions and references.
assert len(inputs) == len(outputs), "inputs and outputs must be the same length"

from langsmith import Client

client = Client()
dataset_name = "stat_test"

# Store
# Creating a dataset whose name already exists fails, which would break a
# re-run of this cell (or Restart & Run All). Reuse the dataset when it is
# already present and only seed the examples when it is first created, so
# repeated runs do not duplicate examples.
# NOTE(review): an existing dataset is assumed to already hold the examples;
# delete it in LangSmith if it needs to be rebuilt.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="QA pairs about statistic eval",
    )
    client.create_examples(
        inputs=[{"question": q} for q in inputs],
        outputs=[{"answer": a} for a in outputs],
        dataset_id=dataset.id,
    )

Define the function that generates an answer for each question

In [9]:
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import SystemMessage
from langchain_core.prompts import HumanMessagePromptTemplate

# Azure OpenAI deployment used by both chat clients below.
AZURE_DEPLOYMENT = "gpt-35-turbo"

# model_gen produces the answers; model_eval is a second client on the same
# deployment (it is not referenced in the cells visible here).
model_gen = AzureChatOpenAI(azure_deployment=AZURE_DEPLOYMENT)
model_eval = AzureChatOpenAI(azure_deployment=AZURE_DEPLOYMENT)

def answer_question_llm(inputs: dict) -> dict:
    """Answer one dataset question with the generation model.

    Args:
        inputs: Example inputs; the question text is read from
            ``inputs["question"]``.

    Returns:
        ``{"answer": <model reply text>}``.
    """
    # Fixed system instruction plus a human turn carrying the question.
    system_part = SystemMessage(content="Answer user questions")
    human_part = HumanMessagePromptTemplate.from_template("{text}")
    prompt = ChatPromptTemplate.from_messages([system_part, human_part])

    # The question is substituted as a template variable value.
    rendered = prompt.format_messages(text=inputs["question"])

    # Call the chat model and wrap the reply text in the output dict.
    reply = model_gen.invoke(rendered)
    return {"answer": reply.content}

BLEU

In [10]:
from nltk.translate.bleu_score import sentence_bleu

In [11]:
from langsmith.schemas import Run, Example
from langsmith.evaluation import evaluate

def bleu_score(run: Run, example: Example) -> dict:
    """Score the generated answer against the reference answer with BLEU.

    Two sentence-level scores are reported: one with all weight on unigrams
    (``BLEU-1gram``) and one with all weight on bigrams (``BLEU-2gram``).
    Both texts are tokenized by plain whitespace splitting.

    Args:
        run: Run whose ``outputs["answer"]`` holds the generated text.
        example: Dataset example whose ``outputs["answer"]`` holds the
            reference text.

    Returns:
        A dict with a ``"results"`` list holding one ``{"key", "score"}``
        entry per metric. Both scores are ``0.0`` when either answer is
        missing or contains no tokens.
    """
    # ``outputs`` may be None and the "answer" key may be absent; treat both
    # as empty text instead of raising AttributeError.
    prediction = (run.outputs or {}).get("answer") or ""
    reference = (example.outputs or {}).get("answer") or ""
    hypothesis_tokens = prediction.split()
    reference_tokens = reference.split()

    if hypothesis_tokens and reference_tokens:
        score_1gram = sentence_bleu([reference_tokens], hypothesis_tokens, weights=(1, 0, 0, 0))
        score_2gram = sentence_bleu([reference_tokens], hypothesis_tokens, weights=(0, 1, 0, 0))
    else:
        # Nothing to compare: report zero overlap without calling NLTK.
        score_1gram = score_2gram = 0.0

    return {
        "results": [
            {"key": "BLEU-1gram", "score": score_1gram},
            {"key": "BLEU-2gram", "score": score_2gram},
            ]
        }

# Evaluators
# Evaluator functions handed to `evaluate`; here only the BLEU scorer.
qa_evalulator = [bleu_score]
# Name of the dataset the experiment is run against.
dataset_name = "stat_test"

# Run
# `evaluate` receives the answer function as the target, the dataset name,
# and the evaluator list; the returned results object is kept for inspection.
experiment_results = evaluate(
    answer_question_llm,
    data=dataset_name,
    evaluators=qa_evalulator,
    experiment_prefix="test-bleu-score",
    # Any experiment metadata can be specified here
    # NOTE(review): this "variant" text mentions website context, which
    # answer_question_llm does not use — confirm it is the intended label.
    metadata={
        "variant": "stuff website context into gpt-3.5-turbo",
    },
)

View the evaluation results for experiment: 'test-bleu-score-8a86c4b6' at:
https://smith.langchain.com/o/4da9684a-c78b-54bf-a119-2e143c6c11df/datasets/17aefcee-326d-4e8e-85b3-1f890af4b74d/compare?selectedSessions=92253e8e-983b-486c-8d2c-f46d54ec4e8e




0it [00:00, ?it/s]

ROUGE

In [12]:
from langsmith.schemas import Run, Example
from langsmith.evaluation import evaluate
from rouge_score import rouge_scorer

def rouge_score(run: Run, example: Example) -> dict:
    """Score the generated answer against the reference answer with ROUGE.

    Reports the F-measure for ROUGE-1, ROUGE-2 and ROUGE-L, computed with
    stemming enabled.

    Args:
        run: Run whose ``outputs["answer"]`` holds the generated text.
        example: Dataset example whose ``outputs["answer"]`` holds the
            reference text.

    Returns:
        A dict with a ``"results"`` list holding one ``{"key", "score"}``
        entry per ROUGE variant. All scores are ``0.0`` when either answer
        is missing or empty.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']

    # ``outputs`` may be None and the "answer" key may be absent; treat both
    # as empty text instead of raising AttributeError.
    prediction = (run.outputs or {}).get("answer") or ""
    reference = (example.outputs or {}).get("answer") or ""

    if prediction and reference:
        scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
        # RougeScorer.score takes the target (reference) first.
        scores = scorer.score(reference, prediction)
        fmeasures = {name: scores[name].fmeasure for name in rouge_types}
    else:
        # Nothing to compare: report zero overlap without building a scorer.
        fmeasures = dict.fromkeys(rouge_types, 0.0)

    return {
        "results": [
            {"key": f"{name}_fmeasure", "score": fmeasures[name]}
            for name in rouge_types
            ]
        }

# Evaluators
# Evaluator functions handed to `evaluate`; here only the ROUGE scorer.
qa_evalulator = [rouge_score]
# Name of the dataset the experiment is run against.
dataset_name = "stat_test"

# Run
# `evaluate` receives the answer function as the target, the dataset name,
# and the evaluator list; the returned results object is kept for inspection.
experiment_results = evaluate(
    answer_question_llm,
    data=dataset_name,
    evaluators=qa_evalulator,
    experiment_prefix="test-rouge-score",
    # Any experiment metadata can be specified here
    # NOTE(review): this "variant" text mentions website context, which
    # answer_question_llm does not use — confirm it is the intended label.
    metadata={
        "variant": "stuff website context into gpt-3.5-turbo",
    },
)

View the evaluation results for experiment: 'test-rouge-score-92fc4ca9' at:
https://smith.langchain.com/o/4da9684a-c78b-54bf-a119-2e143c6c11df/datasets/17aefcee-326d-4e8e-85b3-1f890af4b74d/compare?selectedSessions=4646fcc6-2931-4270-97c9-ef3041407c47




0it [00:00, ?it/s]

METEOR

In [13]:
from langsmith.schemas import Run, Example
from langsmith.evaluation import evaluate
from nltk.translate.meteor_score import meteor_score as meteor_scorer
from nltk import word_tokenize
import nltk
# METEOR's synonym matching reads the WordNet corpus, and word_tokenize needs
# the Punkt sentence-tokenizer data. Newer NLTK releases look for that data
# under 'punkt_tab' and older ones under 'punkt', so fetch both; a fresh
# environment otherwise fails with LookupError on the first word_tokenize call.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

def meteor_score(run: Run, example: Example) -> dict:
    """Score the generated answer against the reference answer with METEOR.

    Both texts are tokenized with ``word_tokenize`` and scored with
    ``alpha=0.9``, ``beta=3`` and ``gamma=0.5``.

    Args:
        run: Run whose ``outputs["answer"]`` holds the generated text.
        example: Dataset example whose ``outputs["answer"]`` holds the
            reference text.

    Returns:
        ``{"key": "meteor", "score": <float>}``. The score is ``0.0`` when
        either answer is missing or empty.
    """
    # ``outputs`` may be None and the "answer" key may be absent; treat both
    # as empty text instead of passing None to the tokenizer.
    prediction = (run.outputs or {}).get("answer") or ""
    reference = (example.outputs or {}).get("answer") or ""

    if not prediction or not reference:
        # Nothing to compare: report zero without tokenizing.
        return {"key": "meteor", "score": 0.0}

    score = meteor_scorer(
        [word_tokenize(reference)],
        word_tokenize(prediction),
        alpha=0.9,
        beta=3,
        gamma=0.5,
    )
    return {"key": "meteor", "score": score}

# Evaluators
# Evaluator functions handed to `evaluate`; here only the METEOR scorer.
qa_evalulator = [meteor_score]
# Name of the dataset the experiment is run against.
dataset_name = "stat_test"

# Run
# `evaluate` receives the answer function as the target, the dataset name,
# and the evaluator list; the returned results object is kept for inspection.
experiment_results = evaluate(
    answer_question_llm,
    data=dataset_name,
    evaluators=qa_evalulator,
    experiment_prefix="test-meteor-score",
    # Any experiment metadata can be specified here
    # NOTE(review): this "variant" text mentions website context, which
    # answer_question_llm does not use — confirm it is the intended label.
    metadata={
        "variant": "stuff website context into gpt-3.5-turbo",
    },
)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


View the evaluation results for experiment: 'test-meteor-score-2b26a352' at:
https://smith.langchain.com/o/4da9684a-c78b-54bf-a119-2e143c6c11df/datasets/17aefcee-326d-4e8e-85b3-1f890af4b74d/compare?selectedSessions=943790f6-846a-4e2f-9276-d61737d9a6e1




0it [00:00, ?it/s]