In [1]:
! pip install langfuse langchain langchain-openai --upgrade

Collecting langfuse
  Downloading langfuse-2.33.0-py3-none-any.whl.metadata (3.1 kB)
Collecting langchain
  Using cached langchain-0.2.1-py3-none-any.whl.metadata (13 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.1.7-py3-none-any.whl.metadata (2.5 kB)
Collecting langchain-core<0.3.0,>=0.2.0 (from langchain)
  Using cached langchain_core-0.2.1-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Using cached langchain_text_splitters-0.2.0-py3-none-any.whl.metadata (2.2 kB)
Collecting openai<2.0.0,>=1.24.0 (from langchain-openai)
  Downloading openai-1.30.3-py3-none-any.whl.metadata (21 kB)
Collecting tiktoken<1,>=0.7 (from langchain-openai)
  Downloading tiktoken-0.7.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.6 kB)
Downloading langfuse-2.33.0-py3-none-any.whl (162 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.4/162.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hUsin

In [5]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment
# (presumably the Langfuse and OpenAI credentials needed later — confirm
# against your own .env).
load_dotenv()

True

In [6]:
import os
# Model name read by the evaluator factories via os.environ.get('EVAL_MODEL').
os.environ["EVAL_MODEL"] = "gpt-3.5-turbo-instruct"

# Langchain Eval types: set a flag to False to skip that criterion.
EVAL_TYPES = dict(
    hallucination=True,
    conciseness=True,
    relevance=True,
    coherence=True,
    harmfulness=True,
    maliciousness=True,
    helpfulness=True,
    controversiality=True,
    misogyny=True,
    criminality=True,
    insensitivity=True,
)

In [7]:
from langfuse import Langfuse

# Create the client without explicit arguments (presumably it reads its keys
# and host from the environment — confirm against the SDK configuration docs).
langfuse = Langfuse()

# Verify the credentials before doing any work; the recorded run returned True.
langfuse.auth_check()

True

In [8]:
def fetch_all_pages(name=None, user_id = None, limit=50):
    """Collect every generation from Langfuse by walking the paginated results.

    Pages are requested one at a time, starting at page 1, until a page comes
    back with no data.

    Args:
        name: Forwarded as the ``name`` argument of ``get_generations``.
        user_id: Forwarded as the ``user_id`` argument of ``get_generations``.
        limit: Page size passed with each request.

    Returns:
        One list holding the ``data`` items of all pages, in page order.
    """
    collected = []
    page_number = 0

    while True:
        page_number += 1
        batch = langfuse.get_generations(
            name=name, limit=limit, user_id=user_id, page=page_number
        ).data
        # An empty page marks the end of the result set.
        if not batch:
            return collected
        collected += batch

In [16]:
generations = fetch_all_pages()

# Show a summary instead of printing the whole list: the full dump contains
# every prompt and completion and swamps the cell output.
print(f"Fetched {len(generations)} generations")
generations[:1]

[ObservationsView(id='48bde450-f354-456e-a643-939eb3338dbc', trace_id='ba8cae75-1950-492c-a637-ca15a0064b02', type='GENERATION', name='generate-poem', start_time=datetime.datetime(2024, 5, 17, 14, 7, 58, 776000, tzinfo=datetime.timezone.utc), end_time=datetime.datetime(2024, 5, 17, 14, 8, 1, 778000, tzinfo=datetime.timezone.utc), completion_start_time=None, model='gpt-3.5-turbo-0125', model_parameters={'top_p': 1, 'max_tokens': 'inf', 'temperature': 1, 'presence_penalty': 0, 'frequency_penalty': 0}, input=[{'role': 'system', 'content': 'You are a poet. Create a poem about this city.'}, {'role': 'user', 'content': 'The capital of Bulgaria is Sofia.'}], version=None, metadata=None, output={'role': 'assistant', 'content': 'In the heart of Bulgaria, Sofia stands tall,\nA city of history, beauty, and all.\nWith cathedrals that touch the sky,\nAnd streets where ancient tales lie.\n\nThe Vitosha Mountain, a majestic view,\nGuarding the city with skies so blue.\nParks and gardens in bloom all 

In [17]:
from langchain.evaluation import load_evaluator
from langchain_openai import OpenAI
from langchain.evaluation.criteria import LabeledCriteriaEvalChain

def get_evaluator_for_key(key: str):
  """Build a LangChain "criteria" evaluator for a single criterion.

  Args:
    key: Criterion name, passed to ``load_evaluator`` as ``criteria``.

  Returns:
    The evaluator created by ``load_evaluator``, backed by an ``OpenAI`` LLM
    at temperature 0 that uses the model named in the ``EVAL_MODEL``
    environment variable.
  """
  eval_model = os.environ.get('EVAL_MODEL')
  judge_llm = OpenAI(temperature=0, model=eval_model)
  return load_evaluator("criteria", criteria=key, llm=judge_llm)

def get_hallucination_eval():
  """Create the labeled-criteria chain used for the hallucination check.

  Returns:
    The object produced by ``LabeledCriteriaEvalChain.from_llm`` for a single
    custom "hallucination" criterion, backed by an ``OpenAI`` LLM at
    temperature 0 that uses the model named in the ``EVAL_MODEL`` environment
    variable.
  """
  judge_llm = OpenAI(temperature=0, model=os.environ.get('EVAL_MODEL'))

  # Custom criterion: the question the judge model is asked about each answer.
  hallucination_criteria = {
    "hallucination": "Does this submission contain information not present in the input or reference?",
  }

  return LabeledCriteriaEvalChain.from_llm(llm=judge_llm, criteria=hallucination_criteria)

In [18]:
def execute_eval_and_score():
  """Evaluate every fetched generation on the enabled criteria and upload scores.

  Reads the notebook-level ``EVAL_TYPES`` flags and ``generations`` list. Each
  enabled criterion except "hallucination" (which has its own reference-based
  evaluation) gets one evaluator, reused for all generations. Every evaluation
  result is printed; a Langfuse score is recorded only when the result carries
  a score.
  """
  # Nothing to evaluate: avoid constructing LLM clients for no reason.
  if not generations:
    return

  # Loop-invariant work: build each evaluator once, not per generation.
  evaluators = {
    criterion: get_evaluator_for_key(criterion)
    for criterion, enabled in EVAL_TYPES.items()
    if enabled and criterion != "hallucination"
  }

  for generation in generations:
    for criterion, evaluator in evaluators.items():
      eval_result = evaluator.evaluate_strings(
        prediction=generation.output,
        input=generation.input,
      )
      print(eval_result)

      # The score can be missing when no verdict was produced; skip the upload
      # in that case, mirroring the guard used by the hallucination evaluation.
      if eval_result is None or eval_result.get("score") is None:
        continue

      langfuse.score(
        name=criterion,
        trace_id=generation.trace_id,
        observation_id=generation.id,
        value=eval_result["score"],
        comment=eval_result.get("reasoning"),
      )

# Score every fetched generation on the enabled criteria and upload the results.
execute_eval_and_score()


NotFoundError: Error code: 404 - {'error': {'message': 'The model `text-davinci-003` has been deprecated, learn more here: https://platform.openai.com/docs/deprecations', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}

In [12]:
# hallucination


def eval_hallucination():
  """Run the hallucination check on each fetched generation and record scores.

  Uses the chain from ``get_hallucination_eval`` and the notebook-level
  ``generations`` list. The generation's input serves both as the evaluation
  input and as the reference. Each result is printed; a Langfuse score named
  'hallucination' is recorded only when the result, its score and its
  reasoning are all present.
  """
  hallucination_chain = get_hallucination_eval()

  for gen in generations:
    verdict = hallucination_chain.evaluate_strings(
      prediction=gen.output,
      input=gen.input,
      reference=gen.input,
    )
    print(verdict)

    # Nothing to upload when the evaluation is missing or incomplete.
    if verdict is None or verdict["score"] is None or verdict["reasoning"] is None:
      continue

    langfuse.score(
      name='hallucination',
      trace_id=gen.trace_id,
      observation_id=gen.id,
      value=verdict["score"],
      comment=verdict['reasoning'],
    )

In [13]:
# Run the hallucination check only when it is enabled. A plain truthiness test
# (rather than `== True`) matches how the other EVAL_TYPES flags are read.
if EVAL_TYPES.get("hallucination"):
  eval_hallucination()

In [14]:
# The SDK sends events asynchronously; flush so that all queued requests
# (including the scores created above) are delivered before the notebook ends.
langfuse.flush()