# Langchain evaluator

Example usage of a custom evaluator that uses EvalMyAi to evaluate a dataset on LangSmith.

In [None]:
from langsmith import Client
from evalmyai import Evaluator, AzureAuth, OpenAIAuth
from langsmith.utils import LangSmithConflictError

## Azure auth
# Placeholder settings for the Azure OpenAI deployment; they are passed to
# AzureAuth further down. Replace every "XXXX" value with your own before
# running, and avoid committing real keys.
azure_endpoint = "https://XXXXXXXXXXXXXXXXX.openai.azure.com/"  # Replace with your Azure endpoint
azure_api_key = "XXXXXXXXXXXXXXXXXXXXXX"  # Replace with your Azure OpenAI API key
azure_api_version = "2024-08-01-preview"  # Replace with the desired API version
azure_deployment_name = "XXXXXXXXXXXXXXX"  # Replace with your model deployment name

# ## OpenAi auth
# Alternative to the Azure settings above; uncomment together with the
# OpenAIAuth cell further down.
# openAi_api_key = "XXXX"
# openAi_model = "gpt-4" # select your model, we strongly recommend GPT-4.


# Token handed to Evaluator as its second argument; replace with your own.
evalMyAi_token = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"


# Evaluation dataset.
# Maps each question to a pair of answers:
#   [0] the reference answer, uploaded as the expected output of the example;
#   [1] the answer that dummy_chatbot returns for that question.
data = {
    "How do astronauts adapt to long-term space travel?": ("They exercise, follow strict diets, and get psychological support to stay healthy in microgravity.", "Astronauts maintain fitness and mental health through exercise, diet, and communication with loved ones."),
    "What are the main contributors to greenhouse gas emissions?": ("Burning fossil fuels, deforestation, and industrial activities release carbon dioxide and methane.", "Fossil fuels and deforestation contribute to greenhouse gases, trapping heat and causing climate change."),
    "How is artificial intelligence transforming the healthcare industry?": ("AI improves diagnosis, automates tasks, and assists in surgeries with precision.", "AI speeds up medical work, helping doctors diagnose and treat patients more efficiently."),
    "What are the long-term effects of chronic stress on the human body?": ("Chronic stress weakens the immune system, raises heart disease risk, and affects mental health.", "Stress over time can harm the body, leading to illness and anxiety."),
    "How did the Industrial Revolution change global economies?": ("It boosted production, increased trade, and led to urbanization and factory jobs.", "Machines and factories changed how goods were made and where people lived."),
    "What are the ethical concerns surrounding facial recognition technology?": ("Privacy invasion, bias, and potential misuse in surveillance are key concerns.", "Some worry it violates privacy and unfairly targets certain groups."),
    "How do copyright laws protect digital content creators?": ("They grant exclusive rights, preventing unauthorized use or reproduction of creative work.", "Copyright lets creators control their work and stops others from copying it."),
    "What are the key differences between cloud computing and traditional computing?": ("Cloud computing offers remote access and scalability, while traditional computing uses local hardware.", "Cloud computing is flexible and online, while traditional computing relies on personal devices."),
    "Why is Python often recommended for beginners in programming?": ("It has simple syntax, is easy to learn, and is widely used in various fields.", "Python is beginner-friendly because it’s readable and used in many industries."),
    "In what types of projects is the waterfall model most effective?": ("It works best for projects with clear, fixed requirements and little need for changes.", "Waterfall is ideal when every step must be planned and followed strictly."),
    "How do vaccines work to protect against diseases?": ("They stimulate the immune system to recognize and fight viruses before infection occurs.", "Vaccines train the body to fight diseases by exposing it to harmless virus parts."),
    "Why is the James Webb Space Telescope important for space exploration?": ("It detects infrared light, allowing scientists to study distant galaxies and exoplanets.", "Webb helps explore deep space by capturing detailed images of stars and planets."),
    "What are the advantages and challenges of solar power?": ("It’s renewable and reduces emissions but depends on weather and requires storage.", "Solar energy is clean but costly and needs sunlight to work well."),
    "How does reinforcement learning differ from supervised learning?": ("Reinforcement learning learns from rewards, while supervised learning uses labeled data.", "One learns by trial and error; the other learns from pre-labeled examples."),
    "What are common types of cyber threats and how can they be prevented?": ("Phishing, malware, and data breaches can be prevented with strong passwords and updates.", "Hackers use malware and scams; security measures like updates help prevent attacks."),
    "How does blockchain technology ensure secure transactions?": ("It uses decentralization, cryptography, and consensus mechanisms to prevent fraud.", "Blockchain keeps records safe by using encryption and a distributed ledger."),
    "What are the potential benefits and risks of gene editing?": ("It can cure diseases but raises ethical concerns about unintended effects.", "Gene editing may fix genetic issues but could have unknown consequences."),
    "How is big data analytics being used in modern businesses?": ("It helps companies analyze trends, detect fraud, and improve decision-making.", "Big data allows businesses to predict customer behavior and enhance efficiency."),
    "What role does CRISPR play in genetic engineering?": ("CRISPR edits DNA precisely and is used in medicine and agriculture.", "CRISPR helps scientists change genes, possibly curing diseases."),
    "How does inflation impact the purchasing power of consumers?": ("It raises prices, making money worth less over time.", "Inflation makes goods more expensive, reducing what people can afford.")
}

# Pair every question with its reference answer (the first item of each tuple).
transformed_data = [(question, data[question][0]) for question in data]

# Parallel lists for the bulk example upload: one entry per question, all in
# the dictionary's iteration order so that index i of each list belongs to the
# same example.
inputs = [{"question": question} for question in data]
outputs = [{"answer": answer_pair[0]} for answer_pair in data.values()]
metadata = [{"source": "EvalMyAi"} for _ in range(len(data))]


# Connect to LangSmith and make sure the example dataset exists.
# NOTE(review): Client() is built without arguments, so it presumably picks up
# its credentials from the environment - confirm against the LangSmith docs.
client = Client()
dataset_name = "EvalMyAi dataset"
dataset = None

# Only create_dataset is guarded: a conflict there means the dataset is left
# over from an earlier run. Keeping the upload out of the try body ensures a
# conflict raised while uploading examples is not misreported as
# "Dataset already exists." and silently ignored.
try:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Examplary EvalMyAi usage.",
    )
except LangSmithConflictError:
    # Reuse the existing dataset (and the examples it already holds) rather
    # than uploading the same examples a second time.
    print("Dataset already exists.")
    dataset = next(client.list_datasets(dataset_name=dataset_name))
else:
    # Freshly created dataset: upload all question/answer examples in bulk.
    client.create_examples(
        inputs=inputs,
        outputs=outputs,
        metadata=metadata,
        dataset_id=dataset.id,
    )


Dataset already exists.


Simulating chatbot:

In [2]:
def dummy_chatbot(inputs: dict) -> dict:
    """Simulate a chatbot by replaying the canned answers stored in ``data``.

    Args:
        inputs: Example inputs; the question is read from the "question" key.

    Returns:
        A dict with a single "answer" key holding the second answer stored
        for the question in ``data``, or "I don't know the answer." when the
        question is missing or not present in ``data``.
    """
    question = inputs.get("question")

    # Direct dictionary lookup instead of scanning every entry for a match.
    try:
        answers = data.get(question)
    except TypeError:
        # An unhashable question cannot be a key of ``data``; treat it as
        # unknown, exactly as a failed equality scan would.
        answers = None

    if answers is None:
        return {"answer": "I don't know the answer."}

    _, response = answers
    return {"answer": response}

Create the EvalMyAi evaluator using Azure or OpenAI:

In [3]:
# Bundle the Azure OpenAI settings defined at the top of the notebook into an
# auth object.
auth_azure = AzureAuth(
    api_key=azure_api_key,
    azure_endpoint=azure_endpoint,
    api_version=azure_api_version,
    azure_deployment=azure_deployment_name,
)

# Evaluator instance used by the metric functions; it takes the auth object
# and the EvalMyAi token.
ev = Evaluator(auth_azure, evalMyAi_token)

In [4]:
# open_ai auth
# auth_open_ai = OpenAIAuth (
#     api_key= openAi_api_key,
#     model= openAi_model 
# )

# ev = Evaluator(auth_open_ai, evalMyAi_token)

Define a LangSmith evaluator function for each metric:

In [None]:
def f1(outputs: dict, reference_outputs: dict) -> int:
    """Evaluate one answer with the EvalMyAi "f1" symbol.

    Args:
        outputs: Produced outputs; the "answer" entry is sent as the actual
            text.
        reference_outputs: Reference outputs; the "answer" entry is sent as
            the expected text.

    Returns:
        The value stored under ["f1"]["scores"]["f1"] in the evaluation
        result.
        NOTE(review): annotated as int, but an F1 score is usually
        fractional - confirm against the EvalMyAi response format.
    """
    payload = {
        "expected": reference_outputs["answer"],
        "actual": outputs["answer"],
    }
    result = ev.evaluate(payload, symbols=["f1"])
    return result["f1"]["scores"]["f1"]


def contradictions(outputs: dict, reference_outputs: dict) -> int:
    """Evaluate one answer with the EvalMyAi "contradictions" symbol.

    Args:
        outputs: Produced outputs; the "answer" entry is sent as the actual
            text.
        reference_outputs: Reference outputs; the "answer" entry is sent as
            the expected text.

    Returns:
        The value stored under ["contradictions"]["scores"]["score"] in the
        evaluation result.
    """
    payload = {
        "expected": reference_outputs["answer"],
        "actual": outputs["answer"],
    }
    result = ev.evaluate(payload, symbols=["contradictions"])
    return result["contradictions"]["scores"]["score"]

Run the LangSmith evaluation:

In [None]:
# Run dummy_chatbot against the dataset and score its answers with both
# EvalMyAi metric functions. max_concurrency is set to 1.
# NOTE(review): presumably this makes examples run one at a time - confirm
# against the LangSmith evaluate() documentation.
experiment_results = client.evaluate(
    dummy_chatbot,
    data=dataset,
    evaluators=[f1, contradictions],
    experiment_prefix="evalMyAi",
    max_concurrency=1,
)