# Import Modules

In [1]:
# Check LangChain Version

# !pip install --upgrade langchain
!pip show langchain --version

Name: langchain
Version: 0.0.271
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /Users/daveebbelaar/opt/anaconda3/envs/ai-experiments/lib/python3.10/site-packages
Requires: aiohttp, async-timeout, dataclasses-json, langsmith, numexpr, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: 


In [1]:
import os
import nest_asyncio
import pandas as pd
from dotenv import find_dotenv, load_dotenv
from langsmith import Client
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.smith import RunEvalConfig, run_on_dataset

# To Avoid the Error on Jupyter Notebook (RuntimeError: This Event Loop Is Already Running)
# Patch Asyncio To Allow Nested Event Loops

nest_asyncio.apply()

# Load API Keys From the .env File

In [2]:
load_dotenv(find_dotenv())
os.environ["LANGCHAIN_API_KEY"] = str(os.getenv("LANGSMITH_API_KEY"))
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "langsmith-tutorial"

# LangSmith Quick Start

In [3]:
# Load the LangSmith Client and Test Run

client = Client()

llm = ChatOpenAI()
llm.predict("Hello, world!")

  warn_deprecated(
  warn_deprecated(


'Hello! How can I assist you today?'

# Evaluation Quick Start

In [4]:
# 1. Create a Dataset (Only Inputs, No Output)

example_inputs = [
    "a rap battle between Atticus Finch and Cicero",
    "a rap battle between Barbie and Oppenheimer",
    "a Pythonic rap battle between two swallows: one European and one African",
    "a rap battle between Aubrey Plaza and Stephen Colbert",
]

dataset_name = "Rap Battle Dataset"

# Storing inputs in a dataset lets us
# run chains and LLMs over a shared set of examples.
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Rap battle prompts.",
)

for input_prompt in example_inputs:
    # Each example must be unique and have inputs defined.
    # Outputs are optional
    client.create_example(
        inputs={"question": input_prompt},
        outputs=None,
        dataset_id=dataset.id,
    )

In [5]:
# 2. Evaluate Datasets with LLM

eval_config = RunEvalConfig(
    evaluators=[
        # You can specify an evaluator by name/enum.
        # In this case, the default criterion is "helpfulness"
        "criteria",
        # Or you can configure the evaluator
        RunEvalConfig.Criteria("harmfulness"),
        RunEvalConfig.Criteria("misogyny"),
        RunEvalConfig.Criteria(
            {
                "cliche": "Are the lyrics cliche? "
                "Respond Y if they are, N if they're entirely unique."
            }
        ),
    ]
)

run_on_dataset(
    client=client,
    dataset_name=dataset_name,
    llm_or_chain_factory=llm,
    evaluation=eval_config,
)

View the evaluation results for project 'excellent-rate-30' at:
https://smith.langchain.com/o/1341bdf8-127d-5676-99a5-1e504f0c9326/datasets/ee2af03a-8017-4f6d-964e-d0393e0d9834/compare?selectedSessions=8fe4ab38-fb1d-4b18-8143-01105483ebba

View all tests for Dataset Rap Battle Dataset at:
https://smith.langchain.com/o/1341bdf8-127d-5676-99a5-1e504f0c9326/datasets/ee2af03a-8017-4f6d-964e-d0393e0d9834
[------------------------------------------------->] 4/4

{'project_name': 'excellent-rate-30',
 'results': {'4f2a9106-ec73-41f9-be13-5ec85ce4abf4': {'input': {'question': 'a rap battle between Aubrey Plaza and Stephen Colbert'},
   'feedback': [EvaluationResult(key='helpfulness', score=1, value='Y', comment='The criterion for this task is "helpfulness: Is the submission helpful, insightful, and appropriate?"\n\n1. Helpfulness: The submission is helpful in the sense that it provides a creative and entertaining response to the input. It imagines a rap battle between Aubrey Plaza and Stephen Colbert, which is what the input requested.\n\n2. Insightfulness: The submission is insightful as it uses known characteristics of both Aubrey Plaza and Stephen Colbert to create their respective rap verses. It shows an understanding of their public personas and incorporates that into the rap battle.\n\n3. Appropriateness: The submission is appropriate as it sticks to the task given, which was to create a rap battle between Aubrey Plaza and Stephen Colbert.

# Different Ways of Creating Datasets in LangSmith

In [6]:
# 1. Create a Dataset From a List of Examples (Key-Value Pairs)

example_inputs = [
    ("What is the largest mammal?", "The blue whale"),
    ("What do mammals and birds have in common?", "They are both warm-blooded"),
    ("What are reptiles known for?", "Having scales"),
    (
        "What's the main characteristic of amphibians?",
        "They live both in water and on land",
    ),
]

dataset_name = "Elementary Animal Questions"

dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Questions and answers about animal phylogenetics.",
)

for input_prompt, output_answer in example_inputs:
    client.create_example(
        inputs={"question": input_prompt},
        outputs={"answer": output_answer},
        dataset_id=dataset.id,
    )

In [7]:
# 2. Create a Dataset From Existing Runs
# i.e. you log runs from 3pm to 4pm, you may have 100 prompts and output
# you can use those prompts and outputs to create a dataset.

dataset_name = "Example Dataset"

# Filter runs to add to the dataset
runs = client.list_runs(
    project_name="langsmith-tutorial",
    execution_order=1,
    error=False,
)

dataset = client.create_dataset(dataset_name, description="An example dataset")

for run in runs:
    client.create_example(
        inputs=run.inputs,
        outputs=run.outputs,
        dataset_id=dataset.id,
    )

In [8]:
# 3. Create a Dataset From a Dataframe

# Create a Dataframe

example_inputs = [
    ("What is the largest mammal?", "The blue whale"),
    ("What do mammals and birds have in common?", "They are both warm-blooded"),
    ("What are reptiles known for?", "Having scales"),
    (
        "What's the main characteristic of amphibians?",
        "They live both in water and on land",
    ),
]

df_dataset = pd.DataFrame(example_inputs, columns=["Question", "Answer"])
df_dataset.head()

Unnamed: 0,Question,Answer
0,What is the largest mammal?,The blue whale
1,What do mammals and birds have in common?,They are both warm-blooded
2,What are reptiles known for?,Having scales
3,What's the main characteristic of amphibians?,They live both in water and on land


In [9]:
input_keys = ["Question"]
output_keys = ["Answer"]

# Create Dataset

dataset = client.upload_dataframe(
    df=df_dataset,
    input_keys=input_keys,
    output_keys=output_keys,
    name="My Dataframe Dataset",
    description="Dataset created from a dataframe",
    data_type="kv",  # The default
)

In [10]:
# 4. Create a Dataset From a CSV File

# Save the Dataframe as a CSV File

csv_path = "../data/dataset.csv"
df_dataset.to_csv(csv_path, index=False)

# Create Dataset

dataset = client.upload_csv(
    csv_file=csv_path,
    input_keys=input_keys,
    output_keys=output_keys,
    name="My CSV Dataset",
    description="Dataset created from a CSV file",
    data_type="kv",
)

# Correctness: LangSmith Question-Answer Evaluation

In [11]:
# 1. Evaluate Datasets That Contain Labels

evaluation_config = RunEvalConfig(
    evaluators=[
        "qa",  # correctness: right or wrong
        "context_qa",  # refer to example outputs
        "cot_qa",  # chain_of_thought context_qa + reasoning
    ]
)

run_on_dataset(
    client=client,
    dataset_name="Elementary Animal Questions",
    llm_or_chain_factory=llm,
    evaluation=evaluation_config,
)

View the evaluation results for project 'artistic-design-19' at:
https://smith.langchain.com/o/1341bdf8-127d-5676-99a5-1e504f0c9326/datasets/bb24903f-f2bd-42a3-a3da-9e2550b50f9c/compare?selectedSessions=f4892626-80b7-47c4-801f-6d3b6dcfab28

View all tests for Dataset Elementary Animal Questions at:
https://smith.langchain.com/o/1341bdf8-127d-5676-99a5-1e504f0c9326/datasets/bb24903f-f2bd-42a3-a3da-9e2550b50f9c
[------------------------------------------------->] 4/4

{'project_name': 'artistic-design-19',
 'results': {'b91d4b8f-5517-452a-9d91-402c7b0707dd': {'input': {'question': "What's the main characteristic of amphibians?"},
   'feedback': [EvaluationResult(key='correctness', score=1, value='CORRECT', comment='CORRECT', correction=None, evaluator_info={'__run': RunInfo(run_id=UUID('721fcc55-bb38-483e-88c9-c4e34fd88a33'))}, source_run_id=None, target_run_id=None),
    EvaluationResult(key='Contextual Accuracy', score=1, value='CORRECT', comment='CORRECT', correction=None, evaluator_info={'__run': RunInfo(run_id=UUID('b950be5d-e34e-48aa-b0d3-027708027ea9'))}, source_run_id=None, target_run_id=None),
    EvaluationResult(key='COT Contextual Accuracy', score=1, value='CORRECT', comment="The student's answer is factually correct. The main characteristic of amphibians, as stated in the context, is that they live both in water and on land. The student's answer expands on this by explaining that amphibians have a dual life cycle, spending part of their

In [12]:
# 2. Evaluate Datasets With Customized Criterias

evaluation_config = RunEvalConfig(
    evaluators=[
        # You can define an arbitrary criterion as a key: value pair in the criteria dict
        RunEvalConfig.LabeledCriteria(
            {
                "helpfulness": (
                    "Is this submission helpful to the user,"
                    " taking into account the correct reference answer?"
                )
            }
        ),
    ]
)

run_on_dataset(
    client=client,
    dataset_name="Elementary Animal Questions",
    llm_or_chain_factory=llm,
    evaluation=evaluation_config,
)

View the evaluation results for project 'shiny-linen-48' at:
https://smith.langchain.com/o/1341bdf8-127d-5676-99a5-1e504f0c9326/datasets/bb24903f-f2bd-42a3-a3da-9e2550b50f9c/compare?selectedSessions=9529ff4d-75ea-45ee-9476-86798138a3d5

View all tests for Dataset Elementary Animal Questions at:
https://smith.langchain.com/o/1341bdf8-127d-5676-99a5-1e504f0c9326/datasets/bb24903f-f2bd-42a3-a3da-9e2550b50f9c
[------------------------------------------------->] 4/4

{'project_name': 'shiny-linen-48',
 'results': {'b91d4b8f-5517-452a-9d91-402c7b0707dd': {'input': {'question': "What's the main characteristic of amphibians?"},
   'feedback': [EvaluationResult(key='helpfulness', score=1, value='Y', comment="The criterion for this task is the helpfulness of the AI's submission, taking into account the correct reference answer. \n\nThe reference answer states that the main characteristic of amphibians is that they live both in water and on land. \n\nThe AI's submission expands on this by explaining that amphibians have a dual life cycle, starting their lives in water as larvae with gills, then undergoing metamorphosis to develop lungs and live on land as adults. This is essentially a more detailed explanation of the reference answer. \n\nThe AI's submission also adds that amphibians have moist, permeable skin that allows them to breathe through their skin in addition to their lungs. This is additional information that is not included in the reference an

In [13]:
# 3. Evaluate Datasets Without Labels

evaluation_config = RunEvalConfig(
    evaluators=[
        # You can define an arbitrary criterion as a key: value pair in the criteria dict
        RunEvalConfig.Criteria(
            {"creativity": "Is this submission creative, imaginative, or novel?"}
        ),
        # We provide some simple default criteria like "conciseness" you can use as well
        RunEvalConfig.Criteria("conciseness"),
    ]
)

run_on_dataset(
    client=client,
    dataset_name="Rap Battle Dataset",
    llm_or_chain_factory=llm,
    evaluation=evaluation_config,
)

View the evaluation results for project 'diligent-event-39' at:
https://smith.langchain.com/o/1341bdf8-127d-5676-99a5-1e504f0c9326/datasets/ee2af03a-8017-4f6d-964e-d0393e0d9834/compare?selectedSessions=a0c4330f-9f75-4903-988a-e5ea1ec9adaa

View all tests for Dataset Rap Battle Dataset at:
https://smith.langchain.com/o/1341bdf8-127d-5676-99a5-1e504f0c9326/datasets/ee2af03a-8017-4f6d-964e-d0393e0d9834
[------------------------------------------------->] 4/4

{'project_name': 'diligent-event-39',
 'results': {'4f2a9106-ec73-41f9-be13-5ec85ce4abf4': {'input': {'question': 'a rap battle between Aubrey Plaza and Stephen Colbert'},
   'feedback': [EvaluationResult(key='creativity', score=1, value='Y', comment='The criterion to be assessed is creativity. This involves determining whether the submission is creative, imaginative, or novel.\n\nLooking at the submission, it is a rap battle between Aubrey Plaza and Stephen Colbert. The AI has created unique verses for each participant, reflecting their personalities and careers. Aubrey Plaza is portrayed as a fierce, sassy competitor, while Stephen Colbert is depicted as a witty, determined opponent. The verses are not generic and seem to be specifically tailored for the characters involved.\n\nThe rap battle format itself is also a creative approach to the task. It\'s not a straightforward conversation or debate, but a rhythmic, rhyming exchange of words. This adds an element of novelty and imaginat

In [14]:
# 4. Evaluate Datasets Based on Cosine Distance Criteria
# Cosine Distance: Ranged Between 0 to 1. 0 = More Similar

evaluation_config = RunEvalConfig(
    evaluators=[
        # You can define an arbitrary criterion as a key: value pair in the criteria dict
        "embedding_distance",
        # Or to customize the embeddings:
        # Requires 'pip install sentence_transformers'
        # RunEvalConfig.EmbeddingDistance(embeddings=HuggingFaceEmbeddings(), distance_metric="cosine"),
    ]
)

run_on_dataset(
    client=client,
    dataset_name="Elementary Animal Questions",
    llm_or_chain_factory=llm,
    evaluation=evaluation_config,
)

View the evaluation results for project 'flowery-look-3' at:
https://smith.langchain.com/o/1341bdf8-127d-5676-99a5-1e504f0c9326/datasets/bb24903f-f2bd-42a3-a3da-9e2550b50f9c/compare?selectedSessions=6839804a-4283-4f2a-acc7-4a5174c2339f

View all tests for Dataset Elementary Animal Questions at:
https://smith.langchain.com/o/1341bdf8-127d-5676-99a5-1e504f0c9326/datasets/bb24903f-f2bd-42a3-a3da-9e2550b50f9c


  warn_deprecated(


[------------------------------------------------->] 4/4

{'project_name': 'flowery-look-3',
 'results': {'b91d4b8f-5517-452a-9d91-402c7b0707dd': {'input': {'question': "What's the main characteristic of amphibians?"},
   'feedback': [EvaluationResult(key='embedding_cosine_distance', score=0.14963387840295794, value=None, comment=None, correction=None, evaluator_info={'__run': RunInfo(run_id=UUID('e178d390-9c36-4478-a661-6d6508c1ae13'))}, source_run_id=None, target_run_id=None)],
   'execution_time': 1.602044,
   'run_id': '4d0dc8ff-b931-4cba-a701-afe3111f9c5f',
   'output': AIMessage(content='The main characteristic of amphibians is their ability to live both on land and in water. They typically have moist skin that helps them to breathe through their skin, and they undergo metamorphosis from a larval stage to an adult stage. They also typically lay eggs in water.'),
   'reference': {'answer': 'They live both in water and on land'}},
  '647b3f69-6f04-4578-8577-5b323fc090ac': {'input': {'question': 'What are reptiles known for?'},
   'feedbac

In [15]:
# 5. Evaluate Datasets Based on String Distance Criteria
# Jaro-Winkler Similarity Distance: 0 = Exact Match, 1 = No Similarity

evaluation_config = RunEvalConfig(
    evaluators=[
        # You can define an arbitrary criterion as a key: value pair in the criteria dict
        "string_distance",
        # Or to customize the distance metric:
        # RunEvalConfig.StringDistance(distance="levenshtein", normalize_score=True),
    ]
)

run_on_dataset(
    client=client,
    dataset_name="Elementary Animal Questions",
    llm_or_chain_factory=llm,
    evaluation=evaluation_config,
)

View the evaluation results for project 'back-chance-35' at:
https://smith.langchain.com/o/1341bdf8-127d-5676-99a5-1e504f0c9326/datasets/bb24903f-f2bd-42a3-a3da-9e2550b50f9c/compare?selectedSessions=904602d8-fa35-496f-b54c-5a935bd3e438

View all tests for Dataset Elementary Animal Questions at:
https://smith.langchain.com/o/1341bdf8-127d-5676-99a5-1e504f0c9326/datasets/bb24903f-f2bd-42a3-a3da-9e2550b50f9c
[------------------------------------------------->] 4/4

{'project_name': 'back-chance-35',
 'results': {'b91d4b8f-5517-452a-9d91-402c7b0707dd': {'input': {'question': "What's the main characteristic of amphibians?"},
   'feedback': [EvaluationResult(key='jaro_winkler_distance', score=0.43281996813595325, value=None, comment=None, correction=None, evaluator_info={'__run': RunInfo(run_id=UUID('ccbbd620-5d4b-48a2-92db-9fda1a38f6ab'))}, source_run_id=None, target_run_id=None)],
   'execution_time': 1.635244,
   'run_id': 'b778a698-d73d-4c37-a2f5-d5ce7dc23b62',
   'output': AIMessage(content='The main characteristic of amphibians is that they have a dual life cycle, spending part of their lives in water and part on land. They typically have moist skin, lay eggs in water, and undergo metamorphosis from a larval stage (such as tadpoles) to an adult stage.'),
   'reference': {'answer': 'They live both in water and on land'}},
  '647b3f69-6f04-4578-8577-5b323fc090ac': {'input': {'question': 'What are reptiles known for?'},
   'feedback': [Evaluation