In [None]:
!pip show langchain --version

In [1]:
import os
import nest_asyncio
import pandas as pd
from dotenv import find_dotenv, load_dotenv
from langsmith import Client
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.smith import RunEvalConfig, run_on_dataset

# Avoid "RuntimeError: This event loop is already running" in Jupyter notebooks
# by patching asyncio to allow nested event loops.

nest_asyncio.apply()

In [2]:
# Load variables from the .env file located by find_dotenv() into the environment.
load_dotenv(find_dotenv())

# Fail fast when the LangSmith API key is missing. Wrapping os.getenv() in str()
# would turn a missing key into the literal string "None" and store that as the key.
langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
if not langchain_api_key:
    raise RuntimeError(
        "LANGCHAIN_API_KEY is not set; add it to your .env file or environment."
    )

# LangSmith tracing configuration used by the rest of the notebook.
os.environ["LANGCHAIN_API_KEY"] = langchain_api_key
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "langsmith-1"

In [3]:
# Chat model that is evaluated throughout the notebook (default settings)
llm = ChatOpenAI()

# LangSmith client used to create datasets and to launch evaluation runs
client = Client()

In [None]:
# 1. Create a Dataset (Only Inputs, No Output)

dataset_name = "Misc tasks dataset"

# Prompts only; no reference answers are attached to these examples.
example_inputs = [
    "Complete the following Python function that computes the factorial of a number: \ndef factorial(n):",
    "Summarize the following paragraph: \'Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of \"intelligent agents\": any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals.",
    "Generate a short poem about the beauty of nature.",
    "You are in a dark room with a single door. There's a switch next to the door. What do you do?",
]

# Keeping the prompts in a dataset allows chains and LLMs
# to be run over the same shared set of examples.
dataset = client.create_dataset(
    description="Few diverse tasks",
    dataset_name=dataset_name,
)

# Every example carries its own inputs; outputs are optional and left empty here.
for prompt in example_inputs:
    client.create_example(
        dataset_id=dataset.id,
        inputs={"question": prompt},
        outputs=None,
    )

In [None]:
# 2. Evaluate Datasets with LLM

# Custom criterion expressed as a {name: description} mapping
short_informative_criterion = {
    "short_informative": "Are the answers short and informative? "
    "Respond Y if they are, N if they're not short and informative."
}

criteria_evaluators = [
    # An evaluator referenced by name only;
    # here the default criterion is "helpfulness"
    "criteria",
    # Evaluators configured with an explicit criterion
    RunEvalConfig.Criteria("harmfulness"),
    RunEvalConfig.Criteria("misogyny"),
    RunEvalConfig.Criteria(short_informative_criterion),
]

eval_config = RunEvalConfig(evaluators=criteria_evaluators)

run_on_dataset(
    evaluation=eval_config,
    llm_or_chain_factory=llm,
    dataset_name=dataset_name,
    client=client,
)

In [None]:
# 1. Create a Dataset From a List of Examples (Key-Value Pairs)

dataset_name = "Tasks and Answers"

# Each entry is a (question, reference answer) pair.
example_inputs = [
    (
        "Complete the following Python function that computes the factorial of a number: \ndef factorial(n):",
        "def factorial(n): \n if n == 0: \n return 1 \n else: \n return n * factorial(n-1)",
    ),
    (
        "Summarize the following paragraph: 'Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of \"intelligent agents\": any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals.",
        "AI is machine intelligence as opposed to natural human or animal intelligence. It's defined as the study of devices that act intelligently to achieve their goals.",
    ),
    (
        "You are in a dark room with a single door. There's a switch next to the door. What do you do?",
        "I would flip the switch to see if it turns on a light.",
    ),
    (
        "Convert the following statement into a question: 'The Eiffel Tower is located in Paris.'",
        "Where is the Eiffel Tower located?",
    ),
]

dataset = client.create_dataset(
    description="Questions and answers about diverse tasks",
    dataset_name=dataset_name,
)

# Store every pair as one example: the question as input, the answer as output.
for question, answer in example_inputs:
    client.create_example(
        dataset_id=dataset.id,
        inputs={"question": question},
        outputs={"answer": answer},
    )

In [None]:
# 2. Create a Dataset From a Dataframe

# Turn the (question, answer) pairs into a two-column dataframe and preview it
df_dataset = pd.DataFrame(data=example_inputs, columns=["Question", "Answer"])
df_dataset.head(n=5)

In [None]:
# Dataframe columns holding the example inputs and the reference outputs
input_keys, output_keys = ["Question"], ["Answer"]

# Upload the dataframe to LangSmith as a dataset
dataset = client.upload_dataframe(
    name="Tasks Dataframe Dataset",
    description="Dataset created from a dataframe",
    df=df_dataset,
    input_keys=input_keys,
    output_keys=output_keys,
    data_type="kv",  # The default
)

In [None]:
# 3. Create a Dataset From a CSV File

# Save the Dataframe as a CSV File

csv_path = "./data/dataset.csv"
# to_csv() does not create missing parent directories, so make sure the
# target folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
df_dataset.to_csv(csv_path, index=False)

# Create Dataset

dataset = client.upload_csv(
    csv_file=csv_path,
    input_keys=input_keys,
    output_keys=output_keys,
    name="Tasks CSV Dataset",
    description="Dataset created from a CSV file",
    data_type="kv",
)


In [None]:
# 1. Evaluate Datasets That Contain Labels

qa_evaluators = [
    "qa",  # correctness: is the answer right or wrong
    "context_qa",  # refers to the example outputs
    "cot_qa",  # context_qa plus reasoning
]

evaluation_config = RunEvalConfig(evaluators=qa_evaluators)

run_on_dataset(
    evaluation=evaluation_config,
    llm_or_chain_factory=llm,
    dataset_name="Tasks and Answers",
    client=client,
)


In [None]:
# 2. Evaluate Datasets with customized criteria

# An arbitrary criterion is defined as a name -> description entry in the criteria dict
helpfulness_criterion = {
    "helpfulness": (
        "Is this submission helpful to the user,"
        " taking into account the correct reference answer?"
    )
}

evaluation_config = RunEvalConfig(
    evaluators=[RunEvalConfig.LabeledCriteria(helpfulness_criterion)]
)

run_on_dataset(
    evaluation=evaluation_config,
    llm_or_chain_factory=llm,
    dataset_name="Tasks CSV Dataset",
    client=client,
)


In [None]:
# 3. Evaluate Datasets without Labels with a custom evaluator to compute the Flesch-Kincaid grade level

from typing import Optional

from evaluate import load
from langsmith.evaluation import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run

import time
import textstat

class FKEvaluator(RunEvaluator):
    """Custom run evaluator that scores generated text with ``textstat.flesch_kincaid_grade``."""

    def __init__(self):
        # Keep the scoring function on the instance so evaluate_run can call it.
        self.metric_fn = textstat.flesch_kincaid_grade

    def evaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> EvaluationResult:
        """Score the text of the first generation stored in ``run.outputs``.

        Args:
            run: The run whose outputs are scored.
            example: Not used by this evaluator.

        Returns:
            An ``EvaluationResult`` with key ``"flesch_kincaid_grade"`` and the
            value returned by ``self.metric_fn`` as its score.

        Raises:
            ValueError: If ``run.outputs`` is ``None``.
        """
        if run.outputs is None:
            raise ValueError("Run outputs cannot be None")
        first_generation = run.outputs['generations'][0][0]
        grade = self.metric_fn(first_generation['text'])
        return EvaluationResult(key="flesch_kincaid_grade", score=grade)

llm_criteria_evaluators = [
    # An arbitrary criterion given as a name -> description entry in the criteria dict
    RunEvalConfig.Criteria(
        {"creativity_correct": "Is this answers creative and correct?"}
    ),
    # A simple default criterion such as "conciseness" can be referenced by name
    RunEvalConfig.Criteria("conciseness"),
]

# Combine the criteria evaluators with the custom FKEvaluator instance
evaluation_config = RunEvalConfig(
    evaluators=llm_criteria_evaluators,
    custom_evaluators=[FKEvaluator()],
)

run_on_dataset(
    evaluation=evaluation_config,
    llm_or_chain_factory=llm,
    dataset_name="Misc tasks dataset",
    client=client,
)

In [None]:
# 4. Evaluate Datasets Based on Cosine Distance Criteria

embedding_evaluators = [
    # Embedding-distance evaluator selected by name
    "embedding_distance",
    # Or to customize the embeddings:
    # Requires 'pip install sentence_transformers'
    # RunEvalConfig.EmbeddingDistance(embeddings=HuggingFaceEmbeddings(), distance_metric="cosine"),
]

evaluation_config = RunEvalConfig(evaluators=embedding_evaluators)

run_on_dataset(
    evaluation=evaluation_config,
    llm_or_chain_factory=llm,
    dataset_name="Tasks Dataframe Dataset",
    client=client,
)

In [None]:
# 5. Evaluate Datasets Based on String Distance Criteria
# Jaro-Winkler Similarity Distance: 0 = Exact Match, 1 = No Similarity

string_evaluators = [
    # String-distance evaluator selected by name
    "string_distance",
    # Or to customize the distance metric:
    # RunEvalConfig.StringDistance(distance="levenshtein", normalize_score=True),
]

evaluation_config = RunEvalConfig(evaluators=string_evaluators)

run_on_dataset(
    evaluation=evaluation_config,
    llm_or_chain_factory=llm,
    dataset_name="Tasks Dataframe Dataset",
    client=client,
)