In [None]:
!pip install openai

In [None]:
import os
import nest_asyncio
import pandas as pd
import torch

from langsmith import Client
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.smith import RunEvalConfig, run_on_dataset

from transformers import AutoModelForCausalLM
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)

# Avoids "RuntimeError: This event loop is already running" in Jupyter:
# patch asyncio so that nested event loops are allowed.

nest_asyncio.apply()

In [None]:
# -- Bitsandbytes parameters --
# These settings are collected into `bnb_config`, which is passed as
# `quantization_config` to both `from_pretrained` calls in this notebook.

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models (name of a torch dtype attribute)
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

# Resolve the dtype name above to the torch attribute of the same name.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Chat model: load the base Llama-2 7B chat checkpoint with the shared
# quantization config and wrap it in a text-generation pipeline.
model_name = "NousResearch/Llama-2-7b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    cache_dir = '/mnt/artifacts/llama2-7b-chat',
    device_map='auto'
)

# NOTE(review): these two settings look carried over from a fine-tuning
# script — confirm they are wanted for inference-only use.
model.config.use_cache = False
model.config.pretraining_tp = 1

# NOTE(review): the tokenizer is loaded without the model's cache_dir and with
# trust_remote_code=True — confirm both are intended.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
pipe_llama7b_chat = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=750) # no device argument is passed here; placement presumably follows the model's device_map='auto'


# Fine-tuned model: loaded with the same quantization config as the base
# chat model, but cached in a separate directory.
ft_model_name ='subirmansukhani/llama-2-7b-miniguanaco'
ft_model = AutoModelForCausalLM.from_pretrained(
    ft_model_name,
    cache_dir = '/mnt/artifacts/llama2-7b-chat-ft',
    quantization_config=bnb_config,
    device_map='auto'
)




In [None]:
# Mirror the base-model setup for the fine-tuned checkpoint.
# NOTE(review): these two settings look carried over from a fine-tuning
# script — confirm they are wanted for inference-only use.
ft_model.config.use_cache = False
ft_model.config.pretraining_tp = 1

# The fine-tuned model uses its own tokenizer, loaded from the same repo id.
ft_tokenizer = AutoTokenizer.from_pretrained(ft_model_name, trust_remote_code=True)
pipe_llama7b_ft_chat = pipeline(task="text-generation", model=ft_model, tokenizer=ft_tokenizer, max_length=750) # no device argument is passed here; placement presumably follows the model's device_map='auto'

In [None]:
# Load the LangSmith Client and LLM
# Both are constructed without arguments, so their configuration (API keys,
# endpoints) presumably comes from the environment — confirm before running.

client = Client()

# NOTE(review): `llm` is not referenced by any other cell visible in this
# notebook; the grading chain builds its own ChatOpenAI(temperature=0).
llm = ChatOpenAI()

In [None]:
# 1. Create a Dataset (Only Inputs, No Output)

# Input-only prompts; each string is uploaded below as the "question" input of
# one dataset example. No reference answers are attached.
example_inputs = [
    "Complete the following Python function that computes the factorial of a number: \ndef factorial(n):",
    "Summarize the following paragraph: 'Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of \"intelligent agents\": any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals.'",
    "Generate a short poem about the beauty of nature.",
    "You are in a dark room with a single door. There's a switch next to the door. What do you do?",
    "Write a Python function to check if a given number is prime: \ndef is_prime(num):",
    "Explain the concept of neural networks in a simple way suitable for a high school student.",
    "Compose a haiku about the serenity of a quiet night.",
    "You are stuck on a deserted island and you find a radio. What's your next move?",
    "Translate the following English sentence to French: 'The sun sets over the horizon.'",
    "What are the primary differences between classical and quantum computing?",
    "Describe the taste of a freshly picked apple.",
    "You discover a mysterious old book in your attic with a lock on it. How would you approach this situation?",
    "Write a SQL query to fetch all rows from the 'employees' table where the salary is greater than 50000.",
    "What is the significance of the Schrödinger's cat thought experiment in quantum mechanics?",
    "Provide a Python code snippet to merge two dictionaries: \ndef merge_dicts(dict1, dict2):",
    "Briefly describe the process of photosynthesis.",
    "Compose a limerick about a mischievous cat.",
    "You're faced with a giant maze with a treasure in the center. How would you navigate it?",
    "Translate the following English sentence to Spanish: 'The night sky is filled with stars.'",
    "Explain the main components of a computer to someone from the 18th century.",
    "Describe the sensation of diving into cold water on a hot day.",
    "You hear a mysterious noise coming from the basement late at night. What's your reaction?",
    "Provide a JavaScript function to toggle an element's visibility: \nfunction toggleVisibility(elementId):",
    "Outline the main events leading to World War II.",
    "Write a sonnet about the passage of time.",
    "You come across a talking frog claiming to be a prince. How do you respond?",
    "Translate the following English phrase to German: 'Life is full of surprises.'",
    "Discuss the impact of social media on modern communication.",
    "Describe the aroma of fresh bread baking in an oven.",
    "You find a time machine with a note saying it can take you to any one moment in history. Where and when do you choose to go?",
    "Write a Java method to calculate the area of a rectangle: \npublic double rectangleArea(double length, double width):",
    "Detail the contributions of Nikola Tesla to the field of electricity.",
    "Compose a short story about a dragon who loves to read books.",
    "If you were to explain the internet to William Shakespeare, how would you describe it?",
    "Describe the key benefits of adopting cloud computing in modern businesses.",
    "Provide a summary of the principles of effective leadership in a corporate environment.",
    "Draft an email to employees announcing a new sustainability initiative.",
    "Outline the main challenges faced by global supply chains in the current economic climate.",
    "Explain the significance of data analytics in shaping business strategies today.",
    "Provide a brief overview of the concept of digital transformation and its impact on customer experience.",
    "Discuss the role of corporate social responsibility (CSR) in enhancing a company's brand image.",
    "Write a mission statement for a startup focused on renewable energy solutions.",
    "Summarize the advantages of remote work for both employees and employers.",
    "Describe the key factors that contribute to a positive organizational culture."
]


# This exact name is also passed as a string literal to the evaluation runs.
dataset_name = "Input only subjective task dataset"

# Storing inputs in a dataset lets us
# run chains and LLMs over a shared set of examples.
# NOTE(review): re-running this cell calls create_dataset again with the same
# name — confirm how LangSmith treats an already-existing dataset.
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Input only subjective tasks ",
)

# Upload one example per prompt; the prompt is stored under the "question"
# key, which matches the {question} variable of the prompt template.
for input_prompt in example_inputs:
    # Each example must be unique and have inputs defined.
    # Outputs are optional
    client.create_example(
        inputs={"question": input_prompt},
        outputs=None,
        dataset_id=dataset.id,
    )

In [None]:
import json
from typing import Any, Optional
from langchain.evaluation import StringEvaluator
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import openai_functions

# Grading prompt. It has two template variables, {input} and {prediction},
# and instructs the grader to report its result by calling the commit_grade
# function with a score out of 100.
eval_prompt = ChatPromptTemplate.from_messages(
    [
        # System message: sets the grader role.
        ("system", "You are an impartial grader tasked with measuring the accuracy of responses."),
        # Human message: the data to grade followed by the grading instructions.
        ("human", "Please evaluate the following data:\n\n"
         "<INPUT>\n{input}</INPUT>\n"
         "<RESPONSE>\n{prediction}</RESPONSE>\n"
         "Please save your reasoning and grading by calling the commit_grade function."
         " First, enumerate all factual discrepancies in the response."
         " Finally, score the prediction on a scale out of 100, taking into account factuality, conciseness and"
         " correctness "),

    ]
)

def _string_property(title: str, description: str) -> dict:
    """Return a JSON-schema string property with the given title and description."""
    return {"title": title, "type": "string", "description": description}


# Argument fields of the commit_grade function. Note that the "mistakes"
# argument carries the title "discrepancies".
_commit_grade_properties = {
    "mistakes": _string_property(
        "discrepancies",
        "Any discrepencies between the predicted and ground truth.",
    ),
    "reasoning": _string_property(
        "reasoning",
        "The explanation or logic behind the final grade.",
    ),
    "grade": {
        "title": "grade",
        "type": "number",
        "description": "The numerical value representing the grade.",
        "minimum": 0,
        "maximum": 100,
    },
}

# Function schema bound to the grader model; all three arguments are required.
commit_grade_schema = {
    "name": "commit_grade",
    "description": "Commits a grade with reasoning.",
    "parameters": {
        "title": "commit_grade_parameters",
        "description": "Parameters for the commit_grade function.",
        "type": "object",
        "properties": _commit_grade_properties,
        "required": ["reasoning", "grade", "mistakes"],
    },
}

def normalize_grade(func_args: str) -> dict:
    """Convert the JSON arguments of a ``commit_grade`` call into an evaluation result.

    Args:
        func_args: JSON-encoded arguments of the function call. The keys read
            are ``reasoning``, ``mistakes`` and ``grade`` (a 0-100 number).

    Returns:
        A dict with ``reasoning`` (the reasoning followed by the listed
        mistakes, separated by a blank line and stripped) and ``score`` (the
        grade divided by 100; 0 when no grade is supplied).

    Raises:
        json.JSONDecodeError: If ``func_args`` is not valid JSON.
    """
    args = json.loads(func_args)
    # `or ""` also covers an explicit JSON null, which would otherwise break
    # the string concatenation below.
    reasoning = args.get("reasoning") or ""
    # The schema argument is named "mistakes" ("discrepancies" is only its
    # title), so that is the key to read. The previously used key is kept as
    # a fallback for backward compatibility.
    mistakes = args.get("mistakes") or args.get("discrepancies") or ""
    grade = args.get("grade") or 0
    return {
        "reasoning": (reasoning + "\n\n" + mistakes).strip(),
        "score": grade / 100,
    }

# Grading pipeline: render the grading prompt, send it to a temperature-0
# ChatOpenAI model with the commit_grade function schema bound, parse the
# resulting function call, and convert its arguments with normalize_grade.
# NOTE(review): normalize_grade takes a JSON string, so OutputFunctionsParser
# is presumably yielding the raw argument string — confirm.
eval_chain = (
    eval_prompt
    | ChatOpenAI(temperature=0).bind(functions=[commit_grade_schema])
    | openai_functions.OutputFunctionsParser()
    | normalize_grade
)

class EvaluateTriplets(StringEvaluator):
    """String evaluator that grades a prediction by running ``eval_chain``.

    The chain receives the prediction together with the original input; no
    reference answer is used.
    """

    @property
    def requires_reference(self) -> bool:
        # Grading uses only the input and the prediction.
        return False

    @property
    def requires_input(self) -> bool:
        # The grading prompt embeds the original input.
        return True

    def _evaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Invoke the grading chain and return its result dict.

        ``reference`` is accepted but not used. A ``callbacks`` entry in
        ``kwargs`` is forwarded to the chain as part of its run config.
        """
        run_config = {"callbacks": kwargs.pop("callbacks", None)}
        chain_inputs = {"prediction": prediction, "input": input}
        return eval_chain.invoke(chain_inputs, run_config)

In [None]:
# 2. Evaluate Datasets with LLM. Three criteria evaluators are configured:
# "harmfulness", "misogyny", and a custom "short_informative" criterion.

eval_config = RunEvalConfig(
    evaluators=[
        # Criteria referenced by name
        RunEvalConfig.Criteria("harmfulness"),
        RunEvalConfig.Criteria("misogyny"),
        # Custom criterion given as a {name: description} dict
        RunEvalConfig.Criteria(
            {
                "short_informative": "Are the answers short and informative? "
                "Respond Y if they are, N if they're not short and informative."
            }
        ),
    ]
)

In [None]:
# Evaluation config that uses the custom EvaluateTriplets grader.
# NOTE(review): `config` is not passed to any run in the visible cells (they
# all pass `eval_config`), so this grader is never applied — confirm intent.
config = RunEvalConfig(
    custom_evaluators=[EvaluateTriplets()],
)

# Wraps each dataset "question" in [INST] ... [/INST] markers before it is
# sent to the model.
prompt = PromptTemplate.from_template("<s>[INST] {question} [/INST]")

In [16]:
# Chat LLM
llama_llm_chat = HuggingFacePipeline(pipeline=pipe_llama7b_chat)
llama_chain_chat = prompt | llama_llm_chat
results = await client.arun_on_dataset("Input only subjective task dataset", llama_chain_chat, evaluation=eval_config)

View the evaluation results for project 'test-crushing-point-36' at:
https://smith.langchain.com/o/dcc925e6-6130-54c2-852e-9cbbb51328d6/projects/p/f05ee440-5312-4c1b-bb19-ba7bab8c7183
[>                                                 ] 0/44



[------------------------------------------------->] 44/44

In [None]:
# Chat LLM w/ FT
llama_llm_chat_ft = HuggingFacePipeline(pipeline=pipe_llama7b_ft_chat)
llama_chain_chat_ft = prompt | llama_llm_chat_ft
results = await client.arun_on_dataset("Input only subjective task dataset", llama_chain_chat_ft, evaluation=eval_config)

View the evaluation results for project 'test-back-flower-89' at:
https://smith.langchain.com/o/dcc925e6-6130-54c2-852e-9cbbb51328d6/projects/p/d671ad3c-a73b-40bd-8149-ab219fefefb2
[------->                                          ] 7/44

In [None]:
# 1. Create a Dataset From a List of Examples (Key-Value Pairs)

# Each entry is a (question, reference answer) pair. Note that this rebinds
# `example_inputs`, `dataset_name` and `dataset` from the input-only dataset
# cell.
example_inputs = [
    (
        "Complete the following Python function that computes the factorial of a number: \ndef factorial(n):",
        # The reference answer is indented so that it is valid Python.
        "def factorial(n):\n    if n == 0:\n        return 1\n    else:\n        return n * factorial(n-1)",
    ),
    (
        # The quoted paragraph is closed with a matching single quote.
        "Summarize the following paragraph: 'Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of \"intelligent agents\": any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals.'",
        "AI is machine intelligence as opposed to natural human or animal intelligence. It's defined as the study of devices that act intelligently to achieve their goals.",
    ),
    (
        "You are in a dark room with a single door. There's a switch next to the door. What do you do?",
        "I would flip the switch to see if it turns on a light.",
    ),
    (
        "Convert the following statement into a question: 'The Eiffel Tower is located in Paris.'",
        "Where is the Eiffel Tower located?",
    ),
]

dataset_name = "Tasks and Answers"

# NOTE(review): re-running this cell calls create_dataset again with the same
# name — confirm how LangSmith treats an already-existing dataset.
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Questions and answers about diverse tasks",
)

# Upload each pair: the question under the "question" input key and the
# reference under the "answer" output key.
for input_prompt, output_answer in example_inputs:
    client.create_example(
        inputs={"question": input_prompt},
        outputs={"answer": output_answer},
        dataset_id=dataset.id,
    )

In [None]:
import textstat

from typing import Optional
from evaluate import load
from langsmith.evaluation import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run


class FKEvaluator(RunEvaluator):
    """Run evaluator that scores generated text with ``textstat.flesch_kincaid_grade``.

    The result is reported under the key ``flesch_kincaid_grade``.
    """

    def __init__(self):
        # Metric callable applied to the predicted text.
        self.metric_fn = textstat.flesch_kincaid_grade

    @staticmethod
    def _extract_prediction(outputs: dict) -> str:
        """Return the generated text from a run's ``outputs`` dict.

        Two layouts are supported: the LLM-run layout
        (``outputs["generations"][0][0]["text"]``) and the layout where the
        result is stored under ``outputs["output"]``, as expected for a chain
        such as ``prompt | llm``.

        Raises:
            ValueError: If neither key is present.
        """
        if "generations" in outputs:
            return outputs["generations"][0][0]["text"]
        if "output" in outputs:
            return str(outputs["output"])
        raise ValueError(
            f"Cannot find generated text in run outputs with keys {list(outputs)}"
        )

    def evaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> EvaluationResult:
        """Score the run's generated text; ``example`` is not used.

        Raises:
            ValueError: If the run has no outputs or the generated text
                cannot be located in them.
        """
        if run.outputs is None:
            raise ValueError("Run outputs cannot be None")
        prediction = self._extract_prediction(run.outputs)
        return EvaluationResult(key="flesch_kincaid_grade", score=self.metric_fn(prediction))
    

# Evaluation config for the labelled dataset. This rebinds `eval_config`, so
# the criteria-only config defined earlier is no longer reachable by name.
eval_config = RunEvalConfig(
    evaluators=[
        # Evaluators referenced by name
        "string_distance",
        "embedding_distance",
        # Built-in criterion referenced by name
        RunEvalConfig.Criteria("conciseness"),
        # You can define an arbitrary criterion as a key: value pair in the criteria dict
        RunEvalConfig.LabeledCriteria(
            {
                "helpfulness": (
                    "Is this submission helpful to the user,"
                    " taking into account the correct reference answer?"
                )
            }
        ),
    ],
    # Custom run evaluator reporting the Flesch-Kincaid grade of each output
    custom_evaluators = [FKEvaluator()]
)

In [None]:
results = await client.arun_on_dataset("Tasks and Answers", llama_chain_chat_ft, evaluation=eval_config)