In [None]:
import nest_asyncio
import os
import pandas as pd
import torch
import textstat

from evaluate import load

from langsmith import Client
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.smith import RunEvalConfig, run_on_dataset
from langsmith.evaluation import EvaluationResult, RunEvaluator
from langsmith.schemas import Example, Run

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)

from typing import Optional
from tqdm import tqdm

# Jupyter already runs an asyncio event loop, so async calls made from a cell can fail with
# "RuntimeError: This event loop is already running".
# Patch asyncio to allow nested event loops.

nest_asyncio.apply()

In [None]:
# -- Bitsandbytes parameters --
# These values are shared by both model loads below (base chat model and fine-tuned model).

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

# Resolve the dtype name above to the matching torch attribute (torch.float16 for "float16").
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Chat model
model_name = "NousResearch/Llama-2-7b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    cache_dir = '/mnt/artifacts/llama2-7b-chat',
    device_map='auto'
)

# NOTE(review): use_cache=False is usually a training-time setting — confirm it is wanted
# for an inference-only notebook.
model.config.use_cache = False
model.config.pretraining_tp = 1

# NOTE(review): trust_remote_code=True presumably permits code shipped with the model repo
# to run locally — confirm it is actually required for this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          cache_dir = '/mnt/artifacts/llama2-7b-chat',
                                          trust_remote_code=True)
# No device argument is passed to pipeline(); the model was loaded with device_map='auto' above.
pipe_llama7b_chat = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=750)


# Fine-tuned model: loaded with the same quantization config but its own cache directory.
# Its config tweaks, tokenizer and pipeline are set up in the next cell.
ft_model_name ='subirmansukhani/llama-2-7b-miniguanaco'
ft_model = AutoModelForCausalLM.from_pretrained(
    ft_model_name,
    cache_dir = '/mnt/artifacts/llama2-7b-chat-ft',
    quantization_config=bnb_config,
    device_map='auto'
)

In [None]:
# Config tweaks for the fine-tuned model (ft_model is loaded in the previous cell).
# NOTE(review): use_cache=False is usually a training-time setting — confirm it is wanted
# for an inference-only notebook.
ft_model.config.use_cache = False
ft_model.config.pretraining_tp = 1

# NOTE(review): this cache_dir is the base model's directory ('/mnt/artifacts/llama2-7b-chat'),
# while the fine-tuned weights were cached under '/mnt/artifacts/llama2-7b-chat-ft'.
# This looks like a copy-paste leftover — confirm which directory is intended.
ft_tokenizer = AutoTokenizer.from_pretrained(ft_model_name,
                                             cache_dir = '/mnt/artifacts/llama2-7b-chat',
                                             trust_remote_code=True)
# No device argument is passed to pipeline(); ft_model was loaded with device_map='auto'.
pipe_llama7b_ft_chat = pipeline(task="text-generation", model=ft_model, tokenizer=ft_tokenizer, max_length=750)

In [None]:
def create_dataset_if_not_exists(client, dataset_name, example_inputs, description: Optional[str] = None, has_answer: bool = False):
    """
    Create a dataset and populate it with examples, unless one with that name already exists.

    Parameters:
    - client: The client object used for operations. It must provide
      `list_datasets(dataset_name=...)`, `create_dataset(dataset_name=..., description=...)`
      and `create_example(inputs=..., outputs=..., dataset_id=...)`.
    - dataset_name: Name of the dataset to check or create.
    - example_inputs: Iterable of question strings, or of (question, answer) pairs
      when `has_answer` is True.
    - description: Optional free-text description stored with the new dataset.
    - has_answer: When True, each entry is unpacked as (question, answer) and the answer
      is stored as the example's output; when False, examples are stored with no output.

    Returns:
    - None. Progress is reported with `print`.

    Raises:
    - ValueError / TypeError: if `has_answer` is True and an entry cannot be unpacked into
      exactly two values, or if `example_inputs` is not iterable. This is raised *before*
      the dataset is created, so a bad entry never leaves a half-populated dataset behind.
    """

    # Check if a dataset with the given name already exists. The listing may be a lazy
    # iterator, so looking at the first match is enough; there is no need to list them all.
    existing_dataset = client.list_datasets(dataset_name=dataset_name)
    if existing_dataset and any(True for _ in existing_dataset):
        print(f"A dataset with the name '{dataset_name}' already exists.")
        return

    # Build (inputs, outputs) payloads up front. If an entry is malformed this fails here,
    # before anything is created remotely. Otherwise a partially filled dataset would be
    # left behind and every later call would skip it as "already exists".
    if has_answer:
        payloads = [
            ({"question": input_prompt}, {"answer": output_answer})
            for input_prompt, output_answer in example_inputs
        ]
    else:
        payloads = [({"question": input_prompt}, None) for input_prompt in example_inputs]

    # Dataset does not exist and all examples are well-formed: create it
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description=description,
    )

    for inputs, outputs in payloads:
        client.create_example(
            inputs=inputs,
            outputs=outputs,
            dataset_id=dataset.id,
        )

    # Print a completion message after adding all examples
    print(f"Dataset '{dataset_name}' has been successfully created and populated with examples.")

In [None]:
# Set the environment variables for LangSmith

# The API key must already be present in the environment; it is never hard-coded here.
# It is deliberately NOT written back with str(os.getenv(...)): when the key is missing
# that would store the literal string "None" and hide the real problem behind a later
# authentication failure.
if not os.environ.get("LANGCHAIN_API_KEY"):
    print("WARNING: LANGCHAIN_API_KEY is not set; export it before running the LangSmith cells.")
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# Base project name; the evaluation cells append a suffix to it for each run.
os.environ["LANGCHAIN_PROJECT"] = "llama2-langsmith-eval"

In [None]:
# Load the LangSmith client and an OpenAI chat model

# LangSmith client used by the dataset-creation and evaluation cells of this notebook.
client = Client()
# NOTE(review): `llm` is not referenced by any other cell visible in this notebook —
# confirm it is still needed.
llm = ChatOpenAI()

In [None]:
# 1. Create a Dataset (Only Inputs, No Output)
# Each entry is a single prompt string; it is stored as the "question" input of one example
# and no reference answer is attached (has_answer=False below).

example_inputs = [
    "Complete the following Python function that computes the factorial of a number: \ndef factorial(n):",
    "Summarize the following paragraph: 'Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of \"intelligent agents\": any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals.'",
    "Generate a short poem about the beauty of nature.",
    "You are in a dark room with a single door. There's a switch next to the door. What do you do?",
    "Write a Python function to check if a given number is prime: \ndef is_prime(num):",
    "Explain the concept of neural networks in a simple way suitable for a high school student.",
    "Compose a haiku about the serenity of a quiet night.",
    "You are stuck on a deserted island and you find a radio. What's your next move?",
    "Translate the following English sentence to French: 'The sun sets over the horizon.'",
    "What are the primary differences between classical and quantum computing?",
    "Describe the taste of a freshly picked apple.",
    "You discover a mysterious old book in your attic with a lock on it. How would you approach this situation?",
    "Write a SQL query to fetch all rows from the 'employees' table where the salary is greater than 50000.",
    "What is the significance of the Schrödinger's cat thought experiment in quantum mechanics?",
    "Provide a Python code snippet to merge two dictionaries: \ndef merge_dicts(dict1, dict2):",
    "Briefly describe the process of photosynthesis.",
    "Compose a limerick about a mischievous cat.",
    "You're faced with a giant maze with a treasure in the center. How would you navigate it?",
    "Translate the following English sentence to Spanish: 'The night sky is filled with stars.'",
    "Explain the main components of a computer to someone from the 18th century.",
    "Describe the sensation of diving into cold water on a hot day.",
    "You hear a mysterious noise coming from the basement late at night. What's your reaction?",
    "Provide a JavaScript function to toggle an element's visibility: \nfunction toggleVisibility(elementId):",
    "Outline the main events leading to World War II.",
    "Write a sonnet about the passage of time.",
    "You come across a talking frog claiming to be a prince. How do you respond?",
    "Translate the following English phrase to German: 'Life is full of surprises.'",
    "Discuss the impact of social media on modern communication.",
    "Describe the aroma of fresh bread baking in an oven.",
    "You find a time machine with a note saying it can take you to any one moment in history. Where and when do you choose to go?",
    "Write a Java method to calculate the area of a rectangle: \npublic double rectangleArea(double length, double width):",
    "Detail the contributions of Nikola Tesla to the field of electricity.",
    "Compose a short story about a dragon who loves to read books.",
    "If you were to explain the internet to William Shakespeare, how would you describe it?",
    "Describe the key benefits of adopting cloud computing in modern businesses.",
    "Provide a summary of the principles of effective leadership in a corporate environment.",
    "Draft an email to employees announcing a new sustainability initiative.",
    "Outline the main challenges faced by global supply chains in the current economic climate.",
    "Explain the significance of data analytics in shaping business strategies today.",
    "Provide a brief overview of the concept of digital transformation and its impact on customer experience.",
    "Discuss the role of corporate social responsibility (CSR) in enhancing a company's brand image.",
    "Write a mission statement for a startup focused on renewable energy solutions.",
    "Summarize the advantages of remote work for both employees and employers.",
    "Describe the key factors that contribute to a positive organizational culture."
]


# The evaluation cells refer to this dataset by this exact name, so keep the two in sync.
dataset_name = "Input only subjective task dataset"
create_dataset_if_not_exists(client, dataset_name, example_inputs, description='Diverse tasks without answers', has_answer=False)

In [None]:
# 2. Create a Dataset From a List of Examples (Key-Value Pairs)
# Each entry is a (question, reference answer) tuple; the answer is stored as the example's
# "answer" output (has_answer=True below).

example_inputs = [
    ("Complete the following Python function that computes the factorial of a number: \ndef factorial(n):", "def factorial(n): \n if n == 0: \n return 1 \n else: \n return n * factorial(n-1)"),
    # NOTE(review): the quoted paragraph in the next question has no closing single quote
    # (the copy in the input-only dataset has one). Left as-is so the stored example text
    # does not change.
    ("Summarize the following paragraph: 'Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of \"intelligent agents\": any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals.", 
     "AI is machine intelligence as opposed to natural human or animal intelligence. It's defined as the study of devices that act intelligently to achieve their goals."),
    ("You are in a dark room with a single door. There's a switch next to the door. What do you do?", "I would flip the switch to see if it turns on a light."),
    ("Convert the following statement into a question: 'The Eiffel Tower is located in Paris.'", "Where is the Eiffel Tower located?"),
    ("Rewrite the following sentence in passive voice: 'Cats chase mice.'", "Mice are chased by cats."),
    ("Define 'photosynthesis'.", "Photosynthesis is the process by which green plants and some other organisms use sunlight to synthesize foods with the help of chlorophyll pigments."),
    ("Translate the following to Spanish: 'Hello, how are you?'", "Hola, ¿cómo estás?"),
    # The backslashes are escaped explicitly: a bare "\(" is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions. The resulting text is unchanged.
    ("Solve for \\( x \\) in the equation \\( 2x = 10 \\).", "x = 5"),
    ("What's the capital of Japan?", "Tokyo"),
    ("Briefly explain the theory of relativity.", "The theory of relativity, proposed by Einstein, describes the laws of physics in relation to objects moving relative to each other."),
    ("Describe the process of evaporation.", "Evaporation is the process by which water changes from a liquid to a gas or vapor."),
    ("Who wrote 'Romeo and Juliet'?", "William Shakespeare"),
    ("Convert the following statement into a question: 'She plays the piano.'", "Does she play the piano?"),
    ("What do you do if you have a flat tire on a highway?", "I would pull over to a safe location, turn on hazard lights, and change the tire if possible or call for assistance."),
    ("Explain the term 'metabolism'.", "Metabolism is the set of chemical reactions that occur within a living organism to maintain life."),
    ("What's the main ingredient in guacamole?", "Avocado"),
    ("Summarize: 'Gravity is a force by which a planet or other body draws objects toward its center. The force of gravity keeps planets in orbit around the sun.'", "Gravity is a force that attracts objects to a body's center and keeps planets orbiting the sun."),
    ("Convert the following statement into a question: 'She has visited the museum.'", "Has she visited the museum?"),
    ("Describe the function of the heart.", "The heart pumps blood throughout the body, supplying oxygen and nutrients to the tissues and removing carbon dioxide and other wastes."),
    ("What is the boiling point of water?", "100°C or 212°F at 1 atmospheric pressure."),
    ("How does photosynthesis benefit plants?", "Photosynthesis provides energy to plants and produces oxygen as a byproduct."),
    ("Who painted the Mona Lisa?", "Leonardo da Vinci"),
    ("Define 'osmosis'.", "Osmosis is the movement of solvent molecules through a selectively permeable membrane into a region of higher solute concentration."),
    ("Explain the greenhouse effect.", "The greenhouse effect is a natural process where certain gases in the Earth's atmosphere trap heat, preventing it from escaping into space, thus warming the planet.")
]


# The evaluation cells refer to this dataset by this exact name, so keep the two in sync.
dataset_name = "Tasks and Answers"
create_dataset_if_not_exists(client, dataset_name, example_inputs, description='Diverse tasks with answers', has_answer=True)

In [None]:
# Prompt template shared by both evaluation chains: it places the dataset's "question" input
# between the "<s>[INST]" and "[/INST]" markers before the text is passed to the model.
prompt = PromptTemplate.from_template("<s>[INST] {question} [/INST]")

In [None]:
# Set up the LLM-based evaluation for tasks that do not have answers specified in the dataset.
# NOTE: the name `eval_config` is assigned again in a later cell (the labeled-criteria config),
# so re-run this cell before re-running the input-only evaluations.

eval_config = RunEvalConfig(
    evaluators=[
        # You can specify an evaluator by name/enum.
        # In this case, the default criterion is "helpfulness"
        "criteria",
        # Or you can configure the evaluator
        RunEvalConfig.Criteria("harmfulness"),
        RunEvalConfig.Criteria("misogyny"),
        # A custom criterion is given as a {name: grading instruction} dict
        RunEvalConfig.Criteria(
            {
                "short_informative": "Are the answers short and informative? "
                "Respond Y if they are, N if they're not short and informative."
            }
        ),
    ]
)

In [27]:
# Evaluate the chat LLM on the dataset that has only tasks and no reference answers

# Wrap the chat model's generation pipeline as a LangChain LLM and pipe the prompt template into it.
llama_llm_chat = HuggingFacePipeline(pipeline=pipe_llama7b_chat)
llama_chain_chat = prompt | llama_llm_chat
# Top-level `await` is only valid inside a notebook/IPython cell.
# The run is recorded under the LANGCHAIN_PROJECT value with an "_only_q" suffix.
results = await client.arun_on_dataset("Input only subjective task dataset", 
                                       llama_chain_chat,
                                       evaluation=eval_config,
                                       project_name=f'{os.environ["LANGCHAIN_PROJECT"]}_only_q',
                                      )

View the evaluation results for project 'llama2-langsmith-eval_only_q' at:
https://smith.langchain.com/o/dcc925e6-6130-54c2-852e-9cbbb51328d6/projects/p/88fa6a66-ce13-445c-a82f-4eca420bcc7a
[------------------------------------------------->] 44/44

In [28]:
# Evaluate the fine-tuned chat LLM on the same input-only dataset

# Same chain shape as for the chat model, but backed by the fine-tuned model's pipeline.
llama_llm_chat_ft = HuggingFacePipeline(pipeline=pipe_llama7b_ft_chat)
llama_chain_chat_ft = prompt | llama_llm_chat_ft
# `results` is rebound here, so the previous run's return value is no longer reachable by name.
# The run is recorded under the LANGCHAIN_PROJECT value with a "_finetuned_only_q" suffix.
results = await client.arun_on_dataset("Input only subjective task dataset",
                                       llama_chain_chat_ft,
                                       evaluation=eval_config,
                                       project_name=f'{os.environ["LANGCHAIN_PROJECT"]}_finetuned_only_q',
                                      )

View the evaluation results for project 'llama2-langsmith-eval_finetuned_only_q' at:
https://smith.langchain.com/o/dcc925e6-6130-54c2-852e-9cbbb51328d6/projects/p/8d8d33ef-7cd3-4517-8b44-a4149278d97b
[-------------------------------------------->     ] 40/44

  lib.cdequantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n))


[------------------------------------------------->] 44/44

In [29]:
# Let's define a custom Flesch Kincaid metric

class FKEvaluator(RunEvaluator):
    """Custom run evaluator that scores a run's output text with textstat's Flesch-Kincaid grade."""

    def __init__(self):
        # Scoring function applied to the prediction text in evaluate_run.
        self.metric_fn = textstat.flesch_kincaid_grade

    def evaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> EvaluationResult:
        """
        Score one run.

        Reads the value stored under the 'output' key of `run.outputs`, passes it to the
        metric function and reports the result under the key "flesch_kincaid_grade".
        `example` is accepted but not used.

        Raises ValueError when the run has no outputs.
        """
        outputs = run.outputs
        if outputs is not None:
            text = outputs['output']
            grade = self.metric_fn(text)
            return EvaluationResult(key="flesch_kincaid_grade", score=grade)
        raise ValueError("Run outputs cannot be None")
    

# Evaluation config for the dataset that has reference answers.
# NOTE: this rebinds `eval_config`, replacing the reference-free config defined in an earlier cell.
eval_config = RunEvalConfig(
    evaluators=[
        # Built-in evaluators selected by name
        "string_distance",
        "embedding_distance",
        # You can define an arbitrary criterion as a key: value pair in the criteria dict
        RunEvalConfig.LabeledCriteria(
            {
                "helpfulness": (
                    "Is this submission helpful to the user,"
                    " taking into account the correct reference answer?"
                )
            }
        ),
    ],
    # Custom evaluator instances are passed separately from the built-in ones.
    custom_evaluators = [FKEvaluator()]
)

In [30]:
# Evaluate the chat LLM on the dataset that has tasks and reference answers

# Reuses `llama_chain_chat` from the earlier evaluation cell and the `eval_config` defined in
# the preceding cell. Top-level `await` is only valid inside a notebook/IPython cell.
# The run is recorded under the LANGCHAIN_PROJECT value with a "_qa" suffix.
results = await client.arun_on_dataset("Tasks and Answers",
                                       llama_chain_chat,
                                       evaluation=eval_config,
                                       project_name=f'{os.environ["LANGCHAIN_PROJECT"]}_qa',
                                      )

View the evaluation results for project 'llama2-langsmith-eval_qa' at:
https://smith.langchain.com/o/dcc925e6-6130-54c2-852e-9cbbb51328d6/projects/p/4e68626d-25b7-4055-a462-e70d6a8ef399
[------------------------------------------------->] 24/24

In [32]:
# Evaluate the fine-tuned chat LLM on the dataset that has tasks and reference answers

# Reuses `llama_chain_chat_ft` from the earlier evaluation cell and the current `eval_config`.
# The run is recorded under the LANGCHAIN_PROJECT value with a "_finetuned_qa" suffix.
results = await client.arun_on_dataset("Tasks and Answers",
                                       llama_chain_chat_ft,
                                       evaluation=eval_config,
                                       project_name=f'{os.environ["LANGCHAIN_PROJECT"]}_finetuned_qa',
                                      )

View the evaluation results for project 'llama2-langsmith-eval_finetuned_qa' at:
https://smith.langchain.com/o/dcc925e6-6130-54c2-852e-9cbbb51328d6/projects/p/2ead0bb7-bbc6-4b90-ac39-4043bd62a973
[------------------------------------------------->] 24/24