In [1]:
! pip install --quiet -U langchain
! pip install --quiet -U langsmith

You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m


In [None]:
import torch
from transformers import AutoModelForCausalLM
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)

# ---------------------------------------------------------------------------
# Base chat model: quantized load + text-generation pipeline
# ---------------------------------------------------------------------------

# Hub id of the stock (not fine-tuned) chat checkpoint.
model_name = "NousResearch/Llama-2-7b-chat-hf"

# bitsandbytes knobs, kept as plain variables so they are easy to tweak.
use_4bit = True                     # load the base model in 4-bit precision
bnb_4bit_quant_type = "nf4"         # quantization type: "fp4" or "nf4"
bnb_4bit_compute_dtype = "float16"  # name of the torch dtype used for compute
use_nested_quant = False            # nested ("double") quantization on/off

# Turn the dtype name into the corresponding attribute of the torch module.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the checkpoint with the quantization config above; device placement is
# delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=bnb_config, device_map="auto"
)

# NOTE(review): these two overrides look like settings carried over from a
# fine-tuning script — confirm they are wanted for an inference-only model.
model.config.pretraining_tp = 1
model.config.use_cache = False

# NOTE(review): trust_remote_code=True is only passed for this tokenizer, not
# for the fine-tuned one loaded later — confirm it is actually required.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# No explicit device is passed to the pipeline; it reuses the model loaded
# above. Generation is bounded with max_length=300.
pipe_llama7b_chat = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=300,
)

In [None]:
import torch

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)


# ---------------------------------------------------------------------------
# Fine-tuned model: same 4-bit quantization recipe, different checkpoint
# ---------------------------------------------------------------------------

# Hub id of the fine-tuned checkpoint.
model_path = "subirmansukhani/llama-2-7b-miniguanaco"

# Quantization settings (re-declared here so this cell does not depend on the
# values set by the base-model cell).
use_nested_quant = False            # nested ("double") quantization on/off
bnb_4bit_compute_dtype = "float16"  # name of the torch dtype used for compute
bnb_4bit_quant_type = "nf4"         # quantization type: "fp4" or "nf4"
use_4bit = True                     # load the base model in 4-bit precision

# Look the dtype up on the torch module by name.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=use_nested_quant,
    load_in_4bit=use_4bit,
)

# Downloaded files for the model are cached under the given cache_dir; device
# placement is delegated to device_map="auto".
model_loaded = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    cache_dir="/mnt/artifacts/llama2-model-cache/",
    quantization_config=bnb_config,
)

# Rebinds `tokenizer` to the fine-tuned checkpoint's tokenizer (no cache_dir
# is passed for it).
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [8]:
from transformers import pipeline
# Text-generation pipeline around the fine-tuned model and the tokenizer
# currently bound to `tokenizer`. No device argument is passed here; the model
# was loaded with device_map='auto'.
pipe_llama7b_chat_ft = pipeline(
    task="text-generation",
    model=model_loaded,
    tokenizer=tokenizer,
    max_length=200,
)

# Quick smoke test: one question wrapped in the <s>[INST] ... [/INST] markers.
prompt = "What should i do in Paris?"
text = f"<s>[INST] {prompt} [/INST]"

# Leave the raw pipeline output as the cell's displayed value.
result = pipe_llama7b_chat_ft(text)
result

[{'generated_text': "<s>[INST] What should i do in Paris? [/INST] Paris is a beautiful city with a rich history and culture. There are many things you can do in Paris, depending on your interests and preferences. Here are a few suggestions:\n\n1. Visit famous landmarks: Paris is home to many iconic landmarks, including the Eiffel Tower, Notre-Dame Cathedral, and the Arc de Triomphe. You can take a guided tour of these landmarks or visit them on your own.\n2. Explore museums: Paris has a number of world-class museums, including the Louvre, the Musée d'Orsay, and the Centre Pompidou. These museums house some of the most famous works of art in the world, including the Mona Lisa and Van Gogh's Starry Night.\n3. Take a river cruise: A river cruise along the Seine is"}]

In [None]:
import json
from typing import Any, Optional
from langchain.evaluation import StringEvaluator
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import openai_functions

# Two-message grading prompt. The human message has three placeholders —
# {input}, {prediction} and {reference} — and instructs the grader to report
# its verdict by calling the `commit_grade` function.
eval_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an impartial grader tasked with measuring the accuracy of extracted entity relations.",
        ),
        (
            "human",
            # Data under evaluation, wrapped in XML-like tags.
            "Please evaluate the following data:\n\n"
            "<INPUT>\n{input}</INPUT>\n"
            "<PREDICTED>\n{prediction}</PREDICTED>\n"
            "<GROUND_TRUTH>\n{reference}</GROUND_TRUTH>\n\n"
            # Grading instructions.
            "Please save your reasoning and grading by calling the commit_grade function."
            " First, enumerate all factual discrepancies in the predicted triplets relative to the ground truth."
            " Finally, score the prediction on a scale out of 100, taking into account factuality and"
            " correctness according to the ground truth.",
        ),
    ]
)

# Function schema handed to the grader model. All three properties are
# required. Note that the property *key* for the list of mistakes is
# "mistakes", while its title is "discrepancies".
commit_grade_schema = dict(
    name="commit_grade",
    description="Commits a grade with reasoning.",
    parameters=dict(
        title="commit_grade_parameters",
        description="Parameters for the commit_grade function.",
        type="object",
        properties=dict(
            mistakes=dict(
                title="discrepancies",
                type="string",
                description="Any discrepencies between the predicted and ground truth.",
            ),
            reasoning=dict(
                title="reasoning",
                type="string",
                description="The explanation or logic behind the final grade.",
            ),
            # Numeric grade, declared with bounds 0..100.
            grade=dict(
                title="grade",
                type="number",
                description="The numerical value representing the grade.",
                minimum=0,
                maximum=100,
            ),
        ),
        required=["reasoning", "grade", "mistakes"],
    ),
)

def normalize_grade(func_args: str) -> dict:
    """Convert the grader's ``commit_grade`` arguments into an evaluation result.

    Args:
        func_args: JSON-encoded object holding the function-call arguments
            (``reasoning``, ``mistakes`` and ``grade``).

    Returns:
        A dict with two keys:
        ``"reasoning"`` — the reasoning text followed by the listed mistakes,
        separated by a blank line and stripped of surrounding whitespace;
        ``"score"`` — the grade divided by 100.

    Raises:
        json.JSONDecodeError: if ``func_args`` is not valid JSON.
    """
    args = json.loads(func_args)

    # `or ""` / `or 0` also covers keys that are present but null.
    reasoning = args.get("reasoning") or ""
    # The schema's property key is "mistakes" ("discrepancies" is only its
    # title), so that is the key the arguments arrive under. The old key is
    # still honoured as a fallback.
    mistakes = args.get("mistakes") or args.get("discrepancies") or ""
    grade = args.get("grade") or 0

    return {
        "reasoning": (reasoning + "\n\n" + mistakes).strip(),
        "score": grade / 100,
    }

# Grading chain, assembled from named stages:
#   prompt -> GPT-4 with the commit_grade function bound -> functions output
#   parser -> normalize_grade (which decodes the parser output as JSON).
grader_llm = ChatOpenAI(model="gpt-4", temperature=0).bind(functions=[commit_grade_schema])
function_call_parser = openai_functions.OutputFunctionsParser()

eval_chain = eval_prompt | grader_llm | function_call_parser | normalize_grade

class EvaluateTriplets(StringEvaluator):
    """String evaluator that grades predicted triplets by running ``eval_chain``.

    Declares that both the original input and a reference answer are required.
    """

    @property
    def requires_reference(self) -> bool:
        """Always ``True``: grading compares against a reference."""
        return True

    @property
    def requires_input(self) -> bool:
        """Always ``True``: the input is included in the grading prompt."""
        return True

    def _evaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Grade one prediction and return whatever ``eval_chain`` produces.

        Args:
            prediction: The predicted string to grade.
            reference: The reference string to grade against.
            input: The input the prediction was generated from.
            **kwargs: Extra options; only ``callbacks`` is consumed and it is
                forwarded to the chain through its config.

        Returns:
            The result of ``eval_chain.invoke``.
        """
        run_config = {"callbacks": kwargs.pop("callbacks", None)}
        payload = {
            "prediction": prediction,
            "reference": reference,
            "input": input,
        }
        return eval_chain.invoke(payload, run_config)

In [None]:
from langsmith import Client
from langchain.smith import RunEvalConfig
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFacePipeline
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# LangSmith client, created with default arguments.
client = Client()

# Llama-2 style prompt. `{sentence}` is left open because "sentence" is the
# key used by the test dataset; `{system_message}` is pre-filled below.
# NOTE(review): `system_prompt` is not assigned in any cell shown above —
# confirm it is defined before this cell runs.
prompt = (
    PromptTemplate
    .from_template("[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n### Input:{sentence}\n\n[/INST]\n")
    .partial(system_message=system_prompt)
)

from langchain.smith import RunEvalConfig
# Evaluation config: the triplet grader is the only evaluator registered.
triplet_evaluators = [EvaluateTriplets()]
config = RunEvalConfig(custom_evaluators=triplet_evaluators)

In [None]:
# Chat LLM
llama_llm_chat = HuggingFacePipeline(pipeline=pipe_llama7b_chat)
llama_chain_chat = prompt | llama_llm_chat
results = await client.arun_on_dataset(validation_dataset_name, llama_chain_chat, evaluation=config)

In [None]:
# Chat LLM w/ FT
llama_llm_chat_ft = HuggingFacePipeline(pipeline=pipe_llama7b_chat_ft)
llama_chain_chat_ft = prompt | llama_llm_chat_ft
results = await client.arun_on_dataset(validation_dataset_name, llama_chain_chat_ft, evaluation=config)