In [1]:
import dotenv
dotenv.load_dotenv()
import os
import weave
import wandb

In [2]:
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, TrainingArguments, pipeline
)
from peft import LoraConfig, get_peft_model, TaskType, AutoPeftModelForCausalLM
from trl import SFTTrainer
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Evaluation fixtures: each entry is a (quote, attributed author) pair.
test_cases = [
    (
        "we have nothing to fear but fear itself",
        "Franklin D. Roosevelt",
    ),
    (
        "early to bed and early to rise makes a man healthy, wealthy, and wise",
        "Benjamin Franklin",
    ),
    (
        "the only thing we have to fear is fear itself",
        "Franklin D. Roosevelt",
    ),
    (
        "i have not failed. i've just found 10,000 ways that won't work",
        "Thomas Edison",
    ),
    (
        "the best way to predict the future is to invent it",
        "Alan Kay",
    ),
    (
        "in the middle of every difficulty lies opportunity",
        "Albert Einstein",
    ),
    (
        "i am doing a poc. leave me alone.",
        "Some Guy",
    ),
]

In [4]:
# Authenticate with the API key taken from the environment, then open a run
# so the adapter artifact can be pulled from the model registry.
wandb.login(key = os.getenv("WANDB_API_KEY"))
run = wandb.init(entity = "june-pov", project = "tuned-evaluate")
target_adapter_name = "test-gemma-lora-adapter"
# `path` is the local directory the artifact files were downloaded into.
path = run.use_artifact("june-pov/model-registry/test-gemma-lora-adapter:latest").download()

# Load from the freshly downloaded artifact rather than relying on a directory
# named `target_adapter_name` happening to exist in the working directory
# (which only holds on the machine that trained the adapter). If the artifact
# does not carry the expected file, fall back to the original name so existing
# setups keep working.
adapter_source = (
    path
    if os.path.isfile(os.path.join(path, "adapter_config.json"))
    else target_adapter_name
)
tokenizer_source = (
    path
    if os.path.isfile(os.path.join(path, "tokenizer_config.json"))
    else target_adapter_name
)

ft_model = AutoPeftModelForCausalLM.from_pretrained(
    adapter_source,
    torch_dtype = torch.bfloat16,
    device_map = "auto",
)

tokenizer  = AutoTokenizer.from_pretrained(tokenizer_source)

def analyze_quote(quote, max_new_tokens=16):
    """Ask the fine-tuned model who authored `quote`.

    Args:
        quote: The quote text to insert into the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 16, the value this notebook originally hard-coded.

    Returns:
        The decoded text of the first generated sequence with special tokens
        removed. The decoded sequence starts with the prompt itself
        ("Quote: ...\\nAuthor:") followed by the model's continuation.
    """
    prompt = f"Quote: {quote}\nAuthor:"
    # Tokenize and move the tensors to the device the model lives on.
    inputs = tokenizer(prompt, return_tensors="pt").to(ft_model.device)
    generated = ft_model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

[34m[1mwandb[0m: Appending key for zscalersre.wandb.io to your netrc file: /home/kilnaar/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mcumbel[0m ([33mjune-pov[0m) to [32mhttps://zscalersre.wandb.io[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact test-gemma-lora-adapter:latest, 160.11MB. 33 files... 
[34m[1mwandb[0m:   33 of 33 files downloaded.  
Done. 0:0:0.6 (290.1MB/s)
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.98it/s]
We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


In [5]:
# One row per test case: (quote, reference statement, model output).
results = [
    (quote, f"{author} wrote {quote}", analyze_quote(quote))
    for quote, author in test_cases
]
results

[('we have nothing to fear but fear itself',
  'Franklin D. Roosevelt wrote we have nothing to fear but fear itself',
  'Quote: we have nothing to fear but fear itself\nAuthor: Franklin D. Roosevelt\nSource: The New York Times, 193'),
 ('early to bed and early to rise makes a man healthy, wealthy, and wise',
  'Benjamin Franklin wrote early to bed and early to rise makes a man healthy, wealthy, and wise',
  "Quote: early to bed and early to rise makes a man healthy, wealthy, and wise\nAuthor: Benjamin Franklin\nSource: Franklin's Autobiography\nDate: 177"),
 ('the only thing we have to fear is fear itself',
  'Franklin D. Roosevelt wrote the only thing we have to fear is fear itself',
  'Quote: the only thing we have to fear is fear itself\nAuthor: Franklin D. Roosevelt\nSource: The New York Times, 193'),
 ("i have not failed. i've just found 10,000 ways that won't work",
  "Thomas Edison wrote i have not failed. i've just found 10,000 ways that won't work",
  "Quote: i have not failed

In [6]:
# Redirect the W&B client to the public cloud endpoint before the scorer is set up.
# NOTE(review): presumably the scorer's model artifact is hosted on public W&B rather
# than on the private instance used for the earlier login — confirm. The override is
# placed ahead of the import and instantiation below; keep that order.
os.environ["WANDB_BASE_URL"] = "https://api.wandb.ai"
from weave.scorers import WeaveHallucinationScorerV1

# Creating the scorer downloads its model artifact on first use (see the cell's log).
hallucination_scorer = WeaveHallucinationScorerV1()

  check_cuda(self.device)
[34m[1mwandb[0m: Downloading large artifact hallucination_hhem_scorer:v0, 421.31MB. 8 files... 
[34m[1mwandb[0m:   8 of 8 files downloaded.  
Done. 0:0:1.0 (431.4MB/s)
Device set to use cpu


In [7]:
def hallucination_score(query, context, output):
    """Run the hallucination scorer on one (query, context, output) triple.

    Returns the value stored under the 'score' key of the result's metadata.
    """
    scored = hallucination_scorer.score(query=query, context=context, output=output)
    return scored.metadata['score']

# Score every (quote, reference statement, model output) row collected above.
[hallucination_score(query, context, output) for query, context, output in results]


 (subsequent messages of this type will be suppressed)


[0.8703258037567139,
 0.8031547218561172,
 0.8890906944870949,
 0.812981441617012,
 0.9266120493412018,
 0.8484246283769608,
 0.6511365175247192]