In [1]:
import dotenv
dotenv.load_dotenv()
import os
import weave
import wandb
import asyncio
import nest_asyncio
nest_asyncio.apply()

In [2]:
from transformers import (
    AutoTokenizer
)
from peft import AutoPeftModelForCausalLM
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
test_cases = [
    ("we have nothing to fear but fear itself", "Franklin D. Roosevelt"),
    ("early to bed and early to rise makes a man healthy, wealthy, and wise", "Benjamin Franklin"),
    ("the only thing we have to fear is fear itself", "Franklin D. Roosevelt"),
    ("i have not failed. i've just found 10,000 ways that won't work", "Thomas Edison"),
    ("the best way to predict the future is to invent it", "Alan Kay"),
    ("in the middle of every difficulty lies opportunity", "Albert Einstein"),
    ("i am doing a poc. leave me alone.", "Some Guy"),
]

In [7]:
wandb.login(key = os.getenv("WANDB_API_KEY"))
run = wandb.init(entity = "june-pov", project = "tuned-evaluate")
weave.init("tuned-evaluate")
target_adapter_name = "test-gemma-lora-adapter"
path = run.use_artifact("june-pov/model-registry/test-gemma-lora-adapter:latest").download()

ft_model = AutoPeftModelForCausalLM.from_pretrained(
    target_adapter_name,
    torch_dtype = torch.bfloat16,
    device_map = "auto",
)

tokenizer  = AutoTokenizer.from_pretrained(target_adapter_name)

[34m[1mwandb[0m: Appending key for zscalersre.wandb.io to your netrc file: /home/kilnaar/.netrc


[34m[1mwandb[0m: Downloading large artifact test-gemma-lora-adapter:latest, 160.11MB. 33 files... 
[34m[1mwandb[0m:   33 of 33 files downloaded.  
Done. 0:0:0.6 (275.0MB/s)
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.22it/s]
We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


In [8]:
results = [(x[0], f"{x[1]} wrote {x[0]}") for x in test_cases]
results_ds = list(map(lambda x: ({'query': x[0], 'context': x[1]}), results))
results_ds

[{'query': 'we have nothing to fear but fear itself',
  'context': 'Franklin D. Roosevelt wrote we have nothing to fear but fear itself'},
 {'query': 'early to bed and early to rise makes a man healthy, wealthy, and wise',
  'context': 'Benjamin Franklin wrote early to bed and early to rise makes a man healthy, wealthy, and wise'},
 {'query': 'the only thing we have to fear is fear itself',
  'context': 'Franklin D. Roosevelt wrote the only thing we have to fear is fear itself'},
 {'query': "i have not failed. i've just found 10,000 ways that won't work",
  'context': "Thomas Edison wrote i have not failed. i've just found 10,000 ways that won't work"},
 {'query': 'the best way to predict the future is to invent it',
  'context': 'Alan Kay wrote the best way to predict the future is to invent it'},
 {'query': 'in the middle of every difficulty lies opportunity',
  'context': 'Albert Einstein wrote in the middle of every difficulty lies opportunity'},
 {'query': 'i am doing a poc. leave

In [9]:
os.environ["WANDB_BASE_URL"] = "https://api.wandb.ai"
from weave.scorers import WeaveHallucinationScorerV1

hallucination_scorer = WeaveHallucinationScorerV1()

  check_cuda(self.device)
[34m[1mwandb[0m: Downloading large artifact hallucination_hhem_scorer:v0, 421.31MB. 8 files... 
[34m[1mwandb[0m:   8 of 8 files downloaded.  
Done. 0:0:1.0 (434.2MB/s)
Device set to use cpu


In [10]:
evaluation = weave.Evaluation(
    dataset = results_ds,
    scorers = [hallucination_scorer],
    evaluation_name = "tuned-evaluation"
)

In [11]:
@weave.op()
def op_ft_model(query):
    prompt = f"Quote: {query}\nAuthor:"
    inputs = tokenizer(prompt, return_tensors="pt").to(ft_model.device)

    output_encoded = ft_model.generate(
        **inputs)

    return tokenizer.decode(output_encoded[0], skip_special_tokens=True)


In [12]:
asyncio.run(evaluation.evaluate(op_ft_model))

[36m[1mweave[0m: 🍩 https://wandb.ai/june-pov/tuned-evaluate/r/call/01980920-c1b5-7e35-a3cb-cbe4bf7c46f5
[36m[1mweave[0m: Evaluated 1 of 7 examples
[36m[1mweave[0m: Evaluated 2 of 7 examples
[36m[1mweave[0m: Evaluated 3 of 7 examples
[36m[1mweave[0m: Evaluated 4 of 7 examples
[36m[1mweave[0m: Evaluated 5 of 7 examples
[36m[1mweave[0m: Evaluated 6 of 7 examples
[36m[1mweave[0m: Evaluated 7 of 7 examples
[36m[1mweave[0m: Evaluation summary {
[36m[1mweave[0m:   "WeaveHallucinationScorerV1": {
[36m[1mweave[0m:     "passed": {
[36m[1mweave[0m:       "true_count": 0,
[36m[1mweave[0m:       "true_fraction": 0.0
[36m[1mweave[0m:     },
[36m[1mweave[0m:     "metadata": {
[36m[1mweave[0m:       "score": {
[36m[1mweave[0m:         "mean": 0.8236865842980998
[36m[1mweave[0m:       }
[36m[1mweave[0m:     }
[36m[1mweave[0m:   },
[36m[1mweave[0m:   "model_latency": {
[36m[1mweave[0m:     "mean": 3.760831185749599
[36m[1mweave[0m:   }

{'WeaveHallucinationScorerV1': {'passed': {'true_count': 0,
   'true_fraction': 0.0},
  'metadata': {'score': {'mean': 0.8236865842980998}}},
 'model_latency': {'mean': 3.760831185749599}}

In [13]:
weave.finish()