# Eval Gemma's Outputs via Gemini 1.0 Pro

## Install dependencies

In [None]:
!pip install git+https://github.com/huggingface/alignment-handbook.git@main
!pip install -U transformers
!pip install -U peft
!pip install -U accelerate
!pip install -U bitsandbytes
!pip install -U datasets
!pip install -U google-generativeai

## Hugging Face Login 

The Gemma model is hosted in a gated repository on the Hugging Face Hub, so you need to authenticate with your Hugging Face account to use Gemma models.

In [None]:
# Authenticate this environment with the Hugging Face Hub (required for the gated Gemma repository).
!huggingface-cli login

## Imports

In [None]:
import os
import time
from getpass import getpass
from string import Template

import datasets
import torch
from IPython.display import Markdown
from transformers import (
    AutoModelForCausalLM,
    BitsAndBytesConfig
)

from alignment import (
    ModelArguments,
    DataArguments,
    H4ArgumentParser,
    SFTConfig
)
from alignment.model_utils import get_tokenizer, get_quantization_config

## Importing Model, Data, SFT Arguments from YAML

The following `config.yaml` shows the actual configuration generated when fine-tuning the Gemma 7B model. It contains information that is useful at inference time, such as which tokenizer is used and how it should be initialized.

In [None]:
%%writefile config.yaml
# Recorded fine-tuning configuration. This notebook only reads it back through
# H4ArgumentParser to obtain the model/data arguments.

# Model arguments
model_name_or_path: google/gemma-7b
model_revision: main
tokenizer_name_or_path: philschmid/gemma-tokenizer-chatml # Custom tokenizer with <|im_start|> and <|im_end|> tokens
torch_dtype: bfloat16
use_flash_attention_2: true

# LoRA arguments
load_in_4bit: true
use_peft: true
lora_r: 16
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
- q_proj
- k_proj
- v_proj
- o_proj
- gate_proj
- up_proj
- down_proj

# Data training arguments
dataset_mixer:
  sayakpaul/no_robots_only_coding: 1.0
# "test_sft" is the split this notebook evaluates on.
dataset_splits:
- train_sft
- test_sft
preprocessing_num_workers: 12

# SFT trainer config
bf16: true
dataset_kwargs:
  add_special_tokens: false  # We already wrap <bos> and <eos> in the chat template
  append_concat_token: false # No need to add <eos> across samples
do_eval: true
evaluation_strategy: epoch
gradient_accumulation_steps: 2
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
# NOTE(review): the name says "gemma-2b" while model_name_or_path above is
# google/gemma-7b -- confirm which base model this run actually used.
hub_model_id: gemma-2b-sft-qlora-no-robots
hub_strategy: every_save
learning_rate: 2.0e-04
log_level: info
logging_steps: 5
logging_strategy: steps
lr_scheduler_type: cosine
max_seq_length: 2048
max_steps: -1
num_train_epochs: 1
# NOTE(review): same "gemma-2b" vs google/gemma-7b naming mismatch as hub_model_id.
output_dir: data/gemma-2b-sft-qlora-no-robots
overwrite_output_dir: true
per_device_eval_batch_size: 8
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- tensorboard
save_strategy: "steps"
save_steps: 100
save_total_limit: 1
seed: 42
warmup_ratio: 0.1

With `H4ArgumentParser`, we can load the configuration recorded in the YAML file into `ModelArguments`, `DataArguments`, and `SFTConfig`. `SFTConfig` is not used in this notebook; it is parsed here only for reference.

In [None]:
# Build the parser for the three argument groups, then read them from config.yaml.
arg_parser = H4ArgumentParser((ModelArguments, DataArguments, SFTConfig))
configs = arg_parser.parse_yaml_file('./config.yaml', allow_extra_keys=True)
model_args, data_args, sft_args = configs

## Constants

- `gemini_api_key`: Gemini API Key to call Gemini API from Google AI Studio
- `model_id`: model repository on the Hugging Face Hub where the fine-tuned Gemma 7B model is stored
- `dataset_id`: dataset repository from which the test dataset is loaded
- `eval_prompt_tmpl`: prompt template to be sent to Gemini. Its placeholders are filled with real values later

In [None]:
# Never hard-code the API key in the notebook: it would be saved (and possibly
# committed) with the file. Read it from the GEMINI_API_KEY environment variable
# and fall back to a hidden interactive prompt when the variable is unset/empty.
gemini_api_key = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")

# Hugging Face Hub repositories of the fine-tuned model and of the evaluation data.
model_id = "sayakpaul/gemma-2b-sft-qlora-no-robots"
dataset_id = "sayakpaul/no_robots_only_coding"

# string.Template text for the evaluator prompt. The $instruction, $human_response
# and $lm_response placeholders are filled in by construct_eval_prompt().
# The evaluator is asked to answer with a JSON object holding a
# "similarity_assessment" and a "precision_assessment", each with "score" and "reason".
eval_prompt_tmpl = """Given an instruction and two responses—one generated by a human and the other by a language model—I'm seeking to evaluate how closely the language model's response mirrors the human-generated one. Additionally, I want to assess the accuracy and relevance of the language model's response to the original instruction.

Instruction:
```
$instruction
```

Human Response:
```
$human_response
```

Language Model Response:
```
$lm_response
```

You are quality assessor who analyzes the similarity between the Human Response and the Language Model Response on a scale of 1 to 100, where 1 indicates no similarity and 100 indicates identical responses.
Also you analyze the Language Model Response how it accurately answers the given Instruction on a scale of 1 to 100. Analysis MUST be rigorous and thorough.
Provide the assessment in the following JSON format:

{
  "similarity_assessment": {"score": [Insert similarity score here],"reason": [Insert how the similarity score is determined]},
  "precision_assessment": {"score": [Insert precision score here],"reason": [Insert how the precision score is determined]}
}"""

### Utility functions

### Fine-tuned model(Gemma) Specific

In [None]:
def get_model(model_args, data_args):
    """
    get_model loads the tokenizer and the fine-tuned causal language model.

    The tokenizer is built by `get_tokenizer` from the parsed arguments. The
    model weights are loaded from the notebook-level `model_id` with an 8-bit
    bitsandbytes quantization config, bfloat16 dtype and device_map="auto".

    arguments:
    model_args -- ModelArguments obtained from H4ArgumentParser
    data_args -- DataArguments obtained from H4ArgumentParser

    returns:
    a (tokenizer, model) tuple -- note that the tokenizer comes first
    """
    bnb_config = BitsAndBytesConfig(load_in_8bit=True)
    tokenizer = get_tokenizer(model_args, data_args)

    load_kwargs = {
        "torch_dtype": torch.bfloat16,
        "quantization_config": bnb_config,
        "device_map": "auto",
    }
    model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)

    return tokenizer, model

In [None]:
def gen_model_output(model, tokenizer, ds, temperature=0.4, max_new_tokens=1024, delimiter="assistant\n"):
    """
    gen_model_output generates and returns the response (output) of a given model
    to a single user prompt.

    arguments:
    model -- fine-tuned language model instance
    tokenizer -- tokenizer instance (its chat template is used to format the prompt)
    ds -- a single data record which has a "prompt" column
    temperature -- sampling temperature passed to `model.generate`
    max_new_tokens -- upper bound on the number of generated tokens
    delimiter -- kept for backward compatibility only; it is no longer used because
                 the prompt is now removed by token position instead of by searching
                 the decoded text for this marker

    returns:
    the decoded text of the newly generated tokens only (special tokens skipped).
    Sampling is enabled (do_sample=True), so repeated calls may return different text.
    """
    messages = [{"role": "user", "content": ds['prompt']}]

    gen_input = tokenizer.apply_chat_template(
        messages, return_tensors="pt", add_generation_prompt=True
    )
    # The model may live on an accelerator (it is loaded with device_map="auto"),
    # so the input ids have to be moved to the same device before generating.
    gen_input = gen_input.to(model.device)

    output_tensor = model.generate(
        gen_input,
        do_sample=True,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
    )

    # The returned sequence starts with the prompt tokens. Slice them off by length
    # instead of splitting the decoded string on a marker: splitting truncated any
    # response that itself contained the marker and raised IndexError when the
    # marker was absent.
    prompt_length = gen_input.shape[-1]
    generated_tokens = output_tensor[0][prompt_length:]

    return tokenizer.decode(generated_tokens, skip_special_tokens=True)

In [None]:
def construct_eval_prompt(ds, lm_response, eval_prompt_tmpl):
    """
    construct_eval_prompt returns a prompt to be injected into the language model (evaluator)

    arguments:
    ds -- a single data record which has "prompt", "messages" columns
    lm_response -- string value which fine-tuned model generated
    eval_prompt_tmpl -- string with placeholders of instruction, human_response, and lm_response.

    returns:
    eval_prompt_tmpl with $instruction, $human_response and $lm_response substituted.

    raises:
    KeyError / ValueError from string.Template.substitute if the template contains a
    placeholder other than the three above or a malformed `$` sequence.
    """
    messages = ds['messages']

    # The human reference is the first assistant turn. Looking it up by role keeps
    # this correct when the conversation starts with a system message, in which
    # case index 1 would be the user turn rather than the answer.
    ground_truth = next(
        (
            message['content']
            for message in messages
            if isinstance(message, dict) and message.get('role') == 'assistant'
        ),
        None,
    )
    if ground_truth is None:
        # No role information available: keep the original positional convention.
        ground_truth = messages[1]['content']

    return Template(eval_prompt_tmpl).substitute(
        instruction=ds['prompt'],
        human_response=ground_truth,
        lm_response=lm_response,
    )

### Larger evaluator model(Gemini) Specific

In [None]:
import json
import google.generativeai as genai

def find_json_snippet(raw_snippet):
    """
    find_json_snippet extracts and parses the JSON object embedded in a string.

    The candidate snippet spans from the first '{' to the last '}' of the input,
    so any text before or after the object is ignored.

    arguments:
    raw_snippet -- string that is expected to contain one JSON object

    returns:
    the parsed JSON value (parsed with strict=False, so control characters such
    as raw newlines are accepted inside JSON strings)

    raises:
    ValueError -- if no '{' ... '}' pair is found, or if the text between the
                  braces is not valid JSON (the JSONDecodeError is chained as cause)
    """
    json_start_index = raw_snippet.find('{')
    json_end_index = raw_snippet.rfind('}')

    # A usable snippet needs an opening brace that is followed by a closing brace;
    # "} ... {" has both characters but no object in between.
    if json_start_index < 0 or json_end_index < json_start_index:
        raise ValueError('......No JSON code snippet found in string.')

    json_snippet = raw_snippet[json_start_index:json_end_index + 1]
    try:
        return json.loads(json_snippet, strict=False)
    except json.JSONDecodeError as e:
        # Catch only decode failures (a bare except would also swallow
        # KeyboardInterrupt) and keep the original error as the cause.
        raise ValueError('......failed to parse string into JSON format') from e

def parse_first_json_snippet(snippet):
    """
    parse_first_json_snippet parses the first JSON object found in `snippet`.

    arguments:
    snippet -- either a single string, or a list of strings that are tried in order

    returns:
    the parsed JSON value. For a list input, the result for the first element that
    parses successfully, or None when no element does.

    raises:
    ValueError -- for a non-list input that cannot be parsed. The message of the
                  underlying error is carried over (and the error chained as cause)
                  so callers can log why parsing failed.
    """
    if isinstance(snippet, list):
        for snippet_piece in snippet:
            try:
                return find_json_snippet(snippet_piece)
            except Exception:
                # Best effort: this piece holds no usable JSON, try the next one.
                # (Exception, not a bare except, so Ctrl-C still interrupts.)
                continue
        return None

    try:
        return find_json_snippet(snippet)
    except Exception as e:
        # Previously the reason was printed here and an empty ValueError was raised,
        # which left callers logging a blank message. Propagate the reason instead.
        raise ValueError(str(e)) from e

def call_gemini(prompt="", generation_config=None, safety_settings=None, model_name="gemini-1.0-pro"):
    """
    call_gemini sends a single prompt to a Gemini model and returns the reply text.

    arguments:
    prompt -- text sent to the model as the only content part
    generation_config -- dict of generation parameters; when None, a default of
                         temperature 0.9, top_p 1, top_k 32, max_output_tokens 8192 is used
    safety_settings -- list of {"category", "threshold"} dicts; when None, the four
                       harm categories listed below are all set to "BLOCK_NONE"
    model_name -- Gemini model identifier to query; defaults to "gemini-1.0-pro"
                  so existing callers are unaffected

    returns:
    `response.text` of the generated response

    notes:
    try_out() calls `genai.configure(api_key=...)` before invoking this function;
    direct callers presumably need to do the same.
    NOTE(review): accessing `response.text` is expected to raise when the API returns
    no usable candidate -- confirm against the SDK; try_out() treats any exception
    raised here as a failed attempt.
    """
    if generation_config is None:
        generation_config = {
            "temperature": 0.9,
            "top_p": 1,
            "top_k": 32,
            "max_output_tokens": 8192,
        }

    if safety_settings is None:
        harm_categories = (
            "HARM_CATEGORY_HARASSMENT",
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
        )
        safety_settings = [
            {"category": category, "threshold": "BLOCK_NONE"}
            for category in harm_categories
        ]

    model = genai.GenerativeModel(
        model_name=model_name,
        generation_config=generation_config,
        safety_settings=safety_settings,
    )
    response = model.generate_content([prompt])
    return response.text

def try_out(prompt, gemini_api_key, retry_num=3, retry_delay=1.0):
    """
    try_out asks Gemini to assess `prompt` and returns the parsed JSON assessment,
    retrying when the call or the JSON parsing fails.

    arguments:
    prompt -- evaluation prompt sent to Gemini
    gemini_api_key -- API key passed to `genai.configure`
    retry_num -- maximum number of attempts
    retry_delay -- seconds to wait between two attempts (0 disables the pause);
                   avoids hammering the API when it is failing or rate limiting

    returns:
    the parsed JSON assessment, or None if every attempt failed
    """
    genai.configure(api_key=gemini_api_key)

    # A bounded loop guarantees termination: every attempt counts, whether it
    # ends with an exception or with an empty result.
    for attempt in range(1, retry_num + 1):
        try:
            assessment = call_gemini(prompt=prompt)
            assessment_json = parse_first_json_snippet(assessment)
        except Exception as e:
            print(f"......retry [{e}]")
        else:
            if assessment_json is not None:
                return assessment_json

        # Pause before the next attempt, but not after the final one.
        if retry_delay > 0 and attempt < retry_num:
            time.sleep(retry_delay)

    return None

## Evaluate a single generated output

In [None]:
# Load the dataset repository and keep its "test_sft" split for evaluation.
dataset_splits = datasets.load_dataset(dataset_id)
test_ds = dataset_splits["test_sft"]

### Instantiate fine-tuned language model and tokenizer

In [None]:
# get_model returns the tokenizer first, then the model.
tokenizer, model = get_model(model_args=model_args, data_args=data_args)

### Generate the fine-tuned language model's response to one of the test instructions

In [None]:
# Generate a response for the first record of the test split.
first_record = test_ds[0]
lm_response = gen_model_output(model=model, tokenizer=tokenizer, ds=first_record)

In [None]:
# Show the generated text as the cell output.
lm_response

### Form a prompt based on the generated output

In [None]:
# Fill the evaluator template with the first test record and the generated response.
eval_prompt = construct_eval_prompt(
    ds=test_ds[0],
    lm_response=lm_response,
    eval_prompt_tmpl=eval_prompt_tmpl,
)

In [None]:
# Render the assembled evaluation prompt as Markdown in the cell output.
Markdown(eval_prompt)

### Assess/evaluate the generated output 

In [None]:
# Ask Gemini for the assessment, allowing up to 50 retries on failure.
assessment_json = try_out(
    prompt=eval_prompt,
    gemini_api_key=gemini_api_key,
    retry_num=50,
)

In [None]:
# Show the parsed assessment (None if every attempt failed).
assessment_json