In [2]:
# Install the notebook's third-party dependencies quietly (-q). No versions are pinned here.
!pip install -q datasets transformers evaluate peft trl bitsandbytes accelerate
# Second pass with --upgrade so an already-installed accelerate is brought up to date.
!pip install --upgrade -q accelerate
# Provides the `Levenshtein` module imported by the metric cell further down.
!pip install -q python-Levenshtein

[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: pip install --upgrade pip

[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: pip install --upgrade pip

[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: pip install --upgrade pip


In [None]:
import os
import torch
from datasets import load_from_disk, load_metric
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

def get_model_id(model_type, run_name, checkpoint_id):
    """Build the relative path to a fine-tuned checkpoint directory.

    The components are joined with the platform path separator in the order
    ``<model_type>/model_output/<run_name>/<checkpoint_id>``.

    Args:
        model_type: Top-level directory of the model family (e.g. "gemma").
        run_name: Name of the training run directory.
        checkpoint_id: Checkpoint sub-directory (e.g. "checkpoint-500").

    Returns:
        The joined path as a string.
    """
    path_parts = (model_type, "model_output", run_name, checkpoint_id)
    return os.path.join(*path_parts)

# --- Checkpoint selection -------------------------------------------------
# Every candidate path below points at the same checkpoint step of a different
# fine-tuning run; `model_id` picks the one that is actually evaluated.
# model_type = "gemma"
# run_name = "gemma-7b-qlora-survey-json"
checkpoint_id = "checkpoint-500"

# gemma_7b_model_id = get_model_id("gemma", "gemma-7b-qlora-inst", checkpoint_id)
gemma_2b_model_id = get_model_id("gemma", "gemma-2b-qlora-inst", checkpoint_id)
codellama2_7b_model_id = get_model_id("llama2", "codellama2-7b-qlora-inst", checkpoint_id)
llama3_7b_model_id = get_model_id("llama3", "llama3-7b-qlora-inst", checkpoint_id)
phi_2_model_id = get_model_id("phi2", "phi-2-qlora-inst", checkpoint_id)
# Change this assignment to evaluate a different run.
model_id = gemma_2b_model_id

# Held-out test split, read from a path relative to the notebook's working directory.
test_dataset = load_from_disk("./datasets/survey_json_datasets_instruction_test")

# 4-bit NF4 quantization with float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)

# Load model
# NOTE(review): torch_dtype is float32 while the 4-bit compute dtype above is
# float16 -- confirm this mismatch is intentional.
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    quantization_config=quantization_config, 
    torch_dtype=torch.float32,
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token id as the padding token id.
tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
import Levenshtein
import json

def levenshtein_accuracy(predict, ground_truth):
    """Return a normalized Levenshtein similarity between two strings.

    The score is ``(longest - distance) / longest`` where ``longest`` is the
    length of the longer input and ``distance`` is the Levenshtein edit
    distance between the inputs. Identical inputs score exactly 1.0.

    Args:
        predict: Predicted string.
        ground_truth: Reference string.

    Returns:
        A float; 1.0 means the inputs are identical and 0.0 means every
        position of the longer input had to be edited.
    """
    longest = max(len(predict), len(ground_truth))

    # Perfect match, including two empty inputs. Handling it up front removes
    # the need for an epsilon in the denominator, which used to score
    # ("", "") as 0.0 and keep identical strings slightly below 1.0.
    if longest == 0 or predict == ground_truth:
        return 1.0

    distance = Levenshtein.distance(predict, ground_truth)
    return (longest - distance) / longest

def _survey_field_accuracy(pred_json, true_json, field):
    """Score one top-level text field; 0 when either document lacks it."""
    if field not in pred_json or field not in true_json:
        return 0
    # str() mirrors how question values are compared, so a non-string value
    # (null, number, nested object) is scored instead of raising.
    return levenshtein_accuracy(str(pred_json[field]), str(true_json[field]))


def survey_json_accuracy(pred_string, true_string):
    """Compare a predicted survey JSON document against the reference one.

    A survey document usually has three top-level fields: ``header``,
    ``questions`` and the optional ``subHeader``.

    Questions are paired by position. For each pair, every key of the
    reference question is scored with ``levenshtein_accuracy`` on the
    stringified values, or 0 when the predicted question lacks that key.
    Trailing questions without a counterpart on the other side are ignored.
    Per-key scores are then averaged over all compared questions.

    Args:
        pred_string: JSON text produced by the model.
        true_string: Reference JSON text.

    Returns:
        A dict with ``header_accuracy``, ``subheader_accuracy`` and
        ``question_key_accuracy`` (a mapping from question key to mean score).

    Raises:
        json.JSONDecodeError: If either input is not valid JSON.
    """
    true_json = json.loads(true_string)
    pred_json = json.loads(pred_string)

    # Valid JSON that is not an object carries none of the expected fields;
    # treat it as an empty document so every field scores as missing.
    if not isinstance(pred_json, dict):
        pred_json = {}
    if not isinstance(true_json, dict):
        true_json = {}

    pred_questions = pred_json.get("questions", [])
    true_questions = true_json.get("questions", [])
    if not isinstance(pred_questions, list):
        pred_questions = []
    if not isinstance(true_questions, list):
        true_questions = []

    # Collect the individual scores per question key across all pairs.
    scores_by_key = {}
    for true_q, pred_q in zip(true_questions, pred_questions):
        if not isinstance(pred_q, dict):
            # A malformed predicted question matches none of the keys.
            pred_q = {}
        for key, true_value in true_q.items():
            if key in pred_q:
                score = levenshtein_accuracy(str(true_value), str(pred_q[key]))
            else:
                score = 0
            scores_by_key.setdefault(key, []).append(score)

    question_key_accuracy = {
        key: sum(scores) / len(scores) for key, scores in scores_by_key.items()
    }

    # A field present in the prediction but absent from the reference used to
    # raise KeyError (subHeader is optional); it now scores 0 like any miss.
    results = {
        "header_accuracy": _survey_field_accuracy(pred_json, true_json, "header"),
        "subheader_accuracy": _survey_field_accuracy(pred_json, true_json, "subHeader"),
        "question_key_accuracy": question_key_accuracy
    }
    return results

In [None]:
from tqdm import tqdm
# NOTE(review): `accuracy_metric` is only referenced by the commented-out code
# in the loop below -- confirm whether it is still needed.
accuracy_metric = load_metric("accuracy")

# Marker that separates the instruction prompt from the expected answer.
INST_END = "[/INST]"
# Upper bound on the number of tokens generated per example.
MAX_NEW_TOKENS = 2088

failed_count = 0
preds = []
ground_truths = []

# Build the generation pipeline once: it depends only on the model and the
# tokenizer, so re-creating it for every example is wasted work.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

for example in tqdm(test_dataset):
    # Split on the first marker only, so an answer that itself contains the
    # marker does not break the two-way unpacking.
    prompt, ground_truth = example['text'].split(INST_END, 1)
    prompt += INST_END
    # NOTE(review): do_sample=True with no seed set in this cell -- generations
    # are presumably not reproducible between runs; confirm this is intended.
    result = pipe(prompt, max_new_tokens=MAX_NEW_TOKENS, do_sample=True)
    output = result[0]['generated_text']
    # Keep the text between the prompt's marker and any later marker.
    pred = output.split(INST_END)[1].strip()
    ground_truth = ground_truth.strip()

    # overall_leven_accuracy = levenshtein_accuracy(pred, ground_truth)
    # sj_accuracy = survey_json_accuracy(pred, ground_truth)
    # accuracy = accuracy_metric.compute(predictions=[pred], references=[ground_truth])

    preds.append(pred)
    ground_truths.append(ground_truth)