In [1]:
!pip install transformers evaluate tqdm rouge_score textdistance langchain accelerate openai tiktoken bitsandbytes accelerate

Collecting transformers
  Downloading transformers-4.40.0-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.6/137.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting textdistance
  Downloading textdistance-4.6.1-py3-none-any.whl.metadata (18 kB)
Collecting langchain
  Downloading langchain-0.1.16-py3-none-any.whl.metadata (13 kB)
Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl.metadata (18 kB)
Collecting openai
  Downloading openai-1.23.2-py3-none-any.whl.metadata (21 kB)
Collecting tiktoken
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata 

In [2]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import evaluate
from tqdm import tqdm
from rouge_score import rouge_scorer
from datasets import load_dataset
from textdistance import jaro_winkler
import numpy as np
import json
import pandas as pd
import torch
from langchain.evaluation import load_evaluator
from langchain.evaluation import EmbeddingDistance
import os

os.environ['OPENAI_API_KEY'] = '<OPENAI API KEY>'
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'

In [3]:
# specify device type to use to run LLMs
device_type = "cuda:0"

# load fine-tuned model and tokenizer
base_model_name = "vilsonrodrigues/falcon-7b-sharded"
# model_name = "rlhf-falcon_v3"
model_name = "rlhf-falcon_test"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

trained_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_type,
    trust_remote_code=True,
    quantization_config=bnb_config,
)

tokenizer = AutoTokenizer.from_pretrained(base_model_name ,trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"  # set padding to the right to avoid issues with fp16 (when using 4-bit quantization)

Some weights of FalconForCausalLM were not initialized from the model checkpoint at rlhf-falcon_test and are newly initialized: ['h.0.input_layernorm.bias', 'h.0.input_layernorm.weight', 'h.0.mlp.dense_4h_to_h.weight', 'h.0.mlp.dense_h_to_4h.weight', 'h.0.self_attention.dense.weight', 'h.0.self_attention.query_key_value.weight', 'h.1.input_layernorm.bias', 'h.1.input_layernorm.weight', 'h.1.mlp.dense_4h_to_h.weight', 'h.1.mlp.dense_h_to_4h.weight', 'h.1.self_attention.dense.weight', 'h.1.self_attention.query_key_value.weight', 'h.10.input_layernorm.bias', 'h.10.input_layernorm.weight', 'h.10.mlp.dense_4h_to_h.weight', 'h.10.mlp.dense_h_to_4h.weight', 'h.10.self_attention.dense.weight', 'h.10.self_attention.query_key_value.weight', 'h.11.input_layernorm.bias', 'h.11.input_layernorm.weight', 'h.11.mlp.dense_4h_to_h.weight', 'h.11.mlp.dense_h_to_4h.weight', 'h.11.self_attention.dense.weight', 'h.11.self_attention.query_key_value.weight', 'h.12.input_layernorm.bias', 'h.12.input_layernorm.

tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

In [4]:
# data_files = {'test':'test_augment_data.json'}
data_files = {'test':'val_augment_data.json'}

data = load_dataset("json", data_files=data_files)
data

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['answer', 'question'],
        num_rows: 24546
    })
})

In [5]:
def calculate_rouge4(pred, ref):
    '''
    Purpose: to calculate ROUGE-4 score for a single data point
    @params pred: a list of predictions made by model (str)
    @params ref: a list of references/actual answers (str)
    returns: a dict of evaluation results (dict)
    '''
    # get rouge-4 score; since not included in evaluate load method
    scorer = rouge_scorer.RougeScorer(['rouge4'], use_stemmer=True)
    scores = scorer.score(pred, ref)

    res = scores['rouge4'][2]  # get fmeasure (precision+recall) results

    return res

In [6]:
def calculate_rouge_bleu(preds, refs):
    '''
    Purpose: to calculate ROUGE & BLEU metrics for a set of data samples
    @params preds: a list of predictions made by model (list of str)
    @params refs: a list of references/actual answers (list of str)
    returns: a dict of evaluation results (dict)
    '''

    rouge_metric = evaluate.load("rouge")
    rouge_res = rouge_metric.compute(predictions=preds, references=refs, use_stemmer=True)

    # calculate average rouge-4 score
    rouge4_res = []
    for i in range(len(preds)):
        pred = preds[i]
        ref = refs[i]
        r4 = calculate_rouge4(pred, ref)
        rouge4_res.append(r4)

    r4_avg = np.mean(rouge4_res)

    # put all results together
    res = rouge_res
    rouge_res['rouge4'] = r4_avg

    # calculate bleu scores
    bleu_metric = evaluate.load("bleu")
    bleu_res = bleu_metric.compute(predictions=preds, references=refs)

    for i in range(len(bleu_res['precisions'])):
        name_ix = i+1

        k = f'bleu{name_ix}'
        res[k] = bleu_res['precisions'][i]

    return res

In [7]:
# define function to format data to prompt instruction format
def get_model_response(model, question):
    '''
    Purpose: to generate an answer from model for a given question
    @params model: the loaded LLM model
    @params question: a question for model to answer (str)
    returns: an answer (str)
    '''
    tokens = tokenizer(question, return_tensors="pt")
    input_ids = tokens.input_ids.to(device_type)
    attention_mask = tokens.attention_mask.to(device_type)
    outputs = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=512, no_repeat_ngram_size=2, max_time=7, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return response[0]

In [8]:
questions = data['test']['question'][:5000]
refs = data['test']['answer']  # actual answer

# get model predictions
preds = []
for i in tqdm(range(len(questions))):
    q = questions[i]
    ans = get_model_response(trained_model, q)
    preds.append(ans)

100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [9:45:23<00:00,  7.02s/it]


In [9]:
with open(f'rlhf_falcon_val_set_preds_test.json', 'w', encoding='utf-8') as f:
    json.dump(preds, f, ensure_ascii=False, indent=4)
f.close()

In [10]:
# calculate rouge & bleu metrics
refs = data['test']['answer']
n = 5000  # data size to calculate metrics
eval_results = calculate_rouge_bleu(preds[:n], refs[:n])

print(f'----------\n Results: \n {eval_results}')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

----------
 Results: 
 {'rouge1': 0.07300136394829512, 'rouge2': 0.030422385925886426, 'rougeL': 0.055352078611822855, 'rougeLsum': 0.055357851885465985, 'rouge4': 0.00962228533999513, 'bleu1': 0.042021178650657486, 'bleu2': 0.016342447966505417, 'bleu3': 0.008827271999216936, 'bleu4': 0.005017377038402807}


In [11]:
# calculate jaro-winkler distance
refs = data['test']['answer']
n = 5000 # data size to calculate metrics

jaros = []
for i in range(n):
    pred = preds[i]
    ref = refs[i]
    distance = jaro_winkler(pred, ref)
    jaros.append(distance)

print(f'The average jaro-winkler distance is: {np.mean(jaros)}')

The average jaro-winkler distance is: 0.5733590816562064


In [13]:
evaluator = load_evaluator("embedding_distance", distance_metric=EmbeddingDistance.COSINE)

  warn_deprecated(


In [14]:
# calculate cosine embedding distance
refs = data['test']['answer']
n = 5000 # data size to calculate metrics

cos_dist = []
for i in tqdm(range(n)):
    pred = preds[i]
    ref = refs[i]
    distance = evaluator.evaluate_strings(prediction=pred, reference=ref)
    cos_dist.append(distance['score'])

print(f'The average cosine embedding distance is: {np.mean(cos_dist)}')

100%|███████████████████████████████████████████████████████████████████████████████| 5000/5000 [18:48<00:00,  4.43it/s]

The average cosine embedding distance is: 0.1672721869285779



