<a href="https://colab.research.google.com/github/chen-star/llm_model_trainings/blob/main/7_2_evaluation_quantitative_HellaSwag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1> ⭐ Evaluation ⭐ </h1>

---

* **Goal:** To test an LLM's ability to perform *commonsense reasoning*.

* **How does it work?**

The core task is sentence completion. Models are given a context, typically a short description of an everyday situation or activity, and then presented with four possible endings. The model must select the most plausible and commonsense ending from the options.

---

# ✈ Imports

In [1]:
#  !pip install bitsandbytes

In [2]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F

from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer,AutoModelForCausalLM, BitsAndBytesConfig

import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# 🔢 Hyperparameters

In [3]:
# run on the first CUDA GPU when one is visible, otherwise fall back to the CPU
if torch.cuda.is_available():
  device = torch.device('cuda:0')
else:
  device = torch.device('cpu')

# last expression of the cell: display which device was picked
device

device(type='cuda', index=0)

In [4]:
# presumably the maximum prompt length in tokens; 1024 equals the size of GPT-2's
# position-embedding table (wpe) printed further down
# NOTE(review): not referenced by any cell shown in this notebook — confirm whether
# prompts were meant to be truncated to this length before scoring
context_window_size = 1024

# 🐶 HellaSwag

In [9]:
# func to test one HellaSwag data sample
def test_one_sample(model, sample, tokenizer):
  ctx = sample['ctx']
  log_prob_sum_per_option = np.zeros(len(sample['endings']))

  for option in range(len(sample['endings'])):
    prompt = f"{ctx} {sample['endings'][option]}"
    prompt_tokens = tokenizer.encode(prompt, return_tensors = 'pt').to(device)

    with torch.no_grad():
      logits = model(prompt_tokens).logits

    log_probs = F.log_softmax(logits, dim = -1)

    # sum of log_probs of predicated tokens
    log_prob_sum = np.array([
        log_probs[0, i, prompt_tokens[0][i+1]].item()
        for i in range(0, len(prompt_tokens[0])-1)
    ])
    log_prob_sum_per_option[option] = log_prob_sum.sum()

  # return the log_prob for each option, and the true answer
  return log_prob_sum_per_option, int(sample['label'])

#### Compare the HellaSwag results for different models

Import Zephyr Model

In [10]:
# load the Zephyr-7B-alpha checkpoint with bitsandbytes 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True,                 # linear layers are loaded as 4-bit (Linear4bit) layers
    bnb_4bit_use_double_quant = True,    # nested quantization setting of bitsandbytes
    bnb_4bit_compute_dtype = 'float16',  # dtype used for computation on the 4-bit weights
)

zephyr_model = AutoModelForCausalLM.from_pretrained(
    'HuggingFaceH4/zephyr-7b-alpha',
    quantization_config = quantization_config,
)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [11]:
# switch to evaluation mode, then make sure the model sits on the selected device;
# both calls return the model itself, so the cell still displays its repr
# NOTE(review): depending on the installed transformers/bitsandbytes versions,
# `.to()` on a quantized model may be rejected — confirm before upgrading
zephyr_model.eval().to(device)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
     

In [12]:
# tokenizer for the Zephyr model, loaded from the same model id as the weights
zephyr_tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-alpha')

Import GPT-2 Model

In [14]:
# load the pretrained GPT-2 checkpoint (no quantization config) and move it to the selected device
gpt2_model = AutoModelForCausalLM.from_pretrained('gpt2').to(device)

In [15]:
# switch to evaluation mode so the model's Dropout layers are inactive while scoring;
# as the last expression of the cell this also displays the model structure
gpt2_model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [17]:
# GPT-2 tokenizer; always pair it with gpt2_model, since the two models printed above
# use different vocabulary sizes (50257 for GPT-2 vs 32000 for Zephyr)
gpt2_tokenizer = AutoTokenizer.from_pretrained('gpt2')

Run test for both models

In [20]:
# HellaSwag validation split; the evaluation reads 'ctx', 'endings' and 'label' from each row
dataset = load_dataset('hellaswag',split='validation')

In [23]:
# number of validation rows to score; the loop below takes rows 0 .. sample_size-1
# in dataset order (no shuffling or random sampling)
sample_size = 666
# one row per model (0 = Zephyr, 1 = GPT-2), one column per sample; 1 = correct, 0 = wrong
results = np.zeros((2, sample_size))

def is_correct(log_prob_sum_per_option, true_answer):
  """Return 1 if the highest-scoring option is the labelled answer, otherwise 0.

  Ties are resolved the way `np.argmax` resolves them: the first maximum wins.
  """
  predicted_option = np.argmax(log_prob_sum_per_option)
  return int(predicted_option == true_answer)

# (row in `results`, model, tokenizer) for each model being compared;
# the order matches the original run: Zephyr first, then GPT-2
contenders = (
    (0, zephyr_model, zephyr_tokenizer),
    (1, gpt2_model, gpt2_tokenizer),
)

for i in tqdm(range(sample_size), desc='HellaSwag Evaluation'):
  one_sample = dataset[i]

  # score the same sample with every model and record a hit (1) or a miss (0)
  for result_row, eval_model, eval_tokenizer in contenders:
    log_prob_sum_per_option, true_answer = test_one_sample(eval_model, one_sample, eval_tokenizer)
    results[result_row, i] = is_correct(log_prob_sum_per_option, true_answer)

# accuracy = fraction of samples where the top-scoring ending was the labelled one
print(f"Zephyr Accuracy: {np.mean(results[0])}")
print(f"GPT-2 Accuracy: {np.mean(results[1])}")

HellaSwag Evaluation: 100%|██████████| 666/666 [09:35<00:00,  1.16it/s]

Zephyr Accuracy: 0.5285285285285285
GPT-2 Accuracy: 0.35585585585585583



