<a href="https://colab.research.google.com/github/chen-star/llm_model_trainings/blob/main/7_3_evaluation_quantitative_KL_divergence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1> ⭐ Evaluation ⭐

---

* KL distance (KL divergence) measures the "distance" between two probability distributions.

* `KL distance = sum( p(x) * log (p(x) / q(x)) )`

---

# ✈ Imports

In [3]:
!pip install mauve-text



In [12]:
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset

import mauve
from tqdm import tqdm

import torch
from transformers import AutoTokenizer,AutoModelForCausalLM

import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# 🔢 Hyperparameters

In [5]:
# use GPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [6]:
context_window_size = 1024

# 🏠 Mauve Score

#### Import 2 GPT-2 models

In [7]:
gpt_small = AutoModelForCausalLM.from_pretrained('gpt2').to(device)
gpt_large = AutoModelForCausalLM.from_pretrained('gpt2-large').to(device)
gpt_small.eval()
gpt_large.eval()

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-35): 36 x GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=3840, nx=1280)
          (c_proj): Conv1D(nf=1280, nx=1280)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=5120, nx=1280)
          (c_proj): Conv1D(nf=1280, nx=5120)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1280, out_features=50257, bias=False)
)

In [8]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.set_pad_token_id = tokenizer.eos_token_id

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

#### Generate model output tokens

In [9]:
gpt_small_tokens = []
gpt_large_tokens = []

batch_size = 50
token_per_batch = 200

In [10]:
for _ in tqdm(range(batch_size),desc='Generating tokens for GPT2 small'):

  ### in GPT2-small
  tokens = gpt_small.generate(
      torch.tensor([[tokenizer.bos_token_id]]).to(device),
      pad_token_id = tokenizer.eos_token_id,
      min_length = token_per_batch,
      max_length = token_per_batch,
      do_sample  = True,
      top_k      = 30,
      top_p      = .90,
  )
  gpt_small_tokens.append(tokens[0][1:])

Generating tokens for GPT2 small:   0%|          | 0/50 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Generating tokens for GPT2 small: 100%|██████████| 50/50 [01:55<00:00,  2.32s/it]


In [11]:
for _ in tqdm(range(batch_size),desc='Generating tokens for GPT2 large'):

  ### in GPT2-large
  tokens = gpt_large.generate(
      torch.tensor([[tokenizer.bos_token_id]]).to(device),
      pad_token_id = tokenizer.eos_token_id,
      min_length = token_per_batch,
      max_length = token_per_batch,
      do_sample  = True,
      top_k      = 30,
      top_p      = .90,
  )
  gpt_large_tokens.append(tokens[0][1:])

Generating tokens for GPT2 large: 100%|██████████| 50/50 [04:42<00:00,  5.65s/it]


#### Download human tokens from wiki

In [15]:
dataset = load_dataset('wikitext','wikitext-2-raw-v1',split='train')

In [38]:
human_tokens = []

In [39]:
while len(human_tokens) < batch_size:
  i = np.random.randint(0,len(dataset))
  tokens = tokenizer.encode(dataset[i]['text'], return_tensors='pt')[0]
  if len(tokens) > token_per_batch:
    human_tokens.append(tokens[:token_per_batch-1])

#### Calc Mauve Scores

In [43]:
mauve_score_small = mauve.compute_mauve(
      p_tokens  = gpt_small_tokens,
      q_tokens  = human_tokens,
      verbose   = False,
      device_id = 0
  )

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Featurizing p:   0%|          | 0/50 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/50 [00:00<?, ?it/s]

In [44]:
mauve_score_large = mauve.compute_mauve(
      p_tokens  = gpt_large_tokens,
      q_tokens  = human_tokens,
      verbose   = False,
      device_id = 0
  )

Featurizing p:   0%|          | 0/50 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/50 [00:00<?, ?it/s]

In [46]:
print(f"GPT2-small Mauve Score: {mauve_score_small.mauve}")
print(f"GPT2-large Mauve Score: {mauve_score_large.mauve}")

GPT2-small Mauve Score: 0.3932836980527517
GPT2-large Mauve Score: 0.9569621103217867
