In [13]:
"""
https://github.com/davendw49/k2
https://arxiv.org/abs/2306.05064
We also construct GeoBench, an evaluation dataset comprising more than 1500 objective questions and 939 subjective questions collected from:
National Postgraduate Entrance Examination (NPEE) on Geoscience and AP Test Geology, Geography, and Environmental Science.
"""

!git clone https://github.com/davendw49/k2.git

Cloning into 'k2'...
remote: Enumerating objects: 195, done.[K
remote: Counting objects: 100% (60/60), done.[K
remote: Compressing objects: 100% (50/50), done.[K
remote: Total 195 (delta 37), reused 15 (delta 10), pack-reused 135 (from 1)[K
Receiving objects: 100% (195/195), 30.02 MiB | 17.97 MiB/s, done.
Resolving deltas: 100% (96/96), done.


In [14]:
import pandas as pd

aps = pd.read_json("k2/data/geobench/geobench_apstudy.json")
# npee = pd.read_json("k2/data/geobench/geobench_npee.json")

print(aps.keys())
# print(npee.keys())

Index(['id', 'question', 'answerKey'], dtype='object')


In [15]:
def append_text_to_stem_aps(dataframe):

  strings = []
  for index, row in dataframe.iterrows():
    question = row["question"]["stem"]
    answer = [choice["text"] for choice in row["question"]["choices"] if choice["label"] == row["answerKey"]][0]

    strings.append(f"Question: {question}. Answer: {answer}")

  return strings

In [16]:
aps_strings = append_text_to_stem_aps(aps)
print(len(aps_strings))
print(aps_strings[0])

1395
Question: The umbrella theory explaining the Earth's movement, contact, and flattening of large land plates is known as. Answer: plate tectonics


In [18]:
# Write the sentences to the file, each separated by a newline
with open("geobench_ap_perplexity_sample.txt", 'w') as f:
    for sentence in aps_strings:
        f.write(sentence + '\n')

In [None]:
df = pd.DataFrame({'sentence': aps_strings})
df['word_count'] = df['sentence'].apply(lambda x: len(x.split()))

df["word_count"].mean()

21.62078853046595

In [2]:
!pip install tqdm
!pip install -U bitsandbytes
!pip install transformers accelerate bitsandbytes
!pip install datasets
!pip install --upgrade huggingface_hub

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1
Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m15.9 MB/s[0m 

In [3]:
import torch
import os
from google.colab import userdata
# Note: `userdata.get` is a Colab API. If you're not using Colab, set the env
# vars as appropriate for your system.
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import math

from datasets import load_dataset
import nltk

from tqdm import tqdm

In [4]:
# Check so there is a gpu available, a T4(free tier) is enough to run this notebook
assert (torch.cuda.is_available()==True)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B")
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-8B",
                                            quantization_config=quantization_config,
                                            device_map='auto')

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [5]:
def calculate_perplexity(model, tokenizer, text):
    """Calculates the perplexity of a given text using a language model.

    Args:
        model: The language model.
        tokenizer: The tokenizer for the language model.
        text: The text to calculate perplexity for.

    Returns:
        The perplexity of the text.

    Example: calculate_perplexity(model, tokenizer, "The quick brown fox jumps over the lazy dog")
    """
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss
        perplexity = math.exp(loss.item())
    return perplexity


In [6]:
from google.colab import files
import pandas as pd
import io

uploaded = files.upload()
with open("wiki_perplexity_sample.txt", "r") as f:
  wiki_strings = f.readlines()

Saving wiki_perplexity_sample.txt to wiki_perplexity_sample.txt


In [None]:
# get micro perplexity

micro_aps_ppl = [calculate_perplexity(model, tokenizer, sent) for sent in tqdm(aps_strings)]
ppl_sents = list(zip(micro_aps_ppl, aps_strings))
print(sum(micro_aps_ppl) / len(aps_strings))

  0%|          | 0/1395 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
100%|██████████| 1395/1395 [07:03<00:00,  3.29it/s]

39.861749458454426





In [None]:
# save analysis file
df = pd.DataFrame(ppl_sents, columns=["perplexity", "text"])
df.to_csv("perplexity_text_ap_geobench.csv")

In [7]:
# get micro perplexity on general domain
micro_wiki_ppl = [calculate_perplexity(model, tokenizer, sent) for sent in tqdm(wiki_strings)]
wiki_ppl_sents = list(zip(micro_wiki_ppl, wiki_strings))
print(sum(micro_wiki_ppl) / len(wiki_strings))

  0%|          | 0/6000 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
100%|██████████| 6000/6000 [33:36<00:00,  2.98it/s]

29.052969335750973





In [8]:
print(sum(micro_wiki_ppl) / len(wiki_strings))

29.052969335750973


In [9]:
# save analysis file on general domain
df = pd.DataFrame(wiki_ppl_sents, columns=["perplexity", "text"])
df.to_csv("perplexity_text_wiki.csv")

In [12]:
# Meta-Llama-3.1-8B (vanilla)
# 8bit
# micro-ppl ap test QA - 39.861749458454426
# micro-ppl wiki - 29.052969335750973

In [11]:
# Meta-Llama-3.1-8B (CPT'd)
# 8bit
# micro-ppl ap test QA - xx
# micro-ppl wiki - xx