In [None]:
import os
from google.colab import userdata
# Expose the Hugging Face token through the HF_TOKEN environment variable.
# `userdata.get` is a Colab-only API; outside Colab, set the variable in
# whatever way suits your system.
os.environ.update(HF_TOKEN=userdata.get("HF_TOKEN"))

In [None]:
!pip install torch -q
!pip install tqdm -q
!pip install -U bitsandbytes -q
!pip install transformers accelerate bitsandbytes -q
!pip install evaluate -q
!pip install datasets==2.21.0 -q # weird problem with load_metric
!pip install --upgrade huggingface_hub -q

In [None]:
from google.colab import files
import io
import pandas as pd
import json
from datasets import load_dataset
from evaluate import load
import torch
from transformers import AutoTokenizer, pipeline, AutoModelForQuestionAnswering, AutoModelForCausalLM, AutoModelForSeq2SeqLM, BitsAndBytesConfig
from tqdm import tqdm

# Custom dataset

In [None]:
# Upload the Q&A spreadsheet and parse it straight from the uploaded bytes.
# Colab saves a re-uploaded file under a new name (e.g. "50_Q&A_Pairs (5).csv"),
# so reading a hard-coded path could silently pick up a stale copy.
uploaded = files.upload()
if not uploaded:
  raise ValueError("No file was uploaded; expected a CSV with 'Question' and 'Answer' columns.")
uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
df = pd.read_csv(io.BytesIO(uploaded_bytes))
print(df.keys())
display(df.head(3))  # a bare `df.head(3)` in the middle of a cell shows nothing

# Keep only the two columns used for evaluation. `rename` returns a new frame,
# which avoids mutating a slice of `df` (SettingWithCopyWarning).
df_filtered = df[['Question', 'Answer']].rename(columns={"Question": "question", "Answer": "answer"})
data_to_save = df_filtered.to_dict(orient='records')
# add id
data_to_save = [{**el, "id": str(idx)} for idx, el in enumerate(data_to_save)]
with open("50_Q&A_Pairs.json", 'w') as f:
  json.dump(data_to_save, f, indent=4)

# Reload the records as a datasets.Dataset with "question", "answer" and "id" columns.
custom_dataset = load_dataset("json", data_files="50_Q&A_Pairs.json", split="train")

Saving 50_Q&A_Pairs.csv to 50_Q&A_Pairs (5).csv
Index(['User Type', 'Question', 'Answer', 'Source [1]', '[2]', '[3]', '[4]'], dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.rename(columns={"Question": "question", "Answer": "answer"}, inplace=True)


Generating train split: 0 examples [00:00, ? examples/s]

# Gen datasets

In [None]:
# Benchmark used for the general-domain evaluation: the validation split of nq_open.
# Alternatives that were considered:
#   load_dataset("natural_questions", split="validation")  # 140 GB disk space
#   load_dataset("squad_v2", split="validation")  # https://paperswithcode.com/sota/question-answering-on-squad20
#   load_dataset("ScienceQA", split="validation")
dataset = load_dataset(path="nq_open", split="validation")

In [None]:
# Give every sample a string "id" (its row index) and reduce "answer" to its
# first entry, in a single pass over the dataset.
# The isinstance guard keeps this cell idempotent: on a re-run "answer" is
# already a string, and indexing it with [0] would keep only its first character.
dataset = dataset.map(
    lambda example, idx: {
        "id": str(idx),
        "answer": example["answer"] if isinstance(example["answer"], str) else example["answer"][0],
    },
    with_indices=True,
)

Map:   0%|          | 0/3610 [00:00<?, ? examples/s]

Map:   0%|          | 0/3610 [00:00<?, ? examples/s]

# Models

In [None]:
# Checkpoint under evaluation. To compare models, swap in one of the alternatives:
# model_name = "meta-llama/Llama-3.1-8B-Instruct"
# model_name = "universeTBD/astrollama"
# model_name = "Tijmen2/cosmosage-v3.1"
model_name = "meta-llama/Llama-3.1-8B"

# Request 8-bit loading of the model weights.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    quantization_config=quantization_config,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Fix torch's global RNG seed so that sampled generations can be reproduced.
torch.random.manual_seed(42)

<torch._C.Generator at 0x78a0b0e2bcf0>

# Generate and Evaluate

In [None]:
# Text-generation pipeline built around the model/tokenizer pair loaded above.
# The tokenizer's EOS token id is passed as the padding token id.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id,
    device_map="auto",
)

In [None]:
# SQuAD v2 metric from `evaluate`.
# Metric card: https://huggingface.co/spaces/evaluate-metric/squad_v2
metric = load(path="squad_v2")

In [None]:
def eval_qa(examples):
  """Generate an answer for every question and score the answers with `metric`.

  Relies on the notebook-level `generator` (text-generation pipeline) and
  `metric` (loaded from "squad_v2") objects.

  Args:
    examples: Iterable of dict-like samples with the keys "id", "question"
      and "answer". "answer" may be a single string or a list/tuple of
      acceptable answer strings.

  Returns:
    The result of `metric.compute` for the collected predictions and references.
  """
  preds = []
  refs = []

  for sample in tqdm(examples):
    question = sample["question"]
    outputs = generator(question, max_length=150, do_sample=True, truncation=True)
    # Remove every occurrence of the question from the generated text so that
    # only the model's continuation is scored.
    generated_answer = outputs[0]["generated_text"].replace(question, "").strip()

    # The squad_v2 metric expects "text" to be a list of answer strings, with
    # one "answer_start" entry per answer. A bare string must be wrapped,
    # otherwise it is not treated as a single gold answer.
    gold = sample["answer"]
    gold_texts = [gold] if isinstance(gold, str) else list(gold)

    preds.append({"id": sample["id"], "prediction_text": generated_answer, "no_answer_probability": 0.})
    refs.append({"id": sample["id"], "answers": {"answer_start": [0] * len(gold_texts), "text": gold_texts}})

  # Compute exact match and F1 scores
  return metric.compute(predictions=preds, references=refs)

In [None]:
# Quick check on the first five samples of the custom Q&A dataset.
custom_score = eval_qa(custom_dataset.select(range(5)))
print(custom_score)

100%|██████████| 5/5 [02:03<00:00, 24.60s/it]


[{'id': '0', 'answers': {'answer_start': [0], 'text': 'Active sensors generate their own energy to illuminate the objects they observe. They emit radiation towards the target and then measure the radiation that is reflected or backscattered back to the sensor. This allows active sensors to operate independently of external light sources. In contrast, passive sensors detect natural energy, such as sunlight, that is emitted or reflected by the objects or scenes they observe. The most common source of radiation for passive sensors is reflected sunlight, which they measure to gather information about the target.'}}, {'id': '1', 'answers': {'answer_start': [0], 'text': 'The study investigates ecosystem service (ES) supply in peri-urban watersheds (PUWs) across Greece using earth observation (EO) data and empirical models. Utilising various geospatial datasets within a GIS environment, the research focuses on assessing soil conservation and water retention services at the watershed scale. Fi

In [None]:
# Same quick check on the first five samples of the nq_open validation split.
score = eval_qa(dataset.select(range(5)))
print(score)

100%|██████████| 5/5 [02:07<00:00, 25.45s/it]


[{'id': '0', 'answers': {'answer_start': [0], 'text': '14 December 1972 UTC'}}, {'id': '1', 'answers': {'answer_start': [0], 'text': 'Bobby Scott'}}, {'id': '2', 'answers': {'answer_start': [0], 'text': 'one'}}, {'id': '3', 'answers': {'answer_start': [0], 'text': '2017'}}, {'id': '4', 'answers': {'answer_start': [0], 'text': 'South Carolina'}}]
{'exact': 0.0, 'f1': 3.5134408602150535, 'total': 5, 'HasAns_exact': 0.0, 'HasAns_f1': 3.5134408602150535, 'HasAns_total': 5, 'best_exact': 0.0, 'best_exact_thresh': 0.0, 'best_f1': 3.5134408602150535, 'best_f1_thresh': 0.0}
