In [1]:
!pip install torch transformers datasets

import torch
from datasets import load_dataset
from transformers import pipeline

# Load the BoolQ dataset from Hugging Face.
# BoolQ is a Boolean (yes/no) QA task that includes a 'passage', 'question', and binary 'answer'.
boolq_dataset = load_dataset("boolq")

# Specify the model to use.
# "google/flan-t5-base" is an example instruction-tuned model; adjust the model_name as desired.
model_name = "google/flan-t5-base"

# Initialize the text-to-text generation pipeline.
# Use GPU if available.
device = 0 if torch.cuda.is_available() else -1
generator = pipeline("text2text-generation", model=model_name, tokenizer=model_name, device=device)

def generate_answer(passage, question):
    """
    Formats the prompt with the passage and question, then uses the LLM to generate an answer.
    The prompt is structured so that the model is primed to answer in 'yes' or 'no'.
    """
    # Create a prompt that instructs the model about the task.
    prompt = f"Passage: {passage}\nQuestion: {question}\nAnswer:"
    # Generate the response; adjust max_length as needed.
    output = generator(prompt, max_length=16, do_sample=False)
    generated_text = output[0]['generated_text']
    return generated_text.strip().lower()

def parse_answer(text):
    """
    Parses the generated text to return a Boolean.
    The function checks for the presence of the words "yes" or "no" in the output.
    """
    if "yes" in text:
        return True
    elif "no" in text:
        return False
    else:
        # If the generated text does not clearly contain yes/no, you may choose to handle it differently.
        # Here we return None, and such examples could be skipped or treated as a wrong answer.
        return None

# Select the validation split of the BoolQ dataset for evaluation.
dataset = boolq_dataset["validation"]

predictions = []
labels = []

print("Evaluating BoolQ benchmark with LLM...")

# Loop over examples in the validation set.
for example in dataset:
    passage = example["passage"]
    question = example["question"]
    true_answer = example["answer"]  # Boolean (True/False)

    # Generate the answer using our LLM
    generated = generate_answer(passage, question)
    predicted = parse_answer(generated)

    # If the answer is ambiguous (i.e. parsing returns None), count it as incorrect.
    if predicted is None:
        predicted = False

    predictions.append(predicted)
    labels.append(true_answer)

# Calculate accuracy.
correct = sum(p == l for p, l in zip(predictions, labels))
accuracy = correct / len(labels)

print(f"Accuracy on the BoolQ validation set: {accuracy:.2%}")


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

README.md:   0%|          | 0.00/6.57k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.69M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9427 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3270 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use cuda:0


Evaluating BoolQ benchmark with LLM...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Token indices sequence length is longer than the specified maximum sequence length for this model (826 > 512). Running this sequence through the model will result in indexing errors


Accuracy on the BoolQ validation set: 75.75%
