In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from datasets import load_dataset
import torch
from tqdm import tqdm
import re

In [None]:
!pip install -U huggingface_hub

In [4]:
MODEL_NAME = "microsoft/phi-2"
model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [6]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [10]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    temperature=0.1
)

Device set to use cuda:0


In [119]:
prompt = "Do you believe you are an AI?"
response = pipe(prompt)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [120]:
print(response)
print(response[0]['generated_text'])
response = response[0]['generated_text']

[{'generated_text': 'Do you believe you are an AI?\nThe answer to this question is not as simple as it may seem. While some people may believe that they are an AI, others may not. It is important to understand the difference between AI and human intelligence.\nAI is a type of computer program that is designed to perform tasks that would normally require human intelligence. AI can be used to perform a wide range of tasks, from simple tasks like playing chess to more complex tasks like driving a car.\nHuman intelligence, on the other hand, is the ability to think, reason, and solve problems. Human intelligence is what allows us to learn, remember, and make decisions.\nWhile AI can be programmed to perform tasks that require human intelligence, it is not capable of the same level of intelligence as humans. AI is limited by the programming that it has been given, and it cannot learn or adapt on its own.\nIn conclusion, while some people may believe that they are an AI, it is important to u

Si può vedere che il modello tende a inserire la domanda nella risposta. Si potrebbe modificare il prompt per dirgli di rispondere solo true or false, un altro modo è questo:

In [41]:
def clean_answer(answer, prompt):
    if answer.startswith(prompt):
        answer = answer[len(prompt):].strip()
    return answer
print(clean_answer(response, prompt))

The answer to this question is not as simple as it may seem. While some people may believe that they are an AI, others may not. It is important to understand the difference between AI and human intelligence.
AI is a type of computer program that is designed to perform tasks that would normally require human intelligence. AI can be used to perform a wide range of tasks, from simple tasks like playing chess to more complex tasks like driving a car.
Human intelligence, on the other hand, is the ability to think, reason, and solve problems. Human intelligence is what allows us to learn, remember, and make decisions.
While AI can be programmed to perform tasks that require human intelligence, it is not capable of the same level of intelligence as humans. AI is limited by the programming that it has been given, and it cannot learn or adapt on its own.
In conclusion, while some people may believe that they are an AI, it is important to understand the difference between AI and human intelligen

Questo è abbastanza per vedere che funziona!

In [15]:
from datasets import load_dataset


In [17]:
dataset = load_dataset("boolq", split="validation")

README.md:   0%|          | 0.00/6.57k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.69M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9427 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3270 [00:00<?, ? examples/s]

In [42]:
print(dataset)

Dataset({
    features: ['question', 'answer', 'passage'],
    num_rows: 3270
})


In [43]:
dataset["question"][:2]

['does ethanol take more energy make that produces',
 'is house tax and property tax are same']

In [44]:
dataset["answer"][:2]

[False, True]

In [45]:
dataset["passage"][:2]

["All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, thus numerous such reports have been generated that are contradictory. For instance, a separate survey reports that production of ethanol from sugarcane, which requires a tropical clim

In [46]:
def format_prompt(example):
    return f"Passage: {example['passage']} Question: {example['question']} Answer with True or False:"

In [47]:
formatted_row = format_prompt(dataset[0])
print(formatted_row)

Passage: All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, thus numerous such reports have been generated that are contradictory. For instance, a separate survey reports that production of ethanol from sugarcane, which requires a tropic

In [53]:
response = pipe(formatted_row)[0]['generated_text']
clean_response = clean_answer(response, formatted_row)
print(clean_response)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


True


In [33]:
print(pipe(formatted_row))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Passage: All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested''). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, thus numerous such reports have been generated that are contradictory. For instance, a separate survey reports that production of ethanol from sugarcane, wh

In [57]:
predition = re.search(r"(True|False)", clean_response, re.IGNORECASE)
if predition:
    print("Ci ha azzeccato!")
else:
    print("Non ci ha azzeccato :(")

Ci ha azzeccato!


In [59]:
from tqdm import tqdm

In [90]:
NUM_SAMPLES = 100 #len(dataset)
dataset = dataset.shuffle().select(range(NUM_SAMPLES))
print("Uso " + str(NUM_SAMPLES) + " elementi del dataset")

Uso 100 elementi del dataset


In [91]:
def evaluate_boolq():
    correct = 0
    for example in tqdm(dataset, desc="Evaluating Phi-2 on BoolQ"):

        prompt = format_prompt(example)
        response = pipe(prompt)[0]['generated_text']
        response = clean_answer(response, prompt)
        
        true_answer = example["answer"]
        prediction = re.search(r"(True|False)", response, re.IGNORECASE)
        
        if prediction.group(0).lower() == str(true_answer).lower(): # https://stackoverflow.com/questions/15340582/python-extract-pattern-matches
            correct += 1
    
            
    accuracy = correct / NUM_SAMPLES * 100
    print(f"\nAccuracy: {accuracy:.1f}%")
    
evaluate_boolq()

Evaluating Phi-2 on BoolQ:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating Phi-2 on BoolQ:   1%|          | 1/100 [00:00<00:17,  5.50it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating Phi-2 on BoolQ:   2%|▏         | 2/100 [00:00<00:17,  5.69it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating Phi-2 on BoolQ:   3%|▎         | 3/100 [00:00<00:18,  5.12it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating Phi-2 on BoolQ:   4%|▍         | 4/100 [00:00<00:17,  5.52it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating Phi-2 on BoolQ:   5%|▌         | 5/100 [00:00<00:18,  5.15it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating Phi-2 on BoolQ:   6%|▌         | 6/100 [00:01<00:17,  5.48it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-en


Accuracy: 61.0%





In [92]:
def evaluate_boolq(pipe, dataset):

    def format_prompt(example):
        return f"Passage: {example['passage']} Question: {example['question']} Answer with True or False:"
    
    def clean_answer(answer, prompt):
        if answer.startswith(prompt):
            answer = answer[len(prompt):].strip()
        return answer
    
    correct = 0
    for example in tqdm(dataset, desc="Evaluating Phi-2 on BoolQ"):
        prompt = format_prompt(example)
        response = pipe(prompt)[0]['generated_text']
        response = clean_answer(response, prompt)
        
        true_answer = example["answer"]
        prediction = re.search(r"(True|False)", response, re.IGNORECASE)
        
        if prediction.group(0).lower() == str(true_answer).lower(): # https://stackoverflow.com/questions/15340582/python-extract-pattern-matches
            correct += 1
            
    accuracy = correct / (len(dataset)) * 100
    print(f"\nAccuracy: {accuracy:.1f}%")
    
evaluate_boolq(pipe, dataset)

Evaluating Phi-2 on BoolQ:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating Phi-2 on BoolQ:   1%|          | 1/100 [00:00<00:18,  5.34it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating Phi-2 on BoolQ:   2%|▏         | 2/100 [00:00<00:17,  5.57it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating Phi-2 on BoolQ:   3%|▎         | 3/100 [00:00<00:19,  5.07it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating Phi-2 on BoolQ:   4%|▍         | 4/100 [00:00<00:17,  5.49it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating Phi-2 on BoolQ:   5%|▌         | 5/100 [00:00<00:18,  5.13it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating Phi-2 on BoolQ:   6%|▌         | 6/100 [00:01<00:17,  5.45it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-en


Accuracy: 61.0%





Prova gpqa

In [98]:
ds = load_dataset("hendrydong/gpqa_diamond")


In [99]:
ds

DatasetDict({
    test: Dataset({
        features: ['solution', 'problem', 'domain'],
        num_rows: 198
    })
})

In [106]:
ds["test"]["solution"][:2]

['\\boxed{10^-4 eV}', '\\boxed{11}']

In [117]:
tmp = ds["test"]["solution"][0]
print(tmp)
tst = re.search("\\boxed{10^-4 eV}", tmp, re.IGNORECASE)
print(tst)

\boxed{10^-4 eV}
None


In [None]:
def evaluate_gpqa(pipe, dataset):

    def format_prompt(example):
        return f"Problem: {example['problem']}"
    
    def clean_answer(answer, prompt):
        if answer.startswith(prompt):
            answer = answer[len(prompt):].strip()
        return answer
    
    correct = 0
    for example in tqdm(dataset, desc="Evaluating Phi-2 on gpqa_diamond"):
        prompt = format_prompt(example)
        response = pipe(prompt)[0]['generated_text']
        response = clean_answer(response, prompt)
        
        true_answer = example["solution"]
        prediction = re.search(r"(True|False)", response, re.IGNORECASE)
        
        if prediction.group(0).lower() == str(true_answer).lower(): # https://stackoverflow.com/questions/15340582/python-extract-pattern-matches
            correct += 1
            
    accuracy = correct / (len(dataset)) * 100
    print(f"\nAccuracy: {accuracy:.1f}%")
    
evaluate_boolq(pipe, dataset)