Prova a vedere se riesci a replicare questo benchmark: https://klu.ai/glossary/GSM8K-eval

Risorse per il RAG: https://www.mathlearningcenter.org/educators/free-resources/lessons-publications/practice-books

In [23]:
from datasets import load_dataset
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [25]:
torch.cuda.empty_cache()

# upload the model and construct the generation pipeline

nel paper originale mostrano che al crescere del numero dei parametri del modello la capacità di reasoning migliora -> vedere come va per gli altri phi?
o magari spostarsi su gemma-2b e gemma-7b... capire se ci stanno su ORFEO però

In [26]:
torch.random.manual_seed(0)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    device_map = "cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [27]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "do_sample": False,
}

# upload the dataset: `openai/gsm8k`

In [28]:
dataset = load_dataset('openai/gsm8k', 'main')
train_dataset = dataset['train']
print(train_dataset)

Dataset({
    features: ['question', 'answer'],
    num_rows: 7473
})


In [64]:
N_rows = 500

# extract the correct answer from the dataset

In [30]:
import re

def estrai_numero(input_string):
    # Usa una regex per trovare il numero dopo ###
    match = re.search(r'###\s*(\d+)', input_string)
    if match:
        return match.group(1)
    else:
        return None

In [65]:
correct_answers = []

for i in range(N_rows):
  correct_answers.append(estrai_numero(train_dataset['answer'][i]))

In [32]:
correct_answers[:5]

['72', '10', '5', '42', '624']

# baseline: zero-shot question

In [33]:
def create_message3(question):
    content = f"""
    Question: "{question}".
    """

    messages = [
        {"role": "system",
        "content": """
        You are a helpful AI assistant asked to solve mathematical problems. Output your numerical answer only after these symbols ###.
        """},
        {"role": "user",
        "content": content},
    ]

    return messages

In [None]:
answers_baseline = []
for i in range(N_rows):
  messages = create_message3(train_dataset['question'][i])
  output = pipe(messages, **generation_args)
  answers_baseline.append(output[0]['generated_text'])



In [None]:
answers_baseline_def = []
for i in range(N_rows):
    answers_baseline_def.append(estrai_numero(answers_baseline[i]))

In [43]:
answers_baseline_def[:5]

['72', '10', None, None, '624']

# baseline #2: one-shot question (just the answer, not CoT)

In [44]:
def create_message2(question):
    content = f"""
    Question: "{question}".
    """

    messages = [
        {"role": "system",
        "content": """
        You are a helpful AI assistant asked to solve mathematical problems. Output your numerical answer only after these symbols ###.
        Question: James writes a 3-page letter to 2 different friends twice a week. How many pages does he write a year?
        """},
        {"role": "assistant",
        "content": """
        Assistant: ### 624
        """},
        {"role": "user",
        "content": content},
    ]

    return messages

In [None]:
answers_baseline2 = []
for i in range(N_rows):
  messages = create_message2(train_dataset['question'][i])
  output = pipe(messages, **generation_args)
  answers_baseline2.append(output[0]['generated_text'])

In [None]:
answers_baseline2_def = []
for i in range(N_rows):
    answers_baseline2_def.append(estrai_numero(answers_baseline2[i]))

In [47]:
answers_baseline2_def[:5]

['72', '10', '25', '36', '624']

# one-shot CoT

says who? gli autori: https://arxiv.org/pdf/2201.11903

In [48]:
def create_message(question):
    content = f"""
    Question: "{question}".
    """

    messages = [
        {"role": "system",
        "content": """
        You are a helpful AI assistant asked to solve mathematical problems.
        Question: James writes a 3-page letter to 2 different friends twice a week. How many pages does he write a year?
        """},
        {"role": "assistant",
        "content": """
        Assistant: He writes each friend 3*2=6 pages a week So he writes 6*2=12 pages every week That means he writes 12*52=624 pages a year. ### 624
        """},
        {"role": "user",
        "content": content},
    ]

    return messages

In [None]:
answers_cot = []
for i in range(N_rows):
  messages = create_message(train_dataset['question'][i])
  output = pipe(messages, **generation_args)
  answers_cot.append(output[0]['generated_text'])

In [None]:
answers_cot_def = []
for i in range(N_rows):
    answers_cot_def.append(estrai_numero(answers_cot[i]))

# creazione di un dataset con tutte le nuove risposte

In [None]:
dati = {
    'domanda': train_dataset['question'][:N_rows],
    'corretta': correct_answers,
    'baseline 1': answers_baseline_def,
    'baseline 2': answers_baseline2_def,
    'one-shot CoT': answers_cot_def
}

In [None]:
import pandas as pd

df = pd.DataFrame(dati)
df.head()

In [56]:
# df.to_csv('first-500.csv')

# valutazione delle performance (da sistemare)

In [62]:
my_sum = 0

for i in range(N_rows):
    if correct_answers[i] == answers_cot_def[i]:
        my_sum += 1

In [59]:
my_sum

23

In [61]:
my_sum

19

In [63]:
my_sum

92