In [7]:
from datasets import load_dataset

# Load the GSM8K "main" configuration, training split only, from the Hugging Face Hub.
# `trust_remote_code` is deliberately not passed, so no repository-hosted loading
# script is allowed to execute while fetching the data.
dataset = load_dataset('openai/gsm8k', 'main', split="train")
dataset

ModuleNotFoundError: No module named 'datasets'

In [3]:
# Build one reference text per training example: the question immediately
# followed by its worked answer (no separator is inserted between the two).
# Each column is fetched once; indexing `dataset['question'][i]` inside a loop
# can re-read the entire column on every iteration.
docs = [
    question + answer
    for question, answer in zip(dataset['question'], dataset['answer'])
]

In [12]:
import pandas as pd

# Wrap the concatenated question+answer texts in a single-column frame
# (column name 'docs') and preview the first rows.
dati = dict(docs=docs)
df = pd.DataFrame(data=dati)
df.head()

Unnamed: 0,docs
0,Natalia sold clips to 48 of her friends in Apr...
1,Weng earns $12 an hour for babysitting. Yester...
2,Betty is saving money for a new wallet which c...
3,"Julie is reading a 120-page book. Yesterday, s..."
4,James writes a 3-page letter to 2 different fr...


In [13]:
# df.to_csv('references-gsm8k.csv')

In [1]:
from langchain.document_loaders import HuggingFaceDatasetLoader

# Load the published reference set from the Hugging Face Hub; the 'docs' column
# supplies each Document's page_content. Show the first Document as a sanity check.
loader = HuggingFaceDatasetLoader(
    path='saracandu/references-gsm8k',
    page_content_column='docs',
)
documents = loader.load()
documents[0]

ModuleNotFoundError: Module langchain_community.document_loaders not found. Please install langchain-community to access this module. You can install it using `pip install -U langchain-community`

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# create an instance of the RecursiveCharacterTextSplitter class with specific parameters
# (chunk_size=2000 with chunk_overlap=20, i.e. chunks of up to 2000 characters that
# share a 20-character overlap under the splitter's default length function)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=20)

# 'documents' holds the loaded Documents; split them into chunks with the text splitter.
# NOTE: this rebinds 'docs', which earlier in the notebook held the plain concatenated strings.
docs = text_splitter.split_documents(documents)

In [41]:
# choose an embedding method: the sentence-transformers all-MiniLM-L6-v2 model,
# wrapped in LangChain's HuggingFaceEmbeddings so the vector store can call it
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",  
)

In [42]:
# embed the chunks in 'docs' into vectors using the embedding method held in 'embeddings';
# the result is stored in a FAISS index:
db = FAISS.from_documents(docs, embeddings)

# to avoid recomputing it each time (since the docs won't change), persist the index
# to the local folder 'faiss_db' under the index name 'GSM8K_FaissIndex_MiniLM'
db.save_local(folder_path="faiss_db", index_name="GSM8K_FaissIndex_MiniLM")

In [43]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Re-create the same embedding model that was used to build the index, so that
# query vectors live in the same space as the stored document vectors.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",  
)

# Reload the FAISS index that was previously saved to disk.
# NOTE(review): allow_dangerous_deserialization=True opts in to pickle-based loading;
# only point this at index folders you created yourself.
db = FAISS.load_local(
    folder_path="faiss_db", # folder the index was saved to
    embeddings=embeddings, # embedding model used to encode incoming queries
    index_name="GSM8K_FaissIndex_MiniLM", # name under which the index was saved in that folder
    allow_dangerous_deserialization=True
)

In [44]:
# Expose the FAISS index through the retriever interface; 'k': 5 asks the
# underlying search for five documents per query.
retriever = db.as_retriever(
    search_kwargs={'k': 5,}
) 

In [45]:
# Use the questions of the loaded split as retrieval queries and show the first one.
queries = dataset['question']
queries[0]

'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?'

In [46]:
# Retrieve the stored passages most similar to the first question.
# NOTE(review): the displayed result shows the query's own solved example as the
# first hit — presumably because the index was built from the same questions.
documents_retrieved = retriever.invoke(queries[0])

In [47]:
# Display the raw retrieved Document objects (page_content plus metadata).
documents_retrieved

[Document(page_content='"Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?Natalia sold 48/2 = <<48/2=24>>24 clips in May.\\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\\n#### 72"', metadata={'Unnamed: 0': 0}),
 Document(page_content='"Nina makes one-of-a-kind jewelry and sells it at a local boutique.  She charges $25.00 for her necklaces, $15.00 for bracelets and $10.00 for a pair of earrings.  Over the weekend, she sold 5 necklaces, 10 bracelets, 20 earrings, and received 2 orders for a complete jewelry ensemble that Nina charged $45.00 for.  How much money did she make over the weekend?She sold 5 necklaces at $25.00 apiece for a total of 5*25 = $<<5*25=125.00>>125.00\\nShe sold 10 bracelets for $15.00 each for a total of 10*15 = $<<10*15=150.00>>150.00\\nShe sold 20 earrings for $10.00 a pair for a total of 20*10 = $<<20*10=200.00>>200.00\\nShe sold 2 e

In [48]:
def format_page_content(documents):
    """
    Render retrieved documents as a numbered plain-text listing.

    Every document contributes one entry of the form "[i]: <page_content>"
    terminated by a newline, with entries numbered from 1. Only the
    `page_content` attribute is read (metadata is ignored), and only space
    characters are trimmed from both ends of the content.

    Returns the concatenation of all entries; an empty input yields "".
    """
    # NOTE(review): the printed samples show page_content still wrapped in double
    # quotes with literal backslash-n sequences; they are passed through untouched.
    numbered_entries = (
        f"[{position}]: {document.page_content.strip(' ')}\n"
        for position, document in enumerate(documents, start=1)
    )
    return "".join(numbered_entries)

In [49]:
# Turn the retrieved Documents into the numbered text block used as prompt context.
formatted_context = format_page_content(documents_retrieved)

In [50]:
# print() is used here so the newlines between the numbered entries are rendered.
print(formatted_context)

[1]: "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72"
[2]: "Nina makes one-of-a-kind jewelry and sells it at a local boutique.  She charges $25.00 for her necklaces, $15.00 for bracelets and $10.00 for a pair of earrings.  Over the weekend, she sold 5 necklaces, 10 bracelets, 20 earrings, and received 2 orders for a complete jewelry ensemble that Nina charged $45.00 for.  How much money did she make over the weekend?She sold 5 necklaces at $25.00 apiece for a total of 5*25 = $<<5*25=125.00>>125.00\nShe sold 10 bracelets for $15.00 each for a total of 10*15 = $<<10*15=150.00>>150.00\nShe sold 20 earrings for $10.00 a pair for a total of 20*10 = $<<20*10=200.00>>200.00\nShe sold 2 ensembles for $45.00 each for a total of 2*45 = $<<2*45=90.00>>90.00\nAll t

In [52]:
import torch
import re
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [53]:
# Checkpoint shared by the causal LM and its tokenizer.
checkpoint = "microsoft/Phi-3-mini-4k-instruct"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load the model onto the GPU, letting the library pick the dtype.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)

# Text-generation pipeline wrapping the model/tokenizer pair.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Greedy decoding (no sampling), at most 500 new tokens, and only the newly
# generated text is returned (the prompt is not echoed back).
generation_args = dict(
    max_new_tokens=500,
    return_full_text=False,
    do_sample=False,
)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [54]:
def create_message_baseline_RAG(question, context):
    """
    Build the two-message chat prompt for the retrieval-augmented baseline.

    Args:
        question: the problem to solve; interpolated into the user message.
        context: text of the retrieved solved examples; interpolated into the
            user message after the question.

    Returns:
        A list with a system message (fixed instructions asking for the numerical
        answer after '###') followed by a user message holding the question and
        the context, each as a {"role", "content"} dict.
    """
    # The leading/trailing whitespace is part of the prompt and is kept exactly.
    system_prompt = (
        "\n"
        "        You are a helpful AI assistant asked to solve mathematical problems using similar problems already solved as context. \n"
        "        Output your numerical answer only after these symbols ###.\n"
        "        "
    )
    user_prompt = (
        "\n"
        f'    Question: "{question}";\n'
        f'    Context: "{context}".\n'
        "    "
    )
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

In [55]:
# Preview the chat messages produced for the first question and its retrieved context.
create_message_baseline_RAG(queries[0], formatted_context)

[{'role': 'system',
  'content': '\n        You are a helpful AI assistant asked to solve mathematical problems using similar problems already solved as context. \n        Output your numerical answer only after these symbols ###.\n        '},
 {'role': 'user',
  'content': '\n    Question: "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?";\n    Context: "[1]: "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?Natalia sold 48/2 = <<48/2=24>>24 clips in May.\\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\\n#### 72"\n[2]: "Nina makes one-of-a-kind jewelry and sells it at a local boutique.  She charges $25.00 for her necklaces, $15.00 for bracelets and $10.00 for a pair of earrings.  Over the weekend, she sold 5 necklaces, 10 bracelets, 20 earrings, and 

In [56]:
# Build the chat prompt for the first question, run the pipeline with the shared
# generation_args (greedy decoding, prompt not echoed), and print the generated text.
messages = create_message_baseline_RAG(queries[0], formatted_context)
output = pipe(messages, **generation_args)
print(output[0]['generated_text'])

You are not running the flash-attention implementation, expect numerical differences.


 Natalia sold 48 clips in April and 24 clips in May, so she sold 48 + 24 = 72 clips altogether in April and May.
#### 72


# baseline zero-shot

In [1]:
from datasets import load_dataset
import torch
import re
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# fix the torch RNG seed (generation below uses do_sample=False, i.e. no sampling)
torch.random.manual_seed(0)

##############################################################################

# load the model and the tokenizer

# the model is placed on the GPU ("cuda") and the dtype is chosen automatically
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    device_map = "cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

##############################################################################

# construct the pipeline and fix the generation arguments
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# at most 500 new tokens, return only the newly generated text, no sampling
generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "do_sample": False,
}

##############################################################################

# load the dataset (all splits of the 'main' configuration) and select the test set
# NOTE: 'dataset' is loaded here without a split argument, unlike the first cell of
# the notebook where the same name held only the train split.
dataset = load_dataset('openai/gsm8k', 'main')
test_dataset = dataset['test']

# function to extract the answer from the golden answers
def estrai_numero(input_string):
    """
    Extract the number that follows the first '###' marker in a text.

    The marker may be three or more '#' characters (so both '### 72' and
    '#### 72' are recognised), optionally followed by whitespace and a '$'.
    The number may carry a leading minus sign, thousands separators
    ('1,250') and a decimal part ('12.50').

    Args:
        input_string: text to scan (a reference solution or a model output).

    Returns:
        The number as a string with thousands separators removed
        (e.g. '1250', '-5', '12.50'), or None when the input is empty/None
        or no marker followed by a number is found.
    """
    if not input_string:
        return None
    # Thousands-grouped digits are tried first so that "1,250" is captured whole,
    # while "72, because ..." still yields just "72".
    match = re.search(
        r'#{3,}\s*\$?\s*(-?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?)',
        input_string,
    )
    if match is None:
        return None
    return match.group(1).replace(',', '')

# extract the correct answers: one parsed final number per test example
# (the 'answer' column is read once instead of being re-indexed on every iteration)
correct_answers = [estrai_numero(answer) for answer in test_dataset['answer']]

##############################################################################

# load a previously saved FAISS index with the matching embedding model and use it as a retriever

# the embedding model must be the one the index was built with
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",  
)

# NOTE(review): the folder is "../faiss_db" here, whereas the index-building cell
# saved to "faiss_db" — confirm the working directory this cell is run from.
# NOTE(review): allow_dangerous_deserialization=True opts in to pickle-based loading;
# only point this at index folders you created yourself.
db = FAISS.load_local(
    folder_path="../faiss_db", # folder the index is read from
    embeddings=embeddings, # embedding model used to encode incoming queries
    index_name="GSM8K_FaissIndex_MiniLM", # name under which the index was saved in that folder
    allow_dangerous_deserialization=True
)

# 'k': 5 asks the underlying search for five documents per query
retriever = db.as_retriever(
    search_kwargs={'k': 5,}
)

##############################################################################

# function to format the retrieved documents in a proper way 

def format_page_content(documents):
    """
    Render retrieved documents as a numbered plain-text listing.

    Every document contributes one entry of the form "[i]: <page_content>"
    terminated by a newline, with entries numbered from 1. Only the
    `page_content` attribute is read (metadata is ignored), and only space
    characters are trimmed from both ends of the content.

    Returns the concatenation of all entries; an empty input yields "".
    """
    numbered_entries = (
        f"[{position}]: {document.page_content.strip(' ')}\n"
        for position, document in enumerate(documents, start=1)
    )
    return "".join(numbered_entries)

##############################################################################

# function to create the prompt for each question

def create_message_baseline_RAG(question, context):
    """
    Build the two-message chat prompt for the retrieval-augmented baseline.

    Args:
        question: the problem to solve; interpolated into the user message.
        context: text of the retrieved solved examples; interpolated into the
            user message after the question.

    Returns:
        A list with a system message (fixed instructions asking for the numerical
        answer after '###') followed by a user message holding the question and
        the context, each as a {"role", "content"} dict.
    """
    # The leading/trailing whitespace is part of the prompt and is kept exactly.
    system_prompt = (
        "\n"
        "        You are a helpful AI assistant asked to solve mathematical problems using similar problems already solved as context. \n"
        "        Output your numerical answer only after these symbols ###.\n"
        "        "
    )
    user_prompt = (
        "\n"
        f'    Question: "{question}";\n'
        f'    Context: "{context}".\n'
        "    "
    )
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
##############################################################################

# Raw model outputs and the numeric answers parsed from them, one entry per test question.
answers_baseline_RAG = []
answers_baseline_RAG_def = []

# The question column is read once; indexing test_dataset['question'][i] inside
# the loop can re-read the whole column on every iteration.
for question in test_dataset['question']:
    # retrieve similar solved problems and format them as numbered context
    relevant_passages = format_page_content(retriever.invoke(question))
    messages = create_message_baseline_RAG(question, relevant_passages)
    output = pipe(messages, **generation_args)
    generated_text = output[0]['generated_text']
    answers_baseline_RAG.append(generated_text)
    # parse the number following the '###' marker (None when absent)
    answers_baseline_RAG_def.append(estrai_numero(generated_text))



KeyboardInterrupt: 

In [None]:
##############################################################################

# Collect the evaluation columns: the question, the reference answer, the raw
# model output and the number parsed from it.
results = {
    'query': test_dataset['question'],
    'correct': correct_answers,
    'long answer': answers_baseline_RAG,
    'answer': answers_baseline_RAG_def,
}

# One row per test question; written to CSV with the default index column.
df = pd.DataFrame.from_dict(results)
df.to_csv('testset-baseline-RAG-zeroshot.csv')