In [18]:
import pandas as pd
import ast
import re
import datasets
from datasets import load_dataset
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)
from guidance import models, select
from langchain_core.prompts import PromptTemplate
from operator import itemgetter


import warnings
warnings.filterwarnings("ignore")

In [19]:
# Definisci una funzione di pulizia per rimuovere caratteri non validi
def clean_text(text):
    return re.sub(r"[^\w\s.,!?\-:;()]+", '', text)

# Definisci una funzione di pulizia per rimuovere caratteri non validi
def clean_text_final(text):
    text = re.sub(r'[^\w\s.,!?\'"\-:;()]+', '', text)  # Rimuove i caratteri speciali
    text = re.sub(r"['\"-]", '', text)  # Rimuove apostrofi, virgolette e trattini
    text = text.lower()  # Converte in minuscolo
    return text

In [3]:
# prompt augmentation for the (format of the) synthesis:
prompt_template = PromptTemplate.from_template(
"""You are a multiple-choice question answering assistant.
Choose the most proper option between {options} that best matches with the suggestion. 

Question: {question}
Context: {critique}
Sources: {context}

Assistant:
"""
)
augmentation = {"question": itemgetter("question"),
                "options": itemgetter("options"), 
                "critique": itemgetter("critique"),
                "context": itemgetter("context"), }
synthesis_chain = augmentation | prompt_template 

In [4]:
def create_message_thesis(question, options, context):
    options_str = '", "'.join(options)
    content = f"""

    Now do the same for this question: "{question}", where options: ["{options_str}"]. Assistant:
    """

    user_content = "Answer to the following question: " + question + " providing one of these options as answer: " + str(options) + "Assistant:"

    messages = [
        {"role": "system", "content": """
        You are an helpful AI assistant. You have to provide helpful answers to the user’s questions based on the context: 
        """ + context},
        {"role": "user", "content": user_content}
    ]

    return messages

def extract_answer_thesis(text):
    # Trova l'indice in cui inizia il testo "Why or why not the answer is correct:"
    start_index = text.find("}]")

    
    # Se l'indice è stato trovato, estrai la risposta corretta
    if start_index != -1:
        start_index += len("}]")
        # Estrai il testo dopo "Why or why not the answer is correct:"
        correct_answer_text = text[start_index:].strip()
        return correct_answer_text
    else:
        return "The correct answer could not be found."

def thesisGeneration(query, merged, sources):
    merged = ast.literal_eval(merged)
    augmented_prompt = create_message_thesis(query, merged, sources)
    ans = new_model + str(augmented_prompt) + select(merged)
    return str(ans)

In [5]:
def create_message_antithesis(question, candidate, options, context):
    options_str = '", "'.join(options)
    content = f"""

    Now do the same for this question: "{question}", where options: ["{options_str}"]. Assistant:
    """

    user_content = "Question: " + question + "\n Options: " + str(options) + "\n Candidate answer: " + candidate + "\n Context: " + context + "\n Assistant: \n"

    messages = [
        {"role": "system", "content": """
        You are an helpful AI assistant. You are asked to determine the most correct answer for a given question, provided a set of possible options.
        You also have at disposal a first tentative answer that you are required to check with respect to the question and the relevant context.
        Your goal is to decree which is the most correct answer to the question between the available options.

        Here's an example of how to do it:
        """},
        {"role": "user", "content": """
        Question: What event did Juan Rossell participate in?
        Options: ['1996 summer olympics', 'olympic games', 'sport']
        Candidate answer: 1996 summer olympics
        Context: The 2004 Summer Olympic Games, held in Athens, Greece, marked the return of the games to their birthplace. With a motto of "Welcome Home," the event featured 10,625 athletes from 201 countries competing in 301 medal events across 28 sports. This edition of the Games was significant as it was the first time all countries with a National Olympic Committee participated.\n\nIn the realm of beach volleyball, a sport introduced to the Olympics in 1996, Juan Miguel Rossell Milanes from Cuba emerged as a notable figure. He won the gold medal in the men\'s beach team competition at the 2003 Pan American Games in Santo Domingo, Dominican Republic, partnering Francisco Alvarez. Rossell Milanes also represented Cuba at the 1996 and 2004 Summer Olympics.\n\nThe Pan American Games, a major sporting event in the Americas, also featured beach volleyball as a competitive discipline. The 14th Pan American Games took place in Santo Domingo, Dominican Republic, in 2003. The successful bid for the games was made in the mid-1990s, when the Dominican Republic experienced one of the highest growth rates in Latin America.\n\nThe International Olympic Committee (IOC) voted in 1986 to separate the Summer and Winter Games, which had been held in the same year since 1924, and place them in alternating even-numbered years, beginning in 1994. The 1996 Summer Games were the first to be staged in a different year from the Winter Games. Atlanta, Georgia, United States, hosted the 1996 Summer Games, becoming the fifth American city to host the Olympic Games and the third to hold a Summer Olympic Games.

        Assistant: Let's consider each option and check whether or not is the proper one with respect to the context.'sport' is an extremely generic answer, inappropriate for the question asked. 'olympic games' is not the most proper answer, since it is true that Juan Rossell introduced beach volleyball as an olympic sport, but he participated only at Summer Olympics, both in 1996 and in 2003. Therefore, the correct option is '1996 summer olympics'.

        """
        },
        {"role": "system", "content": "Now do the same for the following question:"},
        {"role": "user", "content": user_content}
    ]

    return messages

def antithesisGeneration(query, merged, candidate, sources):
    merged = ast.literal_eval(merged)
    prompt = create_message_antithesis(query, candidate, merged, sources)
    output = pipe(prompt, **generation_args)
    return output[0]['generated_text']

In [6]:
def create_message_presynthesis(question, candidate, suggestion, options, context):

    user_content = "Question: " + question + "\n Options: " + str(options) + "\n Candidate answer: " + candidate + "\n Suggestion: " + suggestion + "\n Context: " + context + "\n Assistant: \n"

    messages = [
        {"role": "system", "content": """
        You are an helpful AI assistant. You are asked to determine the most correct answer for a given question, provided a set of possible options.
        You also have at disposal a first tentative answer and a suggestion on which is the correct answer.
        Your goal is to decree which is the most correct answer to the question between the available options according to the context.

        Here's a few examples on how to do it:
        """},
        {"role": "user", "content": """
        Question: What event did Juan Rossell participate in?
        Options: ['1996 summer olympics', 'olympic games', 'sport']
        Candidate answer: 1996 summer olympics
        Suggestion: Let's consider each option and check whether or not is the proper one with respect to the context.'sport' is an extremely generic answer, inappropriate for the question asked. 'olympic games' is not the most proper answer, since it is true that Juan Rossell introduced beach volleyball as an olympic sport, but he participated only at Summer Olympics, both in 1996 and in 2003. Therefore, the correct option is '1996 summer olympics'.
        Context: The 2004 Summer Olympic Games, held in Athens, Greece, marked the return of the games to their birthplace. With a motto of "Welcome Home," the event featured 10,625 athletes from 201 countries competing in 301 medal events across 28 sports. This edition of the Games was significant as it was the first time all countries with a National Olympic Committee participated.\n\nIn the realm of beach volleyball, a sport introduced to the Olympics in 1996, Juan Miguel Rossell Milanes from Cuba emerged as a notable figure. He won the gold medal in the men\'s beach team competition at the 2003 Pan American Games in Santo Domingo, Dominican Republic, partnering Francisco Alvarez. Rossell Milanes also represented Cuba at the 1996 and 2004 Summer Olympics.\n\nThe Pan American Games, a major sporting event in the Americas, also featured beach volleyball as a competitive discipline. The 14th Pan American Games took place in Santo Domingo, Dominican Republic, in 2003. The successful bid for the games was made in the mid-1990s, when the Dominican Republic experienced one of the highest growth rates in Latin America.\n\nThe International Olympic Committee (IOC) voted in 1986 to separate the Summer and Winter Games, which had been held in the same year since 1924, and place them in alternating even-numbered years, beginning in 1994. The 1996 Summer Games were the first to be staged in a different year from the Winter Games. Atlanta, Georgia, United States, hosted the 1996 Summer Games, becoming the fifth American city to host the Olympic Games and the third to hold a Summer Olympic Games.
        """
        },
        {"role": "assistant", "content": """
        Assistant: The correct option is '1996 summer olympics', both the candidate answer and the suggestion agree.
        """
        },
        

        
        {"role": "system", "content": "Now do the same for the following question:"},
        {"role": "user", "content": user_content}
    ]

    return messages

def preSynthGeneration(query, candidate_answer, critique, merged, sources):
    prompt = create_message_presynthesis(query, merged, candidate_answer, critique, sources)
    output = pipe(prompt, **generation_args)
    return output[0]['generated_text']

In [7]:

def synthesisGeneration(query, merged, pre_answer, sources):
    merged = ast.literal_eval(merged)
    augmented_prompt = synthesis_chain.invoke({'question': query, 
                                            'options': merged,
                                            'critique': pre_answer,
                                            'context': sources})

    normal_string = clean_text(augmented_prompt.text)
    ans = new_model + normal_string + select(merged)
    return str(ans)

def extract_answer_synthesis(text):
    # Trova l'indice in cui inizia il testo "Why or why not the answer is correct:"
    start_index = text.find("Assistant:\n")

    
    # Se l'indice è stato trovato, estrai la risposta corretta
    if start_index != -1:
        start_index += len("Assistant:\n")
        # Estrai il testo dopo "Why or why not the answer is correct:"
        correct_answer_text = text[start_index:].strip()
        return correct_answer_text
    else:
        return "The correct answer could not be found."

In [8]:
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct", use_fast=False)
new_model = models.Transformers(model, tokenizer, temperature=0.0)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "do_sample": False,
}

In [20]:
df = pd.read_csv('../../../wikihop_dataset/wikihop-merged-summarized.csv')

# select a subset of the queries, just for test:
first_queries = df['query']

# same for correct answers and distractors:
correct_answers = df['answer']
possibilities = df['options']

# and for the sources:
sources = df['sum_supports']

N_rows = 5

In [25]:
possibilities[1]

"['english', 'greek', 'koine greek', 'nahuatl', 'spanish']"

In [28]:
sources[5]

' King Kobra, a hard rock band founded by drummer Carmine Appice, was active from 1983 to 1986. The band\'s first two albums, "Ready to Strike" and "Thrill of a Lifetime," were released in 1985 and 1986, respectively. The band\'s third album, "King Kobra III," was released in 1988. After the band\'s dissolution, Appice joined guitarist John Sykes on his Blue Murder project in 1989.\n\nBernardo Chavez Rico, known as B.C., was an American luthier specializing in guitars. He was born in East Los Angeles, California, and began his career building Flamenco, Classical guitars, banjos, and ukuleles in the 1950s. Rico\'s original instruments were acoustic guitars made under the name B.C. Rico. These acoustic guitars are very rare, with approximately 300 surviving.\n\nDokken is an American metal band formed in Los Angeles in 1979. The band split up in 1989 and reformed in 1993. They achieved moderate chart success in the late 1980s and early 1990s.\n\nVan Halen is an American hard rock band for

In [29]:
first_queries[5]

'What was the occupation of Johnny Rod?'

In [14]:
# THESIS
answers = []
for i in range(N_rows):
    answers.append(extract_answer_thesis(thesisGeneration(first_queries[i], possibilities[i], sources[i])))


# ANTITHESIS
ant_answers = []
for i in range(N_rows):
    ant_answers.append(antithesisGeneration(first_queries[i], possibilities[i], answers[i], sources[i]))

# format antithesis
format_answers = []
for i in range(N_rows):
    format_answers.append(extract_answer_synthesis(
        synthesisGeneration(
            first_queries[i], possibilities[i], 
            ant_answers[i], sources[i])))

# SYNTHESIS
pre_answers = []
for i in range(N_rows):
    pre_answers.append(preSynthGeneration(first_queries[i], possibilities[i], answers[i], ant_answers[i], sources[i]))


# format synthesis
syn_answers = []
for i in range(N_rows):
    syn_answers.append(extract_answer_synthesis(
        synthesisGeneration(
            first_queries[i], possibilities[i], 
            pre_answers[i], sources[i])))

In [15]:
syn_answers

['1996 summer olympics', 'english', 'reptile', 'crocodilia', 'chancellor']