In [27]:
import pandas as pd

mini = pd.read_csv('def/phi-mini/cot/phimini-cot-wikihop.csv')
med = pd.read_csv('def/phi-medium/cot/phimedium-cot-wikihop.csv')
# two = pd.read_csv('def/gemma-2-2b-it/cot/gemma-2-2b-it-cot.csv')
# nine = pd.read_csv('def/gemma-2-9b-it/cot/gemma-2-9b-it-cot.csv')

In [28]:
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from guidance import models, select

In [38]:
med.head()

Unnamed: 0.1,Unnamed: 0,query,correct,thesis,antithesis,pre-synthesis,synthesis,context
0,0,What event did Juan Rossell participate in?,1996 summer olympics,1996 summer olympics,Let's analyze the options and the context pro...,The correct answer is '1996 summer olympics'....,1996 summer olympics,"The 2004 Summer Olympic Games, held in Athens..."
1,1,What languages did John Osteen speak or write?,english,koine greek,The context provided does not give any inform...,The correct option is 'english'. The context ...,english,Christianity is a monotheistic religion based...
2,2,What is the parent taxon of Proaigialosaurus?,lepidosauria,diapsid,To determine the parent taxon of Proaigialosa...,"Based on the context provided, the parent tax...",reptile,Herpetology is the branch of zoology that stu...
3,3,What is the parent taxon of Australosuchus?,crocodilia,plant,Let's analyze the options given and compare t...,"The correct answer is'mekosuchinae', as it is...",mekosuchinae,Mekosuchinae was a subfamily of crocodiles th...
4,4,What was the occupation of Cao Chong?,physicist,science,Let's analyze the options and the context pro...,"Based on the context provided, Cao Chong was ...",general,The Three Kingdoms period (220280) in China w...


In [39]:
med = med.drop(['Unnamed: 0'], axis = 1)

In [5]:
from langchain_core.prompts import PromptTemplate
from operator import itemgetter

# prompt augmentation for the (format of the) synthesis:
prompt_template = PromptTemplate.from_template(
"""You are a multiple-choice question answering assistant.
Choose the most proper option between {options} that best matches with the suggestion. 

Question: {question}
Context: {critique}
Sources: {context}

Assistant:
"""
)
augmentation = {"question": itemgetter("question"),
                "options": itemgetter("options"), 
                "critique": itemgetter("critique"),
                "context": itemgetter("context"), }
synthesis_chain = augmentation | prompt_template 

In [6]:
import ast
import re

# Definisci una funzione di pulizia per rimuovere caratteri non validi
def clean_text(text):
    return re.sub(r"[^\w\s.,!?\-:;()]+", '', text)

def synthesisGeneration(query, merged, pre_answer, sources):
    merged = ast.literal_eval(merged)
    merged.append('not clear given the context')
    augmented_prompt = synthesis_chain.invoke({'question': query, 
                                            'options': merged,
                                            'critique': pre_answer,
                                            'context': sources})

    normal_string = clean_text(augmented_prompt.text)
    ans = new_model + normal_string + select(merged)
    return str(ans)

def extract_answer_synthesis(text):
    # Trova l'indice in cui inizia il testo "Why or why not the answer is correct:"
    start_index = text.find("\n\nAssistant:\n")

    
    # Se l'indice è stato trovato, estrai la risposta corretta
    if start_index != -1:
        start_index += len("\n\nAssistant:\n")
        # Estrai il testo dopo "Why or why not the answer is correct:"
        correct_answer_text = text[start_index:].strip()
        return correct_answer_text
    else:
        return "The correct answer could not be found."

In [7]:
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct", use_fast=False)
new_model = models.Transformers(model, tokenizer, temperature=0.0)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

' + message['content'] + '<|end|>
'}}{% elif message['role'] == 'user' %}{{'<|user|>
' + message['content'] + '<|end|>
'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>
' + message['content'] + '<|end|>
'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>
' }}{% else %}{{ eos_token }}{% endif %} was unable to be loaded directly into guidance.
                        Defaulting to the ChatML format which may not be optimal for the selected model. 
                        For best results, create and pass in a `guidance.ChatTemplate` subclass for your model.


In [8]:
df = pd.read_csv('wikihop_dataset/wikihop-merged-summarized.csv')

# select a subset of the queries, just for test:
first_queries = df['query']

# same for correct answers and distractors:
correct_answers = df['answer']
possibilities = df['options']

# and for the sources:
sources = df['sum_supports']

In [40]:
syn_answers = []
for i in range(len(df)):
    syn_answers.append(extract_answer_synthesis(
        synthesisGeneration(
            first_queries[i], possibilities[i], 
            med['pre-synthesis'][i], sources[i])))

In [41]:
ant_answers = []
for i in range(len(df)):
    ant_answers.append(extract_answer_synthesis(
        synthesisGeneration(
            first_queries[i], possibilities[i], 
            med['antithesis'][i], sources[i])))

In [42]:
df = {
    'query': first_queries,
    'correct': correct_answers,
    'thesis': med['thesis'],
    'pre-antithesis': med['antithesis'],
    'antithesis': ant_answers,
    'pre-synthesis': med['pre-synthesis'],
    'synthesis': syn_answers,
    'context': sources
} 

df = pd.DataFrame(df)

In [43]:
def clean_text_final(text):
    text = re.sub(r'[^\w\s.,!?\'"\-:;()]+', '', text)  # Rimuove i caratteri speciali
    text = re.sub(r"['\"-]", '', text)  # Rimuove apostrofi, virgolette e trattini
    text = text.lower()  # Converte in minuscolo
    return text

In [44]:
# Applica la funzione alla colonna 'correct answer'
df['correct'] = df['correct'].apply(clean_text_final)
df['thesis'] = df['thesis'].apply(clean_text_final)
df['antithesis'] = df['antithesis'].apply(clean_text_final)
df['synthesis'] = df['synthesis'].apply(clean_text_final)

In [45]:
df.head()

Unnamed: 0,query,correct,thesis,pre-antithesis,antithesis,pre-synthesis,synthesis,context
0,What event did Juan Rossell participate in?,1996 summer olympics,1996 summer olympics,Let's analyze the options and the context pro...,1996 summer olympics,The correct answer is '1996 summer olympics'....,1996 summer olympics,"The 2004 Summer Olympic Games, held in Athens..."
1,What languages did John Osteen speak or write?,english,koine greek,The context provided does not give any inform...,english,The correct option is 'english'. The context ...,english,Christianity is a monotheistic religion based...
2,What is the parent taxon of Proaigialosaurus?,lepidosauria,diapsid,To determine the parent taxon of Proaigialosa...,reptile,"Based on the context provided, the parent tax...",gymnophiona,Herpetology is the branch of zoology that stu...
3,What is the parent taxon of Australosuchus?,crocodilia,plant,Let's analyze the options given and compare t...,mekosuchinae,"The correct answer is'mekosuchinae', as it is...",mekosuchinae,Mekosuchinae was a subfamily of crocodiles th...
4,What was the occupation of Cao Chong?,physicist,science,Let's analyze the options and the context pro...,mathematician,"Based on the context provided, Cao Chong was ...",mathematician,The Three Kingdoms period (220280) in China w...


In [46]:
df.to_csv('cot-phimedium-wikihop.csv')