# Dataset, documents, FAISS; retriever

## 🔹 Load the dataset 

In [1]:
from datasets import load_dataset

# Load the "train" split of the filtered HotpotQA dataset from the Hugging Face Hub.
# NOTE(review): trust_remote_code=True lets the dataset repository execute its own
# loading script locally — confirm the repository is trusted.
dataset = load_dataset('saracandu/filtered_hotpotQA', split="train", trust_remote_code=True)
# Bare expression so the notebook displays the dataset summary.
dataset

Dataset({
    features: ['question', 'options', 'answer', 'type', 'level', 'selected_passages'],
    num_rows: 352
})

## 🔹 Extract the dataset columns (the full split is used for testing)

In [2]:
# Text-cleaning helper: strips characters outside a small allow-list
# (used on prompts and answer options before constrained generation).
import re
import random

# Cleaning helper that removes characters outside a small allow-list.
def clean_text(text):
    """Return ``text`` with every unsupported character removed.

    Word characters, whitespace and the punctuation ``. , ! ? - : ; ( )`` are
    kept; each run of any other character is deleted.
    """
    disallowed_run = re.compile(r"[^\w\s.,!?\-:;()]+")
    return disallowed_run.sub('', text)

In [3]:
# Pull out the dataset columns used by every stage below.
# The whole split is used here; no sub-sampling takes place.
first_queries = dataset['question']          # question text
correct_answers = dataset['answer']          # gold answer
possibilities = dataset['options']           # answer options offered to the model
sources = dataset['selected_passages']       # supporting passages used as context

# Number of examples iterated over by the generation loops.
N_examples = len(first_queries)

# Model loading and dataset selection (for testing purposes)

## ▪️ Load the model:

In [None]:
# do not run this unless necessary!
# Interactive Hugging Face Hub login; it prompts for an access token.
# NOTE(review): presumably required to download the Gemma weights — confirm.

from huggingface_hub import login
login()

In [4]:
import pandas as pd
import torch
import datasets
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)

import warnings
warnings.filterwarnings("ignore")

In [7]:
# Load the instruction-tuned Gemma 1.1 7B checkpoint in bfloat16 on the GPU.
# NOTE(review): max_length and do_sample are generation settings passed through
# from_pretrained; presumably they land in the model config and act as defaults
# for the later model.generate() calls — confirm against the installed
# transformers version, which may warn about or reject this pattern.
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-1.1-7b-it",
    device_map="cuda",
    torch_dtype=torch.bfloat16,
    max_length = 1000,
    do_sample = False
)

# use_fast=False requests the slow (pure-Python) tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-1.1-7b-it", use_fast=False)

config.json:   0%|          | 0.00/620 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

In [8]:
from guidance import models, select

# Wrap the already-loaded model and tokenizer so that guidance can drive
# constrained generation (e.g. select() over a fixed list of options).
new_model = models.Transformers(model, tokenizer, temperature=0.0)

# new_model + f'Do you want a joke or a poem? A ' + select(['joke', 'poem'])

## ▪️ Test with `guidance`: 

In [9]:
# Sanity check of constrained generation on a toy multiple-choice question.
system_message = """You are a multiple-choice question answering assistant.
Choose which of the following options: a star, a planet, a galaxy is the object below.

Object: the sun
"""

# select() restricts the completion to one of the listed strings.
new_model + system_message + select(["a planet", "a galaxy", "a star"])

In [62]:
from langchain import PromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter

# Experimental synthesis prompt: the critique is presented to the model under the
# "Context" label and the candidate answer under "Attempt".
prompt_template = PromptTemplate.from_template(
"""You are a multiple-choice question answering assistant.
You have a suggestion on which answer is the most appropriate, that is treated as context. Use the suggestion to choose the most proper option.
You also have an attempt of answer that you are suggested to neglect. 

Question: {question}
Attempt: {candidate_answer}
Context: {critique}

The most proper option between {options} is:
"""
)

# Picks each field out of the input dict before it reaches the template.
# NOTE(review): "context" is selected here, but the template above has no
# {context} placeholder, so the passage text is not part of the rendered prompt.
augmentation = {"question": itemgetter("question"),
                "options": itemgetter("options"), 
                "candidate_answer": itemgetter("candidate_answer"),
                "critique": itemgetter("critique"),
                "context": itemgetter("context"), }

# The dict is piped into the template; invoking the chain yields the rendered prompt.
synthesis_chain = augmentation | prompt_template 

In [63]:
# Text-cleaning helper (same definition as earlier in the notebook):
# strips characters outside a small allow-list.
import re
import random

# Cleaning helper that removes characters outside a small allow-list.
def clean_text(text):
    """Strip unsupported characters from ``text``.

    Keeps word characters, whitespace and ``. , ! ? - : ; ( )``; any run of
    other characters is replaced by the empty string.
    """
    pattern = r"[^\w\s.,!?\-:;()]+"
    cleaned = re.sub(pattern, '', text)
    return cleaned

In [64]:
def synthesisGeneration(query, prompt_template, merged, candidate_answer, critique, sources):
    """Render the synthesis prompt and let the model pick one of ``merged``.

    Args:
        query: Question text.
        prompt_template: Unused; the module-level ``synthesis_chain`` renders
            the prompt. Kept so existing calls keep working.
        merged: List of answer options passed to the constrained ``select``.
        candidate_answer: First-attempt answer inserted into the prompt.
        critique: Suggestion text inserted into the prompt.
        sources: Supporting passage, forwarded to the chain under ``context``.

    Returns:
        The guidance model state produced by the generation. Previously the
        result was discarded and the function returned ``None``; returning it
        lets callers inspect or display the selected option.
    """
    # merged = ast.literal_eval(merged)
    augmented_prompt = synthesis_chain.invoke({'question': query,
                                               'options': merged,
                                               'candidate_answer': candidate_answer,
                                               'critique': critique,
                                               'context': sources})

    # Strip characters outside clean_text's allow-list before prompting.
    normal_string = clean_text(augmented_prompt.text)
    return new_model + normal_string + select(merged)

In [65]:
# Smoke test: the candidate answer is wrong and the critique points at the right option.
synthesisGeneration('what is the sun', prompt_template, ['star', 'planet'], 'planet', 
                    'the correct answer is: a star since bot an asteroid and a planet are inadequate and not supported by the context', 
                    'The Sun is the star at the center of the Solar System. It is a massive, nearly perfect sphere of hot plasma, heated to incandescence by nuclear fusion reactions in its core, radiating the energy from its surface mainly as visible light and infrared radiation with 10% at ultraviolet energies.')

In [66]:
# Manual test on a dataset-style example: wrong candidate, correct critique.
source = """ Arthur\'s Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century. 
Edited by T.S. Arthur, it featured work by Edgar A. Poe, J.H. Ingraham, Sarah Josepha Hale, Thomas G. Spear, and others. 
In May 1846 it was merged into "Godey\'s Lady\'s Book. First for Women is a woman\'s magazine published by Bauer Media Group in the USA.  
The magazine was started in 1989. It is based in Englewood Cliffs, New Jersey. In 2011 the circulation of the magazine was 1,310,696 copies.
    """

synthesisGeneration('Which magazine was started first Arthur\'s Magazine or First for Women?', 
                    prompt_template, ['First for Women', 'Arthur\'s Magazine'], 'First for Women', 
                    'the correct answer is Arthur\'s Magazine and the context agrees', 
                    source)

In [67]:
# Manual test with an obviously unrelated distractor as the candidate answer.
source = """ "The Oberoi family is an Indian family that is famous for its involvement in hotels, namely through The Oberoi Group. 
The Oberoi Group is a hotel company with its head office in Delhi. 
Founded in 1934, the company owns and/or operates 30+ luxury hotels and two river cruise ships in six countries, primarily under its Oberoi Hotels & Resorts and Trident Hotels brands.".
    """

synthesisGeneration('The Oberoi family is part of a hotel company that has a head office in what city?', 
                    prompt_template, ['Delhi', 'Sammardenchia'], 'Sammardenchia', 
                    'Delhi is correct', 
                    source)

In [68]:
# Manual test on a two-hop example: wrong candidate, correct critique.
source = """ Allison Beth Allie Goertz (born March 2, 1991) is an American musician. 
Goertz is known for her satirical songs based on various pop culture topics. Her videos are posted on YouTube under the name of Cossbysweater. 
Subjects of her songs have included the film The Room, the character Milhouse from the television show The Simpsons, and the game Dungeons & Dragons. 
Her style has been compared to that of Bo Burnham. In December 2015, Goertz released a concept album based on the Adult Swim series Rick and Morty, Sad Dance Songs, 
with the album\'s cover emulating the animation and logo of the series.  The album was made possible through Kickstarter. 
She is co-host of Everything's Coming Up Podcast, a Simpsons-focused podcast along with Julia Prescott. 
Milhouse Mussolini van Houten is a fictional character featured in the animated television series The Simpsons, voiced by Pamela Hayden, and created by Matt Groening 
who named the character after President Richard Nixon\'s middle name. Later in the series, it is revealed that Milhouse\'s middle name is Mussolini. "
    """

synthesisGeneration('Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who?', 
                    prompt_template, ['Nixon', 'Obama'], 'Obama', 
                    'the correct answer is Nixon', 
                    source)

In [69]:
# Manual test where the critique ('writer') disagrees with the candidate answer ('director').
source = """ 
Nicholas Ray (born Raymond Nicholas Kienzle Jr., August 7, 1911 – June 16, 1979) was an American film director best known for the movie Rebel Without a Cause. 
Elia Kazan (born Elias Kazantzoglou September 7, 1909 – September 28, 2003) was a Greek-American director, producer, writer and actor, described by The New York Times as one of the most honored and influential directors in Broadway and Hollywood history.
    """

synthesisGeneration('What profession does Nicholas Ray and Elia Kazan have in common?', 
                    prompt_template, ['director', 'writer'], 'director', 
                    'the correct answer is writer', 
                    source)

# Thesis

## 🔹 PromptTemplate definition and a LLMChain for the **thesis** 

In [10]:
def create_message(question, options, context):
    """Build the chat-style message list used for the thesis step.

    Args:
        question: Question text.
        options: Answer options; rendered into the prompt with ``str(options)``.
        context: Supporting passage appended to the system instruction.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: a system message
        carrying the instruction plus ``context``, and a user message carrying
        the question and the options.

    Note:
        Later cells of this notebook redefine ``create_message`` with different
        signatures; this three-argument version is the one the thesis step needs.
    """
    # The unused `options_str` / `content` locals were removed: they never reached
    # the prompt, and the join raised TypeError for non-string options.
    # NOTE(review): there is no separator between the options and "Assistant:";
    # left untouched because changing the prompt changes the model's behaviour.
    user_content = (
        "Answer to the following question: " + question
        + " providing one of these options as answer: " + str(options)
        + "Assistant:"
    )

    system_content = (
        "\n        You are an helpful AI assistant. You have to provide helpful "
        "answers to the user’s questions based on the context: \n        "
        + context
    )

    return [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_content},
    ]

In [11]:
create_message(first_queries[0], possibilities[0], sources[0])

[{'role': 'system',
  'content': '\n        You are an helpful AI assistant. You have to provide helpful answers to the user’s questions based on the context: \n        Arthur\'s Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century. Edited by T.S. Arthur, it featured work by Edgar A. Poe, J.H. Ingraham, Sarah Josepha Hale, Thomas G. Spear, and others. In May 1846 it was merged into "Godey\'s Lady\'s Book". First for Women is a woman\'s magazine published by Bauer Media Group in the USA. The magazine was started in 1989. It is based in Englewood Cliffs, New Jersey. In 2011 the circulation of the magazine was 1,310,696 copies.'},
 {'role': 'user',
  'content': 'Answer to the following question: Which magazine was started first, Arthur\'s Magazine or First for Women? providing one of these options as answer: ["Arthur\'s Magazine", \'First for Women\']Assistant:'}]

## 🔹 Function that generates the output given the prompt, the question and the set of options

In [12]:
import ast

def thesisGeneration(query, merged, sources):
    """Generate the thesis: a first answer constrained to the given options.

    Args:
        query: Question text.
        merged: Answer options, either as the string form of a Python list
            (parsed with ``ast.literal_eval``) or as an already-parsed sequence.
        sources: Supporting passage placed in the system message.

    Returns:
        ``str`` of the guidance model state after generation; presumably the
        stringified prompt followed by the selected option (``extract_answer``
        is used to pull the option out).

    Note:
        Relies on the three-argument ``create_message`` defined for the thesis
        step; later cells redefine that name with other signatures.
    """
    # Parse the options only when they are still in string form.
    options = ast.literal_eval(merged) if isinstance(merged, str) else list(merged)
    augmented_prompt = create_message(query, options, sources)
    # Offer every (cleaned) option to select(), not only the first two.
    ans = new_model + str(augmented_prompt) + select([clean_text(option) for option in options])
    return str(ans)

In [13]:
def extract_answer(text):
    """Return the answer that follows the stringified message list in ``text``.

    The thesis prompt is ``str(messages)``, which ends with ``}]``; the selected
    option is whatever comes after that marker.

    Args:
        text: Full generation output (prompt followed by the selected option).

    Returns:
        The stripped text after the last ``}]``, or the sentinel string
        ``"The correct answer could not be found."`` when the marker is absent.
    """
    marker = "}]"
    # Split on the LAST marker: the context may itself contain "}]", and the
    # first occurrence would then cut the prompt in the wrong place.
    _, found, tail = text.rpartition(marker)
    if not found:
        return "The correct answer could not be found."
    return tail.strip()

## 🔹 Test: how well does the thesis alone perform?

In [14]:
# Run the thesis step on every example, keeping only the selected option text.
answers = []
for idx in range(N_examples):
    raw_thesis = thesisGeneration(first_queries[idx], possibilities[idx], sources[idx])
    answers.append(extract_answer(raw_thesis))

In [15]:
answers[:10]

['Arthurs Magazine',
 'Henri Leconte',
 'The Wolfhounds',
 'no',
 'no',
 'New York City',
 'Aleksander Ford',
 'yes',
 'actor',
 'The Saimaa Gesture']

# Antithesis

In [16]:
# Import module for generating prompt templates.
from langchain.prompts import PromptTemplate

# Import module for generating few-shot prompt templates.
from langchain import FewShotPromptTemplate

In [27]:
# one-shot
# Single worked example shown to the model before the real question (antithesis step).
examples =[
  {
    "prompt": """
        Question: What is the sun, a star or a planet?
        Options: ['a star', 'a planet']
        Candidate answer: a planet
        Context: The Sun is the star at the center of the Solar System. It is a massive, nearly perfect sphere of hot plasma, heated to incandescence by nuclear fusion reactions in its core, radiating the energy from its surface mainly as visible light and infrared radiation with 10% at ultraviolet energies.
    """,
    "target": "The correct answer should be 'a star' due to the fact that the context explicitly say so. On the opposite, the context never mentions the fact that the Sun could be a planet."
  }
]

# template -> to extract the answer later 
# The "AI:" label is the marker extract_from_second_ai searches for.
example_template = """
User: {prompt}
AI: {target}
"""

example_prompt = PromptTemplate(
    input_variables=['prompt', 'target'],
    template=example_template
)

# Instruction placed before the worked example.
prefix = """
You are an helpful AI assistant. You are asked to determine the most correct answer for a given question, provided a set of possible options.
You also have at disposal a first tentative answer that you are required to check with respect to the question and the relevant context.
Your goal is to decree which is the most correct answer to the question between the available options.

Here's an example of how to do it:
"""

# Placed after the example; ends with the second "AI:" so the model continues from there.
suffix = """
Now do the same for this question.
User: {prompt}
AI: """

# NOTE: `examples`, `prefix`, `suffix` and `few_shot_prompt_template` are reassigned
# by the pre-synthesis section further down; the antithesis step needs this version.
few_shot_prompt_template = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    prefix=prefix,
    suffix=suffix,
    input_variables=["prompt"],
    example_separator="\n\n"
)

In [28]:
def output_parser(text):
    """Return the part of ``text`` that precedes the first blank-line + ``AI:`` marker.

    If the marker does not occur, ``text`` is returned unchanged.
    """
    head, marker, _ = text.partition("\n\nAI:")
    return head if marker else text

In [29]:
def extract_from_second_ai(text):
    """Extract the text between the second ``AI:`` marker and the next ``<eos>``.

    Args:
        text: Decoded model output (few-shot prompt followed by the generation).

    Returns:
        The stripped text that starts right after the second ``AI:`` and runs
        through the first following ``<eos>`` (the ``<eos>`` token is kept).
        ``None`` when there are fewer than two ``AI:`` markers or when no
        ``<eos>`` follows the second one.
    """
    start_marker = "AI:"
    end_marker = "<eos>"

    # Locate the first two occurrences of the start marker.
    first = text.find(start_marker)
    if first == -1:
        return None
    second = text.find(start_marker, first + 1)
    if second == -1:
        return None

    # The answer ends at the first end marker after the second start marker.
    end = text.find(end_marker, second)
    if end == -1:
        return None

    # Skip the marker itself, keep the end marker, trim surrounding whitespace.
    return text[second + len(start_marker):end + len(end_marker)].strip()

In [30]:
def create_message(question, candidate, options, context):
    """Build the single-message user prompt for the antithesis step.

    Args:
        question: Question text.
        candidate: Candidate (thesis) answer to be checked.
        options: Answer options; rendered with ``str(options)``.
        context: Supporting passage.

    Returns:
        A one-element list holding a ``{"role": "user", "content": ...}`` dict.

    Note:
        This definition replaces the earlier three-argument ``create_message``
        in the notebook namespace.
    """
    # The unused `options_str` / `content` locals were removed: they never reached
    # the prompt, and the join raised TypeError for non-string options.
    user_content = (
        "Question: " + question
        + "\n Options: " + str(options)
        + "\n Candidate answer: " + candidate
        + "\n Context: " + context
        + "\n Assistant: \n"
    )

    return [{"role": "user", "content": user_content}]

## 🔸 Function to generate the antithesis given the question, the thesis, the context and the options

In [31]:
def antithesisGeneration(query, candidate, merged, context):
    """Generate the antithesis (critique of the candidate answer) for one example.

    Args:
        query: Question text.
        candidate: Candidate (thesis) answer to be checked.
        merged: Answer options as the string form of a Python list; parsed
            with ``ast.literal_eval``.
        context: Supporting passage.

    Returns:
        Whatever ``extract_from_second_ai`` extracts from the decoded output.

    Note:
        Uses the module-level ``few_shot_prompt_template`` and ``create_message``
        that are current when the function is called; both names are reassigned
        elsewhere in the notebook.
    """
    options = ast.literal_eval(merged)
    messages = create_message(query, candidate, options, context)
    full_prompt = few_shot_prompt_template.format(prompt=messages)
    encoded = tokenizer(full_prompt, return_tensors="pt").to("cuda")
    generated = model.generate(**encoded)
    decoded = tokenizer.decode(generated[0])
    return extract_from_second_ai(decoded)

In [34]:
# Run the antithesis step on every example, feeding in the thesis answer.
ant_answers = []
for idx in range(N_examples):
    critique = antithesisGeneration(first_queries[idx], answers[idx], possibilities[idx], sources[idx])
    ant_answers.append(critique)

In [36]:
ant_answers[:5]

["_______________________________________________________________\n\n**Please provide the most correct answer to the question, along with a brief explanation.**\n\n**Answer:** The most correct answer is 'Arthur's Magazine'.\n**Explanation:** The context clearly states that Arthur's Magazine was started first, in 1844.<eos>",
 '_______________________________________________________________\n\n**Please provide the most correct answer to the question and the reasoning behind it.**\n\n**Note:** The provided text contains some irrelevant information that should be ignored when making the determination.<eos>',
 '_______________________________________________________________________________\n\n**Note:** The provided context contains some irrelevant information about Courtney Love. Please focus on the founding of the two bands and choose the most accurate answer based on the given options.\n\n\n**Answer options:**\n- The Wolfhounds\n- Hole\n\n**Candidate answer:** The Wolfhounds\n\n**Context

In [41]:
def clean_vector(vector):
    """Return a copy of ``vector`` with formatting residue removed from strings.

    For every string item, all ``_`` characters, ``<eos>`` tokens and ``**``
    markers are deleted and surrounding whitespace is stripped. Non-string
    items are passed through unchanged.
    """
    def _scrub(item):
        # Leave non-string items as they are.
        if not isinstance(item, str):
            return item
        for token in ("_", "<eos>", "**"):
            item = item.replace(token, "")
        return item.strip()

    return [_scrub(item) for item in vector]

# Cleaned copy of the antithesis outputs; ant_answers itself is left untouched.
cleaned_vector = clean_vector(ant_answers)
cleaned_vector[:5]

["**Please provide the most correct answer to the question, along with a brief explanation.**\n\n**Answer:** The most correct answer is 'Arthur's Magazine'.\n**Explanation:** The context clearly states that Arthur's Magazine was started first, in 1844.",
 '**Please provide the most correct answer to the question and the reasoning behind it.**\n\n**Note:** The provided text contains some irrelevant information that should be ignored when making the determination.',
 '**Note:** The provided context contains some irrelevant information about Courtney Love. Please focus on the founding of the two bands and choose the most accurate answer based on the given options.\n\n\n**Answer options:**\n- The Wolfhounds\n- Hole\n\n**Candidate answer:** The Wolfhounds\n\n**Context:** ...The Wolfhounds are an indie pop/noise pop band formed in Romford, UK in 1985...\n...Courtney Michelle Love (born Courtney Michelle Harrison; July 9, 1964) is an American singer, songwriter, actress, and visual artist...\

# Pre Synthesis

In [37]:
# one-shot
# Single worked example for the pre-synthesis step; it adds a "Suggestion" field.
# NOTE: this reassigns `examples`, `prefix`, `suffix` and `few_shot_prompt_template`
# from the antithesis section.
examples =[
  {
    "prompt": """
        Question: What is the sun, a star or a planet?
        Options: ['a star', 'a planet']
        Candidate answer: a planet
        Suggestion: 'a star' is the correct option since the context clearly specifies that the Sun is the star at the center of the Solar System
        Context: The Sun is the star at the center of the Solar System. It is a massive, nearly perfect sphere of hot plasma, heated to incandescence by nuclear fusion reactions in its core, radiating the energy from its surface mainly as visible light and infrared radiation with 10% at ultraviolet energies.
    """,
    "target": "The correct option is 'a star', since the suggestion is grounded in the context ('The Sun is the star at the center of the Solar System'), even if the candidate answer does not agree by saying 'a planet'."
  }
]

# template -> to extract the answer later 
# The "AI:" label is the marker extract_from_second_ai searches for.
example_template = """
User: {prompt}
AI: {target}
"""

example_prompt = PromptTemplate(
    input_variables=['prompt', 'target'],
    template=example_template
)

# Instruction placed before the worked example.
prefix = """
You are an helpful AI assistant. You are asked to determine the most correct answer for a given question, provided a set of possible options.
You also have at disposal a first tentative answer and a suggestion on which is the correct answer.
Your goal is to decree which is the most correct answer to the question between the available options according to the context.

Here's an example of how to do it:
"""

# Placed after the example; ends with the second "AI:" so the model continues from there.
suffix = """
User: {prompt}
AI: """

few_shot_prompt_template = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    prefix=prefix,
    suffix=suffix,
    input_variables=["prompt"],
    example_separator="\n\n"
)

In [38]:
def create_message(question, candidate, suggestion, options, context):
    """Build the single-message user prompt for the pre-synthesis step.

    Args:
        question: Question text.
        candidate: Candidate (thesis) answer.
        suggestion: Critique / suggestion text.
        options: Answer options; rendered with ``str(options)``.
        context: Supporting passage.

    Returns:
        A one-element list holding a ``{"role": "user", "content": ...}`` dict.
    """
    user_content = (
        "Question: " + question
        + "\n Options: " + str(options)
        + "\n Candidate answer: " + candidate
        + "\n Suggestion: " + suggestion
        + "\n Context: " + context
        + "\n Assistant: \n"
    )
    return [{"role": "user", "content": user_content}]

In [27]:
def preSynthGeneration(query, candidate_answer, critique, merged, sources):
    """Generate the pre-synthesis text for one example.

    Args:
        query: Question text.
        candidate_answer: Candidate (thesis) answer.
        critique: Suggestion text (the antithesis output).
        merged: Answer options; passed through as-is and rendered with ``str``
            by ``create_message``.
        sources: Supporting passage.

    Returns:
        Whatever ``extract_from_second_ai`` extracts from the decoded output.
    """
    messages = create_message(query, candidate_answer, critique, merged, sources)
    full_prompt = few_shot_prompt_template.format(prompt=messages)
    encoded = tokenizer(full_prompt, return_tensors="pt").to("cuda")
    generated = model.generate(**encoded)
    decoded = tokenizer.decode(generated[0])
    return extract_from_second_ai(decoded)

In [29]:
# Run the pre-synthesis step on every example (thesis answer + antithesis as inputs).
pre_answers = []
for idx in range(N_examples):
    pre_synthesis = preSynthGeneration(
        first_queries[idx], answers[idx], ant_answers[idx], possibilities[idx], sources[idx]
    )
    pre_answers.append(pre_synthesis)

In [30]:
preSynthGeneration(first_queries[0], answers[0], ant_answers[0], possibilities[0], sources[0])

"The correct answer is 'First for Women', since the context explicitly mentions that the magazine was started first.<eos>"

In [31]:
pre_answers[0]

"The correct answer is 'First for Women', since the context explicitly mentions that the magazine was started first.<eos>"

# Synthesis

## 🔺 PromptTemplate definition and a LLMChain for the **synthesis** 

In [32]:
from langchain import PromptTemplate
from operator import itemgetter

prompt_template = PromptTemplate.from_template(
"""You are a multiple-choice question answering assistant.
Choose the most proper answer between {options} that best matches with the suggestion. 

Question: {question}
Suggestion: {critique}

Assistant:
"""
)

augmentation = {"question": itemgetter("question"),
                "options": itemgetter("options"), 
                "critique": itemgetter("critique"),
                "context": itemgetter("context"), }

synthesis_chain = augmentation | prompt_template 

## 🔺 Function to generate the synthesis given the question, the options, the pre-synthesis answer and the context

In [33]:
def synthesisGeneration(query, prompt_template, merged, pre_answer, sources):
    """Pick the final option with a constrained ``select`` over the cleaned options.

    Args:
        query: Question text.
        prompt_template: Unused; the module-level ``synthesis_chain`` renders
            the prompt. Kept so existing calls keep working.
        merged: Answer options, either as the string form of a Python list
            (parsed with ``ast.literal_eval``) or as an already-parsed sequence.
        pre_answer: Suggestion text placed in the prompt's ``Suggestion`` slot.
        sources: Supporting passage, forwarded to the chain under ``context``.

    Returns:
        ``str`` of the guidance model state after generation; presumably the
        cleaned prompt followed by the selected option.
    """
    # Parse the options only when they are still in string form.
    options = ast.literal_eval(merged) if isinstance(merged, str) else list(merged)
    augmented_prompt = synthesis_chain.invoke({'question': query,
                                               'options': options,
                                               'critique': pre_answer,
                                               'context': sources})

    # Prompt and options go through the same clean_text so they stay consistent.
    normal_string = clean_text(augmented_prompt.text)
    # Offer every option to select(), not only the first two.
    ans = new_model + normal_string + select([clean_text(option) for option in options])
    return str(ans)

In [45]:
def extract_answer(text):
    """Return the option that follows the final ``Assistant:`` line of ``text``.

    The synthesis prompt ends with ``Assistant:`` followed by a newline, and the
    selected option comes right after it. The marker is matched
    case-insensitively: the prompt template writes ``Assistant:`` with a capital
    ``A``, so the previous lowercase-only search succeeded only when a
    lowercasing ``clean_text`` happened to be active in the kernel.

    Args:
        text: Full generation output (prompt followed by the selected option).

    Returns:
        The stripped text after the last marker, or the sentinel string
        ``"The correct answer could not be found."`` when no marker is present.
    """
    import re  # local import keeps this cell runnable on its own

    # Use the LAST marker so an "Assistant:" line inside the question or
    # suggestion cannot be mistaken for the end of the prompt.
    last_end = -1
    for match in re.finditer(r"assistant:\n", text, flags=re.IGNORECASE):
        last_end = match.end()

    if last_end == -1:
        return "The correct answer could not be found."
    return text[last_end:].strip()

In [46]:
# Run the synthesis step on every example, guided by the pre-synthesis text.
syn_answers = []
for idx in range(N_examples):
    raw_synthesis = synthesisGeneration(
        first_queries[idx], prompt_template, possibilities[idx],
        pre_answers[idx], sources[idx])
    syn_answers.append(extract_answer(raw_synthesis))

In [47]:
syn_answers[:10]

['first for women',
 'henri leconte',
 'the wolfhounds',
 'yes',
 'yes',
 'new york city',
 'aleksander ford',
 'yes',
 'actor',
 'the saimaa gesture']

In [164]:
# Template suggestions built directly from the gold answers.
_suggestion_prefix = "the correct option is "
_suggestion_suffix = " since the other options is not mentioned in the context"
def_answers = [
    _suggestion_prefix + clean_text(gold_answer) + _suggestion_suffix
    for gold_answer in correct_answers
]

In [48]:
# Second synthesis pass: the previous synthesis answer is fed back in as the suggestion.
goat_answers = []
for idx in range(N_examples):
    raw_goat = synthesisGeneration(
        first_queries[idx], prompt_template, possibilities[idx],
        syn_answers[idx], sources[idx])
    goat_answers.append(extract_answer(raw_goat))

# Dataset conversion and performances

In [49]:
# Collect the outputs of every stage, one column per stage.
# NOTE: `df` is a plain dict here and is turned into a DataFrame in the next cell.
# NOTE(review): the raw `ant_answers` are stored, not `cleaned_vector` — confirm intended.
df = {
    'query': first_queries,
    'correct': correct_answers,
    'thesis': answers,
    'antithesis': ant_answers,
    'pre-synthesis': pre_answers,
    'synthesis': syn_answers,
    'goat': goat_answers,
    'context': sources
} 

In [50]:
import pandas as pd

# Convert the column dict into a DataFrame (the name `df` is reused for it).
df = pd.DataFrame(df)
df.head()

Unnamed: 0,query,correct,thesis,antithesis,pre-synthesis,synthesis,goat,context
0,"Which magazine was started first, Arthur's Mag...",Arthur's Magazine,Arthurs Magazine,The correct answer is 'First for Women' due to...,"The correct answer is 'First for Women', since...",first for women,arthurs magazine,Arthur's Magazine (1844–1846) was an American ...
1,Which tennis player won more Grand Slam titles...,Jonathan Stark,Henri Leconte,The correct answer is Henri Leconte. The conte...,The correct answer is Henri Leconte. The conte...,henri leconte,henri leconte,Henri Leconte (born 4 July 1963) is a former F...
2,"Which band was founded first, Hole (the rock b...",The Wolfhounds,The Wolfhounds,The correct answer is 'The Wolfhounds' due to ...,The correct answer is 'The Wolfhounds' since t...,the wolfhounds,the wolfhounds,The Wolfhounds are an indie pop/noise pop band...
3,Were Pavel Urysohn and Leonid Levin known for ...,no,yes,The correct answer is 'no' since the context d...,The correct answer is 'no'. The context does n...,yes,yes,Leonid Anatolievich Levin ( ; Russian: Леони́д...
4,Are both The New Pornographers and Kings of Le...,yes,yes,The correct answer is 'no' because the context...,The correct answer is 'no'. The context does n...,yes,yes,Kings of Leon is an American rock band that fo...


In [51]:
# df.to_csv('baseline-true.csv')

In [52]:
# Helper that trims square brackets and spaces from both ends of a string.
def remove_brackets(s):
    """Return ``s`` without leading/trailing ``[``, ``]`` and space characters."""
    edge_chars = "[] "
    trimmed = s.strip(edge_chars)
    return trimmed

# Normalisation helper used before comparing answers.
# NOTE: this replaces the earlier clean_text; unlike that one it also drops
# apostrophes, quotes and hyphens, and lowercases the result.
def clean_text(text):
    """Normalise ``text`` for answer comparison.

    First removes special characters outside the allow-list, then removes
    apostrophes, double quotes and hyphens, and finally lowercases the result.
    """
    without_specials = re.sub(r'[^\w\s.,!?\'"\-:;()]+', '', text)
    without_quotes = re.sub(r"['\"-]", '', without_specials)
    return without_quotes.lower()

# Normalise the short-answer columns so they can be compared by string equality.
for column in ('correct', 'thesis', 'synthesis', 'goat'):
    df[column] = df[column].apply(clean_text)


df.head()

Unnamed: 0,query,correct,thesis,antithesis,pre-synthesis,synthesis,goat,context
0,"Which magazine was started first, Arthur's Mag...",arthurs magazine,arthurs magazine,The correct answer is 'First for Women' due to...,"The correct answer is 'First for Women', since...",first for women,arthurs magazine,Arthur's Magazine (1844–1846) was an American ...
1,Which tennis player won more Grand Slam titles...,jonathan stark,henri leconte,The correct answer is Henri Leconte. The conte...,The correct answer is Henri Leconte. The conte...,henri leconte,henri leconte,Henri Leconte (born 4 July 1963) is a former F...
2,"Which band was founded first, Hole (the rock b...",the wolfhounds,the wolfhounds,The correct answer is 'The Wolfhounds' due to ...,The correct answer is 'The Wolfhounds' since t...,the wolfhounds,the wolfhounds,The Wolfhounds are an indie pop/noise pop band...
3,Were Pavel Urysohn and Leonid Levin known for ...,no,yes,The correct answer is 'no' since the context d...,The correct answer is 'no'. The context does n...,yes,yes,Leonid Anatolievich Levin ( ; Russian: Леони́д...
4,Are both The New Pornographers and Kings of Le...,yes,yes,The correct answer is 'no' because the context...,The correct answer is 'no'. The context does n...,yes,yes,Kings of Leon is an American rock band that fo...


In [53]:
df['antithesis'][0]

"The correct answer is 'First for Women' due to the fact that the context explicitly mentions that the magazine was started first.<eos>"

In [54]:
# Count the rows where the thesis answer equals the gold answer.
matches = sum(
    str(gold).strip() == str(predicted).strip()
    for gold, predicted in zip(df['correct'], df['thesis'])
)
non_matches = len(df) - matches

print(f"Number of matches: {matches}")
print(f"Number of non-matches: {non_matches}")

Number of matches: 194
Number of non-matches: 158


In [55]:
# Count the rows where the synthesis answer equals the gold answer.
matches = sum(
    str(gold).strip() == str(predicted).strip()
    for gold, predicted in zip(df['correct'], df['synthesis'])
)
non_matches = len(df) - matches

print(f"Number of matches: {matches}")
print(f"Number of non-matches: {non_matches}")

Number of matches: 188
Number of non-matches: 164


In [56]:
# Count the rows where the second-pass ("goat") answer equals the gold answer,
# and keep a readable record of every mismatch.
matches = 0
which_ones_syn = []

for gold, predicted in zip(df['correct'], df['goat']):
    gold_text = str(gold).strip()
    predicted_text = str(predicted).strip()
    if gold_text != predicted_text:
        which_ones_syn.append("Synthesis: {}, Correct: {}".format(predicted_text, gold_text))
    else:
        matches += 1

non_matches = len(which_ones_syn)

print(f"Number of matches: {matches}")
print(f"Number of non-matches: {non_matches}")

Number of matches: 170
Number of non-matches: 182


In [175]:
# Persist the results table to CSV.
# NOTE(review): the file name mentions gemma-2b-it, while the model loaded in this
# notebook is google/gemma-1.1-7b-it — confirm the intended file name.
df.to_csv('gg-hf-gemma-2b-it-baseline-2.csv')