In [1]:
import pandas as pd
import torch
import re
import ast
import datasets
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)

from guidance import models, select
from langchain_core.prompts import PromptTemplate
from operator import itemgetter
import warnings
warnings.filterwarnings("ignore")

  backends.update(_get_backends("networkx.backends"))


In [2]:
# Raw question/answer table; the preview below shows the columns used later
# in this notebook (question, answer, level, selected_passages).
HOTPOT_CSV = 'hotpot_bridge.csv'

df = pd.read_csv(HOTPOT_CSV)
df.head(5)

Unnamed: 0,question,answer,type,level,selected_passages,Unnamed: 5,Unnamed: 6
0,The Oberoi family is part of a hotel company t...,Delhi,bridge,medium,The Oberoi family is an Indian family that is ...,,
1,Musician and satirist Allie Goertz wrote a son...,President Richard Nixon,bridge,hard,"Allison Beth ""Allie"" Goertz (born March 2, 199...",,
2,What nationality was James Henry Miller's wife?,American,bridge,medium,James Henry Miller (25 January 1915 – 22 Octob...,,
3,Cadmium Chloride is slightly soluble in this c...,alcohol,bridge,medium,Cadmium chloride is a white crystalline compou...,,
4,Which genus of moth in the world's seventh-lar...,Crambidae,bridge,hard,"India, officially the Republic of India (""Bhār...",,


In [7]:
# Hub repository id of the dataset; only its train split is loaded.
WIKIHOP_REPO = 'saracandu/my_wikihop'

# NOTE(review): `dataset` is not referenced by any other cell visible here —
# confirm this cell is still needed.
dataset = load_dataset(WIKIHOP_REPO, split='train')
dataset

Dataset({
    features: ['Unnamed: 0', 'query', 'answer', 'options', 'supports', 'cleaned_query'],
    num_rows: 1000
})

In [3]:
# Single source of truth for the checkpoint used by both model and tokenizer.
MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# NOTE(review): trust_remote_code=True allows repository code to run locally —
# confirm it is actually required for this checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="cuda", torch_dtype="auto", trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)

# guidance wrapper around the same model/tokenizer pair.
# NOTE(review): `new_model` is not used by the other cells visible here.
new_model = models.Transformers(model, tokenizer, temperature=0.0)

# Text-generation pipeline used to produce the alternative answers.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Greedy decoding, returning only the newly generated text.
# NOTE(review): the 10-token cap cuts longer generations mid-sentence (see the
# previewed answers further down) — confirm this limit is intended.
generation_args = dict(max_new_tokens=10, return_full_text=False, do_sample=False)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
def produce_prompt(question, correct, context):
    """Build the chat messages that ask the model for a wrong alternative answer.

    Args:
        question: The question text.
        correct: The correct answer to the question.
        context: The supporting passage(s) used to answer the question.

    Returns:
        A list with two chat messages (``system`` first, then ``user``), each a
        dict with ``role`` and ``content`` keys. The user message ends with
        ``"Incorrect answer:"`` so the model continues with its proposal.

    Non-string values (for example a numeric answer, or a missing value read
    from the CSV) are rendered with their string form instead of raising
    ``TypeError`` as plain ``+`` concatenation would.
    """
    # f-string formatting accepts any value, unlike str + non-str concatenation.
    user_content = (
        f"Question: {question}\n Correct answer: {correct}\n Context: {context}"
        "\n\n Incorrect answer:"
    )
    # The system prompt text (including its indentation) is kept verbatim so
    # that generations stay comparable with earlier runs.
    messages = [
        {"role": "system", "content": """
        You are a helpful AI assistant. You are given a question and the correct answer to it and the context used to answer it.
        You have to provide me an alternative, wrong answer to it. 
        """},
        {"role": "user", "content": user_content},
    ]
    return messages

In [5]:
# Number of leading rows to generate an alternative answer for. It is chosen so
# that `ans` lines up with the first 1000 rows that make up the exported table
# (capped by the number of rows actually available).
N_EXAMPLES = min(1000, len(df))

ans = []
rows = df.iloc[:N_EXAMPLES]
# Iterating over the column values walks the rows by position, so the loop does
# not depend on the frame's index labels.
for question, correct, passages in zip(
    rows['question'], rows['answer'], rows['selected_passages']
):
    output = pipe(produce_prompt(question, correct, passages), **generation_args)
    # One prompt in, one candidate out: keep the generated text of that candidate.
    ans.append(output[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [6]:
# Preview the first five generated answers.
ans[:5]

['Mumbai.',
 'The incorrect answer is:  Albert Einstein',
 "The nationality of James Henry Miller's wife, Margaret",
 'The incorrect answer is:  Mercury.\n\nReason:',
 "The genus of moth in the world's seventh-largest"]

In [8]:
# Assemble the output columns. Every source column is cut to the number of
# answers actually generated, so all columns have the same length and the
# DataFrame construction in the next cell cannot fail on a length mismatch.
# NOTE: `df` is rebound from the raw table to a plain dict here, so this cell
# cannot be re-run without first re-loading the raw table.
n_rows = len(ans)
df = {
    'question': df['question'][:n_rows],
    'correct': df['answer'][:n_rows],
    'alternative': ans,
    'support': df['selected_passages'][:n_rows],
    'level': df['level'][:n_rows]
}

In [9]:
# Turn the column mapping into a DataFrame (keys become column names), then preview it.
df = pd.DataFrame(data=df)
df.head(5)

Unnamed: 0,question,correct,alternative,support,level
0,The Oberoi family is part of a hotel company t...,Delhi,Mumbai <|end|>,The Oberoi family is an Indian family that is ...,medium
1,Musician and satirist Allie Goertz wrote a son...,President Richard Nixon,President Abraham Lincoln <|end|>,"Allison Beth ""Allie"" Goertz (born March 2, 199...",hard
2,What nationality was James Henry Miller's wife?,American,British <|end|>,James Henry Miller (25 January 1915 – 22 Octob...,medium
3,Cadmium Chloride is slightly soluble in this c...,alcohol,Sodium Chloride <|end|>,Cadmium chloride is a white crystalline compou...,medium
4,Which genus of moth in the world's seventh-lar...,Crambidae,Lepidoptera <|end|>,"India, officially the Republic of India (""Bhār...",hard


In [11]:
def remove_spaces(df, column_name, token='<|end|>'):
    """Remove the end-of-generation marker and surrounding whitespace from a text column.

    Args:
        df: DataFrame holding the column to clean. It is modified in place.
        column_name: Name of the string column to clean.
        token: Literal marker to delete from every value (matched as plain
            text, not as a regular expression).

    Returns:
        The same ``df`` object, with ``column_name`` cleaned.

    Deleting only the marker leaves the space that preceded it
    (``"Mumbai <|end|>"`` would become ``"Mumbai "``), so leading and trailing
    whitespace is stripped as well. Missing values stay missing.
    """
    without_token = df[column_name].str.replace(token, '', regex=False)
    df[column_name] = without_token.str.strip()
    return df

In [12]:
# Clean the generated answers by dropping the '<|end|>' marker from the
# 'alternative' column, then preview the result.
df = remove_spaces(df, 'alternative')
df.head()

Unnamed: 0,question,correct,alternative,support,level
0,The Oberoi family is part of a hotel company t...,Delhi,Mumbai,The Oberoi family is an Indian family that is ...,medium
1,Musician and satirist Allie Goertz wrote a son...,President Richard Nixon,President Abraham Lincoln,"Allison Beth ""Allie"" Goertz (born March 2, 199...",hard
2,What nationality was James Henry Miller's wife?,American,British,James Henry Miller (25 January 1915 – 22 Octob...,medium
3,Cadmium Chloride is slightly soluble in this c...,alcohol,Sodium Chloride,Cadmium chloride is a white crystalline compou...,medium
4,Which genus of moth in the world's seventh-lar...,Crambidae,Lepidoptera,"India, officially the Republic of India (""Bhār...",hard


In [13]:
# Persist the table of questions, correct answers and generated alternatives.
# NOTE(review): by default to_csv also writes the row index as an unnamed first
# column; pass index=False if that column is unwanted — confirm with whatever
# reads this file.
df.to_csv('hotpot-bridge-updated.csv')