In [1]:
import pandas as pd
from datasets import load_dataset

# Read the question / correct-answer / context table and preview its first rows.
csv_path = 'init-narrativeqa.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,questions,correct_answer,sources
0,0,WHO NORMALLY DELIVERS THE OPENING PROLOGUE IN ...,THE ACTOR WEARING THE BLACK CLOAK,The play begins with three pages disputing ov...
1,1,WHAT NAME WAS CYNTHIA MORE FAMOUSLY KNOWN BY?,THE GODDESS DIANA,The play begins with three pages disputing ov...
2,2,WHO DOES ECHO WEEP FOR?,NARCISSUS,The play begins with three pages disputing ov...
3,3,WHAT DOES A DRINK FROM NARCISSUS'S SPRING CAUS...,FALL IN LOVE WITH THEMSELVES,The play begins with three pages disputing ov...
4,4,IN WHAT VALLEY DID THE SOLEMN REVELS OF CYNTHI...,GARGAPHIE IN GREECE,The play begins with three pages disputing ov...


In [2]:
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# One checkpoint id shared by the model load and the tokenizer load.
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# repository to run locally, and device_map="cuda" requires a GPU; confirm both
# are intended.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="cuda",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
from transformers import pipeline 

def produce_prompt(question, correct, source):
    """Build the chat messages asking for a wrong-but-realistic answer.

    Args:
        question: Question text (str).
        correct: The correct answer to ``question`` (str).
        source: Context passage for the question (str).

    Returns:
        A list with two ``{"role": ..., "content": ...}`` dicts: first the
        system message carrying the task instructions, then the user message
        that embeds the question, the correct answer and the context.
    """
    pad = " " * 8  # each instruction line is indented by eight spaces
    system_text = "".join((
        "\n",
        pad, "You are a helpful AI assistant. You are given a question and the correct answer to it. \n",
        pad, "Given the context, you have to provide a wrong, yet realistic, alternative answer to the same question given the context. \n",
        pad, "For example you can refer to another item cited in the text which is not the correct answer to the question. \n",
        pad,
    ))

    # str.join (like `+`) raises TypeError when an argument is not a string.
    user_text = "".join((
        "Now give me a wrong alternative answer: \n",
        "Question: ", question,
        "\n Correct answer: ", correct,
        "\n Context: ", source,
        "\n\n Assistant:",
    ))

    system_message = {"role": "system", "content": system_text}
    user_message = {"role": "user", "content": user_text}
    return [system_message, user_message]

# Text-generation pipeline wrapping the already-loaded model and tokenizer.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Keyword arguments unpacked into the `pipe(...)` call for each prompt.
generation_args = dict(
    max_new_tokens=70,
    return_full_text=False,
    temperature=0.0,
    do_sample=False,
)

In [9]:
# Generate one wrong-but-plausible answer per row, in row order.
alternatives = []

# Walk the three columns in lockstep so rows are taken by position.
# `df[col][i]` is a label lookup, which only matches `range(len(df))` while the
# index is the default 0..n-1 (it breaks after filtering or re-indexing).
for question, correct, source in zip(df['questions'], df['correct_answer'], df['sources']):
    messages = produce_prompt(question, correct, source)
    output = pipe(messages, **generation_args)
    # Keep the generated text of the first returned candidate.
    alternatives.append(output[0]['generated_text'])

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [10]:
# Show the first five raw generations.
alternatives[:5]

[' The actor wearing the white robe. <|end|>',
 ' The wrong alternative answer could be: The Goddess Diana was more famously known as Athena. <|end|>',
 ' WHO DOES ECHO WEEP FOR?\nIncorrect answer: Apollo <|end|>',
 " A wrong alternative answer could be: A drink from Narcissus's spring causes the drinker to become invisible. <|end|>",
 ' The wrong alternative answer: The solemn revels of Cynthia took place in the valley of Olympus in Greece. <|end|>']

In [13]:
# Store the generations as a new 'alternative' column (list order = row order).
df['alternative'] = alternatives

In [19]:
# Remove the literal end marker from every generation (plain-text match, not a regex).
end_marker = '<|end|>'
df['alternative'] = df['alternative'].str.replace(end_marker, '', regex=False)

In [21]:
# Keep only the text after the first ':' (drops lead-ins such as
# "The wrong alternative answer could be:") and trim surrounding whitespace.
# Splitting a string with no ':' yields the whole string, so every row goes
# through the same path and is stripped; previously rows without a colon kept
# their leading/trailing spaces.
# NOTE: this reads and overwrites the same column, so re-running the cell
# splits again on any colon still left in the answer.
df['alternative'] = (
    df['alternative']
    .str.split(':', n=1)
    .str[-1]
    .str.strip()
)

In [26]:
# Lower-case every text column, one after the other.
for column in ('questions', 'correct_answer', 'sources', 'alternative'):
    df[column] = df[column].str.lower()

In [27]:
# Preview the first rows of df, now including the 'alternative' column.
df.head()

Unnamed: 0.1,Unnamed: 0,questions,correct_answer,sources,alternative
0,0,who normally delivers the opening prologue in ...,the actor wearing the black cloak,the play begins with three pages disputing ov...,the actor wearing the white robe.
1,1,what name was cynthia more famously known by?,the goddess diana,the play begins with three pages disputing ov...,the goddess diana was more famously known as a...
2,2,who does echo weep for?,narcissus,the play begins with three pages disputing ov...,apollo
3,3,what does a drink from narcissus's spring caus...,fall in love with themselves,the play begins with three pages disputing ov...,a drink from narcissus's spring causes the dri...
4,4,in what valley did the solemn revels of cynthi...,gargaphie in greece,the play begins with three pages disputing ov...,the solemn revels of cynthia took place in the...


In [28]:
# index=False: do not write the row index as one more unnamed column.
# NOTE(review): df already carries 'Unnamed: 0' and 'Unnamed: 0.1' columns,
# presumably left by earlier saves that included the index; confirm whether
# they should be dropped before writing.
df.to_csv('test-narrative.csv', index=False)