In [1]:
import datasets
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import json
import numpy as np
import pandas as pd
import torch
from transformers import BitsAndBytesConfig

In [None]:
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", use_fast=False)

In [None]:
from transformers import pipeline

def produce_prompt():
    messages = [ 
        {"role": "system", "content": """
        You are a helpful AI assistant. 
        You are given a question and the relevant context to answer it. Answer briefly to the question with just one of the given options. 
        """}, 
        {"role": "user", "content": """
        Question: Which year is Halley's Comet expected to return to the solar system?

        Options: [2110, 2045, 2086, 2061]

        Context: Astronomers have now linked the comet's appearances to observations dating back more than 2,000 years. Halley was last seen in Earth's skies in 1986 and was met in space by an international fleet of spacecraft. It performs a regular 76-year journey around the Sun.
        """},
    ] 
    return messages

pipe = pipeline( 
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
) 

generation_args = { 
    "max_new_tokens": 50, 
    "return_full_text": False, 
    "temperature": 0.2, 
    "do_sample": False, 
} 

In [None]:
output = pipe(produce_prompt(), **generation_args) 
print(output[0]['generated_text']) 

In [2]:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b-it",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
input_text = """
You are a helpful AI assistant. 
You are given a question and the relevant context to answer it. Answer briefly to the question with just one of the given options. 

Question: Which year is Halley's Comet expected to return to the solar system?

Options: [2110, 2045, 2086, 2061]

Context: Astronomers have now linked the comet's appearances to observations dating back more than 2,000 years. Halley was last seen in Earth's skies in 1986 and was met in space by an international fleet of spacecraft. It performs a regular 76-year journey around the Sun.

Assistant:
"""
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

outputs = model.generate(**input_ids, max_new_tokens=50, temperature=0.2)
print(tokenizer.decode(outputs[0]))



<bos>
You are a helpful AI assistant. 
You are given a question and the relevant context to answer it. Answer briefly to the question with just one of the given options. 

Question: Which year is Halley's Comet expected to return to the solar system?

Options: [2110, 2045, 2086, 2061]

Context: Astronomers have now linked the comet's appearances to observations dating back more than 2,000 years. Halley was last seen in Earth's skies in 1986 and was met in space by an international fleet of spacecraft. It performs a regular 76-year journey around the Sun.

Assistant:
2061 
<end_of_turn><eos>


In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('ultramega-test-wikihop-2308.csv')
df['new'][0]

'[\' "Christian" derives from the Koine Greek word "Christós" (), a translation of the Biblical Hebrew term "mashiach"\', \' It is the largest congregation in the United States, averaging about 52,000 attendees per week\', \' The 16,800-seat Lakewood Church Central Campus, home to four English-language services and two Spanish-language services per week, is located at the former Compaq Center\', \' Other metropolises include Guadalajara, Monterrey, Puebla, Toluca, Tijuana and León\']'

In [11]:
import ast 

test = ast.literal_eval(df['new'][0])

clean_and_merge(test)

'Christian" derives from the Koine Greek word "Christós" (), a translation of the Biblical Hebrew term "mashiach. It is the largest congregation in the United States, averaging about 52,000 attendees per week. The 16,800-seat Lakewood Church Central Campus, home to four English-language services and two Spanish-language services per week, is located at the former Compaq Center. Other metropolises include Guadalajara, Monterrey, Puebla, Toluca, Tijuana and León'

In [13]:
def clean_and_merge(dataset):

    dataset = ast.literal_eval(dataset)
    # Remove leading and trailing backslashes and quotes from each element
    cleaned_elements = [element.strip("[]' \"") for element in dataset]

    # Join the elements with '. \n ' as a separator
    merged_string = '. '.join(cleaned_elements)
    
    return merged_string

In [14]:
df['pecore'] = df['new'].apply(clean_and_merge)

In [15]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,supports,query,answer_y,options_y,sum_supports,new,pecore
0,0,0,A Christian (or ) is a person who follows or a...,What languages did John Osteen speak or write?,english,"['english', 'greek', 'koine greek', 'nahuatl',...",Christianity is a monotheistic religion based...,"[' ""Christian"" derives from the Koine Greek wo...","Christian"" derives from the Koine Greek word ""..."
1,1,1,Anne Perry (born 28 October 1938 as Juliet Mar...,What was Thomas Pitt's role in the context of ...,fictional character,"['character', 'crime', 'fiction', 'fictional c...",Anne Perry is an English author best known fo...,['Anne Perry (born 28 October 1938 as Juliet M...,Anne Perry (born 28 October 1938 as Juliet Mar...
2,2,2,Key Center North Tower is a high-rise located ...,In which administrative territorial entity is ...,new york,"['buffalo', 'center', 'erie', 'erie county', '...",The Key Center North Tower and Key Center Sou...,['Key Center North Tower is a high-rise locate...,Key Center North Tower is a high-rise located ...
3,3,3,The National Football League (NFL) is a profes...,How did Joe Don Looney meet his demise?,accident,"['accident', 'knockout']",The National Football League (NFL) is a profe...,['The National Football League (NFL) is a prof...,The National Football League (NFL) is a profes...
4,4,4,Professional ice hockey has existed since the ...,What sport did the Louisiana IceGators play?,ice hockey,"['hockey', 'ice hockey']",Professional ice hockey originated in the Uni...,['Professional ice hockey has existed since th...,Professional ice hockey has existed since the ...


In [16]:
df['pecore'][0]

'Christian" derives from the Koine Greek word "Christós" (), a translation of the Biblical Hebrew term "mashiach. It is the largest congregation in the United States, averaging about 52,000 attendees per week. The 16,800-seat Lakewood Church Central Campus, home to four English-language services and two Spanish-language services per week, is located at the former Compaq Center. Other metropolises include Guadalajara, Monterrey, Puebla, Toluca, Tijuana and León'

In [17]:
df.to_csv('ultramega-test-wikihop-0109.csv')