In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import torch
import os

# Expose only GPU index 3 to CUDA libraries used by this process.
# NOTE(review): this runs after `import torch`; it presumably has to execute
# before CUDA is first initialised to take effect -- confirm.
os.environ.update({"CUDA_VISIBLE_DEVICES": "3"})

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Run configuration -------------------------------------------------------
# Hugging Face model id to evaluate (alternative tried earlier: "gpt2").
model_name = "facebook/blenderbot-3B"

# JSON Lines evaluation file. The name of its parent directory is reused when
# the output file name is built in the save cell.
data_path = "/home/uj-user/Yo/hybrid-ltm/data_eval/context-fms/MS_DG/test.jsonl"

# Passed as `max_length` to `model.generate` in the generation cell.
target_len = 128

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
# Load the pretrained weights for `model_name`, then move the model to `device`.
# NOTE(review): "facebook/blenderbot-3B" is presumably an encoder-decoder
# checkpoint; loading it through AutoModelForCausalLM would then use the decoder
# only -- confirm this is intended. Switching to a seq2seq auto-class would also
# require changing how the generation cell strips the prompt from the output.
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

In [4]:
# Parse the JSON Lines evaluation file into a DataFrame (one record per line).
dataset = pd.read_json(path_or_buf=data_path, lines=True)
# Bare last expression: the frame is rendered as the cell output.
dataset

Unnamed: 0,input,output,eval_indicator
0,"```\nDialogue Session #1:\nPatient: Hi, doctor...","Doctor:Ah yes, jet lag can be tough. It's impo...",Engagingness
1,"```\nDialogue Session #1:\nNeighbors A:Hi, Nei...","Neighbors A:Well, I have some experience with ...",Specificity
2,```\nDialogue Session #1:\nCo-workers A:I have...,Co-workers B:That's a good perspective to have...,Memorability
3,"```\nDialogue Session #1:\nMentee:Mentor, I we...","Mentee:Exactly, Mentor. I remember a time in m...",Specificity
4,"```\nDialogue Session #1:\nClassmates A:Hey, g...","Classmates B:Yes, I'll never forget that. It w...",Humanness
...,...,...,...
75,```\nDialogue Session #1:\nHusband:You always ...,"Wife:Sure, we can do that. You know, speaking ...",Engagingness
76,"```\nDialogue Session #1:\nClassmates A:Hey, B...","Classmates A:Thanks, I just did what I had to ...",Memorability
77,"```\nDialogue Session #1:\nClassmates A:Hey, B...",Classmates A:I like the idea of serving my cou...,Specificity
78,```\nDialogue Session #1:\nHusband:I really do...,Husband:It might be worth getting a second opi...,Specificity


In [5]:
# Load the slow (non-fast) tokenizer for `model_name`, configured so that
# truncation removes tokens from the left side of the input.
# An explicit `add_special_tokens` override was tried and left disabled; the
# checkpoint's own special tokens are used as-is.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,
    truncation_side='left',
)
# Bare last expression: show the tokenizer configuration as the cell output.
tokenizer

BlenderbotTokenizer(name_or_path='facebook/blenderbot-3B', vocab_size=8008, model_max_length=128, is_fast=False, padding_side='right', truncation_side='left', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	8008: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}

In [17]:
# Generate one sampled reply per evaluation prompt and collect the replies as
# answer records in `ans_jsons` (consumed by the save cell).

# Generation uses sampling (`do_sample=True`); seed torch so that a fresh
# Restart & Run All reproduces the same answers.
torch.manual_seed(0)

ans_jsons = []
for question_id, prompt_text in enumerate(dataset['input']):
    # The last 8 characters of every prompt are dropped before tokenising.
    # NOTE(review): presumably a fixed trailing marker in the data -- confirm.
    # The prompt is capped at 64 tokens; the tokenizer is configured with
    # truncation_side='left', so the end of a long dialogue is what is kept.
    encoded = tokenizer(
        [prompt_text[:-8]],
        max_length=64,
        truncation=True,
        return_tensors='pt',
    ).to(device)

    # `encoder_no_repeat_ngram_size=None` is passed explicitly.
    # NOTE(review): presumably this overrides a value shipped in the checkpoint's
    # generation config that is not usable without an encoder -- confirm.
    generated_ids = model.generate(
        **encoded,
        do_sample=True,
        max_length=target_len,
        encoder_no_repeat_ngram_size=None,
    )[0]

    # Decoder-only models return prompt + continuation, so the prompt tokens are
    # cut off. Encoder-decoder models return only the continuation; slicing
    # there would throw away the start of the reply, so it is skipped.
    if not model.config.is_encoder_decoder:
        prompt_len = encoded.input_ids.shape[1]
        generated_ids = generated_ids[prompt_len:]

    reply_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    ans_jsons.append(
        {
            "question_id": question_id,
            "text": reply_text,
            "model_id": model_name,
            "metadata": {},
        }
    )

In [18]:
# Write the answer records as JSON Lines to
# "<model_name>_<name of the data file's parent directory>.json".
filename = f"{model_name}_{data_path.split('/')[-2]}.json"
save_path = os.path.dirname(filename)
# A namespaced model id ("org/name") places the file in a sub-directory that has
# to exist first. A bare id such as "gpt2" gives an empty dirname, and
# os.makedirs("") raises FileNotFoundError, so create it only when non-empty.
if save_path:
    os.makedirs(save_path, exist_ok=True)
df_ans_json = pd.DataFrame(ans_jsons)
# orient='records' with lines=True writes one JSON object per line.
df_ans_json.to_json(filename, orient='records', lines=True)