In [1]:
import os
import torch
import pandas as pd

from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, Dataset, DatasetDict
# os.environ["CUDA_VISIBLE_DEVICES"] = "3"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Evaluation input: a JSON Lines file (read with pandas in a later cell).
# Alternative input file; enable it by swapping it with the assignment below.
# data_path = "/home/uj-user/Yo/hybrid-ltm/data_eval/context-fms/MS_DG/test.jsonl"
data_path = "/home/uj-user/Yo/hybrid-ltm/data_eval/context-fms/MS_DG/test_last_session.jsonl"
# Hugging Face checkpoint used for both the model and the tokenizer.
model_name = "microsoft/GODEL-v1_1-large-seq2seq"

MODEL_MAX_LEN = 1024
# Upper bound on the generated sequence length (passed to generate() as max_length).
TARGET_LEN = 128

# Use the second GPU when the host has one. On a single-GPU host "cuda:1" does not
# exist, so fall back to the first GPU; without CUDA, run on the CPU.
if torch.cuda.is_available():
    device = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
else:
    device = "cpu"

In [3]:
# Load the pretrained seq2seq checkpoint, then move it onto the configured device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

In [4]:
# Read the evaluation file as JSON Lines: one JSON record per line of the file.
dataset = pd.read_json(path_or_buf=data_path, lines=True)
# Bare last expression so the notebook renders a preview of the frame.
dataset

Unnamed: 0,input
0,```\nDialogue Session #3:\nDoctor:Hello Patien...
1,```\nDialogue Session #3:\nNeighbors A:Hey the...
2,"```\nDialogue Session #4:\nCo-workers A:Hey, B..."
3,"```\nDialogue Session #2:\nMentee:Mentor, I ha..."
4,```\nDialogue Session #5:\nClassmates A:Sorry ...
...,...
75,```\nDialogue Session #4:\nHusband:You know wh...
76,"```\nDialogue Session #4:\nClassmates B:Hey, C..."
77,```\nDialogue Session #3:\nClassmates A:I've b...
78,```\nDialogue Session #2:\nHusband:I found a s...


In [5]:
# Load the tokenizer from the same checkpoint name as the model.
# Its special tokens are used as shipped; nothing is added or overridden here.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Bare last expression so the notebook shows the tokenizer's configuration.
tokenizer

T5TokenizerFast(name_or_path='microsoft/GODEL-v1_1-large-seq2seq', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<PAD>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', 

In [6]:
import re

# Keep only the text between the first and the last ``` fence of every raw input
# and wrap it in a single pair of fences again.
dataset['prep_input'] = dataset['input'].map(
    lambda raw: '```' + '```'.join(raw.split('```')[1:-1]) + '```'
)

# Preview the transformation on the row labelled 1.
# Splitting on the session header and dropping the first piece discards everything
# that precedes the first "Dialogue Session #".
text_list = dataset['prep_input'][1].split('Dialogue Session #')[1:]

session_texts = []
for session in text_list:
    # Remove the session number (and the colon/whitespace after it) left at the start.
    session = re.sub(r'^\d+:\s*', '', session)

    utterances = []
    for row in session.split('\n'):
        if ':' in row:
            # Keep only what follows the first colon, i.e. drop the speaker label.
            utterances.append(row.split(':', 1)[1].strip())

    # Turns are separated by " EOS "; leftover fences and outer whitespace are removed.
    flattened = '\n'.join(utterances).replace('\n', ' EOS ').replace('```', '').strip()
    session_texts.append(flattened)

result_list = ' '.join(session_texts)
# NOTE(review): drops the last 7 characters unconditionally - presumably a fixed
# trailing marker in the data; confirm against the raw inputs.
result_list = result_list[:-7].strip()
result_list

'Hey there, have you ever considered pursuing journalism as a career? EOS Journalism? No, not really. Why do you ask?'

In [7]:
def _extract_dialog_context(fenced_text: str) -> str:
    """Flatten a fenced multi-session dialogue into one " EOS "-separated string.

    Steps, in order:
      1. Split on "Dialogue Session #" and discard whatever precedes the first header.
      2. Strip the leading session number ("<digits>:" plus whitespace) of each session.
      3. Keep only lines that contain ':' and, for each, the text after the first
         colon (this removes the speaker label).
      4. Join the turns of a session with " EOS ", remove ``` fences, trim whitespace.
      5. Join the sessions with a single space.
      6. Drop the last 7 characters and trim again.

    Args:
        fenced_text: Dialogue text wrapped in ``` fences.

    Returns:
        The flattened dialogue context.
    """
    flattened_sessions = []
    for session in fenced_text.split('Dialogue Session #')[1:]:
        session = re.sub(r'^\d+:\s*', '', session)
        turns = [
            row.split(':', 1)[1].strip()
            for row in session.split('\n')
            if ':' in row
        ]
        flattened_sessions.append(
            '\n'.join(turns).replace('\n', ' EOS ').replace('```', '').strip()
        )
    joined = ' '.join(flattened_sessions)
    # NOTE(review): unconditional 7-character cut - presumably removes a fixed
    # trailing marker present in the data; confirm against the raw inputs.
    return joined[:-7].strip()


# Prompt pieces that do not change between samples. The instruction wording is
# part of the model input and is therefore kept verbatim.
instruction = 'Instruction: given a dialog context, you need to response empathically.'
knowledge = ''

# Extract the fenced part of every raw input once, outside the loop: it does not
# depend on the loop index, so recomputing it per sample is wasted work.
dataset['prep_input'] = dataset['input'].map(lambda x: '```'+ '```'.join(x.split('```')[1:-1]) +'```')

ans_jsons = []
# Iterate over the values positionally so `i` is the running sample number and
# no label-based lookup into the frame's index is needed.
for i, fenced_text in enumerate(tqdm(dataset['prep_input'])):
    dialog = _extract_dialog_context(fenced_text)

    query = f"{instruction} [CONTEXT] {dialog} {knowledge}"
    input_ids = tokenizer(query, return_tensors="pt").input_ids.to(device)
    # Nucleus sampling: the generated text differs from run to run.
    generated_ids = model.generate(input_ids, max_length=TARGET_LEN, min_length=8, top_p=0.9, do_sample=True)
    outputs = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    ans_jsons.append(
        {
            "question_id": i,
            "text": outputs,
            "model_id": model_name,
            "metadata": {},
        }
    )

100%|██████████| 80/80 [00:31<00:00,  2.52it/s]


In [8]:
# The output name combines the model name with the name of the directory that
# contains the input file.
# NOTE(review): the input file's own name is not part of the output name, so two
# input files in the same directory map to the same predictions file.
dataset_tag = data_path.split('/')[-2]
filename = f"predictions/{model_name}_{dataset_tag}.json"

# Create the parent directory of the output file. When model_name contains a '/',
# that parent is a sub-directory of predictions/.
save_path = os.path.dirname(filename)
os.makedirs(save_path, exist_ok=True)

# Write one JSON object per line (records orientation, lines=True).
df_ans_json = pd.DataFrame(ans_jsons)
df_ans_json.to_json(filename, orient='records', lines=True)

## Sample

In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Single-prompt check. This cell reuses the `tokenizer`, `model`, `device` and
# `TARGET_LEN` objects defined in earlier cells instead of loading the checkpoint again.

# Instruction for a chitchat task. The wording is part of the model input and is
# therefore kept verbatim.
instruction = 'Instruction: given a dialog context, you need to response empathically.'
# Leave the knowledge empty
knowledge = ''
dialog_turns = [
    'Does money buy happiness?',
    'It is a question. Money buys you a lot of things, but not enough to buy happiness.',
    'What is the best way to buy happiness ?'
]
# Turns are joined with the " EOS " separator to form the dialogue context.
dialog = ' EOS '.join(dialog_turns)
query = f"{instruction} [CONTEXT] {dialog} {knowledge}"

print(query)
input_ids = tokenizer(query, return_tensors="pt").input_ids.to(device)
# Same generation settings as the evaluation loop; sampling makes the reply vary between runs.
outputs = model.generate(input_ids, max_length=TARGET_LEN, min_length=8, top_p=0.9, do_sample=True)
output = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(output)

Instruction: given a dialog context, you need to response empathically. [CONTEXT] Does money buy happiness? EOS It is a question. Money buys you a lot of things, but not enough to buy happiness. EOS What is the best way to buy happiness ? 
You should find a job.
