### Load Model

In [22]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

In [23]:
# Let's chat for 5 lines
for step in range(5):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)

    # pretty print last ouput tokens from bot
    print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


DialoGPT: I'm good, how are you?
DialoGPT: That's good.
DialoGPT: I'm not sure.
DialoGPT: I'm not sure.
DialoGPT: I'm going to go to sleep now.


### Load data

In [None]:
from datasets import load_dataset
empathetic_dialogues = load_dataset("Estwld/empathetic_dialogues_llm")
print(empathetic_dialogues)

def extract_conversation_pairs(example):
    pairs = []
    for i in range(len(example['conversations']) - 1):
        if example['conversations'][i]['role'] == 'user' and example['conversations'][i + 1]['role'] == 'assistant':
            user_input = example['conversations'][i]['content']
            assistant_reponse = example['conversations'][i + 1]['content']
            pairs.append((user_input, assistant_reponse))
    return pairs

def preprocess_dataset(dataset):
    all_pairs = []
    for example in dataset:
        all_pairs.extend(extract_conversation_pairs(example))
    return all_pairs

train_pairs = preprocess_dataset(empathetic_dialogues['train'])

DatasetDict({
    train: Dataset({
        features: ['conv_id', 'situation', 'emotion', 'conversations'],
        num_rows: 19533
    })
    valid: Dataset({
        features: ['conv_id', 'situation', 'emotion', 'conversations'],
        num_rows: 2770
    })
    test: Dataset({
        features: ['conv_id', 'situation', 'emotion', 'conversations'],
        num_rows: 2547
    })
})


### Tokenize the data

In [30]:
tokenizer.pad_token = tokenizer.eos_token

def toeknize_pairs(pairs):
    inputs = []
    labels = []
    for user_input, assistant_response in pairs:
        encoded = tokenizer(
            user_input,
            assistant_response,
            truncation=True,
            padding='max_length',
            max_length=512,
            return_tensors='pt'
        )
        inputs.append(encoded['input_ids'])
        labels.append(encoded['attention_mask'])
    return {'input_ids': torch.cat(inputs), 'attention_mask': torch.cat(labels)}

train_data = toeknize_pairs(train_pairs)

### fine-tuning with tokenized data

In [None]:
from torch.utils.data import Dataset
from transformers import Trainer, TrainingArguments

class DialogDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

        def __len__(self):
            return len(self.encodings['input_ids'])
        
        def __getitem__(self, idx):
            return {
                key: torch.tensor(val[idx]) for key, val in self.encodings.items()
            }
        
train_dataset = DialogDataset(train_data)

trainings_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
    evaluation_strategy='steps',
    eval_steps=500,
    logging_steps=100,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    fp16=True   # if using GPU
)

trainer = Trainer(
    model=model,
    args=trainings_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

trainer.train()