In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

device = 'cpu' #'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cpu


In [2]:
sep_token = '<SEP>'
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large", bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>', sep_token ='<SEP>',
                                         padding_side='left')
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large").to(device)
model.resize_token_embeddings(len(tokenizer))
model.device

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


device(type='cpu')

In [3]:
# chat on cpu
for step in range(3):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    input_text = input(">> User:")
    text_to_bot = f'2,17{sep_token}{input_text}{sep_token}'
    print(text_to_bot)
    new_user_input_ids = tokenizer.encode(text_to_bot + tokenizer.eos_token, return_tensors='pt')

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens,
    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id,
                                     top_p=0.95, temperature=1.5)
    print(chat_history_ids.shape)
    

    # pretty print last ouput tokens from bot
    print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))


>> User:יקט


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


2,17<SEP>יקט<SEP>
torch.Size([1, 15])
DialoGPT: 2,17.
>> User:hey


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


2,17<SEP>hey<SEP>
torch.Size([1, 27])
DialoGPT: 2,17.
>> User:hey


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


2,17<SEP>hey<SEP>
torch.Size([1, 41])
DialoGPT: 2,17, hey,


In [4]:
data = pd.read_csv('data.csv')
single_sentences = []

for text, cell, resp in zip(data.input_text, data.input_cell, data.output):
    single_sentences.append(f'{cell}{sep_token}{text}{sep_token}{resp}')
# single_sentences *= 5
single_sentences

['2,17<SEP>where should I go now?<SEP>pass the branch below the tiger, to the left',
 '2,17<SEP>where to?<SEP>pass the branch below the tiger, to the left',
 '2,17<SEP>now what?<SEP>pass the branch below the tiger, to the left',
 "2,17<SEP>I'm near the tiger<SEP>pass the branch below the tiger, to the left",
 '8,1<SEP>where should I go now?<SEP>head south east from the giraph, towards the snake',
 '8,1<SEP>where to?<SEP>head south east from the giraph, towards the snake',
 '8,1<SEP>now what?<SEP>head south east from the giraph, towards the snake',
 "8,1<SEP>I'm near the giraph<SEP>head south east from the giraph, towards the snake"]

In [None]:
max_length = max([len(tokenizer.encode(sent)) for sent in single_sentences])
max_length

In [None]:
class CustomDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
#         print(self.input_ids[idx].shape)
#         print(self.attn_masks[idx])
        return self.input_ids[idx], self.attn_masks[idx]

In [None]:
dataset = CustomDataset(single_sentences, tokenizer, max_length=max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])
print(f'train {len(train_dataset)} val: {len(val_dataset)}')

In [None]:
training_args = TrainingArguments("few_uters",
                                  evaluation_strategy="steps",
                                  logging_steps=6,
                                  eval_steps=12,
                                  save_steps=12,
                                  num_train_epochs=1,
                                  per_device_train_batch_size=1,
                                  per_device_eval_batch_size=1,
                                  no_cuda=True,
                                  overwrite_output_dir=True)

In [None]:
# torch.cuda.empty_cache()
t = Trainer(model=model,  args=training_args, train_dataset=train_dataset,
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])})
t.train()

In [None]:
generated = tokenizer("<|startoftext|> 2,17<SEP>where should I go now?<SEP>", return_tensors="pt").input_ids
sample_outputs = model.generate(generated, do_sample=True, top_k=50, pad_token_id=tokenizer.eos_token_id,
                                max_length=300, top_p=0.95, temperature=1.5, num_return_sequences=5)


In [None]:
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))