In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel
import gc

In [3]:
torch.manual_seed(42)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-small', bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPT2LMHeadModel.from_pretrained('gpt2-small').cuda()
model.resize_token_embeddings(len(tokenizer))
class CustomDataset(Dataset):
    def __init__(self, prompt,instruction,desired_output, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for p,i,d in zip(prompt,instruction,desired_output):
            p = str(p)
            i = str(i)
            d = str(d)
            prompt = '# '+p.lstrip('\n') + '\n# '+i + '\n\"\"\"' + d
            encodings_dict = tokenizer('<|startoftext|>' + prompt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
df = pd.read_csv('Path to CSV')
prompt = df['inputs']
instruction = ['generate the future' for i in range(len(prompt))]
target = df['targets']
max_length = max([len(tokenizer.encode(str(t))) for t in target]) + \
             max([len(tokenizer.encode(str(i))) for i in instruction]) + \
             max([len(tokenizer.encode(str(p))) for p in prompt])
dataset = CustomDataset(prompt,instruction,target,tokenizer,max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])
gc.collect()
torch.cuda.empty_cache()
training_args = TrainingArguments(output_dir='./results', 
                                  num_train_epochs=4, 
                                  logging_steps=100,
                                  save_steps=1000,
                                  per_device_train_batch_size=1, 
                                  per_device_eval_batch_size=1,
                                  gradient_accumulation_steps=1,
                                  gradient_checkpointing=True,
                                  fp16=True          #if loss return 0, change to False
                                  optim="adafactor", #change to adamw_torch if you have have enough memory['adamw_hf', 'adamw_torch', 'adamw_torch_xla', 'adamw_apex_fused', 'adafactor', 'adamw_bnb_8bit', 'sgd', 'adagrad']
                                  warmup_steps=1, 
                                  weight_decay=0.05, 
                                  logging_dir='/home/delta/Downloads/logs', 
                                  report_to = 'tensorboard')

In [None]:
Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()

In [None]:
model.save_pretrained('model')
tokenizer.save_pretrained('tokenizer')

In [6]:
prompt = '''

This is a decoder only model, it can generate the next word based on previous word but no way to control
if it produce the correct or not correct answer. To control the output, you need to use an encoder and decoder model
YOu also need to think about what type of masking you are going to use. GPT3 or T5 are just the name. The true question
is what type of transformer architecture and what masking strategy you are going to adopt.

'''
instruction = 'generate the future'             #Same instruction as you trained
prompt = '# '+prompt.lstrip('\n') + '\n# '+instruction + '\n\"\"\"'
prompt_start = prompt.rfind(instruction)+len(instruction)+4
print(prompt)

# Kirsten Johnson
# generate a description for the above director
"""


In [7]:
generated = tokenizer(f"<|startoftext|>{prompt}", return_tensors="pt").input_ids.cuda()
sample_outputs = model.generate(generated, do_sample=True, top_k=50, 
                                max_length=300, top_p=0.95, temperature=1.9, num_return_sequences=20)
for i, sample_output in enumerate(sample_outputs):
    output = tokenizer.decode(sample_output, skip_special_tokens=True,
                              clean_up_tokenization_spaces=True)
    output = output[prompt_start:]
    print(output)
    print('')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The tragic death at an apartment complex impacts not just residents but their landlords as she navigates three husbands, one ex on the run from police harassment.

In a fictional case involving a criminal and a gang at a local community college, four teen and one classmate battle a vicious and ruthless group that refuses a truce.

After his daughter abruptly drops off the farm because she's taking birth-injection drugs or is dead, a mother-to-be is sent on a mad cap that gives her two extra lives.

As the young lives their parents – along with all living creatures on all 4 corners of Earth, is threatened. She seeks solace with magical adventures in the small, mysterious island home.

A talented but ambitious teen from Boston struggles to juggle her dreams to play soccer or fall in with a criminal cartel – but soon ends the team she falls under to.

An aspiring model agrees to dance, where her talent doesn't follow when the photographer whose job she wants to get a photo sets her up for