In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel
import gc

In [None]:
# Fix the PyTorch RNG seed so shuffling/splitting and sampling are repeatable.
torch.manual_seed(42)

# Load the GPT-2 tokenizer and register the begin/end/pad markers used by the
# training text template.
MODEL_NAME = 'gpt2-large'
tokenizer = GPT2Tokenizer.from_pretrained(
    MODEL_NAME,
    bos_token='<|startoftext|>',
    eos_token='<|endoftext|>',
    pad_token='<|pad|>',
)

# Load the pretrained language model and move it onto the GPU.
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
model = model.cuda()

# Special tokens were added to the vocabulary, so the embedding matrix has to
# be resized to the tokenizer's new size.
model.resize_token_embeddings(len(tokenizer))
class CustomDataset(Dataset):
    """Pre-tokenized (prompt, instruction, output) triples for causal-LM fine-tuning.

    Every triple is rendered into the text template

        <|startoftext|># {prompt}\\n# {instruction}\\n\"\"\"{output}<|endoftext|>

    and passed to ``tokenizer`` with truncation and ``"max_length"`` padding.
    The resulting ``input_ids`` and ``attention_mask`` are stored as tensors.

    Args:
        prompt: iterable of prompt values (each converted with ``str``; leading
            newlines are stripped).
        instruction: iterable of instruction values (each converted with ``str``).
        desired_output: iterable of target values (each converted with ``str``).
        tokenizer: callable accepting ``(text, truncation=, max_length=, padding=)``
            and returning a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_length: length every sample is truncated/padded to.
    """

    def __init__(self, prompt,instruction,desired_output, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        # Never populated by this class; kept so the attribute still exists.
        self.labels = []

        triples = zip(prompt, instruction, desired_output)
        for raw_prompt, raw_instruction, raw_output in triples:
            prompt_text = str(raw_prompt)
            instruction_text = str(raw_instruction)
            output_text = str(raw_output)

            # Two comment-style header lines followed by an opening triple quote
            # and the text the model should learn to produce.
            sample_text = ''.join((
                '# ', prompt_text.lstrip('\n'),
                '\n# ', instruction_text,
                '\n\"\"\"', output_text,
            ))
            encoded = tokenizer(
                '<|startoftext|>' + sample_text + '<|endoftext|>',
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )
            self.input_ids.append(torch.tensor(encoded['input_ids']))
            self.attn_masks.append(torch.tensor(encoded['attention_mask']))

    def __len__(self):
        """Return the number of encoded samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` tensor pair at ``idx``."""
        ids = self.input_ids[idx]
        mask = self.attn_masks[idx]
        return ids, mask

Downloading vocab.json:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading pytorch_model.bin:   0%|          | 0.00/3.02G [00:00<?, ?B/s]

In [None]:
# Load the Netflix titles table.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
df = pd.read_csv('/home/delta/Downloads/netflix_titles.csv')

# Each training sample pairs a director (prompt) with a fixed instruction and
# the title's description (target).
# NOTE(review): CustomDataset stringifies every value, so missing directors or
# descriptions (NaN) end up in the training text as the literal string 'nan'.
prompt = df['director']
instruction = ['generate a description for the above director'] * len(prompt)
target = df['description']

# Pad/truncate length = token count of the longest *fully formatted* sample.
# The text below mirrors the template built in CustomDataset.__init__, so the
# scaffold ('# ', '\n# ', the triple quote) and the BOS/EOS markers are counted
# and no sample loses its trailing <|endoftext|> to truncation. Summing the
# per-column maxima, as before, ignored those extra tokens while at the same
# time over-padding every sample.
max_length = max(
    len(tokenizer.encode(
        '<|startoftext|>'
        + '# ' + str(p).lstrip('\n') + '\n# ' + str(i) + '\n\"\"\"' + str(d)
        + '<|endoftext|>'
    ))
    for p, i, d in zip(prompt, instruction, target)
)

dataset = CustomDataset(prompt,instruction,target,tokenizer,max_length)

# 90/10 train/validation split.
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

# Release whatever host/GPU memory can be reclaimed before training starts.
gc.collect()
torch.cuda.empty_cache()

training_args = TrainingArguments(output_dir='./results', num_train_epochs=1, logging_steps=100, save_steps=5000,
                                  per_device_train_batch_size=1, per_device_eval_batch_size=1,
                                  warmup_steps=10, weight_decay=0.05, logging_dir='./logs', report_to = 'none')


In [None]:
# Show the loaded DataFrame as this cell's output for a quick visual inspection.
df

In [None]:
def collate_with_masked_labels(batch):
    """Stack dataset items into a causal-LM training batch.

    Args:
        batch: list of ``(input_ids, attention_mask)`` tensor pairs as returned
            by the dataset's ``__getitem__``.

    Returns:
        dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        ``labels`` is a copy of ``input_ids`` in which every padded position
        (``attention_mask == 0``) is set to -100, the index the loss ignores,
        so the model is not trained to predict padding.
    """
    input_ids = torch.stack([item[0] for item in batch])
    attention_mask = torch.stack([item[1] for item in batch])
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100
    return {'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels}


# Fine-tune the model; the TrainOutput returned by train() is the cell output.
Trainer(model=model, args=training_args, train_dataset=train_dataset,
        eval_dataset=val_dataset, data_collator=collate_with_masked_labels).train()

In [None]:
# Build the inference prompt with the same layout used for the training text:
# two comment-style header lines followed by an opening triple quote.
director_name = 'Kirsten Johnson'
instruction = 'generate a description for the above director'
quote_opener = '\n\"\"\"'
prompt = ''.join(('# ', director_name.lstrip('\n'), '\n# ', instruction, quote_opener))

# Character offset just past the instruction and the 4-character quote opener;
# used later to cut the prompt off the decoded generations.
prompt_start = prompt.rfind(instruction) + len(instruction) + 4
print(prompt)

In [None]:
# Make sure dropout is disabled before sampling: the model may still be in
# training mode after the fine-tuning cell.
model.eval()

# Encode the prompt (prefixed with the start marker) and move it to the GPU.
generated = tokenizer(f"<|startoftext|>{prompt}", return_tensors="pt").input_ids.cuda()

# Draw 20 sampled continuations (top-k / nucleus sampling, up to 300 tokens each).
sample_outputs = model.generate(generated, do_sample=True, top_k=50,
                                max_length=300, top_p=0.95, temperature=1.9, num_return_sequences=20)

for sample_output in sample_outputs:
    output = tokenizer.decode(sample_output, skip_special_tokens=True,
                              clean_up_tokenization_spaces=True)
    # Drop the echoed prompt so only the generated description is printed.
    output = output[prompt_start:]
    print(output)
    print('')