In [60]:
from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, GPT2LMHeadModel, pipeline, \
                          Trainer, TrainingArguments
import torch

In [18]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [19]:
text_data = TextDataset(
    tokenizer=tokenizer,
    file_path='data/sample.txt',
    block_size=32
)



In [44]:
len(text_data.examples)

6154

In [20]:
text_data[0], text_data[0].shape

(tensor([  171,   119,   123,   464,  4935, 20336, 46566,    11,   317, 11217,
           786,   319, 29015, 18493,    11,   416, 22578,   198, 22362,   372,
          1355,   721,   372,   628,   198,  1212, 46566,   318,   329,   262,
           779,   286]),
 torch.Size([32]))

In [21]:
print(tokenizer.decode(text_data[0]))

The Project Gutenberg eBook, A Treatise on Domestic Economy, by Catherine
Esther Beecher


This eBook is for the use of


In [22]:
print(tokenizer.pad_token)
print(tokenizer.eos_token)
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
print(tokenizer.pad_token)

Using pad_token, but it is not set yet.


None
<|endoftext|>
<|endoftext|>


In [23]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) #mlm: Masked Language Modelling

In [24]:
collator_example = data_collator([tokenizer('I am an input'), tokenizer('So am I')])
collator_example

{'input_ids': tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]]), 'labels': tensor([[  40,  716,  281, 5128],
        [2396,  716,  314, -100]])}

In [25]:
collator_example.input_ids

tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50256]])

In [26]:
tokenizer.pad_token_id

50256

In [27]:
collator_example.attention_mask

tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]])

In [28]:
collator_example.labels # -100 to ignore loss calculation for the padded token
                        # labels are shifted inside the GPT model

tensor([[  40,  716,  281, 5128],
        [2396,  716,  314, -100]])

In [32]:
model = GPT2LMHeadModel.from_pretrained('gpt2')
pretrained_generator = pipeline(
    'text-generation', 
    model=model,
    tokenizer='gpt2',
    config={'max_new_tokens': 200, 'do_sample': True, 'top_p': 0.9, 'temperature': 0.7, 'top_k': 10}
)

In [33]:
print('-'*10)
for generated_sequence in pretrained_generator("Women's role in the market should", num_return_sequences=3):
    print(generated_sequence['generated_text'])
    print('-'*10)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


----------
Women's role in the market should be to make women buy better brands, but more so than "better food – but more people like better food."

We've seen many examples on this page.

We have to change how that works
----------
Women's role in the market should come at least as important one to people who live in states with anti-discrimination law but still think they can still vote.

"I agree on many things and a lot of people have really bad views about
----------
Women's role in the market should not matter much if they do not receive adequate incentives to support their own children — the state should not have to subsidize their own children. The state should not have to subsidize their own children because there is a
----------


In [38]:
training_args = TrainingArguments(
    output_dir='gpt2_text',
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=len(text_data.examples) // 5,
    logging_steps=50,
    load_best_model_at_end=True,
    evaluation_strategy='epoch',
    save_strategy='epoch',
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=text_data.examples[:int(len(text_data.examples)*.8)],
    eval_dataset=text_data.examples[int(len(text_data.examples)*.8):],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [39]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1231
  Batch size = 32


{'eval_loss': 4.69537353515625,
 'eval_runtime': 1.7537,
 'eval_samples_per_second': 701.936,
 'eval_steps_per_second': 22.238}

In [49]:
trainer.train()

***** Running training *****
  Num examples = 4923
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 770
  Number of trainable parameters = 124439808


Epoch,Training Loss,Validation Loss
1,3.491,4.003829


***** Running Evaluation *****
  Num examples = 1231
  Batch size = 32
  Num examples = 1231
  Batch size = 32
Saving model checkpoint to gpt2_text/checkpoint-154
Saving model checkpoint to gpt2_text/checkpoint-154
Configuration saved in gpt2_text/checkpoint-154/config.json
Configuration saved in gpt2_text/checkpoint-154/generation_config.json
Model weights saved in gpt2_text/checkpoint-154/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1231
  Batch size = 32
***** Running Evaluation *****
  Num examples = 1231
  Batch size = 32
Saving model checkpoint to gpt2_text/checkpoint-308
Saving model checkpoint to gpt2_text/checkpoint-308
Configuration saved in gpt2_text/checkpoint-308/config.json
Configuration saved in gpt2_text/checkpoint-308/generation_config.json
Model weights saved in gpt2_text/checkpoint-308/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1231
  Batch size = 32
Saving model checkpoint to gpt2_text/checkpoint-462
Configuration saved in

TrainOutput(global_step=770, training_loss=3.2265681675502234, metrics={'train_runtime': 234.0681, 'train_samples_per_second': 105.162, 'train_steps_per_second': 3.29, 'total_flos': 401981460480000.0, 'train_loss': 3.2265681675502234, 'epoch': 5.0})

In [50]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1231
  Batch size = 32


{'eval_loss': 4.003828525543213,
 'eval_runtime': 1.7459,
 'eval_samples_per_second': 705.086,
 'eval_steps_per_second': 22.338,
 'epoch': 5.0}

In [51]:
trainer.save_model()

Saving model checkpoint to gpt2_text
Configuration saved in gpt2_text/config.json
Configuration saved in gpt2_text/generation_config.json
Model weights saved in gpt2_text/pytorch_model.bin


In [62]:
device = torch.device(torch.cuda.current_device() if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [63]:
loaded_model = GPT2LMHeadModel.from_pretrained('gpt2_text')
device = torch.device(torch.cuda.current_device() if torch.cuda.is_available() else 'cpu')
finetuned_generator = pipeline(
    'text-generation',
    model=loaded_model,
    tokenizer=tokenizer,
    config={'max_new_tokens': 200, 'do_sample': True, 'top_p': 0.9, 'temperature': 0.7, 'top_k': 10},
    device=device
)

loading configuration file gpt2_text/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "do_sample": true,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "max_length": 50,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "use_cache": true,
  "

In [65]:
print('-'*10)
for generated_sequence in finetuned_generator("Women's role in the market should", num_return_sequences=3):
    print(generated_sequence['generated_text'])
    print('-'*10)

Generate config GenerationConfig {
  "bos_token_id": 50256,
  "do_sample": true,
  "eos_token_id": 50256,
  "max_length": 50,
  "transformers_version": "4.26.1"
}

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


----------
Women's role in the market should be
deduced, by the benevolence, of the community, when she learns that its
own customs, institutions, feelings, and habits are the most perfect
complementary to her interests."

----------
Women's role in the market should be
dedicated solely to the health and happiness of children. The young
should always have their education for the benefit of the family;
this will always be the result of our benevolent nature, and the bene
----------
Women's role in the market should never be
discriminated against to those ladies who are at the front and bottom, who
are very good, but who are deficient in the other branches of their profession.


Another interesting thing to
----------
