# Train a GPT model from scratch

In [1]:
# in this notebook we'll only get one of the files (the Oscar one) for the sake of simplicity and performance
#!wget -c https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt

In [1]:
paths = ['oscar.eo.txt']

## Train a tokenizer

In [2]:
%%time 

from tokenizers import ByteLevelBPETokenizer

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])




CPU times: user 15min 47s, sys: 2.65 s, total: 15min 49s
Wall time: 15min 56s


In [3]:
#!mkdir GPT2
tokenizer.save_model("./GPT2")

['./GPT2/vocab.json', './GPT2/merges.txt']

## Train a model

### Load Tokenizer

In [2]:
from transformers import GPT2Tokenizer

fs_tokenizer = GPT2Tokenizer.from_pretrained("./GPT2", max_len=512)
fs_tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})

2021-09-12 20:20:15.585303: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


1

### Create dataset

In [3]:
%%time
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=fs_tokenizer,
    file_path=paths[0],
    block_size=128,
)



CPU times: user 4min 51s, sys: 4.29 s, total: 4min 56s
Wall time: 4min 59s


In [4]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=fs_tokenizer, mlm=False
)

### Initiate model

In [5]:
from transformers import GPT2Config, GPT2LMHeadModel

# creating the configurations from which the model can be made
config = GPT2Config(
  vocab_size=fs_tokenizer.vocab_size,
  bos_token_id=fs_tokenizer.bos_token_id,
  eos_token_id=fs_tokenizer.eos_token_id
)
# creating the model
model = GPT2LMHeadModel(config)

### Define Trainer

In [6]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./GPT2",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

### Display logs in wandb

In [7]:
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mcharlesxu90[0m (use `wandb login --relogin` to force relogin)


True

In [8]:
%env WANDB_PROJECT=GPT2_train_demo

env: WANDB_PROJECT=GPT2_train_demo


### Train model

In [9]:
%%time
trainer.train()

***** Running training *****
  Num examples = 974545
  Num Epochs = 1
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 7614
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
2021-09-12 20:25:42.545815: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0




Step,Training Loss
500,7.3571
1000,6.5045
1500,6.1492
2000,5.9234
2500,5.7511
3000,5.6435
3500,5.5302
4000,5.4487
4500,5.3824
5000,5.3272




Training completed. Do not forget to share your model on huggingface.co/models =)




CPU times: user 1h 4min 38s, sys: 23min 30s, total: 1h 28min 8s
Wall time: 2h 20min 10s


TrainOutput(global_step=7614, training_loss=5.675492876121495, metrics={'train_runtime': 8410.6622, 'train_samples_per_second': 115.87, 'train_steps_per_second': 0.905, 'total_flos': 6.366021083136e+16, 'train_loss': 5.675492876121495, 'epoch': 1.0})

In [10]:
trainer.save_model("./GPT2")

Saving model checkpoint to ./GPT2
Configuration saved in ./GPT2/config.json
Model weights saved in ./GPT2/pytorch_model.bin


## Test generation

In [11]:
from transformers import pipeline

text_gen = pipeline(
    "text-generation",
    model="./GPT2",
    tokenizer="./GPT2"
)

loading configuration file ./GPT2/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.9.1",
  "use_cache": true,
  "vocab_size": 52000
}

loading configuration file ./GPT2/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 

In [12]:
text_gen("Temas")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[{'generated_text': 'Temas pri la plej multaj aliaj lingvoj, kiuj estas la plej granda kaj la plej granda. La plej'}]