# Train a GPT model from scratch

In [1]:
# in this notebook we'll only get one of the files (the Oscar one) for the sake of simplicity and performance
#!wget -c https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt

## Create dataset

In [38]:
%%time
from transformers import LineByLineTextDataset

paths = ['oscar.eo.txt']

dataset = LineByLineTextDataset(
    tokenizer=fs_tokenizer,
    file_path=paths[0],
    block_size=128,
)

Creating features from dataset file at oscar.eo.txt


CPU times: user 3min 35s, sys: 13 s, total: 3min 48s
Wall time: 3min 59s


## Train a tokenizer

In [None]:
%%time 

from tokenizers import ByteLevelBPETokenizer

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
])

In [40]:
#!mkdir GPT2
tokenizer.save_model("./GPT2")

['./GPT2/vocab.json', './GPT2/merges.txt']

## Train a model

### Load Tokenizer

In [41]:
from transformers import GPT2TokenizerFast

fs_tokenizer = GPT2TokenizerFast.from_pretrained("./GPT2", max_len=512)

fs_tokenizer.pad_token = fs_tokenizer.eos_token

Didn't find file ./GPT2/tokenizer.json. We won't load it.
Didn't find file ./GPT2/added_tokens.json. We won't load it.
Didn't find file ./GPT2/special_tokens_map.json. We won't load it.
Didn't find file ./GPT2/tokenizer_config.json. We won't load it.
loading file ./GPT2/vocab.json
loading file ./GPT2/merges.txt
loading file None
loading file None
loading file None
loading file None
loading configuration file ./GPT2/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 50256,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,

### Initiate model

In [42]:
from transformers import GPT2LMHeadModel, GPT2Config

config = GPT2Config()
config.pad_token_id = config.eos_token_id
model = GPT2LMHeadModel(config)

### Define Trainer

In [43]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=fs_tokenizer, mlm=False
)

In [44]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./GPT2",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

### Display logs in wandb

In [None]:
import wandb
wandb.login()

In [None]:
%env WANDB_PROJECT=GPT2_train_demo

### Train model

In [None]:
%%time
trainer.train()

In [None]:
trainer.save_model("./GPT2")

In [None]:
from transformers import pipeline

text_gen = pipeline(
    "text-generation",
    model="./GPT2",
    tokenizer="./GPT2"
)

In [None]:
text_gen("ABC")