# Train a GPT model from scratch

In [1]:
# For simplicity and speed, this notebook uses only one corpus file (the OSCAR one).
# Uncomment the next line to download it; `-c` lets wget resume a partial download.
#!wget -c https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt

## Train a tokenizer

In [2]:
%%time 

from tokenizers import ByteLevelBPETokenizer

paths = ['oscar.eo.txt']

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
])




CPU times: user 15min 47s, sys: 2.65 s, total: 15min 49s
Wall time: 15min 56s


In [3]:
from pathlib import Path

# `save_model` writes vocab.json and merges.txt into the given directory, which must
# already exist. Create it here (idempotently) so the cell also works on a fresh run
# instead of relying on a manual `mkdir`.
Path("./GPT2").mkdir(parents=True, exist_ok=True)
tokenizer.save_model("./GPT2")

['./GPT2/vocab.json', './GPT2/merges.txt']

## Train a model

### Load Tokenizer

In [4]:
from transformers import GPT2Tokenizer

# Reload the freshly trained vocabulary/merges as a `transformers` tokenizer.
# `model_max_length` is the documented name of the maximum-length setting
# (`max_len` is its legacy alias).
fs_tokenizer = GPT2Tokenizer.from_pretrained("./GPT2", model_max_length=512)

# Register the special tokens. "<mask>" was not part of the trained vocabulary, so it is
# appended as a new token; the call returns the number of tokens that were added.
fs_tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})

2021-09-12 15:39:24.692819: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


1

### Create dataset

In [5]:
%%time
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=fs_tokenizer,
    file_path=paths[0],
    block_size=128,
)



CPU times: user 4min 58s, sys: 4.69 s, total: 5min 3s
Wall time: 5min 6s


In [6]:
from transformers import DataCollatorForLanguageModeling

# mlm=False turns off masked-language-model masking: GPT-2 is trained with a plain
# (causal) language-modeling objective.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=fs_tokenizer)

### Initialize model

In [7]:
from transformers import GPT2Config, GPT2LMHeadModel

# Build the configuration from which a randomly initialised GPT-2 is created.
# Use len(fs_tokenizer) rather than fs_tokenizer.vocab_size: `vocab_size` only counts the
# base vocabulary, while `len()` also includes tokens added afterwards (here "<mask>").
# Sizing the embeddings from `vocab_size` would leave the added token's id out of range.
config = GPT2Config(
  vocab_size=len(fs_tokenizer),
  bos_token_id=fs_tokenizer.bos_token_id,
  eos_token_id=fs_tokenizer.eos_token_id
)
# Instantiate the model with fresh (untrained) weights.
model = GPT2LMHeadModel(config)

### Define Trainer

In [8]:
from transformers import Trainer, TrainingArguments

# Training hyper-parameters and checkpointing policy.
training_args = TrainingArguments(
    output_dir="./GPT2",            # checkpoints go next to the tokenizer files
    overwrite_output_dir=True,
    per_device_train_batch_size=64,
    num_train_epochs=1,
    prediction_loss_only=True,
    save_steps=10_000,              # checkpoint every 10k optimisation steps ...
    save_total_limit=2,             # ... keeping at most two of them on disk
)

# Wire the model, the dataset and the batch collator together.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### Display logs in wandb

In [9]:
# Authenticate with Weights & Biases so the training run can be logged there.
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mcharlesxu90[0m (use `wandb login --relogin` to force relogin)


True

In [10]:
# Set the W&B project name via an environment variable (presumably picked up by the
# Trainer's W&B logging when training starts).
%env WANDB_PROJECT=GPT2_train_demo

env: WANDB_PROJECT=GPT2_train_demo


### Train model

In [11]:
%%time
# Run the training loop configured above; `%%time` reports how long the cell took.
trainer.train()

***** Running training *****
  Num examples = 974545
  Num Epochs = 1
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 7614
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2021-09-12 15:48:52.773701: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0




Step,Training Loss
500,7.354
1000,6.5015
1500,6.1459
2000,5.9219
2500,5.7509
3000,5.6429
3500,5.5311
4000,5.4511
4500,5.3862
5000,5.3304




Training completed. Do not forget to share your model on huggingface.co/models =)




CPU times: user 1h 2min 11s, sys: 33min 58s, total: 1h 36min 10s
Wall time: 1h 41min 4s


TrainOutput(global_step=7614, training_loss=5.677009143633952, metrics={'train_runtime': 6063.8374, 'train_samples_per_second': 160.714, 'train_steps_per_second': 1.256, 'total_flos': 6.366021083136e+16, 'train_loss': 5.677009143633952, 'epoch': 1.0})

In [None]:
# Write the trained weights and model config to ./GPT2.
trainer.save_model("./GPT2")

# The Trainer was created without a tokenizer, so `save_model` does not export one.
# Save the configured tokenizer (including its special-token setup) alongside the model
# so that reloading from ./GPT2 yields the same tokenizer that was used for training.
fs_tokenizer.save_pretrained("./GPT2");

In [None]:
from transformers import pipeline

# Build a text-generation pipeline from the model and tokenizer files stored in ./GPT2.
text_gen = pipeline("text-generation", tokenizer="./GPT2", model="./GPT2")

In [None]:
# Generate a continuation for the prompt "Temas" with the trained model.
text_gen("Temas")