In [1]:
paths = ['guacamol_v1_train.smiles']

## Train a tokenizer

In [2]:
%%time 

from tokenizers import ByteLevelBPETokenizer

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=1_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])



CPU times: user 48.3 s, sys: 25.7 ms, total: 48.3 s
Wall time: 49.1 s



In [3]:
#!mkdir GPT2
tokenizer.save_model("./SGPT2")

['./SGPT2/vocab.json', './SGPT2/merges.txt']

## Train a model

### Load Tokenizer

In [4]:
from transformers import GPT2Tokenizer

fs_tokenizer = GPT2Tokenizer.from_pretrained("./SGPT2", max_len=512)
fs_tokenizer.add_special_tokens({
  "eos_token": "</s>",
  "bos_token": "<s>",
  "unk_token": "<unk>",
  "pad_token": "<pad>",
  "mask_token": "<mask>"
})

2021-09-13 10:44:18.774512: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


0

### Create dataset

In [5]:
%%time
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=fs_tokenizer,
    file_path=paths[0],
    block_size=128,
)



CPU times: user 2min 23s, sys: 1.97 s, total: 2min 25s
Wall time: 2min 27s


In [6]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=fs_tokenizer, mlm=False
)

### Initiate model

In [7]:
from transformers import GPT2Config, GPT2LMHeadModel

# creating the configurations from which the model can be made
config = GPT2Config(
  vocab_size=fs_tokenizer.vocab_size,
  bos_token_id=fs_tokenizer.bos_token_id,
  eos_token_id=fs_tokenizer.eos_token_id
)
# creating the model
model = GPT2LMHeadModel(config)

### Define Trainer

In [15]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./SGPT2",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


### Display logs in wandb

In [16]:
import wandb
wandb.login()



True

In [17]:
%env WANDB_PROJECT=SGPT_train_demo

env: WANDB_PROJECT=SGPT_train_demo


### Train model

In [18]:
%%time
trainer.train()

***** Running training *****
  Num examples = 1273104
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 49735
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss
500,0.904
1000,0.8919
1500,0.8793
2000,0.8674
2500,0.8572
3000,0.8484
3500,0.8431
4000,0.8347
4500,0.8297
5000,0.8223


Saving model checkpoint to ./SGPT2/checkpoint-10000
Configuration saved in ./SGPT2/checkpoint-10000/config.json
Model weights saved in ./SGPT2/checkpoint-10000/pytorch_model.bin
Deleting older checkpoint [SGPT2/checkpoint-20000] due to args.save_total_limit
Saving model checkpoint to ./SGPT2/checkpoint-20000
Configuration saved in ./SGPT2/checkpoint-20000/config.json
Model weights saved in ./SGPT2/checkpoint-20000/pytorch_model.bin
Deleting older checkpoint [SGPT2/checkpoint-30000] due to args.save_total_limit
Saving model checkpoint to ./SGPT2/checkpoint-30000
Configuration saved in ./SGPT2/checkpoint-30000/config.json
Model weights saved in ./SGPT2/checkpoint-30000/pytorch_model.bin
Deleting older checkpoint [SGPT2/checkpoint-10000] due to args.save_total_limit
Saving model checkpoint to ./SGPT2/checkpoint-40000
Configuration saved in ./SGPT2/checkpoint-40000/config.json
Model weights saved in ./SGPT2/checkpoint-40000/pytorch_model.bin
Deleting older checkpoint [SGPT2/checkpoint-2000

CPU times: user 2h 54min 54s, sys: 1h 11min 10s, total: 4h 6min 4s
Wall time: 4h 14min 26s


TrainOutput(global_step=49735, training_loss=0.7094691694471912, metrics={'train_runtime': 15266.6612, 'train_samples_per_second': 416.956, 'train_steps_per_second': 3.258, 'total_flos': 2.22376437891072e+17, 'train_loss': 0.7094691694471912, 'epoch': 5.0})

In [12]:
trainer.save_model("./SGPT2")

Saving model checkpoint to ./SGPT2
Configuration saved in ./SGPT2/config.json
Model weights saved in ./SGPT2/pytorch_model.bin


## Test generation

In [13]:
from transformers import pipeline

text_gen = pipeline(
    "text-generation",
    model="./SGPT2",
    tokenizer="./SGPT2"
)

loading configuration file ./SGPT2/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.9.1",
  "use_cache": true,
  "vocab_size": 1000
}

loading configuration file ./SGPT2/config.json
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop":

In [14]:
text_gen("C")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[{'generated_text': 'C=C(C)C1CCC2(C(=O)O)CCC3('}]