In [1]:
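# Adapted from the Hugging Face "How to train a new language model from scratch" material:
#   Colab notebook: https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb#scrollTo=oK7PPVm2XBgr
#   Blog post:      https://huggingface.co/blog/how-to-train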

In [2]:
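# The upstream tutorial fetches only one of its corpus files (the OSCAR one) for the sake of
# simplicity and performance. That download is left disabled here; the cells below read their
# corpus from a text file on Google Drive instead.
# !wget -c https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt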

In [3]:
# We won't need TensorFlow here
!pip uninstall -y tensorflow
# Install `transformers` from master
!pip install git+https://github.com/huggingface/transformers
!pip install git+https://github.com/huggingface/datasets
!pip list | grep -E 'transformers|tokenizers'
# transformers version at notebook update --- 2.11.0
# tokenizers version at notebook update --- 0.8.0rc1

Uninstalling tensorflow-2.4.1:
  Successfully uninstalled tensorflow-2.4.1
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-q97vfaac
  Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-q97vfaac
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 4.0MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 27

In [4]:
from google.colab import drive

# Attach Google Drive under /content/drive; `force_remount=True` is passed so the
# mount is redone even if one is already present.
drive_mount_point = "/content/drive"
drive.mount(drive_mount_point, force_remount=True)

Mounted at /content/drive


In [5]:
# import pandas as pd
# abc_heads = pd.read_csv("/content/drive/My Drive/abc/abcheads.csv")
# print(abc_heads)

In [6]:
%%time 
from tokenizers import ByteLevelBPETokenizer

# Start from an empty byte-level BPE tokenizer.
tokenizer = ByteLevelBPETokenizer()

# Training inputs, named up front so the call below stays short.
corpus_file = "/content/drive/My Drive/abc/abcheads.txt"
reserved_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

# Fit the tokenizer on the corpus file with a 52k vocabulary budget.
tokenizer.train(
    files=corpus_file,
    vocab_size=52_000,
    min_frequency=2,
    special_tokens=reserved_tokens,
)

CPU times: user 29.5 s, sys: 4.33 s, total: 33.8 s
Wall time: 10.4 s


In [7]:
!mkdir "/content/drive/My Drive/abc/aBERTc"
tokenizer.save_model("/content/drive/My Drive/abc/aBERTc")

mkdir: cannot create directory ‘/content/drive/My Drive/abc/aBERTc’: File exists


['/content/drive/My Drive/abc/aBERTc/vocab.json',
 '/content/drive/My Drive/abc/aBERTc/merges.txt']

In [8]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


# Rebuild the tokenizer from the vocabulary and merges files saved on Drive.
vocab_path = "/content/drive/My Drive/abc/aBERTc/vocab.json"
merges_path = "/content/drive/My Drive/abc/aBERTc/merges.txt"
tokenizer = ByteLevelBPETokenizer(vocab_path, merges_path)

In [9]:
# Look up the (token, id) pairs for the two boundary tokens.
end_pair = ("</s>", tokenizer.token_to_id("</s>"))
start_pair = ("<s>", tokenizer.token_to_id("<s>"))

# Wrap every encoded sequence with the boundary tokens. In the encoding shown
# further down, "<s>" comes first and "</s>" last, i.e. the first argument is the
# closing token and the second the opening one.
tokenizer._tokenizer.post_processor = BertProcessing(end_pair, start_pair)

# Cap encoded sequences at 512 tokens.
tokenizer.enable_truncation(max_length=512)

In [10]:
# Encode one sample headline and display the resulting Encoding summary.
sample_encoding = tokenizer.encode("Donald Trump against bird shooting.")
sample_encoding

Encoding(num_tokens=12, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [11]:
# Display the token strings the sample headline is split into.
sample_tokens = tokenizer.encode("Donald Trump against bird shooting.").tokens
sample_tokens

['<s>',
 'D',
 'onald',
 'Ġ',
 'T',
 'r',
 'ump',
 'Ġagainst',
 'Ġbird',
 'Ġshooting',
 '.',
 '</s>']

In [12]:
# Check that we have a GPU: print the NVIDIA driver / device status table for this
# runtime (GPU model, memory in use, utilisation).
!nvidia-smi

Fri Feb 26 18:42:02 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [13]:
# Confirm that PyTorch can see a CUDA device; the boolean is shown as the cell output.
import torch

cuda_available = torch.cuda.is_available()
cuda_available

True

In [14]:
from transformers import RobertaConfig

# Model hyper-parameters collected in one mapping and unpacked into the config.
# `vocab_size` is the same 52_000 that the tokenizer was trained with.
roberta_settings = {
    "vocab_size": 52_000,
    "max_position_embeddings": 514,
    "num_attention_heads": 12,
    "num_hidden_layers": 6,
    "type_vocab_size": 1,
}
config = RobertaConfig(**roberta_settings)

In [15]:
from transformers import RobertaTokenizerFast

# Reload the saved tokenizer files as a `transformers` fast tokenizer.
# `model_max_length` is the documented keyword for the maximum sequence length;
# the previously used `max_len` is only accepted as a backward-compatibility alias.
tokenizer = RobertaTokenizerFast.from_pretrained(
    "/content/drive/My Drive/abc/aBERTc",
    model_max_length=512,
)

In [16]:
from transformers import RobertaForMaskedLM

# Build the masked-LM model from `config` alone; no checkpoint is loaded here.
model = RobertaForMaskedLM(config)

In [17]:
# Total number of model parameters -- about 84 million for this configuration.
param_count = model.num_parameters()
param_count

83504416

In [19]:
from transformers import LineByLineTextDataset

# Build the training dataset from the same corpus file the tokenizer was trained on.
# NOTE(review): presumably one example per text line, cut to `block_size` tokens --
# confirm against the LineByLineTextDataset documentation.
corpus_path = "/content/drive/My Drive/abc/abcheads.txt"
dataset = LineByLineTextDataset(
    tokenizer=tokenizer, file_path=corpus_path, block_size=128
)



In [20]:
from transformers import DataCollatorForLanguageModeling

# Collator that prepares masked-language-modelling batches with this tokenizer.
# NOTE(review): `mlm_probability=0.15` is presumably the per-token masking rate -- confirm.
mlm_settings = dict(mlm=True, mlm_probability=0.15)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, **mlm_settings)

In [21]:
from transformers import Trainer, TrainingArguments

# Run settings: one epoch, batches of 64 per device, checkpoint bookkeeping via
# `save_steps` / `save_total_limit`. The output directory is the same Drive folder
# that holds the tokenizer files, and it is allowed to be overwritten.
run_settings = dict(
    output_dir="/content/drive/My Drive/abc/aBERTc",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
)
training_args = TrainingArguments(**run_settings)

# Wire model, arguments, collator and dataset together.
# (`prediction_loss_only=True` was left disabled in this notebook.)
trainer_inputs = dict(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer = Trainer(**trainer_inputs)

In [22]:
%%time
# Run the training loop configured on `trainer`. This is the expensive cell: the
# recorded run below took roughly 1h 9min wall time for one epoch (19161 steps).
# The returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss
500,9.1792
1000,8.5458
1500,8.2393
2000,8.0738
2500,7.9371
3000,7.7753
3500,7.7362
4000,7.6521
4500,7.5283
5000,7.4608


CPU times: user 26min 28s, sys: 42min 45s, total: 1h 9min 14s
Wall time: 1h 9min 4s


TrainOutput(global_step=19161, training_loss=7.161705022266094, metrics={'train_runtime': 4142.9292, 'train_samples_per_second': 4.625, 'total_flos': 8603870488189056.0, 'epoch': 1.0, 'init_mem_cpu_alloc_delta': 7820422, 'init_mem_gpu_alloc_delta': 334180352, 'init_mem_cpu_peaked_delta': 18306, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 762130, 'train_mem_gpu_alloc_delta': 1020621824, 'train_mem_cpu_peaked_delta': 364562230, 'train_mem_gpu_peaked_delta': 1589757440})

In [23]:
# Persist the trained model into the Drive folder used as the training output directory.
model_dir = "/content/drive/My Drive/abc/aBERTc"
trainer.save_model(model_dir)

In [24]:
from transformers import pipeline

# Build a fill-mask pipeline, loading both model and tokenizer from the saved folder.
checkpoint_dir = "/content/drive/My Drive/abc/aBERTc"
fill_mask = pipeline("fill-mask", model=checkpoint_dir, tokenizer=checkpoint_dir)

Some weights of RobertaModel were not initialized from the model checkpoint at /content/drive/My Drive/abc/aBERTc and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Upstream tutorial prompt, kept for reference:
# The sun <mask>.
# =>

# NOTE(review): in the tokenization shown earlier, "Trump" is split into 'T', 'r',
# 'ump', so capitalised words look poorly covered by the learned vocabulary.
# Lower-casing prompts may give better completions -- confirm against the corpus.
first_prompt = "Donald Trump <mask>."
fill_mask(first_prompt)

[{'score': 0.008189248852431774,
  'sequence': 'Donald Trump1.',
  'token': 21,
  'token_str': '1'},
 {'score': 0.008159184828400612,
  'sequence': 'Donald Trumps.',
  'token': 87,
  'token_str': 's'},
 {'score': 0.0066345250234007835,
  'sequence': 'Donald Trumpth.',
  'token': 322,
  'token_str': 'th'},
 {'score': 0.004682299215346575,
  'sequence': 'Donald Trumpg.',
  'token': 75,
  'token_str': 'g'},
 {'score': 0.004626961424946785,
  'sequence': 'Donald Trumpy.',
  'token': 93,
  'token_str': 'y'}]

In [26]:
# Upstream tutorial prompt, kept for reference:
# This is the beginning of a beautiful <mask>.
# =>

# Ask the model to complete a second headline-style prompt.
second_prompt = "taxpayers will have to <mask>."
fill_mask(second_prompt)

[{'score': 0.016638141125440598,
  'sequence': 'taxpayers will have to the.',
  'token': 362,
  'token_str': ' the'},
 {'score': 0.013024426065385342,
  'sequence': 'taxpayers will have to 20.',
  'token': 656,
  'token_str': ' 20'},
 {'score': 0.012642229907214642,
  'sequence': 'taxpayers will have to 2.',
  'token': 462,
  'token_str': ' 2'},
 {'score': 0.011848079971969128,
  'sequence': 'taxpayers will have to 10.',
  'token': 1681,
  'token_str': ' 10'},
 {'score': 0.010218915529549122,
  'sequence': 'taxpayers will have to 25.',
  'token': 2824,
  'token_str': ' 25'}]