<a href="https://colab.research.google.com/github/ftnext/practice-dl-nlp/blob/master/bert_exercise/20220510KantaiBERT_datasets_tokenizers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Rewrite https://github.com/ftnext/practice-dl-nlp/blob/master/bert_exercise/20220506KantaiBERT_with_datasets.ipynb with 🤗 tokenizers.

# Step 1: Fetch dataset

In [1]:
!curl --output kant.txt \
  https://raw.githubusercontent.com/PacktPublishing/Transformers-for-Natural-Language-Processing/main/Chapter03/kant.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 10.7M  100 10.7M    0     0  8470k      0  0:00:01  0:00:01 --:--:-- 8470k


In [2]:
!wc -l kant.txt

188287 kant.txt


# Step 2: Install dependencies

In [3]:
!pip uninstall -y tensorflow

Found existing installation: tensorflow 2.8.0
Uninstalling tensorflow-2.8.0:
  Successfully uninstalled tensorflow-2.8.0


In [4]:
!pip install transformers datasets

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 12.1 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 46.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 38.4 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 39.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 5.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 48

In [5]:
!pip list | grep -E 'transformers|tokenizers'

tokenizers                    0.12.1
transformers                  4.18.0


In [6]:
!pip list | grep -E '^datasets'

datasets                      2.1.0


In [7]:
!pip list | grep torch

torch                         1.11.0+cu113
torchaudio                    0.11.0+cu113
torchsummary                  1.5.1
torchtext                     0.12.0
torchvision                   0.12.0+cu113


# Check GPU

In [8]:
!nvidia-smi

Tue May 10 12:56:54 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   72C    P8    33W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [9]:
import torch

In [10]:
torch.cuda.is_available()

True

# Imports

In [11]:
from pathlib import Path

In [12]:
from datasets import load_dataset

In [13]:
from tokenizers import Tokenizer, decoders, pre_tokenizers, processors
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

In [14]:
from transformers import (
    pipeline,
    DataCollatorForLanguageModeling,
    RobertaConfig,
    RobertaTokenizerFast,
    RobertaForMaskedLM,
    Trainer,
    TrainingArguments,
)

# Tokenizer

## Train tokenizer then save

In [15]:
paths = ["kant.txt"]
special_tokens = ["<s>", "<pad>", "</s>", "<unk>","<mask>"]

In [16]:
# tokenizer = ByteLevelBPETokenizer() 相当
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

In [17]:
# tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=special_tokens) 相当
trainer = BpeTrainer(
    vocab_size=52_000,
    min_frequency=2,
    special_tokens=special_tokens,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)

In [18]:
%%time
tokenizer.train(paths, trainer)

CPU times: user 6.55 s, sys: 202 ms, total: 6.75 s
Wall time: 3.54 s


In [19]:
# RobertaTokenizerFastとして読み込めるように後処理を追加
tokenizer.post_processor = processors.TemplateProcessing(
    single="<s> $A </s>",
    pair="<s> $A </s> $B:1 </s>:1",
    special_tokens=[
        ("<s>", tokenizer.token_to_id("<s>")),
        ("</s>", tokenizer.token_to_id("</s>")),
    ],
)

In [20]:
token_dir = Path("KantaiBERT")
token_dir.mkdir(exist_ok=True)

tokenizer.save(str(token_dir / "tokenizer.json"))  # pipelineでロードするために命名規則に従う

In [21]:
!ls -lh KantaiBERT

total 816K
-rw-r--r-- 1 root root 815K May 10 12:57 tokenizer.json


In [22]:
tokenizer.get_vocab_size()

19296

# RoBERTa

## Config

In [23]:
config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

## Tokenizer

In [24]:
tokenizer = RobertaTokenizerFast(tokenizer_file=str(token_dir / "tokenizer.json"))

In [25]:
len(tokenizer.get_vocab())

19296

In [26]:
tokenizer.encode("The Critique of Pure Reason.")

[0, 803, 2245, 270, 1410, 1270, 18, 2]

## Model

In [27]:
model = RobertaForMaskedLM(config)

In [28]:
model.num_parameters()

83504416

## Dataset (for pre-training)

In [29]:
text_column_name = "text"

In [30]:
def tokenize_function(examples):
    examples[text_column_name] = [
        line
        for line in examples[text_column_name]
        if len(line) > 0 and not line.isspace()
    ]
    return tokenizer(
        examples[text_column_name],
        padding=False,
        truncation=True,
        max_length=512,
        return_special_tokens_mask=True,
    )

In [31]:
raw_datasets = load_dataset("text", data_files="kant.txt")

Using custom data configuration default-0c7569542c5a94be


Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-0c7569542c5a94be/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-0c7569542c5a94be/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [32]:
%%time
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    num_proc=None,
    remove_columns=[text_column_name],
    load_from_cache_file=True,
    desc="Running tokenizer on dataset line_by_line",
)

Running tokenizer on dataset line_by_line:   0%|          | 0/189 [00:00<?, ?ba/s]

CPU times: user 13.3 s, sys: 172 ms, total: 13.5 s
Wall time: 9.38 s


In [33]:
dataset = tokenized_datasets["train"]

In [34]:
len(dataset)

170964

## Data collator

In [35]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

## Train

In [36]:
training_args = TrainingArguments(
    output_dir=str(token_dir),
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
)

In [37]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [38]:
%%time
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 170964
  Num Epochs = 1
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 2672


Step,Training Loss
500,6.5983
1000,5.7215
1500,5.2343
2000,4.9835
2500,4.8389




Training completed. Do not forget to share your model on huggingface.co/models =)




CPU times: user 19min 8s, sys: 7.57 s, total: 19min 15s
Wall time: 19min 10s


TrainOutput(global_step=2672, training_loss=5.431695355626637, metrics={'train_runtime': 1150.7648, 'train_samples_per_second': 148.566, 'train_steps_per_second': 2.322, 'total_flos': 873620128952064.0, 'train_loss': 5.431695355626637, 'epoch': 1.0})

In [39]:
trainer.save_model(str(token_dir))

Saving model checkpoint to KantaiBERT
Configuration saved in KantaiBERT/config.json
Model weights saved in KantaiBERT/pytorch_model.bin


## fill-mask task

In [40]:
fill_mask = pipeline(
    "fill-mask",
    model=str(token_dir),
    tokenizer=str(token_dir)
)

loading configuration file KantaiBERT/config.json
Model config RobertaConfig {
  "_name_or_path": "KantaiBERT",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}

loading configuration file KantaiBERT/config.json
Model config RobertaConfig {
  "_name_or_path": "KantaiBERT",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,


In [41]:
fill_mask("Human thinking involves<mask>.")

[{'score': 0.03786461800336838,
  'sequence': 'Human thinking involves it.',
  'token': 306,
  'token_str': ' it'},
 {'score': 0.019051196053624153,
  'sequence': 'Human thinking involves reason.',
  'token': 393,
  'token_str': ' reason'},
 {'score': 0.010969003662467003,
  'sequence': 'Human thinking involves I.',
  'token': 364,
  'token_str': ' I'},
 {'score': 0.010744065046310425,
  'sequence': 'Human thinking involves experience.',
  'token': 531,
  'token_str': ' experience'},
 {'score': 0.009861685335636139,
  'sequence': 'Human thinking involves,.',
  'token': 16,
  'token_str': ','}]