<a href="https://colab.research.google.com/github/ftnext/practice-dl-nlp/blob/master/bert_exercise/20220506KantaiBERT_with_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Rewrite https://github.com/ftnext/practice-dl-nlp/blob/master/bert_exercise/20220423KantaiBERT.ipynb with 🤗 datasets.

# Step 1: Fetch dataset

In [1]:
!curl --output kant.txt \
  https://raw.githubusercontent.com/PacktPublishing/Transformers-for-Natural-Language-Processing/main/Chapter03/kant.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 10.7M  100 10.7M    0     0  19.0M      0 --:--:-- --:--:-- --:--:-- 19.0M


In [2]:
!wc -l kant.txt

188287 kant.txt


# Step 2: Install dependencies

In [3]:
!pip uninstall -y tensorflow

Found existing installation: tensorflow 2.8.0
Uninstalling tensorflow-2.8.0:
  Successfully uninstalled tensorflow-2.8.0


In [4]:
!pip install transformers datasets

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 117 kB/s 
[?25hCollecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 61 kB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 93 kB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 126 kB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 132 kB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 120 kB/s

In [5]:
!pip list | grep -E 'transformers|tokenizers'

tokenizers                    0.12.1
transformers                  4.18.0


In [6]:
!pip list | grep -E '^datasets'

datasets                      2.1.0


In [7]:
!pip list | grep torch

torch                         1.11.0+cu113
torchaudio                    0.11.0+cu113
torchsummary                  1.5.1
torchtext                     0.12.0
torchvision                   0.12.0+cu113


# Check GPU

In [8]:
!nvidia-smi

Fri May  6 02:56:57 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8    25W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [9]:
import torch

In [10]:
torch.cuda.is_available()

True

# Imports

In [11]:
from pathlib import Path

In [12]:
from datasets import load_dataset

from tokenizers import ByteLevelBPETokenizer

In [13]:
from transformers import (
    pipeline,
    DataCollatorForLanguageModeling,
    RobertaConfig,
    RobertaTokenizer,
    RobertaForMaskedLM,
    Trainer,
    TrainingArguments,
)

# Tokenizer

## Train tokenizer then save

In [14]:
paths = ["kant.txt"]

tokenizer = ByteLevelBPETokenizer()

special_tokens = ["<s>", "<pad>", "</s>", "<unk>","<mask>"]

In [15]:
%%time
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=special_tokens)

CPU times: user 7.17 s, sys: 233 ms, total: 7.4 s
Wall time: 3.91 s


In [16]:
token_dir = Path("KantaiBERT")
token_dir.mkdir(exist_ok=True)

tokenizer.save_model(str(token_dir))

['KantaiBERT/vocab.json', 'KantaiBERT/merges.txt']

In [17]:
!ls -lh KantaiBERT

total 496K
-rw-r--r-- 1 root root 186K May  6 02:57 merges.txt
-rw-r--r-- 1 root root 308K May  6 02:57 vocab.json


In [18]:
tokenizer.get_vocab_size()

19296

# RoBERTa

## Config

In [19]:
config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

## Tokenizer

In [20]:
tokenizer = RobertaTokenizer.from_pretrained(str(token_dir), max_length=512)

In [21]:
len(tokenizer.get_vocab())

19296

In [22]:
tokenizer.encode("The Critique of Pure Reason.")

[0, 803, 2245, 270, 1410, 1270, 18, 2]

## Model

In [23]:
model = RobertaForMaskedLM(config)

In [24]:
model.num_parameters()

83504416

## Dataset (for pre-training)

In [25]:
text_column_name = "text"

In [26]:
def tokenize_function(examples):
    examples[text_column_name] = [
        line
        for line in examples[text_column_name]
        if len(line) > 0 and not line.isspace()
    ]
    return tokenizer(
        examples[text_column_name],
        padding=False,
        truncation=True,
        max_length=512,
        return_special_tokens_mask=True,
    )

In [27]:
raw_datasets = load_dataset("text", data_files="kant.txt")

Using custom data configuration default-13e0b5fd85b56838


Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-13e0b5fd85b56838/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-13e0b5fd85b56838/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [28]:
%%time
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    num_proc=None,
    remove_columns=[text_column_name],
    load_from_cache_file=True,
    desc="Running tokenizer on dataset line_by_line",
)

Running tokenizer on dataset line_by_line:   0%|          | 0/189 [00:00<?, ?ba/s]

CPU times: user 40.5 s, sys: 1.16 s, total: 41.7 s
Wall time: 41.5 s


In [29]:
dataset = tokenized_datasets["train"]

In [30]:
len(dataset)

170964

## Data collator

In [31]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

## Train

In [32]:
training_args = TrainingArguments(
    output_dir=str(token_dir),
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
)

In [33]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [34]:
%%time
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 170964
  Num Epochs = 1
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 2672


Step,Training Loss
500,6.6022
1000,5.7325
1500,5.2521
2000,4.9988
2500,4.8453




Training completed. Do not forget to share your model on huggingface.co/models =)




CPU times: user 18min 21s, sys: 8.96 s, total: 18min 30s
Wall time: 18min 24s


TrainOutput(global_step=2672, training_loss=5.4421061213145, metrics={'train_runtime': 1104.4049, 'train_samples_per_second': 154.802, 'train_steps_per_second': 2.419, 'total_flos': 873620128952064.0, 'train_loss': 5.4421061213145, 'epoch': 1.0})

In [35]:
trainer.save_model(str(token_dir))

Saving model checkpoint to KantaiBERT
Configuration saved in KantaiBERT/config.json
Model weights saved in KantaiBERT/pytorch_model.bin


## fill-mask task

In [36]:
fill_mask = pipeline(
    "fill-mask",
    model=str(token_dir),
    tokenizer=str(token_dir)
)

loading configuration file KantaiBERT/config.json
Model config RobertaConfig {
  "_name_or_path": "KantaiBERT",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}

loading configuration file KantaiBERT/config.json
Model config RobertaConfig {
  "_name_or_path": "KantaiBERT",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,


In [37]:
fill_mask("Human thinking involves<mask>.")

[{'score': 0.020495852455496788,
  'sequence': 'Human thinking involves experience.',
  'token': 531,
  'token_str': ' experience'},
 {'score': 0.01469163317233324,
  'sequence': 'Human thinking involves reason.',
  'token': 393,
  'token_str': ' reason'},
 {'score': 0.008073052391409874,
  'sequence': 'Human thinking involves itself.',
  'token': 500,
  'token_str': ' itself'},
 {'score': 0.006722672842442989,
  'sequence': 'Human thinking involves nature.',
  'token': 586,
  'token_str': ' nature'},
 {'score': 0.006194472312927246,
  'sequence': 'Human thinking involves it.',
  'token': 306,
  'token_str': ' it'}]