In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |▏                               | 10kB 25.9MB/s eta 0:00:01[K     |▎                               | 20kB 28.9MB/s eta 0:00:01[K     |▌                               | 30kB 19.6MB/s eta 0:00:01[K     |▋                               | 40kB 22.8MB/s eta 0:00:01[K     |▉                               | 51kB 24.3MB/s eta 0:00:01[K     |█                               | 61kB 26.8MB/s eta 0:00:01[K     |█▏                              | 71kB 21.5MB/s eta 0:00:01[K     |█▎                              | 81kB 22.2MB/s eta 0:00:01[K     |█▍                              | 92kB 20.9MB/s eta 0:00:01[K     |█▋                              | 102kB 20.3MB/s eta 0:00:01[K     |█▊                              | 112kB 20.3MB/s eta 0:00:01[K     |██                              | 

In [None]:
import transformers

In [None]:
# Show which transformers release is active in this kernel.
print(transformers.__version__)

4.5.1


In [None]:
import logging
import os
import math
import torch
import tensorflow as tf
from dataclasses import dataclass, field
from transformers import AutoModel, AutoTokenizer, BertTokenizerFast, BertForMaskedLM, BertModel
from transformers import TrainingArguments, HfArgumentParser
from transformers import LongformerModel 
from transformers import LongformerSelfAttention
from transformers import LongformerConfig 

# Emit INFO-level records (and above) through the root logger's default handler,
# then grab the logger used throughout this notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
class BertLongSelfAttention(LongformerSelfAttention):
    """`LongformerSelfAttention` callable through the `BertSelfAttention.forward` signature.

    NOTE(review): written against the transformers 4.x
    `LongformerSelfAttention.forward`, which expects a 2-D additive mask plus the
    `is_index_masked` / `is_index_global_attn` / `is_global_attn` arguments —
    confirm against the installed transformers release.
    """

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        """Run Longformer attention on inputs supplied in BERT's calling convention.

        Args:
            hidden_states: Input activations; the first two dimensions are used as
                (batch, seq_len) when no mask is given.
            attention_mask: Additive mask. Negative entries are treated as masked,
                positive entries as global attention, zero as local attention.
                A 4-D mask is squeezed on dims 2 and 1; `None` means "no masking".
            head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value:
                Accepted for signature compatibility and ignored.
            output_attentions: Forwarded unchanged to the parent implementation.

        Returns:
            Whatever `LongformerSelfAttention.forward` returns.
        """
        if attention_mask is None:
            # No mask supplied: all positions attend locally, none is masked or global.
            attention_mask = hidden_states.new_zeros(hidden_states.shape[:2])
        elif attention_mask.dim() == 4:
            # assumes the caller passes BERT's broadcastable (batch, 1, 1, seq_len)
            # additive mask — TODO confirm; Longformer wants it as (batch, seq_len).
            attention_mask = attention_mask.squeeze(dim=2).squeeze(dim=1)

        # Derive the index masks the parent forward needs from the additive mask.
        is_index_masked = attention_mask < 0
        is_index_global_attn = attention_mask > 0
        is_global_attn = is_index_global_attn.flatten().any().item()

        return super().forward(
            hidden_states,
            attention_mask=attention_mask,
            is_index_masked=is_index_masked,
            is_index_global_attn=is_index_global_attn,
            is_global_attn=is_global_attn,
            output_attentions=output_attentions,
        )


class BertLong(BertModel):
    """`BertModel` whose encoder layers use `BertLongSelfAttention` for self-attention."""

    def __init__(self, config):
        """Build the stock BERT model, then swap the self-attention of every layer.

        Args:
            config: Model configuration, passed both to `BertModel` and to each
                `BertLongSelfAttention` (presumably it must carry the Longformer
                `attention_window` setting — verify against the saved config).
        """
        super().__init__(config)
        # Each `modeling_bert.BertSelfAttention` gives way to the Longformer-based variant,
        # which is told its own layer index.
        for layer_idx, encoder_layer in enumerate(self.encoder.layer):
            encoder_layer.attention.self = BertLongSelfAttention(config, layer_id=layer_idx)

In [None]:
def create_long_model(save_model_to, attention_window, max_pos):
    """Extend ``GroNLP/bert-base-dutch-cased`` to ``max_pos`` positions with Longformer attention.

    Loads the pretrained BERT model and tokenizer, enlarges the position embedding
    table to ``max_pos`` rows by repeating the pretrained rows, replaces the
    self-attention of every encoder layer with ``LongformerSelfAttention`` (reusing
    the pretrained query/key/value projections), and saves model and tokenizer.

    Args:
        save_model_to: Directory handed to ``save_pretrained`` for model and tokenizer.
        attention_window: Window size stored in ``config.attention_window`` for every layer.
        max_pos: New number of positions; must exceed the pretrained table size.

    Returns:
        ``(model, tokenizer, new_pos_embed)`` where ``new_pos_embed`` is the
        ``(max_pos, embed_size)`` tensor backing the new position embeddings.

    Raises:
        AssertionError: If ``max_pos`` is not larger than the pretrained table size.
    """
    import copy  # function-scope import: only needed for the global projections below

    model = BertModel.from_pretrained("GroNLP/bert-base-dutch-cased")
    tokenizer = BertTokenizerFast.from_pretrained("GroNLP/bert-base-dutch-cased", model_max_length=max_pos)
    config = model.config

    print(max_pos)
    old_pos_weight = model.embeddings.position_embeddings.weight
    current_max_pos, embed_size = old_pos_weight.shape
    # Validate before touching tokenizer/config so a bad max_pos leaves nothing half-updated.
    assert max_pos > current_max_pos, (
        f"max_pos ({max_pos}) must be larger than the pretrained size ({current_max_pos})"
    )

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    config.max_position_embeddings = max_pos

    # allocate a larger position embedding matrix
    new_pos_embed = old_pos_weight.new_empty(max_pos, embed_size)
    print(new_pos_embed.shape)
    print(model.embeddings.position_embeddings)
    # copy position embeddings over and over to initialize the new position embeddings;
    # the last chunk is truncated so every row is filled even when max_pos is not a
    # multiple of the pretrained size (new_empty leaves untouched rows uninitialised).
    with torch.no_grad():
        for start in range(0, max_pos, current_max_pos):
            rows = min(current_max_pos, max_pos - start)
            new_pos_embed[start:start + rows] = old_pos_weight[:rows]
    print(new_pos_embed.shape)
    # int64 row vector 0..max_pos-1, shaped (1, max_pos)
    model.embeddings.position_ids = torch.arange(max_pos, dtype=torch.long).unsqueeze(0)
    # freeze=False: from_pretrained would otherwise mark the new table as non-trainable
    model.embeddings.position_embeddings = torch.nn.Embedding.from_pretrained(new_pos_embed, freeze=False)

    # replace the `modeling_bert.BertSelfAttention` object with `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.encoder.layer):
        bert_self_attn = layer.attention.self
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        longformer_self_attn.query = bert_self_attn.query
        longformer_self_attn.key = bert_self_attn.key
        longformer_self_attn.value = bert_self_attn.value

        # Global projections start from the same weights but as independent copies,
        # so they are not the very same module objects as the local projections.
        longformer_self_attn.query_global = copy.deepcopy(bert_self_attn.query)
        longformer_self_attn.key_global = copy.deepcopy(bert_self_attn.key)
        longformer_self_attn.value_global = copy.deepcopy(bert_self_attn.value)

        layer.attention.self = longformer_self_attn
    print(model.embeddings.position_ids.shape)
    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer, new_pos_embed

In [None]:
@dataclass
class ModelArgs:
    """Options for the long-sequence model conversion, parsed next to `TrainingArguments`."""

    # Width of the attention window.
    attention_window: int = field(
        default=512,
        metadata={"help": "Size of attention window"},
    )
    # Number of positions the converted model supports.
    max_pos: int = field(
        default=4096,
        metadata={"help": "Maximum position"},
    )

# Choose GPU. This is set before the arguments are parsed so the variable is already
# in place by the time anything queries CUDA; changing it after CUDA has been
# initialised has no effect.
# NOTE(review): TrainingArguments may look up the device while being constructed —
# confirm against the installed transformers version.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Parse a fixed argument list (instead of sys.argv) into the two dataclasses.
parser = HfArgumentParser((TrainingArguments, ModelArgs,))

training_args, model_args = parser.parse_args_into_dataclasses(look_for_args_file=False, args=[
    '--output_dir', 'tmp',
    '--warmup_steps', '500',
    '--learning_rate', '0.00003',
    '--weight_decay', '0.01',
    '--adam_epsilon', '1e-6',
    '--max_steps', '3000',
    '--logging_steps', '500',
    '--save_steps', '500',
    '--max_grad_norm', '5.0',
    '--per_gpu_eval_batch_size', '8',
    '--per_gpu_train_batch_size', '2',  # 32GB gpu with fp32
    '--gradient_accumulation_steps', '32',
    '--do_train',
    '--do_eval',
])

In [None]:
# Target directory for the converted checkpoint, nested under the training output dir.
model_path = f'{training_args.output_dir}/GroNLP/bert-base-dutch-cased-{model_args.max_pos}'
# exist_ok=True creates the directory tree when missing and is a no-op when it is
# already there, without a separate check-then-create step.
os.makedirs(model_path, exist_ok=True)

logger.info(f'GroNLP/bert-base-dutch-cased into GroNLP/bert-base-dutch-cased-{model_args.max_pos}')
model, tokenizer, new_pos_embed = create_long_model(
    save_model_to=model_path, attention_window=model_args.attention_window, max_pos=model_args.max_pos)

INFO:__main__:GroNLP/bert-base-dutch-cased into GroNLP/bert-base-dutch-cased-4096
INFO:filelock:Lock 140210913015760 acquired on /root/.cache/huggingface/transformers/443c1d513d458927e5883e0b1298cdb70ba4d14a55faa236d93e0598efc78fc7.9c2c2a3ca9723b5324fb9d01fbe78b3550de8641c6cdf59498152ab427df95a2.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=521.0, style=ProgressStyle(description_…

INFO:filelock:Lock 140210913015760 released on /root/.cache/huggingface/transformers/443c1d513d458927e5883e0b1298cdb70ba4d14a55faa236d93e0598efc78fc7.9c2c2a3ca9723b5324fb9d01fbe78b3550de8641c6cdf59498152ab427df95a2.lock
INFO:filelock:Lock 140211926223760 acquired on /root/.cache/huggingface/transformers/0afca22dbb0c0cf270ab24011224d449d604dca5dac95f18ef719be45029ee80.99bc8efd927a7d6b36398f23af86aff6b91a6b607e37773ed52ab62d53cfe5a9.lock





HBox(children=(FloatProgress(value=0.0, description='Downloading', max=436538834.0, style=ProgressStyle(descri…

INFO:filelock:Lock 140211926223760 released on /root/.cache/huggingface/transformers/0afca22dbb0c0cf270ab24011224d449d604dca5dac95f18ef719be45029ee80.99bc8efd927a7d6b36398f23af86aff6b91a6b607e37773ed52ab62d53cfe5a9.lock





Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:filelock:Lock 140210912984976 acquired on /root/.cache/huggingface/transformers/5ab1fe3a4445d1380cdcfca1496fda3de97bc83e5005fd1a0942cb758eacc67a.4c3119f5f60fe4102b00f9d33781542a23e127fb2ee1ece1254b8f908cc357bd.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=241441.0, style=ProgressStyle(descripti…

INFO:filelock:Lock 140210912984976 released on /root/.cache/huggingface/transformers/5ab1fe3a4445d1380cdcfca1496fda3de97bc83e5005fd1a0942cb758eacc67a.4c3119f5f60fe4102b00f9d33781542a23e127fb2ee1ece1254b8f908cc357bd.lock
INFO:filelock:Lock 140210778477840 acquired on /root/.cache/huggingface/transformers/adb82a117c09b0f8768357de8e836a9e0610730782f82edc49dd0020c48f1d03.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock





HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…

INFO:filelock:Lock 140210778477840 released on /root/.cache/huggingface/transformers/adb82a117c09b0f8768357de8e836a9e0610730782f82edc49dd0020c48f1d03.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d.lock
INFO:filelock:Lock 140210778848848 acquired on /root/.cache/huggingface/transformers/32589e144ae92d44c56eef5c89a01b3cf4bb1c744c60073f47834ddfdd84854e.fd86b436ab272b10c085aec068f5dbace040ca1602155bea390706b96cb608e3.lock





HBox(children=(FloatProgress(value=0.0, description='Downloading', max=254.0, style=ProgressStyle(description_…

INFO:filelock:Lock 140210778848848 released on /root/.cache/huggingface/transformers/32589e144ae92d44c56eef5c89a01b3cf4bb1c744c60073f47834ddfdd84854e.fd86b436ab272b10c085aec068f5dbace040ca1602155bea390706b96cb608e3.lock



4096
torch.Size([4096, 768])
Embedding(512, 768)
torch.Size([4096, 768])


INFO:__main__:saving model to tmp/GroNLP/bert-base-dutch-cased-4096


torch.Size([1, 4096])


In [None]:
# Reload tokenizer and model from the directory written by the conversion step.
# The model is loaded through BertLong, and both names rebind the objects
# returned by create_long_model earlier in the notebook.
logger.info(f'Loading the model from {model_path}')
tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = BertLong.from_pretrained(model_path)


INFO:__main__:Loading the model from tmp/GroNLP/bert-base-dutch-cased-4096
