In [None]:
# Show which transformers release (if any) is already installed in this runtime.
!pip show transformers

Name: transformers
Version: 4.23.1
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache
Location: /usr/local/lib/python3.7/dist-packages
Requires: importlib-metadata, numpy, tokenizers, filelock, huggingface-hub, tqdm, requests, pyyaml, packaging, regex
Required-by: 


In [1]:
# Pin the release this notebook was executed with (4.23.1, per the recorded install log)
# so a fresh runtime resolves the same API; %pip installs into the kernel's own environment.
%pip install -q transformers==4.23.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 37.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 77.0 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 92.9 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


In [2]:
# Mount Google Drive at /content/drive; the corpus files and the exported archive used
# further down live under /content/drive/MyDrive/z_inf_data.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
#!pip install transformers==3.0.2

In [3]:
import logging
import os
import math
import copy
import torch
from dataclasses import dataclass, field
from transformers import RobertaForMaskedLM, RobertaTokenizerFast, TextDataset, DataCollatorForLanguageModeling, Trainer
from transformers import TrainingArguments, HfArgumentParser
from transformers.models.longformer.modeling_longformer import LongformerSelfAttention
from transformers import AutoTokenizer

# Set the root logger level to INFO and create the notebook-level `logger` that the
# cells below use for their progress messages.
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)
#logging.basicConfig(level=logging.INFO)

In [4]:
# Load the pretrained PhoBERT masked-LM checkpoint and its tokenizer.
# The variables keep their `roberta_base*` names because create_long_model() and the
# baseline evaluation cell refer to them; the commented-out lines are the roberta-base
# equivalents.
#roberta_base = RobertaForMaskedLM.from_pretrained('roberta-base')
roberta_base = RobertaForMaskedLM.from_pretrained("vinai/phobert-base")
# roberta_base_tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
roberta_base_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

Downloading:   0%|          | 0.00/557 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/543M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/895k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
class RobertaLongSelfAttention(LongformerSelfAttention):
    """LongformerSelfAttention exposed through a different ``forward`` argument list.

    ``forward`` accepts the arguments listed below (presumably the RoBERTa
    self-attention call signature -- confirm against the installed transformers
    version) and translates them into the keyword arguments that
    ``LongformerSelfAttention.forward`` is called with.
    """

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        """Run Longformer self-attention on ``hidden_states``.

        Args:
            hidden_states: input passed through unchanged to the parent ``forward``.
            attention_mask: mask tensor; required in practice even though it defaults
                to ``None``, because it is squeezed immediately. Negative entries are
                treated as masked positions, positive entries as global-attention
                positions.
            head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value:
                accepted for signature compatibility only; they are not forwarded.
            output_attentions: forwarded to the parent ``forward``.

        Returns:
            Whatever ``LongformerSelfAttention.forward`` returns.
        """
        # Drop dims 2 and 1 of the mask (no-ops unless those dims have size 1).
        # assumes the incoming mask is (batch, 1, 1, seq_len) -- TODO confirm
        attention_mask = attention_mask.squeeze(dim=2).squeeze(dim=1)
        is_index_masked = attention_mask < 0
        is_index_global_attn = attention_mask > 0
        # Reduce on the tensor itself: the builtin any() would walk the flattened mask
        # element by element in Python, converting every element to bool one at a time.
        is_global_attn = is_index_global_attn.flatten().any().item()
        return super().forward(
            hidden_states,
            attention_mask=attention_mask,
            is_index_masked=is_index_masked,
            is_index_global_attn=is_index_global_attn,
            is_global_attn=is_global_attn,
            output_attentions=output_attentions,
        )

class RobertaLongForMaskedLM(RobertaForMaskedLM):
    """RobertaForMaskedLM whose encoder layers all use RobertaLongSelfAttention."""

    def __init__(self, config):
        """Build the regular model from ``config``, then swap each layer's self-attention."""
        super().__init__(config)
        encoder_layers = self.roberta.encoder.layer
        for layer_idx in range(len(encoder_layers)):
            # Each replacement module is told which layer it belongs to via layer_id.
            encoder_layers[layer_idx].attention.self = RobertaLongSelfAttention(
                config, layer_id=layer_idx
            )

In [6]:
def create_long_model(save_model_to, attention_window, max_pos):
    """Turn the loaded base checkpoint into a long-sequence model and save it.

    Operates on the notebook-level ``roberta_base`` / ``roberta_base_tokenizer``
    objects and modifies them IN PLACE (position table, config, attention modules,
    tokenizer length); the returned objects are those same instances.
    NOTE(review): because of the in-place mutation, calling this twice with the same
    ``max_pos`` trips the size assertion on the second call.

    Args:
        save_model_to: directory given to ``save_pretrained`` for model and tokenizer.
        attention_window: window size assigned to every hidden layer in the config.
        max_pos: new maximum sequence length, not counting the two reserved positions.

    Returns:
        ``(model, tokenizer)`` -- the mutated base model and tokenizer.

    Raises:
        AssertionError: if the requested table is not larger than the current one, or
            the current table has no rows beyond the two reserved ones.
    """
    model = roberta_base
    tokenizer = roberta_base_tokenizer
    config = model.config

    position_embeddings = model.roberta.embeddings.position_embeddings
    current_max_pos, embed_size = position_embeddings.weight.shape
    # NOTE: RoBERTa has positions 0,1 reserved, so embedding size is max position + 2
    new_max_pos = max_pos + 2
    step = current_max_pos - 2

    # Validate before touching tokenizer/config so a rejected call leaves them unchanged.
    assert new_max_pos > current_max_pos, (
        f'max_pos + 2 ({new_max_pos}) must exceed the current position table size '
        f'({current_max_pos})'
    )
    assert step > 0, 'the current position table has no rows beyond the two reserved ones'

    # extend position embeddings
    tokenizer.model_max_length = max_pos
    tokenizer.init_kwargs['model_max_length'] = max_pos
    config.max_position_embeddings = new_max_pos

    # Build the larger table without recording the copies in the autograd graph.
    with torch.no_grad():
        old_pos_embed = position_embeddings.weight
        new_pos_embed = old_pos_embed.new_empty(new_max_pos, embed_size)
        # new_empty() returns uninitialised memory, so the two reserved rows must be
        # carried over explicitly; otherwise they would hold arbitrary values.
        new_pos_embed[:2] = old_pos_embed[:2]
        # Tile the learned rows (2..end) until the new table is full; the last tile is
        # truncated when the new length is not a multiple of the old usable length.
        k = 2
        while k < new_max_pos:
            chunk = min(step, new_max_pos - k)
            new_pos_embed[k:k + chunk] = old_pos_embed[2:2 + chunk]
            k += chunk
    position_embeddings.weight.data = new_pos_embed
    model.roberta.embeddings.position_ids.data = torch.arange(new_max_pos).reshape(1, new_max_pos)

    # replace each layer's self-attention module with a `LongformerSelfAttention`
    config.attention_window = [attention_window] * config.num_hidden_layers
    for i, layer in enumerate(model.roberta.encoder.layer):
        longformer_self_attn = LongformerSelfAttention(config, layer_id=i)
        # Local projections reuse the existing modules (shared, not copied).
        longformer_self_attn.query = layer.attention.self.query
        longformer_self_attn.key = layer.attention.self.key
        longformer_self_attn.value = layer.attention.self.value

        # Global projections start as independent copies of the local ones.
        longformer_self_attn.query_global = copy.deepcopy(layer.attention.self.query)
        longformer_self_attn.key_global = copy.deepcopy(layer.attention.self.key)
        longformer_self_attn.value_global = copy.deepcopy(layer.attention.self.value)

        layer.attention.self = longformer_self_attn

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
    return model, tokenizer

In [7]:
def copy_proj_layers(model):
    """Overwrite every layer's global projections with deep copies of its local ones.

    For each encoder layer, ``query_global`` / ``key_global`` / ``value_global`` on the
    layer's self-attention module are replaced by independent copies of ``query`` /
    ``key`` / ``value``. The model is modified in place and also returned.
    """
    for encoder_layer in model.roberta.encoder.layer:
        self_attn = encoder_layer.attention.self
        for proj_name in ("query", "key", "value"):
            local_proj = getattr(self_attn, proj_name)
            setattr(self_attn, f"{proj_name}_global", copy.deepcopy(local_proj))
    return model

In [10]:
def pretrain_and_evaluate(args, model, tokenizer, eval_only, model_path):
    """Evaluate ``model`` on the masked-LM objective and, unless ``eval_only``, train it.

    The evaluation loss is logged as bits (loss / ln 2) before training and, when
    training runs, again afterwards.

    Args:
        args: training arguments handed to ``Trainer``; must also carry the
            ``val_datapath`` and ``train_datapath`` attributes (text file paths).
        model: model to evaluate / train.
        tokenizer: tokenizer used for the datasets and the MLM collator; its
            ``model_max_length`` is used as the dataset block size.
        eval_only: when true, no training data is loaded and no training is run.
        model_path: checkpoint directory to resume training from, or ``None``.
            Ignored when ``eval_only`` is true.

    Returns:
        None. Results are reported through ``logger``.
    """
    val_dataset = TextDataset(tokenizer=tokenizer,
                              file_path=args.val_datapath,
                              block_size=tokenizer.model_max_length)
    if eval_only:
        # Trainer still receives a train dataset; reuse the validation set, which is
        # never trained on in this mode.
        train_dataset = val_dataset
    else:
        logger.info(f'Loading and tokenizing training data is usually slow: {args.train_datapath}')
        train_dataset = TextDataset(tokenizer=tokenizer,
                                    file_path=args.train_datapath,
                                    block_size=tokenizer.model_max_length)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    trainer = Trainer(model=model, args=args, data_collator=data_collator,
                      train_dataset=train_dataset, eval_dataset=val_dataset)

    eval_loss = trainer.evaluate()['eval_loss']
    # Dividing the natural-log loss by ln(2) expresses it in bits.
    logger.info(f'Initial eval bpc: {eval_loss/math.log(2)}')

    if not eval_only:
        # `resume_from_checkpoint` is the supported keyword; `model_path` is its
        # deprecated alias in Trainer.train().
        trainer.train(resume_from_checkpoint=model_path)
        trainer.save_model()

        eval_loss = trainer.evaluate()['eval_loss']
        logger.info(f'Eval bpc after pretraining: {eval_loss/math.log(2)}')



In [9]:
@dataclass
class ModelArgs:
    """Conversion options that HfArgumentParser parses alongside TrainingArguments."""

    # Attention window size (used for every layer when the model is converted).
    attention_window: int = field(
        default=256,
        metadata={"help": "Size of attention window"},
    )
    # Target maximum sequence length of the converted model.
    max_pos: int = field(
        default=1024,
        metadata={"help": "Maximum position"},
    )

# Choose GPU. This has to be in the environment before CUDA is initialised, so it is set
# ahead of building TrainingArguments (which may resolve the device while it is
# constructed) instead of at the end of the cell. `os` is already imported at the top.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

parser = HfArgumentParser((TrainingArguments, ModelArgs,))

# Arguments are supplied inline rather than read from sys.argv or an args file.
# Options not listed here (e.g. attention_window, max_pos) keep their defaults.
training_args, model_args = parser.parse_args_into_dataclasses(look_for_args_file=False, args=[
    '--output_dir', 'tmp',
    '--warmup_steps', '500',
    '--learning_rate', '0.00003',
    '--weight_decay', '0.01',
    '--adam_epsilon', '1e-6',
    '--max_steps', '3000',
    '--logging_steps', '500',
    '--save_steps', '500',
    '--max_grad_norm', '5.0',
    '--per_device_eval_batch_size', '8',
    '--per_device_train_batch_size', '2',  # 32GB gpu with fp32
    '--gradient_accumulation_steps', '32',
#    '--evaluate_during_training',
    '--do_train',
    '--do_eval',
    '--prediction_loss_only', 'True'
])
# Extra attributes read by pretrain_and_evaluate(); they are not TrainingArguments fields.
training_args.val_datapath = '/content/drive/MyDrive/z_inf_data/word_seg_corpus_valid.txt'
# NOTE(review): the training path points at the *_test.txt file -- confirm that this is
# the intended training corpus.
training_args.train_datapath = '/content/drive/MyDrive/z_inf_data/word_seg_corpus_test.txt'


In [None]:
# Baseline: evaluate the unconverted base model with its own tokenizer. eval_only=True
# means no training data is loaded and no training runs.
logger.info('Evaluating Phobert (seqlen: 256) for refernece ...')
pretrain_and_evaluate(training_args, roberta_base, roberta_base_tokenizer, eval_only=True, model_path=None)

INFO:__main__:Evaluating Phobert (seqlen: 256) for refernece ...
max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Directory (inside the trainer output dir) that receives the converted model and tokenizer.
model_path = f'{training_args.output_dir}/phobert-{model_args.max_pos}'
# exist_ok=True makes the cell re-runnable without a separate existence check.
os.makedirs(model_path, exist_ok=True)

logger.info(f'Converting phobert into phobert-{model_args.max_pos}')
model, tokenizer = create_long_model(
    save_model_to=model_path, attention_window=model_args.attention_window, max_pos=model_args.max_pos)

INFO:__main__:Converting phobert into phobert-1024
INFO:__main__:saving model to tmp/phobert-1024


In [None]:
# Reload the converted checkpoint from disk. Loading through RobertaLongForMaskedLM
# rebuilds every layer's self-attention as RobertaLongSelfAttention before the saved
# weights are applied.
logger.info(f'Loading the model from {model_path}')
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = RobertaLongForMaskedLM.from_pretrained(model_path)

INFO:__main__:Loading the model from tmp/phobert-1024


In [None]:
# Evaluate the converted model before any further pretraining (eval_only=True).
pretrain_and_evaluate(training_args, model, tokenizer, eval_only=True, model_path=None)

Creating features from dataset file at /content/drive/MyDrive/z_inf_data
Saving features into cached file /content/drive/MyDrive/z_inf_data/cached_lm_PhobertTokenizer_1022_word_seg_corpus_valid.txt [took 0.176 s]
max_steps is given, it will override any value given in num_train_epochs
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
***** Running Evaluation *****
  Num examples = 5572
  Batch size = 8


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


2.2595022196254506


In [None]:
logger.info(f'Pretraining phobert-{model_args.max_pos} ... ')

training_args.max_steps = 2000   ## <<<<<<<<<<<<<<<<<<<<<<<< REMOVE THIS <<<<<<<<<<<<<<<<<<<<<<<<

# Reuse `model_path` from the conversion cell instead of repeating the directory as an
# absolute /content/... literal, which only resolves when the working directory is /content.
pretrain_and_evaluate(training_args, model, tokenizer, eval_only=False, model_path=model_path)

INFO:__main__:Pretraining phobert-1024 ... 
INFO:__main__:Loading and tokenizing training data is usually slow: /content/drive/MyDrive/z_inf_data/word_seg_corpus_test.txt
max_steps is given, it will override any value given in num_train_epochs
Loading model from /content/tmp/phobert-1024.
***** Running training *****
  Num examples = 263268
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 32
  Total optimization steps = 2000
Didn't find an RNG file, if you are resuming a training that was launched in a distributed fashion, reproducibility is not guaranteed.


Step,Training Loss
500,1.4489
1000,1.3663
1500,1.3527
2000,1.3442


Saving model checkpoint to tmp/checkpoint-500
Configuration saved in tmp/checkpoint-500/config.json
Model weights saved in tmp/checkpoint-500/pytorch_model.bin
Saving model checkpoint to tmp/checkpoint-1000
Configuration saved in tmp/checkpoint-1000/config.json
Model weights saved in tmp/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to tmp/checkpoint-1500
Configuration saved in tmp/checkpoint-1500/config.json
Model weights saved in tmp/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to tmp/checkpoint-2000
Configuration saved in tmp/checkpoint-2000/config.json
Model weights saved in tmp/checkpoint-2000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to tmp
Configuration saved in tmp/config.json
Model weights saved in tmp/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 5572
  Batch size = 8


INFO:__main__:Eval bpc after pretraining: 1.8116716075469081


In [None]:
# After pretraining, overwrite each layer's global projections with copies of its local
# query/key/value projections, then save the model into model_path.
logger.info(f'Copying local projection layers into global projection layers ... ')
model = copy_proj_layers(model)
logger.info(f'Saving model to {model_path}')
model.save_pretrained(model_path)

INFO:__main__:Copying local projection layers into global projection layers ... 
INFO:__main__:Saving model to tmp/phobert-1024
Configuration saved in tmp/phobert-1024/config.json
Model weights saved in tmp/phobert-1024/pytorch_model.bin


In [None]:
# List the GPU(s) attached to this runtime.
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-20d1c5f5-9552-c9a7-b438-dddf73519b13)


In [None]:
import shutil
# Zip the saved model directory. `model_path` (set in the conversion cell) replaces the
# hard-coded absolute /content/... path, so the cell follows wherever the model was saved.
# The archive is written relative to the current working directory; its path is displayed.
shutil.make_archive('phobert-1024-2000steps', 'zip', model_path)

'/content/phobert-1024-2000steps.zip'

In [None]:
# Copy the archive to the mounted Drive folder so it outlives this runtime.
!cp /content/phobert-1024-2000steps.zip /content/drive/MyDrive/z_inf_data

In [11]:
import zipfile
# Unpack the archived model from Drive into /content/phobert.
# NOTE(review): the execution counts restart around here, so this part was presumably run
# in a later session after re-executing the definition cells -- confirm.
with zipfile.ZipFile('/content/drive/MyDrive/z_inf_data/phobert-1024-2000steps.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/phobert')

In [12]:
# Load tokenizer and long model from the directory the archive was extracted to.
tokenizer = AutoTokenizer.from_pretrained('/content/phobert')
model = RobertaLongForMaskedLM.from_pretrained('/content/phobert')

In [13]:
# Evaluation-only run on the restored model (eval_only=True, no training).
pretrain_and_evaluate(training_args, model, tokenizer, eval_only=True, model_path=None)

max_steps is given, it will override any value given in num_train_epochs
***** Running Evaluation *****
  Num examples = 5572
  Batch size = 8


INFO:__main__:Initial eval bpc: 1.8153857448749413
