---
### MLM Pretraining - 나만의 tokenizer로

In [1]:
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from pathlib import Path

# Train a byte-level BPE tokenizer from scratch (input is lower-cased before tokenization).
tokenizer = ByteLevelBPETokenizer(lowercase=True)

# NOTE(review): the file is handed to the trainer as raw text, i.e. whole TSV lines
# rather than only the `document` column that is read from this same file later.
# Confirm that learning merges from the non-review fields is acceptable.
tokenizer.train(
    files='./nsmc/ratings_train.txt',
    vocab_size=1000,
    min_frequency=2,
    show_progress=True,
    # RoBERTa-style special tokens, registered in this order.
    special_tokens=["<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>"],
)

# save_model expects the target directory to already exist, so create it first
# (a fresh checkout / fresh kernel has no 'scratch' folder yet).
Path('scratch').mkdir(parents=True, exist_ok=True)
tokenizer.save_model('scratch')

import pandas as pd
# Read the NSMC training file with pandas and keep the non-null review texts as a plain list.
train = (
    pd.read_csv("nsmc/ratings_train.txt", sep='\t')
    .document
    .dropna()
    .values
    .tolist()
)

In [2]:
from transformers import RobertaTokenizer

### Re-load the tokenizer files saved under ./scratch in the RoBERTa tokenizer format.
scratch_tokenizer = RobertaTokenizer.from_pretrained('./scratch')

file ./scratch\config.json not found


In [3]:
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM

# Hyper-parameters for a deliberately small RoBERTa model.
roberta_settings = dict(
    vocab_size=1000,
    # Important: this fixes the size of the position-embedding table, so keep it large.
    max_position_embeddings=1024,
    num_attention_heads=4,
    num_hidden_layers=1,
    type_vocab_size=1,
)
config = RobertaConfig(**roberta_settings)

# Build the masked-LM model from the configuration alone (no pretrained weights are loaded).
model = RobertaForMaskedLM(config=config)
print('Num parameters: ',model.num_parameters())

Num parameters:  9237736


In [4]:
import torch
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one raw text per item, on access.

    Args:
        texts: Sequence of raw strings. When omitted, the notebook-level
            ``train`` object is used.
        tokenizer: Callable applied to a single text. When omitted, the
            notebook-level ``scratch_tokenizer`` is used.
        max_length: Length every item is padded / truncated to.

    The data and tokenizer are bound once, at construction time. Re-binding
    the notebook-level names afterwards (for example assigning a DataFrame to
    ``train``) therefore no longer changes or breaks an existing dataset.
    """

    def __init__(self, texts=None, tokenizer=None, max_length=20):
        # Fall back to the notebook globals only when nothing is passed, so the
        # original zero-argument call `CustomDataset()` keeps working.
        self.texts = train if texts is None else texts
        self.tokenizer = scratch_tokenizer if tokenizer is None else tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, i):
        # Every item is padded/truncated to `max_length` right here, not at batch level.
        # NOTE(review): with return_tensors='pt' each item presumably keeps a leading
        # batch axis of size 1 -- confirm before handing these items to a collator.
        return self.tokenizer(
            self.texts[i],
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors='pt',
        )

      
# Create the training dataset (no evaluation dataset is built in this cell)
train_dataset = CustomDataset()

In [5]:
class CustomDataset2(torch.utils.data.Dataset):
    """Dataset that tokenizes every text up front and stores only its ``input_ids``.

    Args:
        df: Object exposing ``.values`` (e.g. a pandas column) that yields raw texts.
        tokenizer: Object with an ``encode_plus`` method whose result has ``input_ids``
            (the RobertaTokenizer from `transformers` can be passed directly).
    """

    def __init__(self, df, tokenizer):
        # encode_plus is used here; the original author notes plain `encode` would also do.
        self.examples = [tokenizer.encode_plus(text).input_ids for text in df.values]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        # No padding is applied here; that is left to batch-level collation.
        ids = self.examples[i]
        return torch.tensor(ids)

# Keep the DataFrame under its own name so the list of review texts bound to
# `train` earlier in the notebook is not overwritten by a DataFrame.
train_df = pd.read_csv("nsmc/ratings_train.txt", sep='\t')
# Only rows with a non-null `document` are tokenized.
train_dataset2 = CustomDataset2(train_df[~train_df.document.isnull()]['document'], scratch_tokenizer)

In [6]:
### NOTE (author's observation): DataCollatorForLanguageModeling should not be fed inputs that already carry an attention mask.
### Reportedly it also fails on inputs that have already been padded -- TODO confirm against the collator documentation.

from transformers import DataCollatorForLanguageModeling

# Define the data collator: masked-LM mode with a masking probability of 0.15
data_collator = DataCollatorForLanguageModeling(tokenizer=scratch_tokenizer, mlm=True, mlm_probability=0.15)

In [8]:
### TensorBoard logs show up because, once output_dir is set, TrainingArguments defaults logging_dir to a `runs` folder inside it.
### No model checkpoint was left behind earlier because training was stopped part-way through.


from transformers import Trainer, TrainingArguments
# Define the training arguments

### Each save creates a checkpoint folder holding optimizer.pt, config.json, pytorch_model.bin and trainer_state.json.

training_args = TrainingArguments(
    output_dir='./final',
    overwrite_output_dir=True,
    # The Trainer below receives no eval_dataset, so evaluation has to stay off:
    # with 'epoch' the run would fail at the end of the epoch, when evaluation is attempted.
    evaluation_strategy='no',
    num_train_epochs=1,
    learning_rate=1e-3,
    weight_decay=0.1,
    per_device_train_batch_size=32,
    # NOTE(review): a checkpoint every 2 optimization steps is very frequent -- confirm this is intended.
    save_steps=2,
    #eval_steps=4096,
    save_total_limit=10,  ### maximum number of checkpoint folders to keep; the author warns against setting this to 1
)
# Create the trainer for our model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    # train_dataset2 is used because its items contain input_ids only.
    train_dataset=train_dataset2,
)
# Train the model
trainer.train()


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 149995
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 4688
Saving model checkpoint to ./final\checkpoint-2
Configuration saved in ./final\checkpoint-2\config.json
Model weights saved in ./final\checkpoint-2\pytorch_model.bin
Saving model checkpoint to ./final\checkpoint-4
Configuration saved in ./final\checkpoint-4\config.json
Model weights saved in ./final\checkpoint-4\pytorch_model.bin
Saving model checkpoint to ./final\checkpoint-6
Configuration saved in ./final\checkpoint-6\config.json
Model weights s

KeyboardInterrupt: 

In [25]:
trainer.save_model('final/') ## saving through the trainer like this writes config.json and pytorch_model.bin into the target folder
trainer.save_state() ## saves the trainer state; it takes no arguments

Saving model checkpoint to final/
Configuration saved in final/config.json
Model weights saved in final/pytorch_model.bin


In [9]:
# Load the checkpoint from its directory: from_pretrained resolves config.json and
# pytorch_model.bin from the same folder, so the two cannot drift apart.
# NOTE(review): checkpoint-46 is hard-coded; with save_total_limit=10 only recent
# checkpoint folders are kept, so this folder may not exist after a full training run.
load_model = RobertaForMaskedLM.from_pretrained('final/checkpoint-46')

loading configuration file final/checkpoint-46/config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 1024,
  "model_type": "roberta",
  "num_attention_heads": 4,
  "num_hidden_layers": 1,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.12.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 1000
}

loading weights file final/checkpoint-46/pytorch_model.bin
All model checkpoint weights were used when initializing RobertaForMaskedLM.

All the weights of RobertaForMaskedLM were initialized from the model checkpoint at final/checkpoint-46/pytorch_model.bin.
If your task is si

In [10]:
# Load the public `roberta-base` masked-LM checkpoint -- presumably as a reference
# to compare against the scratch model (the name suggests a sanity check; confirm).
for_check = RobertaForMaskedLM.from_pretrained('roberta-base')

loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at C:\Users\chkim/.cache\huggingface\transformers\733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

https://huggingface.co/roberta-base/resolve/main/pytorch_model.bin not found in cach

KeyboardInterrupt: 

In [11]:
### RoBERTa has no next-sentence-prediction task, and this model is configured with type_vocab_size=1,
### so 0 is the only valid token_type_id: the 1 in the last position below is expected to raise an IndexError.
### The model signature still accepts token_type_ids, so the argument itself can be passed.
demo_inputs = {
    'input_ids': torch.tensor([[  0, 285, 307, 232, 248, 291, 964,  35, 225,   2]]),
    'token_type_ids': torch.tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]),
    'attention_mask': torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
}
try:
    load_model(**demo_inputs)
except IndexError as err:
    # The failure is the point of this cell: display it without halting a Restart & Run All.
    print(f'IndexError: {err}')

IndexError: index out of range in self


[A

In [13]:
from transformers import pipeline
# Create a fill-mask pipeline from a saved model directory and the scratch tokenizer.
# NOTE(review): 'modelling/' is not written by any cell visible in this notebook
# (the trainer saves to 'final/') -- confirm the path before re-running.
fill_mask = pipeline(
    "fill-mask",
    model='modelling/',
    tokenizer=scratch_tokenizer
)
# Try one example: ask the model for candidates at the <mask> position.
fill_mask("I <mask> am a dog")

loading configuration file modelling/config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 1024,
  "model_type": "roberta",
  "num_attention_heads": 4,
  "num_hidden_layers": 1,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.12.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 1000
}

loading configuration file modelling/config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob":

[{'sequence': 'I이 am a dog',
  'score': 0.024812832474708557,
  'token': 269,
  'token_str': '이'},
 {'sequence': 'I다 am a dog',
  'score': 0.022476496174931526,
  'token': 270,
  'token_str': '다'},
 {'sequence': 'I  am a dog',
  'score': 0.016187183558940887,
  'token': 225,
  'token_str': ' '},
 {'sequence': 'I� am a dog',
  'score': 0.015984654426574707,
  'token': 231,
  'token_str': '�'},
 {'sequence': 'I. am a dog',
  'score': 0.01597427949309349,
  'token': 18,
  'token_str': '.'}]