<a href="https://colab.research.google.com/github/chw8207/Transformer/blob/main/RoBERTa_%EC%82%AC%EC%A0%84%ED%9B%88%EB%A0%A8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install tokenizers

In [None]:
pip install transformers[torch]

In [None]:
pip install deepspeed

In [63]:
import tensorflow as tf
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
import os
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
import torch
from transformers import RobertaConfig
from transformers import RobertaTokenizer
from transformers import RobertaForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import pipeline

#### HugginfFace 트랜스포머 설치

In [None]:
!pip install git+https://github.com/huggingface/transformers
!pip list | grep -E 'transformers|tokenizers'

### GPU 설정

In [32]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0' :
    raise SystemError('GPU device not found')
print(f'Found GPU at: {device_name}')

Found GPU at: /device:GPU:0


In [33]:
from tensorflow.python.client import device_lib

print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 17017443865037718643
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 14328594432
locality {
  bus_id: 1
  links {
  }
}
incarnation: 2330672869101771179
physical_device_desc: "device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5"
xla_global_id: 416903419
]


### KantaiBERT 모델
- 레이어 : 6개
- 헤드 : 12개
- 파라미터 : 84,095,008개

#### 토크나이저 훈련

In [34]:
%%time
paths = [str(x) for x in Path('.').glob("**/*.txt")]

# 토크나이저 초기화
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2,
                special_tokens=['<s>','<pad>','</s>','<unl>','<mask>'])

CPU times: user 9.41 s, sys: 415 ms, total: 9.83 s
Wall time: 2.79 s


#### 파일 저장하기

In [35]:
token_dir = '/content/KantaiBERT'
if not os.path.exists(token_dir) :
  os.makedirs(token_dir)
tokenizer.save_model('KantaiBERT')

['KantaiBERT/vocab.json', 'KantaiBERT/merges.txt']

#### 훈련된 토크나이저 파일 불러오기

In [36]:
tokenizer = ByteLevelBPETokenizer(
    '/content/KantaiBERT/vocab.json',
    '/content/KantaiBERT/merges.txt'
)

In [37]:
# 토크나이저의 시퀀스 인코딩
tokenizer.encode('The Critique of Pure Reason.').tokens

['The', 'ĠCritique', 'Ġof', 'ĠPure', 'ĠReason', '.']

In [38]:
# 시퀀스에서 토큰 수 확인
tokenizer.encode('The Critique of Pure Reason.')

Encoding(num_tokens=6, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [39]:
# 시작 및 종료 토큰 추가
tokenizer._tokenizer.post_processor = BertProcessing(
    ('</s>', tokenizer.token_to_id('</s>')),
    ('<s>', tokenizer.token_to_id('<s>'))
)
# 512개 이상의 토큰은 자르도록 설정함
tokenizer.enable_truncation(max_length=512)

In [40]:
tokenizer.encode('The Critique of Pure Reason.')

Encoding(num_tokens=8, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [41]:
tokenizer.encode('The Critique of Pure Reason.').tokens

['<s>', 'The', 'ĠCritique', 'Ġof', 'ĠPure', 'ĠReason', '.', '</s>']

#### 리소스 제약 확인 : GPU와 CUDA

In [42]:
!nvidia-smi

Wed Jul 12 07:25:51 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P0    28W /  70W |    881MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [43]:
# 파이토치 cuda인식 여부 확인
torch.cuda.is_available()

True

#### 모델 설정 정의

In [44]:
config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)

In [45]:
print(config)

RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.31.0.dev0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}



#### 사전 훈련된 토크나이저 다시 불러오기

In [46]:
tokenizer = RobertaTokenizer.from_pretrained('./KantaiBERT', max_length=512)

#### 백지 상태에서 모델 초기화

In [47]:
# 마스킹된 RoBERTa 모델 불러오기
model = RobertaForMaskedLM(config=config)

In [48]:
print(model)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

In [49]:
# 파라미터 크기 확인하기
print(model.num_parameters())

83504416


In [50]:
# 파라미터 길이
LP = list(model.parameters())
lp = len(LP)
print(lp)

106


In [51]:
# 파라미터 탐색
for p in range(0, lp) :
  print(LP[p])

Parameter containing:
tensor([[ 0.0117,  0.0076, -0.0146,  ...,  0.0159,  0.0276,  0.0109],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0038,  0.0183, -0.0244,  ...,  0.0083, -0.0395,  0.0164],
        ...,
        [ 0.0135,  0.0005, -0.0129,  ..., -0.0036, -0.0040, -0.0148],
        [-0.0185,  0.0163,  0.0085,  ...,  0.0070, -0.0110,  0.0111],
        [ 0.0122, -0.0067, -0.0118,  ..., -0.0227, -0.0178, -0.0027]],
       requires_grad=True)
Parameter containing:
tensor([[-0.0026, -0.0018,  0.0232,  ..., -0.0010,  0.0146,  0.0002],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0090,  0.0083,  0.0043,  ..., -0.0055,  0.0146,  0.0353],
        ...,
        [ 0.0205,  0.0188, -0.0068,  ..., -0.0318,  0.0355, -0.0084],
        [-0.0160,  0.0290, -0.0138,  ..., -0.0325,  0.0103,  0.0258],
        [ 0.0208,  0.0254, -0.0181,  ..., -0.0267,  0.0235,  0.0002]],
       requires_grad=True)
Parameter containing:
tensor([[ 1.

In [52]:
# 각 텐서의 파라미터 수 계산
np = 0
for p in range(0,lp) : # 텐서의 수
  PL2 = True
  try :
    L2 = len(LP[p][0])  # 2차원인지 확인
  except :
    L2 = 1              # 2차원이 아니고 1차원임
    PL2 = False
  L1 = len(LP[p])       # L1 : 첫 번째 차원의 크기
  L3 = L1*L2
  np += L3

  # 트랜스포머 모델의 파라미터 수가 정확히 어떻게 계산되는지 살펴보기
  if PL2 == True :
    print(p, L1, L2, L3)
  if PL2 == False :
    print(p, L1, L3)

print(np)


0 52000 768 39936000
1 514 768 394752
2 1 768 768
3 768 768
4 768 768
5 768 768 589824
6 768 768
7 768 768 589824
8 768 768
9 768 768 589824
10 768 768
11 768 768 589824
12 768 768
13 768 768
14 768 768
15 3072 768 2359296
16 3072 3072
17 768 3072 2359296
18 768 768
19 768 768
20 768 768
21 768 768 589824
22 768 768
23 768 768 589824
24 768 768
25 768 768 589824
26 768 768
27 768 768 589824
28 768 768
29 768 768
30 768 768
31 3072 768 2359296
32 3072 3072
33 768 3072 2359296
34 768 768
35 768 768
36 768 768
37 768 768 589824
38 768 768
39 768 768 589824
40 768 768
41 768 768 589824
42 768 768
43 768 768 589824
44 768 768
45 768 768
46 768 768
47 3072 768 2359296
48 3072 3072
49 768 3072 2359296
50 768 768
51 768 768
52 768 768
53 768 768 589824
54 768 768
55 768 768 589824
56 768 768
57 768 768 589824
58 768 768
59 768 768 589824
60 768 768
61 768 768
62 768 768
63 3072 768 2359296
64 3072 3072
65 768 3072 2359296
66 768 768
67 768 768
68 768 768
69 768 768 589824
70 768 768
71 768 768

#### 데이터셋 구축

In [53]:
%%time
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path='/content/drive/MyDrive/Colab Notebooks/트랜스포머/Trasnformer/Transformer/data/kant.txt',
    block_size=128
)



CPU times: user 26.1 s, sys: 358 ms, total: 26.4 s
Wall time: 26.6 s


#### 데이터 collator 정의하기

In [54]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0/15
)

#### 트레이너 초기화

In [60]:
training_args = TrainingArguments(
    output_dir="./KantaiBERT",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

#### 모델 사전훈련

In [61]:
%%time
trainer.train()



Step,Training Loss
500,0.0
1000,0.0
1500,0.0
2000,0.0
2500,0.0


CPU times: user 9min 16s, sys: 1.89 s, total: 9min 18s
Wall time: 9min 18s


TrainOutput(global_step=2672, training_loss=0.0, metrics={'train_runtime': 557.6034, 'train_samples_per_second': 306.605, 'train_steps_per_second': 4.792, 'total_flos': 873939262999296.0, 'train_loss': 0.0, 'epoch': 1.0})

#### 최종 모델(+토크나이저+config)저장하기

In [62]:
trainer.save_model('./KantaiBERT')

#### FillMaskPipeline에 의한 모델링

In [64]:
fill_mask = pipeline(
    'fill-mask',
    model='./KantaiBERT',
    tokenizer='./KantaiBERT'
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [65]:
fill_mask('Human thinking involves human <mask>.')

[{'score': 0.00016281074204016477,
  'token': 37910,
  'token_str': '',
  'sequence': 'Human thinking involves human.'},
 {'score': 0.00013703538570553064,
  'token': 40755,
  'token_str': '',
  'sequence': 'Human thinking involves human.'},
 {'score': 0.00013351008237805218,
  'token': 19132,
  'token_str': ' QUANTITY',
  'sequence': 'Human thinking involves human QUANTITY.'},
 {'score': 0.00012756860814988613,
  'token': 8107,
  'token_str': ' hands',
  'sequence': 'Human thinking involves human hands.'},
 {'score': 0.00012513381079770625,
  'token': 14348,
  'token_str': 'opposites',
  'sequence': 'Human thinking involves humanopposites.'}]