# Better Transformer
-   torchtext로 프로덕션 추론을 위해 Better Transformer를 사용하는 방법
-   PyTorch 코어 nn.Module을 직접 기반으로 하는 모델이나 torchtext를 사용하는 모델에서 작동
-   Better Transformer의 장점
        - CPU 및 GPU에 대한 네이티브 멀티헤드 어텐션 구현
        - 희소성 활용: 가변 길이 입력에는 패딩 토큰이 많이 포함될 수 있는데, 이 패딩 토큰의 처리를 건너뛰어 속도가 크게 향상
        ex) https://colab.research.google.com/drive/1KZnMJYhYkOMYtNIX5S3AGIYnjyG0AojN?usp=sharing

# 설정
## 1.1 사전 훈련된 모델 불러오기

In [1]:
# torchtext 모델에서 XLM-R 모델을 다운로드
import torch
import torch.nn as nn

# Report the installed PyTorch build so the benchmark output can be tied to it.
print(f"torch version: {torch.__version__}")

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"torch cuda available: {torch.cuda.is_available()}")

import torch, torchtext
from torchtext.models import RobertaClassificationHead
from torchtext.functional import to_tensor
# Pre-trained XLM-R large encoder bundle provided by torchtext.
xlmr_large = torchtext.models.XLMR_LARGE_ENCODER

# Two-class classification head placed on top of the encoder output.
classifier_head = RobertaClassificationHead(num_classes=2, input_dim=1024)

# Build the full model (encoder + head) and the matching text transform.
model = xlmr_large.get_model(head=classifier_head)
transform = xlmr_large.transform()

  from .autonotebook import tqdm as notebook_tqdm


torch version: 1.13.1+cu116
torch cuda available: True


Downloading: "https://download.pytorch.org/models/text/xlmr.large.encoder.pt" to C:\Users\kimju/.cache\torch\hub\checkpoints\xlmr.large.encoder.pt
100%|██████████| 2.08G/2.08G [03:14<00:00, 11.5MB/s] 
100%|██████████| 5.07M/5.07M [00:01<00:00, 3.58MB/s]
Downloading: "https://download.pytorch.org/models/text/xlmr.vocab.pt" to C:\Users\kimju/.cache\torch\hub\checkpoints\xlmr.vocab.pt
100%|██████████| 4.85M/4.85M [00:00<00:00, 11.3MB/s]


## 1.2 데이터세트 설정
-   작은 입력 배치와 희소성 있는 큰 입력 배치의 두 가지 유형의 입력

In [2]:
# Small batch: two short sentences.
small_input_batch = ["Hello world", "How are you!"]

# Large batch: the same two short sentences plus one long passage, so the
# lengths within the batch differ widely (the "sparse" input case of this
# notebook).
big_input_batch = [
    *small_input_batch,
    """`Well, Prince, so Genoa and Lucca are now just family estates of the
Buonapartes. But I warn you, if you don't tell me that this means war,
if you still try to defend the infamies and horrors perpetrated by
that Antichrist- I really believe he is Antichrist- I will have
nothing more to do with you and you are no longer my friend, no longer
my 'faithful slave,' as you call yourself! But how do you do? I see
I have frightened you- sit down and tell me all the news.`

It was in July, 1805, and the speaker was the well-known Anna
Pavlovna Scherer, maid of honor and favorite of the Empress Marya
Fedorovna. With these words she greeted Prince Vasili Kuragin, a man
of high rank and importance, who was the first to arrive at her
reception. Anna Pavlovna had had a cough for some days. She was, as
she said, suffering from la grippe; grippe being then a new word in
St. Petersburg, used only by the elite.""",
]

In [3]:
# Preprocess the batch and run one forward pass as a smoke test of the model.
input_batch = big_input_batch

# Apply the text transform, then convert the result to a single tensor.
# padding_value=1 is presumably the XLM-R padding token id -- TODO confirm.
token_ids = transform(input_batch)
model_input = to_tensor(token_ids, padding_value=1)

output = model(model_input)
# Last expression of the cell: displays the shape of the model output.
output.shape

torch.Size([3, 2])

In [4]:
# Number of forward passes executed inside each profiled benchmark run.
ITERATIONS = 10

# 2. 실행
## 2.1 벤치마크 추론

In [5]:
# Baseline run: profile ITERATIONS forward passes with the model in whatever
# mode earlier cells left it and with autograd enabled.
print("slow path:", "==========", sep="\n")
with torch.autograd.profiler.profile(use_cuda=False) as prof:
    for i in range(ITERATIONS):
        output = model(model_input)
print(prof)

# Put the model in evaluation mode; together with torch.no_grad() below this
# is the configuration the notebook labels the "fast path".
model.eval()

print("fast path:", "==========", sep="\n")
with torch.autograd.profiler.profile(use_cuda=False) as prof, torch.no_grad():
    for i in range(ITERATIONS):
        output = model(model_input)
print(prof)

slow path:
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                   aten::eq         0.00%      29.000us         0.00%      29.000us      29.000us             1  
            aten::embedding         0.00%      23.000us         0.00%     289.000us     289.000us             1  
              aten::reshape         0.00%      10.000us         0.00%      11.000us      11.000us             1  
       aten::_reshape_alias         0.00%       1.000us         0.00%       1.000us       1.000us             1  
         aten::index_select         0.00%     242.000us         0.00%     253.000us     253.000us             1  
                aten::empty         0.00%       2.000us         0.00%       2

  output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)


-------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                   aten::eq         0.00%      25.000us         0.00%      25.000us      25.000us             1  
                            aten::embedding         0.00%       9.000us         0.00%     206.000us     206.000us             1  
                              aten::reshape         0.00%       5.000us         0.00%       7.000us       7.000us             1  
                       aten::_reshape_alias         0.00%       2.000us         0.00%       2.000us       2.000us             1  
                         aten::index_select         0.00%     184.000us         0.00%     

## 2.2 (구성 가능한) DEVICE에서 BT 빠른 경로(네이티브 MHA만 해당)를 사용할 때와 사용하지 않을 때의 추론 실행 및 벤치마크

In [6]:
# Inspect the Better Transformer sparsity setting (nested-tensor flag) of the
# model's transformer layers; as the last expression of the cell, its value is
# displayed.
model.encoder.transformer.layers.enable_nested_tensor

True

In [7]:
# Turn off the Better Transformer sparsity setting (nested-tensor flag) so the
# following benchmark runs without it.
model.encoder.transformer.layers.enable_nested_tensor = False

In [8]:
# Move the model and the input batch to the selected device before profiling.
model.to(DEVICE)
model_input = model_input.to(DEVICE)

# Baseline run: profile ITERATIONS forward passes with autograd enabled.
# NOTE(review): use_cuda=True is hard-coded although DEVICE can fall back to
# the CPU -- confirm this is intended on machines without CUDA.
print("slow path:", "==========", sep="\n")
with torch.autograd.profiler.profile(use_cuda=True) as prof:
    for i in range(ITERATIONS):
        output = model(model_input)
print(prof)

# Put the model in evaluation mode; together with torch.no_grad() below this
# is the configuration the notebook labels the "fast path".
model.eval()

print("fast path:", "==========", sep="\n")
with torch.autograd.profiler.profile(use_cuda=True) as prof, torch.no_grad():
    for i in range(ITERATIONS):
        output = model(model_input)
print(prof)

slow path:
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                   aten::eq         0.01%     100.000us         0.01%     100.000us     100.000us       7.000us         0.00%       7.000us       7.000us             1  
            aten::embedding         0.00%      28.000us         0.10%     849.000us     849.000us       9.000us         0.00%       1.225ms       1.225ms             1  
              aten::reshape         0.00%       6.000us         0.00%      10.000us      10.000us       4.000us         0.00%       7.000u

## 2.3 (구성 가능한) DEVICE에서 BT 빠른 경로(네이티브 MHA + 희소성)를 사용할 때와 사용하지 않을 때의 추론 실행 및 벤치마크

In [9]:
# Turn the Better Transformer sparsity setting (nested-tensor flag) back on so
# the following benchmark runs with it.
model.encoder.transformer.layers.enable_nested_tensor = True

In [10]:
# Make sure the model and the input batch are on the selected device.
model.to(DEVICE)
model_input = model_input.to(DEVICE)

# Baseline run: profile ITERATIONS forward passes with autograd enabled.
# NOTE(review): use_cuda=True is hard-coded although DEVICE can fall back to
# the CPU -- confirm this is intended on machines without CUDA.
print("slow path:", "==========", sep="\n")
with torch.autograd.profiler.profile(use_cuda=True) as prof:
    for i in range(ITERATIONS):
        output = model(model_input)
print(prof)

# Put the model in evaluation mode; together with torch.no_grad() below this
# is the configuration the notebook labels the "fast path".
model.eval()

print("fast path:", "==========", sep="\n")
with torch.autograd.profiler.profile(use_cuda=True) as prof, torch.no_grad():
    for i in range(ITERATIONS):
        output = model(model_input)
print(prof)

slow path:
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                   aten::eq         0.03%      62.000us         0.03%      62.000us      62.000us       6.000us         0.00%       6.000us       6.000us             1  
            aten::embedding         0.01%      24.000us         0.03%      62.000us      62.000us       8.000us         0.00%      39.000us      39.000us             1  
              aten::reshape         0.00%       7.000us         0.00%       9.000us       9.000us       6.000us         0.00%       8.000u