In [None]:
# Install the notebook's third-party dependencies (Hugging Face Transformers and asian-bart).
%pip install transformers asian-bart

In [None]:
import os
# Only mount Google Drive when /content does not already list a "drive" entry.
has_drive_entry = "drive" in os.listdir("/content")
if not has_drive_entry:
    from google.colab import drive
    drive.mount('/content/drive')

# Work from the project folder so the relative "./Model/..." paths used later resolve.
os.chdir("/content/drive/MyDrive/NLP_Project_3")

In [None]:
import torch
import time
import gc

from transformers import AutoTokenizer, AutoConfig
from asian_bart import AsianBartForConditionalGeneration

# Use the first CUDA device when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

# Make the selected GPU the current CUDA device.
if DEVICE.type == "cuda":
    torch.cuda.set_device(DEVICE)

# Tokenizer from the hub checkpoint, configured with ko_KR as source and en_XX as target.
tokenizer = AutoTokenizer.from_pretrained(
    "hyunwoongko/asian-bart-ecjk",
    src_text="ko_KR",
    tgt_text="en_XX",
)

# Full-precision model loaded from the local ./Model/large_batch_kor2eng directory.
model = AsianBartForConditionalGeneration.from_pretrained("./Model/large_batch_kor2eng")

# Dynamically quantize only the torch.nn.Linear modules to int8.
# `model` itself is still used by later cells as the comparison baseline.
quantized_model = torch.quantization.quantize_dynamic(
    model,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)

In [None]:
def print_size_of_model(model):
    """Print the on-disk size, in megabytes, of ``model``'s serialized state dict.

    The state dict is written with ``torch.save`` to a scratch file, measured
    with ``os.path.getsize`` and reported as ``bytes / 1e6``.

    The scratch file lives in a private temporary directory instead of the
    current working directory, so an unrelated ``temp.p`` in the working
    directory is never overwritten or deleted, and the scratch file is cleaned
    up even when saving or measuring raises.

    Args:
        model: Object exposing ``state_dict()`` whose result ``torch.save`` can
            serialize (for example a ``torch.nn.Module``).

    Returns:
        None. The size is printed as ``Size (MB): <value>``.
    """
    # Imported locally so the function does not depend on the notebook's import cell.
    import tempfile

    with tempfile.TemporaryDirectory() as scratch_dir:
        checkpoint_path = os.path.join(scratch_dir, "temp.p")
        torch.save(model.state_dict(), checkpoint_path)
        print('Size (MB):', os.path.getsize(checkpoint_path)/1e6)

# Report the serialized state-dict size of the original model, then of its
# dynamically quantized counterpart, so the two figures can be compared.
print_size_of_model(model)
print_size_of_model(quantized_model)

In [None]:
# Persist the quantized model in two parts that later cells read back:
#   - the model configuration, written via save_pretrained to ./Model/quantized_kor2eng_config
#   - the quantized state dict, written via torch.save to the file ./Model/quantized_kor2eng
quantized_model.config.save_pretrained("./Model/quantized_kor2eng_config")
torch.save(quantized_model.state_dict(), "./Model/quantized_kor2eng")

In [None]:
# Rebuild everything needed for inference from the saved artifacts.
# The tokenizer is re-created with the same arguments as in the setup cell.
tokenizer = AutoTokenizer.from_pretrained("hyunwoongko/asian-bart-ecjk", src_text = "ko_KR", tgt_text = "en_XX")
# Read back the configuration saved next to the quantized weights.
config = AutoConfig.from_pretrained("./Model/quantized_kor2eng_config")
# Instantiate the architecture from the config alone (no from_pretrained), so no
# trained weights are loaded here; presumably the parameters are freshly
# initialized. The saved quantized state dict is loaded in the next cell.
dummy_model = AsianBartForConditionalGeneration(config)

In [None]:
# Apply the same dynamic quantization (int8, torch.nn.Linear only) to the skeleton
# model so its module layout matches the saved quantized state dict.
reconstructed_quantized_model = torch.quantization.quantize_dynamic(
    dummy_model,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)

# The skeleton model and its config are no longer referenced below; drop them.
del config, dummy_model
gc.collect()

# Fill the quantized skeleton with the saved weights. This stays the last
# expression of the cell so the key-matching report is displayed.
reconstructed_quantized_model.load_state_dict(
    torch.load("./Model/quantized_kor2eng")
)

<All keys matched successfully>

In [None]:
def time_model_evaluation(model, tokenizer, device, sentences=None, max_length=100):
    """Translate a small batch of sentences and print the outputs and elapsed time.

    The timed region covers tokenization, moving the input ids to ``device``
    and ``model.generate``; decoding and printing are excluded.

    Args:
        model: Object exposing ``generate(input_ids, max_length=...)``. The
            caller is responsible for placing it on ``device`` beforehand.
        tokenizer: Callable tokenizer whose result has an ``input_ids`` tensor
            and which provides ``batch_decode``.
        device: Target device for the input ids (``torch.device`` or a device
            string such as ``"cpu"``).
        sentences: Optional string or iterable of strings to translate.
            Defaults to the four built-in Korean sample sentences.
        max_length: Upper bound used both for truncating the tokenized input
            and for the generated sequence length. Defaults to 100.

    Returns:
        None. The decoded texts and the duration are printed.

    Raises:
        ValueError: If ``sentences`` is given but contains no items.
    """
    if sentences is None:
        sentences = ["안녕하세요.", "만나서 반갑습니다.", "속도 시험 중 입니다.", "자료가 많아질 수록 속도 차이가 많이 나겠죠?"]
    elif isinstance(sentences, str):
        # A bare string would otherwise be split into single characters by list().
        sentences = [sentences]
    else:
        sentences = list(sentences)
    if not sentences:
        raise ValueError("sentences must contain at least one string")

    target_device = torch.device(device)
    on_cuda = target_device.type == "cuda"

    # CUDA kernels are launched asynchronously: drain pending work before starting
    # the clock and again before stopping it so the interval reflects finished work.
    if on_cuda:
        torch.cuda.synchronize(target_device)
    # perf_counter is monotonic and intended for measuring short intervals.
    eval_start_time = time.perf_counter()
    # truncation=True makes max_length effective; without a truncation strategy
    # the tokenizer warns that max_length is ignored when padding=True.
    inputs = tokenizer(
        sentences,
        return_tensors = "pt",
        padding = True,
        truncation = True,
        max_length = max_length,
    ).input_ids.to(device)
    result = model.generate(inputs, max_length = max_length)
    if on_cuda:
        torch.cuda.synchronize(target_device)
    eval_end_time = time.perf_counter()
    eval_duration_time = eval_end_time - eval_start_time

    print(tokenizer.batch_decode(result, skip_special_tokens = True, clean_up_tokenization_spaces = True))
    print("Evaluate total time (seconds): {0:.1f}".format(eval_duration_time))



In [None]:
# Baseline inference with the original (non-quantized) model on DEVICE, which is
# the GPU when CUDA is available and the CPU otherwise.
# model.to(DEVICE) moves the module's parameters, so `model` stays on DEVICE afterwards.
time_model_evaluation(model.to(DEVICE), tokenizer, DEVICE)

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


['▁Hello.', "▁It's ▁nice ▁to ▁meet ▁you.", "▁I'm ▁taking ▁a ▁speed ▁test.", '▁The ▁more ▁material, ▁the ▁higher ▁the ▁speed ▁difference, ▁right?']
Evaluate total time (seconds): 0.3


In [None]:
# Evaluate the dynamically quantized INT8 model on the CPU.
# (The original note called it a BERT model; the model rebuilt above is an
# AsianBartForConditionalGeneration.)
time_model_evaluation(reconstructed_quantized_model, tokenizer, "cpu")

  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


['▁Hello.', "▁It's ▁nice ▁to ▁meet ▁you.", "▁I'm ▁taking ▁a ▁speed ▁test.", '▁The ▁more, ▁and ▁the ▁s hor ter ▁the ▁speed ▁difference ▁will ▁be, ▁right?']
Evaluate total time (seconds): 1.4
