In [3]:
import os
# Process-wide settings, applied in the first cell (ahead of the cell that imports torch / transformers).
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",  # Arrange GPU devices starting from 0
    "CUDA_VISIBLE_DEVICES": "0",        # Set the GPUs to use
})

In [4]:
import os
import re
import random
import warnings
from tqdm import tqdm

import numpy as np
import pandas as pd
import torch

from transformers import (
    AutoConfig, AutoTokenizer, 
    AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, 
    AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
)

from peft import PeftModel, PeftConfig

from datasets import load_from_disk

from src.metrics import rouge_for_batch, f1_score_at_k_for_batch, jaccard_similarity_for_batch

os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings('ignore')


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /workspace/news-topic-keyphrase-generation-model-dev/.venv/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda116.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 116
CUDA SETUP: Loading binary /workspace/news-topic-keyphrase-generation-model-dev/.venv/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda116.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


In [5]:
# Hardware visible to this kernel: number of CUDA devices and CPU count reported by the OS.
NGPU, NCPU = torch.cuda.device_count(), os.cpu_count()
(NGPU, NCPU)

(1, 64)

In [6]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Paths and Names

In [7]:
# Locations of the train / eval dataset splits and of the fine-tuned PEFT checkpoint.

TRAIN_DATA_PATH, EVAL_DATA_PATH = (
    'data/model_dev/model_dev_v4_polyglot_1.3b_train.hf',
    'data/model_dev/model_dev_v4_polyglot_1.3b_eval.hf',
)

PEFT_MODEL_ID = '.log/eleutherai_polyglot_ko_1.3b_v4_run_13'

In [8]:
# Precision passed as `torch_dtype` to both model-loading calls below.
torch_dtype = torch.float16

# Load PEFT Model & Tokenizer

In [9]:
# Load the PEFT configuration from PEFT_MODEL_ID.
# NOTE(review): `peft_config` is not referenced again in the visible cells — confirm it is still needed.
peft_config = PeftConfig.from_pretrained(PEFT_MODEL_ID)

In [10]:
# Load the causal LM from the checkpoint directory with 8-bit loading enabled and
# `device_map='auto'`.
# NOTE(review): the base weights are read from PEFT_MODEL_ID itself rather than from
# `peft_config.base_model_name_or_path`; the load warning recorded below lists the LoRA
# tensors in that checkpoint as unused at this step — confirm this is intended.
model = AutoModelForCausalLM.from_pretrained(PEFT_MODEL_ID, torch_dtype=torch_dtype, load_in_8bit=True, device_map='auto')
# Wrap the loaded model with the adapter stored under the same path.
model = PeftModel.from_pretrained(model, PEFT_MODEL_ID, torch_dtype=torch_dtype)
# Switch to evaluation mode; as the cell's last expression this also displays the module tree.
model.eval()

Some weights of the model checkpoint at .log/eleutherai_polyglot_ko_1.3b_v4_run_13 were not used when initializing GPTNeoXForCausalLM: ['gpt_neox.layers.13.attention.query_key_value.lora_B.weight', 'gpt_neox.layers.9.attention.query_key_value.lora_B.weight', 'gpt_neox.layers.7.attention.query_key_value.lora_B.weight', 'gpt_neox.layers.10.attention.query_key_value.lora_A.weight', 'gpt_neox.layers.3.attention.query_key_value.lora_B.weight', 'gpt_neox.layers.8.attention.query_key_value.lora_B.weight', 'gpt_neox.layers.16.attention.query_key_value.lora_A.weight', 'gpt_neox.layers.4.attention.query_key_value.lora_B.weight', 'gpt_neox.layers.1.attention.query_key_value.lora_B.weight', 'gpt_neox.layers.19.attention.query_key_value.lora_B.weight', 'gpt_neox.layers.23.attention.query_key_value.lora_A.weight', 'gpt_neox.layers.14.attention.query_key_value.lora_B.weight', 'gpt_neox.layers.0.attention.query_key_value.lora_A.weight', 'gpt_neox.layers.3.attention.query_key_value.lora_A.weight', 'gpt

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPTNeoXForCausalLM(
      (gpt_neox): GPTNeoXModel(
        (embed_in): Embedding(30005, 2048)
        (layers): ModuleList(
          (0): GPTNeoXLayer(
            (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
            (attention): GPTNeoXAttention(
              (rotary_emb): RotaryEmbedding()
              (query_key_value): MergedLinear8bitLt(
                in_features=2048, out_features=6144, bias=True
                (lora_dropout): Dropout(p=0.1, inplace=False)
                (lora_A): Linear(in_features=2048, out_features=16, bias=False)
                (lora_B): Conv1d(16, 4096, kernel_size=(1,), stride=(1,), groups=2, bias=False)
              )
              (dense): Linear8bitLt(in_features=2048, out_features=2048, bias=True)
            )
            (mlp): GPTNeoXMLP(
              (

In [11]:
# Load the tokenizer from the same PEFT_MODEL_ID path the model was loaded from.
tokenizer = AutoTokenizer.from_pretrained(PEFT_MODEL_ID)

In [12]:
# Size the embedding matrix to the tokenizer's vocabulary (the recorded output shows 30005 rows).
model.resize_token_embeddings(len(tokenizer))

Embedding(30005, 2048)

In [13]:
# Sanity check: model dtype and the `inference_mode` flag of the attached PEFT config.
model.dtype, model.peft_config.inference_mode

(torch.float16, True)

# Load Data

In [14]:
# Load the evaluation split saved at EVAL_DATA_PATH and show its columns and row count.
eval_dataset = load_from_disk(EVAL_DATA_PATH)
print(eval_dataset)

Dataset({
    features: ['id', 'create_date', 'title', 'input_text', 'len_tokenized', 'target_text', 'len_tokenized_target_text', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3000
})


In [15]:
# rand_idx = random.randint(0, len(eval_dataset)-1)
# tokenizer.decode(eval_dataset['input_ids'][rand_idx])
# # tokenizer.decode(eval_dataset['labels'][rand_idx])

# Generate

## CLM (Causal Language Model)

In [28]:
# Wrap every article in the task markers: the opening tag starts the prompt and the
# closing tag is the point after which the model writes its keyphrases.
inputs = [
    ''.join(('[generate keyphrases]', article, '[keyphrases generated]'))
    for article in eval_dataset['input_text']
]
# Reference keyphrase strings, aligned one-to-one with `inputs`.
labels = eval_dataset['target_text']

In [30]:
# Pad batches on the left so that each prompt ends at the last position of its row;
# the generation cell below tokenizes with padding=True.
tokenizer.padding_side = 'left'

In [31]:
# Decoding presets. Only the preset named by GENERATION_PRESET is handed to
# `model.generate`. Previously the sampling dict was assigned and then immediately
# overwritten by `{}`, which made it look as if it were in effect.
GENERATION_PRESETS = {
    # No extra arguments: `generate` runs with whatever defaults it has.
    'default': {},
    # Beam search combined with top-k / top-p sampling.
    'beam_sample': {
        'do_sample': True,
        'top_k': 50,
        'top_p': 0.7,
        'num_beams': 5,
        'no_repeat_ngram_size': 3,
        'num_return_sequences': 1,
        # 'num_beam_groups': 3,
        'early_stopping': True,
    },
}
GENERATION_PRESET = 'default'

# Copy, so later edits to `generation_args` cannot alter the preset table.
generation_args = dict(GENERATION_PRESETS[GENERATION_PRESET])

In [32]:
batch_size = 12
# One list of token ids per input, in input order. Starting from an empty list (instead of
# None) keeps the later cells working even when `inputs` is empty.
predictions = []

with torch.no_grad(), torch.autocast("cuda"):
    # Walk over the inputs in slices of `batch_size`; the final slice may be shorter.
    for start in tqdm(range(0, len(inputs), batch_size)):
        batch_texts = inputs[start:start + batch_size]

        tokenized = tokenizer(batch_texts, truncation=False, padding=True, return_tensors='pt')
        # Move tensors to the target device; token_type_ids are dropped before calling generate.
        tokenized = {k: v.to(device) for k, v in tokenized.items() if k != 'token_type_ids'}

        prediction = model.generate(
            **tokenized,
            max_new_tokens=64,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            **generation_args,
        )
        # Store plain Python lists; the marker search in a later cell relies on list.index.
        predictions.extend(prediction.cpu().tolist())

100%|██████████| 250/250 [32:18<00:00,  7.75s/it]


In [33]:
# Look up the token id(s) of the '[keyphrases generated]' marker; the recorded output shows a single id.
tokenizer.encode('[keyphrases generated]')

[30004]

In [34]:
# Id of the '[keyphrases generated]' marker that closes every prompt. It is resolved from
# the tokenizer instead of hard-coding 30004, so the cell keeps working if the vocabulary changes.
marker_ids = tokenizer.encode('[keyphrases generated]', add_special_tokens=False)
assert len(marker_ids) == 1, f'expected a single marker token, got {marker_ids}'
marker_id = marker_ids[0]

# Keep only the tokens after the first marker occurrence, i.e. the generated continuation.
# list.index raises ValueError if a sequence does not contain the marker.
predictions_only = [
    prediction[prediction.index(marker_id) + 1:]
    for prediction in predictions
]

In [35]:
# Convert the generated token ids back to text, dropping special tokens.
predictions_decoded = tokenizer.batch_decode(predictions_only, skip_special_tokens=True)

In [36]:

# Show each reference line followed by the model output, separated by a blank line.
for label, prediction_decoded in zip(labels, predictions_decoded):
    print(label, prediction_decoded, '', sep='\n')

보드게임콘; 2023 보드게임콘; 국내 최대 보드게임 축제; SETEC; 자유롭고 다채로운 행사; 최대 할인율; 국내외 유명 보드게임 600여 종; 무료 체험; 작가존; 대학교 동아리존
보드게임콘; 2023 보드게임콘; SETEC; 자유롭고 다채로운 행사; 최대 할인율; 보드게임; 작가존; 대학교 동아리존; 무료 체험; 사전등록; 포토존; 현장 이벤트; 가족; 친구; 연인; 남녀노소 다양한 사람들; 보드게임을 즐기는 사람

벤제마; 레알 마드리드; 발롱도르 수상자; 라리가 득점왕; UCL 득점왕; 더블 우승; 선수 계약 연장; 재계약 합의; 이번 시즌 출전; 스타 엘링 홀란드
벤제마; 레알 마드리드; 1년 연장; 구두 합의; 2022 발롱도르 수상자; 유럽축구연맹(UFEA) 챔피언스리그(UCL); 득점왕; 리그에서 활약; 엘링 홀란드; 1년 더 계약 연장; 주장인 벤제마와 레알은 1년 더

50억 클럽; 특검법안; 국회; 법제사법위원회; 상정; 정의당; 원내대변인; 김도읍 법사위원장; 국민의힘; 주호영 원내대표
국회 법사위; 50억 클럽; 특검법안; 상정; 정의당; 원내대변인; 국민의힘; 주호영; 원내대표; 논의; 검찰 수사; 패스트트랙; 진실규명; 사법 정의; 법사위 안건 지정; 신속처리 안건; 검찰 수사; 진실 은폐; 윤석열 정치 검찰; 증인 보호; 진실 은폐

박현주 회장; 미래에셋그룹; 배당금 전액 기부; 미래에셋자산운용; 누적기부액; 이 땅의 젊은이들을 위해; 미래에셋박현주재단; 인재 육성 프로그램; 사회복지 사업; 나만의 책꿈터 지원
박현주 미래에셋 회장; 배당금 전액 기부; 미래에셋자산운용; 박현주 미래에셋그룹 회장; 2010년부터; 미래에셋에서 받은 배당금; 2010년부터; 미래에셋박현주재단; 미래에셋희망재단; 인재 육성 프로그램; 미래에셋해외 교환 장학생;

집권여당; 원내대표; 윤재옥; TK 정치권; 구심점; 꼼꼼함; 치밀함; 구원투수; 주호영 전 원내대표
윤재옥; 원내대표; TK 정치권; 협상 분위기; 경찰 출신; 의정 활동; 드루킹 특검법

# Calculate Metrics

In [37]:
def compute_metrics(eval_pred):
    """Score generated token sequences against reference texts.

    Args:
        eval_pred: Pair ``(predictions, decoded_labels)``. ``predictions`` is a sequence
            of token-id sequences; ``decoded_labels`` holds the reference texts that are
            passed unchanged to the metric helpers.

    Returns:
        dict: Every score produced by ``rouge_for_batch`` plus ``"F1@10"`` and
        ``"jaccard_similarity"``, each multiplied by 100 and rounded to 4 decimals,
        and ``"gen_len"``, the mean number of non-pad tokens per prediction
        (rounded to 4 decimals, not scaled).
    """
    token_ids, references = eval_pred

    # Per-prediction count of tokens that differ from the pad token.
    non_pad_counts = []
    for ids in token_ids:
        is_not_pad = np.array(ids) != tokenizer.pad_token_id
        non_pad_counts.append(np.count_nonzero(is_not_pad))

    hypotheses = tokenizer.batch_decode(predictions if False else token_ids, skip_special_tokens=True) if False else tokenizer.batch_decode(token_ids, skip_special_tokens=True)

    # ROUGE / Mean Generated Length
    print('Calculating ROUGE...')
    scores = rouge_for_batch(references, hypotheses)

    # F1@K
    print('Calculating F1@10...')
    f1_at_10 = f1_score_at_k_for_batch(references, hypotheses, 10)
    scores["F1@10"] = f1_at_10

    # Jaccard@K
    print('Jaccard F1@10...')
    jaccard = jaccard_similarity_for_batch(references, hypotheses, 10)
    scores["jaccard_similarity"] = jaccard

    # Report the scores as percentages; the length statistic is added afterwards, unscaled.
    scaled = {name: round(score * 100, 4) for name, score in scores.items()}
    scaled["gen_len"] = round(np.mean(non_pad_counts), 4)

    return scaled

In [38]:
# Score the generated continuations (token ids) against the reference keyphrase strings.
metrics = compute_metrics((predictions_only, labels))

Calculating ROUGE...


100%|██████████| 3000/3000 [07:58<00:00,  6.27it/s]


Calculating F1@10...


100%|██████████| 3000/3000 [00:00<00:00, 46326.45it/s]


Jaccard F1@10...


100%|██████████| 3000/3000 [00:00<00:00, 76022.79it/s]


In [39]:
# Display the metric dictionary computed in the previous cell.
metrics

{'rouge1': 50.1822,
 'rouge2': 35.8062,
 'rougeL': 42.1194,
 'rougeLsum': 42.1194,
 'F1@10': 61.0273,
 'jaccard_similarity': 25.3944,
 'gen_len': 64.0}

#### Beam Search
```
generation_args = {
    'num_beams': 5, 
    'no_repeat_ngram_size': 3, 
    'num_return_sequences': 1, 
    'early_stopping': False,
}

{'rouge1': 65.9559,
 'rouge2': 45.8405,
 'rougeL': 53.5547,
 'rougeLsum': 53.5547,
 'F1@10': 58.2764,
 'jaccard_similarity': 25.382}
```

#### Diverse Beam Search
```
generation_args = {
    'num_beams': 15, 
    'no_repeat_ngram_size': 3, 
    'num_return_sequences': 1, 
    'num_beam_groups': 3,
    'early_stopping': False,
}

{'rouge1': 65.9841,
 'rouge2': 45.8601,
 'rougeL': 53.5776,
 'rougeLsum': 53.5776,
 'F1@10': 58.2764,
 'jaccard_similarity': 25.382}
```

#### Sampling
```
generation_args = {
    'do_sample': True, 
    'top_k': 3, 
    'top_p': 0.95, 
}

{'rouge1': 64.5375,
 'rouge2': 43.5099,
 'rougeL': 52.8016,
 'rougeLsum': 52.8016,
 'F1@10': 58.6438,
 'jaccard_similarity': 24.1092}
```


# Inference on Eval Dataset

In [None]:
# Print input, reference and prediction for every evaluation example.
# NOTE(review): each entry of `predictions` still contains the prompt tokens (the marker is
# searched inside it when `predictions_only` is built), so the decoded prediction repeats
# the input text — consider iterating over `predictions_only` instead.
for data, pred in zip(eval_dataset, predictions):
    # `predictions` holds plain Python lists; convert first so that the comparison below is
    # element-wise instead of a single list-vs-int comparison.
    pred = np.asarray(pred)
    pred = np.where(pred != -100, pred, tokenizer.pad_token_id)
    context = tokenizer.decode(data['input_ids'], skip_special_tokens=True)
    # The eval dataset has no 'labels' column (indexing it raises KeyError); the reference
    # keyphrases are stored as text in 'target_text'.
    summary = data['target_text']
    pred = tokenizer.decode(pred, skip_special_tokens=True)
    print(f'입력: {context}')
    print(f'정답: {summary}')
    print(f'예측: {pred}', end='\n\n')