In [1]:
import os
import re
import warnings

import pandas as pd
import numpy as np
import torch

from transformers import (
    AutoConfig, AutoTokenizer, 
    T5TokenizerFast, T5ForConditionalGeneration, 
    AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, 
    AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
)

from datasets import load_metric, Dataset

import wandb
import nltk

os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings('ignore')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
NGPU = torch.cuda.device_count()
NCPU = os.cpu_count()
NGPU, NCPU

(6, 64)

# Paths and Names

In [3]:
### paths and names

DATA_PATH = 'data/model_dev/model_dev_v3.pickle'
MODEL_CHECKPOINT = '.log/paust_pko_t5_base_v3_run_5/checkpoint-11310'

# Model & Tokenizer

In [4]:
config = AutoConfig.from_pretrained(MODEL_CHECKPOINT)

In [5]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT, config=config)
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
metric = load_metric('rouge')

# Inputs and Labels

In [11]:
prefix = "generate keyphrases: "

max_input_length = 1024
max_target_length = 64

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["input_text"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")

    labels = tokenizer(examples["target_text"], max_length=max_target_length, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [12]:
data_df = pd.read_pickle(DATA_PATH)

In [13]:
dataset = Dataset.from_pandas(data_df).shuffle(seed=100).train_test_split(0.2, seed=100)
train_dataset = dataset['train']
eval_dataset = dataset['test']

In [14]:
train_dataset = train_dataset.map(preprocess_function, 
                                  batched=True, 
                                  num_proc=NCPU, 
                                  remove_columns=train_dataset.column_names)

eval_dataset = eval_dataset.map(preprocess_function, 
                                batched=True, 
                                num_proc=NCPU, 
                                remove_columns=eval_dataset.column_names)
print(train_dataset)
print(eval_dataset)

Map (num_proc=64):   0%|          | 0/9346 [00:00<?, ? examples/s]

Map (num_proc=64):   0%|          | 0/2337 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 9346
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2337
})


In [70]:
inputs = eval_dataset[:10]
input_ids = torch.tensor(inputs['input_ids'])
attention_mask = torch.tensor(inputs['attention_mask'])
labels = tokenizer.batch_decode(inputs['labels'], skip_special_tokens=True)

In [71]:
inputs_decoded = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
# print(f'inputs_decoded: {inputs_decoded}')

In [72]:
predictions = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=64)
predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)

In [73]:
def f1_score_at_k_for_sample(label, prediction, k):
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    
    # convert label and prediction strings to sets of key-phrases
    label_lst = [key_phrase.strip() for key_phrase in label.split(';') if key_phrase != '']
    label_lst = [key_phrase for key_phrase in label_lst if key_phrase != '']
    label_set = set(label_lst)
    # print(f'label_set: {label_set}')
    
    # split the predicted key-phrases and their scores
    prediction_lst = [key_phrase.strip() for key_phrase in prediction.split(';') if key_phrase != '']
    prediction_lst = [key_phrase for key_phrase in prediction_lst if key_phrase != ''][:k]
    prediction_set = set(prediction_lst)
    # prediction_set = set(p[0] for p in predictions[:k])
    # print(f'prediction_set: {prediction_set}')
    
    # calculate true positives, false positives, and false negatives
    for keyphrase in prediction_set:
        if keyphrase in label_set:
            true_positives += 1
        else:
            false_positives += 1
    
    for keyphrase in label_set:
        if keyphrase not in prediction_set:
            false_negatives += 1
    
    # print(f'true_positives: {true_positives}')    
    # print(f'false_positives: {false_positives}')
    # print(f'false_negatives: {false_negatives}')

    # calculate precision, recall, and F1 score
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    
    if precision == 0 or recall == 0:
        return 0
    
    f1_score = 2 * (precision * recall) / (precision + recall)
    
    return f1_score

In [74]:
labels, predictions

(['토트넘; 우승; 아스널; 맨시티; 승점 차; 빅4; 브라이튼; 강등 전쟁; 슈퍼컴퓨터; 노팅엄 포레스트',
  '경부고속도; 지하화; 서울 리니어파크; 강남 도심; 상부 공간; 녹지 공간; 연결; 서울시장; 지하도로; 착공',
  '중기부; 중소기업부; 수출활성화; TF(태스크포스); 글로벌 강소기업; 육성; 1000+ 프로젝트; 글로벌 비즈니스센터(GBC); 개편전략; 수출인큐베이터(BI)',
  '하태경; 윤석열 대통령; 부산세계박람회; BIE; 일광수산 횟집; 친일몰이; 더탐사; 선라이즈; 조선시대 지명; 괴담 언론',
  'SSG 추신수; 외야 수비; 1번 우익수; 대전 한화전; 첫 수비 출장; 지명타자; 개막 7경기; 왼쪽 팔꿈치 수술; 스프링캠프; 선발 라인업',
  '전처 남친 살해; 재판; 혐의; 징역 19년; B씨; 이혼; 주거지; 1심; C씨; 대법원',
  '은행권 발행; 코코본드; 조건부자본증권; 발행 잔액; 상각; 회계상 자본; 상각 사유; 자기자본비율; 투자심리 위축; 금융시장',
  '수지; 그림 그리기; 취미 생활; 작업실; 인형 같은 비주얼; 상큼한 매력; 자유로움; 꾸준히 하기; 이두나!; 팬들',
  '승아 양; 어머니; 인형; 발인식; 음주 운전자; 치사; 유가족; 오빠; 친구들; 처벌 강화',
  '이태곤 감독; 보라! 데모라; 이보라; 연애서; 유인나; 이수혁; 한상진; 현실 공감 로맨스; 성장; 보라! 데보라'],
 ['토트넘; 빅4; 슈퍼컴퓨터; 우승; 브라이턴; 뉴캐슬; 맨체스터; 승점; 강등 전쟁; 레스터시티',
  '경부고속도로; 양재~한남; 지하화; 상부공간; 이용방안; 서울 리니어파크; 강남도심; 동서 지역; 리오 공원; 복합문화 공간',
  '중소벤처기업부; 수출 활성화; 민관 협·단체장; 연구기관; 합동 대응체계; 수출바우처; 글로벌 비즈니스센터; 수출바우처; 수출바우처 지원; 수출 선도기업; 스마트공장 우대지원',
  '하태경; 윤석열 대통령; 부산세계박람회; 부산 횟집; 친일몰이; 더탐사; 일광;

In [75]:
def f1_score_at_k_for_batch(labels, predictions, k):
    f1_scores =[]

    for label, prediction in zip(labels, predictions):
        f1_scores.append(f1_score_at_k_for_sample(label, prediction, k))

    print(f1_scores)
    return sum(f1_scores) / len(f1_scores)

In [77]:
f1_score_at_k_for_batch(labels, prediction, 10)

[0.5, 0.20000000000000004, 0, 0.6, 0.4210526315789474, 0.3, 0.20000000000000004, 0.33333333333333326, 0.20000000000000004, 0.3]


0.30543859649122806

In [78]:
# f1_score_at_k_for_sample(labels[9], prediction[9], 10)

In [79]:
def jaccard_similarity_for_sample(label, prediction, k):

    # convert label and prediction strings to sets of key-phrases
    label_lst = [key_phrase.strip() for key_phrase in label.split(';') if key_phrase != '']
    label_lst = [key_phrase for key_phrase in label_lst if key_phrase != '']
    # print(label_lst)
    
    # split the predicted key-phrases and their scores
    prediction_lst = [key_phrase.strip() for key_phrase in prediction.split(';') if key_phrase != '']
    prediction_lst = [key_phrase for key_phrase in prediction_lst if key_phrase != ''][:k]
    # print(prediction_lst)

    """Define Jaccard Similarity function for two sets"""
    intersection = len(list(set(label_lst).intersection(prediction_lst)))
    union = (len(label_lst) + len(prediction_lst)) - intersection

    # print(union)
    # print(intersection)

    return float(intersection) / union

In [80]:
def jaccard_similarity_for_batch(labels, predictions, k):
    jaccard_similarities =[]

    for label, prediction in zip(labels, predictions):
        jaccard_similarities.append(jaccard_similarity_for_sample(label, prediction, k))

    print(jaccard_similarities)
    return sum(jaccard_similarities) / len(jaccard_similarities)

In [81]:
jaccard_similarity_for_batch(labels, predictions, 10)

[0.3333333333333333, 0.1111111111111111, 0.0, 0.42857142857142855, 0.25, 0.17647058823529413, 0.1111111111111111, 0.17647058823529413, 0.1111111111111111, 0.17647058823529413]


0.1874649859943978