In [83]:
import os
import re
import warnings

import pandas as pd
import numpy as np
import torch

from transformers import (
    AutoConfig, AutoTokenizer, 
    T5TokenizerFast, T5ForConditionalGeneration, 
    AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, 
    AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
)

from datasets import load_metric, Dataset

import evaluate

import wandb
import nltk

from konlpy.tag import Komoran

os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings('ignore')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [84]:
NGPU = torch.cuda.device_count()
NCPU = os.cpu_count()
NGPU, NCPU

(6, 64)

# Paths and Names

In [85]:
### paths and names

DATA_PATH = 'data/model_dev/model_dev_v3.pickle'
MODEL_CHECKPOINT = '.log/paust_pko_t5_base_v3_run_5/checkpoint-11310'

# Model & Tokenizer

In [86]:
config = AutoConfig.from_pretrained(MODEL_CHECKPOINT)

In [87]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT, config=config)
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
metric = load_metric('rouge')

# Inputs and Labels

In [88]:
prefix = "generate keyphrases: "

max_input_length = 1024
max_target_length = 64

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["input_text"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")

    labels = tokenizer(examples["target_text"], max_length=max_target_length, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [89]:
data_df = pd.read_pickle(DATA_PATH)

In [90]:
dataset = Dataset.from_pandas(data_df).shuffle(seed=100).train_test_split(0.2, seed=100)
train_dataset = dataset['train']
eval_dataset = dataset['test']

In [91]:
train_dataset = train_dataset.map(preprocess_function, 
                                  batched=True, 
                                  num_proc=NCPU, 
                                  remove_columns=train_dataset.column_names)

eval_dataset = eval_dataset.map(preprocess_function, 
                                batched=True, 
                                num_proc=NCPU, 
                                remove_columns=eval_dataset.column_names)
print(train_dataset)
print(eval_dataset)

Map (num_proc=64):   0%|          | 0/9346 [00:00<?, ? examples/s]

Map (num_proc=64):   0%|          | 0/2337 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 9346
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2337
})


In [109]:
inputs = eval_dataset[:100]
input_ids = torch.tensor(inputs['input_ids'])
attention_mask = torch.tensor(inputs['attention_mask'])
labels = tokenizer.batch_decode(inputs['labels'], skip_special_tokens=True)

In [110]:
inputs_decoded = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
# print(f'inputs_decoded: {inputs_decoded}')

In [111]:
predictions = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=64)
predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)

In [112]:
# labels = [[label] for label in labels]
# predictions = [[prediction] for prediction in predictions]

### ROUGE

In [113]:
komoran = Komoran()

In [114]:
rouge = evaluate.load('rouge')

In [115]:
def rouge_for_sampale(label, prediction):
    return rouge.compute(references=[label], predictions=[prediction], tokenizer=komoran.morphs)

In [116]:
def rouge_for_batch(labels, predictions):
    rouge_scores = None
    
    for label, prediction in zip(labels, predictions):
        if rouge_scores == None:
            rouge_scores = rouge_for_sampale(label, prediction)
        else:
            rouge_score = rouge_for_sampale(label, prediction)
            for key in rouge_scores.keys():
                rouge_scores[key] = rouge_scores[key] + rouge_score[key]
    
    for key in rouge_scores.keys():
        rouge_scores[key] = rouge_scores[key] / len(labels)
    
    return rouge_scores

In [117]:
rouge_for_batch(labels, predictions)

{'rouge1': 0.6478344446493621,
 'rouge2': 0.4362975104732406,
 'rougeL': 0.5335291248203091,
 'rougeLsum': 0.5335291248203091}

### F1

In [None]:
def f1_score_at_k_for_sample(label, prediction, k):
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    
    # convert label and prediction strings to sets of key-phrases
    label_lst = [key_phrase.strip() for key_phrase in label.split(';') if key_phrase != '']
    label_lst = [key_phrase for key_phrase in label_lst if key_phrase != '']
    label_set = set(label_lst)
    # print(f'label_set: {label_set}')
    
    # split the predicted key-phrases and their scores
    prediction_lst = [key_phrase.strip() for key_phrase in prediction.split(';') if key_phrase != '']
    prediction_lst = [key_phrase for key_phrase in prediction_lst if key_phrase != ''][:k]
    prediction_set = set(prediction_lst)
    # prediction_set = set(p[0] for p in predictions[:k])
    # print(f'prediction_set: {prediction_set}')
    
    # calculate true positives, false positives, and false negatives
    for keyphrase in prediction_set:
        if keyphrase in label_set:
            true_positives += 1
        else:
            false_positives += 1
    
    for keyphrase in label_set:
        if keyphrase not in prediction_set:
            false_negatives += 1
    
    # print(f'true_positives: {true_positives}')    
    # print(f'false_positives: {false_positives}')
    # print(f'false_negatives: {false_negatives}')

    # calculate precision, recall, and F1 score
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    
    if precision == 0 or recall == 0:
        return 0
    
    f1_score = 2 * (precision * recall) / (precision + recall)
    
    return f1_score

In [118]:
def f1_score_at_k_for_sample(label, prediction, k):
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    
    # convert label and prediction strings to sets of key-phrases
    label_lst = [key_phrase.strip() for key_phrase in label.split(';') if key_phrase != '']
    label_lst = [key_phrase for key_phrase in label_lst if key_phrase != '']
    label_set = set(label_lst)
    # print(f'label_set: {label_set}')
    
    # split the predicted key-phrases and their scores
    prediction_lst = [key_phrase.strip() for key_phrase in prediction.split(';') if key_phrase != '']
    prediction_lst = [key_phrase for key_phrase in prediction_lst if key_phrase != ''][:k]
    prediction_set = set(prediction_lst)
    # prediction_set = set(p[0] for p in predictions[:k])
    # print(f'prediction_set: {prediction_set}')
    
    # calculate true positives, false positives, and false negatives
    for keyphrase in prediction_set:
        if keyphrase in label_set:
            true_positives += 1
        else:
            false_positives += 1
    
    for keyphrase in label_set:
        if keyphrase not in prediction_set:
            false_negatives += 1
    
    # print(f'true_positives: {true_positives}')    
    # print(f'false_positives: {false_positives}')
    # print(f'false_negatives: {false_negatives}')

    # calculate precision, recall, and F1 score
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    
    if precision == 0 or recall == 0:
        return 0
    
    f1_score = 2 * (precision * recall) / (precision + recall)
    
    return f1_score

In [None]:
labels, predictions

In [120]:
def f1_score_at_k_for_batch(labels, predictions, k):
    f1_scores =[]

    for label, prediction in zip(labels, predictions):
        f1_scores.append(f1_score_at_k_for_sample(label, prediction, k))

    # print(f1_scores)
    return sum(f1_scores) / len(f1_scores)

In [121]:
f1_score_at_k_for_batch(labels, predictions, 10)

[0.5, 0.20000000000000004, 0, 0.6, 0.4210526315789474, 0.3, 0.20000000000000004, 0.33333333333333326, 0.20000000000000004, 0.3, 0.5, 0.4000000000000001, 0.4000000000000001, 0.20000000000000004, 0.20000000000000004, 0.2105263157894737, 0.4000000000000001, 0.20000000000000004, 0.5555555555555556, 0.28571428571428564, 0.3157894736842105, 0.5, 0.6, 0.20000000000000004, 0.1111111111111111, 0.3157894736842105, 0.4444444444444445, 0.6, 0.4000000000000001, 0.3, 0.20000000000000004, 0.4000000000000001, 0.6, 0.5, 0.4000000000000001, 0.8000000000000002, 0.23529411764705882, 0.2105263157894737, 0.3, 0.4210526315789474, 0.4210526315789474, 0.6, 0.5263157894736842, 0.3, 0.5, 0.4000000000000001, 0.6, 0.4000000000000001, 0.2105263157894737, 0.10000000000000002, 0.3, 0.3, 0.3, 0.4000000000000001, 0.4000000000000001, 0.3, 0.5263157894736842, 0.7, 0.631578947368421, 0.5263157894736842, 0.6, 0.4000000000000001, 0.4000000000000001, 0.7368421052631577, 0.3, 0.3157894736842105, 0.4000000000000001, 0.63157894

0.38000686765934427

In [122]:
# f1_score_at_k_for_sample(labels[9], prediction[9], 10)

### Jaccard

In [123]:
def jaccard_similarity_for_sample(label, prediction, k):

    # convert label and prediction strings to sets of key-phrases
    label_lst = [key_phrase.strip() for key_phrase in label.split(';') if key_phrase != '']
    label_lst = [key_phrase for key_phrase in label_lst if key_phrase != '']
    # print(label_lst)
    
    # split the predicted key-phrases and their scores
    prediction_lst = [key_phrase.strip() for key_phrase in prediction.split(';') if key_phrase != '']
    prediction_lst = [key_phrase for key_phrase in prediction_lst if key_phrase != ''][:k]
    # print(prediction_lst)

    """Define Jaccard Similarity function for two sets"""
    intersection = len(list(set(label_lst).intersection(prediction_lst)))
    union = (len(label_lst) + len(prediction_lst)) - intersection

    # print(union)
    # print(intersection)

    return float(intersection) / union

In [124]:
def jaccard_similarity_for_batch(labels, predictions, k):
    jaccard_similarities =[]

    for label, prediction in zip(labels, predictions):
        jaccard_similarities.append(jaccard_similarity_for_sample(label, prediction, k))

    print(jaccard_similarities)
    return sum(jaccard_similarities) / len(jaccard_similarities)

In [125]:
jaccard_similarity_for_batch(labels, predictions, 10)

[0.3333333333333333, 0.1111111111111111, 0.0, 0.42857142857142855, 0.25, 0.17647058823529413, 0.1111111111111111, 0.17647058823529413, 0.1111111111111111, 0.17647058823529413, 0.3333333333333333, 0.25, 0.25, 0.1111111111111111, 0.1111111111111111, 0.1111111111111111, 0.25, 0.1111111111111111, 0.3333333333333333, 0.16666666666666666, 0.17647058823529413, 0.3333333333333333, 0.42857142857142855, 0.1111111111111111, 0.05555555555555555, 0.17647058823529413, 0.25, 0.42857142857142855, 0.25, 0.17647058823529413, 0.1111111111111111, 0.25, 0.42857142857142855, 0.3333333333333333, 0.25, 0.6666666666666666, 0.125, 0.1111111111111111, 0.17647058823529413, 0.25, 0.25, 0.42857142857142855, 0.3333333333333333, 0.17647058823529413, 0.3333333333333333, 0.25, 0.42857142857142855, 0.25, 0.1111111111111111, 0.05263157894736842, 0.17647058823529413, 0.17647058823529413, 0.17647058823529413, 0.25, 0.25, 0.17647058823529413, 0.3333333333333333, 0.5384615384615384, 0.42857142857142855, 0.3333333333333333, 0

0.2421258226637483