In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
import torch
import numpy as np

from datasets import load_dataset
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, EvalPrediction

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# CONFIG

SEED = 42
CHECKPOINT = 'bert-large-cased'
NUM_OF_OPTIONS = 2605
PROBLEM_TYPE = 'multi_label_classification'

DATA_FILES = {
    'train': '../preprocess/option/train.json',
    'valid': '../preprocess/option/valid.json',
    'test': '../preprocess/option/test.json'
}

In [4]:
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT, num_labels=NUM_OF_OPTIONS, problem_type=PROBLEM_TYPE)

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-cased and are newly ini

In [5]:
# Load the dataset and apply the tokenizer
dataset = load_dataset("json", data_files=DATA_FILES)

Found cached dataset json (/home/nlplab11/.cache/huggingface/datasets/json/default-e189f551ad0c0c81/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
100%|██████████| 3/3 [00:00<00:00, 341.10it/s]


In [6]:
# Define a function to tokenize the data
def tokenize(batch):
    return tokenizer(batch["prompt"], padding=True, truncation=True, max_length=256)

In [7]:
option_list = []

for i in range(len(dataset['train'])):
    for option in dataset['train'][i]['option'][0]:
        if option not in option_list:
            option_list.append(option)

print(len(option_list))

2605


In [8]:
mlb = MultiLabelBinarizer()
mlb.fit_transform([option_list])

array([[1, 1, 1, ..., 1, 1, 1]])

In [9]:
mlb.classes_

array(['#wow', '2d', '2d animation', ..., 'zine', 'zoom', 'zoom lens'],
      dtype=object)

In [10]:
dataset = dataset.map(tokenize, batched=True)

Loading cached processed dataset at /home/nlplab11/.cache/huggingface/datasets/json/default-e189f551ad0c0c81/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-ba6fbb995fc44184.arrow
Loading cached processed dataset at /home/nlplab11/.cache/huggingface/datasets/json/default-e189f551ad0c0c81/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-5053ec8199b51ed4.arrow
Loading cached processed dataset at /home/nlplab11/.cache/huggingface/datasets/json/default-e189f551ad0c0c81/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-c2948d6752bc2bf7.arrow


In [11]:
mlb.transform(dataset['train'][0]['option'])[0]

array([0, 0, 0, ..., 0, 0, 0])

In [12]:
def list_to_numpy(batch):
    batch['labels'] = np.array(mlb.transform(batch['option'])[0], dtype=np.float32)
    return batch

In [13]:
dataset = dataset.map(list_to_numpy, batched=False)

Loading cached processed dataset at /home/nlplab11/.cache/huggingface/datasets/json/default-e189f551ad0c0c81/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-3081a84761c3eb3a.arrow
Loading cached processed dataset at /home/nlplab11/.cache/huggingface/datasets/json/default-e189f551ad0c0c81/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-85980b03cff74578.arrow
Loading cached processed dataset at /home/nlplab11/.cache/huggingface/datasets/json/default-e189f551ad0c0c81/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-033520a5797b1e92.arrow


In [14]:
dataset = dataset.remove_columns(["prompt", "option"])
dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [15]:
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))

    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1

    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)

    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    
    return metrics

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    result = multi_label_metrics(
        predictions=logits, 
        labels=labels
        )
    
    return result

In [16]:
# Set up the training arguments and trainer
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=48,
    per_device_eval_batch_size=48,
    num_train_epochs=10,
    weight_decay=0.01,
    fp16=True,
    report_to=None,
    load_best_model_at_end=True,
    do_eval=True,
    warmup_steps=100,
    logging_dir='./logs',      
    logging_steps=100,
    save_strategy='epoch',
    label_names=['labels']
)

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['valid'],
    compute_metrics=compute_metrics,   
)

In [18]:
# Fine-tune the model
trainer.train()



OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 MiB (GPU 0; 23.70 GiB total capacity; 22.40 GiB already allocated; 176.81 MiB free; 22.47 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF