In [None]:
!pip install -q transformers datasets

In [None]:
from datasets import load_dataset

dataset = load_dataset("sem_eval_2018_task_1", "subtask5.english")

In [None]:
dataset

In [None]:
example = dataset['train'][0]
example

위 Dataset에는 여러가지 Feature가 존재합니다.

이것을 이용해 ID와 Tweet 외의 다른 Column을 이용해 Multi-label Classification을 위한 Dataset을 구성해보겠습니다.

In [None]:
labels = [label for label in dataset['train'].features.keys() if label not in ['ID', 'Tweet']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

In [None]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

def preprocess_data(examples):
    text = examples['Tweet']
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128) # tokenizer로 dict형태의 Padding, Masking, token_type_ids 등등 다 만듦.
    labels_batch = {k : examples[k] for k in examples.keys() if k in labels}
    labels_matrix = np.zeros((len(text), len(labels)))

    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]
    
    encoding['labels'] = labels_matrix.tolist()

    return encoding

In [None]:
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)

In [None]:
example = encoded_dataset['train'][0]
print(example.keys())

In [None]:
tokenizer.decode(example['input_ids'])

In [None]:
example['labels']

해당 Data에는 anticipation, optimizim, trust 가 포함되어 있다.

In [None]:
example.keys()

In [None]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

마지막으로 우리는 우리의 Data의 Format을 PyTorch tensors로 설정해야한다. 이것은 PyTorch dataset의 training, validation,test로 변환될 예정이다.

In [None]:
encoded_dataset.set_format('torch') # tokenizer 형태로 변환된 Data를 torch.utils.data.Dataset형태로 변환함.

In [36]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
Model config BertConfig {
  "_name_or_path": "bert-base-multilingual-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "anger",
    "1": "anticipation",
    "2": "disgust",
    "3": "fear",
    "4": "joy",
    "5": "love",
    "6": "optimism",
    "7": "pessimism",
    "8": "sadness",
    "9": "surprise",
    "10": "trust"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "anger": 0,
    "anticipation": 1,
    "disgust": 2,
    "fear": 3,
    "joy": 4,
    "love": 5,
    "

In [37]:
batch_size = 8
metric_name = "f1"

In [38]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english", # output_dir
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


우리는 Metric을 계산하기 위해 `compute_metric`함수를 정의할 겁니다. 이 함수는 metric 계산을 위한 Dicionary 값을 이용해 계산될 예정입니다.

In [39]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

def multi_label_metrics(predictions, labels, threshold=0.5):
    # apply sigmoid on preditions which are of shape(batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1 # 6 ~ 9번 위치가 만족 -> (array([6, 7, 8, 9]))를 이용해 0, 1로 prediction을 나눔.
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average='micro')
    accuracy = accuracy_score(y_true, y_pred)

    metrics = { # 이런 Metric이 정해져 있는 듯 함.
        'f1' : f1_micro_average,
        'roc_auc' : roc_auc,
        'accuracy': accuracy
    }
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics (
        predictions=preds,
        labels=p.label_ids
    )
    return result

In [40]:
encoded_dataset['train'][0]['labels'].type()

'torch.FloatTensor'

In [41]:
encoded_dataset['train']['input_ids'][0]

tensor([   101,    100, 102204,  31862,  10124,    169,  12935,  67701,  10135,
           169,  18077,  13028,  11387,  14794,  10529,    112,    119,  35088,
         12963,    119,    108,  63598,  11809,    108,  25121,    108,  12796,
         31862,    102,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0, 

In [42]:
#forward pass
outputs = model(input_ids=encoded_dataset['train']['input_ids'][0].unsqueeze(0), labels=encoded_dataset['train'][0]['labels'].unsqueeze(0))
outputs

SequenceClassifierOutput([('loss',
                           tensor(0.6869, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)),
                          ('logits',
                           tensor([[-0.1150,  0.0214,  0.0988,  0.0486, -0.0101,  0.1667,  0.0157, -0.2099,
                                    -0.0566, -0.0968, -0.0454]], grad_fn=<AddmmBackward0>))])

In [43]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [44]:
trainer.train()

***** Running training *****
  Num examples = 6838
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4275


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.4296,0.35845,0.591216,0.720465,0.209932
2,0.3211,0.346114,0.655164,0.76647,0.264108
3,0.2743,0.333408,0.669056,0.773468,0.267494
4,0.2491,0.336251,0.677582,0.780612,0.266366
5,0.2183,0.34218,0.676245,0.782141,0.248307


***** Running Evaluation *****
  Num examples = 886
  Batch size = 8
Saving model checkpoint to bert-finetuned-sem_eval-english/checkpoint-855
Configuration saved in bert-finetuned-sem_eval-english/checkpoint-855/config.json
Model weights saved in bert-finetuned-sem_eval-english/checkpoint-855/pytorch_model.bin
tokenizer config file saved in bert-finetuned-sem_eval-english/checkpoint-855/tokenizer_config.json
Special tokens file saved in bert-finetuned-sem_eval-english/checkpoint-855/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 886
  Batch size = 8
Saving model checkpoint to bert-finetuned-sem_eval-english/checkpoint-1710
Configuration saved in bert-finetuned-sem_eval-english/checkpoint-1710/config.json
Model weights saved in bert-finetuned-sem_eval-english/checkpoint-1710/pytorch_model.bin
tokenizer config file saved in bert-finetuned-sem_eval-english/checkpoint-1710/tokenizer_config.json
Special tokens file saved in bert-finetuned-sem_eval-english/checkpoin

TrainOutput(global_step=4275, training_loss=0.2923255920410156, metrics={'train_runtime': 682.126, 'train_samples_per_second': 50.123, 'train_steps_per_second': 6.267, 'total_flos': 2249123476753920.0, 'train_loss': 0.2923255920410156, 'epoch': 5.0})

In [45]:
text = "I'm happy I can finally train a model for multi-label classification"

encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)

In [46]:
logits = outputs.logits
logits.shape

torch.Size([1, 11])

In [47]:
# apply sigmoid + threshold
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
predictions = np.zeros(probs.shape)
predictions[np.where(probs >= 0.5)] = 1
# turn predicted id's into actual label names
predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
print(predicted_labels)

['joy', 'optimism']
