In [1]:
import os
import json
import numpy as np

In [2]:
def load_dataset(path_data, tokenizer):
    with open(path_data, 'r') as file: 
        list_content = json.load(file)
    # end
    
    labels_all = sorted(list(list_content[0]['multilabels'].keys()))
    id2label = {id_label:label for id_label, label in enumerate(labels_all)}
    label2id = {label:id_label for id_label, label in id2label.items()}
    
    samples = []
    for content in list_content:
        sample = tokenizer(content['processed'], padding="max_length", truncation=True, max_length=128)
        label_target = np.zeros(len(labels_all))
        
        for label, val in content['multilabels'].items():
            if val > 0:
                id_label = label2id[label]
                label_target[id_label] = 1.0
            # end
        # end
        
        sample['label'] = label_target
        samples.append(sample)
    # end
    
    return samples, labels_all, id2label, label2id
# end

In [3]:
folder_data = 'dataset'
filename_data_origin = 'goscv_trainingdataset_unpatched_0722_merged.json'
path_data_origin = os.path.join(folder_data, filename_data_origin)



In [4]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
samples, labels_all, id2label, label2id  = load_dataset(path_data_origin, tokenizer)

In [5]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels_all),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifi

In [6]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    "multilabel_main_1",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    #push_to_hub=True,
)

In [7]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [8]:
trainer = Trainer(
    model,
    args,
    train_dataset=samples,
    eval_dataset=samples,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [9]:
trainer.train()

***** Running training *****
  Num examples = 731
  Num Epochs = 5
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 1830


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.21938,0.813936,0.855042,0.49658
2,0.322700,0.096679,0.955348,0.967364,0.896033
3,0.132400,0.059121,0.973421,0.979773,0.935705
4,0.132400,0.04441,0.981481,0.987151,0.961696
5,0.063000,0.038971,0.983888,0.988812,0.9658


***** Running Evaluation *****
  Num examples = 731
  Batch size = 2
Saving model checkpoint to multilabel_main_1/checkpoint-366
Configuration saved in multilabel_main_1/checkpoint-366/config.json
Model weights saved in multilabel_main_1/checkpoint-366/pytorch_model.bin
tokenizer config file saved in multilabel_main_1/checkpoint-366/tokenizer_config.json
Special tokens file saved in multilabel_main_1/checkpoint-366/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 731
  Batch size = 2
Saving model checkpoint to multilabel_main_1/checkpoint-732
Configuration saved in multilabel_main_1/checkpoint-732/config.json
Model weights saved in multilabel_main_1/checkpoint-732/pytorch_model.bin
tokenizer config file saved in multilabel_main_1/checkpoint-732/tokenizer_config.json
Special tokens file saved in multilabel_main_1/checkpoint-732/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 731
  Batch size = 2
Saving model checkpoint to multilabel_main_1/

TrainOutput(global_step=1830, training_loss=0.1498341596843115, metrics={'train_runtime': 90.0714, 'train_samples_per_second': 40.579, 'train_steps_per_second': 20.317, 'total_flos': 121057195818240.0, 'train_loss': 0.1498341596843115, 'epoch': 5.0})