In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 7.5MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 34.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 56.9MB/s 
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


In [2]:
!nvidia-smi

Mon Apr 26 08:59:45 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import os
import numpy as np
import pandas as pd

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments
from transformers.trainer_callback import ProgressCallback

import torch
from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import datetime

from google.colab import drive
import os
import sys
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd

class PolishDatasetLoader:
    MAIN_DIR_PATH = 'https://github.com/WilyLynx/mlt4pm/raw/master/data/PolishDataset'

    @staticmethod
    def load_train(type:object, size:object)->pd.DataFrame:
        """Loads the training dataset from repository

        Args:
            type (object): dataset type: all, chemia, napoje
            size (object): dataset size: small, medium, large

        Returns:
            pd.DataFrame: training dataset
        """
        path = f'{PolishDatasetLoader.MAIN_DIR_PATH}/{type}_train/pl_wdc_{type}_{size}.json.gz'
        df = pd.read_json(path, compression='gzip', lines=True)
        return df.reset_index()

    @staticmethod
    def load_test(type:object)->pd.DataFrame:
        """Loads the test dataset form repository

        Args:
            type (object): dataset type: all, chemia, napoje

        Returns:
            pd.DataFrame: test dataset
        """
        path = f'{PolishDatasetLoader.MAIN_DIR_PATH}/test/pl_wdc_{type}_test.json.gz'
        df = pd.read_json(path, compression='gzip', lines=True)
        return df.reset_index()


class FeatureBuilder:
    def __init__(self, columns):
        self.columns = columns

    def get_X(self, dataset):
        X = '[CLS] ' + dataset[f'{self.columns[0]}_left']
        for i in range(1, len(self.columns)):
            X = X + ' [SEP] ' + dataset[f'{self.columns[i]}_left']
        for i in range(len(self.columns)):
            X = X + ' [SEP] ' + dataset[f'{self.columns[i]}_right']
        X + ' [SEP]'
        return X.to_list()

    def get_y(self, dataset):
        return dataset['label'].to_list()


class TorchPreprocessedDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        self.items = self.preprocessItems(encodings, labels)

    def __getitem__(self, idx):
        return self.items[idx]

    def __len__(self):
        return len(self.labels)

    def preprocessItems(self, encodings, labels):
        items = []
        for idx in range(len(labels)):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx])
            items.append(item)
        return items


In [5]:
model_name = 'bert-base-multilingual-uncased'
dataset_type = 'chemia'
dataset_size = 'small'

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
title_fb = FeatureBuilder(['title'])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=871891.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1715180.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [7]:
train_df = PolishDatasetLoader.load_train(dataset_type, dataset_size)
X_train = title_fb.get_X(train_df)
y_train = title_fb.get_y(train_df)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

train_encodings = tokenizer(X_train, truncation=True, padding=True)
val_encodings = tokenizer(X_val, truncation=True, padding=True)

train_dataset = TorchPreprocessedDataset(train_encodings, y_train)
val_dataset = TorchPreprocessedDataset(val_encodings, y_val)

del train_df, X_train, X_val

In [8]:
test_df = PolishDatasetLoader.load_test(dataset_type)
X_test = title_fb.get_X(test_df)
y_test = title_fb.get_y(test_df)
test_encodings = tokenizer(X_test, truncation=True, padding=True)
test_dataset = TorchPreprocessedDataset(test_encodings, y_test)
del test_df, X_test

In [9]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=16,   # batch size per device during training
    per_device_eval_batch_size=64,    # batch size for evaluation
    warmup_steps=500,                 # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                # strength of weight decay
    logging_dir=logdir,               # directory for storing logs
    logging_steps=10,
    disable_tqdm=False,
    fp16=True,
    evaluation_strategy='epoch',
    save_strategy='no',
)
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                                  num_labels=2,
                                                                  output_attentions=False,
                                                                  output_hidden_states=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=672271273.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

In [10]:
print('DEVICE USED: ', training_args.device)

DEVICE USED:  cuda:0


In [11]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
1,0.4578,0.486986,0.781638,0.65625,0.563758,0.785047,1.3764,292.789


TrainOutput(global_step=101, training_loss=0.5680497687051792, metrics={'train_runtime': 21.8423, 'train_samples_per_second': 4.624, 'total_flos': 109865810778288.0, 'epoch': 1.0, 'init_mem_cpu_alloc_delta': 2125484032, 'init_mem_gpu_alloc_delta': 669758976, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 14352384, 'train_mem_gpu_alloc_delta': 2052214272, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 867237888})

In [12]:
trainer.evaluate()

{'epoch': 1.0,
 'eval_accuracy': 0.7816377171215881,
 'eval_f1': 0.65625,
 'eval_loss': 0.4869861900806427,
 'eval_mem_cpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 143913472,
 'eval_precision': 0.5637583892617449,
 'eval_recall': 0.7850467289719626,
 'eval_runtime': 1.3681,
 'eval_samples_per_second': 294.569}

In [32]:
pred = trainer.predict(test_dataset)
metrics = pd.DataFrame(compute_metrics(pred), index=[0])
metrics['model'] = model_name
metrics['dataset_type'] = dataset_type
metrics['dataset_size'] = dataset_size
metrics

Unnamed: 0,accuracy,f1,precision,recall,model,dataset_type,dataset_size
0,0.797273,0.66963,0.602667,0.753333,bert-base-multilingual-uncased,chemia,small


In [33]:
model_tmp_save = 'results/test'
model.save_pretrained(model_tmp_save)

In [34]:
DRIVE = 'drive/MyDrive'
p = Path(os.path.join(DRIVE, 'MGR', 'PL', model_name, dataset_type, dataset_size))
p.mkdir(parents=True, exist_ok=True)
os.system(f'python -m transformers.convert_graph_to_onnx --model {model_tmp_save} --framework pt --tokenizer {model_name} {p}/model/model.onnx')
os.system(f'mv {p}/model/model.onnx {p}/model.onnx')
os.system(f'rm -R {p}/model/')
os.system(f'rm -R {model_tmp_save}')
metrics.to_csv(f'{p}/metrics.csv')

log_path = Path(os.path.join(DRIVE, "MGR", "PL", "logs", logdir_name))
log_path.mkdir(parents=True, exist_ok=True)
os.system(f'cp -R {logdir} {log_path}')

In [35]:
dataset = f'{dataset_type}_{dataset_size}'
os.system(f'python -m transformers.convert_graph_to_onnx --model {model_tmp_save} --framework pt --tokenizer {model_name} {p}/{dataset}.onnx')
os.system(f'rm -R {model_tmp_save}')

metrics.to_csv(f'{p}/{dataset}_metrics.csv')
os.system(f'cp -R {logdir} {p}/logs')

In [None]:
%load_ext tensorboard
%tensorboard --logdir logs