In [0]:
from google.colab import drive
drive.mount('/content/gdrive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive/


In [0]:
!unzip -q ./gdrive/My\ Drive/DP_punctuator/DP_final_exams_autoresolver.zip

In [0]:
!unzip -q ./gdrive/My\ Drive/DP_punctuator/dataset.zip

In [0]:
%%capture
!pip install -r requirements.txt

In [0]:
!mkdir data

In [0]:
import json
import re
import attr
import logging
import numpy as np
import os
import shutil
import torch
import torch.nn as nn
from operator import attrgetter


logger = logging.getLogger(__name__)


@attr.s(frozen=True)
class Field(object):
    name = attr.ib()
    dtype = attr.ib()


def save_model(model, optimizer, scheduler, model_path):
    model_dir = os.path.dirname(model_path)
    if not os.path.isdir(model_dir):
        os.makedirs(model_dir)
    torch.save({
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict()
    }, model_path + '_tmp')
    shutil.copyfile(model_path + '_tmp', model_path)
    os.remove(model_path + '_tmp')


def load_tasks(dir_path, task_num=None):
    if not isinstance(task_num, list):
        task_num = [task_num]
    tasks, filenames = [], [os.path.join(dir_path, f) for f in os.listdir(dir_path)]
    for filename in filenames:
        if filename.endswith(".json"):
            with open(filename, encoding='utf-8') as f:
                dt = f.read().encode('utf-8')
                data = json.loads(dt)
                if not isinstance(data, list):
                    data = [data]
                tasks += [d for d in data if 'id' in d and int(d['id']) in task_num]
    return tasks


def fix_spaces(text):
    return re.compile('\s+').sub(' ', text)


class BatchCollector(object):
    def __init__(self, matrix_fields, vector_fields, device):
        self._matrix_fields = matrix_fields
        self._vector_fields = vector_fields
        self._device = device

    def __call__(self, samples):
        def batchify_matrix(get_field, dtype):
            tensor = np.zeros((len(samples), max_length), dtype=dtype)

            for sample_id, sample in enumerate(samples):
                data = get_field(sample)
                tensor[sample_id, :len(data)] = data

            return torch.from_numpy(tensor)

        def batchify_vector(get_field, dtype):
            return torch.as_tensor([get_field(sample) for sample in samples], dtype=dtype)

        max_length = max(len(sample) for sample in samples)

        batch = {
            field.name: batchify_matrix(attrgetter(field.name), field.dtype).to(self._device)
            for field in self._matrix_fields
        }
        for field in self._vector_fields:
            batch[field.name] = batchify_vector(attrgetter(field.name), field.dtype).to(self._device)

        return batch


class AccuracyCounter(object):
    def __init__(self, name=None, masked_values=None):
        self.name = name
        self.correct_count = 0.
        self.total_count = 0.
        self._masked_values = masked_values or []

    def update(self, predictions, labels):
        mask = torch.ones_like(labels, dtype=torch.bool)
        for masked_value in self._masked_values:
            mask &= labels != masked_value

        self.correct_count += ((predictions == labels).float() * mask.float()).sum()
        self.total_count += mask.sum()

    @property
    def value(self):
        return self.correct_count / self.total_count

    def __str__(self):
        prefix = '{}: '.format(self.name) if self.name else 'Acc: '
        return prefix + '{:.2%}'.format(self.value)


class ConsoleProgressBar(object):
    def __init__(self, total):
        self._step = 0
        self._total_steps = total

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        pass

    def update(self):
        self._step += 1

    def set_description(self, text):
        if self._step % 100 == 0:
            logger.info(text + ' | [{}/{}]'.format(self._step, self._total_steps))

    def refresh(self):
        pass


class ModelTrainer(object):
    def __init__(self, model, optimizer, scheduler=None, use_tqdm=False, model_path=None, clip_norm=5.):
        self._model = model
        self._optimizer = optimizer
        self._scheduler = scheduler
        self._use_tqdm = use_tqdm
        self._model_path = model_path
        self._clip_norm = clip_norm
        self._global_step = 0

    def on_epoch_begin(self, is_train, name, batches_count):
        """
        Initializes metrics
        """
        self._epoch_loss = 0.
        self._is_train = is_train
        self._name = name
        self._batches_count = batches_count

        self._model.train(is_train)

    def on_epoch_end(self):
        """
        Outputs final metrics
        """
        return '{:>5s} Loss = {:.5f}'.format(
            self._name, self._epoch_loss / self._batches_count
        )

    def on_batch(self, batch):
        """
        Performs forward and (if is_train) backward pass with optimization, updates metrics
        """

        loss, metrics_info = self.forward_pass(batch)
        self._epoch_loss += loss.item()

        if self._is_train:
            self.backward_pass(loss)
            self._global_step += 1

            if self._global_step % 1000 == 0 and self._model_path:
                save_model(self._model, self._optimizer, self._scheduler, self._model_path)

        return '{:>5s} Loss = {:.5f}, '.format(self._name, loss.item()) + metrics_info

    def forward_pass(self, batch):
        outputs = self._model(batch)
        return outputs['loss'], 'Loss = {:.3f}'.format(outputs['loss'].item())

    def backward_pass(self, loss):
        self._optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self._model.parameters(), self._clip_norm)
        self._optimizer.step()
        if self._scheduler:
            self._scheduler.step()

    def fit(self, train_iter, train_batches_per_epoch=None, epochs_count=1,
            val_iter=None, val_batches_per_epoch=None):

        train_batches_per_epoch = train_batches_per_epoch or len(train_iter)
        if val_iter is not None:
            val_batches_per_epoch = val_batches_per_epoch or len(val_iter)

        try:
            for epoch in range(epochs_count):
                name_prefix = '[{} / {}] '.format(epoch + 1, epochs_count)
                self._do_epoch(iter(train_iter), is_train=True,
                            batches_count=train_batches_per_epoch,
                            name=name_prefix + 'Train:')

                if val_iter is not None:
                    self._do_epoch(iter(val_iter), is_train=False,
                                batches_count=val_batches_per_epoch,
                                name=name_prefix + '  Val:')
        except KeyboardInterrupt:
            logger.info('Early stopping was triggered')

        if self._model_path:
            save_model(self._model, self._optimizer, self._scheduler, self._model_path)

    def _do_epoch(self, data_iter, is_train, batches_count, name=None):
        self.on_epoch_begin(is_train, name, batches_count=batches_count)

        progress_bar_class = ConsoleProgressBar
        if self._use_tqdm:
            try:
                from tqdm import tqdm
                tqdm.get_lock().locks = []

                progress_bar_class = tqdm
            except:
                pass

        with torch.autograd.set_grad_enabled(is_train):
            with progress_bar_class(total=batches_count) as progress_bar:
                try:
                    for _ in range(batches_count):
                        batch = next(data_iter)
                        batch_progress = self.on_batch(batch)

                        progress_bar.update()
                        progress_bar.set_description(batch_progress)
                except StopIteration:
                    pass
                epoch_progress = self.on_epoch_end()
                progress_bar.set_description(epoch_progress)
                progress_bar.refresh()


In [0]:
import re
import numpy as np
import attr
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import transformers
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

CLS_TOKEN = '[CLS]'
SEP_TOKEN = '[SEP]'
MASK_TOKEN = '[MASK]'


@attr.s(frozen=True)
class Example(object):
    tokens = attr.ib()
    token_ids = attr.ib()
    segment_ids = attr.ib()
    mask = attr.ib()
    position = attr.ib()
    label_id = attr.ib(default=None)

    def __len__(self):
        return len(self.token_ids)


@attr.s(frozen=True)
class TrainConfig(object):
    learning_rate = attr.ib(default=1e-5)
    train_batch_size = attr.ib(default=32)
    test_batch_size = attr.ib(default=32)
    epoch_count = attr.ib(default=25)
    warm_up = attr.ib(default=0.1)


def _get_optimizer(model, train_size, config):
    num_total_steps = int(train_size / config.train_batch_size * config.epoch_count)
    num_warmup_steps = int(num_total_steps * config.warm_up)

    optimizer = AdamW(model.parameters(), lr=config.learning_rate, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_total_steps)

    return optimizer, scheduler


def _get_batch_collector(device, is_train=True):
    if is_train:
        vector_fields = [Field('label_id', torch.float32)]
    else:
        vector_fields = []

    vector_fields.append(Field('position', torch.long))

    return BatchCollector(
        matrix_fields=[
            Field('token_ids', np.int64),
            Field('segment_ids', np.int64),
            Field('mask', np.int64),
        ],
        vector_fields=vector_fields,
        device=device
    )


class BertClassifier(nn.Module):
    def __init__(self, bert, output_name, bert_output_dim=768):
        super(BertClassifier, self).__init__()

        self._bert = bert
        self._dropout = nn.Dropout(0.3)
        self._predictor = nn.Linear(bert_output_dim, 1)
        self._output_name = output_name

    def forward(self, batch):
        outputs, pooled_outputs = self._bert(
            input_ids=batch['token_ids'],
            attention_mask=batch['mask'],
            token_type_ids=batch['segment_ids'],
        )

        row_positions = torch.arange(0, outputs.shape[0])
        outputs = outputs[row_positions, batch['position']]

        status_logits = self._predictor(self._dropout(outputs)).squeeze(-1)

        loss = 0.
        if self._output_name + '_id' in batch:
            loss = F.binary_cross_entropy_with_logits(
                status_logits, batch[self._output_name + '_id']
            )

        return {
            self._output_name + '_logits': status_logits,
            'loss': loss
        }


class ClassifierTrainer(ModelTrainer):
    def on_epoch_begin(self, *args, **kwargs):
        super(ClassifierTrainer, self).on_epoch_begin(*args, **kwargs)

        self._accuracies = [AccuracyCounter()]

    def on_epoch_end(self):
        info = super(ClassifierTrainer, self).on_epoch_end()

        return '{}, {}'.format(info, ', '.join(str(score) for score in self._accuracies))

    def forward_pass(self, batch):
        outputs = self._model(batch)

        predictions = (outputs['label_logits'] > 0.).float()
        assert predictions.shape == batch['label_id'].shape

        self._accuracies[0].update(predictions, batch['label_id'])

        info = ', '.join(str(score) for score in self._accuracies)
        return outputs['loss'], info


class Solver(object):
    def __init__(self, model_name):
        self._model_name = model_name
        self._tokenizer = transformers.AutoTokenizer.from_pretrained(self._model_name)
        self._bert = transformers.BertModel.from_pretrained(self._model_name)
        self._device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self._batch_collector = _get_batch_collector(self._device, is_train=False)
        self._model = BertClassifier(self._bert, output_name='label').to(self._device)

    @staticmethod
    def _get_sentences(text):
        for sent in text.split('.'):
            if re.search('\(\d+\)', sent):
                yield fix_spaces(sent.strip()) + '.'

    def _convert_task(self, task):
        text = task["text"].replace("?", ".").replace("\xa0", "")
        sentence = ' '.join(list(self._get_sentences(text)))

        correct = None
        if 'solution' in task:
            if 'correct' in task['solution']:
                solution = task['solution']['correct']
            else:
                solution = task['solution']['correct_variants'][0]
            correct = sorted([int(idx) - 1 for idx in solution])

        tokens, positions = [CLS_TOKEN], []
        prev_match_end = 0
        for match in re.finditer('\(\d+\)', sentence):
            tokens.extend(self._tokenizer.tokenize(sentence[prev_match_end: match.start()].strip()))
            positions.append(len(tokens))
            tokens.append(MASK_TOKEN)
            prev_match_end = match.end()

        tokens.extend(self._tokenizer.tokenize(sentence[prev_match_end:].strip()))
        tokens.append(SEP_TOKEN)
        assert all(tokens[position] == MASK_TOKEN for position in positions)

        token_ids = self._tokenizer.convert_tokens_to_ids(tokens)
        assert token_ids[0] == 101 and token_ids[-1] == 102
        assert len(token_ids) == len(tokens)

        for position_id, position in enumerate(positions):
            label_id = None
            if correct is not None:
                label_id = int(position_id in correct)

            yield Example(
                tokens=tokens,
                token_ids=token_ids,
                segment_ids=[0] * len(token_ids),
                mask=[1] * len(token_ids),
                position=position,
                label_id=label_id
            )

    def _prepare_examples(self, tasks):
        examples = []
        for task in tasks:
            for example in self._convert_task(task):
                examples.append(example)
        return examples

    def fit(self, tasks, test_tasks=None):
        train_examples = self._prepare_examples(tasks)
        test_examples = None
        if test_tasks is not None:
            test_examples = self._prepare_examples(test_tasks)

        config = TrainConfig()
        self._model = BertClassifier(
            transformers.BertModel.from_pretrained(self._model_name),
            output_name='label'
        ).to(self._device)

        batch_collector = _get_batch_collector(self._device, is_train=True)

        train_loader = DataLoader(
            train_examples, batch_size=config.train_batch_size,
            shuffle=True, collate_fn=batch_collector, pin_memory=False
        )
        test_loader, test_batches_per_epoch = None, 0
        if test_examples is not None:
            test_loader = DataLoader(
                test_examples, batch_size=config.test_batch_size,
                shuffle=False, collate_fn=batch_collector, pin_memory=False
            )
            test_batches_per_epoch = int(len(test_examples) / config.test_batch_size)

        optimizer, scheduler = _get_optimizer(self._model, len(train_examples), config)

        trainer = ClassifierTrainer(
            self._model, optimizer, scheduler, use_tqdm=True
        )
        trainer.fit(
            train_iter=train_loader,
            train_batches_per_epoch=int(len(train_examples) / config.train_batch_size),
            val_iter=test_loader, val_batches_per_epoch=test_batches_per_epoch,
            epochs_count=config.epoch_count
        )

    def save(self, path="data/model_solver17.pkl"):
        torch.save(self._model.state_dict(), path)

    def load(self, path="data/model_solver17.pkl"):
        model_checkpoint = torch.load(path, map_location=self._device)
        self._model.load_state_dict(model_checkpoint)
        self._model.eval()

    def predict_from_model(self, task):
        examples = list(self._convert_task(task))
        model_inputs = self._batch_collector(examples)

        with torch.no_grad():
            model_prediction = self._model(model_inputs)['label_logits'].cpu().numpy()

        choices = task['question']['choices']
        prediction = [
            str(choices[ind]['id'])
            for ind, logit in enumerate(model_prediction)
            if logit > 0.
        ]

        return prediction




In [0]:
SEED = 42

def seed_everything(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(SEED)

Train different BERT models over all 17-20 tasks at the same time

In [0]:
model_names = [
  'DeepPavlov/rubert-base-cased',
  'DeepPavlov/rubert-base-cased-sentence',
  'DeepPavlov/bert-base-multilingual-cased-sentence',
  'bert-base-multilingual-cased',
  'bert-base-multilingual-uncased'
]

for model_name in model_names:
    print()
    print()
    print()
    print("------------------------")
    print("train ", model_name)
    print("------------------------")
    print()

    seed_everything(SEED)

    solver = Solver(model_name)

    train_tasks, test_tasks = [], []
    for task_id in range(17, 21):
        train_tasks.extend(load_tasks('dataset/train/', task_num=task_id))
        test_tasks.extend(load_tasks('dataset/test/', task_num=task_id))

    seed_everything(SEED)

    solver.fit(train_tasks, test_tasks)

    model_fn = "data/model_" + model_name.replace("/", "_") + "_all_task.pkl"

    solver.save(model_fn)

    # print()
    # print("predict on test datases")
    
    # solver = Solver(model_name)
    # solver.load(model_fn)
    # task = load_tasks('dataset/test/', task_num=17)[0]
    # print(task)
    # print(solver.predict_from_model(task))





------------------------
train  DeepPavlov/rubert-base-cased
------------------------



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=642.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1649718.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=711456796.0, style=ProgressStyle(descri…




[1 / 25] Train: Loss = 0.39340, Acc: 79.25%: 100%|██████████| 591/591 [02:44<00:00,  3.59it/s]
[1 / 25]   Val: Loss = 0.19206, Acc: 89.91%: 100%|██████████| 162/162 [00:10<00:00, 15.30it/s]
[2 / 25] Train: Loss = 0.08782, Acc: 96.43%: 100%|██████████| 591/591 [02:44<00:00,  3.59it/s]
[2 / 25]   Val: Loss = 0.08654, Acc: 97.11%: 100%|██████████| 162/162 [00:10<00:00, 15.29it/s]
[3 / 25] Train: Loss = 0.02748, Acc: 99.10%: 100%|██████████| 591/591 [02:43<00:00,  3.61it/s]
[3 / 25]   Val: Loss = 0.08873, Acc: 97.63%: 100%|██████████| 162/162 [00:10<00:00, 15.32it/s]
[4 / 25] Train: Loss = 0.01425, Acc: 99.66%: 100%|██████████| 591/591 [02:45<00:00,  3.57it/s]
[4 / 25]   Val: Loss = 0.05960, Acc: 98.57%: 100%|██████████| 162/162 [00:10<00:00, 15.28it/s]
[5 / 25] Train: Loss = 0.00601, Acc: 99.86%: 100%|██████████| 591/591 [02:43<00:00,  3.61it/s]
[5 / 25]   Val: Loss = 0.07584, Acc: 98.53%: 100%|██████████| 162/162 [00:10<00:00, 15.33it/s]
[6 / 25] Train: Loss = 0.00378, Acc: 99.91%: 100%|




------------------------
train  DeepPavlov/rubert-base-cased-sentence
------------------------



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=642.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1649718.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=24.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=711456784.0, style=ProgressStyle(descri…




[1 / 25] Train: Loss = 0.44212, Acc: 75.58%: 100%|██████████| 591/591 [02:37<00:00,  3.74it/s]
[1 / 25]   Val: Loss = 0.21562, Acc: 88.37%: 100%|██████████| 162/162 [00:09<00:00, 16.21it/s]
[2 / 25] Train: Loss = 0.12678, Acc: 93.64%: 100%|██████████| 591/591 [02:38<00:00,  3.74it/s]
[2 / 25]   Val: Loss = 0.14938, Acc: 92.63%: 100%|██████████| 162/162 [00:09<00:00, 16.21it/s]
[3 / 25] Train: Loss = 0.06116, Acc: 97.62%: 100%|██████████| 591/591 [02:37<00:00,  3.75it/s]
[3 / 25]   Val: Loss = 0.05709, Acc: 98.01%: 100%|██████████| 162/162 [00:09<00:00, 16.22it/s]
[4 / 25] Train: Loss = 0.02377, Acc: 99.27%: 100%|██████████| 591/591 [02:39<00:00,  3.71it/s]
[4 / 25]   Val: Loss = 0.04632, Acc: 98.63%: 100%|██████████| 162/162 [00:09<00:00, 16.21it/s]
[5 / 25] Train: Loss = 0.01026, Acc: 99.70%: 100%|██████████| 591/591 [02:37<00:00,  3.74it/s]
[5 / 25]   Val: Loss = 0.06719, Acc: 98.50%: 100%|██████████| 162/162 [00:09<00:00, 16.21it/s]
[6 / 25] Train: Loss = 0.00607, Acc: 99.86%: 100%|




------------------------
train  DeepPavlov/bert-base-multilingual-cased-sentence
------------------------



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=642.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=24.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=711456796.0, style=ProgressStyle(descri…




[1 / 25] Train: Loss = 0.42581, Acc: 76.35%: 100%|██████████| 591/591 [03:37<00:00,  2.72it/s]
[1 / 25]   Val: Loss = 0.22164, Acc: 88.62%: 100%|██████████| 162/162 [00:14<00:00, 10.81it/s]
[2 / 25] Train: Loss = 0.12503, Acc: 93.95%: 100%|██████████| 591/591 [03:37<00:00,  2.71it/s]
[2 / 25]   Val: Loss = 0.14781, Acc: 92.01%: 100%|██████████| 162/162 [00:14<00:00, 10.81it/s]
[3 / 25] Train: Loss = 0.07556, Acc: 96.88%: 100%|██████████| 591/591 [03:36<00:00,  2.73it/s]
[3 / 25]   Val: Loss = 0.07690, Acc: 97.01%: 100%|██████████| 162/162 [00:14<00:00, 10.82it/s]
[4 / 25] Train: Loss = 0.03171, Acc: 98.85%: 100%|██████████| 591/591 [03:39<00:00,  2.70it/s]
[4 / 25]   Val: Loss = 0.06850, Acc: 97.72%: 100%|██████████| 162/162 [00:14<00:00, 10.80it/s]
[5 / 25] Train: Loss = 0.01517, Acc: 99.55%: 100%|██████████| 591/591 [03:37<00:00,  2.72it/s]
[5 / 25]   Val: Loss = 0.06605, Acc: 98.15%: 100%|██████████| 162/162 [00:15<00:00, 10.78it/s]
[6 / 25] Train: Loss = 0.01020, Acc: 99.70%: 100%|




------------------------
train  bert-base-multilingual-cased
------------------------



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…




[1 / 25] Train: Loss = 0.35059, Acc: 80.94%: 100%|██████████| 591/591 [03:37<00:00,  2.72it/s]
[1 / 25]   Val: Loss = 0.16202, Acc: 91.03%: 100%|██████████| 162/162 [00:14<00:00, 10.82it/s]
[2 / 25] Train: Loss = 0.08142, Acc: 96.49%: 100%|██████████| 591/591 [03:38<00:00,  2.71it/s]
[2 / 25]   Val: Loss = 0.08927, Acc: 96.93%: 100%|██████████| 162/162 [00:14<00:00, 10.83it/s]
[3 / 25] Train: Loss = 0.03879, Acc: 98.65%: 100%|██████████| 591/591 [03:36<00:00,  2.73it/s]
[3 / 25]   Val: Loss = 0.06492, Acc: 97.63%: 100%|██████████| 162/162 [00:14<00:00, 10.83it/s]
[4 / 25] Train: Loss = 0.02027, Acc: 99.36%: 100%|██████████| 591/591 [03:38<00:00,  2.70it/s]
[4 / 25]   Val: Loss = 0.05956, Acc: 97.94%: 100%|██████████| 162/162 [00:14<00:00, 10.82it/s]
[5 / 25] Train: Loss = 0.01184, Acc: 99.67%: 100%|██████████| 591/591 [03:36<00:00,  2.72it/s]
[5 / 25]   Val: Loss = 0.08047, Acc: 98.09%: 100%|██████████| 162/162 [00:14<00:00, 10.80it/s]
[6 / 25] Train: Loss = 0.00555, Acc: 99.87%: 100%|




------------------------
train  bert-base-multilingual-uncased
------------------------



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=871891.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=672271273.0, style=ProgressStyle(descri…




[1 / 25] Train: Loss = 0.39902, Acc: 79.16%: 100%|██████████| 591/591 [03:29<00:00,  2.83it/s]
[1 / 25]   Val: Loss = 0.18861, Acc: 90.28%: 100%|██████████| 162/162 [00:14<00:00, 11.16it/s]
[2 / 25] Train: Loss = 0.09614, Acc: 96.28%: 100%|██████████| 591/591 [03:29<00:00,  2.82it/s]
[2 / 25]   Val: Loss = 0.09507, Acc: 96.32%: 100%|██████████| 162/162 [00:14<00:00, 11.17it/s]
[3 / 25] Train: Loss = 0.03591, Acc: 98.80%: 100%|██████████| 591/591 [03:29<00:00,  2.83it/s]
[3 / 25]   Val: Loss = 0.06537, Acc: 97.96%: 100%|██████████| 162/162 [00:14<00:00, 11.17it/s]
[4 / 25] Train: Loss = 0.01768, Acc: 99.48%: 100%|██████████| 591/591 [03:29<00:00,  2.83it/s]
[4 / 25]   Val: Loss = 0.10214, Acc: 97.51%: 100%|██████████| 162/162 [00:14<00:00, 11.16it/s]
[5 / 25] Train: Loss = 0.01091, Acc: 99.74%: 100%|██████████| 591/591 [03:29<00:00,  2.82it/s]
[5 / 25]   Val: Loss = 0.08220, Acc: 97.99%: 100%|██████████| 162/162 [00:14<00:00, 11.17it/s]
[6 / 25] Train: Loss = 0.00712, Acc: 99.79%: 100%|

Train different BERT models over 17-20 tasks separately

In [0]:
model_names = [
  'DeepPavlov/rubert-base-cased',
  'DeepPavlov/rubert-base-cased-sentence',
  'DeepPavlov/bert-base-multilingual-cased-sentence',
  'bert-base-multilingual-cased',
  'bert-base-multilingual-uncased'
]

def train_model(model_name):
    print()
    print()
    print()
    print("------------------------")
    print("train ", model_name)
    print("------------------------")
    print()

    for task_id in range(17, 21):
        print()
        print("------------------------")
        print("train task", task_id)
        print("------------------------")
        print()

        seed_everything(SEED)

        solver = Solver(model_name)

        train_tasks, test_tasks = [], []
        train_tasks.extend(load_tasks('dataset/train/', task_num=task_id))
        test_tasks.extend(load_tasks('dataset/test/', task_num=task_id))

        seed_everything(SEED)

        solver.fit(train_tasks, test_tasks)

        model_fn = "data/model_" + model_name.replace("/", "_") + "_task_{}.pkl".format(task_id)

        solver.save(model_fn)

        # print()
        # print("predict on test datases")
        
        # solver = Solver(model_name)
        # solver.load(model_fn)
        # task = load_tasks('dataset/test/', task_num=task_id)[0]
        # print(task)
        # print(solver.predict_from_model(task))


In [13]:
train_model('DeepPavlov/rubert-base-cased')




------------------------
train  DeepPavlov/rubert-base-cased
------------------------


------------------------
train task 17
------------------------



[1 / 25] Train: Loss = 0.50523, Acc: 71.53%: 100%|██████████| 144/144 [00:36<00:00,  3.91it/s]
[1 / 25]   Val: Loss = 0.28504, Acc: 85.58%: 100%|██████████| 39/39 [00:02<00:00, 16.35it/s]
[2 / 25] Train: Loss = 0.13634, Acc: 94.79%: 100%|██████████| 144/144 [00:36<00:00,  3.92it/s]
[2 / 25]   Val: Loss = 0.08096, Acc: 97.28%: 100%|██████████| 39/39 [00:02<00:00, 16.28it/s]
[3 / 25] Train: Loss = 0.03890, Acc: 98.89%: 100%|██████████| 144/144 [00:36<00:00,  3.89it/s]
[3 / 25]   Val: Loss = 0.06450, Acc: 97.76%: 100%|██████████| 39/39 [00:02<00:00, 16.30it/s]
[4 / 25] Train: Loss = 0.01302, Acc: 99.67%: 100%|██████████| 144/144 [00:36<00:00,  3.90it/s]
[4 / 25]   Val: Loss = 0.04520, Acc: 98.72%: 100%|██████████| 39/39 [00:02<00:00, 16.31it/s]
[5 / 25] Train: Loss = 0.00389, Acc: 99.89%: 100%|██████████| 144/144 [00:36<00:00,  3.98it/s]
[5 / 25]   Val: Loss = 0.05319, Acc: 98.88%: 100%|██████████| 39/39 [00:02<00:00, 16.33it/s]
[6 / 25] Train: Loss = 0.00195, Acc: 99.96%: 100%|██████████


------------------------
train task 18
------------------------



[1 / 25] Train: Loss = 0.52843, Acc: 70.11%: 100%|██████████| 154/154 [00:54<00:00,  2.85it/s]
[1 / 25]   Val: Loss = 0.38064, Acc: 79.62%: 100%|██████████| 46/46 [00:03<00:00, 12.30it/s]
[2 / 25] Train: Loss = 0.17354, Acc: 92.05%: 100%|██████████| 154/154 [00:54<00:00,  2.81it/s]
[2 / 25]   Val: Loss = 0.18877, Acc: 92.26%: 100%|██████████| 46/46 [00:03<00:00, 12.31it/s]
[3 / 25] Train: Loss = 0.06257, Acc: 97.83%: 100%|██████████| 154/154 [00:54<00:00,  2.85it/s]
[3 / 25]   Val: Loss = 0.15717, Acc: 94.57%: 100%|██████████| 46/46 [00:03<00:00, 12.32it/s]
[4 / 25] Train: Loss = 0.02081, Acc: 99.31%: 100%|██████████| 154/154 [00:54<00:00,  2.83it/s]
[4 / 25]   Val: Loss = 0.17075, Acc: 94.90%: 100%|██████████| 46/46 [00:03<00:00, 12.31it/s]
[5 / 25] Train: Loss = 0.01311, Acc: 99.66%: 100%|██████████| 154/154 [00:54<00:00,  2.82it/s]
[5 / 25]   Val: Loss = 0.16081, Acc: 95.45%: 100%|██████████| 46/46 [00:03<00:00, 12.32it/s]
[6 / 25] Train: Loss = 0.00259, Acc: 99.94%: 100%|██████████


------------------------
train task 19
------------------------



[1 / 25] Train: Loss = 0.51963, Acc: 71.44%: 100%|██████████| 144/144 [00:33<00:00,  4.24it/s]
[1 / 25]   Val: Loss = 0.27026, Acc: 89.50%: 100%|██████████| 36/36 [00:02<00:00, 16.10it/s]
[2 / 25] Train: Loss = 0.11807, Acc: 96.18%: 100%|██████████| 144/144 [00:33<00:00,  4.27it/s]
[2 / 25]   Val: Loss = 0.07341, Acc: 97.22%: 100%|██████████| 36/36 [00:02<00:00, 16.20it/s]
[3 / 25] Train: Loss = 0.03800, Acc: 98.87%: 100%|██████████| 144/144 [00:33<00:00,  4.28it/s]
[3 / 25]   Val: Loss = 0.05286, Acc: 98.44%: 100%|██████████| 36/36 [00:02<00:00, 16.13it/s]
[4 / 25] Train: Loss = 0.01157, Acc: 99.70%: 100%|██████████| 144/144 [00:33<00:00,  4.28it/s]
[4 / 25]   Val: Loss = 0.04971, Acc: 98.96%: 100%|██████████| 36/36 [00:02<00:00, 16.18it/s]
[5 / 25] Train: Loss = 0.00459, Acc: 99.91%: 100%|██████████| 144/144 [00:33<00:00,  4.28it/s]
[5 / 25]   Val: Loss = 0.05061, Acc: 98.96%: 100%|██████████| 36/36 [00:02<00:00, 16.23it/s]
[6 / 25] Train: Loss = 0.00260, Acc: 99.98%: 100%|██████████


------------------------
train task 20
------------------------



[1 / 25] Train: Loss = 0.50912, Acc: 72.41%: 100%|██████████| 147/147 [00:36<00:00,  4.06it/s]
[1 / 25]   Val: Loss = 0.24437, Acc: 90.70%: 100%|██████████| 40/40 [00:02<00:00, 17.01it/s]
[2 / 25] Train: Loss = 0.14330, Acc: 93.96%: 100%|██████████| 147/147 [00:36<00:00,  4.05it/s]
[2 / 25]   Val: Loss = 0.08797, Acc: 96.09%: 100%|██████████| 40/40 [00:02<00:00, 16.94it/s]
[3 / 25] Train: Loss = 0.04494, Acc: 98.41%: 100%|██████████| 147/147 [00:36<00:00,  4.05it/s]
[3 / 25]   Val: Loss = 0.03925, Acc: 98.67%: 100%|██████████| 40/40 [00:02<00:00, 16.90it/s]
[4 / 25] Train: Loss = 0.01088, Acc: 99.66%: 100%|██████████| 147/147 [00:36<00:00,  4.03it/s]
[4 / 25]   Val: Loss = 0.03381, Acc: 99.06%: 100%|██████████| 40/40 [00:02<00:00, 16.92it/s]
[5 / 25] Train: Loss = 0.00478, Acc: 99.89%: 100%|██████████| 147/147 [00:36<00:00,  4.07it/s]
[5 / 25]   Val: Loss = 0.02516, Acc: 99.14%: 100%|██████████| 40/40 [00:02<00:00, 17.07it/s]
[6 / 25] Train: Loss = 0.00118, Acc: 100.00%: 100%|█████████

In [14]:
train_model('DeepPavlov/rubert-base-cased-sentence')




------------------------
train  DeepPavlov/rubert-base-cased-sentence
------------------------


------------------------
train task 17
------------------------



[1 / 25] Train: Loss = 0.57504, Acc: 66.10%: 100%|██████████| 144/144 [00:34<00:00,  4.12it/s]
[1 / 25]   Val: Loss = 0.35988, Acc: 81.73%: 100%|██████████| 39/39 [00:02<00:00, 17.50it/s]
[2 / 25] Train: Loss = 0.20453, Acc: 90.43%: 100%|██████████| 144/144 [00:34<00:00,  4.14it/s]
[2 / 25]   Val: Loss = 0.19712, Acc: 91.51%: 100%|██████████| 39/39 [00:02<00:00, 17.48it/s]
[3 / 25] Train: Loss = 0.09865, Acc: 95.83%: 100%|██████████| 144/144 [00:34<00:00,  4.12it/s]
[3 / 25]   Val: Loss = 0.12459, Acc: 95.51%: 100%|██████████| 39/39 [00:02<00:00, 17.42it/s]
[4 / 25] Train: Loss = 0.03186, Acc: 99.00%: 100%|██████████| 144/144 [00:34<00:00,  4.13it/s]
[4 / 25]   Val: Loss = 0.05615, Acc: 98.24%: 100%|██████████| 39/39 [00:02<00:00, 17.45it/s]
[5 / 25] Train: Loss = 0.01185, Acc: 99.74%: 100%|██████████| 144/144 [00:34<00:00,  4.21it/s]
[5 / 25]   Val: Loss = 0.05452, Acc: 98.32%: 100%|██████████| 39/39 [00:02<00:00, 17.46it/s]
[6 / 25] Train: Loss = 0.00610, Acc: 99.87%: 100%|██████████


------------------------
train task 18
------------------------



[1 / 25] Train: Loss = 0.59016, Acc: 65.48%: 100%|██████████| 154/154 [00:51<00:00,  2.97it/s]
[1 / 25]   Val: Loss = 0.47616, Acc: 75.88%: 100%|██████████| 46/46 [00:03<00:00, 12.75it/s]
[2 / 25] Train: Loss = 0.22471, Acc: 89.89%: 100%|██████████| 154/154 [00:52<00:00,  2.92it/s]
[2 / 25]   Val: Loss = 0.29634, Acc: 85.73%: 100%|██████████| 46/46 [00:03<00:00, 12.72it/s]
[3 / 25] Train: Loss = 0.11009, Acc: 95.19%: 100%|██████████| 154/154 [00:52<00:00,  2.95it/s]
[3 / 25]   Val: Loss = 0.22691, Acc: 90.83%: 100%|██████████| 46/46 [00:03<00:00, 12.76it/s]
[4 / 25] Train: Loss = 0.05784, Acc: 98.23%: 100%|██████████| 154/154 [00:52<00:00,  2.93it/s]
[4 / 25]   Val: Loss = 0.23518, Acc: 91.51%: 100%|██████████| 46/46 [00:03<00:00, 12.69it/s]
[5 / 25] Train: Loss = 0.02476, Acc: 99.13%: 100%|██████████| 154/154 [00:52<00:00,  2.92it/s]
[5 / 25]   Val: Loss = 0.21404, Acc: 92.73%: 100%|██████████| 46/46 [00:03<00:00, 12.76it/s]
[6 / 25] Train: Loss = 0.01426, Acc: 99.43%: 100%|██████████


------------------------
train task 19
------------------------



[1 / 25] Train: Loss = 0.59833, Acc: 65.45%: 100%|██████████| 144/144 [00:31<00:00,  4.53it/s]
[1 / 25]   Val: Loss = 0.37729, Acc: 85.16%: 100%|██████████| 36/36 [00:02<00:00, 17.78it/s]
[2 / 25] Train: Loss = 0.16248, Acc: 94.62%: 100%|██████████| 144/144 [00:31<00:00,  4.51it/s]
[2 / 25]   Val: Loss = 0.10404, Acc: 96.61%: 100%|██████████| 36/36 [00:02<00:00, 17.70it/s]
[3 / 25] Train: Loss = 0.05205, Acc: 98.42%: 100%|██████████| 144/144 [00:31<00:00,  4.53it/s]
[3 / 25]   Val: Loss = 0.06762, Acc: 97.92%: 100%|██████████| 36/36 [00:02<00:00, 17.75it/s]
[4 / 25] Train: Loss = 0.02667, Acc: 99.24%: 100%|██████████| 144/144 [00:31<00:00,  4.55it/s]
[4 / 25]   Val: Loss = 0.04401, Acc: 98.52%: 100%|██████████| 36/36 [00:02<00:00, 17.72it/s]
[5 / 25] Train: Loss = 0.00747, Acc: 99.76%: 100%|██████████| 144/144 [00:31<00:00,  4.56it/s]
[5 / 25]   Val: Loss = 0.06485, Acc: 98.44%: 100%|██████████| 36/36 [00:02<00:00, 17.73it/s]
[6 / 25] Train: Loss = 0.00298, Acc: 99.93%: 100%|██████████


------------------------
train task 20
------------------------



[1 / 25] Train: Loss = 0.59306, Acc: 66.11%: 100%|██████████| 147/147 [00:34<00:00,  4.29it/s]
[1 / 25]   Val: Loss = 0.33068, Acc: 86.17%: 100%|██████████| 40/40 [00:02<00:00, 17.82it/s]
[2 / 25] Train: Loss = 0.18268, Acc: 92.90%: 100%|██████████| 147/147 [00:34<00:00,  4.20it/s]
[2 / 25]   Val: Loss = 0.13889, Acc: 94.30%: 100%|██████████| 40/40 [00:02<00:00, 18.00it/s]
[3 / 25] Train: Loss = 0.11277, Acc: 95.56%: 100%|██████████| 147/147 [00:34<00:00,  4.29it/s]
[3 / 25]   Val: Loss = 0.11765, Acc: 95.23%: 100%|██████████| 40/40 [00:02<00:00, 17.89it/s]
[4 / 25] Train: Loss = 0.06811, Acc: 97.15%: 100%|██████████| 147/147 [00:34<00:00,  4.24it/s]
[4 / 25]   Val: Loss = 0.09449, Acc: 96.48%: 100%|██████████| 40/40 [00:02<00:00, 17.99it/s]
[5 / 25] Train: Loss = 0.03515, Acc: 99.00%: 100%|██████████| 147/147 [00:34<00:00,  4.30it/s]
[5 / 25]   Val: Loss = 0.07184, Acc: 97.89%: 100%|██████████| 40/40 [00:02<00:00, 17.90it/s]
[6 / 25] Train: Loss = 0.01561, Acc: 99.66%: 100%|██████████

In [15]:
train_model('DeepPavlov/bert-base-multilingual-cased-sentence')




------------------------
train  DeepPavlov/bert-base-multilingual-cased-sentence
------------------------


------------------------
train task 17
------------------------



[1 / 25] Train: Loss = 0.56329, Acc: 66.51%: 100%|██████████| 144/144 [00:48<00:00,  2.94it/s]
[1 / 25]   Val: Loss = 0.34063, Acc: 83.33%: 100%|██████████| 39/39 [00:03<00:00, 11.69it/s]
[2 / 25] Train: Loss = 0.20736, Acc: 90.78%: 100%|██████████| 144/144 [00:48<00:00,  2.95it/s]
[2 / 25]   Val: Loss = 0.15846, Acc: 93.27%: 100%|██████████| 39/39 [00:03<00:00, 11.73it/s]
[3 / 25] Train: Loss = 0.08527, Acc: 96.79%: 100%|██████████| 144/144 [00:48<00:00,  2.96it/s]
[3 / 25]   Val: Loss = 0.09430, Acc: 96.39%: 100%|██████████| 39/39 [00:03<00:00, 11.68it/s]
[4 / 25] Train: Loss = 0.03393, Acc: 98.83%: 100%|██████████| 144/144 [00:48<00:00,  2.95it/s]
[4 / 25]   Val: Loss = 0.07696, Acc: 97.68%: 100%|██████████| 39/39 [00:03<00:00, 11.64it/s]
[5 / 25] Train: Loss = 0.01450, Acc: 99.52%: 100%|██████████| 144/144 [00:48<00:00,  2.97it/s]
[5 / 25]   Val: Loss = 0.08542, Acc: 96.79%: 100%|██████████| 39/39 [00:03<00:00, 11.68it/s]
[6 / 25] Train: Loss = 0.00269, Acc: 99.96%: 100%|██████████


------------------------
train task 18
------------------------



[1 / 25] Train: Loss = 0.57338, Acc: 66.52%: 100%|██████████| 154/154 [01:09<00:00,  2.22it/s]
[1 / 25]   Val: Loss = 0.43313, Acc: 76.56%: 100%|██████████| 46/46 [00:05<00:00,  9.12it/s]
[2 / 25] Train: Loss = 0.22597, Acc: 89.12%: 100%|██████████| 154/154 [01:10<00:00,  2.19it/s]
[2 / 25]   Val: Loss = 0.28418, Acc: 84.38%: 100%|██████████| 46/46 [00:05<00:00,  9.10it/s]
[3 / 25] Train: Loss = 0.12992, Acc: 93.85%: 100%|██████████| 154/154 [01:09<00:00,  2.22it/s]
[3 / 25]   Val: Loss = 0.28846, Acc: 88.18%: 100%|██████████| 46/46 [00:05<00:00,  9.11it/s]
[4 / 25] Train: Loss = 0.06758, Acc: 97.40%: 100%|██████████| 154/154 [01:10<00:00,  2.18it/s]
[4 / 25]   Val: Loss = 0.21682, Acc: 91.85%: 100%|██████████| 46/46 [00:05<00:00,  9.13it/s]
[5 / 25] Train: Loss = 0.03113, Acc: 99.03%: 100%|██████████| 154/154 [01:10<00:00,  2.19it/s]
[5 / 25]   Val: Loss = 0.26643, Acc: 91.64%: 100%|██████████| 46/46 [00:05<00:00,  9.06it/s]
[6 / 25] Train: Loss = 0.01572, Acc: 99.55%: 100%|██████████


------------------------
train task 19
------------------------



[1 / 25] Train: Loss = 0.57749, Acc: 66.69%: 100%|██████████| 144/144 [00:46<00:00,  3.09it/s]
[1 / 25]   Val: Loss = 0.31866, Acc: 86.72%: 100%|██████████| 36/36 [00:03<00:00, 11.39it/s]
[2 / 25] Train: Loss = 0.15733, Acc: 94.66%: 100%|██████████| 144/144 [00:46<00:00,  3.10it/s]
[2 / 25]   Val: Loss = 0.11945, Acc: 96.53%: 100%|██████████| 36/36 [00:03<00:00, 11.33it/s]
[3 / 25] Train: Loss = 0.06884, Acc: 97.85%: 100%|██████████| 144/144 [00:46<00:00,  3.08it/s]
[3 / 25]   Val: Loss = 0.06231, Acc: 98.26%: 100%|██████████| 36/36 [00:03<00:00, 11.35it/s]
[4 / 25] Train: Loss = 0.02406, Acc: 99.26%: 100%|██████████| 144/144 [00:46<00:00,  3.10it/s]
[4 / 25]   Val: Loss = 0.06520, Acc: 98.26%: 100%|██████████| 36/36 [00:03<00:00, 11.46it/s]
[5 / 25] Train: Loss = 0.01555, Acc: 99.54%: 100%|██████████| 144/144 [00:46<00:00,  3.11it/s]
[5 / 25]   Val: Loss = 0.06250, Acc: 98.61%: 100%|██████████| 36/36 [00:03<00:00, 11.44it/s]
[6 / 25] Train: Loss = 0.00510, Acc: 99.87%: 100%|██████████


------------------------
train task 20
------------------------



[1 / 25] Train: Loss = 0.55597, Acc: 68.92%: 100%|██████████| 147/147 [00:49<00:00,  2.97it/s]
[1 / 25]   Val: Loss = 0.29018, Acc: 87.66%: 100%|██████████| 40/40 [00:03<00:00, 11.55it/s]
[2 / 25] Train: Loss = 0.17660, Acc: 93.11%: 100%|██████████| 147/147 [00:48<00:00,  3.02it/s]
[2 / 25]   Val: Loss = 0.16178, Acc: 93.59%: 100%|██████████| 40/40 [00:03<00:00, 11.52it/s]
[3 / 25] Train: Loss = 0.11083, Acc: 95.37%: 100%|██████████| 147/147 [00:48<00:00,  3.01it/s]
[3 / 25]   Val: Loss = 0.13855, Acc: 93.91%: 100%|██████████| 40/40 [00:03<00:00, 11.54it/s]
[4 / 25] Train: Loss = 0.08399, Acc: 96.34%: 100%|██████████| 147/147 [00:49<00:00,  2.99it/s]
[4 / 25]   Val: Loss = 0.09640, Acc: 96.09%: 100%|██████████| 40/40 [00:03<00:00, 11.61it/s]
[5 / 25] Train: Loss = 0.04887, Acc: 98.24%: 100%|██████████| 147/147 [00:48<00:00,  3.03it/s]
[5 / 25]   Val: Loss = 0.05898, Acc: 97.89%: 100%|██████████| 40/40 [00:03<00:00, 11.67it/s]
[6 / 25] Train: Loss = 0.01918, Acc: 99.45%: 100%|██████████

In [16]:
train_model('bert-base-multilingual-cased')




------------------------
train  bert-base-multilingual-cased
------------------------


------------------------
train task 17
------------------------



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…




[1 / 25] Train: Loss = 0.47550, Acc: 73.44%: 100%|██████████| 144/144 [00:49<00:00,  2.92it/s]
[1 / 25]   Val: Loss = 0.25265, Acc: 87.50%: 100%|██████████| 39/39 [00:03<00:00, 11.68it/s]
[2 / 25] Train: Loss = 0.11823, Acc: 95.29%: 100%|██████████| 144/144 [00:48<00:00,  2.94it/s]
[2 / 25]   Val: Loss = 0.08473, Acc: 96.23%: 100%|██████████| 39/39 [00:03<00:00, 11.64it/s]
[3 / 25] Train: Loss = 0.04962, Acc: 98.24%: 100%|██████████| 144/144 [00:48<00:00,  2.95it/s]
[3 / 25]   Val: Loss = 0.06307, Acc: 98.16%: 100%|██████████| 39/39 [00:03<00:00, 11.67it/s]
[4 / 25] Train: Loss = 0.01396, Acc: 99.54%: 100%|██████████| 144/144 [00:48<00:00,  2.95it/s]
[4 / 25]   Val: Loss = 0.07737, Acc: 97.60%: 100%|██████████| 39/39 [00:03<00:00, 11.68it/s]
[5 / 25] Train: Loss = 0.00471, Acc: 99.87%: 100%|██████████| 144/144 [00:48<00:00,  2.97it/s]
[5 / 25]   Val: Loss = 0.06109, Acc: 98.40%: 100%|██████████| 39/39 [00:03<00:00, 11.69it/s]
[6 / 25] Train: Loss = 0.00298, Acc: 99.93%: 100%|██████████


------------------------
train task 18
------------------------



[1 / 25] Train: Loss = 0.49979, Acc: 70.86%: 100%|██████████| 154/154 [01:09<00:00,  2.22it/s]
[1 / 25]   Val: Loss = 0.35377, Acc: 79.01%: 100%|██████████| 46/46 [00:05<00:00,  9.14it/s]
[2 / 25] Train: Loss = 0.18390, Acc: 90.89%: 100%|██████████| 154/154 [01:10<00:00,  2.20it/s]
[2 / 25]   Val: Loss = 0.23006, Acc: 88.99%: 100%|██████████| 46/46 [00:05<00:00,  9.15it/s]
[3 / 25] Train: Loss = 0.09335, Acc: 96.14%: 100%|██████████| 154/154 [01:09<00:00,  2.23it/s]
[3 / 25]   Val: Loss = 0.22592, Acc: 91.85%: 100%|██████████| 46/46 [00:05<00:00,  9.16it/s]
[4 / 25] Train: Loss = 0.04156, Acc: 98.46%: 100%|██████████| 154/154 [01:10<00:00,  2.19it/s]
[4 / 25]   Val: Loss = 0.21382, Acc: 93.21%: 100%|██████████| 46/46 [00:05<00:00,  9.17it/s]
[5 / 25] Train: Loss = 0.01722, Acc: 99.43%: 100%|██████████| 154/154 [01:10<00:00,  2.20it/s]
[5 / 25]   Val: Loss = 0.23153, Acc: 93.68%: 100%|██████████| 46/46 [00:05<00:00,  9.13it/s]
[6 / 25] Train: Loss = 0.01042, Acc: 99.68%: 100%|██████████


------------------------
train task 19
------------------------



[1 / 25] Train: Loss = 0.48052, Acc: 73.81%: 100%|██████████| 144/144 [00:46<00:00,  3.10it/s]
[1 / 25]   Val: Loss = 0.17394, Acc: 94.36%: 100%|██████████| 36/36 [00:03<00:00, 11.45it/s]
[2 / 25] Train: Loss = 0.08518, Acc: 97.22%: 100%|██████████| 144/144 [00:46<00:00,  3.12it/s]
[2 / 25]   Val: Loss = 0.06940, Acc: 98.00%: 100%|██████████| 36/36 [00:03<00:00, 11.44it/s]
[3 / 25] Train: Loss = 0.03191, Acc: 99.20%: 100%|██████████| 144/144 [00:46<00:00,  3.11it/s]
[3 / 25]   Val: Loss = 0.05452, Acc: 98.35%: 100%|██████████| 36/36 [00:03<00:00, 11.40it/s]
[4 / 25] Train: Loss = 0.01751, Acc: 99.46%: 100%|██████████| 144/144 [00:46<00:00,  3.10it/s]
[4 / 25]   Val: Loss = 0.04525, Acc: 99.05%: 100%|██████████| 36/36 [00:03<00:00, 11.42it/s]
[5 / 25] Train: Loss = 0.01051, Acc: 99.72%: 100%|██████████| 144/144 [00:46<00:00,  3.11it/s]
[5 / 25]   Val: Loss = 0.03955, Acc: 98.96%: 100%|██████████| 36/36 [00:03<00:00, 11.41it/s]
[6 / 25] Train: Loss = 0.00318, Acc: 99.87%: 100%|██████████


------------------------
train task 20
------------------------



[1 / 25] Train: Loss = 0.48359, Acc: 73.98%: 100%|██████████| 147/147 [00:49<00:00,  3.00it/s]
[1 / 25]   Val: Loss = 0.19953, Acc: 92.19%: 100%|██████████| 40/40 [00:03<00:00, 11.62it/s]
[2 / 25] Train: Loss = 0.13101, Acc: 94.86%: 100%|██████████| 147/147 [00:48<00:00,  3.03it/s]
[2 / 25]   Val: Loss = 0.13892, Acc: 95.31%: 100%|██████████| 40/40 [00:03<00:00, 11.56it/s]
[3 / 25] Train: Loss = 0.06509, Acc: 97.36%: 100%|██████████| 147/147 [00:48<00:00,  3.02it/s]
[3 / 25]   Val: Loss = 0.04986, Acc: 98.28%: 100%|██████████| 40/40 [00:03<00:00, 11.60it/s]
[4 / 25] Train: Loss = 0.01929, Acc: 99.30%: 100%|██████████| 147/147 [00:49<00:00,  2.99it/s]
[4 / 25]   Val: Loss = 0.03971, Acc: 98.59%: 100%|██████████| 40/40 [00:03<00:00, 11.62it/s]
[5 / 25] Train: Loss = 0.00905, Acc: 99.72%: 100%|██████████| 147/147 [00:49<00:00,  3.00it/s]
[5 / 25]   Val: Loss = 0.05640, Acc: 98.75%: 100%|██████████| 40/40 [00:03<00:00, 11.60it/s]
[6 / 25] Train: Loss = 0.00254, Acc: 99.91%: 100%|██████████

In [17]:
train_model('bert-base-multilingual-uncased')




------------------------
train  bert-base-multilingual-uncased
------------------------


------------------------
train task 17
------------------------



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=871891.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=672271273.0, style=ProgressStyle(descri…




[1 / 25] Train: Loss = 0.50321, Acc: 71.29%: 100%|██████████| 144/144 [00:47<00:00,  3.04it/s]
[1 / 25]   Val: Loss = 0.28029, Acc: 87.82%: 100%|██████████| 39/39 [00:03<00:00, 11.95it/s]
[2 / 25] Train: Loss = 0.14030, Acc: 94.51%: 100%|██████████| 144/144 [00:47<00:00,  3.05it/s]
[2 / 25]   Val: Loss = 0.09128, Acc: 97.04%: 100%|██████████| 39/39 [00:03<00:00, 11.97it/s]
[3 / 25] Train: Loss = 0.03371, Acc: 99.09%: 100%|██████████| 144/144 [00:47<00:00,  3.03it/s]
[3 / 25]   Val: Loss = 0.07431, Acc: 97.44%: 100%|██████████| 39/39 [00:03<00:00, 11.96it/s]
[4 / 25] Train: Loss = 0.01909, Acc: 99.46%: 100%|██████████| 144/144 [00:47<00:00,  3.04it/s]
[4 / 25]   Val: Loss = 0.09891, Acc: 97.52%: 100%|██████████| 39/39 [00:03<00:00, 11.90it/s]
[5 / 25] Train: Loss = 0.01230, Acc: 99.61%: 100%|██████████| 144/144 [00:46<00:00,  3.07it/s]
[5 / 25]   Val: Loss = 0.07739, Acc: 98.16%: 100%|██████████| 39/39 [00:03<00:00, 11.87it/s]
[6 / 25] Train: Loss = 0.00357, Acc: 99.91%: 100%|██████████


------------------------
train task 18
------------------------



[1 / 25] Train: Loss = 0.53307, Acc: 70.39%: 100%|██████████| 154/154 [01:07<00:00,  2.29it/s]
[1 / 25]   Val: Loss = 0.36582, Acc: 81.25%: 100%|██████████| 46/46 [00:04<00:00,  9.50it/s]
[2 / 25] Train: Loss = 0.17359, Acc: 91.92%: 100%|██████████| 154/154 [01:07<00:00,  2.28it/s]
[2 / 25]   Val: Loss = 0.21477, Acc: 90.35%: 100%|██████████| 46/46 [00:04<00:00,  9.49it/s]
[3 / 25] Train: Loss = 0.07632, Acc: 97.00%: 100%|██████████| 154/154 [01:07<00:00,  2.27it/s]
[3 / 25]   Val: Loss = 0.21668, Acc: 93.00%: 100%|██████████| 46/46 [00:04<00:00,  9.48it/s]
[4 / 25] Train: Loss = 0.02364, Acc: 99.29%: 100%|██████████| 154/154 [01:06<00:00,  2.33it/s]
[4 / 25]   Val: Loss = 0.20261, Acc: 94.50%: 100%|██████████| 46/46 [00:04<00:00,  9.49it/s]
[5 / 25] Train: Loss = 0.01129, Acc: 99.72%: 100%|██████████| 154/154 [01:08<00:00,  2.26it/s]
[5 / 25]   Val: Loss = 0.17304, Acc: 95.92%: 100%|██████████| 46/46 [00:04<00:00,  9.51it/s]
[6 / 25] Train: Loss = 0.00886, Acc: 99.78%: 100%|██████████


------------------------
train task 19
------------------------



[1 / 25] Train: Loss = 0.51652, Acc: 72.63%: 100%|██████████| 144/144 [00:44<00:00,  3.21it/s]
[1 / 25]   Val: Loss = 0.21572, Acc: 92.62%: 100%|██████████| 36/36 [00:03<00:00, 11.74it/s]
[2 / 25] Train: Loss = 0.12135, Acc: 96.18%: 100%|██████████| 144/144 [00:45<00:00,  3.17it/s]
[2 / 25]   Val: Loss = 0.08952, Acc: 97.22%: 100%|██████████| 36/36 [00:03<00:00, 11.73it/s]
[3 / 25] Train: Loss = 0.04082, Acc: 98.72%: 100%|██████████| 144/144 [00:45<00:00,  3.20it/s]
[3 / 25]   Val: Loss = 0.05557, Acc: 98.44%: 100%|██████████| 36/36 [00:03<00:00, 11.68it/s]
[4 / 25] Train: Loss = 0.01220, Acc: 99.61%: 100%|██████████| 144/144 [00:44<00:00,  3.22it/s]
[4 / 25]   Val: Loss = 0.03967, Acc: 99.22%: 100%|██████████| 36/36 [00:03<00:00, 11.73it/s]
[5 / 25] Train: Loss = 0.00709, Acc: 99.76%: 100%|██████████| 144/144 [00:45<00:00,  3.19it/s]
[5 / 25]   Val: Loss = 0.04698, Acc: 98.87%: 100%|██████████| 36/36 [00:03<00:00, 11.64it/s]
[6 / 25] Train: Loss = 0.00875, Acc: 99.83%: 100%|██████████


------------------------
train task 20
------------------------



[1 / 25] Train: Loss = 0.51709, Acc: 72.64%: 100%|██████████| 147/147 [00:47<00:00,  3.09it/s]
[1 / 25]   Val: Loss = 0.33507, Acc: 87.50%: 100%|██████████| 40/40 [00:03<00:00, 12.09it/s]
[2 / 25] Train: Loss = 0.14738, Acc: 94.30%: 100%|██████████| 147/147 [00:47<00:00,  3.10it/s]
[2 / 25]   Val: Loss = 0.10929, Acc: 96.02%: 100%|██████████| 40/40 [00:03<00:00, 12.06it/s]
[3 / 25] Train: Loss = 0.06231, Acc: 97.77%: 100%|██████████| 147/147 [00:46<00:00,  3.13it/s]
[3 / 25]   Val: Loss = 0.05293, Acc: 97.89%: 100%|██████████| 40/40 [00:03<00:00, 12.07it/s]
[4 / 25] Train: Loss = 0.02396, Acc: 99.23%: 100%|██████████| 147/147 [00:47<00:00,  3.09it/s]
[4 / 25]   Val: Loss = 0.03727, Acc: 98.59%: 100%|██████████| 40/40 [00:03<00:00, 12.03it/s]
[5 / 25] Train: Loss = 0.00964, Acc: 99.70%: 100%|██████████| 147/147 [00:47<00:00,  3.12it/s]
[5 / 25]   Val: Loss = 0.04747, Acc: 98.67%: 100%|██████████| 40/40 [00:03<00:00, 12.03it/s]
[6 / 25] Train: Loss = 0.00268, Acc: 99.94%: 100%|██████████