In [1]:
import pandas as pd
import re
import os
import numpy as np
from collections import Counter
from sklearn.model_selection import KFold
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F

from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
import gc
from transformers import AdamW, AutoConfig, AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup

import ast

import random
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(123)

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
data = pd.read_csv('/home/ssd2/tag_video/baseline/train_data_categories.csv')
tags = pd.read_csv('/home/ssd2/tag_video/baseline/IAB_tags.csv')

video_data = pd.read_csv('video_feat.csv', index_col='video_id')

In [3]:
all_video_data = video_data.values[:, 1:].astype('float') # for augs
all_video_data.shape

(1049, 1000)

In [4]:
data['old_tags'] = data['tags'] 
data['tags'] = [x if not re.findall('\S\:\S', x) else x.replace(':', ': ') for x in data['tags'].fillna(
    'Медицинские направления: Психиатрия, Массовая культура: Юмор и сатира') ]
data['tags'] = [x.replace('астрология', 'Астрология').replace('\n', '').replace('\t', ': ').replace('  ', ' ') for x in data['tags'] ]
data['tags'] = [x.replace(', Фильмы и анимация: Фильмы и анимация ', '') for x in data['tags'] ]
data['tags'] = [x if x[-2:] != ', ' else x[:-2] for x in data['tags'] ]
data['tags'] = [x if x[:2] != ', ' else x[2:] for x in data['tags'] ]
data['tags'] = [x if ('Медицинские направления' not in x) or ('Медицина' in x) else 
                x.replace('Медицинские направления', 'Медицина: Медицинские направления') for x in data['tags'] ]
data['tags'] = [x.replace('Путешествия: Путешествия на машине', 'Путешествия: Тип путешествия: Путешествия на машине') for x in data['tags']]
data['tags'] = [x if x != 'Отношения знаменитостей: Семьи знаменитостей' else 
                'Массовая культура: Отношения знаменитостей, Массовая культура: Семьи знаменитостей' for x in data['tags']]
data['tags'] = [x if x != 'Массовая культура: Стиль знаменитостей: Стиль знаменитостей' else 
                'Массовая культура: Стиль знаменитостей' for x in data['tags']]
data['tags'] = [x if x != 'Массовая культура: Отношения знаменитостей: Семьи знаменитостей' else 
                'Массовая культура: Отношения знаменитостей, Массовая культура: Семьи знаменитостей' for x in data['tags']]
data['tags'] = [x if ', Красота: Макияж' not in x else x.replace('Красота: Макияж', 'Стиль и красота: Красота: Макияж')
                for x in data['tags']]
data['tags'] = [x.replace('%', ':') for x in data['tags']]
data['tags'] = [x if ', Экономика: Валюты' not in x else x.replace(', Экономика: Валюты', ', Бизнес и финансы: Экономика: Валюты') for x in data['tags']]
data['tags'] = [x if ', Компьютеры и цифровые технологии' not in x else 
                x.replace(', Компьютеры и цифровые технологии', ', Информационные технологии: Компьютеры и цифровые технологии') for x in data['tags']]
data['tags'] = [x if ', Игры и головоломки: Карточные игры' not in x else 
                x.replace(', Игры и головоломки: Карточные игры', ', Хобби и интересы: Игры и головоломки: Карточные игры') for x in data['tags']]
data['tags'] = [x.replace('Хобби и стиль', 'Хобби и интересы') for x in data['tags']]              
data['tags'] = [x if x != 'Создание контента: Личные события' else 'Хобби и интересы: Создание контента, События и достопримечательности: Личные события' for x in data['tags']]

In [5]:
tags_0 = set(tags.iloc[:, 0].unique())
tags_1 = set(tags.iloc[:, 1].unique())
tags_2 = set(tags.iloc[:, 2].unique())

In [6]:
cnt_level_0 = Counter( [i for s in [[k.split(': ')[0] for k in x.split(', ')] for x in data['tags'] ] for i in s] )
cnt_level_1 = Counter( [i for s in [[k.split(': ')[1] for k in x.split(', ') if len(k.split(': ')) > 1] for x in data['tags'] ] for i in s] )


In [7]:
target_level_0 = sorted([k for k,v in cnt_level_0.items() if v >= 10])
target_level_1 = sorted([k for k,v in cnt_level_1.items() if v >= 10])

dict_target_level_0 = {x:i for i,x in enumerate(target_level_0)}
dict_target_level_1 = {x:i for i,x in enumerate(target_level_1)}

dict_target_level_0_inv = {i:x for i,x in enumerate(target_level_0)}
dict_target_level_1_inv = {i:x for i,x in enumerate(target_level_1)}

num_target_level0 = len(dict_target_level_0)
num_target_level1 = len(dict_target_level_1)

In [8]:
def standart_split(data, target, n_splits = 5, seed = 69):
    split_list = []
    kf = KFold(n_splits = n_splits, shuffle = True, random_state = seed)
    for train_index, test_index in kf.split(data, target) :
        split_list += [(train_index, test_index)]
    return split_list
split_list = standart_split(data, data['tags'])

In [9]:
class RutubeDataset(Dataset):
    def __init__(self, videos_ids, title, description, target, tokenizer, dict_target_level_0, dict_target_level_1, max_len = 512, training=True):
        self.videos_ids = videos_ids
        self.title = title
        self.description = description
        self.target = target
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.dict_target_level_0 = dict_target_level_0
        self.dict_target_level_1 = dict_target_level_1
        self.training = training
    def __len__(self):
        return len(self.title)

    def __getitem__(self, idx):
        video_id = self.videos_ids[idx]
        title, description, target = self.title[idx], self.description[idx], self.target[idx]
        text = f'{title}. Описание - {description}'

        if self.training:
            if random.random() < 0.05:
                tmp = text.split(' ')
                tmp = [w for w in tmp if random.random() < 0.95]
                if len(tmp) > 0:
                    text = ' '.join(tmp)

        tok = self.tokenizer(text, max_length=self.max_len, truncation=True)

        target_lvl1 = [x.split(': ')[0] for x in target.split(', ')]
        target_lvl2 = [x.split(': ')[1] for x in target.split(', ') if len(x.split(': ')) > 1]

        full_target = np.zeros(num_target_level0 + num_target_level1)
        for t in target_lvl1:
            if t in self.dict_target_level_0:
                full_target[self.dict_target_level_0[t]] = 1
        
        for t in target_lvl2:
            if t in self.dict_target_level_1:
                full_target[self.dict_target_level_1[t] + num_target_level0] = 1

        video_feat = video_data.loc[video_id].values[1:].astype('float')

        if self.training:
            if random.random() < 0.01:
                video_feat = all_video_data[random.randrange(len(all_video_data))]
            if random.random() < 0.1:
                for i in range(len(video_feat)):
                    if random.random() < 0.005:
                        video_feat[i] = video_feat[i] * (0.95 + 0.1 * random.random())

        return tok, full_target, video_feat

In [10]:
class CustomModel(nn.Module):
    def __init__(self, model, fc_dropout = [0.3], nn_dp = 0., lns = 1e-07, config_path=None, pretrained=False, num_feat=1000): #1000+29
        super().__init__()

        if config_path is None:
            self.config = AutoConfig.from_pretrained(model)
        else:
            self.config = torch.load(config_path)

        self.num_labels = num_target_level0 + num_target_level1
        self.config.update(
            {
                'hidden_dropout_prob': nn_dp,
                "output_hidden_states": True,
                'layer_norm_eps': lns,
                "num_labels": 1,
            }
        )

        if pretrained:
            self.model = AutoModel.from_pretrained(model, config=self.config)
        else:
            self.model = AutoModel(self.config)

        self.num_dropout = len(fc_dropout)
        self.fc_dropout0 = nn.Dropout(fc_dropout[0])
        self.fc_dropout1 = nn.Dropout(fc_dropout[1] if len(fc_dropout) > 1 else 0)
        self.fc_dropout2 = nn.Dropout(fc_dropout[2] if len(fc_dropout) > 2 else 0)
        self.fc_dropout3 = nn.Dropout(fc_dropout[3] if len(fc_dropout) > 3 else 0)
        self.fc_dropout4 = nn.Dropout(fc_dropout[4] if len(fc_dropout) > 4 else 0)

        self.l0 = nn.Sequential(
                    nn.Dropout(0.1),
                    nn.Linear(num_feat, 256),
                    nn.BatchNorm1d(256),
                    nn.SiLU(inplace=True),
                    nn.Linear(256, 64))

        self.fc = nn.Linear(self.config.hidden_size + 64, self.num_labels) # + 29  + 8
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0][:,0,:].squeeze(1)
        return last_hidden_states


    def forward(self, inputs, feat): #, feat1
        feature = self.feature(inputs)
        feat = self.l0(feat)
        feature = torch.concat([feature, feat], dim=1) #, feat1
        output_list = []
        output0 = self.fc(self.fc_dropout0(feature))
        output1 = self.fc(self.fc_dropout1(feature))
        output2 = self.fc(self.fc_dropout2(feature))
        output3 = self.fc(self.fc_dropout3(feature))
        output4 = self.fc(self.fc_dropout4(feature))

        output_list = [output0, output1, output2, output3, output4]
        return output_list[:self.num_dropout]

In [11]:
class Collate:
    def __init__(self, tokenizer, is_train = True):
        self.tokenizer = tokenizer
        self.is_train = is_train
    def __call__(self, batch):

        inputs = [sample[0] for sample in batch]
        labels = [sample[1] for sample in batch]
        feat = [sample[2] for sample in batch]
        
        # calculate max token length of this batch
        batch_max = max([len(ids['input_ids']) for ids in inputs])
        # add padding
        inputs_dict = dict()
        inputs_dict["attention_mask"] = [s['attention_mask'] + (batch_max - len(s['attention_mask'])) * [0] for s in inputs]
        inputs_dict["input_ids"] = [s['input_ids'] + (batch_max - len(s['attention_mask'])) * [0] for s in inputs]
        # convert to tensors
        inputs_dict["attention_mask"] = torch.tensor(inputs_dict["attention_mask"], dtype=torch.long)
        inputs_dict["input_ids"] = torch.tensor(inputs_dict["input_ids"], dtype=torch.long)

        labels = torch.tensor(np.array(labels), dtype=torch.float)
        feat = torch.tensor(np.array(feat), dtype=torch.float)
       
        return inputs_dict, labels, feat #, feat1

In [12]:
from tqdm.notebook import tqdm

In [13]:
import pandas as pd
import argparse
import ast
import numpy as np

def iou_metric(ground_truth, predictions):
    iou =  len(set.intersection(set(ground_truth), set(predictions)))
    iou = iou/(len(set(ground_truth).union(set(predictions))))
    return iou

def split_tags(tag_list):
    final_tag_list = []
    for tag in tag_list:
        tags = tag.split(": ")
        if len(tags) == 3:
            final_tag_list.append(tags[0])
            final_tag_list.append(tags[0] + ": " + tags[1])
            final_tag_list.append(tags[0]+ ": " + tags[1] + ": " + tags[2])
        elif len(tags) == 2:
            final_tag_list.append(tags[0])
            final_tag_list.append(tags[0] + ": " + tags[1])
        elif len(tags) == 1:
            final_tag_list.append(tags[0])
        else:
            print("NOT IMPLEMENTED!!!!", tag)
    return final_tag_list


def find_iou_for_sample_submission(pred_submission, true_submission):
    ground_truth_df = true_submission
    ground_truth_df["tags"] = ground_truth_df["tags"].apply(lambda l: l.split(', '))
    ground_truth_df["tags_split"] = ground_truth_df["tags"].apply(lambda l: split_tags(l))

    predictions_df = pred_submission
    predictions_df["predicted_tags"] = predictions_df["predicted_tags"].apply(ast.literal_eval)
    predictions_df["predicted_tags_split"] = predictions_df["predicted_tags"].apply(lambda l: split_tags(l))
    iou=0
    counter = 0
    for i, row in ground_truth_df.iterrows():
        predicted_tags = predictions_df[predictions_df["video_id"]==row["video_id"]]["predicted_tags_split"].values[0]
        iou_temp=iou_metric(row['tags_split'], predicted_tags)
        iou+=iou_temp
        counter+=1

    return iou/counter

In [14]:
def make_predict(model, valid_dataloader, criterion, epoch, valid_df):
    preds = []
    model.eval()
    len_loader = len(valid_dataloader)
    tk0 = tqdm(enumerate(valid_dataloader), total = len_loader)
    average_loss = 0
    with torch.no_grad():
        for batch_number,  (inputs, labels, feat)  in tk0: #, feat1
            for k, v in inputs.items():
                inputs[k] = v.cuda()
            labels = labels.cuda()
            feat = feat.cuda()
          
            with torch.cuda.amp.autocast():
                y_preds_list  = model(inputs, feat) #, feat1
                loss_list = [criterion(pred, labels) for pred in y_preds_list]
                loss = sum(loss_list) / len(loss_list)

            y_preds = sum(y_preds_list) / len(y_preds_list)

            average_loss += loss.cpu().detach().numpy()
            tk0.set_postfix(loss=average_loss / (batch_number + 1), stage="validation", epoch = epoch)
            preds += [y_preds.sigmoid().to('cpu').numpy()]

    preds = np.concatenate(preds)

    # TH = 0.3
    best_score = -10

    for TH in [0.2, 0.3]:
        list_predicts = []
        for pr in preds:
            ind = np.where(pr > TH)[0]
            tmp_pred = []
            if len(ind):
                for i in ind:
                    if i >= num_target_level0:
                        tmp_pred += [': '.join(tags[tags.iloc[:, 1] == dict_target_level_1_inv[i - num_target_level0]].iloc[0].tolist()[:2])]
                    else:
                        tmp_pred += [ dict_target_level_0_inv[i]]
            list_predicts += [str(tmp_pred)]

        pred_submission = pd.DataFrame()
        pred_submission['video_id'] = valid_df['video_id']
        pred_submission['predicted_tags'] = list_predicts

        true_submission = valid_df[['video_id', 'tags']]

        score = find_iou_for_sample_submission(pred_submission.copy(), true_submission.copy())
        # print(score, TH)
        if score > best_score:
            best_score = score
            best_th = TH
    return best_score, best_th


In [15]:
max_len = 512
max_val_len = 512

batch_size = 8
epochs = 20 #10 #5
lr = 1.5e-5
fp16 = True
clip_grad_norm = 5
model_name = "ai-forever/ruRoberta-large"
fc_dropout= [0.1, 0.2]
tokenizer = AutoTokenizer.from_pretrained(model_name)

weight_decay = 1e-4
eps = 1e-6
betas = (0.9, 0.99)
scheduler = 'cosine' # ['linear', 'cosine']
accumulation_steps = 1
batch_scheduler = True

device = 'cuda'

params_train = {'batch_size': batch_size, 'shuffle': True, 'drop_last': True, 'num_workers': 4}
params_valid = {'batch_size': batch_size, 'shuffle': False, 'drop_last': False, 'num_workers': 4}

all_scores = []
for fold in [0, 1, 2, 3, 4]:
    ckp = f'model_1_feat_fold_{fold}'
    criterion = nn.BCEWithLogitsLoss()

    train_df = data.loc[split_list[fold][0]].reset_index(drop=True)
    valid_df = data.loc[split_list[fold][1]].reset_index(drop=True)
    model = CustomModel(model_name, fc_dropout, pretrained = True).cuda()

    model.train()
   
    scaler = torch.cuda.amp.GradScaler(enabled = True)
    collate_fn = Collate(tokenizer)
    train_dataloader = DataLoader( RutubeDataset( train_df['video_id'], train_df['title'], train_df['description'], train_df['tags'], tokenizer, dict_target_level_0, dict_target_level_1), 
                                  collate_fn  = collate_fn, **params_train)
    valid_dataloader = DataLoader( RutubeDataset( valid_df['video_id'], valid_df['title'], valid_df['description'], valid_df['tags'], tokenizer, dict_target_level_0, dict_target_level_1, training=False), 
                                  collate_fn  = collate_fn, **params_valid)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
          'lr': lr, 'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
          'lr': lr, 'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if "model" not in n],
          'lr': lr, 'weight_decay': 0.0}
    ]

    optimizer = torch.optim.AdamW(optimizer_parameters, lr=lr, eps=eps, betas=betas)

    num_train_steps = int(len(train_df) / batch_size * epochs) // accumulation_steps
    scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, 1, 0.01, num_train_steps, -1)

    best_score = -1
    len_dataloader = len(train_dataloader)

    scores = []
    for epoch in range(epochs):
        average_loss = 0
        tk0 = tqdm(enumerate(train_dataloader), total = len_dataloader)
        for batch_number,  (inputs, labels, feat)  in tk0: #, feat1
            for k, v in inputs.items():
                inputs[k] = v.cuda()
            labels = labels.cuda()
            feat = feat.cuda()
            with torch.amp.autocast(device):
                y_preds_list  = model(inputs, feat) # , feat1
                loss_list = [criterion(pred, labels) for pred in y_preds_list]
                loss = sum(loss_list) / len(loss_list)

            optimizer.zero_grad()
            scaler.scale(loss).backward()
            if clip_grad_norm > 0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            average_loss += loss.cpu().detach().numpy()
            tk0.set_postfix(loss=average_loss / (batch_number + 1), stage="train", epoch = epoch)

        score, th = make_predict(model, valid_dataloader, criterion, epoch, valid_df)
        print(score, th)
        scores.append(score)
    all_scores.append(np.asarray(scores))

    torch.save(model.state_dict(), f'{ckp}.pt')
    del model
    gc.collect()
    torch.cuda.empty_cache()

  return self.fget.__get__(instance, owner)()
Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.526887755102041 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6418764172335601 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6551473922902493 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6666326530612244 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6659410430839001 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6666780045351474 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6663718820861677 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6690136054421768 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6778004535147392 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6811564625850339 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6731972789115642 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6801020408163263 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6857256235827661 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6858390022675733 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.684331065759637 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6857596371882083 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6778798185941042 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6804195011337868 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6799433106575963 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6823242630385485 0.3


Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.4887641723356011 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6096258503401362 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6699999999999997 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6818367346938772 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7009977324263036 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7029931972789111 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6841836734693876 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7044671201814056 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7053741496598637 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7077097505668931 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6951814058956911 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7129931972789113 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7095464852607708 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7075170068027208 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7141836734693875 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7059297052154192 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7014852607709748 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7060884353741493 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7021201814058953 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7056916099773238 0.3


Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.4977891156462584 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6085770975056691 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6883163265306121 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6842517006802723 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6959240362811792 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7017913832199546 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7058276643990931 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6877324263038549 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6957369614512473 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6904875283446711 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6961224489795919 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7064739229024944 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7127097505668933 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7089909297052156 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6981405895691611 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7040702947845805 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7044557823129253 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.707154195011338 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7081292517006804 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7070068027210885 0.2


Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.5096768707482995 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6271712018140587 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6798544973544975 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6628401360544217 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6997732426303857 0.3


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7095313681027967 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.709875283446712 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.704831821617536 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7047864701436131 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7061281179138322 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.709109977324263 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7158163265306123 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7188265306122448 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7097845804988662 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.716530612244898 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7166893424036282 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.712562358276644 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7196258503401359 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7121655328798184 0.2


  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.7104195011337869 0.3


Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.5489348370927317 0.2


  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6154951773372827 0.2


  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.640561631351105 0.3


  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.672362724994304 0.3


  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6740905293536874 0.2


  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6883002961950334 0.3


  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6923558897243111 0.2


  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.688368648894965 0.3


  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6883230804283439 0.2


  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.696958304853042 0.2


  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6918432444748236 0.2


  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6881408065618594 0.3


  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6905217589428118 0.2


  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.692515379357485 0.2


  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6981430849851904 0.2


  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6981089086352248 0.2


  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.697664616085669 0.2


  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6996126680337208 0.3


  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6951583504215085 0.2


  0%|          | 0/105 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

0.6947140578719528 0.2


In [16]:
all_scores = np.asarray(all_scores)
bst_iter = all_scores.mean(axis=0).argmax()
print(all_scores.mean(axis=0).max(), bst_iter, all_scores[:, bst_iter])


0.7034660297930974 12 [0.68572562 0.70954649 0.71270975 0.71882653 0.69052176]
