In [1]:
import os

OUTPUT_DIR = 'output_intfloat-multilingual-e5-large-extradata_baseline'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [2]:
class CFG:
    apex = True
    print_freq = 100
    num_workers = 8
    model = "intfloat/multilingual-e5-large"
    gradient_checkpointing = True
    scheduler = 'cosine'  # ['linear', 'cosine']
    batch_scheduler = True
    num_cycles = 0.5
    num_warmup_steps = 0
    epochs = 10
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    min_lr = 1e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    batch_size = 8
    max_len = 512
    weight_decay = 0.01
    gradient_accumulation_steps = 1
    max_grad_norm = 1000
    target_cols = ['class']
    seed = 42
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]
    train = True

In [3]:
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel, AutoConfig
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
import torch.nn as nn
import torch
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
import pickle
import pandas as pd
import numpy as np
import gc
import re
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# from iterstrat.ml_stratifiers import MultilabelStratifiedKFold


os.environ["TOKENIZERS_PARALLELISM"] = "false"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
def preprocess(df):
    df["text"] = df["text"].astype(str)
    df["class"] = df["class"].astype(str)
    df.drop_duplicates(["class", "text"], inplace=True)
    df.reset_index(drop=True, inplace=True)
    df["text"] = df["text"].apply(lambda x: " ".join(
        re.findall(r"[а-яА-Я0-9 ёЁ\-\.,?!+a-zA-Z]+", x)))

    return df


def get_score(y_trues, class_predictions):
    class_predictions = [np.argmax(el) for el in class_predictions]

    class_score = f1_score(y_trues[:, 0], class_predictions, average="macro")
    return class_score


def get_logger(filename=os.path.join(OUTPUT_DIR, 'train')):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger


LOGGER = get_logger()


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(seed=42)

In [5]:
train = pd.read_csv("dataset/sample.csv")
print(train.shape)
extra_train = pd.read_csv("dataset/extra_sample.csv")
print(extra_train.shape)
extra_train_v2 = pd.read_csv("dataset/extra_sample_v2.csv")
print(extra_train_v2.shape)
train = pd.concat([train, extra_train, extra_train_v2], axis=0)
print(train.shape)
train = preprocess(train)
print(train.shape)
train["text"] = train["text"].apply(lambda x: "query: " + x)

(501, 2)
(673, 2)
(193, 2)
(1367, 2)
(1221, 2)


In [6]:
train["class"].value_counts()

class
arrangement       324
order             310
contract          184
application        72
proxy              69
act                69
determination      50
invoice            42
bill               41
statute            35
contract offer     25
Name: count, dtype: int64

In [7]:
class_le = LabelEncoder()
class_le.fit(train["class"].tolist())
train["class"] = class_le.transform(train["class"].tolist())

with open(os.path.join(OUTPUT_DIR, "executor_le.pkl"), "wb") as f:
    pickle.dump(class_le, f)

In [8]:
Fold = StratifiedKFold(n_splits=CFG.n_fold,
                       shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(train, train["class"].tolist())):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
train.head()

Unnamed: 0,class,text,fold
0,2,query: СОГЛАШЕНИЕ N 8 о расторжении трудового ...,2
1,2,query: Соглашение о предоставлении опциона на ...,1
2,2,query: Соглашение о реструктуризации задолженн...,2
3,2,query: Дополнительное соглашение к договору ку...,0
4,2,query: Соглашение о расторжении договора об ок...,2


In [9]:
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, 'tokenizer'))
CFG.tokenizer = tokenizer

lengths = []
tk0 = tqdm(train['text'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 2  # cls & sep
LOGGER.info(f"max_len: {CFG.max_len}")
CFG.max_len = 512

  0%|          | 0/1221 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1421 > 512). Running this sequence through the model will result in indexing errors
max_len: 345518


In [10]:
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['text'].values
        self.labels = df["class"].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.long)
        return inputs, label


def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:, :mask_len]
    return inputs

In [11]:
def average_pool(last_hidden_states, attention_mask):
    last_hidden = last_hidden_states.masked_fill(
        ~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(
                cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(
                cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        self.fc = nn.Linear(self.config.hidden_size, 11)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(
                mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(
                mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature_me5(self, inputs):
        outputs = self.model(**inputs)
        feature = average_pool(outputs.last_hidden_state,
                               inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature_me5(inputs)
        output = self.fc(feature)

        return output

In [12]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

In [13]:
def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, label) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        label = label.to(device)

        batch_size = label.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            pred = model(inputs)
            loss = criterion(pred, label)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(
            model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader),
                          remain=timeSince(start, float(
                              step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, label) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        label = label.to(device)

        batch_size = label.size(0)
        with torch.no_grad():
            pred = model(inputs)
            loss = criterion(pred, label)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(pred.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

In [14]:
def train_loop(folds, fold):

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)

    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, os.path.join(OUTPUT_DIR, 'config.pth'))
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr,
                      eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler

    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.CrossEntropyLoss()  # МБ добавить веса в лосс

    best_score = -1 * float('inf')

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model,
                            criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(
            valid_loader, model, criterion, device)

        # scoring
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(
            f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        if best_score < score:
            best_score = score
            LOGGER.info(
                f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                       os.path.join(OUTPUT_DIR, f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"))

    predictions = torch.load(os.path.join(OUTPUT_DIR, f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"),
                             map_location=torch.device('cpu'))['predictions']

    valid_folds["pred"] = [np.argmax(el) for el in predictions]

    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds

In [15]:
if __name__ == '__main__':

    def get_result(oof_df):
        labels = oof_df[CFG.target_cols].values
        predictions = oof_df["pred"].tolist()
        score = f1_score(labels[:, 0], predictions, average="macro")
        LOGGER.info(f'Score: {score:.4f}')

    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(os.path.join(OUTPUT_DIR, 'oof_df.pkl'))

XLMRobertaConfig {
  "_name_or_path": "intfloat/multilingual-e5-large",
  "architectures": [
    "XLMRobertaModel"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.39.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}



Epoch: [1][0/122] Elapsed 0m 0s (remain 1m 57s) Loss: 3.0698(3.0698) Grad: inf  LR: 0.00002000  
Epoch: [1][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.3916(0.7492) Grad: 1091246.2500  LR: 0.00001966  
Epoch: [1][121/122] Elapsed 0m 50s (remain 0m 0s) Loss: 0.1607(0.6500) Grad: 363798.2812  LR: 0.00001951  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.8839(0.8839) 


Epoch 1 - avg_train_loss: 0.6500  avg_val_loss: 0.2622  time: 57s
Epoch 1 - Score: 0.9443
Epoch 1 - Save Best Score: 0.9443 Model


EVAL: [15/16] Elapsed 0m 6s (remain 0m 0s) Loss: 0.0010(0.2622) 
Epoch: [2][0/122] Elapsed 0m 0s (remain 1m 9s) Loss: 0.0003(0.0003) Grad: 2199.1697  LR: 0.00001950  
Epoch: [2][100/122] Elapsed 0m 41s (remain 0m 8s) Loss: 0.0901(0.0946) Grad: 241652.7031  LR: 0.00001840  
Epoch: [2][121/122] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0005(0.1120) Grad: 313.2736  LR: 0.00001809  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 0.4737(0.4737) 


Epoch 2 - avg_train_loss: 0.1120  avg_val_loss: 0.2931  time: 57s
Epoch 2 - Score: 0.8483


EVAL: [15/16] Elapsed 0m 6s (remain 0m 0s) Loss: 0.0005(0.2931) 
Epoch: [3][0/122] Elapsed 0m 0s (remain 1m 8s) Loss: 0.0018(0.0018) Grad: 16904.5879  LR: 0.00001808  
Epoch: [3][100/122] Elapsed 0m 41s (remain 0m 8s) Loss: 0.0001(0.1107) Grad: 39.5872  LR: 0.00001631  
Epoch: [3][121/122] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0000(0.0963) Grad: 6.9755  LR: 0.00001588  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 1.0773(1.0773) 


Epoch 3 - avg_train_loss: 0.0963  avg_val_loss: 0.1742  time: 57s
Epoch 3 - Score: 0.9921
Epoch 3 - Save Best Score: 0.9921 Model


EVAL: [15/16] Elapsed 0m 6s (remain 0m 0s) Loss: 0.0000(0.1742) 
Epoch: [4][0/122] Elapsed 0m 0s (remain 1m 7s) Loss: 0.0000(0.0000) Grad: 121.1076  LR: 0.00001586  
Epoch: [4][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0078) Grad: 15.5362  LR: 0.00001360  
Epoch: [4][121/122] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0000(0.0065) Grad: 21.7788  LR: 0.00001309  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 1.1256(1.1256) 


Epoch 4 - avg_train_loss: 0.0065  avg_val_loss: 0.1303  time: 58s
Epoch 4 - Score: 0.9941
Epoch 4 - Save Best Score: 0.9941 Model


EVAL: [15/16] Elapsed 0m 6s (remain 0m 0s) Loss: 0.0000(0.1303) 
Epoch: [5][0/122] Elapsed 0m 0s (remain 1m 6s) Loss: 0.0000(0.0000) Grad: 21.3614  LR: 0.00001307  
Epoch: [5][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0085) Grad: 9.5025  LR: 0.00001054  
Epoch: [5][121/122] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0000(0.0071) Grad: 9.0212  LR: 0.00001000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 1.1256(1.1256) 


Epoch 5 - avg_train_loss: 0.0071  avg_val_loss: 0.1297  time: 58s
Epoch 5 - Score: 0.9941


EVAL: [15/16] Elapsed 0m 6s (remain 0m 0s) Loss: 0.0000(0.1297) 
Epoch: [6][0/122] Elapsed 0m 0s (remain 1m 1s) Loss: 0.0000(0.0000) Grad: 38.4241  LR: 0.00000997  
Epoch: [6][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0090) Grad: 15.2017  LR: 0.00000743  
Epoch: [6][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0000(0.0075) Grad: 6.1796  LR: 0.00000691  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 1.1266(1.1266) 


Epoch 6 - avg_train_loss: 0.0075  avg_val_loss: 0.1295  time: 58s
Epoch 6 - Score: 0.9941


EVAL: [15/16] Elapsed 0m 6s (remain 0m 0s) Loss: 0.0000(0.1295) 
Epoch: [7][0/122] Elapsed 0m 0s (remain 1m 2s) Loss: 0.0000(0.0000) Grad: 16.6559  LR: 0.00000689  
Epoch: [7][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0000) Grad: 22.9074  LR: 0.00000457  
Epoch: [7][121/122] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0000(0.0079) Grad: 9.0501  LR: 0.00000412  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 1.1274(1.1274) 


Epoch 7 - avg_train_loss: 0.0079  avg_val_loss: 0.1293  time: 58s
Epoch 7 - Score: 0.9941


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0000(0.1293) 
Epoch: [8][0/122] Elapsed 0m 0s (remain 1m 9s) Loss: 0.0000(0.0000) Grad: 13.3966  LR: 0.00000410  
Epoch: [8][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0096) Grad: 25.5464  LR: 0.00000224  
Epoch: [8][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0000(0.0080) Grad: 17.6206  LR: 0.00000191  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 1.1280(1.1280) 


Epoch 8 - avg_train_loss: 0.0080  avg_val_loss: 0.1292  time: 58s
Epoch 8 - Score: 0.9941
Epoch 8 - Save Best Score: 0.9941 Model


EVAL: [15/16] Elapsed 0m 6s (remain 0m 0s) Loss: 0.0000(0.1292) 
Epoch: [9][0/122] Elapsed 0m 0s (remain 1m 11s) Loss: 0.0000(0.0000) Grad: 19.0996  LR: 0.00000189  
Epoch: [9][100/122] Elapsed 0m 41s (remain 0m 8s) Loss: 0.0000(0.0096) Grad: 5.4324  LR: 0.00000067  
Epoch: [9][121/122] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0000(0.0080) Grad: 14.1617  LR: 0.00000049  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 7s) Loss: 1.1283(1.1283) 


Epoch 9 - avg_train_loss: 0.0080  avg_val_loss: 0.1292  time: 57s
Epoch 9 - Score: 0.9941


EVAL: [15/16] Elapsed 0m 6s (remain 0m 0s) Loss: 0.0000(0.1292) 
Epoch: [10][0/122] Elapsed 0m 0s (remain 1m 0s) Loss: 0.0000(0.0000) Grad: 23.7634  LR: 0.00000048  
Epoch: [10][100/122] Elapsed 0m 41s (remain 0m 8s) Loss: 0.0000(0.0097) Grad: 4.9210  LR: 0.00000001  
Epoch: [10][121/122] Elapsed 0m 49s (remain 0m 0s) Loss: 0.0000(0.0080) Grad: 37.1088  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 1.1282(1.1282) 


Epoch 10 - avg_train_loss: 0.0080  avg_val_loss: 0.1292  time: 57s
Epoch 10 - Score: 0.9941


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0000(0.1292) 


Score: 0.9941
XLMRobertaConfig {
  "_name_or_path": "intfloat/multilingual-e5-large",
  "architectures": [
    "XLMRobertaModel"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.39.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}



Epoch: [1][0/122] Elapsed 0m 0s (remain 1m 14s) Loss: 2.5630(2.5630) Grad: inf  LR: 0.00002000  
Epoch: [1][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0066(0.9074) Grad: 6338.2729  LR: 0.00001966  
Epoch: [1][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.4828(0.7702) Grad: 862541.8125  LR: 0.00001951  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0040(0.0040) 


Epoch 1 - avg_train_loss: 0.7702  avg_val_loss: 0.0572  time: 59s
Epoch 1 - Score: 0.9562
Epoch 1 - Save Best Score: 0.9562 Model


EVAL: [15/16] Elapsed 0m 6s (remain 0m 0s) Loss: 0.3555(0.0572) 
Epoch: [2][0/122] Elapsed 0m 0s (remain 1m 12s) Loss: 0.0038(0.0038) Grad: 16997.6992  LR: 0.00001950  
Epoch: [2][100/122] Elapsed 0m 43s (remain 0m 8s) Loss: 0.0000(0.0436) Grad: 21.2400  LR: 0.00001840  
Epoch: [2][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0001(0.0635) Grad: 85.5990  LR: 0.00001809  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0026(0.0026) 


Epoch 2 - avg_train_loss: 0.0635  avg_val_loss: 0.1031  time: 59s
Epoch 2 - Score: 0.9490


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0134(0.1031) 
Epoch: [3][0/122] Elapsed 0m 0s (remain 1m 9s) Loss: 0.0001(0.0001) Grad: 706.9316  LR: 0.00001808  
Epoch: [3][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0083) Grad: 20.1943  LR: 0.00001631  
Epoch: [3][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0000(0.0143) Grad: 18.3733  LR: 0.00001588  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0001(0.0001) 


Epoch 3 - avg_train_loss: 0.0143  avg_val_loss: 0.0492  time: 58s
Epoch 3 - Score: 0.9857
Epoch 3 - Save Best Score: 0.9857 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0002(0.0492) 
Epoch: [4][0/122] Elapsed 0m 0s (remain 1m 12s) Loss: 0.0000(0.0000) Grad: 159.5550  LR: 0.00001586  
Epoch: [4][100/122] Elapsed 0m 41s (remain 0m 8s) Loss: 0.0000(0.0076) Grad: 6.7682  LR: 0.00001361  
Epoch: [4][121/122] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0000(0.0102) Grad: 16.3533  LR: 0.00001310  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 4 - avg_train_loss: 0.0102  avg_val_loss: 0.0503  time: 57s
Epoch 4 - Score: 0.9857


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0001(0.0503) 
Epoch: [5][0/122] Elapsed 0m 0s (remain 1m 16s) Loss: 0.0000(0.0000) Grad: 29.0394  LR: 0.00001308  
Epoch: [5][100/122] Elapsed 0m 41s (remain 0m 8s) Loss: 0.0000(0.0051) Grad: 20.1134  LR: 0.00001055  
Epoch: [5][121/122] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0000(0.0109) Grad: 5.3278  LR: 0.00001001  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0001(0.0001) 


Epoch 5 - avg_train_loss: 0.0109  avg_val_loss: 0.0512  time: 57s
Epoch 5 - Score: 0.9857


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0001(0.0512) 
Epoch: [6][0/122] Elapsed 0m 0s (remain 1m 9s) Loss: 0.0000(0.0000) Grad: 32.3456  LR: 0.00000999  
Epoch: [6][100/122] Elapsed 0m 41s (remain 0m 8s) Loss: 0.0000(0.0052) Grad: 14.5139  LR: 0.00000744  
Epoch: [6][121/122] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0000(0.0113) Grad: 7.4578  LR: 0.00000692  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0001(0.0001) 


Epoch 6 - avg_train_loss: 0.0113  avg_val_loss: 0.0519  time: 58s
Epoch 6 - Score: 0.9857


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0001(0.0519) 
Epoch: [7][0/122] Elapsed 0m 0s (remain 1m 15s) Loss: 0.0000(0.0000) Grad: 30.3901  LR: 0.00000690  
Epoch: [7][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0138) Grad: 9.7173  LR: 0.00000458  
Epoch: [7][121/122] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0000(0.0114) Grad: 7.8206  LR: 0.00000414  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 9s) Loss: 0.0002(0.0002) 


Epoch 7 - avg_train_loss: 0.0114  avg_val_loss: 0.0523  time: 58s
Epoch 7 - Score: 0.9857


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0001(0.0523) 
Epoch: [8][0/122] Elapsed 0m 0s (remain 1m 15s) Loss: 0.0000(0.0000) Grad: 10.7804  LR: 0.00000412  
Epoch: [8][100/122] Elapsed 0m 41s (remain 0m 8s) Loss: 0.0000(0.0140) Grad: 2.9370  LR: 0.00000225  
Epoch: [8][121/122] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0000(0.0116) Grad: 5.4613  LR: 0.00000192  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0002(0.0002) 


Epoch 8 - avg_train_loss: 0.0116  avg_val_loss: 0.0524  time: 58s
Epoch 8 - Score: 0.9857


EVAL: [15/16] Elapsed 0m 6s (remain 0m 0s) Loss: 0.0000(0.0524) 
Epoch: [9][0/122] Elapsed 0m 0s (remain 1m 15s) Loss: 0.0000(0.0000) Grad: 14.2263  LR: 0.00000191  
Epoch: [9][100/122] Elapsed 0m 41s (remain 0m 8s) Loss: 0.0000(0.0141) Grad: 11.9517  LR: 0.00000068  
Epoch: [9][121/122] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0000(0.0117) Grad: 2.2643  LR: 0.00000050  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0002(0.0002) 


Epoch 9 - avg_train_loss: 0.0117  avg_val_loss: 0.0525  time: 57s
Epoch 9 - Score: 0.9857


EVAL: [15/16] Elapsed 0m 6s (remain 0m 0s) Loss: 0.0000(0.0525) 
Epoch: [10][0/122] Elapsed 0m 0s (remain 1m 21s) Loss: 0.0000(0.0000) Grad: 12.2797  LR: 0.00000049  
Epoch: [10][100/122] Elapsed 0m 40s (remain 0m 8s) Loss: 0.0000(0.0141) Grad: 6.3166  LR: 0.00000002  
Epoch: [10][121/122] Elapsed 0m 49s (remain 0m 0s) Loss: 0.0000(0.0117) Grad: 5.9471  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0002(0.0002) 


Epoch 10 - avg_train_loss: 0.0117  avg_val_loss: 0.0525  time: 56s
Epoch 10 - Score: 0.9857


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0000(0.0525) 


Score: 0.9857
XLMRobertaConfig {
  "_name_or_path": "intfloat/multilingual-e5-large",
  "architectures": [
    "XLMRobertaModel"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.39.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}



Epoch: [1][0/122] Elapsed 0m 0s (remain 1m 6s) Loss: 2.5087(2.5087) Grad: inf  LR: 0.00002000  
Epoch: [1][100/122] Elapsed 0m 41s (remain 0m 8s) Loss: 1.0845(0.6780) Grad: 666560.8125  LR: 0.00001966  
Epoch: [1][121/122] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0304(0.6230) Grad: 11598.3047  LR: 0.00001951  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0012(0.0012) 


Epoch 1 - avg_train_loss: 0.6230  avg_val_loss: 0.0925  time: 58s
Epoch 1 - Score: 0.9843
Epoch 1 - Save Best Score: 0.9843 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0033(0.0925) 
Epoch: [2][0/122] Elapsed 0m 0s (remain 1m 12s) Loss: 0.0545(0.0545) Grad: 588141.5000  LR: 0.00001950  
Epoch: [2][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0001(0.0528) Grad: 123.0307  LR: 0.00001840  
Epoch: [2][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0001(0.0522) Grad: 94.3123  LR: 0.00001809  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0001(0.0001) 


Epoch 2 - avg_train_loss: 0.0522  avg_val_loss: 0.0116  time: 59s
Epoch 2 - Score: 0.9980
Epoch 2 - Save Best Score: 0.9980 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0001(0.0116) 
Epoch: [3][0/122] Elapsed 0m 0s (remain 1m 9s) Loss: 0.0001(0.0001) Grad: 243.8514  LR: 0.00001808  
Epoch: [3][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0024(0.0121) Grad: 20366.2539  LR: 0.00001631  
Epoch: [3][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0000(0.0100) Grad: 75.9015  LR: 0.00001588  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 3 - avg_train_loss: 0.0100  avg_val_loss: 0.0288  time: 59s
Epoch 3 - Score: 0.9886


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0000(0.0288) 
Epoch: [4][0/122] Elapsed 0m 0s (remain 1m 17s) Loss: 0.0000(0.0000) Grad: 37.3311  LR: 0.00001586  
Epoch: [4][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0058) Grad: 22.2844  LR: 0.00001361  
Epoch: [4][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0000(0.0102) Grad: 21.7957  LR: 0.00001310  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 4 - avg_train_loss: 0.0102  avg_val_loss: 0.0235  time: 59s
Epoch 4 - Score: 0.9886


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0000(0.0235) 
Epoch: [5][0/122] Elapsed 0m 0s (remain 1m 9s) Loss: 0.0000(0.0000) Grad: 19.3857  LR: 0.00001308  
Epoch: [5][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0114) Grad: 11.4406  LR: 0.00001055  
Epoch: [5][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0000(0.0095) Grad: 25.2484  LR: 0.00001001  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 5 - avg_train_loss: 0.0095  avg_val_loss: 0.0222  time: 59s
Epoch 5 - Score: 0.9886


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0000(0.0222) 
Epoch: [6][0/122] Elapsed 0m 0s (remain 1m 24s) Loss: 0.0000(0.0000) Grad: 44.4686  LR: 0.00000999  
Epoch: [6][100/122] Elapsed 0m 43s (remain 0m 8s) Loss: 0.0000(0.0060) Grad: 13.4756  LR: 0.00000744  
Epoch: [6][121/122] Elapsed 0m 52s (remain 0m 0s) Loss: 0.0000(0.0094) Grad: 8.4148  LR: 0.00000692  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 6 - avg_train_loss: 0.0094  avg_val_loss: 0.0225  time: 59s
Epoch 6 - Score: 0.9886


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0000(0.0225) 
Epoch: [7][0/122] Elapsed 0m 0s (remain 1m 18s) Loss: 0.0000(0.0000) Grad: 131.6944  LR: 0.00000690  
Epoch: [7][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0060) Grad: 12.7123  LR: 0.00000458  
Epoch: [7][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0000(0.0049) Grad: 17.8560  LR: 0.00000414  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 7 - avg_train_loss: 0.0049  avg_val_loss: 0.0220  time: 59s
Epoch 7 - Score: 0.9886


EVAL: [15/16] Elapsed 0m 6s (remain 0m 0s) Loss: 0.0000(0.0220) 
Epoch: [8][0/122] Elapsed 0m 0s (remain 1m 13s) Loss: 0.0000(0.0000) Grad: 29.8280  LR: 0.00000412  
Epoch: [8][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0057) Grad: 10.8387  LR: 0.00000225  
Epoch: [8][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0000(0.0047) Grad: 7.6137  LR: 0.00000192  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 8 - avg_train_loss: 0.0047  avg_val_loss: 0.0217  time: 59s
Epoch 8 - Score: 0.9886


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0000(0.0217) 
Epoch: [9][0/122] Elapsed 0m 0s (remain 1m 11s) Loss: 0.0000(0.0000) Grad: 12.2055  LR: 0.00000191  
Epoch: [9][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0000) Grad: 24.8430  LR: 0.00000068  
Epoch: [9][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0000(0.0046) Grad: 15.8450  LR: 0.00000050  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 9 - avg_train_loss: 0.0046  avg_val_loss: 0.0216  time: 59s
Epoch 9 - Score: 0.9886


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0000(0.0216) 
Epoch: [10][0/122] Elapsed 0m 0s (remain 1m 23s) Loss: 0.0000(0.0000) Grad: 15.3723  LR: 0.00000049  
Epoch: [10][100/122] Elapsed 0m 43s (remain 0m 8s) Loss: 0.0000(0.0056) Grad: 9.1943  LR: 0.00000002  
Epoch: [10][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0000(0.0046) Grad: 11.2089  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 10 - avg_train_loss: 0.0046  avg_val_loss: 0.0216  time: 59s
Epoch 10 - Score: 0.9886


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0000(0.0216) 


Score: 0.9980
XLMRobertaConfig {
  "_name_or_path": "intfloat/multilingual-e5-large",
  "architectures": [
    "XLMRobertaModel"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.39.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}



Epoch: [1][0/122] Elapsed 0m 0s (remain 1m 8s) Loss: 2.9829(2.9829) Grad: inf  LR: 0.00002000  
Epoch: [1][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.9364(0.6476) Grad: inf  LR: 0.00001966  
Epoch: [1][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0002(0.5483) Grad: 122.6515  LR: 0.00001951  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0006(0.0006) 


Epoch 1 - avg_train_loss: 0.5483  avg_val_loss: 0.0560  time: 59s
Epoch 1 - Score: 0.9778
Epoch 1 - Save Best Score: 0.9778 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0027(0.0560) 
Epoch: [2][0/122] Elapsed 0m 0s (remain 1m 20s) Loss: 0.0002(0.0002) Grad: 654.5746  LR: 0.00001950  
Epoch: [2][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0375) Grad: 121.8889  LR: 0.00001840  
Epoch: [2][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0000(0.0321) Grad: 13.5745  LR: 0.00001809  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 2 - avg_train_loss: 0.0321  avg_val_loss: 0.0610  time: 59s
Epoch 2 - Score: 0.9666


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0007(0.0610) 
Epoch: [3][0/122] Elapsed 0m 0s (remain 1m 8s) Loss: 0.4746(0.4746) Grad: inf  LR: 0.00001808  
Epoch: [3][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0198) Grad: 22.4677  LR: 0.00001631  
Epoch: [3][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0001(0.0281) Grad: 291.5529  LR: 0.00001588  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0001(0.0001) 


Epoch 3 - avg_train_loss: 0.0281  avg_val_loss: 0.0003  time: 59s
Epoch 3 - Score: 1.0000
Epoch 3 - Save Best Score: 1.0000 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0001(0.0003) 
Epoch: [4][0/122] Elapsed 0m 0s (remain 1m 18s) Loss: 0.0001(0.0001) Grad: 1012.1805  LR: 0.00001586  
Epoch: [4][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0020) Grad: 41.3591  LR: 0.00001361  
Epoch: [4][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0000(0.0126) Grad: 12.4848  LR: 0.00001310  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 4 - avg_train_loss: 0.0126  avg_val_loss: 0.0002  time: 59s
Epoch 4 - Score: 1.0000


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0001(0.0002) 
Epoch: [5][0/122] Elapsed 0m 0s (remain 1m 11s) Loss: 0.0000(0.0000) Grad: 68.5035  LR: 0.00001308  
Epoch: [5][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0129) Grad: 8.5847  LR: 0.00001055  
Epoch: [5][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0000(0.0106) Grad: 7.2427  LR: 0.00001001  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 5 - avg_train_loss: 0.0106  avg_val_loss: 0.0001  time: 59s
Epoch 5 - Score: 1.0000


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0000(0.0001) 
Epoch: [6][0/122] Elapsed 0m 0s (remain 1m 8s) Loss: 0.0000(0.0000) Grad: 28.5084  LR: 0.00000999  
Epoch: [6][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0131) Grad: 7.6769  LR: 0.00000744  
Epoch: [6][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0000(0.0109) Grad: 23.3733  LR: 0.00000692  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 6 - avg_train_loss: 0.0109  avg_val_loss: 0.0001  time: 59s
Epoch 6 - Score: 1.0000


EVAL: [15/16] Elapsed 0m 6s (remain 0m 0s) Loss: 0.0000(0.0001) 
Epoch: [7][0/122] Elapsed 0m 0s (remain 1m 14s) Loss: 0.0000(0.0000) Grad: 13.3162  LR: 0.00000690  
Epoch: [7][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0133) Grad: 6.4364  LR: 0.00000458  
Epoch: [7][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0000(0.0111) Grad: 6.1355  LR: 0.00000414  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 7 - avg_train_loss: 0.0111  avg_val_loss: 0.0000  time: 58s
Epoch 7 - Score: 1.0000


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0000(0.0000) 
Epoch: [8][0/122] Elapsed 0m 0s (remain 1m 12s) Loss: 0.0000(0.0000) Grad: 29.4504  LR: 0.00000412  
Epoch: [8][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0134) Grad: 11.5971  LR: 0.00000225  
Epoch: [8][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0000(0.0111) Grad: 8.7563  LR: 0.00000192  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 8 - avg_train_loss: 0.0111  avg_val_loss: 0.0000  time: 59s
Epoch 8 - Score: 1.0000


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0000(0.0000) 
Epoch: [9][0/122] Elapsed 0m 0s (remain 1m 11s) Loss: 0.0000(0.0000) Grad: 22.9407  LR: 0.00000191  
Epoch: [9][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0134) Grad: 5.9192  LR: 0.00000068  
Epoch: [9][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0000(0.0111) Grad: 5.3267  LR: 0.00000050  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 9 - avg_train_loss: 0.0111  avg_val_loss: 0.0000  time: 59s
Epoch 9 - Score: 1.0000


EVAL: [15/16] Elapsed 0m 6s (remain 0m 0s) Loss: 0.0000(0.0000) 
Epoch: [10][0/122] Elapsed 0m 0s (remain 1m 13s) Loss: 0.0000(0.0000) Grad: 17.7641  LR: 0.00000049  
Epoch: [10][100/122] Elapsed 0m 43s (remain 0m 8s) Loss: 0.0000(0.0134) Grad: 11.4387  LR: 0.00000002  
Epoch: [10][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0000(0.0111) Grad: 14.0370  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 10 - avg_train_loss: 0.0111  avg_val_loss: 0.0000  time: 59s
Epoch 10 - Score: 1.0000


EVAL: [15/16] Elapsed 0m 6s (remain 0m 0s) Loss: 0.0000(0.0000) 


Score: 1.0000
XLMRobertaConfig {
  "_name_or_path": "intfloat/multilingual-e5-large",
  "architectures": [
    "XLMRobertaModel"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.39.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}



Epoch: [1][0/122] Elapsed 0m 0s (remain 1m 14s) Loss: 2.1149(2.1149) Grad: inf  LR: 0.00002000  
Epoch: [1][100/122] Elapsed 0m 43s (remain 0m 8s) Loss: 0.2037(0.6734) Grad: 226627.6406  LR: 0.00001966  
Epoch: [1][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0057(0.5990) Grad: 2971.7019  LR: 0.00001951  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0245(0.0245) 


Epoch 1 - avg_train_loss: 0.5990  avg_val_loss: 0.0778  time: 59s
Epoch 1 - Score: 0.9822
Epoch 1 - Save Best Score: 0.9822 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0037(0.0778) 
Epoch: [2][0/122] Elapsed 0m 0s (remain 1m 10s) Loss: 0.0226(0.0226) Grad: 191911.7500  LR: 0.00001950  
Epoch: [2][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0002(0.0739) Grad: 99.0619  LR: 0.00001840  
Epoch: [2][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0005(0.0803) Grad: 216.0377  LR: 0.00001809  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0002(0.0002) 


Epoch 2 - avg_train_loss: 0.0803  avg_val_loss: 0.1134  time: 59s
Epoch 2 - Score: 0.9837
Epoch 2 - Save Best Score: 0.9837 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0003(0.1134) 
Epoch: [3][0/122] Elapsed 0m 0s (remain 1m 8s) Loss: 0.0003(0.0003) Grad: 3941.6265  LR: 0.00001808  
Epoch: [3][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0850) Grad: 52.5461  LR: 0.00001631  
Epoch: [3][121/122] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0000(0.0703) Grad: 6.1373  LR: 0.00001588  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 3 - avg_train_loss: 0.0703  avg_val_loss: 0.0085  time: 58s
Epoch 3 - Score: 0.9901
Epoch 3 - Save Best Score: 0.9901 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0000(0.0085) 
Epoch: [4][0/122] Elapsed 0m 0s (remain 1m 9s) Loss: 0.0000(0.0000) Grad: 35.2826  LR: 0.00001586  
Epoch: [4][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0394) Grad: 13.6164  LR: 0.00001361  
Epoch: [4][121/122] Elapsed 0m 50s (remain 0m 0s) Loss: 0.0000(0.0333) Grad: 7.1376  LR: 0.00001310  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 4 - avg_train_loss: 0.0333  avg_val_loss: 0.0542  time: 58s
Epoch 4 - Score: 0.9954
Epoch 4 - Save Best Score: 0.9954 Model


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0000(0.0542) 
Epoch: [5][0/122] Elapsed 0m 0s (remain 1m 7s) Loss: 0.0000(0.0000) Grad: 110.6040  LR: 0.00001308  
Epoch: [5][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0008) Grad: 11.7660  LR: 0.00001055  
Epoch: [5][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0000(0.0007) Grad: 7.6898  LR: 0.00001001  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 5 - avg_train_loss: 0.0007  avg_val_loss: 0.0503  time: 59s
Epoch 5 - Score: 0.9954


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0000(0.0503) 
Epoch: [6][0/122] Elapsed 0m 0s (remain 1m 16s) Loss: 0.0000(0.0000) Grad: 132.7482  LR: 0.00000999  
Epoch: [6][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0020) Grad: 8.4197  LR: 0.00000744  
Epoch: [6][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0000(0.0017) Grad: 7.7845  LR: 0.00000692  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 6 - avg_train_loss: 0.0017  avg_val_loss: 0.0515  time: 59s
Epoch 6 - Score: 0.9954


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0000(0.0515) 
Epoch: [7][0/122] Elapsed 0m 0s (remain 1m 41s) Loss: 0.0000(0.0000) Grad: 16.1647  LR: 0.00000690  
Epoch: [7][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0024) Grad: 12.6033  LR: 0.00000458  
Epoch: [7][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0000(0.0020) Grad: 16.3733  LR: 0.00000414  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 7 - avg_train_loss: 0.0020  avg_val_loss: 0.0519  time: 59s
Epoch 7 - Score: 0.9954


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0000(0.0519) 
Epoch: [8][0/122] Elapsed 0m 0s (remain 1m 12s) Loss: 0.0000(0.0000) Grad: 12.8488  LR: 0.00000412  
Epoch: [8][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.2590(0.0026) Grad: nan  LR: 0.00000225  
Epoch: [8][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0000(0.0021) Grad: 5.8987  LR: 0.00000192  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 8 - avg_train_loss: 0.0021  avg_val_loss: 0.0522  time: 59s
Epoch 8 - Score: 0.9954


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0000(0.0522) 
Epoch: [9][0/122] Elapsed 0m 0s (remain 1m 14s) Loss: 0.0000(0.0000) Grad: 30.7726  LR: 0.00000191  
Epoch: [9][100/122] Elapsed 0m 42s (remain 0m 8s) Loss: 0.0000(0.0026) Grad: 5.0068  LR: 0.00000068  
Epoch: [9][121/122] Elapsed 0m 51s (remain 0m 0s) Loss: 0.0000(0.0022) Grad: 7.7708  LR: 0.00000050  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 9 - avg_train_loss: 0.0022  avg_val_loss: 0.0522  time: 59s
Epoch 9 - Score: 0.9954


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0000(0.0522) 
Epoch: [10][0/122] Elapsed 0m 1s (remain 2m 58s) Loss: 0.0000(0.0000) Grad: 21.3994  LR: 0.00000049  
Epoch: [10][100/122] Elapsed 0m 43s (remain 0m 9s) Loss: 0.0000(0.0000) Grad: 18.6668  LR: 0.00000002  
Epoch: [10][121/122] Elapsed 0m 52s (remain 0m 0s) Loss: 0.0000(0.0022) Grad: 15.6176  LR: 0.00000000  
EVAL: [0/16] Elapsed 0m 0s (remain 0m 8s) Loss: 0.0000(0.0000) 


Epoch 10 - avg_train_loss: 0.0022  avg_val_loss: 0.0522  time: 59s
Epoch 10 - Score: 0.9954


EVAL: [15/16] Elapsed 0m 7s (remain 0m 0s) Loss: 0.0000(0.0522) 


Score: 0.9954
Score: 0.9946
