In [1]:
import os

OUTPUT_DIR = './labse_v4_good'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [2]:
class CFG:
    apex=True
    print_freq=100
    num_workers=10
    model="sentence-transformers/LaBSE"
    scheduler='linear' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=10
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=256
    fc_dropout=0.1
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    train=True
    extra=False

In [3]:
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
import ast
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true
# %env CUDA_LAUNCH_BLOCKING=1

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.13.3
transformers.__version__: 4.30.1
env: TOKENIZERS_PARALLELISM=true


In [4]:
def get_logger(filename=os.path.join(OUTPUT_DIR, "train")):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

In [5]:
import re

def probs2str(results, texts): # (4501, 70, 3)
    predictions = []
    classes = np.argmax(results, axis=-1)
    input_ids_list = [tokenizer(text, 
                           add_special_tokens=True,
                           max_length=CFG.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)["input_ids"] for text in texts]

    for clas, input_ids in zip(classes, input_ids_list):
        entities = convert_ids_to_entities(clas, input_ids)

        entities = list(itertools.chain(*entities))
        prediction = tokenizer.decode(entities, skip_special_tokens=True)
        prediction = re.sub(" - ", "-", prediction)
        prediction = re.sub(" & ", "&", prediction)
        prediction = re.sub("##", "", prediction)
        predictions.append(prediction)
    
    return predictions

def get_score_f1(y_true, y_pred):
    tp, fp, fn = 0, 0, 0
    acc = 0
    # print(y_pred)
    for tr, pr in zip(y_true, y_pred):
        if tr == pr: acc += 1
        pr = frozenset([pr])
        tr = frozenset([tr])

        tp += len(pr & tr)
        fp += len(pr - tr)
        fn += len(tr - pr)

    if tp == 0:
        return 0.0
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    
    print(f"Accuracy: {acc / len(y_true)}")
    score = 2 / (1 / precision + 1 / recall)

    return score

In [6]:
def convert_ids_to_entities(ids, tokens=None, o_id=0, b_id=1, i_id=2):
    entities = []
    entity = []
    is_entity_started = False
    for index, token in enumerate(ids):
        if not is_entity_started and token == b_id:
            is_entity_started = True
            entity.append(index)
        elif is_entity_started and token == i_id:
            entity.append(index)
    
        if (is_entity_started and token == o_id) or (index == (len(ids) - 1)):
            is_entity_started = False
            entities.append(entity)
            entity = []
                
    if tokens is not None:
        tokens = np.array(tokens)
        token_entities = []
        for entity in entities:
            entity = np.array(entity)
            if len(entity) > 0:
                token_entity = tokens[entity]
                token_entities.append(token_entity.tolist())
            
        return token_entities
            
    return entities

In [7]:
import re

def preprocess_text(text):
    text = text.lower()
    # text = re.sub("_", " ", text)
    # text = re.sub("\d{5,}", " ", text) # убираю длиньше чем есть в метках
    # text = re.sub("\d+[\.\,]?\d* *(см|гр|г|л|мл|кг|шт|мкм|м)", " ", text) # убираю единицы измерения
    # text = re.sub("\d+[\.\,]?\d*\s?%", " ", text) # процентное содержание в товарах
    # text = re.sub("\[.*\]", " ", text) # убираем скобки и содержимое
    # text = re.sub("<.+>", " ", text) # убираем скобки и содержимое
    # text = re.sub("{.+}", " ", text) # убираем скобки и содержимое
    # for el in ["\x07", "\t", "\n", "\x18", "\x1a", '"', ",", "/", ":", ";", '\\\\', '\|', '~', "\x7f", "\xa0", "°", "·", "є", "∙", "╣"]:
    #     text = re.sub(el, " ", text)

    return text.strip()

def get_good_pos(row):
    if row["good"] == "": return (0, 0)
    start = row["name"].lower().find(row["good"])
    if start == -1:
        end = -1
    else:
        end = start + len(row["good"])

    return (start, end)

In [8]:
df = pd.read_csv("train_supervised_dataset.csv")
df.good = df.good.replace("лента.", "лента")
df.good = df.good.replace("товара нет", np.nan)
df.good = df.good.replace("т,а,б,л,е,т,к,и", "таблетки")
df.good = df.good.replace('автокормушка", "автопоилка', "автокормушка,автопоилка")

df = df[["name", "good"]].fillna("")
df.name = df.name.apply(preprocess_text)
df["good_pos"] = df.apply(get_good_pos, axis=1)
df = df[df.good_pos != (-1,-1)].reset_index(drop=True)

df["strat"] = df.good_pos.apply(lambda x: 1 if x == (0, 0) else 0)
print(df.shape)
df.head()

(23412, 4)


Unnamed: 0,name,good,good_pos,strat
0,petmax бантик леопард с красн розой 2шт,бантик,"(7, 13)",0
1,87191 бусы для елки шарики_87191,бусы,"(6, 10)",0
2,футболка piazza italia wr011446881,футболка,"(0, 8)",0
3,7) yi572-03x-one заколка для волос для девочки,заколка,"(17, 24)",0
4,одежда (вес) 1500,одежда,"(0, 6)",0


In [9]:
Fold = StratifiedKFold(n_splits=CFG.n_fold)
for n, (train_index, val_index) in enumerate(Fold.split(df, df.strat.values)):
    df.loc[val_index, 'fold'] = int(n)
df['fold'] = df['fold'].astype(int)
display(df.groupby('fold').size())

fold
0    4683
1    4683
2    4682
3    4682
4    4682
dtype: int64

In [10]:
if CFG.extra:
    extra_data = pd.read_csv("unsup_pred_rubert.csv")
    extra_data = extra_data[["name", "good"]].fillna("")
    extra_data.name = extra_data.name.apply(preprocess_text)
    extra_data["good_pos"] = extra_data.apply(get_good_pos, axis=1)
    extra_data = extra_data[extra_data.good_pos != (-1,-1)].reset_index(drop=True)
    extra_data.head()

In [11]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, "tokenizer"))
CFG.tokenizer = tokenizer

In [12]:
lengths = []
tk0 = tqdm(df.name.values, total=len(df))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)

if CFG.extra:
    tk1 = tqdm(extra_data.name.values, total=len(extra_data))
    for text in tk1:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        lengths.append(length)

CFG.max_len = max(lengths) + 2 # cls & sep
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/23412 [00:00<?, ?it/s]

max_len: 66


In [13]:
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text, 
                           add_special_tokens=True,
                           max_length=CFG.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


def create_label(cfg, text, annotation, pos):
    # "O" -> 0
    # "B" -> 1
    # "I" -> 2
    encoded = cfg.tokenizer(text,
                            add_special_tokens=True,
                            max_length=CFG.max_len,
                            padding="max_length",
                            return_offsets_mapping=True)
    offset_mapping = encoded['offset_mapping']
    ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]
    label = np.zeros(len(offset_mapping))
    label[ignore_idxes] = -1

    if pos == (0, 0):
        return torch.tensor(label, dtype=torch.long)
    for i in range(len(offset_mapping)):
        if offset_mapping[i] == (0, 0):
            continue
        elif offset_mapping[i][0] == pos[0]:
            label[i] = 1
        elif offset_mapping[i][1] <= pos[1] and offset_mapping[i][0] >= pos[0]:
            label[i] = 2

    return torch.tensor(label, dtype=torch.long)

class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.name = df['name'].values
        self.good = df['good'].values
        self.good_pos = df['good_pos'].values

    def __len__(self):
        return len(self.name)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, 
                               self.name[item])
        label = create_label(self.cfg, 
                             self.name[item], 
                             self.good[item], 
                             self.good_pos[item])
        return inputs, label

In [14]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, 3)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        return last_hidden_states

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

In [15]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

In [16]:
def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        loss = criterion(y_preds.transpose(1, 2), labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg

In [17]:
def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.transpose(1, 2), labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.softmax(-1).to('cpu').numpy()) ####
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

In [18]:
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)

    if CFG.extra:
        train_folds = pd.concat([train_folds, extra_data], axis=0)
    
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_texts = valid_folds['name'].values
    valid_labels = valid_folds['good'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, os.path.join(OUTPUT_DIR, "config.pth"))
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler=='linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler=='cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.CrossEntropyLoss(ignore_index=-1)
    
    best_score = 0.0

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        # scoring
        preds = probs2str(predictions, valid_texts)
        score = get_score_f1(valid_labels, preds)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': np.argmax(predictions, axis=-1)},
                        os.path.join(OUTPUT_DIR, f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"))

    predictions = torch.load(os.path.join(OUTPUT_DIR, f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"), 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[i for i in range(CFG.max_len)]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [19]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        predictions = []
        classes = oof_df[[i for i in range(CFG.max_len)]].values.tolist()
        input_ids_list = [CFG.tokenizer(text, 
                           add_special_tokens=True,
                           max_length=CFG.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)["input_ids"] for text in oof_df.name.values]
        for clas, input_ids in zip(classes, input_ids_list):
            entities = convert_ids_to_entities(clas, input_ids)
    
            entities = list(itertools.chain(*entities))
            # entities = tokenizer.convert_ids_to_tokens(entities)
            # entities = [entity for entity in entities if entity != "[PAD]"]
            # prediction = tokenizer.convert_tokens_to_string(entities)
            prediction = tokenizer.decode(entities, skip_special_tokens=True)
            prediction = re.sub(" - ", "-", prediction)
            prediction = re.sub(" & ", "&", prediction)
            prediction = re.sub("##", "", prediction)
            predictions.append(prediction)

        score = get_score_f1(oof_df.good.values, predictions)
        LOGGER.info(f'Score: {score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(df, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(os.path.join(OUTPUT_DIR, "oof_df.pkl"))



Epoch: [1][0/73] Elapsed 0m 0s (remain 1m 7s) Loss: 1.0244(1.0244) Grad: nan  LR: 0.00001997  
Epoch: [1][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0505(0.2408) Grad: 6431.5391  LR: 0.00001800  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0545(0.0545) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.1391(0.0422) 


Epoch 1 - avg_train_loss: 0.2408  avg_val_loss: 0.0422  time: 23s
Epoch 1 - Score: 0.9216
Epoch 1 - Save Best Score: 0.9216 Model


Accuracy: 0.9216314328421952
Epoch: [2][0/73] Elapsed 0m 0s (remain 0m 30s) Loss: 0.0386(0.0386) Grad: nan  LR: 0.00001798  
Epoch: [2][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0405(0.0420) Grad: 10838.8818  LR: 0.00001601  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0504(0.0504) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.1165(0.0347) 


Epoch 2 - avg_train_loss: 0.0420  avg_val_loss: 0.0347  time: 23s
Epoch 2 - Score: 0.9357
Epoch 2 - Save Best Score: 0.9357 Model


Accuracy: 0.9357249626307922
Epoch: [3][0/73] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0276(0.0276) Grad: nan  LR: 0.00001598  
Epoch: [3][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0316(0.0321) Grad: 10996.6025  LR: 0.00001401  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0502(0.0502) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.1411(0.0349) 


Epoch 3 - avg_train_loss: 0.0321  avg_val_loss: 0.0349  time: 23s
Epoch 3 - Score: 0.9430
Epoch 3 - Save Best Score: 0.9430 Model


Accuracy: 0.942985265855221
Epoch: [4][0/73] Elapsed 0m 0s (remain 0m 30s) Loss: 0.0311(0.0311) Grad: nan  LR: 0.00001398  
Epoch: [4][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0295(0.0269) Grad: 14206.8525  LR: 0.00001201  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0500(0.0500) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.1300(0.0333) 


Epoch 4 - avg_train_loss: 0.0269  avg_val_loss: 0.0333  time: 23s
Epoch 4 - Score: 0.9455
Epoch 4 - Save Best Score: 0.9455 Model


Accuracy: 0.9455477258167841
Epoch: [5][0/73] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0221(0.0221) Grad: nan  LR: 0.00001198  
Epoch: [5][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0172(0.0237) Grad: nan  LR: 0.00001001  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0540(0.0540) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.1339(0.0338) 


Epoch 5 - avg_train_loss: 0.0237  avg_val_loss: 0.0338  time: 23s
Epoch 5 - Score: 0.9468
Epoch 5 - Save Best Score: 0.9468 Model


Accuracy: 0.9468289557975657
Epoch: [6][0/73] Elapsed 0m 0s (remain 0m 30s) Loss: 0.0237(0.0237) Grad: nan  LR: 0.00000999  
Epoch: [6][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0266(0.0214) Grad: 18426.9355  LR: 0.00000802  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0587(0.0587) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.1429(0.0348) 


Epoch 6 - avg_train_loss: 0.0214  avg_val_loss: 0.0348  time: 23s
Epoch 6 - Score: 0.9462


Accuracy: 0.9461883408071748
Epoch: [7][0/73] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0155(0.0155) Grad: nan  LR: 0.00000799  
Epoch: [7][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0220(0.0197) Grad: 11151.4600  LR: 0.00000602  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0581(0.0581) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.1502(0.0368) 


Epoch 7 - avg_train_loss: 0.0197  avg_val_loss: 0.0368  time: 23s
Epoch 7 - Score: 0.9451


Accuracy: 0.9451206491565236
Epoch: [8][0/73] Elapsed 0m 0s (remain 0m 30s) Loss: 0.0204(0.0204) Grad: nan  LR: 0.00000599  
Epoch: [8][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0181(0.0181) Grad: 12376.8428  LR: 0.00000402  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0611(0.0611) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.1454(0.0363) 


Epoch 8 - avg_train_loss: 0.0181  avg_val_loss: 0.0363  time: 23s
Epoch 8 - Score: 0.9485
Epoch 8 - Save Best Score: 0.9485 Model


Accuracy: 0.9485372624386077
Epoch: [9][0/73] Elapsed 0m 0s (remain 0m 30s) Loss: 0.0114(0.0114) Grad: nan  LR: 0.00000399  
Epoch: [9][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0118(0.0170) Grad: 9837.9834  LR: 0.00000202  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0616(0.0616) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.1452(0.0366) 


Epoch 9 - avg_train_loss: 0.0170  avg_val_loss: 0.0366  time: 23s
Epoch 9 - Score: 0.9485


Accuracy: 0.9485372624386077
Epoch: [10][0/73] Elapsed 0m 0s (remain 0m 30s) Loss: 0.0122(0.0122) Grad: nan  LR: 0.00000200  
Epoch: [10][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0215(0.0164) Grad: 11072.4062  LR: 0.00000003  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0610(0.0610) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.1455(0.0366) 


Epoch 10 - avg_train_loss: 0.0164  avg_val_loss: 0.0366  time: 23s
Epoch 10 - Score: 0.9483


Accuracy: 0.9483237241084774


Score: 0.9485


Accuracy: 0.9485372624386077
Epoch: [1][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 1.1738(1.1738) Grad: nan  LR: 0.00001997  
Epoch: [1][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0450(0.2535) Grad: 7033.1733  LR: 0.00001800  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0397(0.0397) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0162(0.0450) 


Epoch 1 - avg_train_loss: 0.2535  avg_val_loss: 0.0450  time: 23s
Epoch 1 - Score: 0.9174
Epoch 1 - Save Best Score: 0.9174 Model


Accuracy: 0.91736066623959
Epoch: [2][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0359(0.0359) Grad: nan  LR: 0.00001798  
Epoch: [2][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0349(0.0407) Grad: 11070.4512  LR: 0.00001601  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0362(0.0362) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0121(0.0361) 


Epoch 2 - avg_train_loss: 0.0407  avg_val_loss: 0.0361  time: 23s
Epoch 2 - Score: 0.9327
Epoch 2 - Save Best Score: 0.9327 Model


Accuracy: 0.9327354260089686
Epoch: [3][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0363(0.0363) Grad: nan  LR: 0.00001598  
Epoch: [3][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0458(0.0326) Grad: 21227.8984  LR: 0.00001401  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0345(0.0345) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0144(0.0358) 


Epoch 3 - avg_train_loss: 0.0326  avg_val_loss: 0.0358  time: 23s
Epoch 3 - Score: 0.9362
Epoch 3 - Save Best Score: 0.9362 Model


Accuracy: 0.9361520392910527
Epoch: [4][0/73] Elapsed 0m 0s (remain 0m 34s) Loss: 0.0239(0.0239) Grad: nan  LR: 0.00001398  
Epoch: [4][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0256(0.0273) Grad: 12708.5273  LR: 0.00001201  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0336(0.0336) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0141(0.0331) 


Epoch 4 - avg_train_loss: 0.0273  avg_val_loss: 0.0331  time: 23s
Epoch 4 - Score: 0.9387
Epoch 4 - Save Best Score: 0.9387 Model


Accuracy: 0.9387144992526159
Epoch: [5][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0170(0.0170) Grad: nan  LR: 0.00001198  
Epoch: [5][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0260(0.0240) Grad: 10664.3125  LR: 0.00001001  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0318(0.0318) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0126(0.0326) 


Epoch 5 - avg_train_loss: 0.0240  avg_val_loss: 0.0326  time: 23s
Epoch 5 - Score: 0.9421
Epoch 5 - Save Best Score: 0.9421 Model


Accuracy: 0.9421311125346999
Epoch: [6][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0204(0.0204) Grad: nan  LR: 0.00000999  
Epoch: [6][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0157(0.0212) Grad: 12257.4707  LR: 0.00000802  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0310(0.0310) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0115(0.0330) 


Epoch 6 - avg_train_loss: 0.0212  avg_val_loss: 0.0330  time: 23s
Epoch 6 - Score: 0.9419


Accuracy: 0.9419175742045697
Epoch: [7][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0118(0.0118) Grad: nan  LR: 0.00000799  
Epoch: [7][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0177(0.0191) Grad: 10462.3486  LR: 0.00000602  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0332(0.0332) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0122(0.0333) 


Epoch 7 - avg_train_loss: 0.0191  avg_val_loss: 0.0333  time: 23s
Epoch 7 - Score: 0.9447
Epoch 7 - Save Best Score: 0.9447 Model


Accuracy: 0.9446935724962631
Epoch: [8][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0158(0.0158) Grad: nan  LR: 0.00000599  
Epoch: [8][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0199(0.0176) Grad: 9720.4922  LR: 0.00000402  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0320(0.0320) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0134(0.0349) 


Epoch 8 - avg_train_loss: 0.0176  avg_val_loss: 0.0349  time: 23s
Epoch 8 - Score: 0.9423


Accuracy: 0.9423446508648302
Epoch: [9][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0116(0.0116) Grad: nan  LR: 0.00000399  
Epoch: [9][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0219(0.0169) Grad: 18970.9512  LR: 0.00000202  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0327(0.0327) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0131(0.0344) 


Epoch 9 - avg_train_loss: 0.0169  avg_val_loss: 0.0344  time: 23s
Epoch 9 - Score: 0.9430


Accuracy: 0.942985265855221
Epoch: [10][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0188(0.0188) Grad: nan  LR: 0.00000200  
Epoch: [10][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0127(0.0158) Grad: 9053.9492  LR: 0.00000003  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0325(0.0325) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0133(0.0345) 


Epoch 10 - avg_train_loss: 0.0158  avg_val_loss: 0.0345  time: 23s
Epoch 10 - Score: 0.9438


Accuracy: 0.9438394191757421


Score: 0.9447


Accuracy: 0.9446935724962631
Epoch: [1][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 1.1650(1.1650) Grad: nan  LR: 0.00001997  
Epoch: [1][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0494(0.2402) Grad: 7348.7065  LR: 0.00001800  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0504(0.0504) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0487(0.0444) 


Epoch 1 - avg_train_loss: 0.2402  avg_val_loss: 0.0444  time: 23s
Epoch 1 - Score: 0.9133
Epoch 1 - Save Best Score: 0.9133 Model


Accuracy: 0.9132849209739428
Epoch: [2][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0374(0.0374) Grad: nan  LR: 0.00001798  
Epoch: [2][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0424(0.0401) Grad: 11263.0615  LR: 0.00001601  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0440(0.0440) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0340(0.0372) 


Epoch 2 - avg_train_loss: 0.0401  avg_val_loss: 0.0372  time: 23s
Epoch 2 - Score: 0.9289
Epoch 2 - Save Best Score: 0.9289 Model


Accuracy: 0.928876548483554
Epoch: [3][0/73] Elapsed 0m 0s (remain 0m 33s) Loss: 0.0352(0.0352) Grad: nan  LR: 0.00001598  
Epoch: [3][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0457(0.0318) Grad: 11298.7227  LR: 0.00001401  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0407(0.0407) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0310(0.0335) 


Epoch 3 - avg_train_loss: 0.0318  avg_val_loss: 0.0335  time: 23s
Epoch 3 - Score: 0.9361
Epoch 3 - Save Best Score: 0.9361 Model


Accuracy: 0.9361384023921401
Epoch: [4][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0346(0.0346) Grad: nan  LR: 0.00001398  
Epoch: [4][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0282(0.0264) Grad: 10485.3125  LR: 0.00001201  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0428(0.0428) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0313(0.0343) 


Epoch 4 - avg_train_loss: 0.0264  avg_val_loss: 0.0343  time: 23s
Epoch 4 - Score: 0.9370
Epoch 4 - Save Best Score: 0.9370 Model


Accuracy: 0.9369927381460914
Epoch: [5][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0210(0.0210) Grad: nan  LR: 0.00001198  
Epoch: [5][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0274(0.0232) Grad: 10959.9199  LR: 0.00001001  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0411(0.0411) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0286(0.0324) 


Epoch 5 - avg_train_loss: 0.0232  avg_val_loss: 0.0324  time: 23s
Epoch 5 - Score: 0.9389
Epoch 5 - Save Best Score: 0.9389 Model


Accuracy: 0.9389149935924819
Epoch: [6][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0304(0.0304) Grad: nan  LR: 0.00000999  
Epoch: [6][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0119(0.0209) Grad: 9916.3828  LR: 0.00000802  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0405(0.0405) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0291(0.0326) 


Epoch 6 - avg_train_loss: 0.0209  avg_val_loss: 0.0326  time: 23s
Epoch 6 - Score: 0.9417
Epoch 6 - Save Best Score: 0.9417 Model


Accuracy: 0.9416915847928236
Epoch: [7][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0175(0.0175) Grad: nan  LR: 0.00000799  
Epoch: [7][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0241(0.0187) Grad: 12999.5771  LR: 0.00000602  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0417(0.0417) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0260(0.0335) 


Epoch 7 - avg_train_loss: 0.0187  avg_val_loss: 0.0335  time: 23s
Epoch 7 - Score: 0.9436
Epoch 7 - Save Best Score: 0.9436 Model


Accuracy: 0.943613840239214
Epoch: [8][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0162(0.0162) Grad: nan  LR: 0.00000599  
Epoch: [8][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0164(0.0176) Grad: 12011.6494  LR: 0.00000402  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0437(0.0437) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0269(0.0332) 


Epoch 8 - avg_train_loss: 0.0176  avg_val_loss: 0.0332  time: 23s
Epoch 8 - Score: 0.9451
Epoch 8 - Save Best Score: 0.9451 Model


Accuracy: 0.9451089278086288
Epoch: [9][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0178(0.0178) Grad: nan  LR: 0.00000399  
Epoch: [9][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0084(0.0162) Grad: 8920.5215  LR: 0.00000202  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0444(0.0444) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0256(0.0338) 


Epoch 9 - avg_train_loss: 0.0162  avg_val_loss: 0.0338  time: 23s
Epoch 9 - Score: 0.9455
Epoch 9 - Save Best Score: 0.9455 Model


Accuracy: 0.9455360956856045
Epoch: [10][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0141(0.0141) Grad: nan  LR: 0.00000200  
Epoch: [10][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0071(0.0156) Grad: 6704.8804  LR: 0.00000003  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0444(0.0444) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0255(0.0337) 


Epoch 10 - avg_train_loss: 0.0156  avg_val_loss: 0.0337  time: 23s
Epoch 10 - Score: 0.9445


Accuracy: 0.9444681759931653


Score: 0.9455


Accuracy: 0.9455360956856045
Epoch: [1][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.9561(0.9561) Grad: nan  LR: 0.00001997  
Epoch: [1][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0450(0.2282) Grad: 7400.3936  LR: 0.00001800  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0332(0.0332) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0419(0.0466) 


Epoch 1 - avg_train_loss: 0.2282  avg_val_loss: 0.0466  time: 23s
Epoch 1 - Score: 0.9090
Epoch 1 - Save Best Score: 0.9090 Model


Accuracy: 0.9090132422041862
Epoch: [2][0/73] Elapsed 0m 0s (remain 0m 33s) Loss: 0.0386(0.0386) Grad: nan  LR: 0.00001798  
Epoch: [2][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0350(0.0403) Grad: 13203.0801  LR: 0.00001601  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0271(0.0271) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0352(0.0394) 


Epoch 2 - avg_train_loss: 0.0403  avg_val_loss: 0.0394  time: 23s
Epoch 2 - Score: 0.9238
Epoch 2 - Save Best Score: 0.9238 Model


Accuracy: 0.9237505339598462
Epoch: [3][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0334(0.0334) Grad: nan  LR: 0.00001598  
Epoch: [3][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0453(0.0318) Grad: 14637.5000  LR: 0.00001401  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0249(0.0249) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0327(0.0363) 


Epoch 3 - avg_train_loss: 0.0318  avg_val_loss: 0.0363  time: 23s
Epoch 3 - Score: 0.9327
Epoch 3 - Save Best Score: 0.9327 Model


Accuracy: 0.9327210593763349
Epoch: [4][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0317(0.0317) Grad: nan  LR: 0.00001398  
Epoch: [4][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0309(0.0270) Grad: 12522.8916  LR: 0.00001201  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0228(0.0228) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0344(0.0335) 


Epoch 4 - avg_train_loss: 0.0270  avg_val_loss: 0.0335  time: 23s
Epoch 4 - Score: 0.9376
Epoch 4 - Save Best Score: 0.9376 Model


Accuracy: 0.9376334899615549
Epoch: [5][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0213(0.0213) Grad: nan  LR: 0.00001198  
Epoch: [5][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0327(0.0228) Grad: 10184.6074  LR: 0.00001001  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0239(0.0239) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0356(0.0339) 


Epoch 5 - avg_train_loss: 0.0228  avg_val_loss: 0.0339  time: 23s
Epoch 5 - Score: 0.9391
Epoch 5 - Save Best Score: 0.9391 Model


Accuracy: 0.9391285775309697
Epoch: [6][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0254(0.0254) Grad: nan  LR: 0.00000999  
Epoch: [6][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0170(0.0211) Grad: 12499.0176  LR: 0.00000802  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0232(0.0232) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0352(0.0351) 


Epoch 6 - avg_train_loss: 0.0211  avg_val_loss: 0.0351  time: 23s
Epoch 6 - Score: 0.9376


Accuracy: 0.9376334899615549
Epoch: [7][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0241(0.0241) Grad: nan  LR: 0.00000799  
Epoch: [7][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0117(0.0191) Grad: 8141.4263  LR: 0.00000602  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0225(0.0225) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0323(0.0341) 


Epoch 7 - avg_train_loss: 0.0191  avg_val_loss: 0.0341  time: 23s
Epoch 7 - Score: 0.9415
Epoch 7 - Save Best Score: 0.9415 Model


Accuracy: 0.9414780008543358
Epoch: [8][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0220(0.0220) Grad: nan  LR: 0.00000599  
Epoch: [8][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0163(0.0175) Grad: 14190.3916  LR: 0.00000402  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0224(0.0224) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0348(0.0357) 


Epoch 8 - avg_train_loss: 0.0175  avg_val_loss: 0.0357  time: 23s
Epoch 8 - Score: 0.9423
Epoch 8 - Save Best Score: 0.9423 Model


Accuracy: 0.9423323366082871
Epoch: [9][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0100(0.0100) Grad: nan  LR: 0.00000399  
Epoch: [9][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0090(0.0164) Grad: 7524.8320  LR: 0.00000202  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0223(0.0223) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0333(0.0356) 


Epoch 9 - avg_train_loss: 0.0164  avg_val_loss: 0.0356  time: 23s
Epoch 9 - Score: 0.9421


Accuracy: 0.9421187526697993
Epoch: [10][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0097(0.0097) Grad: nan  LR: 0.00000200  
Epoch: [10][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0121(0.0155) Grad: 8140.7256  LR: 0.00000003  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0220(0.0220) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0328(0.0358) 


Epoch 10 - avg_train_loss: 0.0155  avg_val_loss: 0.0358  time: 23s
Epoch 10 - Score: 0.9419


Accuracy: 0.9419051687313114


Score: 0.9423


Accuracy: 0.9423323366082871
Epoch: [1][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 1.2236(1.2236) Grad: nan  LR: 0.00001997  
Epoch: [1][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0392(0.2476) Grad: 6126.8633  LR: 0.00001800  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0634(0.0634) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0388(0.0453) 


Epoch 1 - avg_train_loss: 0.2476  avg_val_loss: 0.0453  time: 23s
Epoch 1 - Score: 0.9090
Epoch 1 - Save Best Score: 0.9090 Model


Accuracy: 0.9090132422041862
Epoch: [2][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0404(0.0404) Grad: nan  LR: 0.00001798  
Epoch: [2][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0506(0.0409) Grad: 14775.3203  LR: 0.00001601  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0461(0.0461) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0321(0.0372) 


Epoch 2 - avg_train_loss: 0.0409  avg_val_loss: 0.0372  time: 23s
Epoch 2 - Score: 0.9310
Epoch 2 - Save Best Score: 0.9310 Model


Accuracy: 0.9310123878684323
Epoch: [3][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0290(0.0290) Grad: nan  LR: 0.00001598  
Epoch: [3][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0219(0.0325) Grad: 9758.8691  LR: 0.00001401  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0419(0.0419) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0325(0.0329) 


Epoch 3 - avg_train_loss: 0.0325  avg_val_loss: 0.0329  time: 23s
Epoch 3 - Score: 0.9370
Epoch 3 - Save Best Score: 0.9370 Model


Accuracy: 0.9369927381460914
Epoch: [4][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0233(0.0233) Grad: nan  LR: 0.00001398  
Epoch: [4][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0219(0.0278) Grad: 9229.1758  LR: 0.00001201  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0381(0.0381) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0335(0.0323) 


Epoch 4 - avg_train_loss: 0.0278  avg_val_loss: 0.0323  time: 23s
Epoch 4 - Score: 0.9378
Epoch 4 - Save Best Score: 0.9378 Model


Accuracy: 0.9378470739000427
Epoch: [5][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0119(0.0119) Grad: nan  LR: 0.00001198  
Epoch: [5][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0402(0.0241) Grad: 14497.2402  LR: 0.00001001  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0348(0.0348) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0265(0.0307) 


Epoch 5 - avg_train_loss: 0.0241  avg_val_loss: 0.0307  time: 23s
Epoch 5 - Score: 0.9408
Epoch 5 - Save Best Score: 0.9408 Model


Accuracy: 0.9408372490388722
Epoch: [6][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0284(0.0284) Grad: nan  LR: 0.00000999  
Epoch: [6][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0239(0.0207) Grad: 10765.4434  LR: 0.00000802  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0337(0.0337) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0291(0.0323) 


Epoch 6 - avg_train_loss: 0.0207  avg_val_loss: 0.0323  time: 23s
Epoch 6 - Score: 0.9417
Epoch 6 - Save Best Score: 0.9417 Model


Accuracy: 0.9416915847928236
Epoch: [7][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0217(0.0217) Grad: nan  LR: 0.00000799  
Epoch: [7][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0246(0.0191) Grad: 15332.1729  LR: 0.00000602  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0339(0.0339) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0286(0.0322) 


Epoch 7 - avg_train_loss: 0.0191  avg_val_loss: 0.0322  time: 23s
Epoch 7 - Score: 0.9411


Accuracy: 0.9410508329773601
Epoch: [8][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0191(0.0191) Grad: nan  LR: 0.00000599  
Epoch: [8][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0310(0.0180) Grad: 20850.0391  LR: 0.00000402  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0320(0.0320) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0288(0.0327) 


Epoch 8 - avg_train_loss: 0.0180  avg_val_loss: 0.0327  time: 23s
Epoch 8 - Score: 0.9451
Epoch 8 - Save Best Score: 0.9451 Model


Accuracy: 0.9451089278086288
Epoch: [9][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0108(0.0108) Grad: nan  LR: 0.00000399  
Epoch: [9][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0267(0.0164) Grad: 13691.5332  LR: 0.00000202  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0324(0.0324) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0287(0.0328) 


Epoch 9 - avg_train_loss: 0.0164  avg_val_loss: 0.0328  time: 23s
Epoch 9 - Score: 0.9462
Epoch 9 - Save Best Score: 0.9462 Model


Accuracy: 0.9461768475010679
Epoch: [10][0/73] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0226(0.0226) Grad: nan  LR: 0.00000200  
Epoch: [10][72/73] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0217(0.0157) Grad: 13092.1074  LR: 0.00000003  
EVAL: [0/19] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0317(0.0317) 
EVAL: [18/19] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0298(0.0333) 


Epoch 10 - avg_train_loss: 0.0157  avg_val_loss: 0.0333  time: 23s
Epoch 10 - Score: 0.9455


Accuracy: 0.9455360956856045


Score: 0.9462


Accuracy: 0.9461768475010679


Score: 0.9455


Accuracy: 0.9454553220570647


In [20]:
# 0.9367
# 0.9427