In [1]:
import os

OUTPUT_DIR = './labse_v4_brand'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [2]:
class CFG:
    apex=True
    print_freq=100
    num_workers=10
    model="sentence-transformers/LaBSE"
    scheduler='linear' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=10
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=256
    fc_dropout=0.1
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    train=True
    extra=False

In [3]:
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
import ast
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig, T5EncoderModel
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true
# %env CUDA_LAUNCH_BLOCKING=1

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.13.3
transformers.__version__: 4.30.1
env: TOKENIZERS_PARALLELISM=true


In [4]:
def get_logger(filename=os.path.join(OUTPUT_DIR, "train")):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

In [5]:
import re

def probs2str(results, texts): # (4501, 70, 3)
    predictions = []
    classes = np.argmax(results, axis=-1)
    input_ids_list = [tokenizer(text, 
                           add_special_tokens=True,
                           max_length=CFG.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)["input_ids"] for text in texts]

    for clas, input_ids in zip(classes, input_ids_list):
        entities = convert_ids_to_entities(clas, input_ids)

        entities = list(itertools.chain(*entities))
        # entities = tokenizer.convert_ids_to_tokens(entities)
        # entities = [entity for entity in entities if (entity != "[PAD]") and (entity != "[SEP]")]
        # prediction = tokenizer.convert_tokens_to_string(entities)
        prediction = tokenizer.decode(entities, skip_special_tokens=True)
        prediction = re.sub(" - ", "-", prediction)
        prediction = re.sub(" & ", "&", prediction)
        prediction = re.sub("##", "", prediction)
        predictions.append(prediction)
    
    return predictions

def get_score_f1(y_true, y_pred):
    tp, fp, fn = 0, 0, 0
    acc = 0
    for tr, pr in zip(y_true, y_pred):
        if tr == pr: acc += 1
        pr = frozenset([pr])
        tr = frozenset([tr])

        tp += len(pr & tr)
        fp += len(pr - tr)
        fn += len(tr - pr)

    if tp == 0:
        return 0.0
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    print(f"Accuracy: {acc / len(y_true)}")
    score = 2 / (1 / precision + 1 / recall)

    return score

In [6]:
def convert_ids_to_entities(ids, tokens=None, o_id=0, b_id=1, i_id=2):
    entities = []
    entity = []
    is_entity_started = False
    for index, token in enumerate(ids):
        if not is_entity_started and token == b_id:
            is_entity_started = True
            entity.append(index)
        elif is_entity_started and token == i_id:
            entity.append(index)
    
        if (is_entity_started and token == o_id) or (index == (len(ids) - 1)):
            is_entity_started = False
            entities.append(entity)
            entity = []
                
    if tokens is not None:
        tokens = np.array(tokens)
        token_entities = []
        for entity in entities:
            entity = np.array(entity)
            if len(entity) > 0:
                token_entity = tokens[entity]
                token_entities.append(token_entity.tolist())
            
        return token_entities
            
    return entities

In [7]:
import re

def preprocess_text(text):
    text = text.lower()
    # text = re.sub("_", " ", text) # v2e
    # text = re.sub("\d{5,}", " ", text) # убираю длиньше чем есть в метках
    # text = re.sub("\d+[\.\,]?\d* *(см|гр|г|л|мл|кг|шт|мкм|м)", " ", text) # убираю единицы измерения
    # text = re.sub("\d+[\.\,]?\d*\s?%", " ", text) # процентное содержание в товарах
    # text = re.sub("\[.*\]", " ", text) # убираем скобки и содержимое
    # text = re.sub("<.+>", " ", text) # убираем скобки и содержимое
    # text = re.sub("{.+}", " ", text) # убираем скобки и содержимое

    return text.strip()

def get_brand_pos(row):
    if row["brand"] == "": return (0, 0)
    start = row["name"].lower().find(row["brand"])
    if start == -1:
        end = -1
    else:
        end = start + len(row["brand"])

    return (start, end)

In [8]:
df = pd.read_csv("train_supervised_dataset.csv")
df = df[["name", "brand"]].fillna("")
df.name = df.name.apply(preprocess_text)
df["brand_pos"] = df.apply(get_brand_pos, axis=1)
df = df[df.brand_pos != (-1,-1)].reset_index(drop=True)

df["strat"] = df.brand_pos.apply(lambda x: 1 if x == (0, 0) else 0)
print(df.shape)
df.head()

(22550, 4)


Unnamed: 0,name,brand,brand_pos,strat
0,petmax бантик леопард с красн розой 2шт,petmax,"(0, 6)",0
1,87191 бусы для елки шарики_87191,,"(0, 0)",1
2,футболка piazza italia wr011446881,piazza italia,"(9, 22)",0
3,7) yi572-03x-one заколка для волос для девочки,,"(0, 0)",1
4,одежда (вес) 1500,,"(0, 0)",1


In [9]:
Fold = StratifiedKFold(n_splits=CFG.n_fold)
for n, (train_index, val_index) in enumerate(Fold.split(df, df.strat.values)):
    df.loc[val_index, 'fold'] = int(n)
df['fold'] = df['fold'].astype(int)
display(df.groupby('fold').size())

fold
0    4510
1    4510
2    4510
3    4510
4    4510
dtype: int64

In [10]:
if CFG.extra:
    extra_data = pd.read_csv("unsup_pred_rubert.csv")
    extra_data = extra_data[["name", "brand"]].fillna("")
    extra_data.name = extra_data.name.apply(preprocess_text)
    extra_data["brand_pos"] = extra_data.apply(get_brand_pos, axis=1)
    extra_data = extra_data[extra_data.brand_pos != (-1,-1)].reset_index(drop=True)
    extra_data.head()

In [11]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, "tokenizer"))
CFG.tokenizer = tokenizer

In [12]:
lengths = []
tk0 = tqdm(df.name.values, total=len(df))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)

if CFG.extra:
    tk1 = tqdm(extra_data.name.values, total=len(extra_data))
    for text in tk1:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        lengths.append(length)

CFG.max_len = max(lengths) + 2 # cls & sep
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/22550 [00:00<?, ?it/s]

max_len: 66


In [13]:
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text, 
                           add_special_tokens=True,
                           max_length=CFG.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


def create_label(cfg, text, annotation, pos):
    # "O" -> 0
    # "B" -> 1
    # "I" -> 2
    encoded = cfg.tokenizer(text,
                            add_special_tokens=True,
                            max_length=CFG.max_len,
                            padding="max_length",
                            return_offsets_mapping=True)
    offset_mapping = encoded['offset_mapping']
    ignore_idxes = np.where(np.array(encoded.sequence_ids()) != 0)[0]
    label = np.zeros(len(offset_mapping))
    label[ignore_idxes] = -1

    if pos == (0, 0):
        return torch.tensor(label, dtype=torch.long)
    for i in range(len(offset_mapping)):
        if offset_mapping[i] == (0, 0):
            continue
        elif offset_mapping[i][0] == pos[0]:
            label[i] = 1
        elif offset_mapping[i][1] <= pos[1] and offset_mapping[i][0] >= pos[0]:
            label[i] = 2

    return torch.tensor(label, dtype=torch.long)

class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.name = df['name'].values
        self.brand = df['brand'].values
        self.brand_pos = df['brand_pos'].values

    def __len__(self):
        return len(self.name)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, 
                               self.name[item])
        label = create_label(self.cfg, 
                             self.name[item], 
                             self.brand[item], 
                             self.brand_pos[item])
        return inputs, label

In [14]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, 3)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        return last_hidden_states

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

In [15]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

In [16]:
def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        loss = criterion(y_preds.transpose(1, 2), labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg

In [17]:
def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.transpose(1, 2), labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.softmax(-1).to('cpu').numpy()) ####
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

In [18]:
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)

    if CFG.extra:
        train_folds = pd.concat([train_folds, extra_data], axis=0)
    
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_texts = valid_folds['name'].values
    valid_labels = valid_folds['brand'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, os.path.join(OUTPUT_DIR, "config.pth"))
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler=='linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler=='cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.CrossEntropyLoss(ignore_index=-1)
    
    best_score = 0.0

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        # scoring
        preds = probs2str(predictions, valid_texts)
        score = get_score_f1(valid_labels, preds)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': np.argmax(predictions, axis=-1)},
                        os.path.join(OUTPUT_DIR, f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"))

    predictions = torch.load(os.path.join(OUTPUT_DIR, f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"), 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[i for i in range(CFG.max_len)]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [19]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        predictions = []
        classes = oof_df[[i for i in range(CFG.max_len)]].values.tolist()
        input_ids_list = [CFG.tokenizer(text, 
                           add_special_tokens=True,
                           max_length=CFG.max_len,
                           padding="max_length",
                           return_offsets_mapping=False)["input_ids"] for text in oof_df.name.values]
        for clas, input_ids in zip(classes, input_ids_list):
            entities = convert_ids_to_entities(clas, input_ids)
    
            entities = list(itertools.chain(*entities))
            # entities = tokenizer.convert_ids_to_tokens(entities)
            # entities = [entity for entity in entities if entity != "[PAD]"]
            # prediction = tokenizer.convert_tokens_to_string(entities)
            prediction = tokenizer.decode(entities, skip_special_tokens=True)
            prediction = re.sub(" - ", "-", prediction)
            prediction = re.sub(" & ", "&", prediction)
            prediction = re.sub("##", "", prediction)
            predictions.append(prediction)

        score = get_score_f1(oof_df.brand.values, predictions)
        LOGGER.info(f'Score: {score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(df, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(os.path.join(OUTPUT_DIR, "oof_df.pkl"))



Epoch: [1][0/70] Elapsed 0m 1s (remain 1m 11s) Loss: 1.0273(1.0273) Grad: nan  LR: 0.00001997  
Epoch: [1][69/70] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0976(0.2759) Grad: 10076.9023  LR: 0.00001801  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.1009(0.1009) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0956(0.0999) 


Epoch 1 - avg_train_loss: 0.2759  avg_val_loss: 0.0999  time: 23s
Epoch 1 - Score: 0.7497
Epoch 1 - Save Best Score: 0.7497 Model


Accuracy: 0.7496674057649667
Epoch: [2][0/70] Elapsed 0m 0s (remain 0m 30s) Loss: 0.1114(0.1114) Grad: nan  LR: 0.00001798  
Epoch: [2][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0762(0.0935) Grad: 16474.7168  LR: 0.00001602  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0782(0.0782) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0791(0.0824) 


Epoch 2 - avg_train_loss: 0.0935  avg_val_loss: 0.0824  time: 22s
Epoch 2 - Score: 0.7960
Epoch 2 - Save Best Score: 0.7960 Model


Accuracy: 0.7960088691796009
Epoch: [3][0/70] Elapsed 0m 0s (remain 0m 30s) Loss: 0.1038(0.1038) Grad: nan  LR: 0.00001599  
Epoch: [3][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0732(0.0769) Grad: 17449.3750  LR: 0.00001403  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0696(0.0696) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0654(0.0774) 


Epoch 3 - avg_train_loss: 0.0769  avg_val_loss: 0.0774  time: 23s
Epoch 3 - Score: 0.8129
Epoch 3 - Save Best Score: 0.8129 Model


Accuracy: 0.812860310421286
Epoch: [4][0/70] Elapsed 0m 0s (remain 0m 30s) Loss: 0.0720(0.0720) Grad: nan  LR: 0.00001401  
Epoch: [4][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0611(0.0666) Grad: 17592.6348  LR: 0.00001205  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0648(0.0648) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0635(0.0725) 


Epoch 4 - avg_train_loss: 0.0666  avg_val_loss: 0.0725  time: 23s
Epoch 4 - Score: 0.8233
Epoch 4 - Save Best Score: 0.8233 Model


Accuracy: 0.8232815964523281
Epoch: [5][0/70] Elapsed 0m 0s (remain 0m 30s) Loss: 0.0668(0.0668) Grad: nan  LR: 0.00001202  
Epoch: [5][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0569(0.0584) Grad: 18966.6621  LR: 0.00001006  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0643(0.0643) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0600(0.0719) 


Epoch 5 - avg_train_loss: 0.0584  avg_val_loss: 0.0719  time: 23s
Epoch 5 - Score: 0.8341
Epoch 5 - Save Best Score: 0.8341 Model


Accuracy: 0.8341463414634146
Epoch: [6][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0441(0.0441) Grad: nan  LR: 0.00001003  
Epoch: [6][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0537(0.0537) Grad: 16371.0225  LR: 0.00000807  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0618(0.0618) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0599(0.0724) 


Epoch 6 - avg_train_loss: 0.0537  avg_val_loss: 0.0724  time: 23s
Epoch 6 - Score: 0.8384
Epoch 6 - Save Best Score: 0.8384 Model


Accuracy: 0.8383592017738359
Epoch: [7][0/70] Elapsed 0m 0s (remain 0m 30s) Loss: 0.0540(0.0540) Grad: nan  LR: 0.00000804  
Epoch: [7][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0468(0.0483) Grad: 18967.0449  LR: 0.00000608  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0602(0.0602) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0598(0.0721) 


Epoch 7 - avg_train_loss: 0.0483  avg_val_loss: 0.0721  time: 23s
Epoch 7 - Score: 0.8406
Epoch 7 - Save Best Score: 0.8406 Model


Accuracy: 0.8405764966740576
Epoch: [8][0/70] Elapsed 0m 0s (remain 0m 30s) Loss: 0.0381(0.0381) Grad: nan  LR: 0.00000605  
Epoch: [8][69/70] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0449(0.0454) Grad: 20356.7109  LR: 0.00000409  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0602(0.0602) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0598(0.0743) 


Epoch 8 - avg_train_loss: 0.0454  avg_val_loss: 0.0743  time: 23s
Epoch 8 - Score: 0.8439
Epoch 8 - Save Best Score: 0.8439 Model


Accuracy: 0.8439024390243902
Epoch: [9][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0504(0.0504) Grad: nan  LR: 0.00000406  
Epoch: [9][69/70] Elapsed 0m 19s (remain 0m 0s) Loss: 0.0469(0.0430) Grad: 22693.1641  LR: 0.00000210  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0604(0.0604) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0603(0.0739) 


Epoch 9 - avg_train_loss: 0.0430  avg_val_loss: 0.0739  time: 23s
Epoch 9 - Score: 0.8481
Epoch 9 - Save Best Score: 0.8481 Model


Accuracy: 0.8481152993348116
Epoch: [10][0/70] Elapsed 0m 0s (remain 0m 29s) Loss: 0.0364(0.0364) Grad: nan  LR: 0.00000207  
Epoch: [10][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0348(0.0420) Grad: 17028.9727  LR: 0.00000011  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0598(0.0598) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0600(0.0742) 


Epoch 10 - avg_train_loss: 0.0420  avg_val_loss: 0.0742  time: 22s
Epoch 10 - Score: 0.8475


Accuracy: 0.847450110864745


Score: 0.8481


Accuracy: 0.8481152993348116
Epoch: [1][0/70] Elapsed 0m 0s (remain 0m 29s) Loss: 1.1787(1.1787) Grad: nan  LR: 0.00001997  
Epoch: [1][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.1140(0.2842) Grad: 9645.5537  LR: 0.00001801  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0977(0.0977) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.1115(0.0999) 


Epoch 1 - avg_train_loss: 0.2842  avg_val_loss: 0.0999  time: 22s
Epoch 1 - Score: 0.7457
Epoch 1 - Save Best Score: 0.7457 Model


Accuracy: 0.7456762749445677
Epoch: [2][0/70] Elapsed 0m 0s (remain 0m 30s) Loss: 0.1159(0.1159) Grad: nan  LR: 0.00001798  
Epoch: [2][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0937(0.0954) Grad: 20945.8965  LR: 0.00001602  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0769(0.0769) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.1014(0.0802) 


Epoch 2 - avg_train_loss: 0.0954  avg_val_loss: 0.0802  time: 22s
Epoch 2 - Score: 0.7945
Epoch 2 - Save Best Score: 0.7945 Model


Accuracy: 0.7944567627494457
Epoch: [3][0/70] Elapsed 0m 0s (remain 0m 30s) Loss: 0.0877(0.0877) Grad: nan  LR: 0.00001599  
Epoch: [3][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0908(0.0782) Grad: 28223.1035  LR: 0.00001403  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0694(0.0694) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.1007(0.0736) 


Epoch 3 - avg_train_loss: 0.0782  avg_val_loss: 0.0736  time: 22s
Epoch 3 - Score: 0.8177
Epoch 3 - Save Best Score: 0.8177 Model


Accuracy: 0.8177383592017738
Epoch: [4][0/70] Elapsed 0m 0s (remain 0m 30s) Loss: 0.0563(0.0563) Grad: nan  LR: 0.00001401  
Epoch: [4][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0621(0.0659) Grad: 19802.7891  LR: 0.00001205  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0678(0.0678) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0978(0.0716) 


Epoch 4 - avg_train_loss: 0.0659  avg_val_loss: 0.0716  time: 22s
Epoch 4 - Score: 0.8257
Epoch 4 - Save Best Score: 0.8257 Model


Accuracy: 0.825720620842572
Epoch: [5][0/70] Elapsed 0m 0s (remain 0m 29s) Loss: 0.0626(0.0626) Grad: nan  LR: 0.00001202  
Epoch: [5][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0429(0.0587) Grad: 13316.1592  LR: 0.00001006  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0680(0.0680) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0999(0.0711) 


Epoch 5 - avg_train_loss: 0.0587  avg_val_loss: 0.0711  time: 22s
Epoch 5 - Score: 0.8337
Epoch 5 - Save Best Score: 0.8337 Model


Accuracy: 0.8337028824833703
Epoch: [6][0/70] Elapsed 0m 0s (remain 0m 29s) Loss: 0.0552(0.0552) Grad: nan  LR: 0.00001003  
Epoch: [6][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0389(0.0530) Grad: 18567.4512  LR: 0.00000807  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0735(0.0735) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.1055(0.0727) 


Epoch 6 - avg_train_loss: 0.0530  avg_val_loss: 0.0727  time: 22s
Epoch 6 - Score: 0.8379
Epoch 6 - Save Best Score: 0.8379 Model


Accuracy: 0.8379157427937916
Epoch: [7][0/70] Elapsed 0m 0s (remain 0m 29s) Loss: 0.0478(0.0478) Grad: nan  LR: 0.00000804  
Epoch: [7][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0501(0.0490) Grad: 18513.2148  LR: 0.00000608  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0709(0.0709) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.1016(0.0718) 


Epoch 7 - avg_train_loss: 0.0490  avg_val_loss: 0.0718  time: 22s
Epoch 7 - Score: 0.8381
Epoch 7 - Save Best Score: 0.8381 Model


Accuracy: 0.8381374722838137
Epoch: [8][0/70] Elapsed 0m 0s (remain 0m 29s) Loss: 0.0313(0.0313) Grad: nan  LR: 0.00000605  
Epoch: [8][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0377(0.0452) Grad: 16091.1514  LR: 0.00000409  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0728(0.0728) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.1014(0.0720) 


Epoch 8 - avg_train_loss: 0.0452  avg_val_loss: 0.0720  time: 22s
Epoch 8 - Score: 0.8408
Epoch 8 - Save Best Score: 0.8408 Model


Accuracy: 0.8407982261640798
Epoch: [9][0/70] Elapsed 0m 0s (remain 0m 29s) Loss: 0.0380(0.0380) Grad: nan  LR: 0.00000406  
Epoch: [9][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0441(0.0438) Grad: 21531.3730  LR: 0.00000210  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0731(0.0731) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.1023(0.0730) 


Epoch 9 - avg_train_loss: 0.0438  avg_val_loss: 0.0730  time: 22s
Epoch 9 - Score: 0.8437
Epoch 9 - Save Best Score: 0.8437 Model


Accuracy: 0.843680709534368
Epoch: [10][0/70] Elapsed 0m 0s (remain 0m 29s) Loss: 0.0513(0.0513) Grad: nan  LR: 0.00000207  
Epoch: [10][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0340(0.0425) Grad: 12786.0098  LR: 0.00000011  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0741(0.0741) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.1041(0.0733) 


Epoch 10 - avg_train_loss: 0.0425  avg_val_loss: 0.0733  time: 22s
Epoch 10 - Score: 0.8437


Accuracy: 0.843680709534368


Score: 0.8437


Accuracy: 0.843680709534368
Epoch: [1][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 1.1777(1.1777) Grad: nan  LR: 0.00001997  
Epoch: [1][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.1190(0.2936) Grad: 8814.3916  LR: 0.00001801  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.1082(0.1082) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0787(0.1071) 


Epoch 1 - avg_train_loss: 0.2936  avg_val_loss: 0.1071  time: 22s
Epoch 1 - Score: 0.7554
Epoch 1 - Save Best Score: 0.7554 Model


Accuracy: 0.7554323725055433
Epoch: [2][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.1034(0.1034) Grad: nan  LR: 0.00001798  
Epoch: [2][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0829(0.0935) Grad: 16771.7891  LR: 0.00001602  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0879(0.0879) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0617(0.0855) 


Epoch 2 - avg_train_loss: 0.0935  avg_val_loss: 0.0855  time: 22s
Epoch 2 - Score: 0.7947
Epoch 2 - Save Best Score: 0.7947 Model


Accuracy: 0.7946784922394678
Epoch: [3][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0871(0.0871) Grad: nan  LR: 0.00001599  
Epoch: [3][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0763(0.0760) Grad: 21244.5254  LR: 0.00001403  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0879(0.0879) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0575(0.0809) 


Epoch 3 - avg_train_loss: 0.0760  avg_val_loss: 0.0809  time: 22s
Epoch 3 - Score: 0.8124
Epoch 3 - Save Best Score: 0.8124 Model


Accuracy: 0.8124168514412416
Epoch: [4][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0727(0.0727) Grad: nan  LR: 0.00001401  
Epoch: [4][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0666(0.0655) Grad: 21042.4395  LR: 0.00001205  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0854(0.0854) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0590(0.0782) 


Epoch 4 - avg_train_loss: 0.0655  avg_val_loss: 0.0782  time: 22s
Epoch 4 - Score: 0.8239
Epoch 4 - Save Best Score: 0.8239 Model


Accuracy: 0.8239467849223947
Epoch: [5][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0764(0.0764) Grad: nan  LR: 0.00001202  
Epoch: [5][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0528(0.0585) Grad: 18091.9727  LR: 0.00001006  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0839(0.0839) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0605(0.0768) 


Epoch 5 - avg_train_loss: 0.0585  avg_val_loss: 0.0768  time: 22s
Epoch 5 - Score: 0.8279
Epoch 5 - Save Best Score: 0.8279 Model


Accuracy: 0.8279379157427937
Epoch: [6][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0468(0.0468) Grad: nan  LR: 0.00001003  
Epoch: [6][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0539(0.0532) Grad: 17755.1426  LR: 0.00000807  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0843(0.0843) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0570(0.0758) 


Epoch 6 - avg_train_loss: 0.0532  avg_val_loss: 0.0758  time: 22s
Epoch 6 - Score: 0.8364
Epoch 6 - Save Best Score: 0.8364 Model


Accuracy: 0.8363636363636363
Epoch: [7][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0542(0.0542) Grad: nan  LR: 0.00000804  
Epoch: [7][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0544(0.0480) Grad: 20246.7578  LR: 0.00000608  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0846(0.0846) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0574(0.0765) 


Epoch 7 - avg_train_loss: 0.0480  avg_val_loss: 0.0765  time: 22s
Epoch 7 - Score: 0.8377
Epoch 7 - Save Best Score: 0.8377 Model


Accuracy: 0.8376940133037694
Epoch: [8][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0328(0.0328) Grad: nan  LR: 0.00000605  
Epoch: [8][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0432(0.0452) Grad: 19368.6309  LR: 0.00000409  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0885(0.0885) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0612(0.0789) 


Epoch 8 - avg_train_loss: 0.0452  avg_val_loss: 0.0789  time: 22s
Epoch 8 - Score: 0.8415
Epoch 8 - Save Best Score: 0.8415 Model


Accuracy: 0.8414634146341463
Epoch: [9][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0453(0.0453) Grad: nan  LR: 0.00000406  
Epoch: [9][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0418(0.0427) Grad: 16723.3027  LR: 0.00000210  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0889(0.0889) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0611(0.0807) 


Epoch 9 - avg_train_loss: 0.0427  avg_val_loss: 0.0807  time: 22s
Epoch 9 - Score: 0.8410


Accuracy: 0.841019955654102
Epoch: [10][0/70] Elapsed 0m 0s (remain 0m 30s) Loss: 0.0356(0.0356) Grad: nan  LR: 0.00000207  
Epoch: [10][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0350(0.0406) Grad: 19048.0352  LR: 0.00000011  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0884(0.0884) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0608(0.0807) 


Epoch 10 - avg_train_loss: 0.0406  avg_val_loss: 0.0807  time: 22s
Epoch 10 - Score: 0.8417
Epoch 10 - Save Best Score: 0.8417 Model


Accuracy: 0.8416851441241685


Score: 0.8417


Accuracy: 0.8416851441241685
Epoch: [1][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.9199(0.9199) Grad: nan  LR: 0.00001997  
Epoch: [1][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.1101(0.2707) Grad: 10412.8701  LR: 0.00001801  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0954(0.0954) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0909(0.1024) 


Epoch 1 - avg_train_loss: 0.2707  avg_val_loss: 0.1024  time: 22s
Epoch 1 - Score: 0.7435
Epoch 1 - Save Best Score: 0.7435 Model


Accuracy: 0.7434589800443459
Epoch: [2][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.1078(0.1078) Grad: nan  LR: 0.00001798  
Epoch: [2][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0725(0.0948) Grad: 21185.2988  LR: 0.00001602  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0780(0.0780) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0699(0.0832) 


Epoch 2 - avg_train_loss: 0.0948  avg_val_loss: 0.0832  time: 22s
Epoch 2 - Score: 0.7885
Epoch 2 - Save Best Score: 0.7885 Model


Accuracy: 0.788470066518847
Epoch: [3][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0705(0.0705) Grad: nan  LR: 0.00001599  
Epoch: [3][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0817(0.0774) Grad: 18191.3242  LR: 0.00001403  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0678(0.0678) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0654(0.0750) 


Epoch 3 - avg_train_loss: 0.0774  avg_val_loss: 0.0750  time: 22s
Epoch 3 - Score: 0.8049
Epoch 3 - Save Best Score: 0.8049 Model


Accuracy: 0.8048780487804879
Epoch: [4][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0721(0.0721) Grad: nan  LR: 0.00001401  
Epoch: [4][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0615(0.0658) Grad: 16123.3037  LR: 0.00001205  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0673(0.0673) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0641(0.0712) 


Epoch 4 - avg_train_loss: 0.0658  avg_val_loss: 0.0712  time: 22s
Epoch 4 - Score: 0.8213
Epoch 4 - Save Best Score: 0.8213 Model


Accuracy: 0.8212860310421286
Epoch: [5][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0541(0.0541) Grad: nan  LR: 0.00001202  
Epoch: [5][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0599(0.0586) Grad: 19298.9824  LR: 0.00001006  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0661(0.0661) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0612(0.0704) 


Epoch 5 - avg_train_loss: 0.0586  avg_val_loss: 0.0704  time: 22s
Epoch 5 - Score: 0.8264
Epoch 5 - Save Best Score: 0.8264 Model


Accuracy: 0.8263858093126386
Epoch: [6][0/70] Elapsed 0m 0s (remain 0m 33s) Loss: 0.0856(0.0856) Grad: nan  LR: 0.00001003  
Epoch: [6][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0522(0.0529) Grad: 20855.8027  LR: 0.00000807  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 7s) Loss: 0.0653(0.0653) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0627(0.0704) 


Epoch 6 - avg_train_loss: 0.0529  avg_val_loss: 0.0704  time: 22s
Epoch 6 - Score: 0.8335
Epoch 6 - Save Best Score: 0.8335 Model


Accuracy: 0.8334811529933481
Epoch: [7][0/70] Elapsed 0m 0s (remain 0m 33s) Loss: 0.0602(0.0602) Grad: nan  LR: 0.00000804  
Epoch: [7][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0437(0.0487) Grad: 18832.4219  LR: 0.00000608  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0673(0.0673) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0661(0.0699) 


Epoch 7 - avg_train_loss: 0.0487  avg_val_loss: 0.0699  time: 22s
Epoch 7 - Score: 0.8399
Epoch 7 - Save Best Score: 0.8399 Model


Accuracy: 0.8399113082039912
Epoch: [8][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0428(0.0428) Grad: nan  LR: 0.00000605  
Epoch: [8][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0468(0.0457) Grad: 20667.7109  LR: 0.00000409  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0680(0.0680) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0691(0.0717) 


Epoch 8 - avg_train_loss: 0.0457  avg_val_loss: 0.0717  time: 22s
Epoch 8 - Score: 0.8410
Epoch 8 - Save Best Score: 0.8410 Model


Accuracy: 0.841019955654102
Epoch: [9][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0466(0.0466) Grad: nan  LR: 0.00000406  
Epoch: [9][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0346(0.0427) Grad: 16414.6484  LR: 0.00000210  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0690(0.0690) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0675(0.0711) 


Epoch 9 - avg_train_loss: 0.0427  avg_val_loss: 0.0711  time: 22s
Epoch 9 - Score: 0.8437
Epoch 9 - Save Best Score: 0.8437 Model


Accuracy: 0.843680709534368
Epoch: [10][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0400(0.0400) Grad: nan  LR: 0.00000207  
Epoch: [10][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0509(0.0415) Grad: 17328.4180  LR: 0.00000011  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0699(0.0699) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0681(0.0714) 


Epoch 10 - avg_train_loss: 0.0415  avg_val_loss: 0.0714  time: 22s
Epoch 10 - Score: 0.8439
Epoch 10 - Save Best Score: 0.8439 Model


Accuracy: 0.8439024390243902


Score: 0.8439


Accuracy: 0.8439024390243902
Epoch: [1][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 1.1768(1.1768) Grad: nan  LR: 0.00001997  
Epoch: [1][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.1088(0.2816) Grad: 9081.4629  LR: 0.00001801  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.1008(0.1008) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0883(0.1018) 


Epoch 1 - avg_train_loss: 0.2816  avg_val_loss: 0.1018  time: 22s
Epoch 1 - Score: 0.7530
Epoch 1 - Save Best Score: 0.7530 Model


Accuracy: 0.7529933481152994
Epoch: [2][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.1035(0.1035) Grad: nan  LR: 0.00001798  
Epoch: [2][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.1062(0.0919) Grad: 23465.3555  LR: 0.00001602  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0783(0.0783) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0638(0.0809) 


Epoch 2 - avg_train_loss: 0.0919  avg_val_loss: 0.0809  time: 22s
Epoch 2 - Score: 0.7967
Epoch 2 - Save Best Score: 0.7967 Model


Accuracy: 0.7966740576496674
Epoch: [3][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0800(0.0800) Grad: nan  LR: 0.00001599  
Epoch: [3][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0809(0.0763) Grad: 19930.5586  LR: 0.00001403  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0783(0.0783) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0586(0.0749) 


Epoch 3 - avg_train_loss: 0.0763  avg_val_loss: 0.0749  time: 22s
Epoch 3 - Score: 0.8111
Epoch 3 - Save Best Score: 0.8111 Model


Accuracy: 0.8110864745011086
Epoch: [4][0/70] Elapsed 0m 0s (remain 0m 32s) Loss: 0.0632(0.0632) Grad: nan  LR: 0.00001401  
Epoch: [4][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0609(0.0667) Grad: 19902.0898  LR: 0.00001205  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0763(0.0763) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0562(0.0715) 


Epoch 4 - avg_train_loss: 0.0667  avg_val_loss: 0.0715  time: 22s
Epoch 4 - Score: 0.8302
Epoch 4 - Save Best Score: 0.8302 Model


Accuracy: 0.8301552106430156
Epoch: [5][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0732(0.0732) Grad: nan  LR: 0.00001202  
Epoch: [5][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0399(0.0592) Grad: 16794.2773  LR: 0.00001006  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0730(0.0730) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0512(0.0687) 


Epoch 5 - avg_train_loss: 0.0592  avg_val_loss: 0.0687  time: 22s
Epoch 5 - Score: 0.8366
Epoch 5 - Save Best Score: 0.8366 Model


Accuracy: 0.8365853658536585
Epoch: [6][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0477(0.0477) Grad: nan  LR: 0.00001003  
Epoch: [6][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0525(0.0532) Grad: 16148.2451  LR: 0.00000807  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0732(0.0732) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0499(0.0691) 


Epoch 6 - avg_train_loss: 0.0532  avg_val_loss: 0.0691  time: 22s
Epoch 6 - Score: 0.8392
Epoch 6 - Save Best Score: 0.8392 Model


Accuracy: 0.8392461197339246
Epoch: [7][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0479(0.0479) Grad: nan  LR: 0.00000804  
Epoch: [7][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0441(0.0502) Grad: 16778.5234  LR: 0.00000608  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0719(0.0719) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0507(0.0681) 


Epoch 7 - avg_train_loss: 0.0502  avg_val_loss: 0.0681  time: 22s
Epoch 7 - Score: 0.8443
Epoch 7 - Save Best Score: 0.8443 Model


Accuracy: 0.8443458980044346
Epoch: [8][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0528(0.0528) Grad: nan  LR: 0.00000605  
Epoch: [8][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0402(0.0461) Grad: 15859.8428  LR: 0.00000409  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0742(0.0742) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0504(0.0692) 


Epoch 8 - avg_train_loss: 0.0461  avg_val_loss: 0.0692  time: 22s
Epoch 8 - Score: 0.8448
Epoch 8 - Save Best Score: 0.8448 Model


Accuracy: 0.844789356984479
Epoch: [9][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0479(0.0479) Grad: nan  LR: 0.00000406  
Epoch: [9][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0428(0.0444) Grad: 19235.4746  LR: 0.00000210  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0748(0.0748) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0527(0.0691) 


Epoch 9 - avg_train_loss: 0.0444  avg_val_loss: 0.0691  time: 22s
Epoch 9 - Score: 0.8443


Accuracy: 0.8443458980044346
Epoch: [10][0/70] Elapsed 0m 0s (remain 0m 31s) Loss: 0.0428(0.0428) Grad: nan  LR: 0.00000207  
Epoch: [10][69/70] Elapsed 0m 18s (remain 0m 0s) Loss: 0.0441(0.0428) Grad: 18080.6465  LR: 0.00000011  
EVAL: [0/18] Elapsed 0m 0s (remain 0m 6s) Loss: 0.0758(0.0758) 
EVAL: [17/18] Elapsed 0m 3s (remain 0m 0s) Loss: 0.0520(0.0694) 


Epoch 10 - avg_train_loss: 0.0428  avg_val_loss: 0.0694  time: 22s
Epoch 10 - Score: 0.8446


Accuracy: 0.8445676274944568


Score: 0.8448


Accuracy: 0.844789356984479


Score: 0.8444


Accuracy: 0.8444345898004435
