In [1]:
# !pip install optuna

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import gc, warnings, random, time, os

from pathlib import Path

from tqdm.notebook import tqdm

warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from transformers import AutoModel, AutoTokenizer, AutoConfig
from transformers import get_cosine_schedule_with_warmup

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import seaborn as sns

import gc
gc.enable()

import optuna

### Folders and Dataframes

In [3]:
DATA_PATH = Path('/home/commonlit/data/')
assert DATA_PATH.exists()
MODELS_PATH = Path('/home/commonlit/models/')
if not MODELS_PATH.exists():
    os.mkdir(MODELS_PATH)
assert MODELS_PATH.exists()

In [4]:
train_df = pd.read_csv(DATA_PATH/'train-orig.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')
sample_df = pd.read_csv(DATA_PATH/'sample_submission.csv')

In [5]:
def remove_unnecessary(df):
    df.drop(df[df['target'] == 0].index, inplace=True)
    df.reset_index(drop=True, inplace=True)
    
remove_unnecessary(train_df)

In [6]:
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845
...,...,...,...,...,...,...
2828,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2829,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2830,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2831,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


### Config and Seeding

In [7]:
class Config(): 
    NUM_FOLDS = 6
    NUM_EPOCHS = 3
    BATCH_SIZE = 16
    MAX_LEN = 248
    EVAL_SCHEDULE = [(0.50, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1., 1)]
    ROBERTA_PATH = 't5-large'
    TOKENIZER_PATH = 't5-large'
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    SEED = 1000
    NUM_WORKERS = 2
    MODEL_FOLDER = MODELS_PATH
    model_name = 't5-large'
    svm_kernels = ['rbf']
    svm_c = 5

cfg = Config()

In [8]:
if not cfg.MODEL_FOLDER.exists():
    os.mkdir(cfg.MODEL_FOLDER)

In [9]:
def set_random_seed(random_seed):
    random.seed(random_seed)
    np.random.seed(random_seed)
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    torch.backends.cudnn.deterministic = True

### Dataset

In [10]:
def add_bins(train_df, num_bins):
    train_df.loc[:, 'bins'] = pd.cut(train_df['target'], bins=num_bins, labels=False)
    return num_bins

In [11]:
add_bins(train_df, cfg.NUM_FOLDS)

6

In [12]:
train_df.groupby(['bins'])['target'].agg(['count', 'mean'])

Unnamed: 0_level_0,count,mean
bins,Unnamed: 1_level_1,Unnamed: 2_level_1
0,122,-3.125765
1,441,-2.270279
2,784,-1.41215
3,886,-0.548095
4,494,0.289716
5,106,1.070237


In [13]:
tokenizer = AutoTokenizer.from_pretrained(cfg.TOKENIZER_PATH)

In [14]:
class CommonLitDataset(Dataset):
    def __init__(self, df, tokenizer, inference_only=False):
        super().__init__()
        self.df, self.inference_only = df, inference_only
        self.text = df['excerpt'].tolist()
        self.bins = df['bins']
        if not inference_only:
            self.target = torch.tensor(df['target'].to_numpy(), dtype = torch.float32)
        
        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding = 'max_length',
            max_length = cfg.MAX_LEN,
            truncation = True,
            return_attention_mask=True
        )
        
    def __getitem__(self, index):        
        input_ids = torch.tensor(self.encoded['input_ids'][index])
        attention_mask = torch.tensor(self.encoded['attention_mask'][index])
        
        if self.inference_only:
            return {'input_ids': input_ids, 'attention_mask': attention_mask}
        else:
            target = self.target[index]
            return {'input_ids': input_ids, 'attention_mask': attention_mask, 'target': target}
    
    def __len__(self):
        return len(self.df)

In [15]:
sample_ds = CommonLitDataset(train_df, tokenizer)

### Model

In [16]:
class AttentionHead(nn.Module):
    
    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        
        self.hidden_layer = nn.Linear(in_features, hidden_dim)
        self.final_layer = nn.Linear(hidden_dim, num_targets)
        self.out_features = hidden_dim
        
    def forward(self, features):
        att = torch.tanh(self.hidden_layer(features))
        score = self.final_layer(att)
        attention_weights = torch.softmax(score, dim=1)
        return attention_weights

In [17]:
from transformers import T5EncoderModel

class CommonLitModel(nn.Module):
    def __init__(self):
        super(CommonLitModel, self).__init__()
        config = AutoConfig.from_pretrained(cfg.ROBERTA_PATH)
        config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7
        })
        self.transformer_model = T5EncoderModel.from_pretrained(cfg.ROBERTA_PATH, config=config)
        self.attention = AttentionHead(config.hidden_size, 512, 1)
        self.regressor = nn.Linear(config.hidden_size, 1)
    
    def forward(self, input_ids, attention_mask):
        last_layer_hidden_states = self.transformer_model(input_ids=input_ids, attention_mask=attention_mask)['last_hidden_state']
        weights = self.attention(last_layer_hidden_states)
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1) 
        return self.regressor(context_vector), context_vector

In [18]:
sample_model = CommonLitModel()

Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

In [19]:
import re

for i, (name, param) in enumerate(sample_model.named_parameters()):
    if(name.find('layer') > -1):
        layer_name = re.sub(r'.+(layer\.\d+).+', r'\1', name)

In [20]:
for i, (name, param) in enumerate(sample_model.named_parameters()):
    print(i, name, param.size())

0 transformer_model.shared.weight torch.Size([32128, 1024])
1 transformer_model.encoder.block.0.layer.0.SelfAttention.q.weight torch.Size([1024, 1024])
2 transformer_model.encoder.block.0.layer.0.SelfAttention.k.weight torch.Size([1024, 1024])
3 transformer_model.encoder.block.0.layer.0.SelfAttention.v.weight torch.Size([1024, 1024])
4 transformer_model.encoder.block.0.layer.0.SelfAttention.o.weight torch.Size([1024, 1024])
5 transformer_model.encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight torch.Size([32, 16])
6 transformer_model.encoder.block.0.layer.0.layer_norm.weight torch.Size([1024])
7 transformer_model.encoder.block.0.layer.1.DenseReluDense.wi.weight torch.Size([4096, 1024])
8 transformer_model.encoder.block.0.layer.1.DenseReluDense.wo.weight torch.Size([1024, 4096])
9 transformer_model.encoder.block.0.layer.1.layer_norm.weight torch.Size([1024])
10 transformer_model.encoder.block.1.layer.0.SelfAttention.q.weight torch.Size([1024, 1024])
11 transformer_mode

In [21]:
sample_input_ids = torch.randint(0, 1000, [8, 248])
sample_attention_mask = torch.randint(0, 1000, [8, 248])

In [22]:
sample_res = sample_model(sample_input_ids, sample_attention_mask)

In [23]:
sample_res

(tensor([[ 0.1762],
         [ 0.0296],
         [ 0.1032],
         [ 0.0202],
         [ 0.1519],
         [-0.0234],
         [-0.0488],
         [ 0.0940]], grad_fn=<AddmmBackward>),
 tensor([[-0.0932,  0.5152, -0.0563,  ...,  0.0748,  0.0292, -0.0033],
         [-0.2340,  0.5193, -0.2320,  ..., -0.0967, -0.0371, -0.0875],
         [-0.0411,  0.2741,  0.0178,  ...,  0.0452,  0.0516, -0.0613],
         ...,
         [-0.1031,  0.2917,  0.0695,  ...,  0.0698, -0.0837, -0.1785],
         [-0.2271,  0.1039, -0.2060,  ...,  0.3366, -0.0767, -0.1611],
         [-0.2194,  0.4349, -0.0851,  ...,  0.0124, -0.0073,  0.0182]],
        grad_fn=<SumBackward1>))

In [24]:
torch.sum(torch.randn([8, 496, 768]), axis=1)

tensor([[ 25.3458,  -7.4167,  18.7784,  ..., -50.2850,  16.0396,  32.5412],
        [-32.2676, -55.3496,  -5.2535,  ...,  -7.6934, -36.6839, -12.5520],
        [ -8.9862, -22.8392,  -5.4140,  ..., -21.0663,  32.6891,   2.7056],
        ...,
        [ 10.5804,  -6.3983,  33.1813,  ..., -33.5966, -31.8749, -35.1670],
        [  8.9105,  23.7824,  19.3646,  ...,  -5.1728,   4.5497, -26.4514],
        [ 32.8454, -32.7110,  18.4993,  ...,  45.9479,  -1.8786,  -3.3850]])

### Evaluation and Prediction

In [25]:
def eval_mse(model, data_loader):
    model.eval()
    mse_sum = 0
    mse_loss = nn.MSELoss(reduction='sum')
    
    with torch.no_grad():
        for batch_num, record in enumerate(data_loader):
            input_ids, attention_mask, target = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE), record['target'].to(cfg.DEVICE)
            pred, _ = model(input_ids, attention_mask)
            mse_sum += mse_loss(pred.flatten().cpu(), target.cpu())
            
    return mse_sum / len(data_loader.dataset)

In [26]:
def predict(model, data_loader):
    model.eval()
    result = []
    
    with torch.no_grad():
        for batch_num, record in tqdm(enumerate(data_loader), total=len(data_loader)):
            input_ids, attention_mask = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE)
            pred, _ = model(input_ids, attention_mask)
            result.extend(pred.flatten().to("cpu").tolist())
            
    return np.array(result)

In [27]:
sample_dl = DataLoader(sample_ds, shuffle=False, batch_size=16, num_workers=1)

### Optimizer and Sampler

In [28]:
5e-5 / 2.5, 5e-5 / 0.5, 5e-5

(2e-05, 0.0001, 5e-05)

In [29]:
def create_optimizer(model, base_lr=5e-5, last_lr=None):
    named_parameters = list(model.named_parameters())
    
    regressor_param_start = 199
    attention_param_start = 195
    roberta_parameters = named_parameters[:attention_param_start]
    attention_parameters = named_parameters[attention_param_start:regressor_param_start]
    regressor_parameters = named_parameters[regressor_param_start:]
    
    attention_group = [params for (name, params) in attention_parameters]
    regressor_group = [params for (name, params) in regressor_parameters]
    
    parameters = []
    if last_lr is not None:
        parameters.append({"params": attention_group, "lr": last_lr})
        parameters.append({"params": regressor_group, "lr": last_lr})
    else:
        parameters.append({"params": attention_group})
        parameters.append({"params": regressor_group})
    
    for layer_num, (name, params) in enumerate(roberta_parameters):
        weight_decay = 0.0 if 'bias' in name else 0.01
        
        lr = base_lr / 2.5 # 2e-05
        if layer_num >= 130:
            lr = base_lr / 0.5 # 1e-4
        elif layer_num >= 82:        
            lr = base_lr    
            
        parameters.append({"params": params,
                           "weight_decay": weight_decay,
                           "lr": lr})
        
    return AdamW(parameters)

In [30]:
sample_optimizer = create_optimizer(sample_model)

In [31]:
from torch.utils.data import Sampler,SequentialSampler,RandomSampler,SubsetRandomSampler
from collections import Counter

class WeightedSampler(Sampler):
    
    def __init__(self, dataset):
        
        self.indices = list(range(len(dataset)))
        self.num_samples = len(dataset)
        self.label_to_count = dict(Counter(dataset.bins))
        weights = [1/self.label_to_count[i] for i in dataset.bins]
        
        self.weights = torch.tensor(weights,dtype=torch.double)
        
    def __iter__(self):
        count = 0
        index = [self.indices[i] for i in torch.multinomial(self.weights, self.num_samples, replacement=True)]
        while count < self.num_samples:
            yield index[count]
            count += 1
    
    def __len__(self):
        return self.num_samples

### Training

In [32]:
def choose_eval_period(val_rmse):
    for rmse, period in cfg.EVAL_SCHEDULE:
        if val_rmse >= rmse:
            return period

In [33]:
def serialize_best(best_val_rmse, best_epoch, val_rmse, epoch, model, model_path):
    if not best_val_rmse or val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_epoch = epoch
        if not model_path.parent.exists():
            os.makedirs(model_path.parent)
        
#         torch.save(model.state_dict(), model_path)
        print(f"New best_val_rmse: {best_val_rmse:0.4}")
    else:       
        print(f"Still best_val_rmse: {best_val_rmse:0.4}",
              f"(from epoch {best_epoch})")
    return best_epoch, best_val_rmse

In [34]:
class Trainer():
    def __init__(self, scaler, model, model_path, train_loader, val_loader, optimizer, scheduler=None, num_epochs=cfg.NUM_EPOCHS):
        self.scaler, self.model, self.model_path, self.train_loader, self.val_loader, self.optimizer, self.scheduler, self.num_epochs = (
            scaler, model, model_path, train_loader, val_loader, optimizer, scheduler, num_epochs
        )
            
    def train(self):
        self.model.train()
        
        mse_loss = nn.MSELoss(reduction='mean')
        
        best_val_rmse = None
        best_epoch = 0
        step = 0
        last_eval_step = 0
        eval_period = cfg.EVAL_SCHEDULE[0][1]    

        start = time.time()
        val_rmse_list = []
        
        tbar = tqdm(range(self.num_epochs), total=self.num_epochs)
        for epoch in tbar:
            tbar.set_description(f'Epoch: {epoch}')
            val_rmse = None
            for batch_num, record in enumerate(self.train_loader):
                input_ids, attention_mask, target = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE), record['target'].to(cfg.DEVICE)
                
                self.optimizer.zero_grad()
                
                # Casts operations to mixed precision
#                 with torch.cuda.amp.autocast():
                pred, _ = self.model(input_ids, attention_mask)
                mse = mse_loss(pred.flatten(), target)
                    
#                 self.scaler.scale(mse).backward()
#                 self.scaler.step(self.optimizer)
#                 self.scaler.update()
                
                mse.backward()
                self.optimizer.step()
                
                if self.scheduler:
                    self.scheduler.step()
                    
                if step >= last_eval_step + eval_period:
                    elapsed_seconds = time.time() - start
                    num_steps = step - last_eval_step
                    print(f"\n{num_steps} steps took {elapsed_seconds:0.3} seconds")
                    last_eval_step = step
                    
                    val_rmse = np.sqrt(eval_mse(self.model, self.val_loader))
                    print(f"Epoch: {epoch} batch_num: {batch_num}", f"val_rmse: {val_rmse:0.4} ", end='')
                    
                    eval_period = choose_eval_period(val_rmse)
                    best_epoch, best_val_rmse = serialize_best(best_val_rmse, best_epoch, val_rmse, epoch, self.model, self.model_path)
                    val_rmse_list.append(val_rmse)
                    start = time.time()
                # Finish early on condition
                if epoch > 0 and best_val_rmse > 0.6 or (len(val_rmse_list) > 5 and np.array(val_rmse_list).mean() > 1.0):
                    return best_val_rmse
                
                step += 1
        return best_val_rmse

In [35]:
kfold = KFold(n_splits=cfg.NUM_FOLDS, random_state=cfg.SEED, shuffle=True)
splits = list(kfold.split(train_df))

### Optuna

In [36]:
# Best results
# Fold 0: {'base_lr': 0.00013575061062518292, 'last_lr': 0.0027390926762560675} Best value:  0.48893508315086365
# Fold 1: {'base_lr': 6.433162302000639e-05, 'last_lr': 0.0025302612125878217}. Best is trial 0 with value: 0.4527459144592285
# Fold 2: {'base_lr': 0.00012105407461535033, 'last_lr': 0.00012780642309774768}. Best is trial 4 with value: 0.476582378149032
# Fold 3: {'base_lr': 0.00016420220823284873, 'last_lr': 0.004783602075813355}. Best is trial 13 with value: 0.4700598418712616
# Fold 4: {'base_lr': 8.176324330617398e-05, 'last_lr': 0.0012432581220121835}. Best is trial 17 with value: 0.4916570484638214
# Fold 5: {'base_lr': 0.0002297546136917806, 'last_lr': 0.00034915806261776055}. Best is trial 12 with value: 0.4850253164768219

In [37]:
from transformers import T5Tokenizer

fold = 0

def objective(trial):
    base_lr = trial.suggest_float("base_lr", 3e-5, 5e-4, log=True)
    last_lr = trial.suggest_float("last_lr", 8e-5, 5e-3, log=True)
    
    print(f'##### Using fold {fold}')
    print(f'##### Using base_lr {base_lr} last_lr {last_lr}')
    
    model_path = cfg.MODEL_FOLDER/f"{cfg.model_name.replace('/', '_')}_{fold + 1}/model_{fold + 1}.pth"
    
    set_random_seed(cfg.SEED + fold)
    
    tokenizer = T5Tokenizer.from_pretrained(cfg.TOKENIZER_PATH)
    
    train_indices, val_indices = splits[fold]
    train_dataset = CommonLitDataset(train_df.loc[train_indices], tokenizer)    
    val_dataset = CommonLitDataset(train_df.loc[val_indices], tokenizer)
    
    train_loader = DataLoader(train_dataset, batch_size=cfg.BATCH_SIZE,
                              drop_last=False, shuffle=True, num_workers=cfg.NUM_WORKERS)    
    val_loader = DataLoader(val_dataset, batch_size=cfg.BATCH_SIZE,
                            drop_last=False, shuffle=False, num_workers=cfg.NUM_WORKERS)
    
    set_random_seed(cfg.SEED + fold)
    
    model = CommonLitModel().to(cfg.DEVICE)
    
    optimizer = create_optimizer(model, base_lr=base_lr, last_lr=last_lr)
    
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_training_steps=cfg.NUM_EPOCHS * len(train_loader), 
                                                num_warmup_steps=50)
    scaler = torch.cuda.amp.GradScaler()
    
    trainer = Trainer(scaler, model, model_path, train_loader, val_loader, optimizer, scheduler = scheduler)
    rmse_val = trainer.train()
    
    del trainer
    del model
    del tokenizer
    del scaler
    del optimizer
    del train_loader
    del val_loader
    torch.cuda.empty_cache()
    gc.collect()
    
    return rmse_val

In [None]:
%%time

for i in range(3, len(list(splits))):
    fold = i
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20)
    print(" Best value: ", study.best_trial.value)
    print(" Best params: ")
    for key, value in study.best_trial.params.items():
        print(f"    {key}: {value}")

[32m[I 2021-07-16 07:18:08,196][0m A new study created in memory with name: no-name-cf37c018-a7a4-44a2-b4d0-ac3e70b938fe[0m


##### Using fold 3
##### Using base_lr 5.327815988824232e-05 last_lr 0.000838614305106896


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 14.2 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.304 New best_val_rmse: 1.304

16 steps took 11.8 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9902 New best_val_rmse: 0.9902

16 steps took 11.9 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7017 New best_val_rmse: 0.7017

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.658 New best_val_rmse: 0.658

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6251 New best_val_rmse: 0.6251

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6138 New best_val_rmse: 0.6138

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5789 New best_val_rmse: 0.5789

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5566 New best_val_rmse: 0.5566

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5524 New best_val_rmse: 0.5524

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5646 Still best_val_rmse: 0.5524 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 1 batch_n

[32m[I 2021-07-16 07:28:49,664][0m Trial 0 finished with value: 0.49225884675979614 and parameters: {'base_lr': 5.327815988824232e-05, 'last_lr': 0.000838614305106896}. Best is trial 0 with value: 0.49225884675979614.[0m



##### Using fold 3
##### Using base_lr 0.00023702006138636648 last_lr 0.0015281915172502291


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.045 New best_val_rmse: 1.045

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7023 New best_val_rmse: 0.7023

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7444 Still best_val_rmse: 0.7023 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7663 Still best_val_rmse: 0.7023 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5816 New best_val_rmse: 0.5816

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5413 New best_val_rmse: 0.5413

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5358 New best_val_rmse: 0.5358

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5584 Still best_val_rmse: 0.5358 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5304 New best_val_rmse: 0.5304

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5888 Still best_val_rmse: 0.5304 (from 

[32m[I 2021-07-16 07:42:04,987][0m Trial 1 finished with value: 0.48303282260894775 and parameters: {'base_lr': 0.00023702006138636648, 'last_lr': 0.0015281915172502291}. Best is trial 1 with value: 0.48303282260894775.[0m



##### Using fold 3
##### Using base_lr 0.00011335195714243843 last_lr 0.0012314504768167866


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.9 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.191 New best_val_rmse: 1.191

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8593 New best_val_rmse: 0.8593

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7324 New best_val_rmse: 0.7324

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6444 New best_val_rmse: 0.6444

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6168 New best_val_rmse: 0.6168

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5731 New best_val_rmse: 0.5731

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5399 New best_val_rmse: 0.5399

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5418 Still best_val_rmse: 0.5399 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5373 New best_val_rmse: 0.5373

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5546 Still best_val_rmse: 0.5373 (from epoch 0)

16 steps took 12.0 secon

[32m[I 2021-07-16 07:55:51,013][0m Trial 2 finished with value: 0.47981569170951843 and parameters: {'base_lr': 0.00011335195714243843, 'last_lr': 0.0012314504768167866}. Best is trial 2 with value: 0.47981569170951843.[0m



##### Using fold 3
##### Using base_lr 0.00012000374177976627 last_lr 0.00028582109634624095


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.265 New best_val_rmse: 1.265

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9784 New best_val_rmse: 0.9784

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6865 New best_val_rmse: 0.6865

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.641 New best_val_rmse: 0.641

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5925 New best_val_rmse: 0.5925

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5603 New best_val_rmse: 0.5603

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5333 New best_val_rmse: 0.5333

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5482 Still best_val_rmse: 0.5333 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5384 Still best_val_rmse: 0.5333 (from epoch 0)

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5711 Still best_val_rmse: 0.5333 (from epoch 0)

16 steps 

[32m[I 2021-07-16 08:10:35,478][0m Trial 3 finished with value: 0.47883087396621704 and parameters: {'base_lr': 0.00012000374177976627, 'last_lr': 0.00028582109634624095}. Best is trial 3 with value: 0.47883087396621704.[0m



##### Using fold 3
##### Using base_lr 5.6933106526460035e-05 last_lr 0.00113397895075368


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.274 New best_val_rmse: 1.274

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9832 New best_val_rmse: 0.9832

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7125 New best_val_rmse: 0.7125

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6597 New best_val_rmse: 0.6597

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6109 New best_val_rmse: 0.6109

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5971 New best_val_rmse: 0.5971

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5566 New best_val_rmse: 0.5566

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5497 New best_val_rmse: 0.5497

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5394 New best_val_rmse: 0.5394

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.57 Still best_val_rmse: 0.5394 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 1 batch_n

[32m[I 2021-07-16 08:21:18,641][0m Trial 4 finished with value: 0.49137866497039795 and parameters: {'base_lr': 5.6933106526460035e-05, 'last_lr': 0.00113397895075368}. Best is trial 3 with value: 0.47883087396621704.[0m



##### Using fold 3
##### Using base_lr 0.00024406579729722316 last_lr 8.844624308809113e-05


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.151 New best_val_rmse: 1.151

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.847 New best_val_rmse: 0.847

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7098 New best_val_rmse: 0.7098

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.633 New best_val_rmse: 0.633

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5748 New best_val_rmse: 0.5748

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.557 New best_val_rmse: 0.557

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5611 Still best_val_rmse: 0.557 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.543 New best_val_rmse: 0.543

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.529 New best_val_rmse: 0.529

16 steps took 12.5 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.6014 Still best_val_rmse: 0.529 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 1 

[32m[I 2021-07-16 08:34:49,586][0m Trial 5 finished with value: 0.48264938592910767 and parameters: {'base_lr': 0.00024406579729722316, 'last_lr': 8.844624308809113e-05}. Best is trial 3 with value: 0.47883087396621704.[0m



##### Using fold 3
##### Using base_lr 8.193837221897893e-05 last_lr 0.001034478754448138


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.25 New best_val_rmse: 1.25

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9484 New best_val_rmse: 0.9484

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6952 New best_val_rmse: 0.6952

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6666 New best_val_rmse: 0.6666

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6083 New best_val_rmse: 0.6083

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5817 New best_val_rmse: 0.5817

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5487 New best_val_rmse: 0.5487

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5366 New best_val_rmse: 0.5366

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5343 New best_val_rmse: 0.5343

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5506 Still best_val_rmse: 0.5343 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 1 batch_n

[32m[I 2021-07-16 08:48:21,026][0m Trial 6 finished with value: 0.4855365455150604 and parameters: {'base_lr': 8.193837221897893e-05, 'last_lr': 0.001034478754448138}. Best is trial 3 with value: 0.47883087396621704.[0m



##### Using fold 3
##### Using base_lr 0.0003983886731916653 last_lr 0.00016242920336068236


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.9 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.035 New best_val_rmse: 1.035

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7243 New best_val_rmse: 0.7243

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.624 New best_val_rmse: 0.624

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6029 New best_val_rmse: 0.6029

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5721 New best_val_rmse: 0.5721

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5515 New best_val_rmse: 0.5515

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5841 Still best_val_rmse: 0.5515 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5518 Still best_val_rmse: 0.5515 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5594 Still best_val_rmse: 0.5515 (from epoch 0)

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.6003 Still best_val_rmse: 0.5515 (from ep

[32m[I 2021-07-16 08:59:09,509][0m Trial 7 finished with value: 0.49252238869667053 and parameters: {'base_lr': 0.0003983886731916653, 'last_lr': 0.00016242920336068236}. Best is trial 3 with value: 0.47883087396621704.[0m



##### Using fold 3
##### Using base_lr 0.00023956590096521377 last_lr 0.001230842125022629


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.064 New best_val_rmse: 1.064

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7213 New best_val_rmse: 0.7213

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7539 Still best_val_rmse: 0.7213 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.608 New best_val_rmse: 0.608

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5971 New best_val_rmse: 0.5971

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5575 New best_val_rmse: 0.5575

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5597 Still best_val_rmse: 0.5575 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5867 Still best_val_rmse: 0.5575 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5247 New best_val_rmse: 0.5247

16 steps took 12.5 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5744 Still best_val_rmse: 0.5247 (from ep

[32m[I 2021-07-16 09:14:09,364][0m Trial 8 finished with value: 0.4795965254306793 and parameters: {'base_lr': 0.00023956590096521377, 'last_lr': 0.001230842125022629}. Best is trial 3 with value: 0.47883087396621704.[0m



##### Using fold 3
##### Using base_lr 0.0002628413124986733 last_lr 0.0001988720448953958


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.123 New best_val_rmse: 1.123

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7996 New best_val_rmse: 0.7996

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6958 New best_val_rmse: 0.6958

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6565 New best_val_rmse: 0.6565

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5937 New best_val_rmse: 0.5937

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5623 New best_val_rmse: 0.5623

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5474 New best_val_rmse: 0.5474

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5504 Still best_val_rmse: 0.5474 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5273 New best_val_rmse: 0.5273

16 steps took 12.5 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.641 Still best_val_rmse: 0.5273 (from epoch 0)

16 steps took 12.1 second

[32m[I 2021-07-16 09:27:40,021][0m Trial 9 finished with value: 0.48248663544654846 and parameters: {'base_lr': 0.0002628413124986733, 'last_lr': 0.0001988720448953958}. Best is trial 3 with value: 0.47883087396621704.[0m



##### Using fold 3
##### Using base_lr 0.0001295274544410545 last_lr 0.004988067319093683


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9791 New best_val_rmse: 0.9791

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7156 New best_val_rmse: 0.7156

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6885 New best_val_rmse: 0.6885

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7707 Still best_val_rmse: 0.6885 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6067 New best_val_rmse: 0.6067

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5403 New best_val_rmse: 0.5403

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5445 Still best_val_rmse: 0.5403 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5512 Still best_val_rmse: 0.5403 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5168 New best_val_rmse: 0.5168

16 steps took 12.5 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.572 Still best_val_rmse: 0.5168 (from

[32m[I 2021-07-16 09:46:02,064][0m Trial 10 finished with value: 0.47466132044792175 and parameters: {'base_lr': 0.0001295274544410545, 'last_lr': 0.004988067319093683}. Best is trial 10 with value: 0.47466132044792175.[0m



##### Using fold 3
##### Using base_lr 0.00013704587708202512 last_lr 0.00431927536820673


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9884 New best_val_rmse: 0.9884

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7012 New best_val_rmse: 0.7012

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6975 New best_val_rmse: 0.6975

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7238 Still best_val_rmse: 0.6975 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6102 New best_val_rmse: 0.6102

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5259 New best_val_rmse: 0.5259

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5493 Still best_val_rmse: 0.5259 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5472 Still best_val_rmse: 0.5259 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5207 New best_val_rmse: 0.5207

16 steps took 12.5 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5469 Still best_val_rmse: 0.5207 (fro

[32m[I 2021-07-16 10:03:34,496][0m Trial 11 finished with value: 0.4777217507362366 and parameters: {'base_lr': 0.00013704587708202512, 'last_lr': 0.00431927536820673}. Best is trial 10 with value: 0.47466132044792175.[0m



##### Using fold 3
##### Using base_lr 0.00014995360832615058 last_lr 0.004767785679534565


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9776 New best_val_rmse: 0.9776

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7148 New best_val_rmse: 0.7148

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6795 New best_val_rmse: 0.6795

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7902 Still best_val_rmse: 0.6795 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6452 New best_val_rmse: 0.6452

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5366 New best_val_rmse: 0.5366

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.533 New best_val_rmse: 0.533

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5438 Still best_val_rmse: 0.533 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5078 New best_val_rmse: 0.5078

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5424 Still best_val_rmse: 0.5078 (from epoch 0)

16 steps

[32m[I 2021-07-16 10:22:03,319][0m Trial 12 finished with value: 0.47244909405708313 and parameters: {'base_lr': 0.00014995360832615058, 'last_lr': 0.004767785679534565}. Best is trial 12 with value: 0.47244909405708313.[0m



##### Using fold 3
##### Using base_lr 0.00016420220823284873 last_lr 0.004783602075813355


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9745 New best_val_rmse: 0.9745

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7092 New best_val_rmse: 0.7092

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6529 New best_val_rmse: 0.6529

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7 Still best_val_rmse: 0.6529 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6009 New best_val_rmse: 0.6009

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5217 New best_val_rmse: 0.5217

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5583 Still best_val_rmse: 0.5217 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5609 Still best_val_rmse: 0.5217 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5124 New best_val_rmse: 0.5124

16 steps took 12.5 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.554 Still best_val_rmse: 0.5124 (from ep

[32m[I 2021-07-16 10:41:17,508][0m Trial 13 finished with value: 0.4700598418712616 and parameters: {'base_lr': 0.00016420220823284873, 'last_lr': 0.004783602075813355}. Best is trial 13 with value: 0.4700598418712616.[0m



##### Using fold 3
##### Using base_lr 3.0738293953323484e-05 last_lr 0.0029007270337567067


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.159 New best_val_rmse: 1.159

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.799 New best_val_rmse: 0.799

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7108 New best_val_rmse: 0.7108

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6828 New best_val_rmse: 0.6828

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6324 New best_val_rmse: 0.6324

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6399 Still best_val_rmse: 0.6324 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.589 New best_val_rmse: 0.589

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5816 New best_val_rmse: 0.5816

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5555 New best_val_rmse: 0.5555

16 steps took 12.5 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.6148 Still best_val_rmse: 0.5555 (from epoch 0)

16 steps took 12.1 seconds
E

[32m[I 2021-07-16 10:50:47,946][0m Trial 14 finished with value: 0.5022666454315186 and parameters: {'base_lr': 3.0738293953323484e-05, 'last_lr': 0.0029007270337567067}. Best is trial 13 with value: 0.4700598418712616.[0m



##### Using fold 3
##### Using base_lr 0.00043560837477936117 last_lr 0.0024767088905740904


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.9 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9691 New best_val_rmse: 0.9691

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7018 New best_val_rmse: 0.7018

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6366 New best_val_rmse: 0.6366

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7136 Still best_val_rmse: 0.6366 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5891 New best_val_rmse: 0.5891

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5698 New best_val_rmse: 0.5698

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5518 New best_val_rmse: 0.5518

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5809 Still best_val_rmse: 0.5518 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5628 Still best_val_rmse: 0.5518 (from epoch 0)

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5786 Still best_val_rmse: 0.5518 (fro

[32m[I 2021-07-16 11:03:07,086][0m Trial 15 finished with value: 0.4854622483253479 and parameters: {'base_lr': 0.00043560837477936117, 'last_lr': 0.0024767088905740904}. Best is trial 13 with value: 0.4700598418712616.[0m



##### Using fold 3
##### Using base_lr 0.00018137091249728613 last_lr 0.002588742095280771


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.025 New best_val_rmse: 1.025

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7041 New best_val_rmse: 0.7041

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7007 New best_val_rmse: 0.7007

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7274 Still best_val_rmse: 0.7007 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5867 New best_val_rmse: 0.5867

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5468 New best_val_rmse: 0.5468

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5278 New best_val_rmse: 0.5278

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5387 Still best_val_rmse: 0.5278 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.524 New best_val_rmse: 0.524

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5653 Still best_val_rmse: 0.524 (from epoch 0)

16 steps t

[32m[I 2021-07-16 11:16:08,478][0m Trial 16 finished with value: 0.48343199491500854 and parameters: {'base_lr': 0.00018137091249728613, 'last_lr': 0.002588742095280771}. Best is trial 13 with value: 0.4700598418712616.[0m



##### Using fold 3
##### Using base_lr 8.453136230101006e-05 last_lr 0.00040680520681215864


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.303 New best_val_rmse: 1.303

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9937 New best_val_rmse: 0.9937

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6848 New best_val_rmse: 0.6848

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6806 New best_val_rmse: 0.6806

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5981 New best_val_rmse: 0.5981

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5882 New best_val_rmse: 0.5882

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5488 New best_val_rmse: 0.5488

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5366 New best_val_rmse: 0.5366

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5496 Still best_val_rmse: 0.5366 (from epoch 0)

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5442 Still best_val_rmse: 0.5366 (from epoch 0)

16 steps took 12.1 secon

[32m[I 2021-07-16 11:29:49,132][0m Trial 17 finished with value: 0.4836425483226776 and parameters: {'base_lr': 8.453136230101006e-05, 'last_lr': 0.00040680520681215864}. Best is trial 13 with value: 0.4700598418712616.[0m



##### Using fold 3
##### Using base_lr 0.00017184623680797118 last_lr 0.004228619760468622


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9802 New best_val_rmse: 0.9802

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7089 New best_val_rmse: 0.7089

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.684 New best_val_rmse: 0.684

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7556 Still best_val_rmse: 0.684 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6489 New best_val_rmse: 0.6489

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5387 New best_val_rmse: 0.5387

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5213 New best_val_rmse: 0.5213

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.527 Still best_val_rmse: 0.5213 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5215 Still best_val_rmse: 0.5213 (from epoch 0)

16 steps took 12.5 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5343 Still best_val_rmse: 0.5213 (from ep

[32m[I 2021-07-16 11:48:39,105][0m Trial 18 finished with value: 0.4730079770088196 and parameters: {'base_lr': 0.00017184623680797118, 'last_lr': 0.004228619760468622}. Best is trial 13 with value: 0.4700598418712616.[0m



##### Using fold 3
##### Using base_lr 0.00031718898239970335 last_lr 0.0020428758207603837


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9942 New best_val_rmse: 0.9942

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6934 New best_val_rmse: 0.6934

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6446 New best_val_rmse: 0.6446

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6831 Still best_val_rmse: 0.6446 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6293 New best_val_rmse: 0.6293

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5624 New best_val_rmse: 0.5624

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5875 Still best_val_rmse: 0.5624 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5575 New best_val_rmse: 0.5575

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5473 New best_val_rmse: 0.5473

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5936 Still best_val_rmse: 0.5473 (from epoch 0)

16 st

[32m[I 2021-07-16 12:00:42,494][0m Trial 19 finished with value: 0.48685845732688904 and parameters: {'base_lr': 0.00031718898239970335, 'last_lr': 0.0020428758207603837}. Best is trial 13 with value: 0.4700598418712616.[0m
[32m[I 2021-07-16 12:00:42,496][0m A new study created in memory with name: no-name-c6b6f9df-550a-4dfd-8d7a-933a7b4ccdde[0m



 Best value:  0.4700598418712616
 Best params: 
    base_lr: 0.00016420220823284873
    last_lr: 0.004783602075813355
##### Using fold 4
##### Using base_lr 0.00026731608181367815 last_lr 9.61081591519267e-05


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.104 New best_val_rmse: 1.104

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7759 New best_val_rmse: 0.7759

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.659 New best_val_rmse: 0.659

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6083 New best_val_rmse: 0.6083

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5764 New best_val_rmse: 0.5764

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5787 Still best_val_rmse: 0.5764 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6119 Still best_val_rmse: 0.5764 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5461 New best_val_rmse: 0.5461

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5765 Still best_val_rmse: 0.5461 (from epoch 0)

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5435 New best_val_rmse: 0.5435

16 steps 

[32m[I 2021-07-16 12:10:13,747][0m Trial 0 finished with value: 0.502634584903717 and parameters: {'base_lr': 0.00026731608181367815, 'last_lr': 9.61081591519267e-05}. Best is trial 0 with value: 0.502634584903717.[0m



##### Using fold 4
##### Using base_lr 4.909801078250626e-05 last_lr 0.0028036344765952014


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.137 New best_val_rmse: 1.137

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9298 New best_val_rmse: 0.9298

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7373 New best_val_rmse: 0.7373

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6928 New best_val_rmse: 0.6928

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6597 New best_val_rmse: 0.6597

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6387 New best_val_rmse: 0.6387

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5849 New best_val_rmse: 0.5849

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5712 New best_val_rmse: 0.5712

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.582 Still best_val_rmse: 0.5712 (from epoch 0)

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5209 New best_val_rmse: 0.5209

16 steps took 12.0 seconds
Epoch: 1 batch_

[32m[I 2021-07-16 12:20:33,250][0m Trial 1 finished with value: 0.49909043312072754 and parameters: {'base_lr': 4.909801078250626e-05, 'last_lr': 0.0028036344765952014}. Best is trial 1 with value: 0.49909043312072754.[0m



##### Using fold 4
##### Using base_lr 0.00016060563953910976 last_lr 0.0036556302658151893


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.9 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.02 New best_val_rmse: 1.02

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7628 New best_val_rmse: 0.7628

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7209 New best_val_rmse: 0.7209

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6124 New best_val_rmse: 0.6124

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5778 New best_val_rmse: 0.5778

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5979 Still best_val_rmse: 0.5778 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6805 Still best_val_rmse: 0.5778 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5705 New best_val_rmse: 0.5705

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5359 New best_val_rmse: 0.5359

16 steps took 12.5 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.493 New best_val_rmse: 0.493

8 steps took 6.03 seconds
Ep

[32m[I 2021-07-16 12:31:18,143][0m Trial 2 finished with value: 0.492962121963501 and parameters: {'base_lr': 0.00016060563953910976, 'last_lr': 0.0036556302658151893}. Best is trial 2 with value: 0.492962121963501.[0m



##### Using fold 4
##### Using base_lr 0.0003341313008947839 last_lr 0.0006673675873997908


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.034 New best_val_rmse: 1.034

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7052 New best_val_rmse: 0.7052

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.651 New best_val_rmse: 0.651

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5861 New best_val_rmse: 0.5861

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6064 Still best_val_rmse: 0.5861 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6029 Still best_val_rmse: 0.5861 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5618 New best_val_rmse: 0.5618

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5347 New best_val_rmse: 0.5347

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5428 Still best_val_rmse: 0.5347 (from epoch 0)

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5366 Still best_val_rmse: 0.5347 (from ep

[32m[I 2021-07-16 12:40:49,075][0m Trial 3 finished with value: 0.5056121945381165 and parameters: {'base_lr': 0.0003341313008947839, 'last_lr': 0.0006673675873997908}. Best is trial 2 with value: 0.492962121963501.[0m



##### Using fold 4
##### Using base_lr 9.139225868122028e-05 last_lr 0.0001387513717190892


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.9 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.298 New best_val_rmse: 1.298

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.001 New best_val_rmse: 1.001

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7382 New best_val_rmse: 0.7382

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7491 Still best_val_rmse: 0.7382 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6255 New best_val_rmse: 0.6255

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6009 New best_val_rmse: 0.6009

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5528 New best_val_rmse: 0.5528

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5415 New best_val_rmse: 0.5415

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5486 Still best_val_rmse: 0.5415 (from epoch 0)

16 steps took 12.5 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5076 New best_val_rmse: 0.5076

16 steps took 12.1 seconds

[32m[I 2021-07-16 12:51:24,420][0m Trial 4 finished with value: 0.491883784532547 and parameters: {'base_lr': 9.139225868122028e-05, 'last_lr': 0.0001387513717190892}. Best is trial 4 with value: 0.491883784532547.[0m



##### Using fold 4
##### Using base_lr 6.124444603592389e-05 last_lr 0.0002325018070380025


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.325 New best_val_rmse: 1.325

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.04 New best_val_rmse: 1.04

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8845 New best_val_rmse: 0.8845

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7059 New best_val_rmse: 0.7059

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6602 New best_val_rmse: 0.6602

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6194 New best_val_rmse: 0.6194

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5656 New best_val_rmse: 0.5656

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5633 New best_val_rmse: 0.5633

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5891 Still best_val_rmse: 0.5633 (from epoch 0)

16 steps took 12.5 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5262 New best_val_rmse: 0.5262

16 steps took 12.1 seconds
Epoch: 1 batch_num

[32m[I 2021-07-16 13:00:54,027][0m Trial 5 finished with value: 0.5012246370315552 and parameters: {'base_lr': 6.124444603592389e-05, 'last_lr': 0.0002325018070380025}. Best is trial 4 with value: 0.491883784532547.[0m



##### Using fold 4
##### Using base_lr 5.9863999054129436e-05 last_lr 0.00019344175119655413


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.329 New best_val_rmse: 1.329

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.051 New best_val_rmse: 1.051

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8918 New best_val_rmse: 0.8918

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7255 New best_val_rmse: 0.7255

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6624 New best_val_rmse: 0.6624

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.622 New best_val_rmse: 0.622

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.571 New best_val_rmse: 0.571

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5693 New best_val_rmse: 0.5693

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6022 Still best_val_rmse: 0.5693 (from epoch 0)

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.53 New best_val_rmse: 0.53

16 steps took 12.1 seconds
Epoch: 1 batch_num: 28 v

[32m[I 2021-07-16 13:10:42,478][0m Trial 6 finished with value: 0.4995722472667694 and parameters: {'base_lr': 5.9863999054129436e-05, 'last_lr': 0.00019344175119655413}. Best is trial 4 with value: 0.491883784532547.[0m



##### Using fold 4
##### Using base_lr 5.017606328573479e-05 last_lr 0.0040150402984343615


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.064 New best_val_rmse: 1.064

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8879 New best_val_rmse: 0.8879

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7409 New best_val_rmse: 0.7409

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6621 New best_val_rmse: 0.6621

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.65 New best_val_rmse: 0.65

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6297 New best_val_rmse: 0.6297

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5882 New best_val_rmse: 0.5882

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5698 New best_val_rmse: 0.5698

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5711 Still best_val_rmse: 0.5698 (from epoch 0)

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5252 New best_val_rmse: 0.5252

16 steps took 12.0 seconds
Epoch: 1 batch_num

[32m[I 2021-07-16 13:21:17,401][0m Trial 7 finished with value: 0.4971056580543518 and parameters: {'base_lr': 5.017606328573479e-05, 'last_lr': 0.0040150402984343615}. Best is trial 4 with value: 0.491883784532547.[0m



##### Using fold 4
##### Using base_lr 0.0004644004477364371 last_lr 0.00023139932216750472


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.9 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.008 New best_val_rmse: 1.008

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7879 New best_val_rmse: 0.7879

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6167 New best_val_rmse: 0.6167

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5864 New best_val_rmse: 0.5864

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6196 Still best_val_rmse: 0.5864 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6312 Still best_val_rmse: 0.5864 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7136 Still best_val_rmse: 0.5864 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5687 New best_val_rmse: 0.5687

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5604 New best_val_rmse: 0.5604

16 steps took 12.5 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5436 New best_val_rmse: 0.5436

16 step

[32m[I 2021-07-16 13:30:48,304][0m Trial 8 finished with value: 0.5131124258041382 and parameters: {'base_lr': 0.0004644004477364371, 'last_lr': 0.00023139932216750472}. Best is trial 4 with value: 0.491883784532547.[0m



##### Using fold 4
##### Using base_lr 5.1145580702159575e-05 last_lr 8.3159197490671e-05


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.349 New best_val_rmse: 1.349

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.124 New best_val_rmse: 1.124

16 steps took 11.9 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9337 New best_val_rmse: 0.9337

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7405 New best_val_rmse: 0.7405

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7038 New best_val_rmse: 0.7038

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6437 New best_val_rmse: 0.6437

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5847 New best_val_rmse: 0.5847

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5775 New best_val_rmse: 0.5775

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6124 Still best_val_rmse: 0.5775 (from epoch 0)

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5382 New best_val_rmse: 0.5382

16 steps took 12.0 seconds
Epoch: 1 batch_n

[32m[I 2021-07-16 13:41:16,577][0m Trial 9 finished with value: 0.4971366226673126 and parameters: {'base_lr': 5.1145580702159575e-05, 'last_lr': 8.3159197490671e-05}. Best is trial 4 with value: 0.491883784532547.[0m



##### Using fold 4
##### Using base_lr 0.0001049633236379046 last_lr 0.0008637274006839614


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.226 New best_val_rmse: 1.226

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9441 New best_val_rmse: 0.9441

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.711 New best_val_rmse: 0.711

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7387 Still best_val_rmse: 0.711 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6138 New best_val_rmse: 0.6138

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5992 New best_val_rmse: 0.5992

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5646 New best_val_rmse: 0.5646

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5461 New best_val_rmse: 0.5461

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5525 Still best_val_rmse: 0.5461 (from epoch 0)

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5076 New best_val_rmse: 0.5076

16 steps took 12.0 seconds


[32m[I 2021-07-16 13:50:46,448][0m Trial 10 finished with value: 0.500770628452301 and parameters: {'base_lr': 0.0001049633236379046, 'last_lr': 0.0008637274006839614}. Best is trial 4 with value: 0.491883784532547.[0m



##### Using fold 4
##### Using base_lr 0.00014813025515643673 last_lr 0.0017270438507141566


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.9 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.118 New best_val_rmse: 1.118

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8802 New best_val_rmse: 0.8802

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6909 New best_val_rmse: 0.6909

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7579 Still best_val_rmse: 0.6909 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6635 New best_val_rmse: 0.6635

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5883 New best_val_rmse: 0.5883

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5488 New best_val_rmse: 0.5488

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5801 Still best_val_rmse: 0.5488 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5566 Still best_val_rmse: 0.5488 (from epoch 0)

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.4997 New best_val_rmse: 0.4997

8 steps

[32m[I 2021-07-16 14:01:29,429][0m Trial 11 finished with value: 0.4947921633720398 and parameters: {'base_lr': 0.00014813025515643673, 'last_lr': 0.0017270438507141566}. Best is trial 4 with value: 0.491883784532547.[0m



##### Using fold 4
##### Using base_lr 0.00013604630042020016 last_lr 0.0014531988209567372


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.149 New best_val_rmse: 1.149

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8815 New best_val_rmse: 0.8815

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6834 New best_val_rmse: 0.6834

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7749 Still best_val_rmse: 0.6834 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6642 New best_val_rmse: 0.6642

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.601 New best_val_rmse: 0.601

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5463 New best_val_rmse: 0.5463

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5481 Still best_val_rmse: 0.5463 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5431 New best_val_rmse: 0.5431

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.4961 New best_val_rmse: 0.4961

8 steps took 6.0 seconds
E

[32m[I 2021-07-16 14:12:06,762][0m Trial 12 finished with value: 0.4950611889362335 and parameters: {'base_lr': 0.00013604630042020016, 'last_lr': 0.0014531988209567372}. Best is trial 4 with value: 0.491883784532547.[0m



##### Using fold 4
##### Using base_lr 9.730206103250245e-05 last_lr 0.0004146135481763983


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.27 New best_val_rmse: 1.27

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9672 New best_val_rmse: 0.9672

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7166 New best_val_rmse: 0.7166

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7281 Still best_val_rmse: 0.7166 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6278 New best_val_rmse: 0.6278

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.599 New best_val_rmse: 0.599

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5435 New best_val_rmse: 0.5435

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5396 New best_val_rmse: 0.5396

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5425 Still best_val_rmse: 0.5396 (from epoch 0)

16 steps took 12.5 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5058 New best_val_rmse: 0.5058

16 steps took 12.0 seconds
E

[32m[I 2021-07-16 14:22:09,453][0m Trial 13 finished with value: 0.4993009865283966 and parameters: {'base_lr': 9.730206103250245e-05, 'last_lr': 0.0004146135481763983}. Best is trial 4 with value: 0.491883784532547.[0m



##### Using fold 4
##### Using base_lr 0.00023973581827158925 last_lr 0.00013194914165020892


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.129 New best_val_rmse: 1.129

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8107 New best_val_rmse: 0.8107

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6572 New best_val_rmse: 0.6572

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6902 Still best_val_rmse: 0.6572 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6238 New best_val_rmse: 0.6238

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5918 New best_val_rmse: 0.5918

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.546 New best_val_rmse: 0.546

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5441 New best_val_rmse: 0.5441

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5807 Still best_val_rmse: 0.5441 (from epoch 0)

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5469 Still best_val_rmse: 0.5441 (from epoch 0)

16 steps 

[32m[I 2021-07-16 14:31:40,016][0m Trial 14 finished with value: 0.5050088167190552 and parameters: {'base_lr': 0.00023973581827158925, 'last_lr': 0.00013194914165020892}. Best is trial 4 with value: 0.491883784532547.[0m



##### Using fold 4
##### Using base_lr 3.291783570455888e-05 last_lr 0.0004053167156653586


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.9 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.345 New best_val_rmse: 1.345

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.116 New best_val_rmse: 1.116

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9424 New best_val_rmse: 0.9424

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7684 New best_val_rmse: 0.7684

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7033 New best_val_rmse: 0.7033

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6708 New best_val_rmse: 0.6708

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6186 New best_val_rmse: 0.6186

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6031 New best_val_rmse: 0.6031

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6419 Still best_val_rmse: 0.6031 (from epoch 0)


[32m[I 2021-07-16 14:35:03,275][0m Trial 15 finished with value: 0.6030850410461426 and parameters: {'base_lr': 3.291783570455888e-05, 'last_lr': 0.0004053167156653586}. Best is trial 4 with value: 0.491883784532547.[0m



##### Using fold 4
##### Using base_lr 0.0001807543792223713 last_lr 0.004354964057622347


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.002 New best_val_rmse: 1.002

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7199 New best_val_rmse: 0.7199

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6602 New best_val_rmse: 0.6602

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5976 New best_val_rmse: 0.5976

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5774 New best_val_rmse: 0.5774

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5979 Still best_val_rmse: 0.5774 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6667 Still best_val_rmse: 0.5774 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5863 Still best_val_rmse: 0.5774 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5273 New best_val_rmse: 0.5273

16 steps took 12.7 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5033 New best_val_rmse: 0.5033

16 step

[32m[I 2021-07-16 14:45:13,588][0m Trial 16 finished with value: 0.4977419972419739 and parameters: {'base_lr': 0.0001807543792223713, 'last_lr': 0.004354964057622347}. Best is trial 4 with value: 0.491883784532547.[0m



##### Using fold 4
##### Using base_lr 8.176324330617398e-05 last_lr 0.0012432581220121835


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.223 New best_val_rmse: 1.223

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9659 New best_val_rmse: 0.9659

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6943 New best_val_rmse: 0.6943

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7109 Still best_val_rmse: 0.6943 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6139 New best_val_rmse: 0.6139

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5907 New best_val_rmse: 0.5907

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5475 New best_val_rmse: 0.5475

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5403 New best_val_rmse: 0.5403

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5342 New best_val_rmse: 0.5342

16 steps took 12.5 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.4974 New best_val_rmse: 0.4974

8 steps took 6.01 seconds
Epoch: 1 batch_

[32m[I 2021-07-16 14:55:56,865][0m Trial 17 finished with value: 0.4916570484638214 and parameters: {'base_lr': 8.176324330617398e-05, 'last_lr': 0.0012432581220121835}. Best is trial 17 with value: 0.4916570484638214.[0m



##### Using fold 4
##### Using base_lr 8.300164646884342e-05 last_lr 0.0010564751027599343


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.236 New best_val_rmse: 1.236

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9706 New best_val_rmse: 0.9706

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7057 New best_val_rmse: 0.7057

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7123 Still best_val_rmse: 0.7057 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6234 New best_val_rmse: 0.6234

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6027 New best_val_rmse: 0.6027

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5503 New best_val_rmse: 0.5503

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.541 New best_val_rmse: 0.541

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5378 New best_val_rmse: 0.5378

16 steps took 12.5 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.4988 New best_val_rmse: 0.4988

8 steps took 6.01 seconds
Epoch: 1 batch_nu

[32m[I 2021-07-16 15:06:41,010][0m Trial 18 finished with value: 0.4933856427669525 and parameters: {'base_lr': 8.300164646884342e-05, 'last_lr': 0.0010564751027599343}. Best is trial 17 with value: 0.4916570484638214.[0m



##### Using fold 4
##### Using base_lr 7.879107116906062e-05 last_lr 0.00041831060497249747


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.29 New best_val_rmse: 1.29

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9842 New best_val_rmse: 0.9842

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7943 New best_val_rmse: 0.7943

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7376 New best_val_rmse: 0.7376

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6435 New best_val_rmse: 0.6435

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6073 New best_val_rmse: 0.6073

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5596 New best_val_rmse: 0.5596

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5494 New best_val_rmse: 0.5494

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5686 Still best_val_rmse: 0.5494 (from epoch 0)

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5119 New best_val_rmse: 0.5119

16 steps took 12.0 seconds
Epoch: 1 batch_n

[32m[I 2021-07-16 15:17:07,754][0m Trial 19 finished with value: 0.4963444471359253 and parameters: {'base_lr': 7.879107116906062e-05, 'last_lr': 0.00041831060497249747}. Best is trial 17 with value: 0.4916570484638214.[0m
[32m[I 2021-07-16 15:17:07,756][0m A new study created in memory with name: no-name-a742421d-155b-4f4a-a123-2adbd7a1f56e[0m



 Best value:  0.4916570484638214
 Best params: 
    base_lr: 8.176324330617398e-05
    last_lr: 0.0012432581220121835
##### Using fold 5
##### Using base_lr 0.000471631234884967 last_lr 0.004293828027624209


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8874 New best_val_rmse: 0.8874

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.663 New best_val_rmse: 0.663

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7249 Still best_val_rmse: 0.663 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6884 Still best_val_rmse: 0.663 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7479 Still best_val_rmse: 0.663 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.735 Still best_val_rmse: 0.663 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.611 New best_val_rmse: 0.611

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5978 New best_val_rmse: 0.5978

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5598 New best_val_rmse: 0.5598

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5412 New best_val_rmse: 0.541

[32m[I 2021-07-16 15:26:38,266][0m Trial 0 finished with value: 0.5019031763076782 and parameters: {'base_lr': 0.000471631234884967, 'last_lr': 0.004293828027624209}. Best is trial 0 with value: 0.5019031763076782.[0m



##### Using fold 5
##### Using base_lr 0.0001588419458411717 last_lr 0.0009567157920320694


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.057 New best_val_rmse: 1.057

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7335 New best_val_rmse: 0.7335

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6864 New best_val_rmse: 0.6864

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6278 New best_val_rmse: 0.6278

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.647 Still best_val_rmse: 0.6278 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5985 New best_val_rmse: 0.5985

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5825 New best_val_rmse: 0.5825

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.592 Still best_val_rmse: 0.5825 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5557 New best_val_rmse: 0.5557

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5885 Still best_val_rmse: 0.5557 (from epoch 0)

16 steps 

[32m[I 2021-07-16 15:37:28,987][0m Trial 1 finished with value: 0.4910915195941925 and parameters: {'base_lr': 0.0001588419458411717, 'last_lr': 0.0009567157920320694}. Best is trial 1 with value: 0.4910915195941925.[0m



##### Using fold 5
##### Using base_lr 9.170662664079284e-05 last_lr 0.0006441795305290672


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.163 New best_val_rmse: 1.163

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8978 New best_val_rmse: 0.8978

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6982 New best_val_rmse: 0.6982

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6901 New best_val_rmse: 0.6901

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6022 New best_val_rmse: 0.6022

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6005 New best_val_rmse: 0.6005

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5823 New best_val_rmse: 0.5823

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5595 New best_val_rmse: 0.5595

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.547 New best_val_rmse: 0.547

16 steps took 12.5 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.605 Still best_val_rmse: 0.547 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 1 batch_num

[32m[I 2021-07-16 15:48:11,002][0m Trial 2 finished with value: 0.49325135350227356 and parameters: {'base_lr': 9.170662664079284e-05, 'last_lr': 0.0006441795305290672}. Best is trial 1 with value: 0.4910915195941925.[0m



##### Using fold 5
##### Using base_lr 6.104770123792814e-05 last_lr 0.0018098034219241532


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.079 New best_val_rmse: 1.079

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7939 New best_val_rmse: 0.7939

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6872 New best_val_rmse: 0.6872

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6566 New best_val_rmse: 0.6566

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5995 New best_val_rmse: 0.5995

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.589 New best_val_rmse: 0.589

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5728 New best_val_rmse: 0.5728

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5697 New best_val_rmse: 0.5697

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5477 New best_val_rmse: 0.5477

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5712 Still best_val_rmse: 0.5477 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 1 batch_n

[32m[I 2021-07-16 15:57:39,328][0m Trial 3 finished with value: 0.5013652443885803 and parameters: {'base_lr': 6.104770123792814e-05, 'last_lr': 0.0018098034219241532}. Best is trial 1 with value: 0.4910915195941925.[0m



##### Using fold 5
##### Using base_lr 0.00043586247767072087 last_lr 0.00010719695412836729


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.013 New best_val_rmse: 1.013

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7277 New best_val_rmse: 0.7277

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6241 New best_val_rmse: 0.6241

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5966 New best_val_rmse: 0.5966

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.653 Still best_val_rmse: 0.5966 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5944 New best_val_rmse: 0.5944

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5984 Still best_val_rmse: 0.5944 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6022 Still best_val_rmse: 0.5944 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6089 Still best_val_rmse: 0.5944 (from epoch 0)

16 steps took 12.5 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5728 New best_val_rmse:

[32m[I 2021-07-16 16:07:07,978][0m Trial 4 finished with value: 0.506541907787323 and parameters: {'base_lr': 0.00043586247767072087, 'last_lr': 0.00010719695412836729}. Best is trial 1 with value: 0.4910915195941925.[0m



##### Using fold 5
##### Using base_lr 0.00043149275513041085 last_lr 8.652046816842741e-05


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.013 New best_val_rmse: 1.013

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7347 New best_val_rmse: 0.7347

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6569 New best_val_rmse: 0.6569

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5936 New best_val_rmse: 0.5936

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7005 Still best_val_rmse: 0.5936 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6219 Still best_val_rmse: 0.5936 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5783 New best_val_rmse: 0.5783

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6021 Still best_val_rmse: 0.5783 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5973 Still best_val_rmse: 0.5783 (from epoch 0)

16 steps took 12.5 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5489 New best_val_rmse

[32m[I 2021-07-16 16:16:37,959][0m Trial 5 finished with value: 0.5003818273544312 and parameters: {'base_lr': 0.00043149275513041085, 'last_lr': 8.652046816842741e-05}. Best is trial 1 with value: 0.4910915195941925.[0m



##### Using fold 5
##### Using base_lr 0.00010193746973446588 last_lr 0.0048433216768270886


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9856 New best_val_rmse: 0.9856

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7327 New best_val_rmse: 0.7327

16 steps took 11.9 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.71 New best_val_rmse: 0.71

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6677 New best_val_rmse: 0.6677

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5706 New best_val_rmse: 0.5706

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5759 Still best_val_rmse: 0.5706 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6095 Still best_val_rmse: 0.5706 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5792 Still best_val_rmse: 0.5706 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5744 Still best_val_rmse: 0.5706 (from epoch 0)

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5472 New best_val_rmse: 

[32m[I 2021-07-16 16:27:36,618][0m Trial 6 finished with value: 0.49097248911857605 and parameters: {'base_lr': 0.00010193746973446588, 'last_lr': 0.0048433216768270886}. Best is trial 6 with value: 0.49097248911857605.[0m



##### Using fold 5
##### Using base_lr 0.00024004824734813894 last_lr 0.0005898015737527751


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.027 New best_val_rmse: 1.027

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7479 New best_val_rmse: 0.7479

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6303 New best_val_rmse: 0.6303

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6851 Still best_val_rmse: 0.6303 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6777 Still best_val_rmse: 0.6303 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5943 New best_val_rmse: 0.5943

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5737 New best_val_rmse: 0.5737

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.566 New best_val_rmse: 0.566

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5814 Still best_val_rmse: 0.566 (from epoch 0)

16 steps took 12.5 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5377 New best_val_rmse: 0.5377

16 steps t

[32m[I 2021-07-16 16:40:20,057][0m Trial 7 finished with value: 0.4856576919555664 and parameters: {'base_lr': 0.00024004824734813894, 'last_lr': 0.0005898015737527751}. Best is trial 7 with value: 0.4856576919555664.[0m



##### Using fold 5
##### Using base_lr 0.00030625350108401557 last_lr 0.0005997069317383801


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.011 New best_val_rmse: 1.011

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7295 New best_val_rmse: 0.7295

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6397 New best_val_rmse: 0.6397

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6797 Still best_val_rmse: 0.6397 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6725 Still best_val_rmse: 0.6397 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5986 New best_val_rmse: 0.5986

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5828 New best_val_rmse: 0.5828

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5691 New best_val_rmse: 0.5691

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6011 Still best_val_rmse: 0.5691 (from epoch 0)

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5358 New best_val_rmse: 0.5358

16 step

[32m[I 2021-07-16 16:51:04,752][0m Trial 8 finished with value: 0.48951485753059387 and parameters: {'base_lr': 0.00030625350108401557, 'last_lr': 0.0005997069317383801}. Best is trial 7 with value: 0.4856576919555664.[0m



##### Using fold 5
##### Using base_lr 6.517563273485793e-05 last_lr 0.0012817042987679538


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.118 New best_val_rmse: 1.118

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8432 New best_val_rmse: 0.8432

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6786 New best_val_rmse: 0.6786

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6515 New best_val_rmse: 0.6515

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.596 New best_val_rmse: 0.596

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5888 New best_val_rmse: 0.5888

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5771 New best_val_rmse: 0.5771

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5597 New best_val_rmse: 0.5597

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5451 New best_val_rmse: 0.5451

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5864 Still best_val_rmse: 0.5451 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 1 batch_n

[32m[I 2021-07-16 17:00:34,119][0m Trial 9 finished with value: 0.5098778009414673 and parameters: {'base_lr': 6.517563273485793e-05, 'last_lr': 0.0012817042987679538}. Best is trial 7 with value: 0.4856576919555664.[0m



##### Using fold 5
##### Using base_lr 3.266078444594387e-05 last_lr 0.0002033942977641447


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.3 New best_val_rmse: 1.3

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.082 New best_val_rmse: 1.082

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9323 New best_val_rmse: 0.9323

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7397 New best_val_rmse: 0.7397

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6561 New best_val_rmse: 0.6561

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6262 New best_val_rmse: 0.6262

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6073 New best_val_rmse: 0.6073

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6183 Still best_val_rmse: 0.6073 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5971 New best_val_rmse: 0.5971

16 steps took 12.6 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5737 New best_val_rmse: 0.5737

16 steps took 12.0 seconds
Epoch: 1 batch_num: 

[32m[I 2021-07-16 17:10:03,771][0m Trial 10 finished with value: 0.5209943056106567 and parameters: {'base_lr': 3.266078444594387e-05, 'last_lr': 0.0002033942977641447}. Best is trial 7 with value: 0.4856576919555664.[0m



##### Using fold 5
##### Using base_lr 0.0002506692176886094 last_lr 0.0002902778174110193


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.036 New best_val_rmse: 1.036

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7747 New best_val_rmse: 0.7747

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7542 New best_val_rmse: 0.7542

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6965 New best_val_rmse: 0.6965

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5835 New best_val_rmse: 0.5835

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6147 Still best_val_rmse: 0.5835 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6209 Still best_val_rmse: 0.5835 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5871 Still best_val_rmse: 0.5835 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.582 New best_val_rmse: 0.582

16 steps took 12.5 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5352 New best_val_rmse: 0.5352

16 steps 

[32m[I 2021-07-16 17:20:47,834][0m Trial 11 finished with value: 0.49322155117988586 and parameters: {'base_lr': 0.0002506692176886094, 'last_lr': 0.0002902778174110193}. Best is trial 7 with value: 0.4856576919555664.[0m



##### Using fold 5
##### Using base_lr 0.0002297546136917806 last_lr 0.00034915806261776055


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.056 New best_val_rmse: 1.056

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8289 New best_val_rmse: 0.8289

16 steps took 11.9 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6998 New best_val_rmse: 0.6998

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6867 New best_val_rmse: 0.6867

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5745 New best_val_rmse: 0.5745

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5833 Still best_val_rmse: 0.5745 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6533 Still best_val_rmse: 0.5745 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5828 Still best_val_rmse: 0.5745 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6056 Still best_val_rmse: 0.5745 (from epoch 0)

16 steps took 12.5 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5614 New best_val_rmse

[32m[I 2021-07-16 17:33:39,147][0m Trial 12 finished with value: 0.4850253164768219 and parameters: {'base_lr': 0.0002297546136917806, 'last_lr': 0.00034915806261776055}. Best is trial 12 with value: 0.4850253164768219.[0m



##### Using fold 5
##### Using base_lr 0.00019621961496857723 last_lr 0.00031447979189509236


Some weights of the model checkpoint at t5-large were not used when initializing T5EncoderModel: ['decoder.block.15.layer.0.SelfAttention.v.weight', 'decoder.block.3.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.v.weight', 'decoder.block.18.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.14.layer.0.SelfAttention.o.weight', 'decoder.block.6.layer.0.SelfAttention.v.weight', 'decoder.block.20.layer.1.EncDecAttention.o.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.0.SelfAttention.o.weight', 'decoder.block.9.layer.2.layer_norm.weight', 'decoder.block.6.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.2.DenseReluDense.wi.weight', 'decoder.block.15.layer.1.EncDecAttention.q.weight', 'decoder.block.16.layer.2.DenseReluDense.wo.weight', 'decoder.block.13.layer.2.layer_

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.078 New best_val_rmse: 1.078

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8251 New best_val_rmse: 0.8251

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.701 New best_val_rmse: 0.701

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6376 New best_val_rmse: 0.6376

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5837 New best_val_rmse: 0.5837

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5747 New best_val_rmse: 0.5747

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5865 Still best_val_rmse: 0.5747 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5816 Still best_val_rmse: 0.5747 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5756 Still best_val_rmse: 0.5747 (from epoch 0)


### Verify the model

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm

In [None]:
cfg.model_offset = 0
cfg.model_limit = 6
cfg.n_folds = 5
cfg.svm_kernels = ['rbf']
cfg.svm_c = 5

In [None]:
num_bins = int(np.ceil(np.log2(len(train_df))))
train_df['bins'] = pd.cut(train_df['target'], bins=num_bins, labels=False)
bins = train_df['bins'].values

In [None]:
%%time

inference_models = []
for i in range(1, cfg.NUM_FOLDS + 1):
    print(f'Model {i}')
    inference_model = CommonLitModel()
    inference_model = inference_model.cuda()
    inference_model.load_state_dict(torch.load(str(MODELS_PATH/f"{cfg.model_name.replace('/', '_')}_{i}/model_{i}.pth")))
    inference_model.eval();
    inference_models.append(inference_model)

In [None]:
from transformers import RobertaTokenizer

tokenizers = []
for i in range(1, cfg.NUM_FOLDS):
    tokenizer = RobertaTokenizer.from_pretrained(MODELS_PATH/f"{cfg.model_name.replace('/', '_')}_{i}")
    tokenizers.append(tokenizer)

In [None]:
def get_cls_embeddings(dl, transformer_model):
    cls_embeddings = []
    with torch.no_grad():
        for input_features in tqdm(dl, total=len(dl)):
            output, context_vector = transformer_model(input_features['input_ids'].cuda(), input_features['attention_mask'].cuda())
#             cls_embeddings.extend(output['last_hidden_state'][:,0,:].detach().cpu().numpy())
            embedding_out = context_vector.detach().cpu().numpy()
            cls_embeddings.extend(embedding_out)
    return np.array(cls_embeddings)

In [None]:
def rmse_score(X, y):
    return np.sqrt(mean_squared_error(X, y))

In [None]:
def convert_to_list(t):
    return t.flatten().long()

class CommonLitDataset(nn.Module):
    def __init__(self, text, test_id, tokenizer, max_len=128):
        self.excerpt = text
        self.test_id = test_id
        self.max_len = max_len
        self.tokenizer = tokenizer
    
    def __getitem__(self,idx):
        encode = self.tokenizer(self.excerpt[idx],
                                return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',
                                truncation=True)
        return {'input_ids': convert_to_list(encode['input_ids']),
                'attention_mask': convert_to_list(encode['attention_mask']),
                'id': self.test_id[idx]}
    
    def __len__(self):
        return len(self.excerpt)

In [None]:
def create_dl(df, tokenizer):
    text = df['excerpt'].values
    ids = df['id'].values
    ds = CommonLitDataset(text, ids, tokenizer, max_len=cfg.MAX_LEN)
    return DataLoader(ds, 
                      batch_size = cfg.BATCH_SIZE,
                      shuffle=False,
                      num_workers = 1,
                      pin_memory=True,
                      drop_last=False
                     )

In [None]:
train_df = pd.read_csv(DATA_PATH/'train-orig.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')
remove_unnecessary(train_df)

In [None]:
train_target_mean = train_df['target'].mean()
train_target_std = train_df['target'].std()
train_df['normalized_target'] = (train_df['target'] - train_target_mean) / train_target_std

In [None]:
%%time

train_target = train_df['normalized_target'].values

def calc_mean(scores):
    return np.mean(np.array(scores), axis=0)

final_scores = []
final_rmse = []
kernel_rmse_score_mean = []
final_kernel_predictions_means = []
for j, (inference_model, tokenizer) in enumerate(zip(inference_models, tokenizers)):
    print('Model', j)
    test_dl = create_dl(test_df, tokenizer)
    train_dl = create_dl(train_df, tokenizer)
    transformer_model = inference_model
    transformer_model.cuda()
    X = get_cls_embeddings(train_dl, transformer_model)
    
    y = train_target
    X_test = get_cls_embeddings(test_dl, transformer_model)
    
    kfold = StratifiedKFold(n_splits=cfg.NUM_FOLDS)
    scores = []
    rmse_scores = []
    kernel_predictions_means = []
    for kernel in cfg.svm_kernels:
        print('Kernel', kernel)
        kernel_scores = []
        kernel_rmse_scores = []
        kernel_predictions = []
        for k, (train_idx, valid_idx) in enumerate(kfold.split(X, bins)):

            print('Fold', k, train_idx.shape, valid_idx.shape)
            model = SVR(C=cfg.svm_c, kernel=kernel, gamma='auto')

            X_train, y_train = X[train_idx], y[train_idx]
            X_valid, y_valid = X[valid_idx], y[valid_idx]
            model.fit(X_train, y_train)
            prediction = model.predict(X_valid)
            kernel_predictions.append(prediction)
            kernel_rmse_scores.append(rmse_score(prediction, y_valid))
            print('rmse_score', kernel_rmse_scores[k])
            kernel_scores.append(model.predict(X_test))
        kernel_predictions_means.append(np.array([np.mean(kp) for kp in kernel_predictions]).mean())
        scores.append(calc_mean(kernel_scores))
        kernel_rmse_score = calc_mean(kernel_rmse_scores)
        kernel_rmse_score_mean.append(kernel_rmse_score)
        rmse_scores.append(kernel_rmse_score)
    final_kernel_predictions_means.append(kernel_predictions_means)
    final_scores.append(calc_mean(scores))
    final_rmse.append(calc_mean(rmse_scores))
print('FINAL RMSE score', np.mean(np.array(final_rmse)))

In [None]:
final_kernel_predictions_means

In [None]:
# (train_df['target'] - cfg.train_target_mean) / cfg.train_target_std
final_scores_normalized = np.array(final_scores) * train_target_std + train_target_mean

In [None]:
kernel_rmse_score_mean_array = np.array(kernel_rmse_score_mean)
kernel_rmse_score_mean_sum = np.sum(kernel_rmse_score_mean_array)
prop_losses = kernel_rmse_score_mean_array / kernel_rmse_score_mean_sum
prop_losses_sum = (1 - prop_losses).sum()
weights = (1 - prop_losses) / prop_losses_sum
weights

In [None]:
def calc_mean(scores, weights=weights):
    return np.average(np.array(scores), weights=weights, axis=0)

In [None]:
target_mean = train_df['target'].mean()
final_scores_flat = calc_mean(final_scores_normalized).flatten()
final_scores_mean = final_scores_flat.mean()
target_mean, np.array(final_scores_normalized).mean()
# (-0.9579984513405823, -0.8029817438292849)

In [None]:
final_scores_flat

In [None]:
mean_diff = target_mean - final_scores_mean
mean_diff, mean_diff / len(final_scores)

In [None]:
sample_df['target'] = final_scores_flat + mean_diff
# sample_df['target'] = len(final_scores) / np.sum(1 / np.array(final_scores), axis=0) # harmonic mean
sample_df

### Prepare Packaging

In [None]:
cfg.model_name

In [None]:
BEST_MODEL_FOLDER = MODELS_PATH/cfg.model_name/'best'
!rm -rf {BEST_MODEL_FOLDER}
!mkdir -p {BEST_MODEL_FOLDER}

In [None]:
BEST_MODEL_FOLDER

In [None]:
cfg.NUM_FOLDS

In [None]:
bestmodels = [MODELS_PATH/f'{cfg.model_name}_{i + 1}' for i in range(0, cfg.NUM_FOLDS)]

In [None]:
bestmodels

In [None]:
from shutil import copyfile

def normalize_name(path_name):
    return path_name.replace('', '')

for i, best_model in enumerate(bestmodels):
    print(f'Processing {i}th model')
    i = i + 1
    best_model_file = f'{best_model}/model_{i}.pth'
    if Path(best_model_file).exists():
        copyfile(best_model_file, f'{BEST_MODEL_FOLDER}/{i}_pytorch_model.bin')
        tokenizer_path = Path(BEST_MODEL_FOLDER/f'tokenizer-{i}')
        tokenizer_path.mkdir(parents=True, exist_ok=True)
        assert tokenizer_path.exists()

        tokenizer_json = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/tokenizer_config.json'))
        assert tokenizer_json.exists(), f'{tokenizer_json} does not exist'
        copyfile(tokenizer_json, tokenizer_path/'tokenizer.json')

        vocab_txt = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/vocab.json'))
        assert vocab_txt.exists(), f'{vocab_txt} does not exist'
        copyfile(vocab_txt, tokenizer_path/'vocab.json')

        merges = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/merges.txt'))
        assert merges.exists()
        copyfile(merges, tokenizer_path/'merges.txt')
    else:
        print(f'{best_model_file} is missing')

In [None]:
import shutil

shutil.make_archive(MODELS_PATH/cfg.model_name/'best_models', 'zip', BEST_MODEL_FOLDER)

In [None]:
!ls {MODELS_PATH/cfg.model_name}

In [None]:
!mv {MODELS_PATH}/{cfg.model_name}.yaml {MODELS_PATH/cfg.model_name}

In [None]:
transformer_model.transformer_model.save_pretrained(save_directory=f'{MODELS_PATH/cfg.model_name}/lm')

In [None]:
!du -h {MODELS_PATH/cfg.model_name}/*

In [None]:
shutil.make_archive(MODELS_PATH/cfg.model_name/'lm', 'zip', f'{MODELS_PATH/cfg.model_name}/lm')

In [None]:
!kaggle datasets init -p {MODELS_PATH/cfg.model_name}

In [None]:
dataset_json_path = Path(MODELS_PATH/cfg.model_name/'dataset-metadata.json')
assert dataset_json_path.exists()

In [None]:
!cat {str(dataset_json_path)}

In [None]:
with open(dataset_json_path, 'r') as f:
    dataset_json = f.read()
    dataset_json = dataset_json.replace('INSERT_TITLE_HERE', f'commonlit-{cfg.model_name}-light').replace('INSERT_SLUG_HERE', f'commonlit-{cfg.model_name}-light')
    print(dataset_json)
with(open(dataset_json_path, 'w')) as f:
    f.write(dataset_json)

In [None]:
!rm -rf {MODELS_PATH/cfg.model_name}/best
!rm -rf {MODELS_PATH/cfg.model_name}/lm

In [None]:
!kaggle datasets create -p {MODELS_PATH/cfg.model_name}

In [None]:
!kaggle datasets version -p {MODELS_PATH/cfg.model_name} -m "Version with merges.txt" -d

In [None]:
state_dict = torch.load(str(MODELS_PATH/f'distilroberta-0/checkpoint-105/pytorch_model.bin'))

In [None]:
loaded_model = CommonLitModel()

In [None]:
loaded_model.load_state_dict(state_dict)