In [1]:
# !pip install optuna

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import gc, warnings, random, time, os

from pathlib import Path

from tqdm.notebook import tqdm

warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from transformers import AutoModel, AutoTokenizer, AutoConfig
from transformers import get_cosine_schedule_with_warmup

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import seaborn as sns

import gc
gc.enable()

import optuna

### Folders and Dataframes

In [2]:
DATA_PATH = Path('/home/commonlit/data/')
assert DATA_PATH.exists()
MODELS_PATH = Path('/home/commonlit/models/')
if not MODELS_PATH.exists():
    os.mkdir(MODELS_PATH)
assert MODELS_PATH.exists()

In [3]:
train_df = pd.read_csv(DATA_PATH/'train-orig.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')
sample_df = pd.read_csv(DATA_PATH/'sample_submission.csv')

In [4]:
def remove_unnecessary(df):
    df.drop(df[df['target'] == 0].index, inplace=True)
    df.reset_index(drop=True, inplace=True)
    
remove_unnecessary(train_df)

In [5]:
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845
...,...,...,...,...,...,...
2828,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2829,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2830,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2831,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


### Config and Seeding

In [6]:
class Config(): 
    NUM_FOLDS = 6
    NUM_EPOCHS = 3
    BATCH_SIZE = 16
    MAX_LEN = 248
    EVAL_SCHEDULE = [(0.50, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1., 1)]
    ROBERTA_PATH = 'roberta-large-mnli'
    TOKENIZER_PATH = 'roberta-large-mnli'
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    SEED = 1000
    NUM_WORKERS = 2
    MODEL_FOLDER = MODELS_PATH
    model_name = 'roberta-large-mnli'
    svm_kernels = ['rbf']
    svm_c = 5

cfg = Config()

In [7]:
if not cfg.MODEL_FOLDER.exists():
    os.mkdir(cfg.MODEL_FOLDER)

In [8]:
def set_random_seed(random_seed):
    random.seed(random_seed)
    np.random.seed(random_seed)
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    torch.backends.cudnn.deterministic = True

### Dataset

In [9]:
def add_bins(train_df, num_bins):
    train_df.loc[:, 'bins'] = pd.cut(train_df['target'], bins=num_bins, labels=False)
    return num_bins

In [10]:
add_bins(train_df, cfg.NUM_FOLDS)

6

In [11]:
train_df.groupby(['bins'])['target'].agg(['count', 'mean'])

Unnamed: 0_level_0,count,mean
bins,Unnamed: 1_level_1,Unnamed: 2_level_1
0,122,-3.125765
1,441,-2.270279
2,784,-1.41215
3,886,-0.548095
4,494,0.289716
5,106,1.070237


In [12]:
tokenizer = AutoTokenizer.from_pretrained(cfg.TOKENIZER_PATH)

In [13]:
class CommonLitDataset(Dataset):
    def __init__(self, df, tokenizer, inference_only=False):
        super().__init__()
        self.df, self.inference_only = df, inference_only
        self.text = df['excerpt'].tolist()
        self.bins = df['bins']
        if not inference_only:
            self.target = torch.tensor(df['target'].to_numpy(), dtype = torch.float32)
        
        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding = 'max_length',
            max_length = cfg.MAX_LEN,
            truncation = True,
            return_attention_mask=True
        )
        
    def __getitem__(self, index):        
        input_ids = torch.tensor(self.encoded['input_ids'][index])
        attention_mask = torch.tensor(self.encoded['attention_mask'][index])
        
        if self.inference_only:
            return {'input_ids': input_ids, 'attention_mask': attention_mask}
        else:
            target = self.target[index]
            return {'input_ids': input_ids, 'attention_mask': attention_mask, 'target': target}
    
    def __len__(self):
        return len(self.df)

In [14]:
sample_ds = CommonLitDataset(train_df, tokenizer)

### Model

In [15]:
class AttentionHead(nn.Module):
    
    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        
        self.hidden_layer = nn.Linear(in_features, hidden_dim)
        self.final_layer = nn.Linear(hidden_dim, num_targets)
        self.out_features = hidden_dim
        
    def forward(self, features):
        att = torch.tanh(self.hidden_layer(features))
        score = self.final_layer(att)
        attention_weights = torch.softmax(score, dim=1)
        return attention_weights

In [16]:
class CommonLitModel(nn.Module):
    def __init__(self):
        super(CommonLitModel, self).__init__()
        config = AutoConfig.from_pretrained(cfg.ROBERTA_PATH)
        config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7
        })
        self.transformer_model = AutoModel.from_pretrained(cfg.ROBERTA_PATH, config=config)
        self.attention = AttentionHead(config.hidden_size, 512, 1)
        self.regressor = nn.Linear(config.hidden_size, 1)
    
    def forward(self, input_ids, attention_mask):
        last_layer_hidden_states = self.transformer_model(input_ids=input_ids, attention_mask=attention_mask)['last_hidden_state']
        weights = self.attention(last_layer_hidden_states)
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1) 
        return self.regressor(context_vector), context_vector

In [17]:
sample_model = CommonLitModel()

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
import re

for i, (name, param) in enumerate(sample_model.named_parameters()):
    if(name.find('layer') > -1):
        layer_name = re.sub(r'.+(layer\.\d+).+', r'\1', name)

In [19]:
for i, (name, param) in enumerate(sample_model.named_parameters()):
    print(i, name, param.size())

0 transformer_model.embeddings.word_embeddings.weight torch.Size([50265, 1024])
1 transformer_model.embeddings.position_embeddings.weight torch.Size([514, 1024])
2 transformer_model.embeddings.token_type_embeddings.weight torch.Size([1, 1024])
3 transformer_model.embeddings.LayerNorm.weight torch.Size([1024])
4 transformer_model.embeddings.LayerNorm.bias torch.Size([1024])
5 transformer_model.encoder.layer.0.attention.self.query.weight torch.Size([1024, 1024])
6 transformer_model.encoder.layer.0.attention.self.query.bias torch.Size([1024])
7 transformer_model.encoder.layer.0.attention.self.key.weight torch.Size([1024, 1024])
8 transformer_model.encoder.layer.0.attention.self.key.bias torch.Size([1024])
9 transformer_model.encoder.layer.0.attention.self.value.weight torch.Size([1024, 1024])
10 transformer_model.encoder.layer.0.attention.self.value.bias torch.Size([1024])
11 transformer_model.encoder.layer.0.attention.output.dense.weight torch.Size([1024, 1024])
12 transformer_model.enco

In [20]:
sample_input_ids = torch.randint(0, 1000, [8, 248])
sample_attention_mask = torch.randint(0, 1000, [8, 248])

In [21]:
sample_model(sample_input_ids, sample_attention_mask)[1].shape

torch.Size([8, 1024])

In [22]:
torch.sum(torch.randn([8, 496, 768]), axis=1)

tensor([[-12.7320,   2.2587,  30.8258,  ...,  17.4633, -30.4209,  34.1521],
        [ 36.0794,  49.1316, -36.8480,  ..., -20.6265, -36.4150,  -0.8295],
        [ 16.8225,  21.9673,   8.4700,  ..., -12.0893,  -0.0784,   8.2608],
        ...,
        [-10.8944, -20.0534, -18.3019,  ...,  36.6813, -10.7048,  11.8113],
        [ 28.1540, -33.0799, -10.2496,  ..., -17.4998, -34.1061,   5.0624],
        [-14.8967,  33.1763,   0.9350,  ...,  35.0615,   0.2091,   7.9617]])

### Evaluation and Prediction

In [23]:
def eval_mse(model, data_loader):
    model.eval()
    mse_sum = 0
    mse_loss = nn.MSELoss(reduction='sum')
    
    with torch.no_grad():
        for batch_num, record in enumerate(data_loader):
            input_ids, attention_mask, target = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE), record['target'].to(cfg.DEVICE)
            pred, _ = model(input_ids, attention_mask)
            mse_sum += mse_loss(pred.flatten().cpu(), target.cpu())
            
    return mse_sum / len(data_loader.dataset)

In [24]:
def predict(model, data_loader):
    model.eval()
    result = []
    
    with torch.no_grad():
        for batch_num, record in tqdm(enumerate(data_loader), total=len(data_loader)):
            input_ids, attention_mask = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE)
            pred, _ = model(input_ids, attention_mask)
            result.extend(pred.flatten().to("cpu").tolist())
            
    return np.array(result)

In [25]:
sample_dl = DataLoader(sample_ds, shuffle=False, batch_size=16, num_workers=1)

### Optimizer and Sampler

In [26]:
5e-5 / 2.5, 5e-5 / 0.5, 5e-5

(2e-05, 0.0001, 5e-05)

In [27]:
def create_optimizer(model, base_lr=5e-5, last_lr=None):
    named_parameters = list(model.named_parameters())
    
    regressor_param_start = 395
    roberta_parameters = named_parameters[:389]
    attention_parameters = named_parameters[391:regressor_param_start]
    regressor_parameters = named_parameters[regressor_param_start:]
    
    attention_group = [params for (name, params) in attention_parameters]
    regressor_group = [params for (name, params) in regressor_parameters]
    
    parameters = []
    if last_lr is not None:
        parameters.append({"params": attention_group, "lr": last_lr})
        parameters.append({"params": regressor_group, "lr": last_lr})
    else:
        parameters.append({"params": attention_group})
        parameters.append({"params": regressor_group})
    
    for layer_num, (name, params) in enumerate(roberta_parameters):
        weight_decay = 0.0 if 'bias' in name else 0.01
        
        lr = base_lr / 2.5 # 2e-05
        if layer_num >= 260:
            lr = base_lr / 0.5 # 1e-4
        elif layer_num >= 132:        
            lr = base_lr    
            
        parameters.append({"params": params,
                           "weight_decay": weight_decay,
                           "lr": lr})
        
    return AdamW(parameters)

In [28]:
sample_optimizer = create_optimizer(sample_model)

In [29]:
from torch.utils.data import Sampler,SequentialSampler,RandomSampler,SubsetRandomSampler
from collections import Counter

class WeightedSampler(Sampler):
    
    def __init__(self, dataset):
        
        self.indices = list(range(len(dataset)))
        self.num_samples = len(dataset)
        self.label_to_count = dict(Counter(dataset.bins))
        weights = [1/self.label_to_count[i] for i in dataset.bins]
        
        self.weights = torch.tensor(weights,dtype=torch.double)
        
    def __iter__(self):
        count = 0
        index = [self.indices[i] for i in torch.multinomial(self.weights, self.num_samples, replacement=True)]
        while count < self.num_samples:
            yield index[count]
            count += 1
    
    def __len__(self):
        return self.num_samples

### Training

In [30]:
def choose_eval_period(val_rmse):
    for rmse, period in cfg.EVAL_SCHEDULE:
        if val_rmse >= rmse:
            return period

In [31]:
def serialize_best(best_val_rmse, best_epoch, val_rmse, epoch, model, model_path):
    if not best_val_rmse or val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_epoch = epoch
        if not model_path.parent.exists():
            os.makedirs(model_path.parent)
        
        torch.save(model.state_dict(), model_path)
        print(f"New best_val_rmse: {best_val_rmse:0.4}")
    else:       
        print(f"Still best_val_rmse: {best_val_rmse:0.4}",
              f"(from epoch {best_epoch})")
    return best_epoch, best_val_rmse

In [32]:
class Trainer():
    def __init__(self, model, model_path, train_loader, val_loader, optimizer, scheduler=None, num_epochs=cfg.NUM_EPOCHS):
        self.model, self.model_path, self.train_loader, self.val_loader, self.optimizer, self.scheduler, self.num_epochs = (
            model, model_path, train_loader, val_loader, optimizer, scheduler, num_epochs
        )
            
    def train(self):
        self.model.train()
        mse_loss = nn.MSELoss(reduction='mean')
        
        best_val_rmse = None
        best_epoch = 0
        step = 0
        last_eval_step = 0
        eval_period = cfg.EVAL_SCHEDULE[0][1]    

        start = time.time()
        
        tbar = tqdm(range(self.num_epochs), total=self.num_epochs)
        for epoch in tbar:
            tbar.set_description(f'Epoch: {epoch}')
            val_rmse = None
            for batch_num, record in enumerate(self.train_loader):
                input_ids, attention_mask, target = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE), record['target'].to(cfg.DEVICE)
                
                self.optimizer.zero_grad()
                
                pred, _ = self.model(input_ids, attention_mask)
                
                mse = mse_loss(pred.flatten(), target)
                
                mse.backward()
                
                self.optimizer.step()
                if self.scheduler:
                    self.scheduler.step()
                    
                if step >= last_eval_step + eval_period:
                    elapsed_seconds = time.time() - start
                    num_steps = step - last_eval_step
                    print(f"\n{num_steps} steps took {elapsed_seconds:0.3} seconds")
                    last_eval_step = step
                    
                    val_rmse = np.sqrt(eval_mse(self.model, self.val_loader))
                    print(f"Epoch: {epoch} batch_num: {batch_num}", f"val_rmse: {val_rmse:0.4} ", end='')
                    
                    eval_period = choose_eval_period(val_rmse)
                    best_epoch, best_val_rmse = serialize_best(best_val_rmse, best_epoch, val_rmse, epoch, self.model, self.model_path)
                    start = time.time()
                # Finish early on condition
                if epoch > 0 and best_val_rmse > 0.6:
                    return best_val_rmse
                
                step += 1
        return best_val_rmse

In [33]:
kfold = KFold(n_splits=cfg.NUM_FOLDS, random_state=cfg.SEED, shuffle=True)
splits = list(kfold.split(train_df))

### Optuna

In [34]:
# fold 0 best: {'base_lr': 5.399287252438555e-05, 'last_lr': 0.00031407679884352875} 0.4808
# fold 0 best: {'base_lr': 8.008719222051192e-05, 'last_lr': 9.176800039281186e-05}. Best is trial 13 with value: 0.48905229568481445.
# fold 1 best: {'base_lr': 6.499521436817742e-05, 'last_lr': 0.003793058089074889}. Best is trial 16 with value: 0.4678534269332886.
# fold 2 best: {'base_lr': 8.906201454928846e-05, 'last_lr': 0.00012184015294923997}. Best is trial 2 with value: 0.47420474886894226.
# fold 3 best: {'base_lr': 3.226406820160473e-05, 'last_lr': 0.00238164178709544}. Best trial:  0.48740705847740173
# fold 4 best: {'base_lr': 4.25982124671479e-05, 'last_lr':  0.00046783085975998843}. Best trial:  0.49172893166542053
# fold 5 best: {'base_lr': 2.517671262528407e-05, 'last_lr': 0.00406777602451577}. Best is trial 0 with value: 0.4810352027416229

In [35]:
fold = 0

def objective(trial):
    base_lr = trial.suggest_float("base_lr", 8e-6, 5e-4, log=True)
    last_lr = trial.suggest_float("last_lr", 8e-5, 5e-3, log=True)
    
    print(f'##### Using fold {fold}')
    
    model_path = cfg.MODEL_FOLDER/f"{cfg.model_name.replace('/', '_')}_{fold + 1}/model_{fold + 1}.pth"
    
    set_random_seed(cfg.SEED + fold)
    
    tokenizer = AutoTokenizer.from_pretrained(cfg.TOKENIZER_PATH)
    
    train_indices, val_indices = splits[fold]
    train_dataset = CommonLitDataset(train_df.loc[train_indices], tokenizer)    
    val_dataset = CommonLitDataset(train_df.loc[val_indices], tokenizer)
    
    train_loader = DataLoader(train_dataset, batch_size=cfg.BATCH_SIZE,
                              drop_last=False, shuffle=True, num_workers=cfg.NUM_WORKERS)    
    val_loader = DataLoader(val_dataset, batch_size=cfg.BATCH_SIZE,
                            drop_last=False, shuffle=False, num_workers=cfg.NUM_WORKERS)
    
    set_random_seed(cfg.SEED + fold)
    
    model = CommonLitModel().to(cfg.DEVICE)
    
    optimizer = create_optimizer(model, base_lr=base_lr, last_lr=last_lr)
    
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_training_steps=cfg.NUM_EPOCHS * len(train_loader), 
                                                num_warmup_steps=50)
    
    trainer = Trainer(model, model_path, train_loader, val_loader, optimizer, scheduler = scheduler)
    rmse_val = trainer.train()
    
    return rmse_val

In [37]:
%%time

for i in range(3, len(list(splits))):
    fold = i
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20)
    print(" Best value: ", study.best_trial.value)
    print(" Best params: ")
    for key, value in study.best_trial.params.items():
        print(f"    {key}: {value}")

[32m[I 2021-07-11 07:59:01,593][0m A new study created in memory with name: no-name-4db231ff-e2d5-4192-ba8c-b26885c3282e[0m


##### Using fold 3


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.109 New best_val_rmse: 1.109

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.18 Still best_val_rmse: 1.109 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7689 New best_val_rmse: 0.7689

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8934 Still best_val_rmse: 0.7689 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7836 Still best_val_rmse: 0.7689 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5792 New best_val_rmse: 0.5792

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5853 Still best_val_rmse: 0.5792 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7048 Still best_val_rmse: 0.5792 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5701 New best_val_rmse: 0.5701

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5429 New

[32m[I 2021-07-11 08:10:47,041][0m Trial 0 finished with value: 0.48740705847740173 and parameters: {'base_lr': 3.226406820160473e-05, 'last_lr': 0.00238164178709544}. Best is trial 0 with value: 0.48740705847740173.[0m



##### Using fold 3


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.036 New best_val_rmse: 1.036

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9773 New best_val_rmse: 0.9773

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7996 New best_val_rmse: 0.7996

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6484 New best_val_rmse: 0.6484

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7931 Still best_val_rmse: 0.6484 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6312 New best_val_rmse: 0.6312

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6435 Still best_val_rmse: 0.6312 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7457 Still best_val_rmse: 0.6312 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.716 Still best_val_rmse: 0.6312 (from epoch 0)


[32m[I 2021-07-11 08:14:19,526][0m Trial 1 finished with value: 0.6312096118927002 and parameters: {'base_lr': 5.0650380156407335e-05, 'last_lr': 0.0019169971339642756}. Best is trial 0 with value: 0.48740705847740173.[0m



##### Using fold 3


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.115 New best_val_rmse: 1.115

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.024 New best_val_rmse: 1.024

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9418 New best_val_rmse: 0.9418

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7 New best_val_rmse: 0.7

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6755 New best_val_rmse: 0.6755

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5956 New best_val_rmse: 0.5956

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5757 New best_val_rmse: 0.5757

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7491 Still best_val_rmse: 0.5757 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6896 Still best_val_rmse: 0.5757 (from epoch 0)

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5557 New best_val_rmse: 0.5557

16 steps took 12.1 seconds
Epoch

[32m[I 2021-07-11 08:25:12,253][0m Trial 2 finished with value: 0.4955609440803528 and parameters: {'base_lr': 1.623896394027085e-05, 'last_lr': 0.0001593112483093467}. Best is trial 0 with value: 0.48740705847740173.[0m



##### Using fold 3


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.161 New best_val_rmse: 1.161

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.324 Still best_val_rmse: 1.161 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.142 New best_val_rmse: 1.142

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.047 New best_val_rmse: 1.047

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.057 Still best_val_rmse: 1.047 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.044 New best_val_rmse: 1.044

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.067 Still best_val_rmse: 1.044 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.064 Still best_val_rmse: 1.044 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.045 Still best_val_rmse: 1.044 (from epoch 0)


[32m[I 2021-07-11 08:28:42,807][0m Trial 3 finished with value: 1.0437140464782715 and parameters: {'base_lr': 7.707379473012052e-05, 'last_lr': 0.0018886627125349312}. Best is trial 0 with value: 0.48740705847740173.[0m



##### Using fold 3


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.097 New best_val_rmse: 1.097

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.01 New best_val_rmse: 1.01

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9927 New best_val_rmse: 0.9927

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.069 Still best_val_rmse: 0.9927 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.088 Still best_val_rmse: 0.9927 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.042 Still best_val_rmse: 0.9927 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.22 Still best_val_rmse: 0.9927 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.08 Still best_val_rmse: 0.9927 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.045 Still best_val_rmse: 0.9927 (from epoch 0)


[32m[I 2021-07-11 08:32:09,435][0m Trial 4 finished with value: 0.9926589727401733 and parameters: {'base_lr': 0.00030608464251860324, 'last_lr': 0.004258430458938841}. Best is trial 0 with value: 0.48740705847740173.[0m



##### Using fold 3


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.03 New best_val_rmse: 1.03

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.156 Still best_val_rmse: 1.03 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.042 Still best_val_rmse: 1.03 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.042 Still best_val_rmse: 1.03 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.054 Still best_val_rmse: 1.03 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.04 Still best_val_rmse: 1.03 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.37 Still best_val_rmse: 1.03 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.506 Still best_val_rmse: 1.03 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.221 Still best_val_rmse: 1.03 (from epoch 0)


[32m[I 2021-07-11 08:35:29,509][0m Trial 5 finished with value: 1.0304316282272339 and parameters: {'base_lr': 0.0003219582161600415, 'last_lr': 0.00014373934795551644}. Best is trial 0 with value: 0.48740705847740173.[0m



##### Using fold 3


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.068 New best_val_rmse: 1.068

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9885 New best_val_rmse: 0.9885

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.046 Still best_val_rmse: 0.9885 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7773 New best_val_rmse: 0.7773

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7118 New best_val_rmse: 0.7118

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6614 New best_val_rmse: 0.6614

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.637 New best_val_rmse: 0.637

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6309 New best_val_rmse: 0.6309

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6243 New best_val_rmse: 0.6243


[32m[I 2021-07-11 08:39:11,429][0m Trial 6 finished with value: 0.6242756843566895 and parameters: {'base_lr': 1.08227367059183e-05, 'last_lr': 9.576502655421232e-05}. Best is trial 0 with value: 0.48740705847740173.[0m



##### Using fold 3


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.05 New best_val_rmse: 1.05

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.001 New best_val_rmse: 1.001

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8875 New best_val_rmse: 0.8875

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7849 New best_val_rmse: 0.7849

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6712 New best_val_rmse: 0.6712

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6308 New best_val_rmse: 0.6308

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5946 New best_val_rmse: 0.5946

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5941 New best_val_rmse: 0.5941

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5884 New best_val_rmse: 0.5884

16 steps took 12.3 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5751 New best_val_rmse: 0.5751

16 steps took 12.1 seconds
Epoch: 1 batch_num: 28 val_rmse: 0.

[32m[I 2021-07-11 08:49:33,953][0m Trial 7 finished with value: 0.5185472965240479 and parameters: {'base_lr': 9.478088635051655e-06, 'last_lr': 8.52154544554797e-05}. Best is trial 0 with value: 0.48740705847740173.[0m



##### Using fold 3


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.043 New best_val_rmse: 1.043

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8402 New best_val_rmse: 0.8402

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9243 Still best_val_rmse: 0.8402 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8568 Still best_val_rmse: 0.8402 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.072 Still best_val_rmse: 0.8402 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.043 Still best_val_rmse: 0.8402 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.053 Still best_val_rmse: 0.8402 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.065 Still best_val_rmse: 0.8402 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.043 Still best_val_rmse: 0.8402 (from epoch 0)


[32m[I 2021-07-11 08:52:57,840][0m Trial 8 finished with value: 0.8401888012886047 and parameters: {'base_lr': 6.498141366625214e-05, 'last_lr': 0.00026174059012865964}. Best is trial 0 with value: 0.48740705847740173.[0m



##### Using fold 3


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.076 New best_val_rmse: 1.076

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9979 New best_val_rmse: 0.9979

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7987 New best_val_rmse: 0.7987

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8887 Still best_val_rmse: 0.7987 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.811 Still best_val_rmse: 0.7987 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.8357 Still best_val_rmse: 0.7987 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6789 New best_val_rmse: 0.6789

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.8297 Still best_val_rmse: 0.6789 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.7766 Still best_val_rmse: 0.6789 (from epoch 0)


[32m[I 2021-07-11 08:56:26,927][0m Trial 9 finished with value: 0.6789323091506958 and parameters: {'base_lr': 1.57188580049819e-05, 'last_lr': 0.0007715723221240116}. Best is trial 0 with value: 0.48740705847740173.[0m



##### Using fold 3


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.111 New best_val_rmse: 1.111

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.024 New best_val_rmse: 1.024

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8389 New best_val_rmse: 0.8389

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7056 New best_val_rmse: 0.7056

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6996 New best_val_rmse: 0.6996

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7152 Still best_val_rmse: 0.6996 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6173 New best_val_rmse: 0.6173

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.9434 Still best_val_rmse: 0.6173 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6715 Still best_val_rmse: 0.6173 (from epoch 0)


[32m[I 2021-07-11 09:00:02,829][0m Trial 10 finished with value: 0.6172806024551392 and parameters: {'base_lr': 3.5645110422610476e-05, 'last_lr': 0.004134932864629202}. Best is trial 0 with value: 0.48740705847740173.[0m



##### Using fold 3


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.033 New best_val_rmse: 1.033

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8281 New best_val_rmse: 0.8281

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8355 Still best_val_rmse: 0.8281 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7785 New best_val_rmse: 0.7785

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7484 New best_val_rmse: 0.7484

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7097 New best_val_rmse: 0.7097

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6257 New best_val_rmse: 0.6257

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5785 New best_val_rmse: 0.5785

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5826 Still best_val_rmse: 0.5785 (from epoch 0)

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5524 New best_val_rmse: 0.5524

16 steps took 12.1 secon

[32m[I 2021-07-11 09:10:07,809][0m Trial 11 finished with value: 0.5177271366119385 and parameters: {'base_lr': 2.4570003964377455e-05, 'last_lr': 0.0004139454666719998}. Best is trial 0 with value: 0.48740705847740173.[0m



##### Using fold 3


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9728 New best_val_rmse: 0.9728

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.242 Still best_val_rmse: 0.9728 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.146 Still best_val_rmse: 0.9728 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8581 New best_val_rmse: 0.8581

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.258 Still best_val_rmse: 0.8581 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.059 Still best_val_rmse: 0.8581 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.07 Still best_val_rmse: 0.8581 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.09 Still best_val_rmse: 0.8581 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.041 Still best_val_rmse: 0.8581 (from epoch 0)


[32m[I 2021-07-11 09:13:32,750][0m Trial 12 finished with value: 0.8580694794654846 and parameters: {'base_lr': 0.00012798657998965728, 'last_lr': 0.0014394313833508198}. Best is trial 0 with value: 0.48740705847740173.[0m



##### Using fold 3


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.026 New best_val_rmse: 1.026

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9683 New best_val_rmse: 0.9683

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8661 New best_val_rmse: 0.8661

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8524 New best_val_rmse: 0.8524

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7854 New best_val_rmse: 0.7854

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6047 New best_val_rmse: 0.6047

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5764 New best_val_rmse: 0.5764

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7898 Still best_val_rmse: 0.5764 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.638 Still best_val_rmse: 0.5764 (from epoch 0)

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5466 New best_val_rmse: 0.5466

16 steps took 12.1 second

[32m[I 2021-07-11 09:24:22,976][0m Trial 13 finished with value: 0.4916834533214569 and parameters: {'base_lr': 2.295995320560339e-05, 'last_lr': 0.0002251081411140287}. Best is trial 0 with value: 0.48740705847740173.[0m



##### Using fold 3


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.068 New best_val_rmse: 1.068

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.084 Still best_val_rmse: 1.068 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7638 New best_val_rmse: 0.7638

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.757 New best_val_rmse: 0.757

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6969 New best_val_rmse: 0.6969

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5682 New best_val_rmse: 0.5682

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5828 Still best_val_rmse: 0.5682 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.9831 Still best_val_rmse: 0.5682 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.7164 Still best_val_rmse: 0.5682 (from epoch 0)

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.6088 Still best_val_rmse: 

[32m[I 2021-07-11 09:34:52,290][0m Trial 14 finished with value: 0.4968758821487427 and parameters: {'base_lr': 2.776364796099378e-05, 'last_lr': 0.000769372229016373}. Best is trial 0 with value: 0.48740705847740173.[0m



##### Using fold 3


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.096 New best_val_rmse: 1.096

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9935 New best_val_rmse: 0.9935

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7512 New best_val_rmse: 0.7512

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8505 Still best_val_rmse: 0.7512 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6723 New best_val_rmse: 0.6723

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6027 New best_val_rmse: 0.6027

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6637 Still best_val_rmse: 0.6027 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5659 New best_val_rmse: 0.5659

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5796 Still best_val_rmse: 0.5659 (from epoch 0)

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5524 New best_val_rmse: 0.5524

16 step

[32m[I 2021-07-11 09:45:38,190][0m Trial 15 finished with value: 0.4986198842525482 and parameters: {'base_lr': 1.719833259096913e-05, 'last_lr': 0.0002900707863882122}. Best is trial 0 with value: 0.48740705847740173.[0m



##### Using fold 3


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.005 New best_val_rmse: 1.005

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8029 New best_val_rmse: 0.8029

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8281 Still best_val_rmse: 0.8029 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.05 Still best_val_rmse: 0.8029 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.779 New best_val_rmse: 0.779

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.053 Still best_val_rmse: 0.779 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.117 Still best_val_rmse: 0.779 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.042 Still best_val_rmse: 0.779 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.042 Still best_val_rmse: 0.779 (from epoch 0)


[32m[I 2021-07-11 09:49:03,772][0m Trial 16 finished with value: 0.7789701223373413 and parameters: {'base_lr': 0.00011464411208561552, 'last_lr': 0.0004463445687831131}. Best is trial 0 with value: 0.48740705847740173.[0m



##### Using fold 3


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.124 New best_val_rmse: 1.124

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9539 New best_val_rmse: 0.9539

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6714 New best_val_rmse: 0.6714

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8184 Still best_val_rmse: 0.6714 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.8913 Still best_val_rmse: 0.6714 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6674 New best_val_rmse: 0.6674

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5903 New best_val_rmse: 0.5903

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7801 Still best_val_rmse: 0.5903 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6928 Still best_val_rmse: 0.5903 (from epoch 0)

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5463 New best_val_rmse

[32m[I 2021-07-11 10:00:34,006][0m Trial 17 finished with value: 0.4896315336227417 and parameters: {'base_lr': 4.259134139400092e-05, 'last_lr': 0.0013604223988754553}. Best is trial 0 with value: 0.48740705847740173.[0m



##### Using fold 3


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.04 New best_val_rmse: 1.04

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.192 Still best_val_rmse: 1.04 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.213 Still best_val_rmse: 1.04 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.098 Still best_val_rmse: 1.04 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.041 Still best_val_rmse: 1.04 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.053 Still best_val_rmse: 1.04 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.064 Still best_val_rmse: 1.04 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.052 Still best_val_rmse: 1.04 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.047 Still best_val_rmse: 1.04 (from epoch 0)


[32m[I 2021-07-11 10:03:54,678][0m Trial 18 finished with value: 1.0395355224609375 and parameters: {'base_lr': 4.28125697100149e-05, 'last_lr': 0.0029387836901945542}. Best is trial 0 with value: 0.48740705847740173.[0m



##### Using fold 3


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.025 New best_val_rmse: 1.025

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9857 New best_val_rmse: 0.9857

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9392 New best_val_rmse: 0.9392

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7839 New best_val_rmse: 0.7839

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.009 Still best_val_rmse: 0.7839 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7913 Still best_val_rmse: 0.7839 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.917 Still best_val_rmse: 0.7839 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.046 Still best_val_rmse: 0.7839 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.061 Still best_val_rmse: 0.7839 (from epoch 0)


[32m[I 2021-07-11 10:07:23,234][0m Trial 19 finished with value: 0.7838825583457947 and parameters: {'base_lr': 0.00018275096677978996, 'last_lr': 0.0011428449261499623}. Best is trial 0 with value: 0.48740705847740173.[0m
[32m[I 2021-07-11 10:07:23,236][0m A new study created in memory with name: no-name-f9c2b496-99c1-4f50-bd23-62528ffe97a9[0m



 Best value:  0.48740705847740173
 Best params: 
    base_lr: 3.226406820160473e-05
    last_lr: 0.00238164178709544
##### Using fold 4


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.3 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8239 New best_val_rmse: 0.8239

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9845 Still best_val_rmse: 0.8239 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9298 Still best_val_rmse: 0.8239 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7159 New best_val_rmse: 0.7159

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6701 New best_val_rmse: 0.6701

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6099 New best_val_rmse: 0.6099

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5945 New best_val_rmse: 0.5945

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6063 Still best_val_rmse: 0.5945 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5626 New best_val_rmse: 0.5626

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5837 Still best_val_rmse: 0.5626 (fro

[32m[I 2021-07-11 10:18:26,749][0m Trial 0 finished with value: 0.49270474910736084 and parameters: {'base_lr': 5.500307030273427e-05, 'last_lr': 0.0006403732314030953}. Best is trial 0 with value: 0.49270474910736084.[0m



##### Using fold 4


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.079 New best_val_rmse: 1.079

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.081 Still best_val_rmse: 1.079 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.244 Still best_val_rmse: 1.079 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.04 New best_val_rmse: 1.04

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.044 Still best_val_rmse: 1.04 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.052 Still best_val_rmse: 1.04 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.042 Still best_val_rmse: 1.04 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.078 Still best_val_rmse: 1.04 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.075 Still best_val_rmse: 1.04 (from epoch 0)


[32m[I 2021-07-11 10:21:49,816][0m Trial 1 finished with value: 1.0399631261825562 and parameters: {'base_lr': 0.00037545113828094123, 'last_lr': 0.0017743638378241453}. Best is trial 0 with value: 0.49270474910736084.[0m



##### Using fold 4


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8186 New best_val_rmse: 0.8186

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.114 Still best_val_rmse: 0.8186 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8966 Still best_val_rmse: 0.8186 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7289 New best_val_rmse: 0.7289

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6352 New best_val_rmse: 0.6352

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6055 New best_val_rmse: 0.6055

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5705 New best_val_rmse: 0.5705

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6351 Still best_val_rmse: 0.5705 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6146 Still best_val_rmse: 0.5705 (from epoch 0)

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5917 Still best_val_r

[32m[I 2021-07-11 10:31:52,420][0m Trial 2 finished with value: 0.5010343194007874 and parameters: {'base_lr': 5.761810365833942e-05, 'last_lr': 0.00012441583144915167}. Best is trial 0 with value: 0.49270474910736084.[0m



##### Using fold 4


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.068 New best_val_rmse: 1.068

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9494 New best_val_rmse: 0.9494

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7069 New best_val_rmse: 0.7069

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6596 New best_val_rmse: 0.6596

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6102 New best_val_rmse: 0.6102

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6099 New best_val_rmse: 0.6099

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6124 Still best_val_rmse: 0.6099 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5936 New best_val_rmse: 0.5936

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6484 Still best_val_rmse: 0.5936 (from epoch 0)

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5443 New best_val_rmse: 0.5443

16 steps took 12.1 secon

[32m[I 2021-07-11 10:42:11,637][0m Trial 3 finished with value: 0.511554479598999 and parameters: {'base_lr': 1.2842135751689883e-05, 'last_lr': 0.0004185856077827944}. Best is trial 0 with value: 0.49270474910736084.[0m



##### Using fold 4


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.096 New best_val_rmse: 1.096

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8626 New best_val_rmse: 0.8626

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7915 New best_val_rmse: 0.7915

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7702 New best_val_rmse: 0.7702

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.8292 Still best_val_rmse: 0.7702 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6682 New best_val_rmse: 0.6682

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6425 New best_val_rmse: 0.6425

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.584 New best_val_rmse: 0.584

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5571 New best_val_rmse: 0.5571

16 steps took 12.3 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5461 New best_val_rmse: 0.5461

16 steps took 12.1 seconds
Epoch: 1 batch_n

[32m[I 2021-07-11 10:52:25,476][0m Trial 4 finished with value: 0.5101152062416077 and parameters: {'base_lr': 1.6820031392249838e-05, 'last_lr': 0.0016550548055579862}. Best is trial 0 with value: 0.49270474910736084.[0m



##### Using fold 4


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9308 New best_val_rmse: 0.9308

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.094 Still best_val_rmse: 0.9308 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.112 Still best_val_rmse: 0.9308 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.089 Still best_val_rmse: 0.9308 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.043 Still best_val_rmse: 0.9308 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.045 Still best_val_rmse: 0.9308 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.133 Still best_val_rmse: 0.9308 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.044 Still best_val_rmse: 0.9308 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.083 Still best_val_rmse: 0.9308 (from epoch 0)


[32m[I 2021-07-11 10:55:44,655][0m Trial 5 finished with value: 0.930773913860321 and parameters: {'base_lr': 0.00034292286321233083, 'last_lr': 0.00017722907591614917}. Best is trial 0 with value: 0.49270474910736084.[0m



##### Using fold 4


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8296 New best_val_rmse: 0.8296

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.157 Still best_val_rmse: 0.8296 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9546 Still best_val_rmse: 0.8296 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7351 New best_val_rmse: 0.7351

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6773 New best_val_rmse: 0.6773

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6497 New best_val_rmse: 0.6497

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6521 Still best_val_rmse: 0.6497 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6495 New best_val_rmse: 0.6495

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6547 Still best_val_rmse: 0.6495 (from epoch 0)


[32m[I 2021-07-11 10:59:16,769][0m Trial 6 finished with value: 0.6494649648666382 and parameters: {'base_lr': 7.973970970226153e-05, 'last_lr': 0.002755802330682124}. Best is trial 0 with value: 0.49270474910736084.[0m



##### Using fold 4


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.3 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.12 New best_val_rmse: 1.12

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.24 Still best_val_rmse: 1.12 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.114 New best_val_rmse: 1.114

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.167 Still best_val_rmse: 1.114 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.186 Still best_val_rmse: 1.114 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.057 New best_val_rmse: 1.057

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.203 Still best_val_rmse: 1.057 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.107 Still best_val_rmse: 1.057 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.055 New best_val_rmse: 1.055


[32m[I 2021-07-11 11:02:45,812][0m Trial 7 finished with value: 1.0554085969924927 and parameters: {'base_lr': 0.0004995055247502465, 'last_lr': 0.0002850608072624741}. Best is trial 0 with value: 0.49270474910736084.[0m



##### Using fold 4


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.063 New best_val_rmse: 1.063

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.949 New best_val_rmse: 0.949

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7095 New best_val_rmse: 0.7095

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6593 New best_val_rmse: 0.6593

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6113 New best_val_rmse: 0.6113

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6032 New best_val_rmse: 0.6032

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5948 New best_val_rmse: 0.5948

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5788 New best_val_rmse: 0.5788

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6355 Still best_val_rmse: 0.5788 (from epoch 0)

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5397 New best_val_rmse: 0.5397

16 steps took 12.1 seconds
Epoch: 1 batch_n

[32m[I 2021-07-11 11:13:00,098][0m Trial 8 finished with value: 0.5056313276290894 and parameters: {'base_lr': 1.3787414141031954e-05, 'last_lr': 0.00024373871452270445}. Best is trial 0 with value: 0.49270474910736084.[0m



##### Using fold 4


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.843 New best_val_rmse: 0.843

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.059 Still best_val_rmse: 0.843 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.336 Still best_val_rmse: 0.843 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.018 Still best_val_rmse: 0.843 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.103 Still best_val_rmse: 0.843 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.046 Still best_val_rmse: 0.843 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.04 Still best_val_rmse: 0.843 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.049 Still best_val_rmse: 0.843 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.082 Still best_val_rmse: 0.843 (from epoch 0)


[32m[I 2021-07-11 11:16:22,233][0m Trial 9 finished with value: 0.8430390954017639 and parameters: {'base_lr': 8.85205212608486e-05, 'last_lr': 0.0022779066126276696}. Best is trial 0 with value: 0.49270474910736084.[0m



##### Using fold 4


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9478 New best_val_rmse: 0.9478

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6872 New best_val_rmse: 0.6872

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.005 Still best_val_rmse: 0.6872 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8061 Still best_val_rmse: 0.6872 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.745 Still best_val_rmse: 0.6872 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6326 New best_val_rmse: 0.6326

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5962 New best_val_rmse: 0.5962

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6004 Still best_val_rmse: 0.5962 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5491 New best_val_rmse: 0.5491

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5477 New best_val_rmse

[32m[I 2021-07-11 11:26:26,851][0m Trial 10 finished with value: 0.5003262162208557 and parameters: {'base_lr': 3.311575696025064e-05, 'last_lr': 0.0008638794575571843}. Best is trial 0 with value: 0.49270474910736084.[0m



##### Using fold 4


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9465 New best_val_rmse: 0.9465

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6969 New best_val_rmse: 0.6969

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9743 Still best_val_rmse: 0.6969 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7749 Still best_val_rmse: 0.6969 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7621 Still best_val_rmse: 0.6969 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6622 New best_val_rmse: 0.6622

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5886 New best_val_rmse: 0.5886

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5707 New best_val_rmse: 0.5707

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6114 Still best_val_rmse: 0.5707 (from epoch 0)

16 steps took 12.3 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.6214 Still best_val_

[32m[I 2021-07-11 11:36:24,394][0m Trial 11 finished with value: 0.5037858486175537 and parameters: {'base_lr': 3.3539706120753366e-05, 'last_lr': 0.0008654608420454564}. Best is trial 0 with value: 0.49270474910736084.[0m



##### Using fold 4


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.951 New best_val_rmse: 0.951

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6782 New best_val_rmse: 0.6782

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.035 Still best_val_rmse: 0.6782 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7676 Still best_val_rmse: 0.6782 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7518 Still best_val_rmse: 0.6782 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6737 New best_val_rmse: 0.6737

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5781 New best_val_rmse: 0.5781

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6054 Still best_val_rmse: 0.5781 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5479 New best_val_rmse: 0.5479

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.557 Still best_val_rmse

[32m[I 2021-07-11 11:46:36,475][0m Trial 12 finished with value: 0.5003801584243774 and parameters: {'base_lr': 3.2523515760001636e-05, 'last_lr': 0.0008465770027184623}. Best is trial 0 with value: 0.49270474910736084.[0m



##### Using fold 4


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.346 New best_val_rmse: 1.346

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7417 New best_val_rmse: 0.7417

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9716 Still best_val_rmse: 0.7417 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.9958 Still best_val_rmse: 0.7417 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7809 Still best_val_rmse: 0.7417 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.8567 Still best_val_rmse: 0.7417 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.042 Still best_val_rmse: 0.7417 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.103 Still best_val_rmse: 0.7417 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.069 Still best_val_rmse: 0.7417 (from epoch 0)


[32m[I 2021-07-11 11:50:02,623][0m Trial 13 finished with value: 0.7416815757751465 and parameters: {'base_lr': 0.00014929685467720974, 'last_lr': 0.0005998880572453282}. Best is trial 0 with value: 0.49270474910736084.[0m



##### Using fold 4


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9692 New best_val_rmse: 0.9692

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7107 New best_val_rmse: 0.7107

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.043 Still best_val_rmse: 0.7107 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7533 Still best_val_rmse: 0.7107 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6496 New best_val_rmse: 0.6496

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6896 Still best_val_rmse: 0.6496 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5855 New best_val_rmse: 0.5855

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5758 New best_val_rmse: 0.5758

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5971 Still best_val_rmse: 0.5758 (from epoch 0)

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5539 New best_val_rms

[32m[I 2021-07-11 12:00:15,356][0m Trial 14 finished with value: 0.5027363300323486 and parameters: {'base_lr': 2.7217394418855534e-05, 'last_lr': 0.0010528696023137842}. Best is trial 0 with value: 0.49270474910736084.[0m



##### Using fold 4


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8647 New best_val_rmse: 0.8647

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9205 Still best_val_rmse: 0.8647 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8855 Still best_val_rmse: 0.8647 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.041 Still best_val_rmse: 0.8647 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.038 Still best_val_rmse: 0.8647 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.039 Still best_val_rmse: 0.8647 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.04 Still best_val_rmse: 0.8647 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.072 Still best_val_rmse: 0.8647 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.081 Still best_val_rmse: 0.8647 (from epoch 0)


[32m[I 2021-07-11 12:03:36,127][0m Trial 15 finished with value: 0.8646877408027649 and parameters: {'base_lr': 0.00017542193606878774, 'last_lr': 0.004947415715299062}. Best is trial 0 with value: 0.49270474910736084.[0m



##### Using fold 4


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9465 New best_val_rmse: 0.9465

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.186 Still best_val_rmse: 0.9465 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8792 New best_val_rmse: 0.8792

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7415 New best_val_rmse: 0.7415

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6902 New best_val_rmse: 0.6902

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6502 New best_val_rmse: 0.6502

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6027 New best_val_rmse: 0.6027

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6291 Still best_val_rmse: 0.6027 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5905 New best_val_rmse: 0.5905

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.541 New best_val_rmse: 0.541

16 steps took 12.1 second

[32m[I 2021-07-11 12:15:00,773][0m Trial 16 finished with value: 0.49172893166542053 and parameters: {'base_lr': 4.25982124671479e-05, 'last_lr': 0.00046783085975998843}. Best is trial 16 with value: 0.49172893166542053.[0m



##### Using fold 4


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.017 New best_val_rmse: 1.017

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.036 Still best_val_rmse: 1.017 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8625 New best_val_rmse: 0.8625

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6924 New best_val_rmse: 0.6924

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.669 New best_val_rmse: 0.669

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6589 New best_val_rmse: 0.6589

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6222 New best_val_rmse: 0.6222

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6085 New best_val_rmse: 0.6085

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5715 New best_val_rmse: 0.5715

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5769 Still best_val_rmse: 0.5715 (from epoch 0)

16 steps took 12.1 seconds
E

[32m[I 2021-07-11 12:25:13,529][0m Trial 17 finished with value: 0.535816490650177 and parameters: {'base_lr': 8.090440111990233e-06, 'last_lr': 8.280069297700935e-05}. Best is trial 16 with value: 0.49172893166542053.[0m



##### Using fold 4


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8258 New best_val_rmse: 0.8258

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8926 Still best_val_rmse: 0.8258 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.792 New best_val_rmse: 0.792

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8385 Still best_val_rmse: 0.792 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7655 New best_val_rmse: 0.7655

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6902 New best_val_rmse: 0.6902

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6116 New best_val_rmse: 0.6116

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6033 New best_val_rmse: 0.6033

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6036 Still best_val_rmse: 0.6033 (from epoch 0)


[32m[I 2021-07-11 12:28:51,289][0m Trial 18 finished with value: 0.6033336520195007 and parameters: {'base_lr': 6.195357929866518e-05, 'last_lr': 0.00045241704354138495}. Best is trial 16 with value: 0.49172893166542053.[0m



##### Using fold 4


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.535 New best_val_rmse: 1.535

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8699 New best_val_rmse: 0.8699

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7074 New best_val_rmse: 0.7074

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.797 Still best_val_rmse: 0.7074 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6375 New best_val_rmse: 0.6375

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.61 New best_val_rmse: 0.61

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6601 Still best_val_rmse: 0.61 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6411 Still best_val_rmse: 0.61 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6286 Still best_val_rmse: 0.61 (from epoch 0)


[32m[I 2021-07-11 12:32:24,289][0m Trial 19 finished with value: 0.6099985241889954 and parameters: {'base_lr': 0.0001337250646422305, 'last_lr': 0.0003298751720339185}. Best is trial 16 with value: 0.49172893166542053.[0m
[32m[I 2021-07-11 12:32:24,291][0m A new study created in memory with name: no-name-86dd7c39-ba3d-44c0-a9a2-265eba2df1df[0m



 Best value:  0.49172893166542053
 Best params: 
    base_lr: 4.25982124671479e-05
    last_lr: 0.00046783085975998843
##### Using fold 5


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9253 New best_val_rmse: 0.9253

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7431 New best_val_rmse: 0.7431

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.664 New best_val_rmse: 0.664

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7751 Still best_val_rmse: 0.664 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7152 Still best_val_rmse: 0.664 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6315 New best_val_rmse: 0.6315

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6378 Still best_val_rmse: 0.6315 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.585 New best_val_rmse: 0.585

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5612 New best_val_rmse: 0.5612

16 steps took 12.3 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5238 New best_val_rmse: 0.5238

16 steps to

[32m[I 2021-07-11 12:45:52,782][0m Trial 0 finished with value: 0.4810352027416229 and parameters: {'base_lr': 2.517671262528407e-05, 'last_lr': 0.00406777602451577}. Best is trial 0 with value: 0.4810352027416229.[0m



##### Using fold 5


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9661 New best_val_rmse: 0.9661

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.75 New best_val_rmse: 0.75

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6913 New best_val_rmse: 0.6913

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6711 New best_val_rmse: 0.6711

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6686 New best_val_rmse: 0.6686

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5502 New best_val_rmse: 0.5502

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5968 Still best_val_rmse: 0.5502 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.8112 Still best_val_rmse: 0.5502 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6082 Still best_val_rmse: 0.5502 (from epoch 0)

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 1.043 Still best_val_rmse: 0.5502 (from epo

[32m[I 2021-07-11 12:55:31,967][0m Trial 1 finished with value: 0.5501596927642822 and parameters: {'base_lr': 8.375302263422773e-05, 'last_lr': 0.0004311746355821461}. Best is trial 0 with value: 0.4810352027416229.[0m



##### Using fold 5


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.194 New best_val_rmse: 1.194

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.085 New best_val_rmse: 1.085

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6777 New best_val_rmse: 0.6777

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7128 Still best_val_rmse: 0.6777 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5913 New best_val_rmse: 0.5913

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5536 New best_val_rmse: 0.5536

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7023 Still best_val_rmse: 0.5536 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5971 Still best_val_rmse: 0.5536 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.556 Still best_val_rmse: 0.5536 (from epoch 0)

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5467 New best_val_rmse: 0

[32m[I 2021-07-11 13:08:16,576][0m Trial 2 finished with value: 0.4815599322319031 and parameters: {'base_lr': 1.8493681713701894e-05, 'last_lr': 0.0030814367033004386}. Best is trial 0 with value: 0.4810352027416229.[0m



##### Using fold 5


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9368 New best_val_rmse: 0.9368

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9012 New best_val_rmse: 0.9012

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.011 Still best_val_rmse: 0.9012 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7637 New best_val_rmse: 0.7637

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7309 New best_val_rmse: 0.7309

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6732 New best_val_rmse: 0.6732

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5866 New best_val_rmse: 0.5866

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6044 Still best_val_rmse: 0.5866 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5796 New best_val_rmse: 0.5796

16 steps took 12.3 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.599 Still best_val_rmse: 0.5796 (from epoch 0)

16 step

[32m[I 2021-07-11 13:18:47,333][0m Trial 3 finished with value: 0.498441219329834 and parameters: {'base_lr': 8.941843080851954e-05, 'last_lr': 0.001837188858801888}. Best is trial 0 with value: 0.4810352027416229.[0m



##### Using fold 5


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.3 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.013 New best_val_rmse: 1.013

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9741 New best_val_rmse: 0.9741

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.105 Still best_val_rmse: 0.9741 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.025 Still best_val_rmse: 0.9741 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.038 Still best_val_rmse: 0.9741 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.029 Still best_val_rmse: 0.9741 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.02 Still best_val_rmse: 0.9741 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.079 Still best_val_rmse: 0.9741 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.061 Still best_val_rmse: 0.9741 (from epoch 0)


[32m[I 2021-07-11 13:22:10,424][0m Trial 4 finished with value: 0.9740661978721619 and parameters: {'base_lr': 0.00017781014466275693, 'last_lr': 0.003533750914273647}. Best is trial 0 with value: 0.4810352027416229.[0m



##### Using fold 5


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.022 New best_val_rmse: 1.022

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.768 New best_val_rmse: 0.768

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7392 New best_val_rmse: 0.7392

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6949 New best_val_rmse: 0.6949

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6451 New best_val_rmse: 0.6451

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5749 New best_val_rmse: 0.5749

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5767 Still best_val_rmse: 0.5749 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5662 New best_val_rmse: 0.5662

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6244 Still best_val_rmse: 0.5662 (from epoch 0)

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5057 New best_val_rmse: 0.5057

16 steps took 12.1 seconds

[32m[I 2021-07-11 13:37:27,736][0m Trial 5 finished with value: 0.47618797421455383 and parameters: {'base_lr': 3.1211012676571296e-05, 'last_lr': 0.003503681146690041}. Best is trial 5 with value: 0.47618797421455383.[0m



##### Using fold 5


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.203 New best_val_rmse: 1.203

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.13 New best_val_rmse: 1.13

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.688 New best_val_rmse: 0.688

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6974 Still best_val_rmse: 0.688 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6124 New best_val_rmse: 0.6124

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5664 New best_val_rmse: 0.5664

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5949 Still best_val_rmse: 0.5664 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5515 New best_val_rmse: 0.5515

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.59 Still best_val_rmse: 0.5515 (from epoch 0)

16 steps took 12.5 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5198 New best_val_rmse: 0.5198

16 steps took 12

[32m[I 2021-07-11 13:50:53,572][0m Trial 6 finished with value: 0.47968539595603943 and parameters: {'base_lr': 1.819371369732658e-05, 'last_lr': 0.002281520488029198}. Best is trial 5 with value: 0.47618797421455383.[0m



##### Using fold 5


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.076 New best_val_rmse: 1.076

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.066 New best_val_rmse: 1.066

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7071 New best_val_rmse: 0.7071

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6459 New best_val_rmse: 0.6459

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6011 New best_val_rmse: 0.6011

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5793 New best_val_rmse: 0.5793

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.565 New best_val_rmse: 0.565

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6191 Still best_val_rmse: 0.565 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6155 Still best_val_rmse: 0.565 (from epoch 0)

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5375 New best_val_rmse: 0.5375

16 steps took 12.1 seconds
Epo

[32m[I 2021-07-11 14:04:13,659][0m Trial 7 finished with value: 0.4810379445552826 and parameters: {'base_lr': 2.2975827471191227e-05, 'last_lr': 0.0004391967966467416}. Best is trial 5 with value: 0.47618797421455383.[0m



##### Using fold 5


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.3 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.985 New best_val_rmse: 0.985

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7299 New best_val_rmse: 0.7299

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9189 Still best_val_rmse: 0.7299 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7402 Still best_val_rmse: 0.7299 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.9065 Still best_val_rmse: 0.7299 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.672 New best_val_rmse: 0.672

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7443 Still best_val_rmse: 0.672 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.8526 Still best_val_rmse: 0.672 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.03 Still best_val_rmse: 0.672 (from epoch 0)


[32m[I 2021-07-11 14:07:41,638][0m Trial 8 finished with value: 0.672038197517395 and parameters: {'base_lr': 0.00016163527182352886, 'last_lr': 0.0011072832131516972}. Best is trial 5 with value: 0.47618797421455383.[0m



##### Using fold 5


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.011 New best_val_rmse: 1.011

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7488 New best_val_rmse: 0.7488

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6342 New best_val_rmse: 0.6342

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8114 Still best_val_rmse: 0.6342 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6133 New best_val_rmse: 0.6133

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6256 Still best_val_rmse: 0.6133 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6311 Still best_val_rmse: 0.6133 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6326 Still best_val_rmse: 0.6133 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5668 New best_val_rmse: 0.5668

16 steps took 12.3 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5071 New best_val_rmse

[32m[I 2021-07-11 14:20:21,598][0m Trial 9 finished with value: 0.48475295305252075 and parameters: {'base_lr': 4.988824996116573e-05, 'last_lr': 0.0001505909143424294}. Best is trial 5 with value: 0.47618797421455383.[0m



##### Using fold 5


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.097 New best_val_rmse: 1.097

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.108 Still best_val_rmse: 1.097 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7927 New best_val_rmse: 0.7927

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7666 New best_val_rmse: 0.7666

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6231 New best_val_rmse: 0.6231

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.588 New best_val_rmse: 0.588

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5742 New best_val_rmse: 0.5742

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5524 New best_val_rmse: 0.5524

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5451 New best_val_rmse: 0.5451

16 steps took 12.3 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5505 Still best_val_rmse: 0.5451 (from epoch 0)

16 steps took 12.1 seconds
E

[32m[I 2021-07-11 14:31:10,208][0m Trial 10 finished with value: 0.49558311700820923 and parameters: {'base_lr': 8.017085602014504e-06, 'last_lr': 0.00013492093789299912}. Best is trial 5 with value: 0.47618797421455383.[0m



##### Using fold 5


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.199 New best_val_rmse: 1.199

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.21 Still best_val_rmse: 1.199 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9498 New best_val_rmse: 0.9498

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.9283 New best_val_rmse: 0.9283

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6395 New best_val_rmse: 0.6395

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6178 New best_val_rmse: 0.6178

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5998 New best_val_rmse: 0.5998

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5853 New best_val_rmse: 0.5853

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5625 New best_val_rmse: 0.5625

16 steps took 12.3 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5902 Still best_val_rmse: 0.5625 (from epoch 0)

16 steps took 12.1 seconds


[32m[I 2021-07-11 14:41:27,460][0m Trial 11 finished with value: 0.51274573802948 and parameters: {'base_lr': 8.162314023752154e-06, 'last_lr': 0.0012648043538341749}. Best is trial 5 with value: 0.47618797421455383.[0m



##### Using fold 5


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.148 New best_val_rmse: 1.148

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6693 New best_val_rmse: 0.6693

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.657 New best_val_rmse: 0.657

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7588 Still best_val_rmse: 0.657 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6051 New best_val_rmse: 0.6051

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6251 Still best_val_rmse: 0.6051 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7151 Still best_val_rmse: 0.6051 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6284 Still best_val_rmse: 0.6051 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5375 New best_val_rmse: 0.5375

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5398 Still best_val_rmse:

[32m[I 2021-07-11 14:54:30,854][0m Trial 12 finished with value: 0.4829988181591034 and parameters: {'base_lr': 3.993251697368309e-05, 'last_lr': 0.004782203168092541}. Best is trial 5 with value: 0.47618797421455383.[0m



##### Using fold 5


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9993 New best_val_rmse: 0.9993

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9077 New best_val_rmse: 0.9077

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.68 New best_val_rmse: 0.68

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6592 New best_val_rmse: 0.6592

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6086 New best_val_rmse: 0.6086

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5701 New best_val_rmse: 0.5701

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7384 Still best_val_rmse: 0.5701 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5761 Still best_val_rmse: 0.5701 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6004 Still best_val_rmse: 0.5701 (from epoch 0)

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5511 New best_val_rmse: 0.5511

16 steps 

[32m[I 2021-07-11 15:05:08,092][0m Trial 13 finished with value: 0.49801746010780334 and parameters: {'base_lr': 1.3118716479802712e-05, 'last_lr': 0.0019475564222367723}. Best is trial 5 with value: 0.47618797421455383.[0m



##### Using fold 5


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.152 New best_val_rmse: 1.152

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.075 New best_val_rmse: 1.075

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.026 New best_val_rmse: 1.026

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.158 Still best_val_rmse: 1.026 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.081 Still best_val_rmse: 1.026 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.019 New best_val_rmse: 1.019

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.027 Still best_val_rmse: 1.019 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.065 Still best_val_rmse: 1.019 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.084 Still best_val_rmse: 1.019 (from epoch 0)


[32m[I 2021-07-11 15:08:38,374][0m Trial 14 finished with value: 1.0189093351364136 and parameters: {'base_lr': 0.00048724104170082695, 'last_lr': 0.0006631778951914523}. Best is trial 5 with value: 0.47618797421455383.[0m



##### Using fold 5


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.07 New best_val_rmse: 1.07

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9305 New best_val_rmse: 0.9305

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.667 New best_val_rmse: 0.667

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.644 New best_val_rmse: 0.644

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5959 New best_val_rmse: 0.5959

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.563 New best_val_rmse: 0.563

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7647 Still best_val_rmse: 0.563 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6152 Still best_val_rmse: 0.563 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5632 Still best_val_rmse: 0.563 (from epoch 0)

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5499 New best_val_rmse: 0.5499

16 steps took 12.1

[32m[I 2021-07-11 15:19:17,839][0m Trial 15 finished with value: 0.4953896999359131 and parameters: {'base_lr': 1.3497842269193575e-05, 'last_lr': 0.0022516911780975715}. Best is trial 5 with value: 0.47618797421455383.[0m



##### Using fold 5


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9399 New best_val_rmse: 0.9399

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8119 New best_val_rmse: 0.8119

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6663 New best_val_rmse: 0.6663

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7917 Still best_val_rmse: 0.6663 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6626 New best_val_rmse: 0.6626

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5827 New best_val_rmse: 0.5827

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5677 New best_val_rmse: 0.5677

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5896 Still best_val_rmse: 0.5677 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.546 New best_val_rmse: 0.546

16 steps took 12.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5153 New best_val_rmse: 0.5153

16 steps took 12.1 secon

[32m[I 2021-07-11 15:32:07,875][0m Trial 16 finished with value: 0.4826622009277344 and parameters: {'base_lr': 3.331068288166276e-05, 'last_lr': 0.0009179994715435172}. Best is trial 5 with value: 0.47618797421455383.[0m



##### Using fold 5


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.072 New best_val_rmse: 1.072

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9895 New best_val_rmse: 0.9895

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6842 New best_val_rmse: 0.6842

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6699 New best_val_rmse: 0.6699

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6094 New best_val_rmse: 0.6094

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.565 New best_val_rmse: 0.565

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5541 New best_val_rmse: 0.5541

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5872 Still best_val_rmse: 0.5541 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5379 New best_val_rmse: 0.5379

16 steps took 12.5 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.532 New best_val_rmse: 0.532

16 steps took 12.0 seconds
Epoch: 1 batch_num

[32m[I 2021-07-11 15:45:18,122][0m Trial 17 finished with value: 0.4872971773147583 and parameters: {'base_lr': 1.3001208754924894e-05, 'last_lr': 0.00024317357096544272}. Best is trial 5 with value: 0.47618797421455383.[0m



##### Using fold 5


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9715 New best_val_rmse: 0.9715

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7895 New best_val_rmse: 0.7895

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9471 Still best_val_rmse: 0.7895 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.075 Still best_val_rmse: 0.7895 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.8776 Still best_val_rmse: 0.7895 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.9583 Still best_val_rmse: 0.7895 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.8304 Still best_val_rmse: 0.7895 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6568 New best_val_rmse: 0.6568

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.7231 Still best_val_rmse: 0.6568 (from epoch 0)


[32m[I 2021-07-11 15:48:48,033][0m Trial 18 finished with value: 0.6567704081535339 and parameters: {'base_lr': 6.495609880726438e-05, 'last_lr': 0.0025172819620726955}. Best is trial 5 with value: 0.47618797421455383.[0m



##### Using fold 5


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.3 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.169 New best_val_rmse: 1.169

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7031 New best_val_rmse: 0.7031

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6681 New best_val_rmse: 0.6681

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8923 Still best_val_rmse: 0.6681 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6045 New best_val_rmse: 0.6045

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5919 New best_val_rmse: 0.5919

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6644 Still best_val_rmse: 0.5919 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5758 New best_val_rmse: 0.5758

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5583 New best_val_rmse: 0.5583

16 steps took 12.3 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5339 New best_val_rmse: 0.5339

16 steps took 12.1 secon

[32m[I 2021-07-11 16:02:39,973][0m Trial 19 finished with value: 0.4797256886959076 and parameters: {'base_lr': 3.2222410815486126e-05, 'last_lr': 0.004855094453925757}. Best is trial 5 with value: 0.47618797421455383.[0m



 Best value:  0.47618797421455383
 Best params: 
    base_lr: 3.1211012676571296e-05
    last_lr: 0.003503681146690041
CPU times: user 6h 4min 54s, sys: 1h 35min 34s, total: 7h 40min 29s
Wall time: 8h 3min 38s


In [None]:
%%time

list_val_rmse = []
use_lr_map = {0: False, 1: True, 2: False, 3: False, 4: False, 5: False}

pbar = tqdm(enumerate(splits), total=cfg.NUM_FOLDS, position=0, leave=True)
for fold, (train_indices, val_indices) in pbar:
    pbar.set_description(f'Fold {fold}')
    model_path = cfg.MODEL_FOLDER/f"{cfg.model_name.replace('/', '_')}_{fold + 1}/model_{fold + 1}.pth"
    
    set_random_seed(cfg.SEED + fold)
    
    tokenizer = AutoTokenizer.from_pretrained(cfg.TOKENIZER_PATH)
    
    train_dataset = CommonLitDataset(train_df.loc[train_indices], tokenizer)    
    val_dataset = CommonLitDataset(train_df.loc[val_indices], tokenizer)
    
    train_loader = DataLoader(train_dataset, batch_size=cfg.BATCH_SIZE,
                              drop_last=False, shuffle=True, num_workers=cfg.NUM_WORKERS)    
    val_loader = DataLoader(val_dataset, batch_size=cfg.BATCH_SIZE,
                            drop_last=False, shuffle=False, num_workers=cfg.NUM_WORKERS)
    
    set_random_seed(cfg.SEED + fold)
    
    model = CommonLitModel().to(cfg.DEVICE)
        
    optimizer = create_optimizer(model, use_lr=use_lr_map[fold])
    
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_training_steps=cfg.NUM_EPOCHS * len(train_loader), 
                                                num_warmup_steps=50)
    
    trainer = Trainer(model, model_path, train_loader, val_loader, optimizer, scheduler = scheduler)
    list_val_rmse.append(trainer.train())
    
    tokenizer.save_pretrained(str(model_path.parent))
    del model
    gc.collect()
    
    if cfg.DEVICE == 'cuda':
        torch.cuda.empty_cache()
        
print("\nPerformance estimates:")
print(list_val_rmse)
print("Mean:", np.array(list_val_rmse).mean())

### Verify the model

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm

In [None]:
cfg.model_offset = 0
cfg.model_limit = 6
cfg.n_folds = 5
cfg.svm_kernels = ['rbf']
cfg.svm_c = 5

In [None]:
num_bins = int(np.ceil(np.log2(len(train_df))))
train_df['bins'] = pd.cut(train_df['target'], bins=num_bins, labels=False)
bins = train_df['bins'].values

In [None]:
%%time

inference_models = []
for i in range(1, cfg.NUM_FOLDS + 1):
    print(f'Model {i}')
    inference_model = CommonLitModel()
    inference_model = inference_model.cuda()
    inference_model.load_state_dict(torch.load(str(MODELS_PATH/f"{cfg.model_name.replace('/', '_')}_{i}/model_{i}.pth")))
    inference_model.eval();
    inference_models.append(inference_model)

In [None]:
from transformers import RobertaTokenizer

tokenizers = []
for i in range(1, cfg.NUM_FOLDS):
    tokenizer = RobertaTokenizer.from_pretrained(MODELS_PATH/f"{cfg.model_name.replace('/', '_')}_{i}")
    tokenizers.append(tokenizer)

In [None]:
def get_cls_embeddings(dl, transformer_model):
    cls_embeddings = []
    with torch.no_grad():
        for input_features in tqdm(dl, total=len(dl)):
            output, context_vector = transformer_model(input_features['input_ids'].cuda(), input_features['attention_mask'].cuda())
#             cls_embeddings.extend(output['last_hidden_state'][:,0,:].detach().cpu().numpy())
            embedding_out = context_vector.detach().cpu().numpy()
            cls_embeddings.extend(embedding_out)
    return np.array(cls_embeddings)

In [None]:
def rmse_score(X, y):
    return np.sqrt(mean_squared_error(X, y))

In [None]:
def convert_to_list(t):
    return t.flatten().long()

class CommonLitDataset(nn.Module):
    def __init__(self, text, test_id, tokenizer, max_len=128):
        self.excerpt = text
        self.test_id = test_id
        self.max_len = max_len
        self.tokenizer = tokenizer
    
    def __getitem__(self,idx):
        encode = self.tokenizer(self.excerpt[idx],
                                return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',
                                truncation=True)
        return {'input_ids': convert_to_list(encode['input_ids']),
                'attention_mask': convert_to_list(encode['attention_mask']),
                'id': self.test_id[idx]}
    
    def __len__(self):
        return len(self.excerpt)

In [None]:
def create_dl(df, tokenizer):
    text = df['excerpt'].values
    ids = df['id'].values
    ds = CommonLitDataset(text, ids, tokenizer, max_len=cfg.MAX_LEN)
    return DataLoader(ds, 
                      batch_size = cfg.BATCH_SIZE,
                      shuffle=False,
                      num_workers = 1,
                      pin_memory=True,
                      drop_last=False
                     )

In [None]:
train_df = pd.read_csv(DATA_PATH/'train-orig.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')
remove_unnecessary(train_df)

In [None]:
train_target_mean = train_df['target'].mean()
train_target_std = train_df['target'].std()
train_df['normalized_target'] = (train_df['target'] - train_target_mean) / train_target_std

In [None]:
%%time

train_target = train_df['normalized_target'].values

def calc_mean(scores):
    return np.mean(np.array(scores), axis=0)

final_scores = []
final_rmse = []
kernel_rmse_score_mean = []
final_kernel_predictions_means = []
for j, (inference_model, tokenizer) in enumerate(zip(inference_models, tokenizers)):
    print('Model', j)
    test_dl = create_dl(test_df, tokenizer)
    train_dl = create_dl(train_df, tokenizer)
    transformer_model = inference_model
    transformer_model.cuda()
    X = get_cls_embeddings(train_dl, transformer_model)
    
    y = train_target
    X_test = get_cls_embeddings(test_dl, transformer_model)
    
    kfold = StratifiedKFold(n_splits=cfg.NUM_FOLDS)
    scores = []
    rmse_scores = []
    kernel_predictions_means = []
    for kernel in cfg.svm_kernels:
        print('Kernel', kernel)
        kernel_scores = []
        kernel_rmse_scores = []
        kernel_predictions = []
        for k, (train_idx, valid_idx) in enumerate(kfold.split(X, bins)):

            print('Fold', k, train_idx.shape, valid_idx.shape)
            model = SVR(C=cfg.svm_c, kernel=kernel, gamma='auto')

            X_train, y_train = X[train_idx], y[train_idx]
            X_valid, y_valid = X[valid_idx], y[valid_idx]
            model.fit(X_train, y_train)
            prediction = model.predict(X_valid)
            kernel_predictions.append(prediction)
            kernel_rmse_scores.append(rmse_score(prediction, y_valid))
            print('rmse_score', kernel_rmse_scores[k])
            kernel_scores.append(model.predict(X_test))
        kernel_predictions_means.append(np.array([np.mean(kp) for kp in kernel_predictions]).mean())
        scores.append(calc_mean(kernel_scores))
        kernel_rmse_score = calc_mean(kernel_rmse_scores)
        kernel_rmse_score_mean.append(kernel_rmse_score)
        rmse_scores.append(kernel_rmse_score)
    final_kernel_predictions_means.append(kernel_predictions_means)
    final_scores.append(calc_mean(scores))
    final_rmse.append(calc_mean(rmse_scores))
print('FINAL RMSE score', np.mean(np.array(final_rmse)))

In [None]:
final_kernel_predictions_means

In [None]:
# (train_df['target'] - cfg.train_target_mean) / cfg.train_target_std
final_scores_normalized = np.array(final_scores) * train_target_std + train_target_mean

In [None]:
kernel_rmse_score_mean_array = np.array(kernel_rmse_score_mean)
kernel_rmse_score_mean_sum = np.sum(kernel_rmse_score_mean_array)
prop_losses = kernel_rmse_score_mean_array / kernel_rmse_score_mean_sum
prop_losses_sum = (1 - prop_losses).sum()
weights = (1 - prop_losses) / prop_losses_sum
weights

In [None]:
def calc_mean(scores, weights=weights):
    return np.average(np.array(scores), weights=weights, axis=0)

In [None]:
target_mean = train_df['target'].mean()
final_scores_flat = calc_mean(final_scores_normalized).flatten()
final_scores_mean = final_scores_flat.mean()
target_mean, np.array(final_scores_normalized).mean()
# (-0.9579984513405823, -0.8029817438292849)

In [None]:
final_scores_flat

In [None]:
mean_diff = target_mean - final_scores_mean
mean_diff, mean_diff / len(final_scores)

In [None]:
sample_df['target'] = final_scores_flat + mean_diff
# sample_df['target'] = len(final_scores) / np.sum(1 / np.array(final_scores), axis=0) # harmonic mean
sample_df

### Prepare Packaging

In [None]:
cfg.model_name

In [None]:
BEST_MODEL_FOLDER = MODELS_PATH/cfg.model_name/'best'
!rm -rf {BEST_MODEL_FOLDER}
!mkdir -p {BEST_MODEL_FOLDER}

In [None]:
BEST_MODEL_FOLDER

In [None]:
cfg.NUM_FOLDS

In [None]:
bestmodels = [MODELS_PATH/f'{cfg.model_name}_{i + 1}' for i in range(0, cfg.NUM_FOLDS)]

In [None]:
bestmodels

In [None]:
from shutil import copyfile

def normalize_name(path_name):
    return path_name.replace('', '')

for i, best_model in enumerate(bestmodels):
    print(f'Processing {i}th model')
    i = i + 1
    best_model_file = f'{best_model}/model_{i}.pth'
    if Path(best_model_file).exists():
        copyfile(best_model_file, f'{BEST_MODEL_FOLDER}/{i}_pytorch_model.bin')
        tokenizer_path = Path(BEST_MODEL_FOLDER/f'tokenizer-{i}')
        tokenizer_path.mkdir(parents=True, exist_ok=True)
        assert tokenizer_path.exists()

        tokenizer_json = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/tokenizer_config.json'))
        assert tokenizer_json.exists(), f'{tokenizer_json} does not exist'
        copyfile(tokenizer_json, tokenizer_path/'tokenizer.json')

        vocab_txt = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/vocab.json'))
        assert vocab_txt.exists(), f'{vocab_txt} does not exist'
        copyfile(vocab_txt, tokenizer_path/'vocab.json')

        merges = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/merges.txt'))
        assert merges.exists()
        copyfile(merges, tokenizer_path/'merges.txt')
    else:
        print(f'{best_model_file} is missing')

In [None]:
import shutil

shutil.make_archive(MODELS_PATH/cfg.model_name/'best_models', 'zip', BEST_MODEL_FOLDER)

In [None]:
!ls {MODELS_PATH/cfg.model_name}

In [None]:
!mv {MODELS_PATH}/{cfg.model_name}.yaml {MODELS_PATH/cfg.model_name}

In [None]:
transformer_model.transformer_model.save_pretrained(save_directory=f'{MODELS_PATH/cfg.model_name}/lm')

In [None]:
!du -h {MODELS_PATH/cfg.model_name}/*

In [None]:
shutil.make_archive(MODELS_PATH/cfg.model_name/'lm', 'zip', f'{MODELS_PATH/cfg.model_name}/lm')

In [None]:
!kaggle datasets init -p {MODELS_PATH/cfg.model_name}

In [None]:
dataset_json_path = Path(MODELS_PATH/cfg.model_name/'dataset-metadata.json')
assert dataset_json_path.exists()

In [None]:
!cat {str(dataset_json_path)}

In [None]:
with open(dataset_json_path, 'r') as f:
    dataset_json = f.read()
    dataset_json = dataset_json.replace('INSERT_TITLE_HERE', f'commonlit-{cfg.model_name}-light').replace('INSERT_SLUG_HERE', f'commonlit-{cfg.model_name}-light')
    print(dataset_json)
with(open(dataset_json_path, 'w')) as f:
    f.write(dataset_json)

In [None]:
!rm -rf {MODELS_PATH/cfg.model_name}/best
!rm -rf {MODELS_PATH/cfg.model_name}/lm

In [None]:
!kaggle datasets create -p {MODELS_PATH/cfg.model_name}

In [None]:
!kaggle datasets version -p {MODELS_PATH/cfg.model_name} -m "Version with merges.txt" -d

In [None]:
state_dict = torch.load(str(MODELS_PATH/f'distilroberta-0/checkpoint-105/pytorch_model.bin'))

In [None]:
loaded_model = CommonLitModel()

In [None]:
loaded_model.load_state_dict(state_dict)