In [1]:
# !pip install optuna

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import gc, warnings, random, time, os

from pathlib import Path

from tqdm.notebook import tqdm

warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from transformers import AutoModel, AutoTokenizer, AutoConfig
from transformers import get_cosine_schedule_with_warmup

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import seaborn as sns

import gc
gc.enable()

import optuna

### Folders and Dataframes

In [2]:
DATA_PATH = Path('/home/commonlit/data/')
assert DATA_PATH.exists()
MODELS_PATH = Path('/home/commonlit/models/')
if not MODELS_PATH.exists():
    os.mkdir(MODELS_PATH)
assert MODELS_PATH.exists()

In [3]:
train_df = pd.read_csv(DATA_PATH/'train-orig.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')
sample_df = pd.read_csv(DATA_PATH/'sample_submission.csv')

In [4]:
def remove_unnecessary(df):
    df.drop(df[df['target'] == 0].index, inplace=True)
    df.reset_index(drop=True, inplace=True)
    
remove_unnecessary(train_df)

In [5]:
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845
...,...,...,...,...,...,...
2828,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2829,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2830,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2831,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


### Config and Seeding

In [6]:
class Config(): 
    NUM_FOLDS = 6
    NUM_EPOCHS = 3
    BATCH_SIZE = 16
    MAX_LEN = 248
    EVAL_SCHEDULE = [(0.50, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1., 1)]
    MODEL_PATH = 'google/electra-large-discriminator'
    TOKENIZER_PATH = 'google/electra-large-discriminator'
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    SEED = 1000
    NUM_WORKERS = 2
    MODEL_FOLDER = MODELS_PATH
    model_name = 'electra-large-discriminator'
    svm_kernels = ['rbf']
    svm_c = 5

cfg = Config()

In [7]:
if not cfg.MODEL_FOLDER.exists():
    os.mkdir(cfg.MODEL_FOLDER)

In [8]:
def set_random_seed(random_seed):
    random.seed(random_seed)
    np.random.seed(random_seed)
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    torch.backends.cudnn.deterministic = True

### Dataset

In [9]:
def add_bins(train_df, num_bins):
    train_df.loc[:, 'bins'] = pd.cut(train_df['target'], bins=num_bins, labels=False)
    return num_bins

In [10]:
add_bins(train_df, cfg.NUM_FOLDS)

6

In [11]:
train_df.groupby(['bins'])['target'].agg(['count', 'mean'])

Unnamed: 0_level_0,count,mean
bins,Unnamed: 1_level_1,Unnamed: 2_level_1
0,122,-3.125765
1,441,-2.270279
2,784,-1.41215
3,886,-0.548095
4,494,0.289716
5,106,1.070237


In [12]:
tokenizer = AutoTokenizer.from_pretrained(cfg.TOKENIZER_PATH)

In [13]:
class CommonLitDataset(Dataset):
    def __init__(self, df, tokenizer, inference_only=False):
        super().__init__()
        self.df, self.inference_only = df, inference_only
        self.text = df['excerpt'].tolist()
        self.bins = df['bins']
        if not inference_only:
            self.target = torch.tensor(df['target'].to_numpy(), dtype = torch.float32)
        
        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding = 'max_length',
            max_length = cfg.MAX_LEN,
            truncation = True,
            return_attention_mask=True
        )
        
    def __getitem__(self, index):        
        input_ids = torch.tensor(self.encoded['input_ids'][index])
        attention_mask = torch.tensor(self.encoded['attention_mask'][index])
        
        if self.inference_only:
            return {'input_ids': input_ids, 'attention_mask': attention_mask}
        else:
            target = self.target[index]
            return {'input_ids': input_ids, 'attention_mask': attention_mask, 'target': target}
    
    def __len__(self):
        return len(self.df)

In [14]:
sample_ds = CommonLitDataset(train_df, tokenizer)

### Model

In [15]:
class AttentionHead(nn.Module):
    
    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        
        self.hidden_layer = nn.Linear(in_features, hidden_dim)
        self.final_layer = nn.Linear(hidden_dim, num_targets)
        self.out_features = hidden_dim
        
    def forward(self, features):
        att = torch.tanh(self.hidden_layer(features))
        score = self.final_layer(att)
        attention_weights = torch.softmax(score, dim=1)
        return attention_weights

In [16]:
from transformers import AutoModelForSequenceClassification

class CommonLitModel(nn.Module):
    def __init__(self):
        super(CommonLitModel, self).__init__()
        config = AutoConfig.from_pretrained(cfg.MODEL_PATH)
        config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7
        })
        self.transformer_model = AutoModelForSequenceClassification.from_pretrained(cfg.MODEL_PATH, config=config)
        self.attention = AttentionHead(config.hidden_size, 512, 1)
        self.regressor = nn.Linear(config.hidden_size, 1)
    
    def forward(self, input_ids, attention_mask):
        hidden_states = self.transformer_model(input_ids=input_ids, attention_mask=attention_mask)['hidden_states']
        last_layer_hidden_states = hidden_states[-1]
        weights = self.attention(last_layer_hidden_states)
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1) 
        return self.regressor(context_vector), context_vector

In [17]:
sample_model = CommonLitModel()

Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

In [18]:
import re

for i, (name, param) in enumerate(sample_model.named_parameters()):
    if(name.find('layer') > -1):
        layer_name = re.sub(r'.+(layer\.\d+).+', r'\1', name)

In [19]:
for i, (name, param) in enumerate(sample_model.named_parameters()):
    print(i, name, param.size())

0 transformer_model.electra.embeddings.word_embeddings.weight torch.Size([30522, 1024])
1 transformer_model.electra.embeddings.position_embeddings.weight torch.Size([512, 1024])
2 transformer_model.electra.embeddings.token_type_embeddings.weight torch.Size([2, 1024])
3 transformer_model.electra.embeddings.LayerNorm.weight torch.Size([1024])
4 transformer_model.electra.embeddings.LayerNorm.bias torch.Size([1024])
5 transformer_model.electra.encoder.layer.0.attention.self.query.weight torch.Size([1024, 1024])
6 transformer_model.electra.encoder.layer.0.attention.self.query.bias torch.Size([1024])
7 transformer_model.electra.encoder.layer.0.attention.self.key.weight torch.Size([1024, 1024])
8 transformer_model.electra.encoder.layer.0.attention.self.key.bias torch.Size([1024])
9 transformer_model.electra.encoder.layer.0.attention.self.value.weight torch.Size([1024, 1024])
10 transformer_model.electra.encoder.layer.0.attention.self.value.bias torch.Size([1024])
11 transformer_model.electra.

In [20]:
sample_input_ids = torch.randint(0, 1000, [8, 248])
sample_attention_mask = torch.randint(0, 1000, [8, 248])

In [21]:
internal_out = sample_model.transformer_model(sample_input_ids, sample_attention_mask)

In [22]:
internal_out.keys()

odict_keys(['logits', 'hidden_states'])

In [23]:
internal_out.logits.shape, len(internal_out.hidden_states), internal_out.hidden_states[-1].shape

(torch.Size([8, 2]), 25, torch.Size([8, 248, 1024]))

In [24]:
sample_res = sample_model(sample_input_ids, sample_attention_mask)

In [25]:
sample_res[0].shape, sample_res[1].shape

(torch.Size([8, 1]), torch.Size([8, 1024]))

In [26]:
torch.sum(torch.randn([8, 496, 768]), axis=1)

tensor([[ -2.4525,  34.9220,  35.5474,  ...,  17.2497,   3.8168,   4.0134],
        [ 16.6691,  -1.8793,  23.6438,  ...,   4.7158,  -0.5189, -31.0638],
        [-18.9652,  31.9174,  50.0490,  ...,   0.2722,  27.7940, -27.1529],
        ...,
        [-34.4472, -19.2569,   3.0118,  ...,  -1.4974, -10.1866,  -8.4745],
        [  7.0862,  11.9600,   3.8157,  ...,   0.5911, -23.2350,  10.0487],
        [ -4.8602,  18.3438,  -5.1319,  ...,   0.5165,  51.6398,   0.4375]])

### Evaluation and Prediction

In [27]:
def eval_mse(model, data_loader):
    model.eval()
    mse_sum = 0
    mse_loss = nn.MSELoss(reduction='sum')
    
    with torch.no_grad():
        for batch_num, record in enumerate(data_loader):
            input_ids, attention_mask, target = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE), record['target'].to(cfg.DEVICE)
            pred, _ = model(input_ids, attention_mask)
            mse_sum += mse_loss(pred.flatten().cpu(), target.cpu())
            
    return mse_sum / len(data_loader.dataset)

In [28]:
def predict(model, data_loader):
    model.eval()
    result = []
    
    with torch.no_grad():
        for batch_num, record in tqdm(enumerate(data_loader), total=len(data_loader)):
            input_ids, attention_mask = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE)
            pred, _ = model(input_ids, attention_mask)
            result.extend(pred.flatten().to("cpu").tolist())
            
    return np.array(result)

In [29]:
sample_dl = DataLoader(sample_ds, shuffle=False, batch_size=16, num_workers=1)

### Optimizer and Sampler

In [30]:
5e-5 / 2.5, 5e-5 / 0.5, 5e-5

(2e-05, 0.0001, 5e-05)

In [None]:
def create_optimizer(model, base_lr=5e-5, last_lr=None):
    named_parameters = list(model.named_parameters())
    
    regressor_param_start = 397
    attention_param_start = 393
    roberta_parameters = named_parameters[:attention_param_start]
    attention_parameters = named_parameters[attention_param_start:regressor_param_start]
    regressor_parameters = named_parameters[regressor_param_start:]
    
    attention_group = [params for (name, params) in attention_parameters]
    regressor_group = [params for (name, params) in regressor_parameters]
    
    parameters = []
    if last_lr is not None:
        parameters.append({"params": attention_group, "lr": last_lr})
        parameters.append({"params": regressor_group, "lr": last_lr})
    else:
        parameters.append({"params": attention_group})
        parameters.append({"params": regressor_group})
        
    # Change on different models
    layer_low_threshold = 133
    layer_middle_threshold = 261
    
    for layer_num, (name, params) in enumerate(roberta_parameters):
        weight_decay = 0.0 if 'bias' in name else 0.01
        
        lr = base_lr / 2.5 # 2e-05
        if layer_num >= layer_middle_threshold:
            lr = base_lr / 0.5 # 1e-4
        elif layer_num >= layer_low_threshold:        
            lr = base_lr    
            
        parameters.append({"params": params,
                           "weight_decay": weight_decay,
                           "lr": lr})
        
    return AdamW(parameters)

In [32]:
sample_optimizer = create_optimizer(sample_model)

In [33]:
from torch.utils.data import Sampler,SequentialSampler,RandomSampler,SubsetRandomSampler
from collections import Counter

class WeightedSampler(Sampler):
    
    def __init__(self, dataset):
        
        self.indices = list(range(len(dataset)))
        self.num_samples = len(dataset)
        self.label_to_count = dict(Counter(dataset.bins))
        weights = [1/self.label_to_count[i] for i in dataset.bins]
        
        self.weights = torch.tensor(weights,dtype=torch.double)
        
    def __iter__(self):
        count = 0
        index = [self.indices[i] for i in torch.multinomial(self.weights, self.num_samples, replacement=True)]
        while count < self.num_samples:
            yield index[count]
            count += 1
    
    def __len__(self):
        return self.num_samples

### Training

In [34]:
def choose_eval_period(val_rmse):
    for rmse, period in cfg.EVAL_SCHEDULE:
        if val_rmse >= rmse:
            return period

In [35]:
def serialize_best(best_val_rmse, best_epoch, val_rmse, epoch, model, model_path):
    if not best_val_rmse or val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_epoch = epoch
        if not model_path.parent.exists():
            os.makedirs(model_path.parent)
        
#         torch.save(model.state_dict(), model_path)
        print(f"New best_val_rmse: {best_val_rmse:0.4}")
    else:       
        print(f"Still best_val_rmse: {best_val_rmse:0.4}",
              f"(from epoch {best_epoch})")
    return best_epoch, best_val_rmse

In [36]:
class Trainer():
    def __init__(self, scaler, model, model_path, train_loader, val_loader, optimizer, scheduler=None, num_epochs=cfg.NUM_EPOCHS):
        self.scaler, self.model, self.model_path, self.train_loader, self.val_loader, self.optimizer, self.scheduler, self.num_epochs = (
            scaler, model, model_path, train_loader, val_loader, optimizer, scheduler, num_epochs
        )
            
    def train(self):
        self.model.train()
        
        mse_loss = nn.MSELoss(reduction='mean')
        
        best_val_rmse = None
        best_epoch = 0
        step = 0
        last_eval_step = 0
        eval_period = cfg.EVAL_SCHEDULE[0][1]    

        start = time.time()
        val_rmse_list = []
        
        tbar = tqdm(range(self.num_epochs), total=self.num_epochs)
        for epoch in tbar:
            tbar.set_description(f'Epoch: {epoch}')
            val_rmse = None
            for batch_num, record in enumerate(self.train_loader):
                input_ids, attention_mask, target = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE), record['target'].to(cfg.DEVICE)
                
                self.optimizer.zero_grad()
                
                # Casts operations to mixed precision
#                 with torch.cuda.amp.autocast():
                pred, _ = self.model(input_ids, attention_mask)
                mse = mse_loss(pred.flatten(), target)
                    
#                 self.scaler.scale(mse).backward()
#                 self.scaler.step(self.optimizer)
#                 self.scaler.update()
                
                mse.backward()
                self.optimizer.step()
                
                if self.scheduler:
                    self.scheduler.step()
                    
                if step >= last_eval_step + eval_period:
                    elapsed_seconds = time.time() - start
                    num_steps = step - last_eval_step
                    print(f"\n{num_steps} steps took {elapsed_seconds:0.3} seconds")
                    last_eval_step = step
                    
                    val_rmse = np.sqrt(eval_mse(self.model, self.val_loader))
                    print(f"Epoch: {epoch} batch_num: {batch_num}", f"val_rmse: {val_rmse:0.4} ", end='')
                    
                    eval_period = choose_eval_period(val_rmse)
                    best_epoch, best_val_rmse = serialize_best(best_val_rmse, best_epoch, val_rmse, epoch, self.model, self.model_path)
                    val_rmse_list.append(val_rmse)
                    start = time.time()
                # Finish early on condition
                if epoch > 0 and best_val_rmse > 0.6 or (len(val_rmse_list) > 5 and np.array(val_rmse_list).mean() > 1.0):
                    return best_val_rmse
                
                step += 1
        return best_val_rmse

In [37]:
kfold = KFold(n_splits=cfg.NUM_FOLDS, random_state=cfg.SEED, shuffle=True)
splits = list(kfold.split(train_df))

### Optuna

In [38]:
# Best results
# Fold 0: {'base_lr': 6.155021772017101e-05, 'last_lr': 0.004642225106260296}. Best is trial 11 with value: 0.4787840247154236
# Fold 1: {'base_lr': 8.902912488113375e-05, 'last_lr': 0.00023076670834499074}. Best is trial 9 with value: 0.447449654340744
# Fold 2: {'base_lr': 0.00010637579365513648, 'last_lr': 8.276647346442369e-05}. Best is trial 9 with value: 0.47160205245018005
# Fold 3: {'base_lr': 3.543430147790451e-05, 'last_lr': 0.00011412924670673832}. Best is trial 6 with value: 0.46940940618515015
# Fold 4: {'base_lr': 4.8817697487830015e-05, 'last_lr': 0.00010249423984922014}. Best is trial 1 with value: 0.48146629333496094
# Fold 5: {'base_lr': 8.26551556012735e-05, 'last_lr': 0.00012486898084560538}. Best is trial 1 with value: 0.47751927375793457

In [39]:
from transformers import ElectraTokenizer

fold = 0

def objective(trial):
    base_lr = trial.suggest_float("base_lr", 3e-5, 5e-4, log=True)
    last_lr = trial.suggest_float("last_lr", 8e-5, 5e-3, log=True)
    
    print(f'##### Using fold {fold}')
    print(f'##### Using base_lr {base_lr} last_lr {last_lr}')
    
    model_path = cfg.MODEL_FOLDER/f"{cfg.model_name.replace('/', '_')}_{fold + 1}/model_{fold + 1}.pth"
    
    set_random_seed(cfg.SEED + fold)
    
    tokenizer = ElectraTokenizer.from_pretrained(cfg.TOKENIZER_PATH)
    
    train_indices, val_indices = splits[fold]
    train_dataset = CommonLitDataset(train_df.loc[train_indices], tokenizer)    
    val_dataset = CommonLitDataset(train_df.loc[val_indices], tokenizer)
    
    train_loader = DataLoader(train_dataset, batch_size=cfg.BATCH_SIZE,
                              drop_last=False, shuffle=True, num_workers=cfg.NUM_WORKERS)    
    val_loader = DataLoader(val_dataset, batch_size=cfg.BATCH_SIZE,
                            drop_last=False, shuffle=False, num_workers=cfg.NUM_WORKERS)
    
    set_random_seed(cfg.SEED + fold)
    
    model = CommonLitModel().to(cfg.DEVICE)
    
    optimizer = create_optimizer(model, base_lr=base_lr, last_lr=last_lr)
    
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_training_steps=cfg.NUM_EPOCHS * len(train_loader), 
                                                num_warmup_steps=50)
    scaler = torch.cuda.amp.GradScaler()
    
    trainer = Trainer(scaler, model, model_path, train_loader, val_loader, optimizer, scheduler = scheduler)
    rmse_val = trainer.train()
    
    del trainer
    del model
    del tokenizer
    del scaler
    del optimizer
    del train_loader
    del val_loader
    torch.cuda.empty_cache()
    gc.collect()
    
    return rmse_val

In [75]:
%%time

for i in range(3, len(list(splits))):
    fold = i
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20)
    print(" Best value: ", study.best_trial.value)
    print(" Best params: ")
    for key, value in study.best_trial.params.items():
        print(f"    {key}: {value}")

[32m[I 2021-07-18 08:57:08,338][0m A new study created in memory with name: no-name-41a4e5f8-98a1-4af4-b3e9-1e7baa3c3054[0m


##### Using fold 3
##### Using base_lr 3.9221729204532446e-05 last_lr 0.0001143213470823608


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 14.0 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.067 New best_val_rmse: 1.067

16 steps took 11.8 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7445 New best_val_rmse: 0.7445

16 steps took 11.9 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9626 Still best_val_rmse: 0.7445 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6666 New best_val_rmse: 0.6666

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6087 New best_val_rmse: 0.6087

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6204 Still best_val_rmse: 0.6087 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5539 New best_val_rmse: 0.5539

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6022 Still best_val_rmse: 0.5539 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5351 New best_val_rmse: 0.5351

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5432 Still best_val_rmse: 0.5351 (from 

[32m[I 2021-07-18 09:09:25,558][0m Trial 0 finished with value: 0.48292896151542664 and parameters: {'base_lr': 3.9221729204532446e-05, 'last_lr': 0.0001143213470823608}. Best is trial 0 with value: 0.48292896151542664.[0m



##### Using fold 3
##### Using base_lr 0.00033391960782772637 last_lr 0.0005147256938677182


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.257 New best_val_rmse: 1.257

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.069 New best_val_rmse: 1.069

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.044 New best_val_rmse: 1.044

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.042 New best_val_rmse: 1.042

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.046 Still best_val_rmse: 1.042 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.06 Still best_val_rmse: 1.042 (from epoch 0)


[32m[I 2021-07-18 09:11:53,157][0m Trial 1 finished with value: 1.0417264699935913 and parameters: {'base_lr': 0.00033391960782772637, 'last_lr': 0.0005147256938677182}. Best is trial 0 with value: 0.48292896151542664.[0m



##### Using fold 3
##### Using base_lr 7.885487280888193e-05 last_lr 0.00010156645272416291


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9799 New best_val_rmse: 0.9799

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7057 New best_val_rmse: 0.7057

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.256 Still best_val_rmse: 0.7057 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.055 Still best_val_rmse: 0.7057 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.041 Still best_val_rmse: 0.7057 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.063 Still best_val_rmse: 0.7057 (from epoch 0)


[32m[I 2021-07-18 09:14:20,184][0m Trial 2 finished with value: 0.7056781649589539 and parameters: {'base_lr': 7.885487280888193e-05, 'last_lr': 0.00010156645272416291}. Best is trial 0 with value: 0.48292896151542664.[0m



##### Using fold 3
##### Using base_lr 0.00030817536655917727 last_lr 0.0004468268179472102


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.981 New best_val_rmse: 0.981

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7054 New best_val_rmse: 0.7054

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.042 Still best_val_rmse: 0.7054 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.041 Still best_val_rmse: 0.7054 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.047 Still best_val_rmse: 0.7054 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.055 Still best_val_rmse: 0.7054 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.045 Still best_val_rmse: 0.7054 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.07 Still best_val_rmse: 0.7054 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.041 Still best_val_rmse: 0.7054 (from epoch 0)


[32m[I 2021-07-18 09:17:50,555][0m Trial 3 finished with value: 0.7053548693656921 and parameters: {'base_lr': 0.00030817536655917727, 'last_lr': 0.0004468268179472102}. Best is trial 0 with value: 0.48292896151542664.[0m



##### Using fold 3
##### Using base_lr 6.559812562224263e-05 last_lr 0.0007934980185832123


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.038 New best_val_rmse: 1.038

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7863 New best_val_rmse: 0.7863

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9318 Still best_val_rmse: 0.7863 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7288 New best_val_rmse: 0.7288

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.9383 Still best_val_rmse: 0.7288 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6899 New best_val_rmse: 0.6899

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.616 New best_val_rmse: 0.616

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6132 New best_val_rmse: 0.6132

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5723 New best_val_rmse: 0.5723

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5633 New best_val_rmse: 0.5633

16 steps took 12.1 seconds

[32m[I 2021-07-18 09:31:23,650][0m Trial 4 finished with value: 0.4787239134311676 and parameters: {'base_lr': 6.559812562224263e-05, 'last_lr': 0.0007934980185832123}. Best is trial 4 with value: 0.4787239134311676.[0m



##### Using fold 3
##### Using base_lr 0.00026967781912501837 last_lr 0.0001016613078384455


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9488 New best_val_rmse: 0.9488

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.948 New best_val_rmse: 0.948

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.081 Still best_val_rmse: 0.948 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.01 Still best_val_rmse: 0.948 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.041 Still best_val_rmse: 0.948 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.098 Still best_val_rmse: 0.948 (from epoch 0)


[32m[I 2021-07-18 09:33:52,609][0m Trial 5 finished with value: 0.9480197429656982 and parameters: {'base_lr': 0.00026967781912501837, 'last_lr': 0.0001016613078384455}. Best is trial 4 with value: 0.4787239134311676.[0m



##### Using fold 3
##### Using base_lr 3.543430147790451e-05 last_lr 0.00011412924670673832


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.082 New best_val_rmse: 1.082

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7597 New best_val_rmse: 0.7597

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7525 New best_val_rmse: 0.7525

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6422 New best_val_rmse: 0.6422

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5748 New best_val_rmse: 0.5748

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7174 Still best_val_rmse: 0.5748 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5421 New best_val_rmse: 0.5421

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5537 Still best_val_rmse: 0.5421 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5463 Still best_val_rmse: 0.5421 (from epoch 0)

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5767 Still best_val_rmse: 0.5421 (from 

[32m[I 2021-07-18 09:51:17,488][0m Trial 6 finished with value: 0.46940940618515015 and parameters: {'base_lr': 3.543430147790451e-05, 'last_lr': 0.00011412924670673832}. Best is trial 6 with value: 0.46940940618515015.[0m



##### Using fold 3
##### Using base_lr 0.00023136374344373656 last_lr 0.00034607050791877667


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9946 New best_val_rmse: 0.9946

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7824 New best_val_rmse: 0.7824

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.126 Still best_val_rmse: 0.7824 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.122 Still best_val_rmse: 0.7824 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6531 New best_val_rmse: 0.6531

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.77 Still best_val_rmse: 0.6531 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.585 New best_val_rmse: 0.585

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6383 Still best_val_rmse: 0.585 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6341 Still best_val_rmse: 0.585 (from epoch 0)

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.6188 Still 

[32m[I 2021-07-18 10:01:10,255][0m Trial 7 finished with value: 0.5061682462692261 and parameters: {'base_lr': 0.00023136374344373656, 'last_lr': 0.00034607050791877667}. Best is trial 6 with value: 0.46940940618515015.[0m



##### Using fold 3
##### Using base_lr 0.00033138965366677297 last_lr 9.83992068017567e-05


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9691 New best_val_rmse: 0.9691

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7084 New best_val_rmse: 0.7084

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.434 Still best_val_rmse: 0.7084 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.047 Still best_val_rmse: 0.7084 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.059 Still best_val_rmse: 0.7084 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.059 Still best_val_rmse: 0.7084 (from epoch 0)


[32m[I 2021-07-18 10:03:38,510][0m Trial 8 finished with value: 0.7083932757377625 and parameters: {'base_lr': 0.00033138965366677297, 'last_lr': 9.83992068017567e-05}. Best is trial 6 with value: 0.46940940618515015.[0m



##### Using fold 3
##### Using base_lr 0.00043098128450056675 last_lr 0.0007715512174267046


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.039 New best_val_rmse: 1.039

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.07 Still best_val_rmse: 1.039 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.057 Still best_val_rmse: 1.039 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.046 Still best_val_rmse: 1.039 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.057 Still best_val_rmse: 1.039 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.054 Still best_val_rmse: 1.039 (from epoch 0)


[32m[I 2021-07-18 10:06:05,552][0m Trial 9 finished with value: 1.0392134189605713 and parameters: {'base_lr': 0.00043098128450056675, 'last_lr': 0.0007715512174267046}. Best is trial 6 with value: 0.46940940618515015.[0m



##### Using fold 3
##### Using base_lr 3.103082061786509e-05 last_lr 0.004775431900384966


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.043 New best_val_rmse: 1.043

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8488 New best_val_rmse: 0.8488

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7942 New best_val_rmse: 0.7942

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7057 New best_val_rmse: 0.7057

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5971 New best_val_rmse: 0.5971

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6341 Still best_val_rmse: 0.5971 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.611 Still best_val_rmse: 0.5971 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6206 Still best_val_rmse: 0.5971 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.564 New best_val_rmse: 0.564

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5243 New best_val_rmse: 0.5243

16 steps t

[32m[I 2021-07-18 10:18:48,626][0m Trial 10 finished with value: 0.4833422005176544 and parameters: {'base_lr': 3.103082061786509e-05, 'last_lr': 0.004775431900384966}. Best is trial 6 with value: 0.46940940618515015.[0m



##### Using fold 3
##### Using base_lr 6.133337931136716e-05 last_lr 0.001524412460715417


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.002 New best_val_rmse: 1.002

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7004 New best_val_rmse: 0.7004

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7624 Still best_val_rmse: 0.7004 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6621 New best_val_rmse: 0.6621

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.084 Still best_val_rmse: 0.6621 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.073 Still best_val_rmse: 0.6621 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.042 Still best_val_rmse: 0.6621 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.116 Still best_val_rmse: 0.6621 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.049 Still best_val_rmse: 0.6621 (from epoch 0)


[32m[I 2021-07-18 10:22:22,386][0m Trial 11 finished with value: 0.6621189713478088 and parameters: {'base_lr': 6.133337931136716e-05, 'last_lr': 0.001524412460715417}. Best is trial 6 with value: 0.46940940618515015.[0m



##### Using fold 3
##### Using base_lr 5.337276554246315e-05 last_lr 0.0015170864781839132


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.038 New best_val_rmse: 1.038

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.769 New best_val_rmse: 0.769

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.304 Still best_val_rmse: 0.769 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.054 Still best_val_rmse: 0.769 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.041 Still best_val_rmse: 0.769 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.066 Still best_val_rmse: 0.769 (from epoch 0)


[32m[I 2021-07-18 10:24:49,802][0m Trial 12 finished with value: 0.7689698934555054 and parameters: {'base_lr': 5.337276554246315e-05, 'last_lr': 0.0015170864781839132}. Best is trial 6 with value: 0.46940940618515015.[0m



##### Using fold 3
##### Using base_lr 0.00011596965301373618 last_lr 0.0002290996985630546


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9369 New best_val_rmse: 0.9369

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8947 New best_val_rmse: 0.8947

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9847 Still best_val_rmse: 0.8947 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.042 Still best_val_rmse: 0.8947 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.044 Still best_val_rmse: 0.8947 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.048 Still best_val_rmse: 0.8947 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.055 Still best_val_rmse: 0.8947 (from epoch 0)


[32m[I 2021-07-18 10:27:38,917][0m Trial 13 finished with value: 0.8946634531021118 and parameters: {'base_lr': 0.00011596965301373618, 'last_lr': 0.0002290996985630546}. Best is trial 6 with value: 0.46940940618515015.[0m



##### Using fold 3
##### Using base_lr 0.00010246766833906455 last_lr 0.001134522287708525


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9247 New best_val_rmse: 0.9247

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8135 New best_val_rmse: 0.8135

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9865 Still best_val_rmse: 0.8135 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7652 New best_val_rmse: 0.7652

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7546 New best_val_rmse: 0.7546

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.9987 Still best_val_rmse: 0.7546 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.8334 Still best_val_rmse: 0.7546 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7827 Still best_val_rmse: 0.7546 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.7239 New best_val_rmse: 0.7239


[32m[I 2021-07-18 10:31:12,708][0m Trial 14 finished with value: 0.7239358425140381 and parameters: {'base_lr': 0.00010246766833906455, 'last_lr': 0.001134522287708525}. Best is trial 6 with value: 0.46940940618515015.[0m



##### Using fold 3
##### Using base_lr 3.0207803172612474e-05 last_lr 0.0032293506317271343


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.055 New best_val_rmse: 1.055

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7023 New best_val_rmse: 0.7023

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9026 Still best_val_rmse: 0.7023 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.63 New best_val_rmse: 0.63

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7403 Still best_val_rmse: 0.63 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6205 New best_val_rmse: 0.6205

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5523 New best_val_rmse: 0.5523

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5841 Still best_val_rmse: 0.5523 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5284 New best_val_rmse: 0.5284

16 steps took 13.0 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5678 Still best_val_rmse: 0.5284 (from epoch 

[32m[I 2021-07-18 10:45:56,315][0m Trial 15 finished with value: 0.47899168729782104 and parameters: {'base_lr': 3.0207803172612474e-05, 'last_lr': 0.0032293506317271343}. Best is trial 6 with value: 0.46940940618515015.[0m



##### Using fold 3
##### Using base_lr 4.4980120599992625e-05 last_lr 0.00019972895712460457


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.021 New best_val_rmse: 1.021

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8023 New best_val_rmse: 0.8023

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6983 New best_val_rmse: 0.6983

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6202 New best_val_rmse: 0.6202

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6056 New best_val_rmse: 0.6056

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.61 Still best_val_rmse: 0.6056 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5887 New best_val_rmse: 0.5887

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5796 New best_val_rmse: 0.5796

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5547 New best_val_rmse: 0.5547

16 steps took 13.0 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.59 Still best_val_rmse: 0.5547 (from epoch 0)

16 steps took 12.1 seconds
E

[32m[I 2021-07-18 10:58:31,130][0m Trial 16 finished with value: 0.48208609223365784 and parameters: {'base_lr': 4.4980120599992625e-05, 'last_lr': 0.00019972895712460457}. Best is trial 6 with value: 0.46940940618515015.[0m



##### Using fold 3
##### Using base_lr 8.032455013131949e-05 last_lr 0.0007624001479884987


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9964 New best_val_rmse: 0.9964

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6798 New best_val_rmse: 0.6798

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9649 Still best_val_rmse: 0.6798 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7542 Still best_val_rmse: 0.6798 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6761 New best_val_rmse: 0.6761

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6685 New best_val_rmse: 0.6685

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6293 New best_val_rmse: 0.6293

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6255 New best_val_rmse: 0.6255

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5605 New best_val_rmse: 0.5605

16 steps took 13.0 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.557 New best_val_rmse: 0.557

16 steps took 12.1 secon

[32m[I 2021-07-18 11:10:14,813][0m Trial 17 finished with value: 0.4859916865825653 and parameters: {'base_lr': 8.032455013131949e-05, 'last_lr': 0.0007624001479884987}. Best is trial 6 with value: 0.46940940618515015.[0m



##### Using fold 3
##### Using base_lr 0.00019743414980651842 last_lr 0.00023746132486575692


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.213 New best_val_rmse: 1.213

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.075 New best_val_rmse: 1.075

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.049 New best_val_rmse: 1.049

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.042 New best_val_rmse: 1.042

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.047 Still best_val_rmse: 1.042 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.058 Still best_val_rmse: 1.042 (from epoch 0)


[32m[I 2021-07-18 11:12:43,149][0m Trial 18 finished with value: 1.041556715965271 and parameters: {'base_lr': 0.00019743414980651842, 'last_lr': 0.00023746132486575692}. Best is trial 6 with value: 0.46940940618515015.[0m



##### Using fold 3
##### Using base_lr 6.516633034520869e-05 last_lr 0.0028155571124699737


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9489 New best_val_rmse: 0.9489

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.667 New best_val_rmse: 0.667

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8921 Still best_val_rmse: 0.667 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7166 Still best_val_rmse: 0.667 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7091 Still best_val_rmse: 0.667 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6824 Still best_val_rmse: 0.667 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5797 New best_val_rmse: 0.5797

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6028 Still best_val_rmse: 0.5797 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5815 Still best_val_rmse: 0.5797 (from epoch 0)

16 steps took 13.0 seconds
Epoch: 1 batch_num: 12 val_

[32m[I 2021-07-18 11:28:51,374][0m Trial 19 finished with value: 0.4711739718914032 and parameters: {'base_lr': 6.516633034520869e-05, 'last_lr': 0.0028155571124699737}. Best is trial 6 with value: 0.46940940618515015.[0m
[32m[I 2021-07-18 11:28:51,376][0m A new study created in memory with name: no-name-cfc91bb7-1e04-4dda-9ebe-742427d1e474[0m



 Best value:  0.46940940618515015
 Best params: 
    base_lr: 3.543430147790451e-05
    last_lr: 0.00011412924670673832
##### Using fold 4
##### Using base_lr 0.00024128117424837637 last_lr 0.0003515748371973511


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8137 New best_val_rmse: 0.8137

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.02 Still best_val_rmse: 0.8137 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.052 Still best_val_rmse: 0.8137 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.206 Still best_val_rmse: 0.8137 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.036 Still best_val_rmse: 0.8137 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.049 Still best_val_rmse: 0.8137 (from epoch 0)


[32m[I 2021-07-18 11:31:18,737][0m Trial 0 finished with value: 0.8137476444244385 and parameters: {'base_lr': 0.00024128117424837637, 'last_lr': 0.0003515748371973511}. Best is trial 0 with value: 0.8137476444244385.[0m



##### Using fold 4
##### Using base_lr 4.8817697487830015e-05 last_lr 0.00010249423984922014


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9823 New best_val_rmse: 0.9823

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8517 New best_val_rmse: 0.8517

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6583 New best_val_rmse: 0.6583

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6434 New best_val_rmse: 0.6434

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5579 New best_val_rmse: 0.5579

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5827 Still best_val_rmse: 0.5579 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7169 Still best_val_rmse: 0.5579 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6163 Still best_val_rmse: 0.5579 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5809 Still best_val_rmse: 0.5579 (from epoch 0)

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5231 New best_val_rm

[32m[I 2021-07-18 11:44:01,212][0m Trial 1 finished with value: 0.48146629333496094 and parameters: {'base_lr': 4.8817697487830015e-05, 'last_lr': 0.00010249423984922014}. Best is trial 1 with value: 0.48146629333496094.[0m



##### Using fold 4
##### Using base_lr 3.427661737141705e-05 last_lr 0.002791313003700882


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9917 New best_val_rmse: 0.9917

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7435 New best_val_rmse: 0.7435

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7171 New best_val_rmse: 0.7171

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6122 New best_val_rmse: 0.6122

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6102 New best_val_rmse: 0.6102

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.629 Still best_val_rmse: 0.6102 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6018 New best_val_rmse: 0.6018

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6367 Still best_val_rmse: 0.6018 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.546 New best_val_rmse: 0.546

16 steps took 13.0 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5519 Still best_val_rmse: 0.546 (from epoch 0)

16 steps 

[32m[I 2021-07-18 11:57:10,595][0m Trial 2 finished with value: 0.48303160071372986 and parameters: {'base_lr': 3.427661737141705e-05, 'last_lr': 0.002791313003700882}. Best is trial 1 with value: 0.48146629333496094.[0m



##### Using fold 4
##### Using base_lr 0.0001580663961310632 last_lr 0.00011707760956862297


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7088 New best_val_rmse: 0.7088

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7558 Still best_val_rmse: 0.7088 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7592 Still best_val_rmse: 0.7088 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6098 New best_val_rmse: 0.6098

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6278 Still best_val_rmse: 0.6098 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6095 New best_val_rmse: 0.6095

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7028 Still best_val_rmse: 0.6095 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6637 Still best_val_rmse: 0.6095 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.571 New best_val_rmse: 0.571

16 steps took 13.0 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5628 

[32m[I 2021-07-18 12:08:37,103][0m Trial 3 finished with value: 0.48762640357017517 and parameters: {'base_lr': 0.0001580663961310632, 'last_lr': 0.00011707760956862297}. Best is trial 1 with value: 0.48146629333496094.[0m



##### Using fold 4
##### Using base_lr 3.162557743081626e-05 last_lr 0.0009448409692747924


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.09 New best_val_rmse: 1.09

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.745 New best_val_rmse: 0.745

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7289 New best_val_rmse: 0.7289

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6695 New best_val_rmse: 0.6695

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6965 Still best_val_rmse: 0.6695 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6069 New best_val_rmse: 0.6069

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5694 New best_val_rmse: 0.5694

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6167 Still best_val_rmse: 0.5694 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6069 Still best_val_rmse: 0.5694 (from epoch 0)

16 steps took 13.0 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5791 Still best_val_rmse: 0.5694 (from epoc

[32m[I 2021-07-18 12:19:38,307][0m Trial 4 finished with value: 0.49399229884147644 and parameters: {'base_lr': 3.162557743081626e-05, 'last_lr': 0.0009448409692747924}. Best is trial 1 with value: 0.48146629333496094.[0m



##### Using fold 4
##### Using base_lr 0.00021642273631281925 last_lr 0.0010479854857927283


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8109 New best_val_rmse: 0.8109

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9566 Still best_val_rmse: 0.8109 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.849 Still best_val_rmse: 0.8109 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.9185 Still best_val_rmse: 0.8109 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.147 Still best_val_rmse: 0.8109 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.146 Still best_val_rmse: 0.8109 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.079 Still best_val_rmse: 0.8109 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.061 Still best_val_rmse: 0.8109 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.078 Still best_val_rmse: 0.8109 (from epoch 0)


[32m[I 2021-07-18 12:23:07,633][0m Trial 5 finished with value: 0.8109349012374878 and parameters: {'base_lr': 0.00021642273631281925, 'last_lr': 0.0010479854857927283}. Best is trial 1 with value: 0.48146629333496094.[0m



##### Using fold 4
##### Using base_lr 9.479351815983019e-05 last_lr 8.95451137019212e-05


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9025 New best_val_rmse: 0.9025

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7665 New best_val_rmse: 0.7665

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6783 New best_val_rmse: 0.6783

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.08 Still best_val_rmse: 0.6783 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.04 Still best_val_rmse: 0.6783 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.039 Still best_val_rmse: 0.6783 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.046 Still best_val_rmse: 0.6783 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.052 Still best_val_rmse: 0.6783 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.081 Still best_val_rmse: 0.6783 (from epoch 0)


[32m[I 2021-07-18 12:26:40,904][0m Trial 6 finished with value: 0.6783124804496765 and parameters: {'base_lr': 9.479351815983019e-05, 'last_lr': 8.95451137019212e-05}. Best is trial 1 with value: 0.48146629333496094.[0m



##### Using fold 4
##### Using base_lr 3.6298085873028507e-05 last_lr 0.0016829203069565596


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.028 New best_val_rmse: 1.028

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7716 New best_val_rmse: 0.7716

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6459 New best_val_rmse: 0.6459

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7821 Still best_val_rmse: 0.6459 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.621 New best_val_rmse: 0.621

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.608 New best_val_rmse: 0.608

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5681 New best_val_rmse: 0.5681

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6129 Still best_val_rmse: 0.5681 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5699 Still best_val_rmse: 0.5681 (from epoch 0)

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5251 New best_val_rmse: 0.5251

16 steps to

[32m[I 2021-07-18 12:38:48,063][0m Trial 7 finished with value: 0.48582446575164795 and parameters: {'base_lr': 3.6298085873028507e-05, 'last_lr': 0.0016829203069565596}. Best is trial 1 with value: 0.48146629333496094.[0m



##### Using fold 4
##### Using base_lr 8.578451138142933e-05 last_lr 0.0001888383008819838


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9531 New best_val_rmse: 0.9531

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8252 New best_val_rmse: 0.8252

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.749 New best_val_rmse: 0.749

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6586 New best_val_rmse: 0.6586

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6777 Still best_val_rmse: 0.6586 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6198 New best_val_rmse: 0.6198

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5951 New best_val_rmse: 0.5951

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5704 New best_val_rmse: 0.5704

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6072 Still best_val_rmse: 0.5704 (from epoch 0)

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5484 New best_val_rmse: 0.5484

16 steps took 12.1 secon

[32m[I 2021-07-18 12:49:55,668][0m Trial 8 finished with value: 0.48891398310661316 and parameters: {'base_lr': 8.578451138142933e-05, 'last_lr': 0.0001888383008819838}. Best is trial 1 with value: 0.48146629333496094.[0m



##### Using fold 4
##### Using base_lr 6.021074245983651e-05 last_lr 0.00020394351353837165


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9425 New best_val_rmse: 0.9425

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7679 New best_val_rmse: 0.7679

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6585 New best_val_rmse: 0.6585

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6535 New best_val_rmse: 0.6535

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.042 Still best_val_rmse: 0.6535 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6365 New best_val_rmse: 0.6365

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5855 New best_val_rmse: 0.5855

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5863 Still best_val_rmse: 0.5855 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6678 Still best_val_rmse: 0.5855 (from epoch 0)

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 1.118 Still best_val_rmse: 0.5855 (from 

[32m[I 2021-07-18 12:59:44,998][0m Trial 9 finished with value: 0.5855399370193481 and parameters: {'base_lr': 6.021074245983651e-05, 'last_lr': 0.00020394351353837165}. Best is trial 1 with value: 0.48146629333496094.[0m



##### Using fold 4
##### Using base_lr 0.00043303891630465564 last_lr 0.00044239048866307607


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8347 New best_val_rmse: 0.8347

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.238 Still best_val_rmse: 0.8347 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.038 Still best_val_rmse: 0.8347 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.141 Still best_val_rmse: 0.8347 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.042 Still best_val_rmse: 0.8347 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.047 Still best_val_rmse: 0.8347 (from epoch 0)


[32m[I 2021-07-18 13:02:13,503][0m Trial 10 finished with value: 0.8346537947654724 and parameters: {'base_lr': 0.00043303891630465564, 'last_lr': 0.00044239048866307607}. Best is trial 1 with value: 0.48146629333496094.[0m



##### Using fold 4
##### Using base_lr 4.665885742771091e-05 last_lr 0.00481452069395669


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9515 New best_val_rmse: 0.9515

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8063 New best_val_rmse: 0.8063

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7532 New best_val_rmse: 0.7532

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6454 New best_val_rmse: 0.6454

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5874 New best_val_rmse: 0.5874

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6142 Still best_val_rmse: 0.5874 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5914 Still best_val_rmse: 0.5874 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6266 Still best_val_rmse: 0.5874 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5624 New best_val_rmse: 0.5624

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5464 New best_val_rmse: 0.5464

16 st

[32m[I 2021-07-18 13:12:05,194][0m Trial 11 finished with value: 0.5056087970733643 and parameters: {'base_lr': 4.665885742771091e-05, 'last_lr': 0.00481452069395669}. Best is trial 1 with value: 0.48146629333496094.[0m



##### Using fold 4
##### Using base_lr 5.8986698024001875e-05 last_lr 0.003946584134608693


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9444 New best_val_rmse: 0.9444

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7973 New best_val_rmse: 0.7973

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7911 New best_val_rmse: 0.7911

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6406 New best_val_rmse: 0.6406

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.689 Still best_val_rmse: 0.6406 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6195 New best_val_rmse: 0.6195

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5951 New best_val_rmse: 0.5951

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6273 Still best_val_rmse: 0.5951 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5632 New best_val_rmse: 0.5632

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5582 New best_val_rmse: 0.5582

16 steps took 12.0 seco

[32m[I 2021-07-18 13:21:54,322][0m Trial 12 finished with value: 0.5581570863723755 and parameters: {'base_lr': 5.8986698024001875e-05, 'last_lr': 0.003946584134608693}. Best is trial 1 with value: 0.48146629333496094.[0m



##### Using fold 4
##### Using base_lr 3.1452237402460025e-05 last_lr 0.00217949357063039


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.058 New best_val_rmse: 1.058

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7499 New best_val_rmse: 0.7499

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6526 New best_val_rmse: 0.6526

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6007 New best_val_rmse: 0.6007

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5698 New best_val_rmse: 0.5698

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6025 Still best_val_rmse: 0.5698 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5687 New best_val_rmse: 0.5687

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5763 Still best_val_rmse: 0.5687 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5595 New best_val_rmse: 0.5595

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5938 Still best_val_rmse: 0.5595 (from epoch 0)

16 step

[32m[I 2021-07-18 13:32:39,485][0m Trial 13 finished with value: 0.4934297502040863 and parameters: {'base_lr': 3.1452237402460025e-05, 'last_lr': 0.00217949357063039}. Best is trial 1 with value: 0.48146629333496094.[0m



##### Using fold 4
##### Using base_lr 5.55414698832763e-05 last_lr 0.002841040578159797


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9315 New best_val_rmse: 0.9315

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7318 New best_val_rmse: 0.7318

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7928 Still best_val_rmse: 0.7318 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.608 New best_val_rmse: 0.608

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5832 New best_val_rmse: 0.5832

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.607 Still best_val_rmse: 0.5832 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6841 Still best_val_rmse: 0.5832 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6207 Still best_val_rmse: 0.5832 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6326 Still best_val_rmse: 0.5832 (from epoch 0)

16 steps took 13.0 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5593 N

[32m[I 2021-07-18 13:43:31,649][0m Trial 14 finished with value: 0.49052903056144714 and parameters: {'base_lr': 5.55414698832763e-05, 'last_lr': 0.002841040578159797}. Best is trial 1 with value: 0.48146629333496094.[0m



##### Using fold 4
##### Using base_lr 4.2581929842872666e-05 last_lr 0.0007445548325755319


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.009 New best_val_rmse: 1.009

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8005 New best_val_rmse: 0.8005

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6706 New best_val_rmse: 0.6706

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.648 New best_val_rmse: 0.648

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7257 Still best_val_rmse: 0.648 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.614 New best_val_rmse: 0.614

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5542 New best_val_rmse: 0.5542

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5711 Still best_val_rmse: 0.5542 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5491 New best_val_rmse: 0.5491

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5576 Still best_val_rmse: 0.5491 (from epoch 0)

16 steps too

[32m[I 2021-07-18 13:55:41,373][0m Trial 15 finished with value: 0.4863857626914978 and parameters: {'base_lr': 4.2581929842872666e-05, 'last_lr': 0.0007445548325755319}. Best is trial 1 with value: 0.48146629333496094.[0m



##### Using fold 4
##### Using base_lr 8.510029147462861e-05 last_lr 0.0013711985991853187


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.932 New best_val_rmse: 0.932

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8573 New best_val_rmse: 0.8573

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6992 New best_val_rmse: 0.6992

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7571 Still best_val_rmse: 0.6992 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7162 Still best_val_rmse: 0.6992 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.665 New best_val_rmse: 0.665

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5699 New best_val_rmse: 0.5699

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5832 Still best_val_rmse: 0.5699 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5507 New best_val_rmse: 0.5507

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.6239 Still best_val_rmse: 0.5507 (from ep

[32m[I 2021-07-18 14:08:09,645][0m Trial 16 finished with value: 0.48239296674728394 and parameters: {'base_lr': 8.510029147462861e-05, 'last_lr': 0.0013711985991853187}. Best is trial 1 with value: 0.48146629333496094.[0m



##### Using fold 4
##### Using base_lr 8.596779419507262e-05 last_lr 0.0013630158228118356


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9393 New best_val_rmse: 0.9393

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7672 New best_val_rmse: 0.7672

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7444 New best_val_rmse: 0.7444

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7974 Still best_val_rmse: 0.7444 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6219 New best_val_rmse: 0.6219

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7832 Still best_val_rmse: 0.6219 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6336 Still best_val_rmse: 0.6219 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.8381 Still best_val_rmse: 0.6219 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.064 Still best_val_rmse: 0.6219 (from epoch 0)


[32m[I 2021-07-18 14:11:44,145][0m Trial 17 finished with value: 0.6219467520713806 and parameters: {'base_lr': 8.596779419507262e-05, 'last_lr': 0.0013630158228118356}. Best is trial 1 with value: 0.48146629333496094.[0m



##### Using fold 4
##### Using base_lr 0.00013525124851816367 last_lr 0.00048456432513444334


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7681 New best_val_rmse: 0.7681

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7868 Still best_val_rmse: 0.7681 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6911 New best_val_rmse: 0.6911

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7043 Still best_val_rmse: 0.6911 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7421 Still best_val_rmse: 0.6911 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.641 New best_val_rmse: 0.641

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5617 New best_val_rmse: 0.5617

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6488 Still best_val_rmse: 0.5617 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5706 Still best_val_rmse: 0.5617 (from epoch 0)

16 steps took 13.0 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5615 

[32m[I 2021-07-18 14:24:10,434][0m Trial 18 finished with value: 0.48597991466522217 and parameters: {'base_lr': 0.00013525124851816367, 'last_lr': 0.00048456432513444334}. Best is trial 1 with value: 0.48146629333496094.[0m



##### Using fold 4
##### Using base_lr 7.526393961835154e-05 last_lr 0.0002398473868424543


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9476 New best_val_rmse: 0.9476

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7828 New best_val_rmse: 0.7828

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.678 New best_val_rmse: 0.678

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7538 Still best_val_rmse: 0.678 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6012 New best_val_rmse: 0.6012

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.601 New best_val_rmse: 0.601

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6465 Still best_val_rmse: 0.601 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.569 New best_val_rmse: 0.569

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6048 Still best_val_rmse: 0.569 (from epoch 0)

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5536 New best_val_rmse: 0.5536

16 steps took 

[32m[I 2021-07-18 14:36:12,667][0m Trial 19 finished with value: 0.48502135276794434 and parameters: {'base_lr': 7.526393961835154e-05, 'last_lr': 0.0002398473868424543}. Best is trial 1 with value: 0.48146629333496094.[0m
[32m[I 2021-07-18 14:36:12,670][0m A new study created in memory with name: no-name-6b32fe68-d5e1-4363-9e00-d044ec228817[0m



 Best value:  0.48146629333496094
 Best params: 
    base_lr: 4.8817697487830015e-05
    last_lr: 0.00010249423984922014
##### Using fold 5
##### Using base_lr 0.00034156537880768314 last_lr 0.0005061073297723381


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.157 New best_val_rmse: 1.157

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.263 Still best_val_rmse: 1.157 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.142 New best_val_rmse: 1.142

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.019 New best_val_rmse: 1.019

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.025 Still best_val_rmse: 1.019 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.022 Still best_val_rmse: 1.019 (from epoch 0)


[32m[I 2021-07-18 14:38:40,140][0m Trial 0 finished with value: 1.018947720527649 and parameters: {'base_lr': 0.00034156537880768314, 'last_lr': 0.0005061073297723381}. Best is trial 0 with value: 1.018947720527649.[0m



##### Using fold 5
##### Using base_lr 8.26551556012735e-05 last_lr 0.00012486898084560538


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9287 New best_val_rmse: 0.9287

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7149 New best_val_rmse: 0.7149

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7342 Still best_val_rmse: 0.7149 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6573 New best_val_rmse: 0.6573

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7195 Still best_val_rmse: 0.6573 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5843 New best_val_rmse: 0.5843

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6044 Still best_val_rmse: 0.5843 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6077 Still best_val_rmse: 0.5843 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5619 New best_val_rmse: 0.5619

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.558 New best_val_rms

[32m[I 2021-07-18 14:54:06,790][0m Trial 1 finished with value: 0.47751927375793457 and parameters: {'base_lr': 8.26551556012735e-05, 'last_lr': 0.00012486898084560538}. Best is trial 1 with value: 0.47751927375793457.[0m



##### Using fold 5
##### Using base_lr 5.07848230852257e-05 last_lr 0.0027807105491802877


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9518 New best_val_rmse: 0.9518

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.829 New best_val_rmse: 0.829

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6824 New best_val_rmse: 0.6824

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7403 Still best_val_rmse: 0.6824 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6534 New best_val_rmse: 0.6534

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6048 New best_val_rmse: 0.6048

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5869 New best_val_rmse: 0.5869

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7119 Still best_val_rmse: 0.5869 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5415 New best_val_rmse: 0.5415

16 steps took 13.0 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.557 Still best_val_rmse: 0.5415 (from epoch 0)

16 steps

[32m[I 2021-07-18 15:05:41,799][0m Trial 2 finished with value: 0.4883090555667877 and parameters: {'base_lr': 5.07848230852257e-05, 'last_lr': 0.0027807105491802877}. Best is trial 1 with value: 0.47751927375793457.[0m



##### Using fold 5
##### Using base_lr 3.128338765685475e-05 last_lr 0.0009565802288329959


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.045 New best_val_rmse: 1.045

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9107 New best_val_rmse: 0.9107

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.74 New best_val_rmse: 0.74

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7275 New best_val_rmse: 0.7275

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6066 New best_val_rmse: 0.6066

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6424 Still best_val_rmse: 0.6066 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5525 New best_val_rmse: 0.5525

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5862 Still best_val_rmse: 0.5525 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5512 New best_val_rmse: 0.5512

16 steps took 13.0 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5363 New best_val_rmse: 0.5363

16 steps took 12.1 seconds
E

[32m[I 2021-07-18 15:20:25,301][0m Trial 3 finished with value: 0.4785512387752533 and parameters: {'base_lr': 3.128338765685475e-05, 'last_lr': 0.0009565802288329959}. Best is trial 1 with value: 0.47751927375793457.[0m



##### Using fold 5
##### Using base_lr 0.00020214649664270073 last_lr 0.0013066014388123754


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.819 New best_val_rmse: 0.819

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7113 New best_val_rmse: 0.7113

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7612 Still best_val_rmse: 0.7113 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6803 New best_val_rmse: 0.6803

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6944 Still best_val_rmse: 0.6803 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.518 Still best_val_rmse: 0.6803 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.064 Still best_val_rmse: 0.6803 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.283 Still best_val_rmse: 0.6803 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.02 Still best_val_rmse: 0.6803 (from epoch 0)


[32m[I 2021-07-18 15:23:58,258][0m Trial 4 finished with value: 0.680277407169342 and parameters: {'base_lr': 0.00020214649664270073, 'last_lr': 0.0013066014388123754}. Best is trial 1 with value: 0.47751927375793457.[0m



##### Using fold 5
##### Using base_lr 0.00019280756020195115 last_lr 0.00025209261521705074


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8803 New best_val_rmse: 0.8803

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.247 Still best_val_rmse: 0.8803 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.086 Still best_val_rmse: 0.8803 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.019 Still best_val_rmse: 0.8803 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.027 Still best_val_rmse: 0.8803 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.024 Still best_val_rmse: 0.8803 (from epoch 0)


[32m[I 2021-07-18 15:26:25,448][0m Trial 5 finished with value: 0.8803056478500366 and parameters: {'base_lr': 0.00019280756020195115, 'last_lr': 0.00025209261521705074}. Best is trial 1 with value: 0.47751927375793457.[0m



##### Using fold 5
##### Using base_lr 0.0003283469741795501 last_lr 0.0005682650127158392


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.073 New best_val_rmse: 1.073

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.259 Still best_val_rmse: 1.073 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.04 New best_val_rmse: 1.04

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.019 New best_val_rmse: 1.019

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.028 Still best_val_rmse: 1.019 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.022 Still best_val_rmse: 1.019 (from epoch 0)


[32m[I 2021-07-18 15:28:52,240][0m Trial 6 finished with value: 1.0189990997314453 and parameters: {'base_lr': 0.0003283469741795501, 'last_lr': 0.0005682650127158392}. Best is trial 1 with value: 0.47751927375793457.[0m



##### Using fold 5
##### Using base_lr 0.0003619342745902642 last_lr 0.0002793166004383098


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7668 New best_val_rmse: 0.7668

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9914 Still best_val_rmse: 0.7668 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.201 Still best_val_rmse: 0.7668 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.023 Still best_val_rmse: 0.7668 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.022 Still best_val_rmse: 0.7668 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.021 Still best_val_rmse: 0.7668 (from epoch 0)


[32m[I 2021-07-18 15:31:19,567][0m Trial 7 finished with value: 0.7668207287788391 and parameters: {'base_lr': 0.0003619342745902642, 'last_lr': 0.0002793166004383098}. Best is trial 1 with value: 0.47751927375793457.[0m



##### Using fold 5
##### Using base_lr 0.00018475470361487584 last_lr 0.0017538885737424285


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8872 New best_val_rmse: 0.8872

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8868 New best_val_rmse: 0.8868

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.441 Still best_val_rmse: 0.8868 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.023 Still best_val_rmse: 0.8868 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.04 Still best_val_rmse: 0.8868 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.029 Still best_val_rmse: 0.8868 (from epoch 0)


[32m[I 2021-07-18 15:33:48,324][0m Trial 8 finished with value: 0.8868013024330139 and parameters: {'base_lr': 0.00018475470361487584, 'last_lr': 0.0017538885737424285}. Best is trial 1 with value: 0.47751927375793457.[0m



##### Using fold 5
##### Using base_lr 0.0002658884716312595 last_lr 0.0002686222191806717


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7271 New best_val_rmse: 0.7271

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.404 Still best_val_rmse: 0.7271 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.08 Still best_val_rmse: 0.7271 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.019 Still best_val_rmse: 0.7271 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.022 Still best_val_rmse: 0.7271 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.021 Still best_val_rmse: 0.7271 (from epoch 0)


[32m[I 2021-07-18 15:36:15,470][0m Trial 9 finished with value: 0.7271288633346558 and parameters: {'base_lr': 0.0002658884716312595, 'last_lr': 0.0002686222191806717}. Best is trial 1 with value: 0.47751927375793457.[0m



##### Using fold 5
##### Using base_lr 8.211697552701408e-05 last_lr 8.57661845639057e-05


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9113 New best_val_rmse: 0.9113

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6853 New best_val_rmse: 0.6853

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7027 Still best_val_rmse: 0.6853 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.681 New best_val_rmse: 0.681

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7108 Still best_val_rmse: 0.681 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7688 Still best_val_rmse: 0.681 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6625 New best_val_rmse: 0.6625

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5848 New best_val_rmse: 0.5848

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.67 Still best_val_rmse: 0.5848 (from epoch 0)

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.595 Still best_val_rmse: 0

[32m[I 2021-07-18 15:47:33,212][0m Trial 10 finished with value: 0.4871692955493927 and parameters: {'base_lr': 8.211697552701408e-05, 'last_lr': 8.57661845639057e-05}. Best is trial 1 with value: 0.47751927375793457.[0m



##### Using fold 5
##### Using base_lr 3.5490024609633844e-05 last_lr 0.0001255600179035026


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.035 New best_val_rmse: 1.035

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6964 New best_val_rmse: 0.6964

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6703 New best_val_rmse: 0.6703

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6156 New best_val_rmse: 0.6156

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.587 New best_val_rmse: 0.587

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6486 Still best_val_rmse: 0.587 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5548 New best_val_rmse: 0.5548

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5761 Still best_val_rmse: 0.5548 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.605 Still best_val_rmse: 0.5548 (from epoch 0)

16 steps took 13.0 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.572 Still best_val_rmse: 0.5548 (from epoch

[32m[I 2021-07-18 15:59:26,474][0m Trial 11 finished with value: 0.4860093891620636 and parameters: {'base_lr': 3.5490024609633844e-05, 'last_lr': 0.0001255600179035026}. Best is trial 1 with value: 0.47751927375793457.[0m



##### Using fold 5
##### Using base_lr 3.015459400919106e-05 last_lr 0.0010162442423036439


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.05 New best_val_rmse: 1.05

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9636 New best_val_rmse: 0.9636

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.715 New best_val_rmse: 0.715

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8584 Still best_val_rmse: 0.715 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6193 New best_val_rmse: 0.6193

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6181 New best_val_rmse: 0.6181

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5551 New best_val_rmse: 0.5551

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7061 Still best_val_rmse: 0.5551 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5866 Still best_val_rmse: 0.5551 (from epoch 0)

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5365 New best_val_rmse: 0.5365

16 steps too

[32m[I 2021-07-18 16:10:01,279][0m Trial 12 finished with value: 0.4918219745159149 and parameters: {'base_lr': 3.015459400919106e-05, 'last_lr': 0.0010162442423036439}. Best is trial 1 with value: 0.47751927375793457.[0m



##### Using fold 5
##### Using base_lr 8.48434209883237e-05 last_lr 0.0037678206336956926


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9444 New best_val_rmse: 0.9444

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7445 New best_val_rmse: 0.7445

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.009 Still best_val_rmse: 0.7445 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6438 New best_val_rmse: 0.6438

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6251 New best_val_rmse: 0.6251

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.9223 Still best_val_rmse: 0.6251 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7759 Still best_val_rmse: 0.6251 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.077 Still best_val_rmse: 0.6251 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.058 Still best_val_rmse: 0.6251 (from epoch 0)


[32m[I 2021-07-18 16:13:35,166][0m Trial 13 finished with value: 0.6251128911972046 and parameters: {'base_lr': 8.48434209883237e-05, 'last_lr': 0.0037678206336956926}. Best is trial 1 with value: 0.47751927375793457.[0m



##### Using fold 5
##### Using base_lr 5.039073444083897e-05 last_lr 0.00013636397981993983


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9854 New best_val_rmse: 0.9854

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6976 New best_val_rmse: 0.6976

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7385 Still best_val_rmse: 0.6976 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7511 Still best_val_rmse: 0.6976 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6033 New best_val_rmse: 0.6033

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6509 Still best_val_rmse: 0.6033 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5784 New best_val_rmse: 0.5784

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5439 New best_val_rmse: 0.5439

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5787 Still best_val_rmse: 0.5439 (from epoch 0)

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5727 Still best_val_

[32m[I 2021-07-18 16:25:17,832][0m Trial 14 finished with value: 0.4847445785999298 and parameters: {'base_lr': 5.039073444083897e-05, 'last_lr': 0.00013636397981993983}. Best is trial 1 with value: 0.47751927375793457.[0m



##### Using fold 5
##### Using base_lr 0.00010922317894753754 last_lr 0.0008560933625476431


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9468 New best_val_rmse: 0.9468

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8095 New best_val_rmse: 0.8095

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7774 New best_val_rmse: 0.7774

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.628 New best_val_rmse: 0.628

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7425 Still best_val_rmse: 0.628 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6998 Still best_val_rmse: 0.628 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5567 New best_val_rmse: 0.5567

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.717 Still best_val_rmse: 0.5567 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6476 Still best_val_rmse: 0.5567 (from epoch 0)

16 steps took 13.0 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.567 Still best_val_rmse: 

[32m[I 2021-07-18 16:40:50,962][0m Trial 15 finished with value: 0.4778694808483124 and parameters: {'base_lr': 0.00010922317894753754, 'last_lr': 0.0008560933625476431}. Best is trial 1 with value: 0.47751927375793457.[0m



##### Using fold 5
##### Using base_lr 0.00011114433176355297 last_lr 0.0004177483097837711


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9009 New best_val_rmse: 0.9009

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7354 New best_val_rmse: 0.7354

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8748 Still best_val_rmse: 0.7354 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.023 Still best_val_rmse: 0.7354 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.058 Still best_val_rmse: 0.7354 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.024 Still best_val_rmse: 0.7354 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.02 Still best_val_rmse: 0.7354 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.094 Still best_val_rmse: 0.7354 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.019 Still best_val_rmse: 0.7354 (from epoch 0)


[32m[I 2021-07-18 16:44:24,049][0m Trial 16 finished with value: 0.7354328036308289 and parameters: {'base_lr': 0.00011114433176355297, 'last_lr': 0.0004177483097837711}. Best is trial 1 with value: 0.47751927375793457.[0m



##### Using fold 5
##### Using base_lr 0.0001243332575249866 last_lr 0.0007661593158737323


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9477 New best_val_rmse: 0.9477

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.063 Still best_val_rmse: 0.9477 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.048 Still best_val_rmse: 0.9477 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.02 Still best_val_rmse: 0.9477 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.024 Still best_val_rmse: 0.9477 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.021 Still best_val_rmse: 0.9477 (from epoch 0)


[32m[I 2021-07-18 16:46:50,923][0m Trial 17 finished with value: 0.9476915597915649 and parameters: {'base_lr': 0.0001243332575249866, 'last_lr': 0.0007661593158737323}. Best is trial 1 with value: 0.47751927375793457.[0m



##### Using fold 5
##### Using base_lr 6.389127063538818e-05 last_lr 0.0020876227287264706


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9429 New best_val_rmse: 0.9429

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.74 New best_val_rmse: 0.74

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7617 Still best_val_rmse: 0.74 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7081 New best_val_rmse: 0.7081

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.84 Still best_val_rmse: 0.7081 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6406 New best_val_rmse: 0.6406

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5873 New best_val_rmse: 0.5873

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7791 Still best_val_rmse: 0.5873 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5594 New best_val_rmse: 0.5594

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5865 Still best_val_rmse: 0.5594 (from epoch 

[32m[I 2021-07-18 16:57:33,951][0m Trial 18 finished with value: 0.49168142676353455 and parameters: {'base_lr': 6.389127063538818e-05, 'last_lr': 0.0020876227287264706}. Best is trial 1 with value: 0.47751927375793457.[0m



##### Using fold 5
##### Using base_lr 0.00013796165088360738 last_lr 0.0001683495630041909


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9098 New best_val_rmse: 0.9098

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.203 Still best_val_rmse: 0.9098 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.04 Still best_val_rmse: 0.9098 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.02 Still best_val_rmse: 0.9098 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.023 Still best_val_rmse: 0.9098 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.023 Still best_val_rmse: 0.9098 (from epoch 0)


[32m[I 2021-07-18 17:00:01,626][0m Trial 19 finished with value: 0.9098045825958252 and parameters: {'base_lr': 0.00013796165088360738, 'last_lr': 0.0001683495630041909}. Best is trial 1 with value: 0.47751927375793457.[0m



 Best value:  0.47751927375793457
 Best params: 
    base_lr: 8.26551556012735e-05
    last_lr: 0.00012486898084560538
CPU times: user 6h 14min 46s, sys: 1h 35min 34s, total: 7h 50min 20s
Wall time: 8h 2min 53s


In [None]:
for i in range(0, 3):
    fold = i
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20)
    print(" Best value: ", study.best_trial.value)
    print(" Best params: ")
    for key, value in study.best_trial.params.items():
        print(f"    {key}: {value}")

[32m[I 2021-07-18 17:00:01,637][0m A new study created in memory with name: no-name-af99b6bb-7317-45de-acd2-202575794b30[0m


##### Using fold 0
##### Using base_lr 0.0004639416902680064 last_lr 0.004754676740356026


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.033 New best_val_rmse: 1.033

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.309 Still best_val_rmse: 1.033 (from epoch 0)

16 steps took 11.9 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.083 Still best_val_rmse: 1.033 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.292 Still best_val_rmse: 1.033 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.046 Still best_val_rmse: 1.033 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.034 Still best_val_rmse: 1.033 (from epoch 0)


[32m[I 2021-07-18 17:02:28,049][0m Trial 0 finished with value: 1.0334450006484985 and parameters: {'base_lr': 0.0004639416902680064, 'last_lr': 0.004754676740356026}. Best is trial 0 with value: 1.0334450006484985.[0m



##### Using fold 0
##### Using base_lr 0.00026780595944539176 last_lr 0.0004427113015418867


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.08 New best_val_rmse: 1.08

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.041 New best_val_rmse: 1.041

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.277 Still best_val_rmse: 1.041 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.085 Still best_val_rmse: 1.041 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.042 Still best_val_rmse: 1.041 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.043 Still best_val_rmse: 1.041 (from epoch 0)


[32m[I 2021-07-18 17:04:54,858][0m Trial 1 finished with value: 1.0410969257354736 and parameters: {'base_lr': 0.00026780595944539176, 'last_lr': 0.0004427113015418867}. Best is trial 0 with value: 1.0334450006484985.[0m



##### Using fold 0
##### Using base_lr 0.00048418508239822506 last_lr 0.0001679007782255595


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.581 New best_val_rmse: 1.581

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.057 New best_val_rmse: 1.057

16 steps took 11.9 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.115 Still best_val_rmse: 1.057 (from epoch 0)

16 steps took 11.9 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.136 Still best_val_rmse: 1.057 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.034 New best_val_rmse: 1.034

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.037 Still best_val_rmse: 1.034 (from epoch 0)


[32m[I 2021-07-18 17:07:22,838][0m Trial 2 finished with value: 1.0341919660568237 and parameters: {'base_lr': 0.00048418508239822506, 'last_lr': 0.0001679007782255595}. Best is trial 0 with value: 1.0334450006484985.[0m



##### Using fold 0
##### Using base_lr 0.00017152055749308505 last_lr 8.979830270843491e-05


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.009 New best_val_rmse: 1.009

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.041 Still best_val_rmse: 1.009 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.1 Still best_val_rmse: 1.009 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.069 Still best_val_rmse: 1.009 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.037 Still best_val_rmse: 1.009 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.045 Still best_val_rmse: 1.009 (from epoch 0)


[32m[I 2021-07-18 17:09:50,035][0m Trial 3 finished with value: 1.0085803270339966 and parameters: {'base_lr': 0.00017152055749308505, 'last_lr': 8.979830270843491e-05}. Best is trial 3 with value: 1.0085803270339966.[0m



##### Using fold 0
##### Using base_lr 3.684280497803618e-05 last_lr 0.001455401689185026


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9898 New best_val_rmse: 0.9898

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.911 New best_val_rmse: 0.911

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6758 New best_val_rmse: 0.6758

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6435 New best_val_rmse: 0.6435

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.751 Still best_val_rmse: 0.6435 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.047 Still best_val_rmse: 0.6435 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7194 Still best_val_rmse: 0.6435 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7206 Still best_val_rmse: 0.6435 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6043 New best_val_rmse: 0.6043


[32m[I 2021-07-18 17:13:23,402][0m Trial 4 finished with value: 0.6043326258659363 and parameters: {'base_lr': 3.684280497803618e-05, 'last_lr': 0.001455401689185026}. Best is trial 4 with value: 0.6043326258659363.[0m



##### Using fold 0
##### Using base_lr 6.646113844445599e-05 last_lr 0.000990784832291025


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8628 New best_val_rmse: 0.8628

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7718 New best_val_rmse: 0.7718

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.008 Still best_val_rmse: 0.7718 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7548 New best_val_rmse: 0.7548

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6864 New best_val_rmse: 0.6864

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7752 Still best_val_rmse: 0.6864 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6081 New best_val_rmse: 0.6081

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5984 New best_val_rmse: 0.5984

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.7358 Still best_val_rmse: 0.5984 (from epoch 0)

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5707 New best_val_rmse: 0.5707

16 ste

[32m[I 2021-07-18 17:23:49,350][0m Trial 5 finished with value: 0.4964250326156616 and parameters: {'base_lr': 6.646113844445599e-05, 'last_lr': 0.000990784832291025}. Best is trial 5 with value: 0.4964250326156616.[0m



##### Using fold 0
##### Using base_lr 0.0002827982851197117 last_lr 0.00014207261282089427


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.16 New best_val_rmse: 1.16

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.07 New best_val_rmse: 1.07

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.077 Still best_val_rmse: 1.07 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.089 Still best_val_rmse: 1.07 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.038 New best_val_rmse: 1.038

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.042 Still best_val_rmse: 1.038 (from epoch 0)


[32m[I 2021-07-18 17:26:17,653][0m Trial 6 finished with value: 1.0377686023712158 and parameters: {'base_lr': 0.0002827982851197117, 'last_lr': 0.00014207261282089427}. Best is trial 5 with value: 0.4964250326156616.[0m



##### Using fold 0
##### Using base_lr 0.00012703529957781474 last_lr 0.00011717300579650607


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9501 New best_val_rmse: 0.9501

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.004 Still best_val_rmse: 0.9501 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.06 Still best_val_rmse: 0.9501 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.191 Still best_val_rmse: 0.9501 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.036 Still best_val_rmse: 0.9501 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.041 Still best_val_rmse: 0.9501 (from epoch 0)


[32m[I 2021-07-18 17:28:44,534][0m Trial 7 finished with value: 0.950054943561554 and parameters: {'base_lr': 0.00012703529957781474, 'last_lr': 0.00011717300579650607}. Best is trial 5 with value: 0.4964250326156616.[0m



##### Using fold 0
##### Using base_lr 3.3423606764108414e-05 last_lr 0.00015748526576262877


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.032 New best_val_rmse: 1.032

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8521 New best_val_rmse: 0.8521

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6398 New best_val_rmse: 0.6398

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6169 New best_val_rmse: 0.6169

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6504 Still best_val_rmse: 0.6169 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.8885 Still best_val_rmse: 0.6169 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6575 Still best_val_rmse: 0.6169 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6632 Still best_val_rmse: 0.6169 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6192 Still best_val_rmse: 0.6169 (from epoch 0)


[32m[I 2021-07-18 17:32:19,590][0m Trial 8 finished with value: 0.6168890595436096 and parameters: {'base_lr': 3.3423606764108414e-05, 'last_lr': 0.00015748526576262877}. Best is trial 5 with value: 0.4964250326156616.[0m



##### Using fold 0
##### Using base_lr 9.109929231927102e-05 last_lr 0.0011340424636788516


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8718 New best_val_rmse: 0.8718

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9891 Still best_val_rmse: 0.8718 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7633 New best_val_rmse: 0.7633

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.013 Still best_val_rmse: 0.7633 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.08 Still best_val_rmse: 0.7633 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.118 Still best_val_rmse: 0.7633 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.108 Still best_val_rmse: 0.7633 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.037 Still best_val_rmse: 0.7633 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.038 Still best_val_rmse: 0.7633 (from epoch 0)


[32m[I 2021-07-18 17:35:48,443][0m Trial 9 finished with value: 0.763325572013855 and parameters: {'base_lr': 9.109929231927102e-05, 'last_lr': 0.0011340424636788516}. Best is trial 5 with value: 0.4964250326156616.[0m



##### Using fold 0
##### Using base_lr 6.2968821405157e-05 last_lr 0.003659047748949041


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9379 New best_val_rmse: 0.9379

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.025 Still best_val_rmse: 0.9379 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8499 New best_val_rmse: 0.8499

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6561 New best_val_rmse: 0.6561

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.8844 Still best_val_rmse: 0.6561 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.795 Still best_val_rmse: 0.6561 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6214 New best_val_rmse: 0.6214

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5936 New best_val_rmse: 0.5936

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.8482 Still best_val_rmse: 0.5936 (from epoch 0)

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5645 New best_val_rmse

[32m[I 2021-07-18 17:47:06,198][0m Trial 10 finished with value: 0.4857335090637207 and parameters: {'base_lr': 6.2968821405157e-05, 'last_lr': 0.003659047748949041}. Best is trial 10 with value: 0.4857335090637207.[0m



##### Using fold 0
##### Using base_lr 6.155021772017101e-05 last_lr 0.004642225106260296


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9218 New best_val_rmse: 0.9218

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9639 Still best_val_rmse: 0.9218 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7486 New best_val_rmse: 0.7486

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6446 New best_val_rmse: 0.6446

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6073 New best_val_rmse: 0.6073

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6321 Still best_val_rmse: 0.6073 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5637 New best_val_rmse: 0.5637

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5644 Still best_val_rmse: 0.5637 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.655 Still best_val_rmse: 0.5637 (from epoch 0)

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5554 New best_val_rms

[32m[I 2021-07-18 18:00:06,404][0m Trial 11 finished with value: 0.4787840247154236 and parameters: {'base_lr': 6.155021772017101e-05, 'last_lr': 0.004642225106260296}. Best is trial 11 with value: 0.4787840247154236.[0m



##### Using fold 0
##### Using base_lr 5.616824860695961e-05 last_lr 0.004562971767435031


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9442 New best_val_rmse: 0.9442

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.101 Still best_val_rmse: 0.9442 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7399 New best_val_rmse: 0.7399

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6475 New best_val_rmse: 0.6475

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7218 Still best_val_rmse: 0.6475 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6749 Still best_val_rmse: 0.6475 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6236 New best_val_rmse: 0.6236

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5901 New best_val_rmse: 0.5901

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.8255 Still best_val_rmse: 0.5901 (from epoch 0)

16 steps took 12.8 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5844 New best_val_rms

[32m[I 2021-07-18 18:09:57,725][0m Trial 12 finished with value: 0.5160984992980957 and parameters: {'base_lr': 5.616824860695961e-05, 'last_lr': 0.004562971767435031}. Best is trial 11 with value: 0.4787840247154236.[0m



##### Using fold 0
##### Using base_lr 5.2915478433349325e-05 last_lr 0.00372059827950533


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9706 New best_val_rmse: 0.9706

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9569 New best_val_rmse: 0.9569

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7064 New best_val_rmse: 0.7064

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7697 Still best_val_rmse: 0.7064 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6719 New best_val_rmse: 0.6719

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6637 New best_val_rmse: 0.6637

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6762 Still best_val_rmse: 0.6637 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6094 New best_val_rmse: 0.6094

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.775 Still best_val_rmse: 0.6094 (from epoch 0)


[32m[I 2021-07-18 18:13:33,042][0m Trial 13 finished with value: 0.6094065308570862 and parameters: {'base_lr': 5.2915478433349325e-05, 'last_lr': 0.00372059827950533}. Best is trial 11 with value: 0.4787840247154236.[0m



##### Using fold 0
##### Using base_lr 8.522296358704523e-05 last_lr 0.0023865771027824435


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9054 New best_val_rmse: 0.9054

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8695 New best_val_rmse: 0.8695

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8397 New best_val_rmse: 0.8397

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7787 New best_val_rmse: 0.7787

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6336 New best_val_rmse: 0.6336

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6978 Still best_val_rmse: 0.6336 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6725 Still best_val_rmse: 0.6336 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7524 Still best_val_rmse: 0.6336 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.796 Still best_val_rmse: 0.6336 (from epoch 0)


[32m[I 2021-07-18 18:17:06,738][0m Trial 14 finished with value: 0.6335976719856262 and parameters: {'base_lr': 8.522296358704523e-05, 'last_lr': 0.0023865771027824435}. Best is trial 11 with value: 0.4787840247154236.[0m



##### Using fold 0
##### Using base_lr 4.781960529449164e-05 last_lr 0.0023509276681760484


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.984 New best_val_rmse: 0.984

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8348 New best_val_rmse: 0.8348

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8025 New best_val_rmse: 0.8025

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.9103 Still best_val_rmse: 0.8025 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6157 New best_val_rmse: 0.6157

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.673 Still best_val_rmse: 0.6157 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5732 New best_val_rmse: 0.5732

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.57 New best_val_rmse: 0.57

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6999 Still best_val_rmse: 0.57 (from epoch 0)

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5672 New best_val_rmse: 0.5672

16 steps took 

[32m[I 2021-07-18 18:27:33,519][0m Trial 15 finished with value: 0.49745792150497437 and parameters: {'base_lr': 4.781960529449164e-05, 'last_lr': 0.0023509276681760484}. Best is trial 11 with value: 0.4787840247154236.[0m



##### Using fold 0
##### Using base_lr 8.768357562742521e-05 last_lr 0.0004004954918230132


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8625 New best_val_rmse: 0.8625

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.906 Still best_val_rmse: 0.8625 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8937 Still best_val_rmse: 0.8625 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7067 New best_val_rmse: 0.7067

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.654 New best_val_rmse: 0.654

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6079 New best_val_rmse: 0.6079

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5513 New best_val_rmse: 0.5513

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5738 Still best_val_rmse: 0.5513 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.835 Still best_val_rmse: 0.5513 (from epoch 0)

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.561 Still best_val_rmse:

[32m[I 2021-07-18 18:37:27,281][0m Trial 16 finished with value: 0.5162623524665833 and parameters: {'base_lr': 8.768357562742521e-05, 'last_lr': 0.0004004954918230132}. Best is trial 11 with value: 0.4787840247154236.[0m



##### Using fold 0
##### Using base_lr 3.10563052114089e-05 last_lr 0.002255757409956735


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.0 New best_val_rmse: 1.0

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9558 New best_val_rmse: 0.9558

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6664 New best_val_rmse: 0.6664

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6105 New best_val_rmse: 0.6105

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.8823 Still best_val_rmse: 0.6105 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.9202 Still best_val_rmse: 0.6105 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6787 Still best_val_rmse: 0.6105 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6385 Still best_val_rmse: 0.6105 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5709 New best_val_rmse: 0.5709

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.557 New best_val_rmse: 0.5

[32m[I 2021-07-18 18:47:19,161][0m Trial 17 finished with value: 0.513886034488678 and parameters: {'base_lr': 3.10563052114089e-05, 'last_lr': 0.002255757409956735}. Best is trial 11 with value: 0.4787840247154236.[0m


##### Using fold 0
##### Using base_lr 0.00014923663057185219 last_lr 0.0036700968666363615


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8881 New best_val_rmse: 0.8881

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8841 New best_val_rmse: 0.8841

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.182 Still best_val_rmse: 0.8841 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.226 Still best_val_rmse: 0.8841 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.048 Still best_val_rmse: 0.8841 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.034 Still best_val_rmse: 0.8841 (from epoch 0)


[32m[I 2021-07-18 18:49:46,322][0m Trial 18 finished with value: 0.8841065764427185 and parameters: {'base_lr': 0.00014923663057185219, 'last_lr': 0.0036700968666363615}. Best is trial 11 with value: 0.4787840247154236.[0m



##### Using fold 0
##### Using base_lr 4.505946550034719e-05 last_lr 0.0006950808662543709


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9583 New best_val_rmse: 0.9583

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.059 Still best_val_rmse: 0.9583 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7487 New best_val_rmse: 0.7487

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8165 Still best_val_rmse: 0.7487 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6827 New best_val_rmse: 0.6827

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7945 Still best_val_rmse: 0.6827 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6125 New best_val_rmse: 0.6125

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6123 New best_val_rmse: 0.6123

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.644 Still best_val_rmse: 0.6123 (from epoch 0)


[32m[I 2021-07-18 18:53:19,779][0m Trial 19 finished with value: 0.612310528755188 and parameters: {'base_lr': 4.505946550034719e-05, 'last_lr': 0.0006950808662543709}. Best is trial 11 with value: 0.4787840247154236.[0m





[32m[I 2021-07-18 18:53:19,782][0m A new study created in memory with name: no-name-d87b0b41-1a2b-4eb6-bc8d-98a7259c35b7[0m


 Best value:  0.4787840247154236
 Best params: 
    base_lr: 6.155021772017101e-05
    last_lr: 0.004642225106260296
##### Using fold 1
##### Using base_lr 6.958061098448612e-05 last_lr 0.00010150127149249639


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7689 New best_val_rmse: 0.7689

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7523 New best_val_rmse: 0.7523

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6449 New best_val_rmse: 0.6449

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6127 New best_val_rmse: 0.6127

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6135 Still best_val_rmse: 0.6127 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6266 Still best_val_rmse: 0.6127 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6024 New best_val_rmse: 0.6024

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5689 New best_val_rmse: 0.5689

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5409 New best_val_rmse: 0.5409

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.6043 Still best_val_rmse: 0.5409 (from epoch 0)

16 st

[32m[I 2021-07-18 19:18:19,336][0m Trial 0 finished with value: 0.45818933844566345 and parameters: {'base_lr': 6.958061098448612e-05, 'last_lr': 0.00010150127149249639}. Best is trial 0 with value: 0.45818933844566345.[0m



##### Using fold 1
##### Using base_lr 0.00043321692350401365 last_lr 0.0007649685886782807


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.019 New best_val_rmse: 1.019

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.152 Still best_val_rmse: 1.019 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.008 New best_val_rmse: 1.008

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.01 Still best_val_rmse: 1.008 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.007 New best_val_rmse: 1.007

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.012 Still best_val_rmse: 1.007 (from epoch 0)


[32m[I 2021-07-18 19:20:48,042][0m Trial 1 finished with value: 1.0074111223220825 and parameters: {'base_lr': 0.00043321692350401365, 'last_lr': 0.0007649685886782807}. Best is trial 0 with value: 0.45818933844566345.[0m



##### Using fold 1
##### Using base_lr 0.00016818585976809472 last_lr 8.40918161514374e-05


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.707 New best_val_rmse: 0.707

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8461 Still best_val_rmse: 0.707 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 2.19 Still best_val_rmse: 0.707 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.003 Still best_val_rmse: 0.707 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.9951 Still best_val_rmse: 0.707 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.8266 Still best_val_rmse: 0.707 (from epoch 0)


[32m[I 2021-07-18 19:23:16,692][0m Trial 2 finished with value: 0.7069610953330994 and parameters: {'base_lr': 0.00016818585976809472, 'last_lr': 8.40918161514374e-05}. Best is trial 0 with value: 0.45818933844566345.[0m



##### Using fold 1
##### Using base_lr 0.0001269955858196054 last_lr 0.0008131643017437049


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.763 New best_val_rmse: 0.763

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8391 Still best_val_rmse: 0.763 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9406 Still best_val_rmse: 0.763 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.665 New best_val_rmse: 0.665

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.034 Still best_val_rmse: 0.665 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.123 Still best_val_rmse: 0.665 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.041 Still best_val_rmse: 0.665 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.033 Still best_val_rmse: 0.665 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.023 Still best_val_rmse: 0.665 (from epoch 0)


[32m[I 2021-07-18 19:26:49,982][0m Trial 3 finished with value: 0.6649777889251709 and parameters: {'base_lr': 0.0001269955858196054, 'last_lr': 0.0008131643017437049}. Best is trial 0 with value: 0.45818933844566345.[0m



##### Using fold 1
##### Using base_lr 0.0001363275165644892 last_lr 0.0017510925487700946


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7133 New best_val_rmse: 0.7133

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7225 Still best_val_rmse: 0.7133 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9689 Still best_val_rmse: 0.7133 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8951 Still best_val_rmse: 0.7133 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.9028 Still best_val_rmse: 0.7133 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.139 Still best_val_rmse: 0.7133 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.007 Still best_val_rmse: 0.7133 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.012 Still best_val_rmse: 0.7133 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.014 Still best_val_rmse: 0.7133 (from epoch 0)


[32m[I 2021-07-18 19:30:24,047][0m Trial 4 finished with value: 0.7133458256721497 and parameters: {'base_lr': 0.0001363275165644892, 'last_lr': 0.0017510925487700946}. Best is trial 0 with value: 0.45818933844566345.[0m



##### Using fold 1
##### Using base_lr 5.2677458973125453e-05 last_lr 0.0002728708000650865


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9263 New best_val_rmse: 0.9263

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6716 New best_val_rmse: 0.6716

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6684 New best_val_rmse: 0.6684

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6598 New best_val_rmse: 0.6598

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6181 New best_val_rmse: 0.6181

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5612 New best_val_rmse: 0.5612

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.8101 Still best_val_rmse: 0.5612 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6445 Still best_val_rmse: 0.5612 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5298 New best_val_rmse: 0.5298

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5263 New best_val_rmse: 0.5263

16 steps took 12.1 sec

[32m[I 2021-07-18 19:54:06,451][0m Trial 5 finished with value: 0.4601954519748688 and parameters: {'base_lr': 5.2677458973125453e-05, 'last_lr': 0.0002728708000650865}. Best is trial 0 with value: 0.45818933844566345.[0m



##### Using fold 1
##### Using base_lr 0.000126982597021093 last_lr 0.00043746697815513454


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.729 New best_val_rmse: 0.729

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7682 Still best_val_rmse: 0.729 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7584 Still best_val_rmse: 0.729 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6673 New best_val_rmse: 0.6673

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.215 Still best_val_rmse: 0.6673 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.007 Still best_val_rmse: 0.6673 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.021 Still best_val_rmse: 0.6673 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.008 Still best_val_rmse: 0.6673 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.015 Still best_val_rmse: 0.6673 (from epoch 0)


[32m[I 2021-07-18 19:57:41,648][0m Trial 6 finished with value: 0.667289137840271 and parameters: {'base_lr': 0.000126982597021093, 'last_lr': 0.00043746697815513454}. Best is trial 0 with value: 0.45818933844566345.[0m



##### Using fold 1
##### Using base_lr 0.0001757311713624694 last_lr 0.0002838836177636023


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7166 New best_val_rmse: 0.7166

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.159 Still best_val_rmse: 0.7166 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.127 Still best_val_rmse: 0.7166 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.025 Still best_val_rmse: 0.7166 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.011 Still best_val_rmse: 0.7166 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.016 Still best_val_rmse: 0.7166 (from epoch 0)


[32m[I 2021-07-18 20:00:09,074][0m Trial 7 finished with value: 0.7166176438331604 and parameters: {'base_lr': 0.0001757311713624694, 'last_lr': 0.0002838836177636023}. Best is trial 0 with value: 0.45818933844566345.[0m



##### Using fold 1
##### Using base_lr 0.000281983561784283 last_lr 0.00011011481218420704


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9047 New best_val_rmse: 0.9047

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.218 Still best_val_rmse: 0.9047 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.014 Still best_val_rmse: 0.9047 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.021 Still best_val_rmse: 0.9047 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.012 Still best_val_rmse: 0.9047 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.015 Still best_val_rmse: 0.9047 (from epoch 0)


[32m[I 2021-07-18 20:02:35,731][0m Trial 8 finished with value: 0.9046867489814758 and parameters: {'base_lr': 0.000281983561784283, 'last_lr': 0.00011011481218420704}. Best is trial 0 with value: 0.45818933844566345.[0m



##### Using fold 1
##### Using base_lr 8.902912488113375e-05 last_lr 0.00023076670834499074


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7079 New best_val_rmse: 0.7079

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6837 New best_val_rmse: 0.6837

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9618 Still best_val_rmse: 0.6837 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6293 New best_val_rmse: 0.6293

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6916 Still best_val_rmse: 0.6293 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5629 New best_val_rmse: 0.5629

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6269 Still best_val_rmse: 0.5629 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5378 New best_val_rmse: 0.5378

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5174 New best_val_rmse: 0.5174

16 steps took 13.0 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5937 Still best_val_rmse: 0.5174 (fro

[32m[I 2021-07-18 20:31:17,972][0m Trial 9 finished with value: 0.447449654340744 and parameters: {'base_lr': 8.902912488113375e-05, 'last_lr': 0.00023076670834499074}. Best is trial 9 with value: 0.447449654340744.[0m



##### Using fold 1
##### Using base_lr 4.067727869504078e-05 last_lr 0.004465318938487712


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.048 New best_val_rmse: 1.048

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6598 New best_val_rmse: 0.6598

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6413 New best_val_rmse: 0.6413

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6951 Still best_val_rmse: 0.6413 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.8654 Still best_val_rmse: 0.6413 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.727 Still best_val_rmse: 0.6413 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.9201 Still best_val_rmse: 0.6413 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7306 Still best_val_rmse: 0.6413 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5834 New best_val_rmse: 0.5834

16 steps took 12.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5642 N

In [40]:
for i in range(2, 3):
    fold = i
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20)
    print(" Best value: ", study.best_trial.value)
    print(" Best params: ")
    for key, value in study.best_trial.params.items():
        print(f"    {key}: {value}")

[32m[I 2021-07-19 08:03:49,382][0m A new study created in memory with name: no-name-1b0b8ca5-bbbf-4207-94cf-b93393383ddf[0m


##### Using fold 2
##### Using base_lr 0.0003552760540192187 last_lr 0.0038043117994500013


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 14.0 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.485 New best_val_rmse: 1.485

16 steps took 11.8 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.086 New best_val_rmse: 1.086

16 steps took 11.9 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.38 Still best_val_rmse: 1.086 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.098 Still best_val_rmse: 1.086 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.076 New best_val_rmse: 1.076

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.101 Still best_val_rmse: 1.076 (from epoch 0)


[32m[I 2021-07-19 08:06:17,801][0m Trial 0 finished with value: 1.07648766040802 and parameters: {'base_lr': 0.0003552760540192187, 'last_lr': 0.0038043117994500013}. Best is trial 0 with value: 1.07648766040802.[0m



##### Using fold 2
##### Using base_lr 4.57661676399739e-05 last_lr 0.0004619110960854159


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.163 New best_val_rmse: 1.163

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7022 New best_val_rmse: 0.7022

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.722 Still best_val_rmse: 0.7022 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7581 Still best_val_rmse: 0.7022 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6998 New best_val_rmse: 0.6998

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.8563 Still best_val_rmse: 0.6998 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5806 New best_val_rmse: 0.5806

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5663 New best_val_rmse: 0.5663

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6074 Still best_val_rmse: 0.5663 (from epoch 0)

16 steps took 12.8 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5729 Still best_val_rms

[32m[I 2021-07-19 08:16:55,396][0m Trial 1 finished with value: 0.49514997005462646 and parameters: {'base_lr': 4.57661676399739e-05, 'last_lr': 0.0004619110960854159}. Best is trial 1 with value: 0.49514997005462646.[0m



##### Using fold 2
##### Using base_lr 0.00015686819593532155 last_lr 0.00043328911914538524


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.168 New best_val_rmse: 1.168

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.075 New best_val_rmse: 1.075

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.094 Still best_val_rmse: 1.075 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.076 Still best_val_rmse: 1.075 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.063 New best_val_rmse: 1.063

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.097 Still best_val_rmse: 1.063 (from epoch 0)


[32m[I 2021-07-19 08:19:22,081][0m Trial 2 finished with value: 1.0626918077468872 and parameters: {'base_lr': 0.00015686819593532155, 'last_lr': 0.00043328911914538524}. Best is trial 1 with value: 0.49514997005462646.[0m



##### Using fold 2
##### Using base_lr 0.00011472919004535602 last_lr 0.0017329266411097838


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8314 New best_val_rmse: 0.8314

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8183 New best_val_rmse: 0.8183

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9561 Still best_val_rmse: 0.8183 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7514 New best_val_rmse: 0.7514

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.094 Still best_val_rmse: 0.7514 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.685 New best_val_rmse: 0.685

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.9212 Still best_val_rmse: 0.685 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6612 New best_val_rmse: 0.6612

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5858 New best_val_rmse: 0.5858

16 steps took 12.8 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.612 Still best_val_rmse: 0.5858 (from epo

[32m[I 2021-07-19 08:31:34,364][0m Trial 3 finished with value: 0.4841878116130829 and parameters: {'base_lr': 0.00011472919004535602, 'last_lr': 0.0017329266411097838}. Best is trial 3 with value: 0.4841878116130829.[0m



##### Using fold 2
##### Using base_lr 7.48556563348634e-05 last_lr 0.0004908001885823281


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8619 New best_val_rmse: 0.8619

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.823 New best_val_rmse: 0.823

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9686 Still best_val_rmse: 0.823 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8791 Still best_val_rmse: 0.823 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6282 New best_val_rmse: 0.6282

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6561 Still best_val_rmse: 0.6282 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5937 New best_val_rmse: 0.5937

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6405 Still best_val_rmse: 0.5937 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6364 Still best_val_rmse: 0.5937 (from epoch 0)

16 steps took 12.8 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5538 Ne

[32m[I 2021-07-19 08:44:27,165][0m Trial 4 finished with value: 0.4840955138206482 and parameters: {'base_lr': 7.48556563348634e-05, 'last_lr': 0.0004908001885823281}. Best is trial 4 with value: 0.4840955138206482.[0m



##### Using fold 2
##### Using base_lr 3.057078843312379e-05 last_lr 0.0009549025910941674


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.027 New best_val_rmse: 1.027

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8379 New best_val_rmse: 0.8379

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8646 Still best_val_rmse: 0.8379 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8617 Still best_val_rmse: 0.8379 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6574 New best_val_rmse: 0.6574

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.604 New best_val_rmse: 0.604

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5889 New best_val_rmse: 0.5889

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6779 Still best_val_rmse: 0.5889 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.585 New best_val_rmse: 0.585

16 steps took 12.8 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5649 New best_val_rmse: 0.5649

16 steps to

[32m[I 2021-07-19 08:54:33,799][0m Trial 5 finished with value: 0.49941134452819824 and parameters: {'base_lr': 3.057078843312379e-05, 'last_lr': 0.0009549025910941674}. Best is trial 4 with value: 0.4840955138206482.[0m



##### Using fold 2
##### Using base_lr 0.00040362445879098193 last_lr 0.0006691421017614235


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.195 New best_val_rmse: 1.195

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.073 New best_val_rmse: 1.073

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.125 Still best_val_rmse: 1.073 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.073 Still best_val_rmse: 1.073 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.093 Still best_val_rmse: 1.073 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.069 New best_val_rmse: 1.069


[32m[I 2021-07-19 08:57:00,663][0m Trial 6 finished with value: 1.0692239999771118 and parameters: {'base_lr': 0.00040362445879098193, 'last_lr': 0.0006691421017614235}. Best is trial 4 with value: 0.4840955138206482.[0m



##### Using fold 2
##### Using base_lr 6.285357587314095e-05 last_lr 0.0009582771584342349


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9278 New best_val_rmse: 0.9278

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8398 New best_val_rmse: 0.8398

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7729 New best_val_rmse: 0.7729

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8208 Still best_val_rmse: 0.7729 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6994 New best_val_rmse: 0.6994

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7497 Still best_val_rmse: 0.6994 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5686 New best_val_rmse: 0.5686

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5668 New best_val_rmse: 0.5668

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5898 Still best_val_rmse: 0.5668 (from epoch 0)

16 steps took 12.8 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5908 Still best_val_rmse: 0.5668 (fro

[32m[I 2021-07-19 09:09:19,818][0m Trial 7 finished with value: 0.4850175082683563 and parameters: {'base_lr': 6.285357587314095e-05, 'last_lr': 0.0009582771584342349}. Best is trial 4 with value: 0.4840955138206482.[0m



##### Using fold 2
##### Using base_lr 0.0002314602967657199 last_lr 9.851194497387779e-05


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.012 New best_val_rmse: 1.012

16 steps took 11.9 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.08 Still best_val_rmse: 1.012 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.218 Still best_val_rmse: 1.012 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.071 Still best_val_rmse: 1.012 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.079 Still best_val_rmse: 1.012 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.086 Still best_val_rmse: 1.012 (from epoch 0)


[32m[I 2021-07-19 09:11:47,707][0m Trial 8 finished with value: 1.0120644569396973 and parameters: {'base_lr': 0.0002314602967657199, 'last_lr': 9.851194497387779e-05}. Best is trial 4 with value: 0.4840955138206482.[0m



##### Using fold 2
##### Using base_lr 5.400989949741741e-05 last_lr 0.00011196462142611125


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.09 New best_val_rmse: 1.09

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9043 New best_val_rmse: 0.9043

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7569 New best_val_rmse: 0.7569

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8641 Still best_val_rmse: 0.7569 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.731 New best_val_rmse: 0.731

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6332 New best_val_rmse: 0.6332

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5742 New best_val_rmse: 0.5742

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.567 New best_val_rmse: 0.567

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5711 Still best_val_rmse: 0.567 (from epoch 0)

16 steps took 12.8 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.6086 Still best_val_rmse: 0.567 (from epoch 0)

16 steps took 1

[32m[I 2021-07-19 09:24:31,447][0m Trial 9 finished with value: 0.4813994765281677 and parameters: {'base_lr': 5.400989949741741e-05, 'last_lr': 0.00011196462142611125}. Best is trial 9 with value: 0.4813994765281677.[0m



##### Using fold 2
##### Using base_lr 3.0349228979439946e-05 last_lr 8.356763721839508e-05


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.07 New best_val_rmse: 1.07

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9153 New best_val_rmse: 0.9153

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7683 New best_val_rmse: 0.7683

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7944 Still best_val_rmse: 0.7683 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6394 New best_val_rmse: 0.6394

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5858 New best_val_rmse: 0.5858

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6185 Still best_val_rmse: 0.5858 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.634 Still best_val_rmse: 0.5858 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5752 New best_val_rmse: 0.5752

16 steps took 12.8 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.54 New best_val_rmse: 0.54

16 steps took 

[32m[I 2021-07-19 09:36:17,501][0m Trial 10 finished with value: 0.487863153219223 and parameters: {'base_lr': 3.0349228979439946e-05, 'last_lr': 8.356763721839508e-05}. Best is trial 9 with value: 0.4813994765281677.[0m



##### Using fold 2
##### Using base_lr 7.604566984349528e-05 last_lr 0.00013966559365722486


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8743 New best_val_rmse: 0.8743

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7826 New best_val_rmse: 0.7826

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9785 Still best_val_rmse: 0.7826 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7343 New best_val_rmse: 0.7343

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.9431 Still best_val_rmse: 0.7343 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6562 New best_val_rmse: 0.6562

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7393 Still best_val_rmse: 0.6562 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5712 New best_val_rmse: 0.5712

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.576 Still best_val_rmse: 0.5712 (from epoch 0)

16 steps took 12.8 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.579 Still best_val_rm

[32m[I 2021-07-19 09:49:10,119][0m Trial 11 finished with value: 0.47985729575157166 and parameters: {'base_lr': 7.604566984349528e-05, 'last_lr': 0.00013966559365722486}. Best is trial 11 with value: 0.47985729575157166.[0m



##### Using fold 2
##### Using base_lr 8.661283535667235e-05 last_lr 0.00018490772170980246


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8598 New best_val_rmse: 0.8598

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.893 Still best_val_rmse: 0.8598 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9614 Still best_val_rmse: 0.8598 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.9419 Still best_val_rmse: 0.8598 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7173 New best_val_rmse: 0.7173

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.8428 Still best_val_rmse: 0.7173 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.9031 Still best_val_rmse: 0.7173 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.043 Still best_val_rmse: 0.7173 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.031 Still best_val_rmse: 0.7173 (from epoch 0)


[32m[I 2021-07-19 09:52:43,132][0m Trial 12 finished with value: 0.7173497080802917 and parameters: {'base_lr': 8.661283535667235e-05, 'last_lr': 0.00018490772170980246}. Best is trial 11 with value: 0.47985729575157166.[0m



##### Using fold 2
##### Using base_lr 5.495320159797113e-05 last_lr 0.0001949936611791773


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.108 New best_val_rmse: 1.108

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.935 New best_val_rmse: 0.935

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8139 New best_val_rmse: 0.8139

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8465 Still best_val_rmse: 0.8139 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6577 New best_val_rmse: 0.6577

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6899 Still best_val_rmse: 0.6577 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7097 Still best_val_rmse: 0.6577 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5827 New best_val_rmse: 0.5827

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.564 New best_val_rmse: 0.564

16 steps took 12.8 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5867 Still best_val_rmse: 0.564 (from epoch

[32m[I 2021-07-19 10:05:27,306][0m Trial 13 finished with value: 0.4842141568660736 and parameters: {'base_lr': 5.495320159797113e-05, 'last_lr': 0.0001949936611791773}. Best is trial 11 with value: 0.47985729575157166.[0m



##### Using fold 2
##### Using base_lr 4.075877307284862e-05 last_lr 0.0001624498006278733


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.044 New best_val_rmse: 1.044

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8562 New best_val_rmse: 0.8562

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9492 Still best_val_rmse: 0.8562 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7744 New best_val_rmse: 0.7744

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.857 Still best_val_rmse: 0.7744 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7259 New best_val_rmse: 0.7259

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6584 New best_val_rmse: 0.6584

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7345 Still best_val_rmse: 0.6584 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6501 New best_val_rmse: 0.6501


[32m[I 2021-07-19 10:09:00,133][0m Trial 14 finished with value: 0.6501085162162781 and parameters: {'base_lr': 4.075877307284862e-05, 'last_lr': 0.0001624498006278733}. Best is trial 11 with value: 0.47985729575157166.[0m



##### Using fold 2
##### Using base_lr 0.00010637579365513648 last_lr 8.276647346442369e-05


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.158 New best_val_rmse: 1.158

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7568 New best_val_rmse: 0.7568

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8052 Still best_val_rmse: 0.7568 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7244 New best_val_rmse: 0.7244

16 steps took 12.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7129 New best_val_rmse: 0.7129

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6232 New best_val_rmse: 0.6232

16 steps took 12.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6115 New best_val_rmse: 0.6115

16 steps took 12.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6461 Still best_val_rmse: 0.6115 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5694 New best_val_rmse: 0.5694

16 steps took 12.8 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.6209 Still best_val_rmse: 0.5694 (from epoch 0)

16 step

[32m[I 2021-07-19 10:25:31,053][0m Trial 15 finished with value: 0.47160205245018005 and parameters: {'base_lr': 0.00010637579365513648, 'last_lr': 8.276647346442369e-05}. Best is trial 15 with value: 0.47160205245018005.[0m



##### Using fold 2
##### Using base_lr 0.00014957407459464006 last_lr 0.00028294729583032354


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8874 New best_val_rmse: 0.8874

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7423 New best_val_rmse: 0.7423

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8506 Still best_val_rmse: 0.7423 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.049 Still best_val_rmse: 0.7423 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.128 Still best_val_rmse: 0.7423 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.083 Still best_val_rmse: 0.7423 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.06 Still best_val_rmse: 0.7423 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.059 Still best_val_rmse: 0.7423 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.065 Still best_val_rmse: 0.7423 (from epoch 0)


[32m[I 2021-07-19 10:29:02,494][0m Trial 16 finished with value: 0.7422624826431274 and parameters: {'base_lr': 0.00014957407459464006, 'last_lr': 0.00028294729583032354}. Best is trial 15 with value: 0.47160205245018005.[0m



##### Using fold 2
##### Using base_lr 0.00010197064730942197 last_lr 0.00012323780507671805


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.067 New best_val_rmse: 1.067

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8543 New best_val_rmse: 0.8543

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.097 Still best_val_rmse: 0.8543 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.062 Still best_val_rmse: 0.8543 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.073 Still best_val_rmse: 0.8543 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.069 Still best_val_rmse: 0.8543 (from epoch 0)


[32m[I 2021-07-19 10:31:29,153][0m Trial 17 finished with value: 0.8542517423629761 and parameters: {'base_lr': 0.00010197064730942197, 'last_lr': 0.00012323780507671805}. Best is trial 15 with value: 0.47160205245018005.[0m



##### Using fold 2
##### Using base_lr 0.00020300382337553154 last_lr 0.0002578690732592039


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.018 New best_val_rmse: 1.018

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.161 Still best_val_rmse: 1.018 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.098 Still best_val_rmse: 1.018 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.08 Still best_val_rmse: 1.018 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.077 Still best_val_rmse: 1.018 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.096 Still best_val_rmse: 1.018 (from epoch 0)


[32m[I 2021-07-19 10:33:56,418][0m Trial 18 finished with value: 1.0176031589508057 and parameters: {'base_lr': 0.00020300382337553154, 'last_lr': 0.0002578690732592039}. Best is trial 15 with value: 0.47160205245018005.[0m



##### Using fold 2
##### Using base_lr 8.121779661903581e-05 last_lr 8.341126960078048e-05


Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-large-discriminator and are newly initialized: ['classifier

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 13.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8434 New best_val_rmse: 0.8434

16 steps took 12.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8109 New best_val_rmse: 0.8109

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7472 New best_val_rmse: 0.7472

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.109 Still best_val_rmse: 0.7472 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.091 Still best_val_rmse: 0.7472 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.094 Still best_val_rmse: 0.7472 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.072 Still best_val_rmse: 0.7472 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.06 Still best_val_rmse: 0.7472 (from epoch 0)

16 steps took 12.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.062 Still best_val_rmse: 0.7472 (from epoch 0)


[32m[I 2021-07-19 10:37:28,281][0m Trial 19 finished with value: 0.7471550703048706 and parameters: {'base_lr': 8.121779661903581e-05, 'last_lr': 8.341126960078048e-05}. Best is trial 15 with value: 0.47160205245018005.[0m



 Best value:  0.47160205245018005
 Best params: 
    base_lr: 0.00010637579365513648
    last_lr: 8.276647346442369e-05


### Verify the model

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm

In [None]:
cfg.model_offset = 0
cfg.model_limit = 6
cfg.n_folds = 5
cfg.svm_kernels = ['rbf']
cfg.svm_c = 5

In [None]:
num_bins = int(np.ceil(np.log2(len(train_df))))
train_df['bins'] = pd.cut(train_df['target'], bins=num_bins, labels=False)
bins = train_df['bins'].values

In [None]:
%%time

inference_models = []
for i in range(1, cfg.NUM_FOLDS + 1):
    print(f'Model {i}')
    inference_model = CommonLitModel()
    inference_model = inference_model.cuda()
    inference_model.load_state_dict(torch.load(str(MODELS_PATH/f"{cfg.model_name.replace('/', '_')}_{i}/model_{i}.pth")))
    inference_model.eval();
    inference_models.append(inference_model)

In [None]:
from transformers import RobertaTokenizer

tokenizers = []
for i in range(1, cfg.NUM_FOLDS):
    tokenizer = RobertaTokenizer.from_pretrained(MODELS_PATH/f"{cfg.model_name.replace('/', '_')}_{i}")
    tokenizers.append(tokenizer)

In [None]:
def get_cls_embeddings(dl, transformer_model):
    cls_embeddings = []
    with torch.no_grad():
        for input_features in tqdm(dl, total=len(dl)):
            output, context_vector = transformer_model(input_features['input_ids'].cuda(), input_features['attention_mask'].cuda())
#             cls_embeddings.extend(output['last_hidden_state'][:,0,:].detach().cpu().numpy())
            embedding_out = context_vector.detach().cpu().numpy()
            cls_embeddings.extend(embedding_out)
    return np.array(cls_embeddings)

In [None]:
def rmse_score(X, y):
    return np.sqrt(mean_squared_error(X, y))

In [None]:
def convert_to_list(t):
    return t.flatten().long()

class CommonLitDataset(nn.Module):
    def __init__(self, text, test_id, tokenizer, max_len=128):
        self.excerpt = text
        self.test_id = test_id
        self.max_len = max_len
        self.tokenizer = tokenizer
    
    def __getitem__(self,idx):
        encode = self.tokenizer(self.excerpt[idx],
                                return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',
                                truncation=True)
        return {'input_ids': convert_to_list(encode['input_ids']),
                'attention_mask': convert_to_list(encode['attention_mask']),
                'id': self.test_id[idx]}
    
    def __len__(self):
        return len(self.excerpt)

In [None]:
def create_dl(df, tokenizer):
    text = df['excerpt'].values
    ids = df['id'].values
    ds = CommonLitDataset(text, ids, tokenizer, max_len=cfg.MAX_LEN)
    return DataLoader(ds, 
                      batch_size = cfg.BATCH_SIZE,
                      shuffle=False,
                      num_workers = 1,
                      pin_memory=True,
                      drop_last=False
                     )

In [None]:
train_df = pd.read_csv(DATA_PATH/'train-orig.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')
remove_unnecessary(train_df)

In [None]:
train_target_mean = train_df['target'].mean()
train_target_std = train_df['target'].std()
train_df['normalized_target'] = (train_df['target'] - train_target_mean) / train_target_std

In [None]:
%%time

train_target = train_df['normalized_target'].values

def calc_mean(scores):
    return np.mean(np.array(scores), axis=0)

final_scores = []
final_rmse = []
kernel_rmse_score_mean = []
final_kernel_predictions_means = []
for j, (inference_model, tokenizer) in enumerate(zip(inference_models, tokenizers)):
    print('Model', j)
    test_dl = create_dl(test_df, tokenizer)
    train_dl = create_dl(train_df, tokenizer)
    transformer_model = inference_model
    transformer_model.cuda()
    X = get_cls_embeddings(train_dl, transformer_model)
    
    y = train_target
    X_test = get_cls_embeddings(test_dl, transformer_model)
    
    kfold = StratifiedKFold(n_splits=cfg.NUM_FOLDS)
    scores = []
    rmse_scores = []
    kernel_predictions_means = []
    for kernel in cfg.svm_kernels:
        print('Kernel', kernel)
        kernel_scores = []
        kernel_rmse_scores = []
        kernel_predictions = []
        for k, (train_idx, valid_idx) in enumerate(kfold.split(X, bins)):

            print('Fold', k, train_idx.shape, valid_idx.shape)
            model = SVR(C=cfg.svm_c, kernel=kernel, gamma='auto')

            X_train, y_train = X[train_idx], y[train_idx]
            X_valid, y_valid = X[valid_idx], y[valid_idx]
            model.fit(X_train, y_train)
            prediction = model.predict(X_valid)
            kernel_predictions.append(prediction)
            kernel_rmse_scores.append(rmse_score(prediction, y_valid))
            print('rmse_score', kernel_rmse_scores[k])
            kernel_scores.append(model.predict(X_test))
        kernel_predictions_means.append(np.array([np.mean(kp) for kp in kernel_predictions]).mean())
        scores.append(calc_mean(kernel_scores))
        kernel_rmse_score = calc_mean(kernel_rmse_scores)
        kernel_rmse_score_mean.append(kernel_rmse_score)
        rmse_scores.append(kernel_rmse_score)
    final_kernel_predictions_means.append(kernel_predictions_means)
    final_scores.append(calc_mean(scores))
    final_rmse.append(calc_mean(rmse_scores))
print('FINAL RMSE score', np.mean(np.array(final_rmse)))

In [None]:
final_kernel_predictions_means

In [None]:
# (train_df['target'] - cfg.train_target_mean) / cfg.train_target_std
final_scores_normalized = np.array(final_scores) * train_target_std + train_target_mean

In [None]:
kernel_rmse_score_mean_array = np.array(kernel_rmse_score_mean)
kernel_rmse_score_mean_sum = np.sum(kernel_rmse_score_mean_array)
prop_losses = kernel_rmse_score_mean_array / kernel_rmse_score_mean_sum
prop_losses_sum = (1 - prop_losses).sum()
weights = (1 - prop_losses) / prop_losses_sum
weights

In [None]:
def calc_mean(scores, weights=weights):
    return np.average(np.array(scores), weights=weights, axis=0)

In [None]:
target_mean = train_df['target'].mean()
final_scores_flat = calc_mean(final_scores_normalized).flatten()
final_scores_mean = final_scores_flat.mean()
target_mean, np.array(final_scores_normalized).mean()
# (-0.9579984513405823, -0.8029817438292849)

In [None]:
final_scores_flat

In [None]:
mean_diff = target_mean - final_scores_mean
mean_diff, mean_diff / len(final_scores)

In [None]:
sample_df['target'] = final_scores_flat + mean_diff
# sample_df['target'] = len(final_scores) / np.sum(1 / np.array(final_scores), axis=0) # harmonic mean
sample_df

### Prepare Packaging

In [None]:
cfg.model_name

In [None]:
BEST_MODEL_FOLDER = MODELS_PATH/cfg.model_name/'best'
!rm -rf {BEST_MODEL_FOLDER}
!mkdir -p {BEST_MODEL_FOLDER}

In [None]:
BEST_MODEL_FOLDER

In [None]:
cfg.NUM_FOLDS

In [None]:
bestmodels = [MODELS_PATH/f'{cfg.model_name}_{i + 1}' for i in range(0, cfg.NUM_FOLDS)]

In [None]:
bestmodels

In [None]:
from shutil import copyfile

def normalize_name(path_name):
    return path_name.replace('', '')

for i, best_model in enumerate(bestmodels):
    print(f'Processing {i}th model')
    i = i + 1
    best_model_file = f'{best_model}/model_{i}.pth'
    if Path(best_model_file).exists():
        copyfile(best_model_file, f'{BEST_MODEL_FOLDER}/{i}_pytorch_model.bin')
        tokenizer_path = Path(BEST_MODEL_FOLDER/f'tokenizer-{i}')
        tokenizer_path.mkdir(parents=True, exist_ok=True)
        assert tokenizer_path.exists()

        tokenizer_json = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/tokenizer_config.json'))
        assert tokenizer_json.exists(), f'{tokenizer_json} does not exist'
        copyfile(tokenizer_json, tokenizer_path/'tokenizer.json')

        vocab_txt = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/vocab.json'))
        assert vocab_txt.exists(), f'{vocab_txt} does not exist'
        copyfile(vocab_txt, tokenizer_path/'vocab.json')

        merges = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/merges.txt'))
        assert merges.exists()
        copyfile(merges, tokenizer_path/'merges.txt')
    else:
        print(f'{best_model_file} is missing')

In [None]:
import shutil

shutil.make_archive(MODELS_PATH/cfg.model_name/'best_models', 'zip', BEST_MODEL_FOLDER)

In [None]:
!ls {MODELS_PATH/cfg.model_name}

In [None]:
!mv {MODELS_PATH}/{cfg.model_name}.yaml {MODELS_PATH/cfg.model_name}

In [None]:
transformer_model.transformer_model.save_pretrained(save_directory=f'{MODELS_PATH/cfg.model_name}/lm')

In [None]:
!du -h {MODELS_PATH/cfg.model_name}/*

In [None]:
shutil.make_archive(MODELS_PATH/cfg.model_name/'lm', 'zip', f'{MODELS_PATH/cfg.model_name}/lm')

In [None]:
!kaggle datasets init -p {MODELS_PATH/cfg.model_name}

In [None]:
dataset_json_path = Path(MODELS_PATH/cfg.model_name/'dataset-metadata.json')
assert dataset_json_path.exists()

In [None]:
!cat {str(dataset_json_path)}

In [None]:
with open(dataset_json_path, 'r') as f:
    dataset_json = f.read()
    dataset_json = dataset_json.replace('INSERT_TITLE_HERE', f'commonlit-{cfg.model_name}-light').replace('INSERT_SLUG_HERE', f'commonlit-{cfg.model_name}-light')
    print(dataset_json)
with(open(dataset_json_path, 'w')) as f:
    f.write(dataset_json)

In [None]:
!rm -rf {MODELS_PATH/cfg.model_name}/best
!rm -rf {MODELS_PATH/cfg.model_name}/lm

In [None]:
!kaggle datasets create -p {MODELS_PATH/cfg.model_name}

In [None]:
!kaggle datasets version -p {MODELS_PATH/cfg.model_name} -m "Version with merges.txt" -d

In [None]:
state_dict = torch.load(str(MODELS_PATH/f'distilroberta-0/checkpoint-105/pytorch_model.bin'))

In [None]:
loaded_model = CommonLitModel()

In [None]:
loaded_model.load_state_dict(state_dict)