In [None]:
# !pip install optuna

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import gc, warnings, random, time, os

from pathlib import Path

from tqdm.notebook import tqdm

warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from transformers import AutoModel, AutoTokenizer, AutoConfig
from transformers import get_cosine_schedule_with_warmup

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import seaborn as sns

import gc
gc.enable()

import optuna

### Folders and Dataframes

In [2]:
DATA_PATH = Path('/home/commonlit/data/')
assert DATA_PATH.exists()
MODELS_PATH = Path('/home/commonlit/models/')
if not MODELS_PATH.exists():
    os.mkdir(MODELS_PATH)
assert MODELS_PATH.exists()

In [3]:
train_df = pd.read_csv(DATA_PATH/'train-orig.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')
sample_df = pd.read_csv(DATA_PATH/'sample_submission.csv')

In [4]:
def remove_unnecessary(df):
    df.drop(df[df['target'] == 0].index, inplace=True)
    df.reset_index(drop=True, inplace=True)
    
remove_unnecessary(train_df)

In [5]:
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845
...,...,...,...,...,...,...
2828,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2829,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2830,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2831,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


### Config and Seeding

In [6]:
class Config(): 
    NUM_FOLDS = 6
    NUM_EPOCHS = 3
    BATCH_SIZE = 16
    MAX_LEN = 248
    EVAL_SCHEDULE = [(0.50, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1., 1)]
    MODEL_PATH = 'microsoft/deberta-xlarge'
    TOKENIZER_PATH = 'microsoft/deberta-xlarge'
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    SEED = 1000
    NUM_WORKERS = 2
    MODEL_FOLDER = MODELS_PATH
    model_name = 'deberta-xlarge'
    svm_kernels = ['rbf']
    svm_c = 5

cfg = Config()

In [7]:
if not cfg.MODEL_FOLDER.exists():
    os.mkdir(cfg.MODEL_FOLDER)

In [8]:
def set_random_seed(random_seed):
    random.seed(random_seed)
    np.random.seed(random_seed)
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    torch.backends.cudnn.deterministic = True

### Dataset

In [9]:
def add_bins(train_df, num_bins):
    train_df.loc[:, 'bins'] = pd.cut(train_df['target'], bins=num_bins, labels=False)
    return num_bins

In [10]:
add_bins(train_df, cfg.NUM_FOLDS)

6

In [11]:
train_df.groupby(['bins'])['target'].agg(['count', 'mean'])

Unnamed: 0_level_0,count,mean
bins,Unnamed: 1_level_1,Unnamed: 2_level_1
0,122,-3.125765
1,441,-2.270279
2,784,-1.41215
3,886,-0.548095
4,494,0.289716
5,106,1.070237


In [12]:
tokenizer = AutoTokenizer.from_pretrained(cfg.TOKENIZER_PATH)

In [13]:
with open('../data/tokenizer.vocab.txt', 'w') as f:
    for k, v in tokenizer.vocab.items():
        f.write(f'{k}: {v}\n')

In [14]:
pad_token = '______'

In [15]:
class CommonLitDataset(Dataset):
    def __init__(self, df, tokenizer, inference_only=False):
        super().__init__()
        self.df, self.inference_only = df, inference_only
        self.text = df['excerpt'].tolist()
        self.bins = df['bins']
        if not inference_only:
            self.target = torch.tensor(df['target'].to_numpy(), dtype = torch.float32)
        
#         tokenizer.add_special_tokens({'pad_token': pad_token})
#         assert tokenizer.pad_token == pad_token
        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding = 'max_length',
            max_length = cfg.MAX_LEN,
            truncation = True,
            return_attention_mask=True
        )
        self.tokenizer = tokenizer
        
    def __getitem__(self, index):        
        input_ids = torch.tensor(self.encoded['input_ids'][index])
        attention_mask = torch.tensor(self.encoded['attention_mask'][index])
        
        if self.inference_only:
            return {'input_ids': input_ids, 'attention_mask': attention_mask}
        else:
            target = self.target[index]
            return {'input_ids': input_ids, 'attention_mask': attention_mask, 'target': target}
    
    def __len__(self):
        return len(self.df)

In [16]:
sample_ds = CommonLitDataset(train_df, tokenizer)

### Model

In [17]:
class AttentionHead(nn.Module):
    
    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        
        self.hidden_layer = nn.Linear(in_features, hidden_dim)
        self.final_layer = nn.Linear(hidden_dim, num_targets)
        self.out_features = hidden_dim
        
    def forward(self, features):
        att = torch.tanh(self.hidden_layer(features))
        score = self.final_layer(att)
        attention_weights = torch.softmax(score, dim=1)
        return attention_weights

In [18]:
config = AutoConfig.from_pretrained(cfg.MODEL_PATH)

In [19]:
config.vocab_size, tokenizer.vocab_size

(50265, 50265)

In [20]:
from transformers import AutoModelForSequenceClassification

class CommonLitModel(nn.Module):
    def __init__(self):
        super(CommonLitModel, self).__init__()
        config = AutoConfig.from_pretrained(cfg.MODEL_PATH)
        config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7
        })
        self.transformer_model = AutoModelForSequenceClassification.from_pretrained(cfg.MODEL_PATH, config=config)
        self.attention = AttentionHead(config.hidden_size, 512, 1)
        self.regressor = nn.Linear(config.hidden_size, 1)
    
    def forward(self, input_ids, attention_mask):
        hidden_states = self.transformer_model(input_ids=input_ids, attention_mask=attention_mask)['hidden_states']
        last_layer_hidden_states = hidden_states[-1]
        weights = self.attention(last_layer_hidden_states)
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1) 
        return self.regressor(context_vector), context_vector

In [21]:
sample_model = CommonLitModel()

Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.weight', 'pool

In [22]:
import re

for i, (name, param) in enumerate(sample_model.named_parameters()):
    if(name.find('layer') > -1):
        layer_name = re.sub(r'.+(layer\.\d+).+', r'\1', name)

In [23]:
for i, (name, param) in enumerate(sample_model.named_parameters()):
    print(i, name, param.size())

0 transformer_model.deberta.embeddings.word_embeddings.weight torch.Size([50265, 1024])
1 transformer_model.deberta.embeddings.LayerNorm.weight torch.Size([1024])
2 transformer_model.deberta.embeddings.LayerNorm.bias torch.Size([1024])
3 transformer_model.deberta.encoder.layer.0.attention.self.q_bias torch.Size([1024])
4 transformer_model.deberta.encoder.layer.0.attention.self.v_bias torch.Size([1024])
5 transformer_model.deberta.encoder.layer.0.attention.self.in_proj.weight torch.Size([3072, 1024])
6 transformer_model.deberta.encoder.layer.0.attention.self.pos_proj.weight torch.Size([1024, 1024])
7 transformer_model.deberta.encoder.layer.0.attention.self.pos_q_proj.weight torch.Size([1024, 1024])
8 transformer_model.deberta.encoder.layer.0.attention.self.pos_q_proj.bias torch.Size([1024])
9 transformer_model.deberta.encoder.layer.0.attention.output.dense.weight torch.Size([1024, 1024])
10 transformer_model.deberta.encoder.layer.0.attention.output.dense.bias torch.Size([1024])
11 trans

In [24]:
# sample_input_ids = torch.randint(0, 1000, [2, 248])
# sample_attention_mask = torch.randint(0, 1000, [2, 248])

In [25]:
sample_records = [sample_ds[i] for i in range(2)]

In [26]:
sample_records[0].keys()

dict_keys(['input_ids', 'attention_mask', 'target'])

In [27]:
sample_input_ids = torch.stack([r['input_ids'] for r in sample_records])
sample_attention_mask = torch.stack([r['attention_mask'] for r in sample_records])

In [28]:
sample_input_ids.shape, sample_attention_mask.shape

(torch.Size([2, 248]), torch.Size([2, 248]))

In [29]:
sample_input_ids

tensor([[    1,  1779,     5,   664,    82,  1835,     7,     5,  1011,  4294,
             6,    24,  2633,    10, 27265,  1714,  2772,     4,  2978,     9,
            41,  6291,  1310,     6,    24,    21,    10,  2608,  5252,     4,
         50118,   133,  1929,    21,  2913,    19,  1958,    12,  9830, 20790,
             6,    45,  4976,    15, 17359,     6,    53, 11122, 18331,    81,
         24271,     8,  9910,  6368,     6,   101,    10,   588,  1958,   882,
             4,    20,  3617, 38325,     8,   655,   571, 18656,    14,    56,
         14633,     5,   929,     6,    58, 39143,    19, 15039,     8, 22246,
         11538,    19, 13145, 21811,     9, 13178,     6,   101,  1958,     4,
          1578, 11720,  8402,    56,    57, 14998, 38073,    15,   106,     6,
             8, 19053,   154, 16155, 41591, 20846, 10601,    31,     5,  9836,
             4, 50118,  3750,   349,   253,     9,     5,   929,     6,    15,
             5,  2204,     6, 10601,    10,  2721,  

In [30]:
internal_out = sample_model.transformer_model(sample_input_ids, attention_mask=sample_attention_mask)

In [31]:
internal_out.keys()

odict_keys(['logits', 'hidden_states'])

In [32]:
len(internal_out.hidden_states), internal_out.hidden_states[-1].shape

(49, torch.Size([2, 248, 1024]))

In [33]:
sample_res = sample_model(sample_input_ids, sample_attention_mask)

In [34]:
sample_res[0].shape, sample_res[1].shape

(torch.Size([2, 1]), torch.Size([2, 1024]))

In [35]:
torch.sum(torch.randn([8, 496, 768]), axis=1)

tensor([[-17.6797, -28.8163,  18.6234,  ..., -11.7443,   8.2338, -43.5786],
        [  2.8545,  -1.2901, -12.1370,  ..., -29.8070, -36.4419, -13.0323],
        [-10.6063,   1.3496,  -1.7358,  ..., -18.7509, -19.3895, -39.3124],
        ...,
        [-40.0713,  37.8588, -14.1708,  ...,  28.2023, -26.2297,   9.8448],
        [-61.6582, -41.9132, -11.8418,  ...,   8.0438,  -9.5543,  -6.3620],
        [-15.7379,  19.2065,   6.3814,  ..., -17.2292,  22.6391,  25.3546]])

### Evaluation and Prediction

In [36]:
def eval_mse(model, data_loader):
    model.eval()
    mse_sum = 0
    mse_loss = nn.MSELoss(reduction='sum')
    
    with torch.no_grad():
        for batch_num, record in enumerate(data_loader):
            input_ids, attention_mask, target = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE), record['target'].to(cfg.DEVICE)
            pred, _ = model(input_ids, attention_mask)
            mse_sum += mse_loss(pred.flatten().cpu(), target.cpu())
            
    return mse_sum / len(data_loader.dataset)

In [37]:
def predict(model, data_loader):
    model.eval()
    result = []
    
    with torch.no_grad():
        for batch_num, record in tqdm(enumerate(data_loader), total=len(data_loader)):
            input_ids, attention_mask = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE)
            pred, _ = model(input_ids, attention_mask)
            result.extend(pred.flatten().to("cpu").tolist())
            
    return np.array(result)

In [38]:
sample_dl = DataLoader(sample_ds, shuffle=False, batch_size=16, num_workers=1)

### Optimizer and Sampler

In [39]:
def create_optimizer(model, base_lr=5e-5, last_lr=None):
    named_parameters = list(model.named_parameters())
    attention_param_start = 776
    regressor_param_start = 780
    roberta_parameters = named_parameters[:attention_param_start]
    attention_parameters = named_parameters[attention_param_start:regressor_param_start]
    regressor_parameters = named_parameters[regressor_param_start:]
    
    attention_group = [params for (name, params) in attention_parameters]
    regressor_group = [params for (name, params) in regressor_parameters]
    
    parameters = []
    if last_lr is not None:
        parameters.append({"params": attention_group, "lr": last_lr})
        parameters.append({"params": regressor_group, "lr": last_lr})
    else:
        parameters.append({"params": attention_group})
        parameters.append({"params": regressor_group})
        
    # Change on different models
    layer_low_threshold = 275
    layer_middle_threshold = 571
    
    for layer_num, (name, params) in enumerate(roberta_parameters):
        weight_decay = 0.0 if 'bias' in name else 0.01
        
        lr = base_lr / 2.5 # 2e-05
        if layer_num >= layer_middle_threshold:
            lr = base_lr / 0.5 # 1e-4
        elif layer_num >= layer_low_threshold:        
            lr = base_lr    
            
        parameters.append({"params": params,
                           "weight_decay": weight_decay,
                           "lr": lr})
        
    return AdamW(parameters)

In [40]:
sample_optimizer = create_optimizer(sample_model)

In [41]:
from torch.utils.data import Sampler,SequentialSampler,RandomSampler,SubsetRandomSampler
from collections import Counter

class WeightedSampler(Sampler):
    
    def __init__(self, dataset):
        
        self.indices = list(range(len(dataset)))
        self.num_samples = len(dataset)
        self.label_to_count = dict(Counter(dataset.bins))
        weights = [1/self.label_to_count[i] for i in dataset.bins]
        
        self.weights = torch.tensor(weights,dtype=torch.double)
        
    def __iter__(self):
        count = 0
        index = [self.indices[i] for i in torch.multinomial(self.weights, self.num_samples, replacement=True)]
        while count < self.num_samples:
            yield index[count]
            count += 1
    
    def __len__(self):
        return self.num_samples

### Training

In [42]:
def choose_eval_period(val_rmse):
    for rmse, period in cfg.EVAL_SCHEDULE:
        if val_rmse >= rmse:
            return period

In [43]:
def serialize_best(best_val_rmse, best_epoch, val_rmse, epoch, model, model_path):
    if not best_val_rmse or val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_epoch = epoch
        if not model_path.parent.exists():
            os.makedirs(model_path.parent)
        
#         torch.save(model.state_dict(), model_path)
        print(f"New best_val_rmse: {best_val_rmse:0.4}")
    else:       
        print(f"Still best_val_rmse: {best_val_rmse:0.4}",
              f"(from epoch {best_epoch})")
    return best_epoch, best_val_rmse

In [44]:
class Trainer():
    def __init__(self, scaler, model, model_path, train_loader, val_loader, optimizer, scheduler=None, num_epochs=cfg.NUM_EPOCHS):
        self.scaler, self.model, self.model_path, self.train_loader, self.val_loader, self.optimizer, self.scheduler, self.num_epochs = (
            scaler, model, model_path, train_loader, val_loader, optimizer, scheduler, num_epochs
        )
            
    def train(self):
        self.model.train()
        
        mse_loss = nn.MSELoss(reduction='mean')
        
        best_val_rmse = None
        best_epoch = 0
        step = 0
        last_eval_step = 0
        eval_period = cfg.EVAL_SCHEDULE[0][1]    

        start = time.time()
        val_rmse_list = []
        
        tbar = tqdm(range(self.num_epochs), total=self.num_epochs)
        for epoch in tbar:
            tbar.set_description(f'Epoch: {epoch}')
            val_rmse = None
            for batch_num, record in enumerate(self.train_loader):
                input_ids, attention_mask, target = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE), record['target'].to(cfg.DEVICE)
                
                self.optimizer.zero_grad()
                
                # Casts operations to mixed precision
                with torch.cuda.amp.autocast():
                    pred, _ = self.model(input_ids, attention_mask)
                    mse = mse_loss(pred.flatten(), target)
                    
                self.scaler.scale(mse).backward()
                self.scaler.step(self.optimizer)
                self.scaler.update()
                
#                 mse.backward()
#                 self.optimizer.step()
                
                if self.scheduler:
                    self.scheduler.step()
                    
                if step >= last_eval_step + eval_period:
                    elapsed_seconds = time.time() - start
                    num_steps = step - last_eval_step
                    print(f"\n{num_steps} steps took {elapsed_seconds:0.3} seconds")
                    last_eval_step = step
                    
                    val_rmse = np.sqrt(eval_mse(self.model, self.val_loader))
                    print(f"Epoch: {epoch} batch_num: {batch_num}", f"val_rmse: {val_rmse:0.4} ", end='')
                    
                    eval_period = choose_eval_period(val_rmse)
                    best_epoch, best_val_rmse = serialize_best(best_val_rmse, best_epoch, val_rmse, epoch, self.model, self.model_path)
                    val_rmse_list.append(val_rmse)
                    start = time.time()
                # Finish early on condition
                if epoch > 0 and best_val_rmse > 0.6 or (len(val_rmse_list) > 5 and np.array(val_rmse_list).mean() > 1.0):
                    return best_val_rmse
                
                step += 1
        return best_val_rmse

In [45]:
kfold = KFold(n_splits=cfg.NUM_FOLDS, random_state=cfg.SEED, shuffle=True)
splits = list(kfold.split(train_df))

### Optuna

In [46]:
del sample_model
gc.collect()
torch.cuda.empty_cache()

In [47]:
# Best results
# Fold 0: { 'base_lr': 4.3596909535440914e-05, 'last_lr': 0.0004188473213340135, 'epochs': 4 } Best value: 0.4722290635108948
# Fold 1: {'base_lr': 3.093409522252196e-05, 'last_lr': 0.0004074485086437216, 'epochs': 4} Best is trial 9 with value: 0.4512692391872406
# Fold 2: {'base_lr': 5.9004819673113075e-05, 'last_lr': 0.0003701804156340247, 'epochs': 5}   Best value:  0.46230143308639526
# Fold 3: {'base_lr': 3.091841397163233e-05, 'last_lr': 0.00010409734625896974, 'epochs': 4}. Best value:  0.474480539560318
# Fold 4: {'base_lr': 3.2314567372708084e-05, 'last_lr': 8.327155005618419e-05, 'epochs': 4}. Best is trial 0 with value: 0.45970267057418823
# Fold 5: {'base_lr': 3.5379120180791935e-05, 'last_lr': 0.00021137535166837663, 'epochs': 5}. Best is trial 0 with value: 0.46050626039505005

In [48]:

fold = 0

def objective(trial):
    base_lr = trial.suggest_float("base_lr", 3e-5, 5e-4, log=True)
    last_lr = trial.suggest_float("last_lr", 8e-5, 5e-3, log=True)
    epochs = trial.suggest_int('epochs', 3, 5)
    
    print(f'##### Using fold {fold}')
    print(f'##### Using base_lr {base_lr} last_lr {last_lr} epochs {epochs}')
    
    model_path = cfg.MODEL_FOLDER/f"{cfg.model_name.replace('/', '_')}_{fold + 1}/model_{fold + 1}.pth"
    
    set_random_seed(cfg.SEED + fold)
    
    tokenizer = AutoTokenizer.from_pretrained(cfg.TOKENIZER_PATH)
    
    train_indices, val_indices = splits[fold]
    train_dataset = CommonLitDataset(train_df.loc[train_indices], tokenizer)    
    val_dataset = CommonLitDataset(train_df.loc[val_indices], tokenizer)
    
    train_loader = DataLoader(train_dataset, batch_size=cfg.BATCH_SIZE,
                              drop_last=False, shuffle=True, num_workers=cfg.NUM_WORKERS)    
    val_loader = DataLoader(val_dataset, batch_size=cfg.BATCH_SIZE,
                            drop_last=False, shuffle=False, num_workers=cfg.NUM_WORKERS)
    
    model = CommonLitModel().to(cfg.DEVICE)
    
    optimizer = create_optimizer(model, base_lr=base_lr, last_lr=last_lr)
    
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_training_steps=cfg.NUM_EPOCHS * len(train_loader), 
                                                num_warmup_steps=50)
    scaler = torch.cuda.amp.GradScaler() # fp16
    
    trainer = Trainer(scaler, model, model_path, train_loader, val_loader, optimizer, 
                      scheduler = scheduler, num_epochs = epochs)
    rmse_val = trainer.train()
    
    del trainer
    del model
    del tokenizer
    del scaler
    del optimizer
    del train_loader
    del val_loader
    torch.cuda.empty_cache()
    gc.collect()
    
    return rmse_val

In [None]:
for i in range(0, 3):
    fold = i
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20)
    print(" Best value: ", study.best_trial.value)
    print(" Best params: ")
    for key, value in study.best_trial.params.items():
        print(f"    {key}: {value}")

[32m[I 2021-07-24 12:32:46,237][0m A new study created in memory with name: no-name-08343f36-e798-4667-9c07-e6b584b44ca2[0m


##### Using fold 0
##### Using base_lr 3.52061547111578e-05 last_lr 0.002526936749634316 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 16.2 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8216 New best_val_rmse: 0.8216

16 steps took 13.2 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6873 New best_val_rmse: 0.6873

16 steps took 13.3 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6986 Still best_val_rmse: 0.6873 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5683 New best_val_rmse: 0.5683

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.8753 Still best_val_rmse: 0.5683 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6333 Still best_val_rmse: 0.5683 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6638 Still best_val_rmse: 0.5683 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.643 Still best_val_rmse: 0.5683 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6932 Still best_val_rmse: 0.5683 (from epoch 0)

16 steps took 13.9 seconds
Epoch: 1 batch_num: 12

[32m[I 2021-07-24 12:45:55,623][0m Trial 0 finished with value: 0.48194995522499084 and parameters: {'base_lr': 3.52061547111578e-05, 'last_lr': 0.002526936749634316, 'epochs': 3}. Best is trial 0 with value: 0.48194995522499084.[0m



##### Using fold 0
##### Using base_lr 0.00025139675070443403 last_lr 8.48669845443173e-05 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 15.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9014 New best_val_rmse: 0.9014

16 steps took 13.5 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.899 New best_val_rmse: 0.899

16 steps took 13.5 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9366 Still best_val_rmse: 0.899 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7911 New best_val_rmse: 0.7911

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.8751 Still best_val_rmse: 0.7911 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.039 Still best_val_rmse: 0.7911 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.051 Still best_val_rmse: 0.7911 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.149 Still best_val_rmse: 0.7911 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.023 Still best_val_rmse: 0.7911 (from epoch 0)


[32m[I 2021-07-24 12:49:50,544][0m Trial 1 finished with value: 0.7910526394844055 and parameters: {'base_lr': 0.00025139675070443403, 'last_lr': 8.48669845443173e-05, 'epochs': 5}. Best is trial 0 with value: 0.48194995522499084.[0m



##### Using fold 0
##### Using base_lr 4.3596909535440914e-05 last_lr 0.0004188473213340135 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8007 New best_val_rmse: 0.8007

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7216 New best_val_rmse: 0.7216

16 steps took 13.5 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8994 Still best_val_rmse: 0.7216 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8762 Still best_val_rmse: 0.7216 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6523 New best_val_rmse: 0.6523

16 steps took 13.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.782 Still best_val_rmse: 0.6523 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6558 Still best_val_rmse: 0.6523 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5925 New best_val_rmse: 0.5925

16 steps took 13.5 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6099 Still best_val_rmse: 0.5925 (from epoch 0)

16 steps took 14.0 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5123

[32m[I 2021-07-24 13:20:56,919][0m Trial 2 finished with value: 0.4722290635108948 and parameters: {'base_lr': 4.3596909535440914e-05, 'last_lr': 0.0004188473213340135, 'epochs': 4}. Best is trial 2 with value: 0.4722290635108948.[0m



##### Using fold 0
##### Using base_lr 8.727465061799723e-05 last_lr 9.710102125226154e-05 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7606 New best_val_rmse: 0.7606

16 steps took 13.5 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7279 New best_val_rmse: 0.7279

16 steps took 13.5 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8377 Still best_val_rmse: 0.7279 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7301 Still best_val_rmse: 0.7279 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6221 New best_val_rmse: 0.6221

16 steps took 13.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5944 New best_val_rmse: 0.5944

16 steps took 13.5 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6094 Still best_val_rmse: 0.5944 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7479 Still best_val_rmse: 0.5944 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.8172 Still best_val_rmse: 0.5944 (from epoch 0)

16 steps took 13.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.602

[32m[I 2021-07-24 13:31:59,009][0m Trial 3 finished with value: 0.5944061279296875 and parameters: {'base_lr': 8.727465061799723e-05, 'last_lr': 9.710102125226154e-05, 'epochs': 3}. Best is trial 2 with value: 0.4722290635108948.[0m



##### Using fold 0
##### Using base_lr 0.00021673763882290424 last_lr 0.0009094132155860931 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 15.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9168 New best_val_rmse: 0.9168

16 steps took 13.3 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.16 Still best_val_rmse: 0.9168 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.025 Still best_val_rmse: 0.9168 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.058 Still best_val_rmse: 0.9168 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.03 Still best_val_rmse: 0.9168 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.028 Still best_val_rmse: 0.9168 (from epoch 0)


[32m[I 2021-07-24 13:34:38,757][0m Trial 4 finished with value: 0.9168067574501038 and parameters: {'base_lr': 0.00021673763882290424, 'last_lr': 0.0009094132155860931, 'epochs': 5}. Best is trial 2 with value: 0.4722290635108948.[0m



##### Using fold 0
##### Using base_lr 0.00017449894035314006 last_lr 9.380576700424332e-05 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8548 New best_val_rmse: 0.8548

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8815 Still best_val_rmse: 0.8548 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.195 Still best_val_rmse: 0.8548 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.063 Still best_val_rmse: 0.8548 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.048 Still best_val_rmse: 0.8548 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.044 Still best_val_rmse: 0.8548 (from epoch 0)


[32m[I 2021-07-24 13:37:17,800][0m Trial 5 finished with value: 0.8548403382301331 and parameters: {'base_lr': 0.00017449894035314006, 'last_lr': 9.380576700424332e-05, 'epochs': 4}. Best is trial 2 with value: 0.4722290635108948.[0m



##### Using fold 0
##### Using base_lr 0.000149709311373108 last_lr 0.0001765122619670107 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.802 New best_val_rmse: 0.802

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.316 Still best_val_rmse: 0.802 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.07 Still best_val_rmse: 0.802 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.046 Still best_val_rmse: 0.802 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.026 Still best_val_rmse: 0.802 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.037 Still best_val_rmse: 0.802 (from epoch 0)


[32m[I 2021-07-24 13:39:59,710][0m Trial 6 finished with value: 0.8019664287567139 and parameters: {'base_lr': 0.000149709311373108, 'last_lr': 0.0001765122619670107, 'epochs': 4}. Best is trial 2 with value: 0.4722290635108948.[0m



##### Using fold 0
##### Using base_lr 0.00016319764240705915 last_lr 0.0030962193948501603 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 15.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.022 New best_val_rmse: 1.022

16 steps took 13.5 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9537 New best_val_rmse: 0.9537

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9725 Still best_val_rmse: 0.9537 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6798 New best_val_rmse: 0.6798

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.9345 Still best_val_rmse: 0.6798 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7188 Still best_val_rmse: 0.6798 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7979 Still best_val_rmse: 0.6798 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.029 Still best_val_rmse: 0.6798 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.7558 Still best_val_rmse: 0.6798 (from epoch 0)



[32m[I 2021-07-24 13:43:54,721][0m Trial 7 finished with value: 0.6798244118690491 and parameters: {'base_lr': 0.00016319764240705915, 'last_lr': 0.0030962193948501603, 'epochs': 5}. Best is trial 2 with value: 0.4722290635108948.[0m


##### Using fold 0
##### Using base_lr 0.00037494663476135657 last_lr 0.00015163485961944566 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 15.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.014 New best_val_rmse: 1.014

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.024 Still best_val_rmse: 1.014 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.101 Still best_val_rmse: 1.014 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.018 Still best_val_rmse: 1.014 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.028 Still best_val_rmse: 1.014 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.019 Still best_val_rmse: 1.014 (from epoch 0)


[32m[I 2021-07-24 13:46:34,288][0m Trial 8 finished with value: 1.0135622024536133 and parameters: {'base_lr': 0.00037494663476135657, 'last_lr': 0.00015163485961944566, 'epochs': 5}. Best is trial 2 with value: 0.4722290635108948.[0m



##### Using fold 0
##### Using base_lr 4.9054858117051126e-05 last_lr 0.0004702842365807719 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8983 New best_val_rmse: 0.8983

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6421 New best_val_rmse: 0.6421

16 steps took 13.5 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7121 Still best_val_rmse: 0.6421 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6883 Still best_val_rmse: 0.6421 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6127 New best_val_rmse: 0.6127

16 steps took 13.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.55 New best_val_rmse: 0.55

16 steps took 13.5 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5921 Still best_val_rmse: 0.55 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6059 Still best_val_rmse: 0.55 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6247 Still best_val_rmse: 0.55 (from epoch 0)

16 steps took 13.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5201 New best

[32m[I 2021-07-24 14:07:52,519][0m Trial 9 finished with value: 0.48002734780311584 and parameters: {'base_lr': 4.9054858117051126e-05, 'last_lr': 0.0004702842365807719, 'epochs': 4}. Best is trial 2 with value: 0.4722290635108948.[0m



##### Using fold 0
##### Using base_lr 7.30578195892316e-05 last_lr 0.0005070855728570029 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7423 New best_val_rmse: 0.7423

16 steps took 13.5 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6939 New best_val_rmse: 0.6939

16 steps took 13.5 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.65 New best_val_rmse: 0.65

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7631 Still best_val_rmse: 0.65 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.725 Still best_val_rmse: 0.65 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6271 New best_val_rmse: 0.6271

16 steps took 13.5 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6442 Still best_val_rmse: 0.6271 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5929 New best_val_rmse: 0.5929

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.704 Still best_val_rmse: 0.5929 (from epoch 0)

16 steps took 14.0 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5705 New best_val_rmse: 0.5705

[32m[I 2021-07-24 14:20:39,768][0m Trial 10 finished with value: 0.48680758476257324 and parameters: {'base_lr': 7.30578195892316e-05, 'last_lr': 0.0005070855728570029, 'epochs': 3}. Best is trial 2 with value: 0.4722290635108948.[0m



##### Using fold 0
##### Using base_lr 3.149222831637006e-05 last_lr 0.0005758972050563623 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9105 New best_val_rmse: 0.9105

16 steps took 13.5 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.632 New best_val_rmse: 0.632

16 steps took 13.5 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8844 Still best_val_rmse: 0.632 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7186 Still best_val_rmse: 0.632 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.713 Still best_val_rmse: 0.632 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7416 Still best_val_rmse: 0.632 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6893 Still best_val_rmse: 0.632 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5858 New best_val_rmse: 0.5858

16 steps took 13.5 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6673 Still best_val_rmse: 0.5858 (from epoch 0)

16 steps took 14.0 seconds
Epoch: 1 batch_num: 12 val_rm

[32m[I 2021-07-24 14:41:46,823][0m Trial 11 finished with value: 0.48100438714027405 and parameters: {'base_lr': 3.149222831637006e-05, 'last_lr': 0.0005758972050563623, 'epochs': 4}. Best is trial 2 with value: 0.4722290635108948.[0m



##### Using fold 0
##### Using base_lr 5.378964540224331e-05 last_lr 0.000321518446584975 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8802 New best_val_rmse: 0.8802

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.613 New best_val_rmse: 0.613

16 steps took 13.5 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6414 Still best_val_rmse: 0.613 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6029 New best_val_rmse: 0.6029

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6926 Still best_val_rmse: 0.6029 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6063 Still best_val_rmse: 0.6029 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6327 Still best_val_rmse: 0.6029 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6559 Still best_val_rmse: 0.6029 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.599 New best_val_rmse: 0.599

16 steps took 14.0 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5166 New

[32m[I 2021-07-24 15:04:05,338][0m Trial 12 finished with value: 0.4811325669288635 and parameters: {'base_lr': 5.378964540224331e-05, 'last_lr': 0.000321518446584975, 'epochs': 4}. Best is trial 2 with value: 0.4722290635108948.[0m



##### Using fold 0
##### Using base_lr 5.0197324888794674e-05 last_lr 0.001040925108107359 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9508 New best_val_rmse: 0.9508

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6423 New best_val_rmse: 0.6423

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8509 Still best_val_rmse: 0.6423 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.725 Still best_val_rmse: 0.6423 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.821 Still best_val_rmse: 0.6423 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5926 New best_val_rmse: 0.5926

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6038 Still best_val_rmse: 0.5926 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.58 New best_val_rmse: 0.58

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.648 Still best_val_rmse: 0.58 (from epoch 0)

16 steps took 14.0 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5498 New bes

[32m[I 2021-07-24 15:26:32,852][0m Trial 13 finished with value: 0.47635605931282043 and parameters: {'base_lr': 5.0197324888794674e-05, 'last_lr': 0.001040925108107359, 'epochs': 4}. Best is trial 2 with value: 0.4722290635108948.[0m



##### Using fold 0
##### Using base_lr 4.4574105236474114e-05 last_lr 0.0012177898793993265 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.848 New best_val_rmse: 0.848

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6961 New best_val_rmse: 0.6961

16 steps took 13.5 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6787 New best_val_rmse: 0.6787

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6374 New best_val_rmse: 0.6374

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.8057 Still best_val_rmse: 0.6374 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7773 Still best_val_rmse: 0.6374 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6259 New best_val_rmse: 0.6259

16 steps took 13.5 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6152 New best_val_rmse: 0.6152

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6125 New best_val_rmse: 0.6125


[32m[I 2021-07-24 15:30:27,918][0m Trial 14 finished with value: 0.6124522686004639 and parameters: {'base_lr': 4.4574105236474114e-05, 'last_lr': 0.0012177898793993265, 'epochs': 4}. Best is trial 2 with value: 0.4722290635108948.[0m



##### Using fold 0
##### Using base_lr 9.82098014680715e-05 last_lr 0.0015505061907821445 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7732 New best_val_rmse: 0.7732

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7319 New best_val_rmse: 0.7319

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7905 Still best_val_rmse: 0.7319 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8004 Still best_val_rmse: 0.7319 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.739 Still best_val_rmse: 0.7319 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6267 New best_val_rmse: 0.6267

16 steps took 13.5 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7103 Still best_val_rmse: 0.6267 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6684 Still best_val_rmse: 0.6267 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.7785 Still best_val_rmse: 0.6267 (from epoch 0)


[32m[I 2021-07-24 15:34:22,654][0m Trial 15 finished with value: 0.6266731023788452 and parameters: {'base_lr': 9.82098014680715e-05, 'last_lr': 0.0015505061907821445, 'epochs': 3}. Best is trial 2 with value: 0.4722290635108948.[0m



##### Using fold 0
##### Using base_lr 3.015168087829758e-05 last_lr 0.00024764700595687967 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9311 New best_val_rmse: 0.9311

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6291 New best_val_rmse: 0.6291

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8726 Still best_val_rmse: 0.6291 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8267 Still best_val_rmse: 0.6291 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6974 Still best_val_rmse: 0.6291 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7037 Still best_val_rmse: 0.6291 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6874 Still best_val_rmse: 0.6291 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.564 New best_val_rmse: 0.564

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6877 Still best_val_rmse: 0.564 (from epoch 0)

16 steps took 13.9 seconds
Epoch: 1 batch_num: 12 v

[32m[I 2021-07-24 15:55:29,298][0m Trial 16 finished with value: 0.4814762771129608 and parameters: {'base_lr': 3.015168087829758e-05, 'last_lr': 0.00024764700595687967, 'epochs': 4}. Best is trial 2 with value: 0.4722290635108948.[0m



##### Using fold 0
##### Using base_lr 6.535214839410794e-05 last_lr 0.0008696239949969886 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7778 New best_val_rmse: 0.7778

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6669 New best_val_rmse: 0.6669

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8377 Still best_val_rmse: 0.6669 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6336 New best_val_rmse: 0.6336

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.9348 Still best_val_rmse: 0.6336 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6796 Still best_val_rmse: 0.6336 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5773 New best_val_rmse: 0.5773

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.9824 Still best_val_rmse: 0.5773 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6515 Still best_val_rmse: 0.5773 (from epoch 0)

16 steps took 13.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.594

[32m[I 2021-07-24 16:20:19,087][0m Trial 17 finished with value: 0.477342814207077 and parameters: {'base_lr': 6.535214839410794e-05, 'last_lr': 0.0008696239949969886, 'epochs': 4}. Best is trial 2 with value: 0.4722290635108948.[0m



##### Using fold 0
##### Using base_lr 3.840605134767292e-05 last_lr 0.0018472946836358812 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8192 New best_val_rmse: 0.8192

16 steps took 13.5 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7646 New best_val_rmse: 0.7646

16 steps took 13.5 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8818 Still best_val_rmse: 0.7646 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7447 New best_val_rmse: 0.7447

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5804 New best_val_rmse: 0.5804

16 steps took 13.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6274 Still best_val_rmse: 0.5804 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6393 Still best_val_rmse: 0.5804 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5887 Still best_val_rmse: 0.5804 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6952 Still best_val_rmse: 0.5804 (from epoch 0)

16 steps took 13.9 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.591

[32m[I 2021-07-24 16:34:17,094][0m Trial 18 finished with value: 0.48388057947158813 and parameters: {'base_lr': 3.840605134767292e-05, 'last_lr': 0.0018472946836358812, 'epochs': 3}. Best is trial 2 with value: 0.4722290635108948.[0m



##### Using fold 0
##### Using base_lr 9.900413506375257e-05 last_lr 0.004894051644271908 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 15.3 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9259 New best_val_rmse: 0.9259

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7994 New best_val_rmse: 0.7994

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9247 Still best_val_rmse: 0.7994 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.936 Still best_val_rmse: 0.7994 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.194 Still best_val_rmse: 0.7994 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.066 Still best_val_rmse: 0.7994 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.04 Still best_val_rmse: 0.7994 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.16 Still best_val_rmse: 0.7994 (from epoch 0)


[32m[I 2021-07-24 16:37:43,741][0m Trial 19 finished with value: 0.7993664145469666 and parameters: {'base_lr': 9.900413506375257e-05, 'last_lr': 0.004894051644271908, 'epochs': 5}. Best is trial 2 with value: 0.4722290635108948.[0m





[32m[I 2021-07-24 16:37:43,743][0m A new study created in memory with name: no-name-051ff919-33eb-4390-a237-cd5bfc133971[0m


 Best value:  0.4722290635108948
 Best params: 
    base_lr: 4.3596909535440914e-05
    last_lr: 0.0004188473213340135
    epochs: 4
##### Using fold 1
##### Using base_lr 0.00014093376329743534 last_lr 0.0005197005435285995 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8935 New best_val_rmse: 0.8935

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8103 New best_val_rmse: 0.8103

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.172 Still best_val_rmse: 0.8103 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.013 Still best_val_rmse: 0.8103 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.012 Still best_val_rmse: 0.8103 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.017 Still best_val_rmse: 0.8103 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.044 Still best_val_rmse: 0.8103 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.017 Still best_val_rmse: 0.8103 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.038 Still best_val_rmse: 0.8103 (from epoch 0)


[32m[I 2021-07-24 16:41:32,936][0m Trial 0 finished with value: 0.8102750778198242 and parameters: {'base_lr': 0.00014093376329743534, 'last_lr': 0.0005197005435285995, 'epochs': 3}. Best is trial 0 with value: 0.8102750778198242.[0m



##### Using fold 1
##### Using base_lr 7.276983099283288e-05 last_lr 0.0009825677410960766 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 15.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.059 New best_val_rmse: 1.059

16 steps took 13.5 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8111 New best_val_rmse: 0.8111

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7844 New best_val_rmse: 0.7844

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.975 Still best_val_rmse: 0.7844 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.9136 Still best_val_rmse: 0.7844 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.19 Still best_val_rmse: 0.7844 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6964 New best_val_rmse: 0.6964

16 steps took 13.5 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6625 New best_val_rmse: 0.6625

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6158 New best_val_rmse: 0.6158


[32m[I 2021-07-24 16:45:28,760][0m Trial 1 finished with value: 0.6158090829849243 and parameters: {'base_lr': 7.276983099283288e-05, 'last_lr': 0.0009825677410960766, 'epochs': 5}. Best is trial 1 with value: 0.6158090829849243.[0m



##### Using fold 1
##### Using base_lr 8.209345092870527e-05 last_lr 0.0005140007857320931 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 15.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8619 New best_val_rmse: 0.8619

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7083 New best_val_rmse: 0.7083

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9729 Still best_val_rmse: 0.7083 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.663 New best_val_rmse: 0.663

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.8279 Still best_val_rmse: 0.663 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6654 Still best_val_rmse: 0.663 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6451 New best_val_rmse: 0.6451

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.531 New best_val_rmse: 0.531

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6947 Still best_val_rmse: 0.531 (from epoch 0)

16 steps took 14.0 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.6084 Still best_val_rmse: 0

[32m[I 2021-07-24 17:03:46,483][0m Trial 2 finished with value: 0.5309603214263916 and parameters: {'base_lr': 8.209345092870527e-05, 'last_lr': 0.0005140007857320931, 'epochs': 5}. Best is trial 2 with value: 0.5309603214263916.[0m



##### Using fold 1
##### Using base_lr 0.00010042540410701344 last_lr 0.00013194069340692225 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8349 New best_val_rmse: 0.8349

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8191 New best_val_rmse: 0.8191

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8557 Still best_val_rmse: 0.8191 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.663 New best_val_rmse: 0.663

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7191 Still best_val_rmse: 0.663 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.554 Still best_val_rmse: 0.663 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.056 Still best_val_rmse: 0.663 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.009 Still best_val_rmse: 0.663 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.009 Still best_val_rmse: 0.663 (from epoch 0)


[32m[I 2021-07-24 17:07:40,851][0m Trial 3 finished with value: 0.6630330681800842 and parameters: {'base_lr': 0.00010042540410701344, 'last_lr': 0.00013194069340692225, 'epochs': 4}. Best is trial 2 with value: 0.5309603214263916.[0m



##### Using fold 1
##### Using base_lr 3.6766910797869016e-05 last_lr 0.0001633214365553677 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 15.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8759 New best_val_rmse: 0.8759

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7709 New best_val_rmse: 0.7709

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.707 New best_val_rmse: 0.707

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6319 New best_val_rmse: 0.6319

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6388 Still best_val_rmse: 0.6319 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.711 Still best_val_rmse: 0.6319 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5259 New best_val_rmse: 0.5259

16 steps took 13.5 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5414 Still best_val_rmse: 0.5259 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.571 Still best_val_rmse: 0.5259 (from epoch 0)

16 steps took 14.0 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5453 Still best_val_rmse

[32m[I 2021-07-24 18:06:36,493][0m Trial 4 finished with value: 0.46281054615974426 and parameters: {'base_lr': 3.6766910797869016e-05, 'last_lr': 0.0001633214365553677, 'epochs': 5}. Best is trial 4 with value: 0.46281054615974426.[0m



##### Using fold 1
##### Using base_lr 7.946695511107508e-05 last_lr 0.0012029943571250225 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 15.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8637 New best_val_rmse: 0.8637

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7937 New best_val_rmse: 0.7937

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8605 Still best_val_rmse: 0.7937 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.019 Still best_val_rmse: 0.7937 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.021 Still best_val_rmse: 0.7937 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.032 Still best_val_rmse: 0.7937 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.011 Still best_val_rmse: 0.7937 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.041 Still best_val_rmse: 0.7937 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.005 Still best_val_rmse: 0.7937 (from epoch 0)


[32m[I 2021-07-24 18:10:30,209][0m Trial 5 finished with value: 0.793735146522522 and parameters: {'base_lr': 7.946695511107508e-05, 'last_lr': 0.0012029943571250225, 'epochs': 5}. Best is trial 4 with value: 0.46281054615974426.[0m



##### Using fold 1
##### Using base_lr 6.276471903172248e-05 last_lr 0.0037184123683035365 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9806 New best_val_rmse: 0.9806

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6705 New best_val_rmse: 0.6705

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9365 Still best_val_rmse: 0.6705 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6213 New best_val_rmse: 0.6213

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.019 Still best_val_rmse: 0.6213 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6116 New best_val_rmse: 0.6116

16 steps took 13.5 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7607 Still best_val_rmse: 0.6116 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5737 New best_val_rmse: 0.5737

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5333 New best_val_rmse: 0.5333

16 steps took 14.0 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5359 Still best_val_rmse: 0.5333 (from

[32m[I 2021-07-24 18:39:51,799][0m Trial 6 finished with value: 0.45986250042915344 and parameters: {'base_lr': 6.276471903172248e-05, 'last_lr': 0.0037184123683035365, 'epochs': 3}. Best is trial 6 with value: 0.45986250042915344.[0m



##### Using fold 1
##### Using base_lr 0.00047215451187118294 last_lr 0.0036797076393644897 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 15.3 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.32 New best_val_rmse: 1.32

16 steps took 13.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.045 New best_val_rmse: 1.045

16 steps took 13.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.073 Still best_val_rmse: 1.045 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.133 Still best_val_rmse: 1.045 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.106 Still best_val_rmse: 1.045 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.027 New best_val_rmse: 1.027


[32m[I 2021-07-24 18:42:32,448][0m Trial 7 finished with value: 1.0271624326705933 and parameters: {'base_lr': 0.00047215451187118294, 'last_lr': 0.0036797076393644897, 'epochs': 5}. Best is trial 6 with value: 0.45986250042915344.[0m



##### Using fold 1
##### Using base_lr 0.00020424922669723563 last_lr 0.00011726241769744605 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8868 New best_val_rmse: 0.8868

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8337 New best_val_rmse: 0.8337

16 steps took 13.2 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.057 Still best_val_rmse: 0.8337 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.052 Still best_val_rmse: 0.8337 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.004 Still best_val_rmse: 0.8337 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.005 Still best_val_rmse: 0.8337 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.002 Still best_val_rmse: 0.8337 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.016 Still best_val_rmse: 0.8337 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.0 Still best_val_rmse: 0.8337 (from epoch 0)


[32m[I 2021-07-24 18:46:25,170][0m Trial 8 finished with value: 0.8337472081184387 and parameters: {'base_lr': 0.00020424922669723563, 'last_lr': 0.00011726241769744605, 'epochs': 3}. Best is trial 6 with value: 0.45986250042915344.[0m



##### Using fold 1
##### Using base_lr 3.093409522252196e-05 last_lr 0.0004074485086437216 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8201 New best_val_rmse: 0.8201

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7671 New best_val_rmse: 0.7671

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6906 New best_val_rmse: 0.6906

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6273 New best_val_rmse: 0.6273

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5923 New best_val_rmse: 0.5923

16 steps took 13.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5623 New best_val_rmse: 0.5623

16 steps took 13.5 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.56 New best_val_rmse: 0.56

16 steps took 13.5 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6405 Still best_val_rmse: 0.56 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.562 Still best_val_rmse: 0.56 (from epoch 0)

16 steps took 14.0 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5441 New best_val_rmse: 0.5441

16 steps took 13.5 seconds
Epoc

[32m[I 2021-07-24 19:46:03,667][0m Trial 9 finished with value: 0.4512692391872406 and parameters: {'base_lr': 3.093409522252196e-05, 'last_lr': 0.0004074485086437216, 'epochs': 4}. Best is trial 9 with value: 0.4512692391872406.[0m



##### Using fold 1
##### Using base_lr 3.1366132843316186e-05 last_lr 0.0002947624357351441 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8231 New best_val_rmse: 0.8231

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7554 New best_val_rmse: 0.7554

16 steps took 13.5 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.687 New best_val_rmse: 0.687

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6122 New best_val_rmse: 0.6122

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7245 Still best_val_rmse: 0.6122 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6313 Still best_val_rmse: 0.6122 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6666 Still best_val_rmse: 0.6122 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6489 Still best_val_rmse: 0.6122 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6155 Still best_val_rmse: 0.6122 (from epoch 0)


[32m[I 2021-07-24 19:50:00,599][0m Trial 10 finished with value: 0.6121996641159058 and parameters: {'base_lr': 3.1366132843316186e-05, 'last_lr': 0.0002947624357351441, 'epochs': 4}. Best is trial 9 with value: 0.4512692391872406.[0m



##### Using fold 1
##### Using base_lr 4.502645042406232e-05 last_lr 0.004770565920096722 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8616 New best_val_rmse: 0.8616

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6293 New best_val_rmse: 0.6293

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.028 Still best_val_rmse: 0.6293 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6163 New best_val_rmse: 0.6163

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6817 Still best_val_rmse: 0.6163 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.8007 Still best_val_rmse: 0.6163 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5644 New best_val_rmse: 0.5644

16 steps took 13.5 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.576 Still best_val_rmse: 0.5644 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5404 New best_val_rmse: 0.5404

16 steps took 14.0 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5255 New best_val_rmse

[32m[I 2021-07-24 20:13:24,816][0m Trial 11 finished with value: 0.46788129210472107 and parameters: {'base_lr': 4.502645042406232e-05, 'last_lr': 0.004770565920096722, 'epochs': 3}. Best is trial 9 with value: 0.4512692391872406.[0m



##### Using fold 1
##### Using base_lr 4.5499746096638965e-05 last_lr 0.00216149961050388 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9461 New best_val_rmse: 0.9461

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6324 New best_val_rmse: 0.6324

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9024 Still best_val_rmse: 0.6324 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6691 Still best_val_rmse: 0.6324 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6765 Still best_val_rmse: 0.6324 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7851 Still best_val_rmse: 0.6324 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5979 New best_val_rmse: 0.5979

16 steps took 13.5 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7165 Still best_val_rmse: 0.5979 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5316 New best_val_rmse: 0.5316

16 steps took 14.0 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.507

[32m[I 2021-07-24 20:58:04,196][0m Trial 12 finished with value: 0.4646318554878235 and parameters: {'base_lr': 4.5499746096638965e-05, 'last_lr': 0.00216149961050388, 'epochs': 4}. Best is trial 9 with value: 0.4512692391872406.[0m



##### Using fold 1
##### Using base_lr 5.782684270655918e-05 last_lr 0.00028840065254797455 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['pooler.dense.weight', 'cl

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.5 seconds


In [None]:
%%time

for i in range(2, len(list(splits))):
    fold = i
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20)
    print(" Best value: ", study.best_trial.value)
    print(" Best params: ")
    for key, value in study.best_trial.params.items():
        print(f"    {key}: {value}")

[32m[I 2021-07-25 07:50:04,801][0m A new study created in memory with name: no-name-7b2b036e-d79a-4b20-b589-16cd1ccaf8d4[0m


##### Using fold 2
##### Using base_lr 7.033182405116535e-05 last_lr 0.00428360023846075 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 16.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7287 New best_val_rmse: 0.7287

16 steps took 13.2 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9069 Still best_val_rmse: 0.7287 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7714 Still best_val_rmse: 0.7287 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7524 Still best_val_rmse: 0.7287 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.8319 Still best_val_rmse: 0.7287 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.8324 Still best_val_rmse: 0.7287 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5864 New best_val_rmse: 0.5864

16 steps took 13.3 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5985 Still best_val_rmse: 0.5864 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6026 Still best_val_rmse: 0.5864 (from epoch 0)

16 steps took 14.2 seconds
Epoc

[32m[I 2021-07-25 08:25:59,835][0m Trial 0 finished with value: 0.46916714310646057 and parameters: {'base_lr': 7.033182405116535e-05, 'last_lr': 0.00428360023846075, 'epochs': 5}. Best is trial 0 with value: 0.46916714310646057.[0m



##### Using fold 2
##### Using base_lr 0.0002794442210437961 last_lr 0.0013646811725046258 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 15.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.114 New best_val_rmse: 1.114

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.088 New best_val_rmse: 1.088

16 steps took 13.3 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.087 New best_val_rmse: 1.087

16 steps took 13.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.048 New best_val_rmse: 1.048

16 steps took 13.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.141 Still best_val_rmse: 1.048 (from epoch 0)

16 steps took 13.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.065 Still best_val_rmse: 1.048 (from epoch 0)


[32m[I 2021-07-25 08:28:41,515][0m Trial 1 finished with value: 1.0484046936035156 and parameters: {'base_lr': 0.0002794442210437961, 'last_lr': 0.0013646811725046258, 'epochs': 5}. Best is trial 0 with value: 0.46916714310646057.[0m



##### Using fold 2
##### Using base_lr 9.234931180472906e-05 last_lr 0.002047504740885993 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7477 New best_val_rmse: 0.7477

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8921 Still best_val_rmse: 0.7477 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7388 New best_val_rmse: 0.7388

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.707 New best_val_rmse: 0.707

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7974 Still best_val_rmse: 0.707 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.8005 Still best_val_rmse: 0.707 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6308 New best_val_rmse: 0.6308

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6354 Still best_val_rmse: 0.6308 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6137 New best_val_rmse: 0.6137


[32m[I 2021-07-25 08:32:38,881][0m Trial 2 finished with value: 0.6136829257011414 and parameters: {'base_lr': 9.234931180472906e-05, 'last_lr': 0.002047504740885993, 'epochs': 4}. Best is trial 0 with value: 0.46916714310646057.[0m



##### Using fold 2
##### Using base_lr 3.20792930444734e-05 last_lr 0.004561950554091286 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7631 New best_val_rmse: 0.7631

16 steps took 13.3 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9567 Still best_val_rmse: 0.7631 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.652 New best_val_rmse: 0.652

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6025 New best_val_rmse: 0.6025

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.8982 Still best_val_rmse: 0.6025 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6698 Still best_val_rmse: 0.6025 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6361 Still best_val_rmse: 0.6025 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5611 New best_val_rmse: 0.5611

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5634 Still best_val_rmse: 0.5611 (from epoch 0)

16 steps took 14.2 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5944 

[32m[I 2021-07-25 09:00:36,851][0m Trial 3 finished with value: 0.476400226354599 and parameters: {'base_lr': 3.20792930444734e-05, 'last_lr': 0.004561950554091286, 'epochs': 4}. Best is trial 0 with value: 0.46916714310646057.[0m



##### Using fold 2
##### Using base_lr 0.00026871561067013386 last_lr 0.0010726127570971598 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.117 New best_val_rmse: 1.117

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.028 New best_val_rmse: 1.028

16 steps took 13.3 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.157 Still best_val_rmse: 1.028 (from epoch 0)

16 steps took 13.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.141 Still best_val_rmse: 1.028 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.117 Still best_val_rmse: 1.028 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.071 Still best_val_rmse: 1.028 (from epoch 0)


[32m[I 2021-07-25 09:03:21,115][0m Trial 4 finished with value: 1.0277255773544312 and parameters: {'base_lr': 0.00026871561067013386, 'last_lr': 0.0010726127570971598, 'epochs': 4}. Best is trial 0 with value: 0.46916714310646057.[0m



##### Using fold 2
##### Using base_lr 0.00010220703311425338 last_lr 0.002149079750874539 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 15.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8047 New best_val_rmse: 0.8047

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.021 Still best_val_rmse: 0.8047 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7258 New best_val_rmse: 0.7258

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7058 New best_val_rmse: 0.7058

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6779 New best_val_rmse: 0.6779

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7703 Still best_val_rmse: 0.6779 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6016 New best_val_rmse: 0.6016

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6286 Still best_val_rmse: 0.6016 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.156 Still best_val_rmse: 0.6016 (from epoch 0)


[32m[I 2021-07-25 09:07:18,581][0m Trial 5 finished with value: 0.6016260385513306 and parameters: {'base_lr': 0.00010220703311425338, 'last_lr': 0.002149079750874539, 'epochs': 5}. Best is trial 0 with value: 0.46916714310646057.[0m



##### Using fold 2
##### Using base_lr 0.00014586429163420997 last_lr 0.0021265123779889887 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.87 New best_val_rmse: 0.87

16 steps took 13.3 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9596 Still best_val_rmse: 0.87 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8502 New best_val_rmse: 0.8502

16 steps took 13.3 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.9637 Still best_val_rmse: 0.8502 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.059 Still best_val_rmse: 0.8502 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.069 Still best_val_rmse: 0.8502 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.105 Still best_val_rmse: 0.8502 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.124 Still best_val_rmse: 0.8502 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.066 Still best_val_rmse: 0.8502 (from epoch 0)



[32m[I 2021-07-25 09:11:11,756][0m Trial 6 finished with value: 0.8501816987991333 and parameters: {'base_lr': 0.00014586429163420997, 'last_lr': 0.0021265123779889887, 'epochs': 3}. Best is trial 0 with value: 0.46916714310646057.[0m


##### Using fold 2
##### Using base_lr 8.355822005934142e-05 last_lr 0.0002848717475584195 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7324 New best_val_rmse: 0.7324

16 steps took 13.5 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.848 Still best_val_rmse: 0.7324 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7172 New best_val_rmse: 0.7172

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6038 New best_val_rmse: 0.6038

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6453 Still best_val_rmse: 0.6038 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7758 Still best_val_rmse: 0.6038 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6982 Still best_val_rmse: 0.6038 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5606 New best_val_rmse: 0.5606

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5481 New best_val_rmse: 0.5481

16 steps took 14.3 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.6061 Still best_val_r

[32m[I 2021-07-25 09:36:55,762][0m Trial 7 finished with value: 0.4657308757305145 and parameters: {'base_lr': 8.355822005934142e-05, 'last_lr': 0.0002848717475584195, 'epochs': 3}. Best is trial 7 with value: 0.4657308757305145.[0m



##### Using fold 2
##### Using base_lr 0.00017595546495067366 last_lr 0.0008292184813936002 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7622 New best_val_rmse: 0.7622

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.009 Still best_val_rmse: 0.7622 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.816 Still best_val_rmse: 0.7622 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.342 Still best_val_rmse: 0.7622 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.081 Still best_val_rmse: 0.7622 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.1 Still best_val_rmse: 0.7622 (from epoch 0)


[32m[I 2021-07-25 09:39:39,979][0m Trial 8 finished with value: 0.7621898651123047 and parameters: {'base_lr': 0.00017595546495067366, 'last_lr': 0.0008292184813936002, 'epochs': 3}. Best is trial 7 with value: 0.4657308757305145.[0m



##### Using fold 2
##### Using base_lr 0.0001637178761186698 last_lr 0.00019455912682363454 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7968 New best_val_rmse: 0.7968

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9739 Still best_val_rmse: 0.7968 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8892 Still best_val_rmse: 0.7968 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8723 Still best_val_rmse: 0.7968 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7268 New best_val_rmse: 0.7268

16 steps took 13.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7304 Still best_val_rmse: 0.7268 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6902 New best_val_rmse: 0.6902

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6111 New best_val_rmse: 0.6111

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.8724 Still best_val_rmse: 0.6111 (from epoch 0)


[32m[I 2021-07-25 09:43:37,274][0m Trial 9 finished with value: 0.6110950708389282 and parameters: {'base_lr': 0.0001637178761186698, 'last_lr': 0.00019455912682363454, 'epochs': 4}. Best is trial 7 with value: 0.4657308757305145.[0m



##### Using fold 2
##### Using base_lr 4.320549847358141e-05 last_lr 0.00019201376406272403 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7237 New best_val_rmse: 0.7237

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8278 Still best_val_rmse: 0.7237 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8375 Still best_val_rmse: 0.7237 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5945 New best_val_rmse: 0.5945

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6253 Still best_val_rmse: 0.5945 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6751 Still best_val_rmse: 0.5945 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7342 Still best_val_rmse: 0.5945 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5694 New best_val_rmse: 0.5694

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5608 New best_val_rmse: 0.5608

16 steps took 14.2 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.580

[32m[I 2021-07-25 10:04:00,560][0m Trial 10 finished with value: 0.4692443907260895 and parameters: {'base_lr': 4.320549847358141e-05, 'last_lr': 0.00019201376406272403, 'epochs': 3}. Best is trial 7 with value: 0.4657308757305145.[0m



##### Using fold 2
##### Using base_lr 5.519639831224118e-05 last_lr 0.00044124301485893967 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 15.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7494 New best_val_rmse: 0.7494

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.751 Still best_val_rmse: 0.7494 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.5992 New best_val_rmse: 0.5992

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.594 New best_val_rmse: 0.594

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7603 Still best_val_rmse: 0.594 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7473 Still best_val_rmse: 0.594 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6773 Still best_val_rmse: 0.594 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5918 New best_val_rmse: 0.5918

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5774 New best_val_rmse: 0.5774

16 steps took 14.2 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.6042 Still best_val_rmse: 

[32m[I 2021-07-25 10:49:31,696][0m Trial 11 finished with value: 0.4659542441368103 and parameters: {'base_lr': 5.519639831224118e-05, 'last_lr': 0.00044124301485893967, 'epochs': 5}. Best is trial 7 with value: 0.4657308757305145.[0m



##### Using fold 2
##### Using base_lr 5.9004819673113075e-05 last_lr 0.0003701804156340247 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 16.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7255 New best_val_rmse: 0.7255

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7519 Still best_val_rmse: 0.7255 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6375 New best_val_rmse: 0.6375

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6152 New best_val_rmse: 0.6152

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7851 Still best_val_rmse: 0.6152 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6317 Still best_val_rmse: 0.6152 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7351 Still best_val_rmse: 0.6152 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6361 Still best_val_rmse: 0.6152 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5968 New best_val_rmse: 0.5968

16 steps took 14.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.568

[32m[I 2021-07-25 11:41:36,111][0m Trial 12 finished with value: 0.46230143308639526 and parameters: {'base_lr': 5.9004819673113075e-05, 'last_lr': 0.0003701804156340247, 'epochs': 5}. Best is trial 12 with value: 0.46230143308639526.[0m



##### Using fold 2
##### Using base_lr 3.7377002659052826e-05 last_lr 0.0003793496701689377 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8384 New best_val_rmse: 0.8384

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6816 New best_val_rmse: 0.6816

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6372 New best_val_rmse: 0.6372

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5714 New best_val_rmse: 0.5714

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7869 Still best_val_rmse: 0.5714 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7194 Still best_val_rmse: 0.5714 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6297 Still best_val_rmse: 0.5714 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7142 Still best_val_rmse: 0.5714 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5514 New best_val_rmse: 0.5514

16 steps took 14.3 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.6044 Still best_val_

[32m[I 2021-07-25 11:55:41,032][0m Trial 13 finished with value: 0.4815579950809479 and parameters: {'base_lr': 3.7377002659052826e-05, 'last_lr': 0.0003793496701689377, 'epochs': 3}. Best is trial 12 with value: 0.46230143308639526.[0m



##### Using fold 2
##### Using base_lr 6.909690719029559e-05 last_lr 0.0001009920153959834 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.6884 New best_val_rmse: 0.6884

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7294 Still best_val_rmse: 0.6884 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8486 Still best_val_rmse: 0.6884 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.9075 Still best_val_rmse: 0.6884 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.94 Still best_val_rmse: 0.6884 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7422 Still best_val_rmse: 0.6884 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6321 New best_val_rmse: 0.6321

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7611 Still best_val_rmse: 0.6321 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6832 Still best_val_rmse: 0.6321 (from epoch 0)


[32m[I 2021-07-25 11:59:38,911][0m Trial 14 finished with value: 0.6320924162864685 and parameters: {'base_lr': 6.909690719029559e-05, 'last_lr': 0.0001009920153959834, 'epochs': 4}. Best is trial 12 with value: 0.46230143308639526.[0m



##### Using fold 2
##### Using base_lr 6.00765282551364e-05 last_lr 0.00027678381128333566 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.726 New best_val_rmse: 0.726

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6978 New best_val_rmse: 0.6978

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.602 New best_val_rmse: 0.602

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5831 New best_val_rmse: 0.5831

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7001 Still best_val_rmse: 0.5831 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.9152 Still best_val_rmse: 0.5831 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6844 Still best_val_rmse: 0.5831 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6173 Still best_val_rmse: 0.5831 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5435 New best_val_rmse: 0.5435

16 steps took 14.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.577 Still best_val_rmse:

[32m[I 2021-07-25 12:17:19,018][0m Trial 15 finished with value: 0.47596481442451477 and parameters: {'base_lr': 6.00765282551364e-05, 'last_lr': 0.00027678381128333566, 'epochs': 3}. Best is trial 12 with value: 0.46230143308639526.[0m



##### Using fold 2
##### Using base_lr 0.0004773588679623398 last_lr 0.00010049280280583319 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 16.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9982 New best_val_rmse: 0.9982

16 steps took 13.2 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.177 Still best_val_rmse: 0.9982 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.052 Still best_val_rmse: 0.9982 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.041 Still best_val_rmse: 0.9982 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.059 Still best_val_rmse: 0.9982 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.055 Still best_val_rmse: 0.9982 (from epoch 0)


[32m[I 2021-07-25 12:20:05,535][0m Trial 16 finished with value: 0.9981786608695984 and parameters: {'base_lr': 0.0004773588679623398, 'last_lr': 0.00010049280280583319, 'epochs': 5}. Best is trial 12 with value: 0.46230143308639526.[0m



##### Using fold 2
##### Using base_lr 9.475640030300052e-05 last_lr 0.000560355390869576 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 16.2 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7632 New best_val_rmse: 0.7632

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9093 Still best_val_rmse: 0.7632 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6859 New best_val_rmse: 0.6859

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.837 Still best_val_rmse: 0.6859 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7826 Still best_val_rmse: 0.6859 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7518 Still best_val_rmse: 0.6859 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6551 New best_val_rmse: 0.6551

16 steps took 13.5 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5864 New best_val_rmse: 0.5864

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.568 New best_val_rmse: 0.568

16 steps took 14.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.6128 Still best_val_rms

[32m[I 2021-07-25 12:34:58,783][0m Trial 17 finished with value: 0.5470067858695984 and parameters: {'base_lr': 9.475640030300052e-05, 'last_lr': 0.000560355390869576, 'epochs': 4}. Best is trial 12 with value: 0.46230143308639526.[0m



##### Using fold 2
##### Using base_lr 5.347633403724961e-05 last_lr 0.0001662465454143939 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7516 New best_val_rmse: 0.7516

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7395 New best_val_rmse: 0.7395

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6031 New best_val_rmse: 0.6031

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6066 Still best_val_rmse: 0.6031 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7105 Still best_val_rmse: 0.6031 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7058 Still best_val_rmse: 0.6031 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6903 Still best_val_rmse: 0.6031 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6002 New best_val_rmse: 0.6002

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5424 New best_val_rmse: 0.5424

16 steps took 14.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5968 Still best_val_

[32m[I 2021-07-25 12:52:47,439][0m Trial 18 finished with value: 0.4714738428592682 and parameters: {'base_lr': 5.347633403724961e-05, 'last_lr': 0.0001662465454143939, 'epochs': 3}. Best is trial 12 with value: 0.46230143308639526.[0m



##### Using fold 2
##### Using base_lr 0.00011994588027894586 last_lr 0.0003033274865484539 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 15.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.099 New best_val_rmse: 1.099

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.065 New best_val_rmse: 1.065

16 steps took 13.2 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.117 Still best_val_rmse: 1.065 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.085 Still best_val_rmse: 1.065 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.065 New best_val_rmse: 1.065

16 steps took 13.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.059 New best_val_rmse: 1.059


[32m[I 2021-07-25 12:55:31,004][0m Trial 19 finished with value: 1.0589079856872559 and parameters: {'base_lr': 0.00011994588027894586, 'last_lr': 0.0003033274865484539, 'epochs': 5}. Best is trial 12 with value: 0.46230143308639526.[0m





[32m[I 2021-07-25 12:55:31,007][0m A new study created in memory with name: no-name-5808cef8-4ab5-47a0-84fd-8a85460a4897[0m


 Best value:  0.46230143308639526
 Best params: 
    base_lr: 5.9004819673113075e-05
    last_lr: 0.0003701804156340247
    epochs: 5
##### Using fold 3
##### Using base_lr 0.00017878362193620318 last_lr 0.0037204985677183515 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 16.0 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.6768 New best_val_rmse: 0.6768

16 steps took 13.3 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6974 Still best_val_rmse: 0.6768 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7663 Still best_val_rmse: 0.6768 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.145 Still best_val_rmse: 0.6768 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.049 Still best_val_rmse: 0.6768 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.047 Still best_val_rmse: 0.6768 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.048 Still best_val_rmse: 0.6768 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.044 Still best_val_rmse: 0.6768 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.041 Still best_val_rmse: 0.6768 (from epoch 0)


[32m[I 2021-07-25 12:59:28,888][0m Trial 0 finished with value: 0.6768454313278198 and parameters: {'base_lr': 0.00017878362193620318, 'last_lr': 0.0037204985677183515, 'epochs': 5}. Best is trial 0 with value: 0.6768454313278198.[0m



##### Using fold 3
##### Using base_lr 6.357124481434351e-05 last_lr 0.0006135683687551055 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.072 New best_val_rmse: 1.072

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6998 New best_val_rmse: 0.6998

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8744 Still best_val_rmse: 0.6998 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6426 New best_val_rmse: 0.6426

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6572 Still best_val_rmse: 0.6426 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6525 Still best_val_rmse: 0.6426 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6648 Still best_val_rmse: 0.6426 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6452 Still best_val_rmse: 0.6426 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.788 Still best_val_rmse: 0.6426 (from epoch 0)


[32m[I 2021-07-25 13:03:27,265][0m Trial 1 finished with value: 0.642588198184967 and parameters: {'base_lr': 6.357124481434351e-05, 'last_lr': 0.0006135683687551055, 'epochs': 4}. Best is trial 1 with value: 0.642588198184967.[0m



##### Using fold 3
##### Using base_lr 0.0002667598428149489 last_lr 0.0027877296041089014 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 16.2 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.515 New best_val_rmse: 1.515

16 steps took 13.2 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.072 New best_val_rmse: 1.072

16 steps took 13.2 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.306 Still best_val_rmse: 1.072 (from epoch 0)

16 steps took 13.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.137 Still best_val_rmse: 1.072 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.107 Still best_val_rmse: 1.072 (from epoch 0)

16 steps took 13.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.052 New best_val_rmse: 1.052


[32m[I 2021-07-25 13:06:09,387][0m Trial 2 finished with value: 1.0518639087677002 and parameters: {'base_lr': 0.0002667598428149489, 'last_lr': 0.0027877296041089014, 'epochs': 3}. Best is trial 1 with value: 0.642588198184967.[0m



##### Using fold 3
##### Using base_lr 7.94016491187567e-05 last_lr 0.0005979266225285472 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.6652 New best_val_rmse: 0.6652

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7094 Still best_val_rmse: 0.6652 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9922 Still best_val_rmse: 0.6652 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7472 Still best_val_rmse: 0.6652 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6323 New best_val_rmse: 0.6323

16 steps took 13.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6358 Still best_val_rmse: 0.6323 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.8027 Still best_val_rmse: 0.6323 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5899 New best_val_rmse: 0.5899

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5926 Still best_val_rmse: 0.5899 (from epoch 0)

16 steps took 14.2 seconds
Epoch: 1 batch_num: 1

[32m[I 2021-07-25 13:19:46,497][0m Trial 3 finished with value: 0.48658525943756104 and parameters: {'base_lr': 7.94016491187567e-05, 'last_lr': 0.0005979266225285472, 'epochs': 3}. Best is trial 3 with value: 0.48658525943756104.[0m



##### Using fold 3
##### Using base_lr 4.1384868313036215e-05 last_lr 0.004064085394675545 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 16.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.045 New best_val_rmse: 1.045

16 steps took 13.3 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8202 New best_val_rmse: 0.8202

16 steps took 13.5 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7123 New best_val_rmse: 0.7123

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6617 New best_val_rmse: 0.6617

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5952 New best_val_rmse: 0.5952

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5908 New best_val_rmse: 0.5908

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7124 Still best_val_rmse: 0.5908 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6163 Still best_val_rmse: 0.5908 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6916 Still best_val_rmse: 0.5908 (from epoch 0)

16 steps took 14.2 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.6025 Still best_val_rmse: 0.5908 (from 

[32m[I 2021-07-25 13:42:26,825][0m Trial 4 finished with value: 0.48103463649749756 and parameters: {'base_lr': 4.1384868313036215e-05, 'last_lr': 0.004064085394675545, 'epochs': 4}. Best is trial 4 with value: 0.48103463649749756.[0m



##### Using fold 3
##### Using base_lr 0.0001077856054370807 last_lr 0.001845347640590279 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8986 New best_val_rmse: 0.8986

16 steps took 13.3 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6913 New best_val_rmse: 0.6913

16 steps took 13.2 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9822 Still best_val_rmse: 0.6913 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.342 Still best_val_rmse: 0.6913 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.049 Still best_val_rmse: 0.6913 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.06 Still best_val_rmse: 0.6913 (from epoch 0)


[32m[I 2021-07-25 13:45:09,313][0m Trial 5 finished with value: 0.6913104057312012 and parameters: {'base_lr': 0.0001077856054370807, 'last_lr': 0.001845347640590279, 'epochs': 3}. Best is trial 4 with value: 0.48103463649749756.[0m



##### Using fold 3
##### Using base_lr 0.0003374026343358957 last_lr 0.0011642525648616515 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 15.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 2.096 New best_val_rmse: 2.096

16 steps took 13.3 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.049 New best_val_rmse: 1.049

16 steps took 13.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.045 New best_val_rmse: 1.045

16 steps took 13.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.051 Still best_val_rmse: 1.045 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.042 New best_val_rmse: 1.042

16 steps took 13.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.043 Still best_val_rmse: 1.042 (from epoch 0)


[32m[I 2021-07-25 13:47:49,838][0m Trial 6 finished with value: 1.0422661304473877 and parameters: {'base_lr': 0.0003374026343358957, 'last_lr': 0.0011642525648616515, 'epochs': 5}. Best is trial 4 with value: 0.48103463649749756.[0m



##### Using fold 3
##### Using base_lr 0.00010592609202916195 last_lr 0.004119408133697415 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 15.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9151 New best_val_rmse: 0.9151

16 steps took 13.3 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6376 New best_val_rmse: 0.6376

16 steps took 13.3 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9541 Still best_val_rmse: 0.6376 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.393 Still best_val_rmse: 0.6376 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.049 Still best_val_rmse: 0.6376 (from epoch 0)

16 steps took 13.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.042 Still best_val_rmse: 0.6376 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.04 Still best_val_rmse: 0.6376 (from epoch 0)


[32m[I 2021-07-25 13:50:55,218][0m Trial 7 finished with value: 0.6376178860664368 and parameters: {'base_lr': 0.00010592609202916195, 'last_lr': 0.004119408133697415, 'epochs': 5}. Best is trial 4 with value: 0.48103463649749756.[0m



##### Using fold 3
##### Using base_lr 0.00041321857511704467 last_lr 0.0007541790981300038 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.116 New best_val_rmse: 1.116

16 steps took 13.2 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.236 Still best_val_rmse: 1.116 (from epoch 0)

16 steps took 13.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.143 Still best_val_rmse: 1.116 (from epoch 0)

16 steps took 13.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.046 New best_val_rmse: 1.046

16 steps took 13.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.04 New best_val_rmse: 1.04

16 steps took 13.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.04 New best_val_rmse: 1.04


[32m[I 2021-07-25 13:53:34,976][0m Trial 8 finished with value: 1.0396934747695923 and parameters: {'base_lr': 0.00041321857511704467, 'last_lr': 0.0007541790981300038, 'epochs': 3}. Best is trial 4 with value: 0.48103463649749756.[0m



##### Using fold 3
##### Using base_lr 0.0002980810441226992 last_lr 0.0028927902818133653 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9817 New best_val_rmse: 0.9817

16 steps took 13.2 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.061 Still best_val_rmse: 0.9817 (from epoch 0)

16 steps took 13.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.099 Still best_val_rmse: 0.9817 (from epoch 0)

16 steps took 13.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.172 Still best_val_rmse: 0.9817 (from epoch 0)

16 steps took 13.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.069 Still best_val_rmse: 0.9817 (from epoch 0)

16 steps took 13.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.042 Still best_val_rmse: 0.9817 (from epoch 0)


[32m[I 2021-07-25 13:56:19,247][0m Trial 9 finished with value: 0.9816675186157227 and parameters: {'base_lr': 0.0002980810441226992, 'last_lr': 0.0028927902818133653, 'epochs': 4}. Best is trial 4 with value: 0.48103463649749756.[0m



##### Using fold 3
##### Using base_lr 3.091841397163233e-05 last_lr 0.00010409734625896974 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8222 New best_val_rmse: 0.8222

16 steps took 13.3 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6415 New best_val_rmse: 0.6415

16 steps took 13.3 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7187 Still best_val_rmse: 0.6415 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6958 Still best_val_rmse: 0.6415 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6556 Still best_val_rmse: 0.6415 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.58 New best_val_rmse: 0.58

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7503 Still best_val_rmse: 0.58 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6572 Still best_val_rmse: 0.58 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5782 New best_val_rmse: 0.5782

16 steps took 14.3 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5185 New be

[32m[I 2021-07-25 14:27:56,182][0m Trial 10 finished with value: 0.474480539560318 and parameters: {'base_lr': 3.091841397163233e-05, 'last_lr': 0.00010409734625896974, 'epochs': 4}. Best is trial 10 with value: 0.474480539560318.[0m



##### Using fold 3
##### Using base_lr 3.705019083502437e-05 last_lr 9.73975159832815e-05 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8667 New best_val_rmse: 0.8667

16 steps took 13.3 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6758 New best_val_rmse: 0.6758

16 steps took 13.3 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8294 Still best_val_rmse: 0.6758 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.729 Still best_val_rmse: 0.6758 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6323 New best_val_rmse: 0.6323

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6451 Still best_val_rmse: 0.6323 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6526 Still best_val_rmse: 0.6323 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6231 New best_val_rmse: 0.6231

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5699 New best_val_rmse: 0.5699

16 steps took 14.2 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5239 New best_val_rms

[32m[I 2021-07-25 14:57:49,859][0m Trial 11 finished with value: 0.47759729623794556 and parameters: {'base_lr': 3.705019083502437e-05, 'last_lr': 9.73975159832815e-05, 'epochs': 4}. Best is trial 10 with value: 0.474480539560318.[0m



##### Using fold 3
##### Using base_lr 3.559207163174552e-05 last_lr 8.291720349787048e-05 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8572 New best_val_rmse: 0.8572

16 steps took 13.3 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6538 New best_val_rmse: 0.6538

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8514 Still best_val_rmse: 0.6538 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6875 Still best_val_rmse: 0.6538 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6421 New best_val_rmse: 0.6421

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5634 New best_val_rmse: 0.5634

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6277 Still best_val_rmse: 0.5634 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6626 Still best_val_rmse: 0.5634 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5721 Still best_val_rmse: 0.5634 (from epoch 0)

16 steps took 14.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.560

[32m[I 2021-07-25 15:24:26,696][0m Trial 12 finished with value: 0.4792397618293762 and parameters: {'base_lr': 3.559207163174552e-05, 'last_lr': 8.291720349787048e-05, 'epochs': 4}. Best is trial 10 with value: 0.474480539560318.[0m



##### Using fold 3
##### Using base_lr 4.7971392667718245e-05 last_lr 9.266094011245145e-05 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7832 New best_val_rmse: 0.7832

16 steps took 13.3 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7271 New best_val_rmse: 0.7271

16 steps took 13.3 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8338 Still best_val_rmse: 0.7271 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6891 New best_val_rmse: 0.6891

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6302 New best_val_rmse: 0.6302

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5918 New best_val_rmse: 0.5918

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6933 Still best_val_rmse: 0.5918 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.603 Still best_val_rmse: 0.5918 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.7475 Still best_val_rmse: 0.5918 (from epoch 0)

16 steps took 14.2 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5633 New best_val_rms

[32m[I 2021-07-25 15:47:59,559][0m Trial 13 finished with value: 0.4782479703426361 and parameters: {'base_lr': 4.7971392667718245e-05, 'last_lr': 9.266094011245145e-05, 'epochs': 4}. Best is trial 10 with value: 0.474480539560318.[0m



##### Using fold 3
##### Using base_lr 3.564026861587217e-05 last_lr 0.00020414462061572284 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8608 New best_val_rmse: 0.8608

16 steps took 13.3 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6558 New best_val_rmse: 0.6558

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7571 Still best_val_rmse: 0.6558 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8096 Still best_val_rmse: 0.6558 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7542 Still best_val_rmse: 0.6558 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6217 New best_val_rmse: 0.6217

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5838 New best_val_rmse: 0.5838

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6476 Still best_val_rmse: 0.5838 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.7471 Still best_val_rmse: 0.5838 (from epoch 0)

16 steps took 14.3 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.573

[32m[I 2021-07-25 16:09:05,300][0m Trial 14 finished with value: 0.483257919549942 and parameters: {'base_lr': 3.564026861587217e-05, 'last_lr': 0.00020414462061572284, 'epochs': 4}. Best is trial 10 with value: 0.474480539560318.[0m



##### Using fold 3
##### Using base_lr 3.132222593692676e-05 last_lr 0.00017727201606161639 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 15.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8243 New best_val_rmse: 0.8243

16 steps took 13.3 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6403 New best_val_rmse: 0.6403

16 steps took 13.3 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7419 Still best_val_rmse: 0.6403 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7232 Still best_val_rmse: 0.6403 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7528 Still best_val_rmse: 0.6403 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5622 New best_val_rmse: 0.5622

16 steps took 13.3 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7259 Still best_val_rmse: 0.5622 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6574 Still best_val_rmse: 0.5622 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5485 New best_val_rmse: 0.5485

16 steps took 14.2 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.524

[32m[I 2021-07-25 16:35:00,086][0m Trial 15 finished with value: 0.48396533727645874 and parameters: {'base_lr': 3.132222593692676e-05, 'last_lr': 0.00017727201606161639, 'epochs': 5}. Best is trial 10 with value: 0.474480539560318.[0m



##### Using fold 3
##### Using base_lr 5.640809897820872e-05 last_lr 0.00016923843273761527 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9751 New best_val_rmse: 0.9751

16 steps took 13.3 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6698 New best_val_rmse: 0.6698

16 steps took 13.3 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6747 Still best_val_rmse: 0.6698 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6881 Still best_val_rmse: 0.6698 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7508 Still best_val_rmse: 0.6698 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5598 New best_val_rmse: 0.5598

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6534 Still best_val_rmse: 0.5598 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6081 Still best_val_rmse: 0.5598 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5748 Still best_val_rmse: 0.5598 (from epoch 0)

16 steps took 14.2 seconds
Epoch: 1 batch_num: 1

[32m[I 2021-07-25 17:02:23,501][0m Trial 16 finished with value: 0.4776155352592468 and parameters: {'base_lr': 5.640809897820872e-05, 'last_lr': 0.00016923843273761527, 'epochs': 4}. Best is trial 10 with value: 0.474480539560318.[0m



##### Using fold 3
##### Using base_lr 7.52349749835514e-05 last_lr 0.00029315990374111886 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 16.1 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.693 New best_val_rmse: 0.693

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7429 Still best_val_rmse: 0.693 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.065 Still best_val_rmse: 0.693 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6229 New best_val_rmse: 0.6229

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5926 New best_val_rmse: 0.5926

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6248 Still best_val_rmse: 0.5926 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6854 Still best_val_rmse: 0.5926 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5543 New best_val_rmse: 0.5543

16 steps took 13.3 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6847 Still best_val_rmse: 0.5543 (from epoch 0)

16 steps took 14.2 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5413 New

[32m[I 2021-07-25 17:28:22,421][0m Trial 17 finished with value: 0.47943636775016785 and parameters: {'base_lr': 7.52349749835514e-05, 'last_lr': 0.00029315990374111886, 'epochs': 4}. Best is trial 10 with value: 0.474480539560318.[0m



##### Using fold 3
##### Using base_lr 0.0001681916034814876 last_lr 0.00011708139817754288 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8056 New best_val_rmse: 0.8056

16 steps took 13.3 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7463 New best_val_rmse: 0.7463

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7373 New best_val_rmse: 0.7373

16 steps took 13.3 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7882 Still best_val_rmse: 0.7373 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.156 Still best_val_rmse: 0.7373 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.129 Still best_val_rmse: 0.7373 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.164 Still best_val_rmse: 0.7373 (from epoch 0)

16 steps took 13.0 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.069 Still best_val_rmse: 0.7373 (from epoch 0)

16 steps took 13.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.05 Still best_val_rmse: 0.7373 (from epoch 0)


[32m[I 2021-07-25 17:32:20,826][0m Trial 18 finished with value: 0.7373365163803101 and parameters: {'base_lr': 0.0001681916034814876, 'last_lr': 0.00011708139817754288, 'epochs': 3}. Best is trial 10 with value: 0.474480539560318.[0m



##### Using fold 3
##### Using base_lr 3.234597423887574e-05 last_lr 0.00032705834462378423 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 15.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8416 New best_val_rmse: 0.8416

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6419 New best_val_rmse: 0.6419

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7346 Still best_val_rmse: 0.6419 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6815 Still best_val_rmse: 0.6419 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6198 New best_val_rmse: 0.6198

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5965 New best_val_rmse: 0.5965

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6422 Still best_val_rmse: 0.5965 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5541 New best_val_rmse: 0.5541

16 steps took 13.5 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5819 Still best_val_rmse: 0.5541 (from epoch 0)

16 steps took 14.3 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5756 Still best_val_

[32m[I 2021-07-25 17:59:52,082][0m Trial 19 finished with value: 0.48085349798202515 and parameters: {'base_lr': 3.234597423887574e-05, 'last_lr': 0.00032705834462378423, 'epochs': 5}. Best is trial 10 with value: 0.474480539560318.[0m
[32m[I 2021-07-25 17:59:52,085][0m A new study created in memory with name: no-name-b95f8224-9fac-4cc2-b00e-b18f3cf9b133[0m



 Best value:  0.474480539560318
 Best params: 
    base_lr: 3.091841397163233e-05
    last_lr: 0.00010409734625896974
    epochs: 4
##### Using fold 4
##### Using base_lr 3.2314567372708084e-05 last_lr 8.327155005618419e-05 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9454 New best_val_rmse: 0.9454

16 steps took 13.3 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7717 New best_val_rmse: 0.7717

16 steps took 13.3 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7838 Still best_val_rmse: 0.7717 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6301 New best_val_rmse: 0.6301

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5986 New best_val_rmse: 0.5986

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5716 New best_val_rmse: 0.5716

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5557 New best_val_rmse: 0.5557

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5844 Still best_val_rmse: 0.5557 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6491 Still best_val_rmse: 0.5557 (from epoch 0)

16 steps took 14.3 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.4925 New best_val_rmse: 0.4925

8 ste

[32m[I 2021-07-25 18:57:56,662][0m Trial 0 finished with value: 0.45970267057418823 and parameters: {'base_lr': 3.2314567372708084e-05, 'last_lr': 8.327155005618419e-05, 'epochs': 4}. Best is trial 0 with value: 0.45970267057418823.[0m



##### Using fold 4
##### Using base_lr 6.228115263581648e-05 last_lr 0.002568618416899804 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 15.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8833 New best_val_rmse: 0.8833

16 steps took 13.3 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7761 New best_val_rmse: 0.7761

16 steps took 13.3 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.031 Still best_val_rmse: 0.7761 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.665 New best_val_rmse: 0.665

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7338 Still best_val_rmse: 0.665 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6186 New best_val_rmse: 0.6186

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.8649 Still best_val_rmse: 0.6186 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5956 New best_val_rmse: 0.5956

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6117 Still best_val_rmse: 0.5956 (from epoch 0)

16 steps took 14.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.6036 Still best_val_rmse

[32m[I 2021-07-25 19:31:43,595][0m Trial 1 finished with value: 0.47378069162368774 and parameters: {'base_lr': 6.228115263581648e-05, 'last_lr': 0.002568618416899804, 'epochs': 5}. Best is trial 0 with value: 0.45970267057418823.[0m



##### Using fold 4
##### Using base_lr 0.00029311765128279367 last_lr 0.0002174214933522278 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.318 New best_val_rmse: 1.318

16 steps took 13.3 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.147 New best_val_rmse: 1.147

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.073 New best_val_rmse: 1.073

16 steps took 13.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.113 Still best_val_rmse: 1.073 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.139 Still best_val_rmse: 1.073 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.063 New best_val_rmse: 1.063


[32m[I 2021-07-25 19:34:25,553][0m Trial 2 finished with value: 1.0630989074707031 and parameters: {'base_lr': 0.00029311765128279367, 'last_lr': 0.0002174214933522278, 'epochs': 3}. Best is trial 0 with value: 0.45970267057418823.[0m



##### Using fold 4
##### Using base_lr 4.591130660171518e-05 last_lr 0.003713677086075337 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.bias', 'pooler

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8924 New best_val_rmse: 0.8924

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8478 New best_val_rmse: 0.8478

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6503 New best_val_rmse: 0.6503

16 steps took 13.3 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7102 Still best_val_rmse: 0.6503 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6758 Still best_val_rmse: 0.6503 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6754 Still best_val_rmse: 0.6503 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5809 New best_val_rmse: 0.5809

16 steps took 13.3 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6768 Still best_val_rmse: 0.5809 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6637 Still best_val_rmse: 0.5809 (from epoch 0)

16 steps took 14.3 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.577

In [49]:
for i in range(5, len(list(splits))):
    fold = i
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20)
    print(" Best value: ", study.best_trial.value)
    print(" Best params: ")
    for key, value in study.best_trial.params.items():
        print(f"    {key}: {value}")

[32m[I 2021-07-26 06:42:49,908][0m A new study created in memory with name: no-name-ea841a57-63cb-41b1-983d-7f8c65378d80[0m


##### Using fold 5
##### Using base_lr 3.5379120180791935e-05 last_lr 0.00021137535166837663 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.weight', 'pool

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 16.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7172 New best_val_rmse: 0.7172

16 steps took 13.2 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7892 Still best_val_rmse: 0.7172 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6249 New best_val_rmse: 0.6249

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7399 Still best_val_rmse: 0.6249 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6589 Still best_val_rmse: 0.6249 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5633 New best_val_rmse: 0.5633

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5695 Still best_val_rmse: 0.5633 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5865 Still best_val_rmse: 0.5633 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5579 New best_val_rmse: 0.5579

16 steps took 14.2 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.545

[32m[I 2021-07-26 07:37:52,054][0m Trial 0 finished with value: 0.46050626039505005 and parameters: {'base_lr': 3.5379120180791935e-05, 'last_lr': 0.00021137535166837663, 'epochs': 5}. Best is trial 0 with value: 0.46050626039505005.[0m



##### Using fold 5
##### Using base_lr 0.00026457307680595145 last_lr 0.0006234142279885256 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.weight', 'pool

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 15.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.04 New best_val_rmse: 1.04

16 steps took 13.2 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.138 Still best_val_rmse: 1.04 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.079 Still best_val_rmse: 1.04 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.062 Still best_val_rmse: 1.04 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.021 New best_val_rmse: 1.021

16 steps took 13.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.025 Still best_val_rmse: 1.021 (from epoch 0)


[32m[I 2021-07-26 07:40:32,768][0m Trial 1 finished with value: 1.0209934711456299 and parameters: {'base_lr': 0.00026457307680595145, 'last_lr': 0.0006234142279885256, 'epochs': 5}. Best is trial 0 with value: 0.46050626039505005.[0m



##### Using fold 5
##### Using base_lr 0.0003510699921684945 last_lr 0.0009127027790204334 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.weight', 'pool

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9981 New best_val_rmse: 0.9981

16 steps took 13.3 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.089 Still best_val_rmse: 0.9981 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.041 Still best_val_rmse: 0.9981 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.038 Still best_val_rmse: 0.9981 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.025 Still best_val_rmse: 0.9981 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.019 Still best_val_rmse: 0.9981 (from epoch 0)


[32m[I 2021-07-26 07:43:13,621][0m Trial 2 finished with value: 0.9981076121330261 and parameters: {'base_lr': 0.0003510699921684945, 'last_lr': 0.0009127027790204334, 'epochs': 3}. Best is trial 0 with value: 0.46050626039505005.[0m



##### Using fold 5
##### Using base_lr 0.000323161672880498 last_lr 0.0013930997171154112 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.weight', 'pool

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.047 New best_val_rmse: 1.047

16 steps took 13.2 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.039 New best_val_rmse: 1.039

16 steps took 13.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.075 Still best_val_rmse: 1.039 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.065 Still best_val_rmse: 1.039 (from epoch 0)

16 steps took 13.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.027 New best_val_rmse: 1.027

16 steps took 13.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.019 New best_val_rmse: 1.019


[32m[I 2021-07-26 07:45:54,082][0m Trial 3 finished with value: 1.0193175077438354 and parameters: {'base_lr': 0.000323161672880498, 'last_lr': 0.0013930997171154112, 'epochs': 3}. Best is trial 0 with value: 0.46050626039505005.[0m



##### Using fold 5
##### Using base_lr 0.00010896104669695506 last_lr 0.0029946969275472285 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.weight', 'pool

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8292 New best_val_rmse: 0.8292

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7292 New best_val_rmse: 0.7292

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7376 Still best_val_rmse: 0.7292 (from epoch 0)

16 steps took 13.3 seconds
Epoch: 0 batch_num: 64 val_rmse: 2.552 Still best_val_rmse: 0.7292 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.9966 Still best_val_rmse: 0.7292 (from epoch 0)

16 steps took 13.2 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.035 Still best_val_rmse: 0.7292 (from epoch 0)



[32m[I 2021-07-26 07:48:35,865][0m Trial 4 finished with value: 0.7291932702064514 and parameters: {'base_lr': 0.00010896104669695506, 'last_lr': 0.0029946969275472285, 'epochs': 4}. Best is trial 0 with value: 0.46050626039505005.[0m


##### Using fold 5
##### Using base_lr 0.0003799082469670823 last_lr 0.00021149001927943728 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.weight', 'pool

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.388 New best_val_rmse: 1.388

16 steps took 13.3 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.045 New best_val_rmse: 1.045

16 steps took 13.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.04 New best_val_rmse: 1.04

16 steps took 13.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.067 Still best_val_rmse: 1.04 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.022 New best_val_rmse: 1.022

16 steps took 13.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.03 Still best_val_rmse: 1.022 (from epoch 0)


[32m[I 2021-07-26 07:51:17,103][0m Trial 5 finished with value: 1.0216212272644043 and parameters: {'base_lr': 0.0003799082469670823, 'last_lr': 0.00021149001927943728, 'epochs': 4}. Best is trial 0 with value: 0.46050626039505005.[0m



##### Using fold 5
##### Using base_lr 0.0002568209038046303 last_lr 0.004062311874892521 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.weight', 'pool

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.213 New best_val_rmse: 1.213

16 steps took 13.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.059 New best_val_rmse: 1.059

16 steps took 13.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.056 New best_val_rmse: 1.056

16 steps took 13.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.205 Still best_val_rmse: 1.056 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.03 New best_val_rmse: 1.03

16 steps took 13.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.027 New best_val_rmse: 1.027


[32m[I 2021-07-26 07:54:00,368][0m Trial 6 finished with value: 1.0267387628555298 and parameters: {'base_lr': 0.0002568209038046303, 'last_lr': 0.004062311874892521, 'epochs': 4}. Best is trial 0 with value: 0.46050626039505005.[0m



##### Using fold 5
##### Using base_lr 0.00037828558432408273 last_lr 0.004913422413649723 epochs 4


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.weight', 'pool

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 15.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9992 New best_val_rmse: 0.9992

16 steps took 13.2 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.041 Still best_val_rmse: 0.9992 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.021 Still best_val_rmse: 0.9992 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.093 Still best_val_rmse: 0.9992 (from epoch 0)

16 steps took 13.0 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.046 Still best_val_rmse: 0.9992 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.044 Still best_val_rmse: 0.9992 (from epoch 0)


[32m[I 2021-07-26 07:56:41,562][0m Trial 7 finished with value: 0.9992078542709351 and parameters: {'base_lr': 0.00037828558432408273, 'last_lr': 0.004913422413649723, 'epochs': 4}. Best is trial 0 with value: 0.46050626039505005.[0m



##### Using fold 5
##### Using base_lr 0.000346757166348739 last_lr 8.658505891799743e-05 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.weight', 'pool

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 15.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.014 New best_val_rmse: 1.014

16 steps took 13.2 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.026 Still best_val_rmse: 1.014 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.044 Still best_val_rmse: 1.014 (from epoch 0)

16 steps took 13.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.079 Still best_val_rmse: 1.014 (from epoch 0)

16 steps took 13.1 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.018 Still best_val_rmse: 1.014 (from epoch 0)

16 steps took 13.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.022 Still best_val_rmse: 1.014 (from epoch 0)


[32m[I 2021-07-26 07:59:23,031][0m Trial 8 finished with value: 1.013664722442627 and parameters: {'base_lr': 0.000346757166348739, 'last_lr': 8.658505891799743e-05, 'epochs': 5}. Best is trial 0 with value: 0.46050626039505005.[0m



##### Using fold 5
##### Using base_lr 5.9744555136421535e-05 last_lr 0.0023670292030870026 epochs 3


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.weight', 'pool

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 15.8 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.121 New best_val_rmse: 1.121

16 steps took 13.4 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.781 New best_val_rmse: 0.781

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8025 Still best_val_rmse: 0.781 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6496 New best_val_rmse: 0.6496

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6836 Still best_val_rmse: 0.6496 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5734 New best_val_rmse: 0.5734

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5568 New best_val_rmse: 0.5568

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5409 New best_val_rmse: 0.5409

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5718 Still best_val_rmse: 0.5409 (from epoch 0)

16 steps took 14.2 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5197 New best_val_rmse: 0.5197

16 steps t

[32m[I 2021-07-26 08:29:40,740][0m Trial 9 finished with value: 0.46622639894485474 and parameters: {'base_lr': 5.9744555136421535e-05, 'last_lr': 0.0023670292030870026, 'epochs': 3}. Best is trial 0 with value: 0.46050626039505005.[0m



##### Using fold 5
##### Using base_lr 3.1267138552113085e-05 last_lr 0.0002785557414483006 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.weight', 'pool

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 15.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7192 New best_val_rmse: 0.7192

16 steps took 13.5 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7907 Still best_val_rmse: 0.7192 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6839 New best_val_rmse: 0.6839

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5648 New best_val_rmse: 0.5648

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.552 New best_val_rmse: 0.552

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5114 New best_val_rmse: 0.5114

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5738 Still best_val_rmse: 0.5114 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5592 Still best_val_rmse: 0.5114 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5419 Still best_val_rmse: 0.5114 (from epoch 0)

16 steps took 14.3 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5214 Still best_val_rm

[32m[I 2021-07-26 09:20:43,372][0m Trial 10 finished with value: 0.46617308259010315 and parameters: {'base_lr': 3.1267138552113085e-05, 'last_lr': 0.0002785557414483006, 'epochs': 5}. Best is trial 0 with value: 0.46050626039505005.[0m



##### Using fold 5
##### Using base_lr 3.229859428898334e-05 last_lr 0.0002711377163548122 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.weight', 'pool

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 16.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7183 New best_val_rmse: 0.7183

16 steps took 13.5 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7769 Still best_val_rmse: 0.7183 (from epoch 0)

16 steps took 13.5 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6664 New best_val_rmse: 0.6664

16 steps took 13.4 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.768 Still best_val_rmse: 0.6664 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.721 Still best_val_rmse: 0.6664 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5416 New best_val_rmse: 0.5416

16 steps took 13.5 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6379 Still best_val_rmse: 0.5416 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6592 Still best_val_rmse: 0.5416 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5961 Still best_val_rmse: 0.5416 (from epoch 0)

16 steps took 14.4 seconds
Epoch: 1 batch_num: 12 

[32m[I 2021-07-26 10:14:49,715][0m Trial 11 finished with value: 0.46542713046073914 and parameters: {'base_lr': 3.229859428898334e-05, 'last_lr': 0.0002711377163548122, 'epochs': 5}. Best is trial 0 with value: 0.46050626039505005.[0m



##### Using fold 5
##### Using base_lr 3.0377004073224824e-05 last_lr 0.00018524387422817287 epochs 5


Some weights of the model checkpoint at microsoft/deberta-xlarge were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-xlarge and are newly initialized: ['classifier.weight', 'pool

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 15.9 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7238 New best_val_rmse: 0.7238

16 steps took 13.5 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7792 Still best_val_rmse: 0.7238 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6185 New best_val_rmse: 0.6185

16 steps took 13.5 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5717 New best_val_rmse: 0.5717

16 steps took 13.4 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5663 New best_val_rmse: 0.5663

16 steps took 13.4 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5253 New best_val_rmse: 0.5253

16 steps took 13.4 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5886 Still best_val_rmse: 0.5253 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5498 Still best_val_rmse: 0.5253 (from epoch 0)

16 steps took 13.4 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5842 Still best_val_rmse: 0.5253 (from epoch 0)

16 steps took 14.3 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5374 Still best_val_

KeyboardInterrupt: 

### Verify the model

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm

In [None]:
cfg.model_offset = 0
cfg.model_limit = 6
cfg.n_folds = 5
cfg.svm_kernels = ['rbf']
cfg.svm_c = 5

In [None]:
num_bins = int(np.ceil(np.log2(len(train_df))))
train_df['bins'] = pd.cut(train_df['target'], bins=num_bins, labels=False)
bins = train_df['bins'].values

In [None]:
%%time

inference_models = []
for i in range(1, cfg.NUM_FOLDS + 1):
    print(f'Model {i}')
    inference_model = CommonLitModel()
    inference_model = inference_model.cuda()
    inference_model.load_state_dict(torch.load(str(MODELS_PATH/f"{cfg.model_name.replace('/', '_')}_{i}/model_{i}.pth")))
    inference_model.eval();
    inference_models.append(inference_model)

In [None]:
from transformers import RobertaTokenizer

tokenizers = []
for i in range(1, cfg.NUM_FOLDS):
    tokenizer = RobertaTokenizer.from_pretrained(MODELS_PATH/f"{cfg.model_name.replace('/', '_')}_{i}")
    tokenizers.append(tokenizer)

In [None]:
def get_cls_embeddings(dl, transformer_model):
    cls_embeddings = []
    with torch.no_grad():
        for input_features in tqdm(dl, total=len(dl)):
            output, context_vector = transformer_model(input_features['input_ids'].cuda(), input_features['attention_mask'].cuda())
#             cls_embeddings.extend(output['last_hidden_state'][:,0,:].detach().cpu().numpy())
            embedding_out = context_vector.detach().cpu().numpy()
            cls_embeddings.extend(embedding_out)
    return np.array(cls_embeddings)

In [None]:
def rmse_score(X, y):
    return np.sqrt(mean_squared_error(X, y))

In [None]:
def convert_to_list(t):
    return t.flatten().long()

class CommonLitDataset(nn.Module):
    def __init__(self, text, test_id, tokenizer, max_len=128):
        self.excerpt = text
        self.test_id = test_id
        self.max_len = max_len
        self.tokenizer = tokenizer
    
    def __getitem__(self,idx):
        encode = self.tokenizer(self.excerpt[idx],
                                return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',
                                truncation=True)
        return {'input_ids': convert_to_list(encode['input_ids']),
                'attention_mask': convert_to_list(encode['attention_mask']),
                'id': self.test_id[idx]}
    
    def __len__(self):
        return len(self.excerpt)

In [None]:
def create_dl(df, tokenizer):
    text = df['excerpt'].values
    ids = df['id'].values
    ds = CommonLitDataset(text, ids, tokenizer, max_len=cfg.MAX_LEN)
    return DataLoader(ds, 
                      batch_size = cfg.BATCH_SIZE,
                      shuffle=False,
                      num_workers = 1,
                      pin_memory=True,
                      drop_last=False
                     )

In [None]:
train_df = pd.read_csv(DATA_PATH/'train-orig.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')
remove_unnecessary(train_df)

In [None]:
train_target_mean = train_df['target'].mean()
train_target_std = train_df['target'].std()
train_df['normalized_target'] = (train_df['target'] - train_target_mean) / train_target_std

In [None]:
%%time

train_target = train_df['normalized_target'].values

def calc_mean(scores):
    return np.mean(np.array(scores), axis=0)

final_scores = []
final_rmse = []
kernel_rmse_score_mean = []
final_kernel_predictions_means = []
for j, (inference_model, tokenizer) in enumerate(zip(inference_models, tokenizers)):
    print('Model', j)
    test_dl = create_dl(test_df, tokenizer)
    train_dl = create_dl(train_df, tokenizer)
    transformer_model = inference_model
    transformer_model.cuda()
    X = get_cls_embeddings(train_dl, transformer_model)
    
    y = train_target
    X_test = get_cls_embeddings(test_dl, transformer_model)
    
    kfold = StratifiedKFold(n_splits=cfg.NUM_FOLDS)
    scores = []
    rmse_scores = []
    kernel_predictions_means = []
    for kernel in cfg.svm_kernels:
        print('Kernel', kernel)
        kernel_scores = []
        kernel_rmse_scores = []
        kernel_predictions = []
        for k, (train_idx, valid_idx) in enumerate(kfold.split(X, bins)):

            print('Fold', k, train_idx.shape, valid_idx.shape)
            model = SVR(C=cfg.svm_c, kernel=kernel, gamma='auto')

            X_train, y_train = X[train_idx], y[train_idx]
            X_valid, y_valid = X[valid_idx], y[valid_idx]
            model.fit(X_train, y_train)
            prediction = model.predict(X_valid)
            kernel_predictions.append(prediction)
            kernel_rmse_scores.append(rmse_score(prediction, y_valid))
            print('rmse_score', kernel_rmse_scores[k])
            kernel_scores.append(model.predict(X_test))
        kernel_predictions_means.append(np.array([np.mean(kp) for kp in kernel_predictions]).mean())
        scores.append(calc_mean(kernel_scores))
        kernel_rmse_score = calc_mean(kernel_rmse_scores)
        kernel_rmse_score_mean.append(kernel_rmse_score)
        rmse_scores.append(kernel_rmse_score)
    final_kernel_predictions_means.append(kernel_predictions_means)
    final_scores.append(calc_mean(scores))
    final_rmse.append(calc_mean(rmse_scores))
print('FINAL RMSE score', np.mean(np.array(final_rmse)))

In [None]:
final_kernel_predictions_means

In [None]:
# (train_df['target'] - cfg.train_target_mean) / cfg.train_target_std
final_scores_normalized = np.array(final_scores) * train_target_std + train_target_mean

In [None]:
kernel_rmse_score_mean_array = np.array(kernel_rmse_score_mean)
kernel_rmse_score_mean_sum = np.sum(kernel_rmse_score_mean_array)
prop_losses = kernel_rmse_score_mean_array / kernel_rmse_score_mean_sum
prop_losses_sum = (1 - prop_losses).sum()
weights = (1 - prop_losses) / prop_losses_sum
weights

In [None]:
def calc_mean(scores, weights=weights):
    return np.average(np.array(scores), weights=weights, axis=0)

In [None]:
target_mean = train_df['target'].mean()
final_scores_flat = calc_mean(final_scores_normalized).flatten()
final_scores_mean = final_scores_flat.mean()
target_mean, np.array(final_scores_normalized).mean()
# (-0.9579984513405823, -0.8029817438292849)

In [None]:
final_scores_flat

In [None]:
mean_diff = target_mean - final_scores_mean
mean_diff, mean_diff / len(final_scores)

In [None]:
sample_df['target'] = final_scores_flat + mean_diff
# sample_df['target'] = len(final_scores) / np.sum(1 / np.array(final_scores), axis=0) # harmonic mean
sample_df

### Prepare Packaging

In [None]:
cfg.model_name

In [None]:
BEST_MODEL_FOLDER = MODELS_PATH/cfg.model_name/'best'
!rm -rf {BEST_MODEL_FOLDER}
!mkdir -p {BEST_MODEL_FOLDER}

In [None]:
BEST_MODEL_FOLDER

In [None]:
cfg.NUM_FOLDS

In [None]:
bestmodels = [MODELS_PATH/f'{cfg.model_name}_{i + 1}' for i in range(0, cfg.NUM_FOLDS)]

In [None]:
bestmodels

In [None]:
from shutil import copyfile

def normalize_name(path_name):
    return path_name.replace('', '')

for i, best_model in enumerate(bestmodels):
    print(f'Processing {i}th model')
    i = i + 1
    best_model_file = f'{best_model}/model_{i}.pth'
    if Path(best_model_file).exists():
        copyfile(best_model_file, f'{BEST_MODEL_FOLDER}/{i}_pytorch_model.bin')
        tokenizer_path = Path(BEST_MODEL_FOLDER/f'tokenizer-{i}')
        tokenizer_path.mkdir(parents=True, exist_ok=True)
        assert tokenizer_path.exists()

        tokenizer_json = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/tokenizer_config.json'))
        assert tokenizer_json.exists(), f'{tokenizer_json} does not exist'
        copyfile(tokenizer_json, tokenizer_path/'tokenizer.json')

        vocab_txt = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/vocab.json'))
        assert vocab_txt.exists(), f'{vocab_txt} does not exist'
        copyfile(vocab_txt, tokenizer_path/'vocab.json')

        merges = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/merges.txt'))
        assert merges.exists()
        copyfile(merges, tokenizer_path/'merges.txt')
    else:
        print(f'{best_model_file} is missing')

In [None]:
import shutil

shutil.make_archive(MODELS_PATH/cfg.model_name/'best_models', 'zip', BEST_MODEL_FOLDER)

In [None]:
!ls {MODELS_PATH/cfg.model_name}

In [None]:
!mv {MODELS_PATH}/{cfg.model_name}.yaml {MODELS_PATH/cfg.model_name}

In [None]:
transformer_model.transformer_model.save_pretrained(save_directory=f'{MODELS_PATH/cfg.model_name}/lm')

In [None]:
!du -h {MODELS_PATH/cfg.model_name}/*

In [None]:
shutil.make_archive(MODELS_PATH/cfg.model_name/'lm', 'zip', f'{MODELS_PATH/cfg.model_name}/lm')

In [None]:
!kaggle datasets init -p {MODELS_PATH/cfg.model_name}

In [None]:
dataset_json_path = Path(MODELS_PATH/cfg.model_name/'dataset-metadata.json')
assert dataset_json_path.exists()

In [None]:
!cat {str(dataset_json_path)}

In [None]:
with open(dataset_json_path, 'r') as f:
    dataset_json = f.read()
    dataset_json = dataset_json.replace('INSERT_TITLE_HERE', f'commonlit-{cfg.model_name}-light').replace('INSERT_SLUG_HERE', f'commonlit-{cfg.model_name}-light')
    print(dataset_json)
with(open(dataset_json_path, 'w')) as f:
    f.write(dataset_json)

In [None]:
!rm -rf {MODELS_PATH/cfg.model_name}/best
!rm -rf {MODELS_PATH/cfg.model_name}/lm

In [None]:
!kaggle datasets create -p {MODELS_PATH/cfg.model_name}

In [None]:
!kaggle datasets version -p {MODELS_PATH/cfg.model_name} -m "Version with merges.txt" -d

In [None]:
state_dict = torch.load(str(MODELS_PATH/f'distilroberta-0/checkpoint-105/pytorch_model.bin'))

In [None]:
loaded_model = CommonLitModel()

In [None]:
loaded_model.load_state_dict(state_dict)