In [None]:
# !pip install optuna

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import gc, warnings, random, time, os

from pathlib import Path

from tqdm.notebook import tqdm

warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from transformers import AutoModel, AutoTokenizer, AutoConfig
from transformers import get_cosine_schedule_with_warmup

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import seaborn as sns

import gc
gc.enable()

import optuna

### Folders and Dataframes

In [2]:
DATA_PATH = Path('/home/commonlit/data/')
assert DATA_PATH.exists()
MODELS_PATH = Path('/home/commonlit/models/')
if not MODELS_PATH.exists():
    os.mkdir(MODELS_PATH)
assert MODELS_PATH.exists()

In [3]:
train_df = pd.read_csv(DATA_PATH/'train-orig.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')
sample_df = pd.read_csv(DATA_PATH/'sample_submission.csv')

In [4]:
def remove_unnecessary(df):
    df.drop(df[df['target'] == 0].index, inplace=True)
    df.reset_index(drop=True, inplace=True)
    
remove_unnecessary(train_df)

In [5]:
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845
...,...,...,...,...,...,...
2828,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2829,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2830,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2831,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


### Config and Seeding

In [6]:
class Config(): 
    NUM_FOLDS = 6
    NUM_EPOCHS = 3
    BATCH_SIZE = 16
    MAX_LEN = 248
    EVAL_SCHEDULE = [(0.50, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1., 1)]
    ROBERTA_PATH = 'microsoft/deberta-large'
    TOKENIZER_PATH = 'microsoft/deberta-large'
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    SEED = 1000
    NUM_WORKERS = 2
    MODEL_FOLDER = MODELS_PATH
    model_name = 'deberta-large'
    svm_kernels = ['rbf']
    svm_c = 5

cfg = Config()

In [7]:
if not cfg.MODEL_FOLDER.exists():
    os.mkdir(cfg.MODEL_FOLDER)

In [8]:
def set_random_seed(random_seed):
    random.seed(random_seed)
    np.random.seed(random_seed)
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    torch.backends.cudnn.deterministic = True

### Dataset

In [9]:
def add_bins(train_df, num_bins):
    train_df.loc[:, 'bins'] = pd.cut(train_df['target'], bins=num_bins, labels=False)
    return num_bins

In [10]:
add_bins(train_df, cfg.NUM_FOLDS)

6

In [11]:
train_df.groupby(['bins'])['target'].agg(['count', 'mean'])

Unnamed: 0_level_0,count,mean
bins,Unnamed: 1_level_1,Unnamed: 2_level_1
0,122,-3.125765
1,441,-2.270279
2,784,-1.41215
3,886,-0.548095
4,494,0.289716
5,106,1.070237


In [12]:
tokenizer = AutoTokenizer.from_pretrained(cfg.TOKENIZER_PATH)

In [13]:
class CommonLitDataset(Dataset):
    def __init__(self, df, tokenizer, inference_only=False):
        super().__init__()
        self.df, self.inference_only = df, inference_only
        self.text = df['excerpt'].tolist()
        self.bins = df['bins']
        if not inference_only:
            self.target = torch.tensor(df['target'].to_numpy(), dtype = torch.float32)
        
        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding = 'max_length',
            max_length = cfg.MAX_LEN,
            truncation = True,
            return_attention_mask=True
        )
        
    def __getitem__(self, index):        
        input_ids = torch.tensor(self.encoded['input_ids'][index])
        attention_mask = torch.tensor(self.encoded['attention_mask'][index])
        
        if self.inference_only:
            return {'input_ids': input_ids, 'attention_mask': attention_mask}
        else:
            target = self.target[index]
            return {'input_ids': input_ids, 'attention_mask': attention_mask, 'target': target}
    
    def __len__(self):
        return len(self.df)

In [14]:
sample_ds = CommonLitDataset(train_df, tokenizer)

### Model

In [15]:
class AttentionHead(nn.Module):
    
    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        
        self.hidden_layer = nn.Linear(in_features, hidden_dim)
        self.final_layer = nn.Linear(hidden_dim, num_targets)
        self.out_features = hidden_dim
        
    def forward(self, features):
        att = torch.tanh(self.hidden_layer(features))
        score = self.final_layer(att)
        attention_weights = torch.softmax(score, dim=1)
        return attention_weights

In [16]:
class CommonLitModel(nn.Module):
    def __init__(self):
        super(CommonLitModel, self).__init__()
        config = AutoConfig.from_pretrained(cfg.ROBERTA_PATH)
        config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7
        })
        self.transformer_model = AutoModel.from_pretrained(cfg.ROBERTA_PATH, config=config)
        self.attention = AttentionHead(config.hidden_size, 512, 1)
        self.regressor = nn.Linear(config.hidden_size, 1)
    
    def forward(self, input_ids, attention_mask):
        last_layer_hidden_states = self.transformer_model(input_ids=input_ids, attention_mask=attention_mask)['last_hidden_state']
        weights = self.attention(last_layer_hidden_states)
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1) 
        return self.regressor(context_vector), context_vector

In [17]:
sample_model = CommonLitModel()

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
import re

for i, (name, param) in enumerate(sample_model.named_parameters()):
    if(name.find('layer') > -1):
        layer_name = re.sub(r'.+(layer\.\d+).+', r'\1', name)

In [19]:
for i, (name, param) in enumerate(sample_model.named_parameters()):
    print(i, name, param.size())

0 transformer_model.embeddings.word_embeddings.weight torch.Size([50265, 1024])
1 transformer_model.embeddings.LayerNorm.weight torch.Size([1024])
2 transformer_model.embeddings.LayerNorm.bias torch.Size([1024])
3 transformer_model.encoder.layer.0.attention.self.q_bias torch.Size([1024])
4 transformer_model.encoder.layer.0.attention.self.v_bias torch.Size([1024])
5 transformer_model.encoder.layer.0.attention.self.in_proj.weight torch.Size([3072, 1024])
6 transformer_model.encoder.layer.0.attention.self.pos_proj.weight torch.Size([1024, 1024])
7 transformer_model.encoder.layer.0.attention.self.pos_q_proj.weight torch.Size([1024, 1024])
8 transformer_model.encoder.layer.0.attention.self.pos_q_proj.bias torch.Size([1024])
9 transformer_model.encoder.layer.0.attention.output.dense.weight torch.Size([1024, 1024])
10 transformer_model.encoder.layer.0.attention.output.dense.bias torch.Size([1024])
11 transformer_model.encoder.layer.0.attention.output.LayerNorm.weight torch.Size([1024])
12 tra

In [20]:
sample_input_ids = torch.randint(0, 1000, [8, 248])
sample_attention_mask = torch.randint(0, 1000, [8, 248])

In [21]:
sample_model(sample_input_ids, sample_attention_mask)[1].shape

torch.Size([8, 1024])

In [22]:
torch.sum(torch.randn([8, 496, 768]), axis=1)

tensor([[ 18.4460,  25.5957, -21.9767,  ...,   3.4511,  14.6272, -67.4450],
        [-21.1760,  14.9009, -18.4976,  ...,   4.7277,  36.7557, -29.4302],
        [ 61.5393,  -3.9496, -13.8055,  ..., -36.2256,   1.5399,  11.0703],
        ...,
        [-13.6829,  15.8333,  15.1891,  ...,  -4.2581, -49.4773,  15.2714],
        [  5.4886,  -6.1912,   5.1670,  ...,   5.5224,  10.3467,  20.8256],
        [-29.5128,  -9.6674, -51.0567,  ...,   2.4404,   0.4094, -15.4870]])

### Evaluation and Prediction

In [23]:
def eval_mse(model, data_loader):
    model.eval()
    mse_sum = 0
    mse_loss = nn.MSELoss(reduction='sum')
    
    with torch.no_grad():
        for batch_num, record in enumerate(data_loader):
            input_ids, attention_mask, target = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE), record['target'].to(cfg.DEVICE)
            pred, _ = model(input_ids, attention_mask)
            mse_sum += mse_loss(pred.flatten().cpu(), target.cpu())
            
    return mse_sum / len(data_loader.dataset)

In [24]:
def predict(model, data_loader):
    model.eval()
    result = []
    
    with torch.no_grad():
        for batch_num, record in tqdm(enumerate(data_loader), total=len(data_loader)):
            input_ids, attention_mask = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE)
            pred, _ = model(input_ids, attention_mask)
            result.extend(pred.flatten().to("cpu").tolist())
            
    return np.array(result)

In [25]:
sample_dl = DataLoader(sample_ds, shuffle=False, batch_size=16, num_workers=1)

### Optimizer and Sampler

In [26]:
5e-5 / 2.5, 5e-5 / 0.5, 5e-5

(2e-05, 0.0001, 5e-05)

In [27]:
def create_optimizer(model, base_lr=5e-5, last_lr=None):
    named_parameters = list(model.named_parameters())
    
    regressor_param_start = 392
    attention_param_start = 388
    roberta_parameters = named_parameters[:attention_param_start]
    attention_parameters = named_parameters[attention_param_start:regressor_param_start]
    regressor_parameters = named_parameters[regressor_param_start:]
    
    attention_group = [params for (name, params) in attention_parameters]
    regressor_group = [params for (name, params) in regressor_parameters]
    
    parameters = []
    if last_lr is not None:
        parameters.append({"params": attention_group, "lr": last_lr})
        parameters.append({"params": regressor_group, "lr": last_lr})
    else:
        parameters.append({"params": attention_group})
        parameters.append({"params": regressor_group})
    
    for layer_num, (name, params) in enumerate(roberta_parameters):
        weight_decay = 0.0 if 'bias' in name else 0.01
        
        lr = base_lr / 2.5 # 2e-05
        if layer_num >= 260:
            lr = base_lr / 0.5 # 1e-4
        elif layer_num >= 132:        
            lr = base_lr    
            
        parameters.append({"params": params,
                           "weight_decay": weight_decay,
                           "lr": lr})
        
    return AdamW(parameters)

In [28]:
sample_optimizer = create_optimizer(sample_model)

In [29]:
from torch.utils.data import Sampler,SequentialSampler,RandomSampler,SubsetRandomSampler
from collections import Counter

class WeightedSampler(Sampler):
    
    def __init__(self, dataset):
        
        self.indices = list(range(len(dataset)))
        self.num_samples = len(dataset)
        self.label_to_count = dict(Counter(dataset.bins))
        weights = [1/self.label_to_count[i] for i in dataset.bins]
        
        self.weights = torch.tensor(weights,dtype=torch.double)
        
    def __iter__(self):
        count = 0
        index = [self.indices[i] for i in torch.multinomial(self.weights, self.num_samples, replacement=True)]
        while count < self.num_samples:
            yield index[count]
            count += 1
    
    def __len__(self):
        return self.num_samples

### Training

In [30]:
def choose_eval_period(val_rmse):
    for rmse, period in cfg.EVAL_SCHEDULE:
        if val_rmse >= rmse:
            return period

In [31]:
def serialize_best(best_val_rmse, best_epoch, val_rmse, epoch, model, model_path):
    if not best_val_rmse or val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_epoch = epoch
        if not model_path.parent.exists():
            os.makedirs(model_path.parent)
        
#         torch.save(model.state_dict(), model_path)
        print(f"New best_val_rmse: {best_val_rmse:0.4}")
    else:       
        print(f"Still best_val_rmse: {best_val_rmse:0.4}",
              f"(from epoch {best_epoch})")
    return best_epoch, best_val_rmse

In [32]:
class Trainer():
    def __init__(self, scaler, model, model_path, train_loader, val_loader, optimizer, scheduler=None, num_epochs=cfg.NUM_EPOCHS):
        self.scaler, self.model, self.model_path, self.train_loader, self.val_loader, self.optimizer, self.scheduler, self.num_epochs = (
            scaler, model, model_path, train_loader, val_loader, optimizer, scheduler, num_epochs
        )
            
    def train(self):
        self.model.train()
        
        mse_loss = nn.MSELoss(reduction='mean')
        
        best_val_rmse = None
        best_epoch = 0
        step = 0
        last_eval_step = 0
        eval_period = cfg.EVAL_SCHEDULE[0][1]    

        start = time.time()
        
        tbar = tqdm(range(self.num_epochs), total=self.num_epochs)
        for epoch in tbar:
            tbar.set_description(f'Epoch: {epoch}')
            val_rmse = None
            for batch_num, record in enumerate(self.train_loader):
                input_ids, attention_mask, target = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE), record['target'].to(cfg.DEVICE)
                
                self.optimizer.zero_grad()
                
                # Casts operations to mixed precision
                with torch.cuda.amp.autocast():
                    pred, _ = self.model(input_ids, attention_mask)
                    mse = mse_loss(pred.flatten(), target)
                    
                self.scaler.scale(mse).backward()
                self.scaler.step(self.optimizer)
                self.scaler.update()
                
                if self.scheduler:
                    self.scheduler.step()
                    
                if step >= last_eval_step + eval_period:
                    elapsed_seconds = time.time() - start
                    num_steps = step - last_eval_step
                    print(f"\n{num_steps} steps took {elapsed_seconds:0.3} seconds")
                    last_eval_step = step
                    
                    val_rmse = np.sqrt(eval_mse(self.model, self.val_loader))
                    print(f"Epoch: {epoch} batch_num: {batch_num}", f"val_rmse: {val_rmse:0.4} ", end='')
                    
                    eval_period = choose_eval_period(val_rmse)
                    best_epoch, best_val_rmse = serialize_best(best_val_rmse, best_epoch, val_rmse, epoch, self.model, self.model_path)
                    start = time.time()
                # Finish early on condition
                if epoch > 0 and best_val_rmse > 0.6:
                    return best_val_rmse
                
                step += 1
        return best_val_rmse

In [33]:
kfold = KFold(n_splits=cfg.NUM_FOLDS, random_state=cfg.SEED, shuffle=True)
splits = list(kfold.split(train_df))

### Optuna

In [34]:
# Best results
# fold 0: {'base_lr': 4.214048623230046e-05, 'last_lr': 0.00098671139242345}. Best is trial 0 with value: 0.46920305490493774.
# fold 1: {'base_lr': 3.4594372607385946e-05, 'last_lr': 0.0005479134338105077}. Best is trial 0 with value: 0.447492390871048
# fold 2: {'base_lr': 1.777623134028703e-05, 'last_lr': 0.004132549020616918}. Best is trial 0 with value: 0.46756473183631897
# fold 3: {'base_lr': 3.933402254716856e-05, 'last_lr': 0.0018473297738188957}. Best is trial 11 with value: 0.4719877541065216
# fold 4: {'base_lr': 1.845975941382356e-05, 'last_lr': 0.0006309278277674714}. Best is trial 15 with value: 0.46920618414878845
# fold 5: {'base_lr': 4.430444436442592e-05, 'last_lr': 0.000289231685619846}. Best is trial 6 with value: 0.4629150927066803

In [35]:
fold = 0

def objective(trial):
    base_lr = trial.suggest_float("base_lr", 8e-6, 5e-4, log=True)
    last_lr = trial.suggest_float("last_lr", 8e-5, 5e-3, log=True)
    
    print(f'##### Using fold {fold}')
    
    model_path = cfg.MODEL_FOLDER/f"{cfg.model_name.replace('/', '_')}_{fold + 1}/model_{fold + 1}.pth"
    
    set_random_seed(cfg.SEED + fold)
    
    tokenizer = AutoTokenizer.from_pretrained(cfg.TOKENIZER_PATH)
    
    train_indices, val_indices = splits[fold]
    train_dataset = CommonLitDataset(train_df.loc[train_indices], tokenizer)    
    val_dataset = CommonLitDataset(train_df.loc[val_indices], tokenizer)
    
    train_loader = DataLoader(train_dataset, batch_size=cfg.BATCH_SIZE,
                              drop_last=False, shuffle=True, num_workers=cfg.NUM_WORKERS)    
    val_loader = DataLoader(val_dataset, batch_size=cfg.BATCH_SIZE,
                            drop_last=False, shuffle=False, num_workers=cfg.NUM_WORKERS)
    
    set_random_seed(cfg.SEED + fold)
    
    model = CommonLitModel().to(cfg.DEVICE)
    
    optimizer = create_optimizer(model, base_lr=base_lr, last_lr=last_lr)
    
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_training_steps=cfg.NUM_EPOCHS * len(train_loader), 
                                                num_warmup_steps=50)
    scaler = torch.cuda.amp.GradScaler()
    
    trainer = Trainer(scaler, model, model_path, train_loader, val_loader, optimizer, scheduler = scheduler)
    rmse_val = trainer.train()
    
    del trainer
    del model
    del tokenizer
    del scaler
    del optimizer
    del train_loader
    del val_loader
    torch.cuda.empty_cache()
    gc.collect()
    
    return rmse_val

In [36]:
%%time

for i in range(3, len(list(splits))):
    fold = i
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20)
    print(" Best value: ", study.best_trial.value)
    print(" Best params: ")
    for key, value in study.best_trial.params.items():
        print(f"    {key}: {value}")

[32m[I 2021-07-13 06:50:57,981][0m A new study created in memory with name: no-name-2440677c-11d9-43c3-9e8b-62d883d79864[0m


##### Using fold 3


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.81 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7108 New best_val_rmse: 0.7108

16 steps took 7.96 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6267 New best_val_rmse: 0.6267

16 steps took 7.97 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8061 Still best_val_rmse: 0.6267 (from epoch 0)

16 steps took 8.01 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.11 Still best_val_rmse: 0.6267 (from epoch 0)

16 steps took 7.95 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.044 Still best_val_rmse: 0.6267 (from epoch 0)

16 steps took 7.96 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.039 Still best_val_rmse: 0.6267 (from epoch 0)

16 steps took 7.94 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.048 Still best_val_rmse: 0.6267 (from epoch 0)

16 steps took 7.95 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.047 Still best_val_rmse: 0.6267 (from epoch 0)

16 steps took 7.96 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.04 Still best_val_rmse: 0.6267 (from epoch 0)


[32m[I 2021-07-13 06:54:00,256][0m Trial 0 finished with value: 0.6267367601394653 and parameters: {'base_lr': 0.0001236164359060086, 'last_lr': 0.0004392333557443599}. Best is trial 0 with value: 0.6267367601394653.[0m



##### Using fold 3


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.29 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8789 New best_val_rmse: 0.8789

16 steps took 8.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7196 New best_val_rmse: 0.7196

16 steps took 8.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7295 Still best_val_rmse: 0.7196 (from epoch 0)

16 steps took 8.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6253 New best_val_rmse: 0.6253

16 steps took 8.03 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5938 New best_val_rmse: 0.5938

16 steps took 8.02 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6007 Still best_val_rmse: 0.5938 (from epoch 0)

16 steps took 8.01 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6401 Still best_val_rmse: 0.5938 (from epoch 0)

16 steps took 8.05 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5568 New best_val_rmse: 0.5568

16 steps took 8.04 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5465 New best_val_rmse: 0.5465

16 steps took 8.31 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5353 New best_val_rmse: 0.5353

16 steps

[32m[I 2021-07-13 07:03:39,633][0m Trial 1 finished with value: 0.4918626546859741 and parameters: {'base_lr': 1.745954134846961e-05, 'last_lr': 0.001415690874499708}. Best is trial 1 with value: 0.4918626546859741.[0m



##### Using fold 3


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.29 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.002 New best_val_rmse: 1.002

16 steps took 8.03 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.017 Still best_val_rmse: 1.002 (from epoch 0)

16 steps took 8.01 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.157 Still best_val_rmse: 1.002 (from epoch 0)

16 steps took 8.01 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.084 Still best_val_rmse: 1.002 (from epoch 0)

16 steps took 7.93 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.053 Still best_val_rmse: 1.002 (from epoch 0)

16 steps took 7.92 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.047 Still best_val_rmse: 1.002 (from epoch 0)

16 steps took 7.91 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.042 Still best_val_rmse: 1.002 (from epoch 0)

16 steps took 7.92 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.052 Still best_val_rmse: 1.002 (from epoch 0)

16 steps took 7.91 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.053 Still best_val_rmse: 1.002 (from epoch 0)


[32m[I 2021-07-13 07:06:40,343][0m Trial 2 finished with value: 1.001896619796753 and parameters: {'base_lr': 0.0004378776578537877, 'last_lr': 0.003963182774909638}. Best is trial 1 with value: 0.4918626546859741.[0m



##### Using fold 3


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.59 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.11 New best_val_rmse: 1.11

16 steps took 8.02 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7107 New best_val_rmse: 0.7107

16 steps took 8.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9303 Still best_val_rmse: 0.7107 (from epoch 0)

16 steps took 8.01 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5797 New best_val_rmse: 0.5797

16 steps took 8.02 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5575 New best_val_rmse: 0.5575

16 steps took 8.06 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6202 Still best_val_rmse: 0.5575 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5301 New best_val_rmse: 0.5301

16 steps took 8.03 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5882 Still best_val_rmse: 0.5301 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5964 Still best_val_rmse: 0.5301 (from epoch 0)

16 steps took 8.37 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5049 New best_val_rmse: 0

[32m[I 2021-07-13 07:21:53,459][0m Trial 3 finished with value: 0.47842535376548767 and parameters: {'base_lr': 7.85942885541634e-05, 'last_lr': 0.0004061756428993061}. Best is trial 3 with value: 0.47842535376548767.[0m



##### Using fold 3


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.3 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.131 New best_val_rmse: 1.131

16 steps took 8.01 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7275 New best_val_rmse: 0.7275

16 steps took 8.01 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7124 New best_val_rmse: 0.7124

16 steps took 8.03 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5925 New best_val_rmse: 0.5925

16 steps took 8.03 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5741 New best_val_rmse: 0.5741

16 steps took 8.06 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5878 Still best_val_rmse: 0.5741 (from epoch 0)

16 steps took 8.04 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5621 New best_val_rmse: 0.5621

16 steps took 8.03 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5518 New best_val_rmse: 0.5518

16 steps took 8.03 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5432 New best_val_rmse: 0.5432

16 steps took 8.43 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5389 New best_val_rmse: 0.5389

16 steps took 8.04 seconds
Epoch: 1 batch_

[32m[I 2021-07-13 07:31:43,707][0m Trial 4 finished with value: 0.49199309945106506 and parameters: {'base_lr': 1.0950700468635982e-05, 'last_lr': 0.0007601420973750196}. Best is trial 3 with value: 0.47842535376548767.[0m



##### Using fold 3


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.29 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.046 New best_val_rmse: 1.046

16 steps took 8.02 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.806 New best_val_rmse: 0.806

16 steps took 8.03 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6806 New best_val_rmse: 0.6806

16 steps took 8.01 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6108 New best_val_rmse: 0.6108

16 steps took 8.03 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5608 New best_val_rmse: 0.5608

16 steps took 8.05 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5411 New best_val_rmse: 0.5411

16 steps took 8.02 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5486 Still best_val_rmse: 0.5411 (from epoch 0)

16 steps took 8.04 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.8226 Still best_val_rmse: 0.5411 (from epoch 0)

16 steps took 8.05 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.597 Still best_val_rmse: 0.5411 (from epoch 0)

16 steps took 8.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5623 Still best_val_rmse: 0.5411 (from epoc

[32m[I 2021-07-13 07:43:48,853][0m Trial 5 finished with value: 0.480339378118515 and parameters: {'base_lr': 9.561811501893819e-05, 'last_lr': 0.00021196312425399687}. Best is trial 3 with value: 0.47842535376548767.[0m



##### Using fold 3


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.28 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8781 New best_val_rmse: 0.8781

16 steps took 8.02 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6339 New best_val_rmse: 0.6339

16 steps took 8.02 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8235 Still best_val_rmse: 0.6339 (from epoch 0)

16 steps took 8.01 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5908 New best_val_rmse: 0.5908

16 steps took 8.03 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5742 New best_val_rmse: 0.5742

16 steps took 8.05 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5961 Still best_val_rmse: 0.5742 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5797 Still best_val_rmse: 0.5742 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5404 New best_val_rmse: 0.5404

16 steps took 8.03 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5426 Still best_val_rmse: 0.5404 (from epoch 0)

16 steps took 8.44 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5179 New best_val_rm

[32m[I 2021-07-13 07:56:24,859][0m Trial 6 finished with value: 0.4813295304775238 and parameters: {'base_lr': 2.5713054102170138e-05, 'last_lr': 8.00730383342181e-05}. Best is trial 3 with value: 0.47842535376548767.[0m



##### Using fold 3


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.29 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.039 New best_val_rmse: 1.039

16 steps took 8.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.655 New best_val_rmse: 0.655

16 steps took 8.01 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6001 New best_val_rmse: 0.6001

16 steps took 8.03 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5673 New best_val_rmse: 0.5673

16 steps took 8.04 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5569 New best_val_rmse: 0.5569

16 steps took 8.04 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6019 Still best_val_rmse: 0.5569 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5778 Still best_val_rmse: 0.5569 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5456 New best_val_rmse: 0.5456

16 steps took 8.02 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5305 New best_val_rmse: 0.5305

16 steps took 8.43 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5203 New best_val_rmse: 0.5203

16 steps took 8.04 seconds


[32m[I 2021-07-13 08:09:00,511][0m Trial 7 finished with value: 0.4818335473537445 and parameters: {'base_lr': 1.6074093603458293e-05, 'last_lr': 9.773433523491014e-05}. Best is trial 3 with value: 0.47842535376548767.[0m



##### Using fold 3


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.33 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.166 New best_val_rmse: 1.166

16 steps took 8.02 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7343 New best_val_rmse: 0.7343

16 steps took 8.05 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6814 New best_val_rmse: 0.6814

16 steps took 8.05 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5766 New best_val_rmse: 0.5766

16 steps took 8.06 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5623 New best_val_rmse: 0.5623

16 steps took 8.04 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5733 Still best_val_rmse: 0.5623 (from epoch 0)

16 steps took 8.05 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5609 New best_val_rmse: 0.5609

16 steps took 8.03 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5414 New best_val_rmse: 0.5414

16 steps took 8.04 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5311 New best_val_rmse: 0.5311

16 steps took 8.34 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5182 New best_val_rmse: 0.5182

16 steps took 8.04 seconds
Epoch: 1 batch

[32m[I 2021-07-13 08:20:55,487][0m Trial 8 finished with value: 0.4853384494781494 and parameters: {'base_lr': 1.2285175980980382e-05, 'last_lr': 0.00011493978232590547}. Best is trial 3 with value: 0.47842535376548767.[0m



##### Using fold 3


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.3 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8193 New best_val_rmse: 0.8193

16 steps took 8.01 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7549 New best_val_rmse: 0.7549

16 steps took 8.05 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7277 New best_val_rmse: 0.7277

16 steps took 8.02 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.636 New best_val_rmse: 0.636

16 steps took 8.04 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6004 New best_val_rmse: 0.6004

16 steps took 8.05 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6 New best_val_rmse: 0.6

16 steps took 8.03 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5124 New best_val_rmse: 0.5124

16 steps took 8.03 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.8435 Still best_val_rmse: 0.5124 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5697 Still best_val_rmse: 0.5124 (from epoch 0)

16 steps took 8.37 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5581 Still best_val_rmse: 0.5124 (from epoch 0)

16 steps took 

[32m[I 2021-07-13 08:39:05,473][0m Trial 9 finished with value: 0.4723660945892334 and parameters: {'base_lr': 4.93859406656209e-05, 'last_lr': 0.0015750094858236622}. Best is trial 9 with value: 0.4723660945892334.[0m



##### Using fold 3


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.27 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9189 New best_val_rmse: 0.9189

16 steps took 7.98 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.064 Still best_val_rmse: 0.9189 (from epoch 0)

16 steps took 7.99 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.048 Still best_val_rmse: 0.9189 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.04 Still best_val_rmse: 0.9189 (from epoch 0)

16 steps took 7.98 seconds
Epoch: 0 batch_num: 80 val_rmse: 5.768 Still best_val_rmse: 0.9189 (from epoch 0)

16 steps took 7.83 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.082 Still best_val_rmse: 0.9189 (from epoch 0)

16 steps took 7.94 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.055 Still best_val_rmse: 0.9189 (from epoch 0)

16 steps took 7.9 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.042 Still best_val_rmse: 0.9189 (from epoch 0)

16 steps took 7.93 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.052 Still best_val_rmse: 0.9189 (from epoch 0)


[32m[I 2021-07-13 08:42:06,046][0m Trial 10 finished with value: 0.9189337491989136 and parameters: {'base_lr': 0.00030188783957741804, 'last_lr': 0.004623364365297586}. Best is trial 9 with value: 0.4723660945892334.[0m



##### Using fold 3


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.26 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8689 New best_val_rmse: 0.8689

16 steps took 8.02 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7713 New best_val_rmse: 0.7713

16 steps took 8.01 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8985 Still best_val_rmse: 0.7713 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6077 New best_val_rmse: 0.6077

16 steps took 8.03 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6338 Still best_val_rmse: 0.6077 (from epoch 0)

16 steps took 8.05 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5491 New best_val_rmse: 0.5491

16 steps took 8.07 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5093 New best_val_rmse: 0.5093

16 steps took 8.06 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7782 Still best_val_rmse: 0.5093 (from epoch 0)

16 steps took 8.06 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5216 Still best_val_rmse: 0.5093 (from epoch 0)

16 steps took 8.38 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5129 Still best_val_

[32m[I 2021-07-13 09:00:34,357][0m Trial 11 finished with value: 0.4719877541065216 and parameters: {'base_lr': 3.933402254716856e-05, 'last_lr': 0.0018473297738188957}. Best is trial 11 with value: 0.4719877541065216.[0m



##### Using fold 3


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.27 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8707 New best_val_rmse: 0.8707

16 steps took 8.04 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7661 New best_val_rmse: 0.7661

16 steps took 8.03 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8035 Still best_val_rmse: 0.7661 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6049 New best_val_rmse: 0.6049

16 steps took 8.02 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6443 Still best_val_rmse: 0.6049 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5961 New best_val_rmse: 0.5961

16 steps took 8.04 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5164 New best_val_rmse: 0.5164

16 steps took 8.05 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.8776 Still best_val_rmse: 0.5164 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5193 Still best_val_rmse: 0.5164 (from epoch 0)

16 steps took 8.42 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5644 Still best_val_

[32m[I 2021-07-13 09:19:13,535][0m Trial 12 finished with value: 0.47275277972221375 and parameters: {'base_lr': 3.7865492905730714e-05, 'last_lr': 0.0020209346923263974}. Best is trial 11 with value: 0.4719877541065216.[0m



##### Using fold 3


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.31 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8109 New best_val_rmse: 0.8109

16 steps took 8.04 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7499 New best_val_rmse: 0.7499

16 steps took 8.03 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7282 New best_val_rmse: 0.7282

16 steps took 8.03 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6398 New best_val_rmse: 0.6398

16 steps took 8.05 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5808 New best_val_rmse: 0.5808

16 steps took 8.06 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.564 New best_val_rmse: 0.564

16 steps took 8.05 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5097 New best_val_rmse: 0.5097

16 steps took 8.04 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.869 Still best_val_rmse: 0.5097 (from epoch 0)

16 steps took 8.04 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5552 Still best_val_rmse: 0.5097 (from epoch 0)

16 steps took 8.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5494 Still best_val_rmse: 0.5097 (from epoch 0)

16 steps 

[32m[I 2021-07-13 09:38:02,604][0m Trial 13 finished with value: 0.47244179248809814 and parameters: {'base_lr': 4.80991294980512e-05, 'last_lr': 0.00235888450338897}. Best is trial 11 with value: 0.4719877541065216.[0m



##### Using fold 3


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.31 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8965 New best_val_rmse: 0.8965

16 steps took 8.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7775 New best_val_rmse: 0.7775

16 steps took 8.03 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7671 New best_val_rmse: 0.7671

16 steps took 8.02 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6421 New best_val_rmse: 0.6421

16 steps took 8.04 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6345 New best_val_rmse: 0.6345

16 steps took 8.03 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5466 New best_val_rmse: 0.5466

16 steps took 8.02 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5212 New best_val_rmse: 0.5212

16 steps took 8.03 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5665 Still best_val_rmse: 0.5212 (from epoch 0)

16 steps took 8.05 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5554 Still best_val_rmse: 0.5212 (from epoch 0)

16 steps took 8.37 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5163 New best_val_rmse: 0.5163

16 steps took 8.01 seco

[32m[I 2021-07-13 09:55:08,981][0m Trial 14 finished with value: 0.477120578289032 and parameters: {'base_lr': 3.375456005933041e-05, 'last_lr': 0.0010378060794420565}. Best is trial 11 with value: 0.4719877541065216.[0m



##### Using fold 3


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.33 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.0 New best_val_rmse: 1.0

16 steps took 8.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6071 New best_val_rmse: 0.6071

16 steps took 8.01 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6886 Still best_val_rmse: 0.6071 (from epoch 0)

16 steps took 8.04 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7574 Still best_val_rmse: 0.6071 (from epoch 0)

16 steps took 8.04 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6176 Still best_val_rmse: 0.6071 (from epoch 0)

16 steps took 8.04 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.598 New best_val_rmse: 0.598

16 steps took 7.98 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.042 Still best_val_rmse: 0.598 (from epoch 0)

16 steps took 7.98 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.053 Still best_val_rmse: 0.598 (from epoch 0)

16 steps took 7.97 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.055 Still best_val_rmse: 0.598 (from epoch 0)

16 steps took 8.32 seconds
Epoch: 1 batch_num: 12 val_rmse: 1.0

[32m[I 2021-07-13 10:03:44,250][0m Trial 15 finished with value: 0.5979675054550171 and parameters: {'base_lr': 0.00014562815724550082, 'last_lr': 0.002862539789708432}. Best is trial 11 with value: 0.4719877541065216.[0m



##### Using fold 3


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.31 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8459 New best_val_rmse: 0.8459

16 steps took 8.01 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6719 New best_val_rmse: 0.6719

16 steps took 8.03 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7071 Still best_val_rmse: 0.6719 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7028 Still best_val_rmse: 0.6719 (from epoch 0)

16 steps took 8.05 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.669 New best_val_rmse: 0.669

16 steps took 8.04 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5868 New best_val_rmse: 0.5868

16 steps took 8.04 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5136 New best_val_rmse: 0.5136

16 steps took 8.06 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.8133 Still best_val_rmse: 0.5136 (from epoch 0)

16 steps took 8.04 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5237 Still best_val_rmse: 0.5136 (from epoch 0)

16 steps took 8.33 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5519 Still best_val_rm

[32m[I 2021-07-13 10:22:12,498][0m Trial 16 finished with value: 0.47497743368148804 and parameters: {'base_lr': 5.949649546990705e-05, 'last_lr': 0.001394817761059422}. Best is trial 11 with value: 0.4719877541065216.[0m



##### Using fold 3


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.26 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8775 New best_val_rmse: 0.8775

16 steps took 8.02 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7235 New best_val_rmse: 0.7235

16 steps took 8.01 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8394 Still best_val_rmse: 0.7235 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6569 New best_val_rmse: 0.6569

16 steps took 8.04 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5507 New best_val_rmse: 0.5507

16 steps took 8.02 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5917 Still best_val_rmse: 0.5507 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5297 New best_val_rmse: 0.5297

16 steps took 8.04 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5411 Still best_val_rmse: 0.5297 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5638 Still best_val_rmse: 0.5297 (from epoch 0)

16 steps took 8.35 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5388 Still best_val_

[32m[I 2021-07-13 10:33:46,769][0m Trial 17 finished with value: 0.488165944814682 and parameters: {'base_lr': 2.6577087600861542e-05, 'last_lr': 0.0006555025937989686}. Best is trial 11 with value: 0.4719877541065216.[0m



##### Using fold 3


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.3 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.841 New best_val_rmse: 1.841

16 steps took 7.98 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8765 New best_val_rmse: 0.8765

16 steps took 8.01 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6342 New best_val_rmse: 0.6342

16 steps took 7.98 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.118 Still best_val_rmse: 0.6342 (from epoch 0)

16 steps took 7.92 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.121 Still best_val_rmse: 0.6342 (from epoch 0)

16 steps took 7.93 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.04 Still best_val_rmse: 0.6342 (from epoch 0)

16 steps took 7.93 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.041 Still best_val_rmse: 0.6342 (from epoch 0)

16 steps took 7.93 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.039 Still best_val_rmse: 0.6342 (from epoch 0)

16 steps took 7.94 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.036 Still best_val_rmse: 0.6342 (from epoch 0)


[32m[I 2021-07-13 10:36:49,086][0m Trial 18 finished with value: 0.6342186331748962 and parameters: {'base_lr': 0.00018188998981542552, 'last_lr': 0.0032771612277958054}. Best is trial 11 with value: 0.4719877541065216.[0m



##### Using fold 3


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.35 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.892 New best_val_rmse: 0.892

16 steps took 8.02 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6588 New best_val_rmse: 0.6588

16 steps took 8.05 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8821 Still best_val_rmse: 0.6588 (from epoch 0)

16 steps took 8.04 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6051 New best_val_rmse: 0.6051

16 steps took 8.04 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5817 New best_val_rmse: 0.5817

16 steps took 8.05 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6294 Still best_val_rmse: 0.5817 (from epoch 0)

16 steps took 8.05 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5552 New best_val_rmse: 0.5552

16 steps took 8.04 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.8564 Still best_val_rmse: 0.5552 (from epoch 0)

16 steps took 8.06 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5483 New best_val_rmse: 0.5483

16 steps took 8.36 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.562 Still best_val_rmse: 0.5483 (from e

[32m[I 2021-07-13 10:54:25,550][0m Trial 19 finished with value: 0.47603610157966614 and parameters: {'base_lr': 6.340567929223967e-05, 'last_lr': 0.0011594900524503188}. Best is trial 11 with value: 0.4719877541065216.[0m
[32m[I 2021-07-13 10:54:25,555][0m A new study created in memory with name: no-name-81c980ff-72c9-4dca-9827-5098aef4d5af[0m



 Best value:  0.4719877541065216
 Best params: 
    base_lr: 3.933402254716856e-05
    last_lr: 0.0018473297738188957
##### Using fold 4


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.31 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9356 New best_val_rmse: 0.9356

16 steps took 7.99 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6408 New best_val_rmse: 0.6408

16 steps took 8.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8461 Still best_val_rmse: 0.6408 (from epoch 0)

16 steps took 8.01 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6059 New best_val_rmse: 0.6059

16 steps took 8.05 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5731 New best_val_rmse: 0.5731

16 steps took 8.04 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7197 Still best_val_rmse: 0.5731 (from epoch 0)

16 steps took 8.06 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5938 Still best_val_rmse: 0.5731 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6765 Still best_val_rmse: 0.5731 (from epoch 0)

16 steps took 8.04 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5783 Still best_val_rmse: 0.5731 (from epoch 0)

16 steps took 8.33 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5924

[32m[I 2021-07-13 11:12:12,676][0m Trial 0 finished with value: 0.471155047416687 and parameters: {'base_lr': 3.637249863801786e-05, 'last_lr': 0.0016962627538473951}. Best is trial 0 with value: 0.471155047416687.[0m



##### Using fold 4


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.36 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.6844 New best_val_rmse: 0.6844

16 steps took 8.04 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.5989 New best_val_rmse: 0.5989

16 steps took 8.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9536 Still best_val_rmse: 0.5989 (from epoch 0)

16 steps took 8.01 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7987 Still best_val_rmse: 0.5989 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5922 New best_val_rmse: 0.5922

16 steps took 8.02 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6743 Still best_val_rmse: 0.5922 (from epoch 0)

16 steps took 8.04 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5988 Still best_val_rmse: 0.5922 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.8527 Still best_val_rmse: 0.5922 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5598 New best_val_rmse: 0.5598

16 steps took 8.36 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5837

[32m[I 2021-07-13 11:30:23,209][0m Trial 1 finished with value: 0.47269728779792786 and parameters: {'base_lr': 6.22889405760658e-05, 'last_lr': 9.249841272963643e-05}. Best is trial 0 with value: 0.471155047416687.[0m



##### Using fold 4


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.38 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8942 New best_val_rmse: 0.8942

16 steps took 7.95 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6449 New best_val_rmse: 0.6449

16 steps took 8.01 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8072 Still best_val_rmse: 0.6449 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5804 New best_val_rmse: 0.5804

16 steps took 8.03 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7197 Still best_val_rmse: 0.5804 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5962 Still best_val_rmse: 0.5804 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5877 Still best_val_rmse: 0.5804 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7855 Still best_val_rmse: 0.5804 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5589 New best_val_rmse: 0.5589

16 steps took 8.31 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.551

[32m[I 2021-07-13 11:48:21,012][0m Trial 2 finished with value: 0.4750526249408722 and parameters: {'base_lr': 2.906350701126783e-05, 'last_lr': 0.003390354303991374}. Best is trial 0 with value: 0.471155047416687.[0m



##### Using fold 4


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.41 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7562 New best_val_rmse: 0.7562

16 steps took 8.04 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6845 New best_val_rmse: 0.6845

16 steps took 8.01 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8432 Still best_val_rmse: 0.6845 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7494 Still best_val_rmse: 0.6845 (from epoch 0)

16 steps took 8.05 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6429 New best_val_rmse: 0.6429

16 steps took 8.06 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.649 Still best_val_rmse: 0.6429 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.639 New best_val_rmse: 0.639

16 steps took 8.04 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7474 Still best_val_rmse: 0.639 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5673 New best_val_rmse: 0.5673

16 steps took 8.39 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.538 New best_val_rmse: 0

[32m[I 2021-07-13 12:06:49,618][0m Trial 3 finished with value: 0.4741213619709015 and parameters: {'base_lr': 4.5210501000419023e-05, 'last_lr': 0.0001842045235576954}. Best is trial 0 with value: 0.471155047416687.[0m



##### Using fold 4


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.32 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8771 New best_val_rmse: 0.8771

16 steps took 8.01 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6802 New best_val_rmse: 0.6802

16 steps took 8.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6318 New best_val_rmse: 0.6318

16 steps took 8.02 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5564 New best_val_rmse: 0.5564

16 steps took 8.01 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5825 Still best_val_rmse: 0.5564 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7171 Still best_val_rmse: 0.5564 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5668 Still best_val_rmse: 0.5564 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7591 Still best_val_rmse: 0.5564 (from epoch 0)

16 steps took 8.01 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5441 New best_val_rmse: 0.5441

16 steps took 8.38 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5196 New best_val_rms

[32m[I 2021-07-13 12:24:57,399][0m Trial 4 finished with value: 0.4703187942504883 and parameters: {'base_lr': 1.4874639372459377e-05, 'last_lr': 0.002410173488190959}. Best is trial 4 with value: 0.4703187942504883.[0m



##### Using fold 4


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.4 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9243 New best_val_rmse: 0.9243

16 steps took 8.03 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7096 New best_val_rmse: 0.7096

16 steps took 8.03 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6871 New best_val_rmse: 0.6871

16 steps took 8.03 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5893 New best_val_rmse: 0.5893

16 steps took 8.01 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6224 Still best_val_rmse: 0.5893 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6169 Still best_val_rmse: 0.5893 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6461 Still best_val_rmse: 0.5893 (from epoch 0)

16 steps took 8.01 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7194 Still best_val_rmse: 0.5893 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5574 New best_val_rmse: 0.5574

16 steps took 8.39 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5311 New best_val_rms

[32m[I 2021-07-13 12:41:53,194][0m Trial 5 finished with value: 0.47324302792549133 and parameters: {'base_lr': 9.961501774772943e-06, 'last_lr': 0.0020107226677383645}. Best is trial 4 with value: 0.4703187942504883.[0m



##### Using fold 4


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.41 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.041 New best_val_rmse: 1.041

16 steps took 7.97 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7581 New best_val_rmse: 0.7581

16 steps took 8.02 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6738 New best_val_rmse: 0.6738

16 steps took 8.02 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7111 Still best_val_rmse: 0.6738 (from epoch 0)

16 steps took 8.05 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5683 New best_val_rmse: 0.5683

16 steps took 8.06 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5784 Still best_val_rmse: 0.5683 (from epoch 0)

16 steps took 8.04 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5815 Still best_val_rmse: 0.5683 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6035 Still best_val_rmse: 0.5683 (from epoch 0)

16 steps took 8.06 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5559 New best_val_rmse: 0.5559

16 steps took 8.31 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.528 New best_val_rmse:

[32m[I 2021-07-13 13:01:14,203][0m Trial 6 finished with value: 0.47142037749290466 and parameters: {'base_lr': 1.0399554645318021e-05, 'last_lr': 0.00011564456066425258}. Best is trial 4 with value: 0.4703187942504883.[0m



##### Using fold 4


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.35 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7031 New best_val_rmse: 0.7031

16 steps took 8.02 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.733 Still best_val_rmse: 0.7031 (from epoch 0)

16 steps took 8.0 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8097 Still best_val_rmse: 0.7031 (from epoch 0)

16 steps took 8.01 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7458 Still best_val_rmse: 0.7031 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6738 New best_val_rmse: 0.6738

16 steps took 8.01 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7253 Still best_val_rmse: 0.6738 (from epoch 0)

16 steps took 8.07 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5863 New best_val_rmse: 0.5863

16 steps took 8.03 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7838 Still best_val_rmse: 0.5863 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5812 New best_val_rmse: 0.5812

16 steps took 8.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5143 N

[32m[I 2021-07-13 13:19:33,625][0m Trial 7 finished with value: 0.4724603593349457 and parameters: {'base_lr': 4.7147362319298385e-05, 'last_lr': 0.00022188389800071334}. Best is trial 4 with value: 0.4703187942504883.[0m



##### Using fold 4


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.39 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8232 New best_val_rmse: 0.8232

16 steps took 8.03 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6651 New best_val_rmse: 0.6651

16 steps took 8.04 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6384 New best_val_rmse: 0.6384

16 steps took 8.05 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5698 New best_val_rmse: 0.5698

16 steps took 8.04 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6695 Still best_val_rmse: 0.5698 (from epoch 0)

16 steps took 8.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6021 Still best_val_rmse: 0.5698 (from epoch 0)

16 steps took 8.04 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6922 Still best_val_rmse: 0.5698 (from epoch 0)

16 steps took 8.07 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6483 Still best_val_rmse: 0.5698 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5467 New best_val_rmse: 0.5467

16 steps took 8.38 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5038 New best_val_rms

[32m[I 2021-07-13 13:37:52,280][0m Trial 8 finished with value: 0.47184598445892334 and parameters: {'base_lr': 1.970149129585611e-05, 'last_lr': 0.00100252784120404}. Best is trial 4 with value: 0.4703187942504883.[0m



##### Using fold 4


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.26 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.666 New best_val_rmse: 1.666

16 steps took 7.99 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7666 New best_val_rmse: 0.7666

16 steps took 8.03 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7597 New best_val_rmse: 0.7597

16 steps took 8.03 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6991 New best_val_rmse: 0.6991

16 steps took 8.02 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6907 New best_val_rmse: 0.6907

16 steps took 8.0 seconds
Epoch: 0 batch_num: 96 val_rmse: 4.2 Still best_val_rmse: 0.6907 (from epoch 0)

16 steps took 7.96 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.148 Still best_val_rmse: 0.6907 (from epoch 0)

16 steps took 7.92 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.034 Still best_val_rmse: 0.6907 (from epoch 0)

16 steps took 7.93 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.105 Still best_val_rmse: 0.6907 (from epoch 0)


[32m[I 2021-07-13 13:40:54,104][0m Trial 9 finished with value: 0.690746009349823 and parameters: {'base_lr': 0.0003317799849554229, 'last_lr': 0.0003281776713604505}. Best is trial 4 with value: 0.4703187942504883.[0m



##### Using fold 4


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.36 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.007 New best_val_rmse: 1.007

16 steps took 8.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6504 New best_val_rmse: 0.6504

16 steps took 7.97 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.044 Still best_val_rmse: 0.6504 (from epoch 0)

16 steps took 8.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.038 Still best_val_rmse: 0.6504 (from epoch 0)

16 steps took 7.94 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.05 Still best_val_rmse: 0.6504 (from epoch 0)

16 steps took 7.94 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.093 Still best_val_rmse: 0.6504 (from epoch 0)

16 steps took 7.93 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.048 Still best_val_rmse: 0.6504 (from epoch 0)

16 steps took 7.92 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.038 Still best_val_rmse: 0.6504 (from epoch 0)

16 steps took 7.92 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.037 Still best_val_rmse: 0.6504 (from epoch 0)


[32m[I 2021-07-13 13:43:55,451][0m Trial 10 finished with value: 0.6503852605819702 and parameters: {'base_lr': 0.0001803181764505108, 'last_lr': 0.003692075608241321}. Best is trial 4 with value: 0.4703187942504883.[0m



##### Using fold 4


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.37 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.829 New best_val_rmse: 0.829

16 steps took 8.02 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6708 New best_val_rmse: 0.6708

16 steps took 8.01 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6308 New best_val_rmse: 0.6308

16 steps took 8.05 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5644 New best_val_rmse: 0.5644

16 steps took 8.02 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5571 New best_val_rmse: 0.5571

16 steps took 8.03 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6138 Still best_val_rmse: 0.5571 (from epoch 0)

16 steps took 8.05 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5799 Still best_val_rmse: 0.5571 (from epoch 0)

16 steps took 8.05 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7296 Still best_val_rmse: 0.5571 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5496 New best_val_rmse: 0.5496

16 steps took 8.3 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5224 New best_val_rmse: 0.5224

16 steps

[32m[I 2021-07-13 14:02:46,556][0m Trial 11 finished with value: 0.4723910689353943 and parameters: {'base_lr': 1.8341141319343864e-05, 'last_lr': 0.0011650766745043514}. Best is trial 4 with value: 0.4703187942504883.[0m



##### Using fold 4


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.38 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9417 New best_val_rmse: 0.9417

16 steps took 8.04 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7311 New best_val_rmse: 0.7311

16 steps took 8.03 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7455 Still best_val_rmse: 0.7311 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7113 New best_val_rmse: 0.7113

16 steps took 8.01 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.8049 Still best_val_rmse: 0.7113 (from epoch 0)

16 steps took 7.96 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6505 New best_val_rmse: 0.6505

16 steps took 8.04 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6253 New best_val_rmse: 0.6253

16 steps took 8.05 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6892 Still best_val_rmse: 0.6253 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.664 Still best_val_rmse: 0.6253 (from epoch 0)


[32m[I 2021-07-13 14:05:48,709][0m Trial 12 finished with value: 0.6253228783607483 and parameters: {'base_lr': 9.549879790379279e-05, 'last_lr': 0.0019663403905362067}. Best is trial 4 with value: 0.4703187942504883.[0m



##### Using fold 4


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.41 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8501 New best_val_rmse: 0.8501

16 steps took 8.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6586 New best_val_rmse: 0.6586

16 steps took 8.04 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6735 Still best_val_rmse: 0.6586 (from epoch 0)

16 steps took 8.04 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5731 New best_val_rmse: 0.5731

16 steps took 8.04 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.598 Still best_val_rmse: 0.5731 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6531 Still best_val_rmse: 0.5731 (from epoch 0)

16 steps took 8.04 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5963 Still best_val_rmse: 0.5731 (from epoch 0)

16 steps took 8.01 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7161 Still best_val_rmse: 0.5731 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.556 New best_val_rmse: 0.556

16 steps took 8.38 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5045 Ne

[32m[I 2021-07-13 14:25:31,690][0m Trial 13 finished with value: 0.469761461019516 and parameters: {'base_lr': 1.755971846813911e-05, 'last_lr': 0.0006141892845557757}. Best is trial 13 with value: 0.469761461019516.[0m


Epoch: 2 batch_num: 147 val_rmse: 0.4712 Still best_val_rmse: 0.4698 (from epoch 2)

##### Using fold 4


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.35 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.052 New best_val_rmse: 1.052

16 steps took 7.96 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8371 New best_val_rmse: 0.8371

16 steps took 8.01 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6503 New best_val_rmse: 0.6503

16 steps took 8.05 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6529 Still best_val_rmse: 0.6503 (from epoch 0)

16 steps took 8.04 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5979 New best_val_rmse: 0.5979

16 steps took 8.05 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5914 New best_val_rmse: 0.5914

16 steps took 8.07 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6094 Still best_val_rmse: 0.5914 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5812 New best_val_rmse: 0.5812

16 steps took 8.04 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5705 New best_val_rmse: 0.5705

16 steps took 8.38 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5354 New best_val_rmse: 0.5354

16 steps took 8.05 secon

[32m[I 2021-07-13 14:39:01,594][0m Trial 14 finished with value: 0.47996050119400024 and parameters: {'base_lr': 8.736957650525945e-06, 'last_lr': 0.0004694326964204411}. Best is trial 13 with value: 0.469761461019516.[0m



##### Using fold 4


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.34 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8399 New best_val_rmse: 0.8399

16 steps took 8.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6516 New best_val_rmse: 0.6516

16 steps took 8.01 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7146 Still best_val_rmse: 0.6516 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5865 New best_val_rmse: 0.5865

16 steps took 8.04 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6733 Still best_val_rmse: 0.5865 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6069 Still best_val_rmse: 0.5865 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6322 Still best_val_rmse: 0.5865 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7055 Still best_val_rmse: 0.5865 (from epoch 0)

16 steps took 8.05 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5523 New best_val_rmse: 0.5523

16 steps took 8.37 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5052

[32m[I 2021-07-13 15:00:27,732][0m Trial 15 finished with value: 0.46920618414878845 and parameters: {'base_lr': 1.845975941382356e-05, 'last_lr': 0.0006309278277674714}. Best is trial 15 with value: 0.46920618414878845.[0m


Epoch: 2 batch_num: 147 val_rmse: 0.4705 Still best_val_rmse: 0.4692 (from epoch 2)

##### Using fold 4


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.34 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8327 New best_val_rmse: 0.8327

16 steps took 8.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6397 New best_val_rmse: 0.6397

16 steps took 8.05 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7712 Still best_val_rmse: 0.6397 (from epoch 0)

16 steps took 8.05 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5924 New best_val_rmse: 0.5924

16 steps took 8.06 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5603 New best_val_rmse: 0.5603

16 steps took 8.05 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7454 Still best_val_rmse: 0.5603 (from epoch 0)

16 steps took 8.07 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5352 New best_val_rmse: 0.5352

16 steps took 8.04 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6057 Still best_val_rmse: 0.5352 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5539 Still best_val_rmse: 0.5352 (from epoch 0)

16 steps took 8.37 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5806 Still best_val_r

[32m[I 2021-07-13 15:19:28,960][0m Trial 16 finished with value: 0.4699382781982422 and parameters: {'base_lr': 2.4833933515823515e-05, 'last_lr': 0.0007117616210852127}. Best is trial 15 with value: 0.46920618414878845.[0m



##### Using fold 4


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.34 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.052 New best_val_rmse: 1.052

16 steps took 7.98 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6398 New best_val_rmse: 0.6398

16 steps took 8.01 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9464 Still best_val_rmse: 0.6398 (from epoch 0)

16 steps took 8.01 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7123 Still best_val_rmse: 0.6398 (from epoch 0)

16 steps took 8.01 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6794 Still best_val_rmse: 0.6398 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6726 Still best_val_rmse: 0.6398 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7121 Still best_val_rmse: 0.6398 (from epoch 0)

16 steps took 8.05 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6967 Still best_val_rmse: 0.6398 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6503 Still best_val_rmse: 0.6398 (from epoch 0)


[32m[I 2021-07-13 15:22:32,226][0m Trial 17 finished with value: 0.6398378014564514 and parameters: {'base_lr': 0.00011914266709867103, 'last_lr': 0.0005277625982160621}. Best is trial 15 with value: 0.46920618414878845.[0m



##### Using fold 4


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.37 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9714 New best_val_rmse: 0.9714

16 steps took 8.01 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7257 New best_val_rmse: 0.7257

16 steps took 8.04 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6301 New best_val_rmse: 0.6301

16 steps took 8.06 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6015 New best_val_rmse: 0.6015

16 steps took 8.04 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5762 New best_val_rmse: 0.5762

16 steps took 8.05 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5876 Still best_val_rmse: 0.5762 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6596 Still best_val_rmse: 0.5762 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6537 Still best_val_rmse: 0.5762 (from epoch 0)

16 steps took 8.07 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5619 New best_val_rmse: 0.5619

16 steps took 8.33 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5117 New best_val_rmse: 0.5117

16 st

[32m[I 2021-07-13 15:41:54,521][0m Trial 18 finished with value: 0.4710596203804016 and parameters: {'base_lr': 1.1878531343620975e-05, 'last_lr': 0.00035396203468292405}. Best is trial 15 with value: 0.46920618414878845.[0m



##### Using fold 4


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.29 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8745 New best_val_rmse: 0.8745

16 steps took 8.03 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6961 New best_val_rmse: 0.6961

16 steps took 8.01 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6701 New best_val_rmse: 0.6701

16 steps took 8.03 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5687 New best_val_rmse: 0.5687

16 steps took 8.06 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6342 Still best_val_rmse: 0.5687 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6058 Still best_val_rmse: 0.5687 (from epoch 0)

16 steps took 8.06 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6632 Still best_val_rmse: 0.5687 (from epoch 0)

16 steps took 8.04 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.671 Still best_val_rmse: 0.5687 (from epoch 0)

16 steps took 8.04 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5496 New best_val_rmse: 0.5496

16 steps took 8.38 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5156 New best_val_rms

[32m[I 2021-07-13 15:59:01,034][0m Trial 19 finished with value: 0.4756593704223633 and parameters: {'base_lr': 1.5255093672945908e-05, 'last_lr': 0.0008108752119000499}. Best is trial 15 with value: 0.46920618414878845.[0m
[32m[I 2021-07-13 15:59:01,037][0m A new study created in memory with name: no-name-a893385a-6b86-4d58-8c06-3611fd571506[0m



 Best value:  0.46920618414878845
 Best params: 
    base_lr: 1.845975941382356e-05
    last_lr: 0.0006309278277674714
##### Using fold 5


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.28 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8557 New best_val_rmse: 0.8557

16 steps took 8.02 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6837 New best_val_rmse: 0.6837

16 steps took 8.02 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6163 New best_val_rmse: 0.6163

16 steps took 8.01 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.566 New best_val_rmse: 0.566

16 steps took 8.03 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5588 New best_val_rmse: 0.5588

16 steps took 8.04 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6468 Still best_val_rmse: 0.5588 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5589 Still best_val_rmse: 0.5588 (from epoch 0)

16 steps took 8.05 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.578 Still best_val_rmse: 0.5588 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5499 New best_val_rmse: 0.5499

16 steps took 8.33 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5383 New best_val_rmse: 0.5383

16 steps

[32m[I 2021-07-13 16:30:18,907][0m Trial 0 finished with value: 0.465007483959198 and parameters: {'base_lr': 9.312465436613192e-06, 'last_lr': 0.0029042699025681466}. Best is trial 0 with value: 0.465007483959198.[0m


Epoch: 2 batch_num: 147 val_rmse: 0.4671 Still best_val_rmse: 0.465 (from epoch 2)

##### Using fold 5


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.36 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9079 New best_val_rmse: 0.9079

16 steps took 8.02 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8916 New best_val_rmse: 0.8916

16 steps took 8.01 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6242 New best_val_rmse: 0.6242

16 steps took 8.02 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7972 Still best_val_rmse: 0.6242 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6644 Still best_val_rmse: 0.6242 (from epoch 0)

16 steps took 8.05 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7782 Still best_val_rmse: 0.6242 (from epoch 0)

16 steps took 8.01 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6638 Still best_val_rmse: 0.6242 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7407 Still best_val_rmse: 0.6242 (from epoch 0)

16 steps took 8.0 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.7381 Still best_val_rmse: 0.6242 (from epoch 0)


[32m[I 2021-07-13 16:33:20,817][0m Trial 1 finished with value: 0.6241759061813354 and parameters: {'base_lr': 0.00016555675424943537, 'last_lr': 0.0005712885346356431}. Best is trial 0 with value: 0.465007483959198.[0m



##### Using fold 5


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.32 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7645 New best_val_rmse: 0.7645

16 steps took 8.02 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6515 New best_val_rmse: 0.6515

16 steps took 8.04 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6742 Still best_val_rmse: 0.6515 (from epoch 0)

16 steps took 8.01 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.594 New best_val_rmse: 0.594

16 steps took 8.04 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5645 New best_val_rmse: 0.5645

16 steps took 8.01 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5809 Still best_val_rmse: 0.5645 (from epoch 0)

16 steps took 8.07 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5695 Still best_val_rmse: 0.5645 (from epoch 0)

16 steps took 8.05 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.611 Still best_val_rmse: 0.5645 (from epoch 0)

16 steps took 8.04 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5151 New best_val_rmse: 0.5151

16 steps took 8.36 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5037 New best_val_rmse:

[32m[I 2021-07-13 17:00:19,615][0m Trial 2 finished with value: 0.46307051181793213 and parameters: {'base_lr': 3.3743909778302605e-05, 'last_lr': 0.0005121870072732148}. Best is trial 2 with value: 0.46307051181793213.[0m


Epoch: 2 batch_num: 147 val_rmse: 0.4631 Still best_val_rmse: 0.4631 (from epoch 2)

##### Using fold 5


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.23 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.008 New best_val_rmse: 1.008

16 steps took 7.99 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.346 Still best_val_rmse: 1.008 (from epoch 0)

16 steps took 8.03 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.067 Still best_val_rmse: 1.008 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8383 New best_val_rmse: 0.8383

16 steps took 8.02 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.903 Still best_val_rmse: 0.8383 (from epoch 0)

16 steps took 7.99 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.025 Still best_val_rmse: 0.8383 (from epoch 0)

16 steps took 7.92 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.017 Still best_val_rmse: 0.8383 (from epoch 0)

16 steps took 7.91 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.032 Still best_val_rmse: 0.8383 (from epoch 0)

16 steps took 7.95 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.02 Still best_val_rmse: 0.8383 (from epoch 0)


[32m[I 2021-07-13 17:03:21,235][0m Trial 3 finished with value: 0.8382678031921387 and parameters: {'base_lr': 0.00033442959232821184, 'last_lr': 0.0003110821446659171}. Best is trial 2 with value: 0.46307051181793213.[0m



##### Using fold 5


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.34 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7787 New best_val_rmse: 0.7787

16 steps took 8.01 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6617 New best_val_rmse: 0.6617

16 steps took 8.02 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6232 New best_val_rmse: 0.6232

16 steps took 8.06 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6554 Still best_val_rmse: 0.6232 (from epoch 0)

16 steps took 8.05 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.613 New best_val_rmse: 0.613

16 steps took 8.04 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.565 New best_val_rmse: 0.565

16 steps took 8.02 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5412 New best_val_rmse: 0.5412

16 steps took 8.03 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6958 Still best_val_rmse: 0.5412 (from epoch 0)

16 steps took 8.04 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.609 Still best_val_rmse: 0.5412 (from epoch 0)

16 steps took 8.33 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5406 New best_val_rmse: 0.5406

16 steps t

[32m[I 2021-07-13 17:32:13,569][0m Trial 4 finished with value: 0.4635846018791199 and parameters: {'base_lr': 2.967895929001887e-05, 'last_lr': 0.0030633417185138667}. Best is trial 2 with value: 0.46307051181793213.[0m


Epoch: 2 batch_num: 147 val_rmse: 0.4638 Still best_val_rmse: 0.4636 (from epoch 2)

##### Using fold 5


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.27 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9965 New best_val_rmse: 0.9965

16 steps took 8.03 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.213 Still best_val_rmse: 0.9965 (from epoch 0)

16 steps took 8.01 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.093 Still best_val_rmse: 0.9965 (from epoch 0)

16 steps took 7.94 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.076 Still best_val_rmse: 0.9965 (from epoch 0)

16 steps took 7.93 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.036 Still best_val_rmse: 0.9965 (from epoch 0)

16 steps took 7.94 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.02 Still best_val_rmse: 0.9965 (from epoch 0)

16 steps took 7.94 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.044 Still best_val_rmse: 0.9965 (from epoch 0)

16 steps took 7.94 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.019 Still best_val_rmse: 0.9965 (from epoch 0)

16 steps took 7.94 seconds
Epoch: 0 batch_num: 144 val_rmse: 1.026 Still best_val_rmse: 0.9965 (from epoch 0)


[32m[I 2021-07-13 17:35:14,695][0m Trial 5 finished with value: 0.9964724779129028 and parameters: {'base_lr': 0.0004485841895433537, 'last_lr': 0.0003495453706911383}. Best is trial 2 with value: 0.46307051181793213.[0m



##### Using fold 5


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.32 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8116 New best_val_rmse: 0.8116

16 steps took 8.03 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6713 New best_val_rmse: 0.6713

16 steps took 8.03 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6413 New best_val_rmse: 0.6413

16 steps took 8.03 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5976 New best_val_rmse: 0.5976

16 steps took 8.04 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6084 Still best_val_rmse: 0.5976 (from epoch 0)

16 steps took 8.05 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5541 New best_val_rmse: 0.5541

16 steps took 8.01 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6146 Still best_val_rmse: 0.5541 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5835 Still best_val_rmse: 0.5541 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5376 New best_val_rmse: 0.5376

16 steps took 8.37 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5577 Still best_val_rmse: 0.5376 (fro

[32m[I 2021-07-13 18:03:14,933][0m Trial 6 finished with value: 0.4629150927066803 and parameters: {'base_lr': 4.430444436442592e-05, 'last_lr': 0.000289231685619846}. Best is trial 6 with value: 0.4629150927066803.[0m


Epoch: 2 batch_num: 147 val_rmse: 0.4654 Still best_val_rmse: 0.4629 (from epoch 2)

##### Using fold 5


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.33 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7818 New best_val_rmse: 0.7818

16 steps took 7.99 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7 New best_val_rmse: 0.7

16 steps took 8.02 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6136 New best_val_rmse: 0.6136

16 steps took 8.0 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.845 Still best_val_rmse: 0.6136 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6006 New best_val_rmse: 0.6006

16 steps took 8.05 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7325 Still best_val_rmse: 0.6006 (from epoch 0)

16 steps took 8.02 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6362 Still best_val_rmse: 0.6006 (from epoch 0)

16 steps took 8.01 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6959 Still best_val_rmse: 0.6006 (from epoch 0)

16 steps took 8.06 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.7211 Still best_val_rmse: 0.6006 (from epoch 0)


[32m[I 2021-07-13 18:06:17,718][0m Trial 7 finished with value: 0.6005736589431763 and parameters: {'base_lr': 9.6674926540802e-05, 'last_lr': 0.00010231346059528046}. Best is trial 6 with value: 0.4629150927066803.[0m



##### Using fold 5


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.3 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8044 New best_val_rmse: 0.8044

16 steps took 8.0 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6903 New best_val_rmse: 0.6903

16 steps took 8.02 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6092 New best_val_rmse: 0.6092

16 steps took 8.02 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5856 New best_val_rmse: 0.5856

16 steps took 8.06 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5966 Still best_val_rmse: 0.5856 (from epoch 0)

16 steps took 8.04 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6428 Still best_val_rmse: 0.5856 (from epoch 0)

16 steps took 8.01 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5922 Still best_val_rmse: 0.5856 (from epoch 0)

16 steps took 8.06 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.607 Still best_val_rmse: 0.5856 (from epoch 0)

16 steps took 8.04 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5705 New best_val_rmse: 0.5705

16 steps took 8.4 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5374 New best_val_rmse: 

[32m[I 2021-07-13 18:29:06,335][0m Trial 8 finished with value: 0.466734915971756 and parameters: {'base_lr': 9.146832246693641e-06, 'last_lr': 0.004898014673940014}. Best is trial 6 with value: 0.4629150927066803.[0m


Epoch: 2 batch_num: 147 val_rmse: 0.4673 Still best_val_rmse: 0.4667 (from epoch 2)

##### Using fold 5


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'config', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 9.26 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.005 New best_val_rmse: 1.005

16 steps took 8.03 seconds



KeyboardInterrupt: 

### Verify the model

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm

In [None]:
cfg.model_offset = 0
cfg.model_limit = 6
cfg.n_folds = 5
cfg.svm_kernels = ['rbf']
cfg.svm_c = 5

In [None]:
num_bins = int(np.ceil(np.log2(len(train_df))))
train_df['bins'] = pd.cut(train_df['target'], bins=num_bins, labels=False)
bins = train_df['bins'].values

In [None]:
%%time

inference_models = []
for i in range(1, cfg.NUM_FOLDS + 1):
    print(f'Model {i}')
    inference_model = CommonLitModel()
    inference_model = inference_model.cuda()
    inference_model.load_state_dict(torch.load(str(MODELS_PATH/f"{cfg.model_name.replace('/', '_')}_{i}/model_{i}.pth")))
    inference_model.eval();
    inference_models.append(inference_model)

In [None]:
from transformers import RobertaTokenizer

tokenizers = []
for i in range(1, cfg.NUM_FOLDS):
    tokenizer = RobertaTokenizer.from_pretrained(MODELS_PATH/f"{cfg.model_name.replace('/', '_')}_{i}")
    tokenizers.append(tokenizer)

In [None]:
def get_cls_embeddings(dl, transformer_model):
    cls_embeddings = []
    with torch.no_grad():
        for input_features in tqdm(dl, total=len(dl)):
            output, context_vector = transformer_model(input_features['input_ids'].cuda(), input_features['attention_mask'].cuda())
#             cls_embeddings.extend(output['last_hidden_state'][:,0,:].detach().cpu().numpy())
            embedding_out = context_vector.detach().cpu().numpy()
            cls_embeddings.extend(embedding_out)
    return np.array(cls_embeddings)

In [None]:
def rmse_score(X, y):
    return np.sqrt(mean_squared_error(X, y))

In [None]:
def convert_to_list(t):
    return t.flatten().long()

class CommonLitDataset(nn.Module):
    def __init__(self, text, test_id, tokenizer, max_len=128):
        self.excerpt = text
        self.test_id = test_id
        self.max_len = max_len
        self.tokenizer = tokenizer
    
    def __getitem__(self,idx):
        encode = self.tokenizer(self.excerpt[idx],
                                return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',
                                truncation=True)
        return {'input_ids': convert_to_list(encode['input_ids']),
                'attention_mask': convert_to_list(encode['attention_mask']),
                'id': self.test_id[idx]}
    
    def __len__(self):
        return len(self.excerpt)

In [None]:
def create_dl(df, tokenizer):
    text = df['excerpt'].values
    ids = df['id'].values
    ds = CommonLitDataset(text, ids, tokenizer, max_len=cfg.MAX_LEN)
    return DataLoader(ds, 
                      batch_size = cfg.BATCH_SIZE,
                      shuffle=False,
                      num_workers = 1,
                      pin_memory=True,
                      drop_last=False
                     )

In [None]:
train_df = pd.read_csv(DATA_PATH/'train-orig.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')
remove_unnecessary(train_df)

In [None]:
train_target_mean = train_df['target'].mean()
train_target_std = train_df['target'].std()
train_df['normalized_target'] = (train_df['target'] - train_target_mean) / train_target_std

In [None]:
%%time

train_target = train_df['normalized_target'].values

def calc_mean(scores):
    return np.mean(np.array(scores), axis=0)

final_scores = []
final_rmse = []
kernel_rmse_score_mean = []
final_kernel_predictions_means = []
for j, (inference_model, tokenizer) in enumerate(zip(inference_models, tokenizers)):
    print('Model', j)
    test_dl = create_dl(test_df, tokenizer)
    train_dl = create_dl(train_df, tokenizer)
    transformer_model = inference_model
    transformer_model.cuda()
    X = get_cls_embeddings(train_dl, transformer_model)
    
    y = train_target
    X_test = get_cls_embeddings(test_dl, transformer_model)
    
    kfold = StratifiedKFold(n_splits=cfg.NUM_FOLDS)
    scores = []
    rmse_scores = []
    kernel_predictions_means = []
    for kernel in cfg.svm_kernels:
        print('Kernel', kernel)
        kernel_scores = []
        kernel_rmse_scores = []
        kernel_predictions = []
        for k, (train_idx, valid_idx) in enumerate(kfold.split(X, bins)):

            print('Fold', k, train_idx.shape, valid_idx.shape)
            model = SVR(C=cfg.svm_c, kernel=kernel, gamma='auto')

            X_train, y_train = X[train_idx], y[train_idx]
            X_valid, y_valid = X[valid_idx], y[valid_idx]
            model.fit(X_train, y_train)
            prediction = model.predict(X_valid)
            kernel_predictions.append(prediction)
            kernel_rmse_scores.append(rmse_score(prediction, y_valid))
            print('rmse_score', kernel_rmse_scores[k])
            kernel_scores.append(model.predict(X_test))
        kernel_predictions_means.append(np.array([np.mean(kp) for kp in kernel_predictions]).mean())
        scores.append(calc_mean(kernel_scores))
        kernel_rmse_score = calc_mean(kernel_rmse_scores)
        kernel_rmse_score_mean.append(kernel_rmse_score)
        rmse_scores.append(kernel_rmse_score)
    final_kernel_predictions_means.append(kernel_predictions_means)
    final_scores.append(calc_mean(scores))
    final_rmse.append(calc_mean(rmse_scores))
print('FINAL RMSE score', np.mean(np.array(final_rmse)))

In [None]:
final_kernel_predictions_means

In [None]:
# (train_df['target'] - cfg.train_target_mean) / cfg.train_target_std
final_scores_normalized = np.array(final_scores) * train_target_std + train_target_mean

In [None]:
kernel_rmse_score_mean_array = np.array(kernel_rmse_score_mean)
kernel_rmse_score_mean_sum = np.sum(kernel_rmse_score_mean_array)
prop_losses = kernel_rmse_score_mean_array / kernel_rmse_score_mean_sum
prop_losses_sum = (1 - prop_losses).sum()
weights = (1 - prop_losses) / prop_losses_sum
weights

In [None]:
def calc_mean(scores, weights=weights):
    return np.average(np.array(scores), weights=weights, axis=0)

In [None]:
target_mean = train_df['target'].mean()
final_scores_flat = calc_mean(final_scores_normalized).flatten()
final_scores_mean = final_scores_flat.mean()
target_mean, np.array(final_scores_normalized).mean()
# (-0.9579984513405823, -0.8029817438292849)

In [None]:
final_scores_flat

In [None]:
mean_diff = target_mean - final_scores_mean
mean_diff, mean_diff / len(final_scores)

In [None]:
sample_df['target'] = final_scores_flat + mean_diff
# sample_df['target'] = len(final_scores) / np.sum(1 / np.array(final_scores), axis=0) # harmonic mean
sample_df

### Prepare Packaging

In [None]:
cfg.model_name

In [None]:
BEST_MODEL_FOLDER = MODELS_PATH/cfg.model_name/'best'
!rm -rf {BEST_MODEL_FOLDER}
!mkdir -p {BEST_MODEL_FOLDER}

In [None]:
BEST_MODEL_FOLDER

In [None]:
cfg.NUM_FOLDS

In [None]:
bestmodels = [MODELS_PATH/f'{cfg.model_name}_{i + 1}' for i in range(0, cfg.NUM_FOLDS)]

In [None]:
bestmodels

In [None]:
from shutil import copyfile

def normalize_name(path_name):
    return path_name.replace('', '')

for i, best_model in enumerate(bestmodels):
    print(f'Processing {i}th model')
    i = i + 1
    best_model_file = f'{best_model}/model_{i}.pth'
    if Path(best_model_file).exists():
        copyfile(best_model_file, f'{BEST_MODEL_FOLDER}/{i}_pytorch_model.bin')
        tokenizer_path = Path(BEST_MODEL_FOLDER/f'tokenizer-{i}')
        tokenizer_path.mkdir(parents=True, exist_ok=True)
        assert tokenizer_path.exists()

        tokenizer_json = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/tokenizer_config.json'))
        assert tokenizer_json.exists(), f'{tokenizer_json} does not exist'
        copyfile(tokenizer_json, tokenizer_path/'tokenizer.json')

        vocab_txt = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/vocab.json'))
        assert vocab_txt.exists(), f'{vocab_txt} does not exist'
        copyfile(vocab_txt, tokenizer_path/'vocab.json')

        merges = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/merges.txt'))
        assert merges.exists()
        copyfile(merges, tokenizer_path/'merges.txt')
    else:
        print(f'{best_model_file} is missing')

In [None]:
import shutil

shutil.make_archive(MODELS_PATH/cfg.model_name/'best_models', 'zip', BEST_MODEL_FOLDER)

In [None]:
!ls {MODELS_PATH/cfg.model_name}

In [None]:
!mv {MODELS_PATH}/{cfg.model_name}.yaml {MODELS_PATH/cfg.model_name}

In [None]:
transformer_model.transformer_model.save_pretrained(save_directory=f'{MODELS_PATH/cfg.model_name}/lm')

In [None]:
!du -h {MODELS_PATH/cfg.model_name}/*

In [None]:
shutil.make_archive(MODELS_PATH/cfg.model_name/'lm', 'zip', f'{MODELS_PATH/cfg.model_name}/lm')

In [None]:
!kaggle datasets init -p {MODELS_PATH/cfg.model_name}

In [None]:
dataset_json_path = Path(MODELS_PATH/cfg.model_name/'dataset-metadata.json')
assert dataset_json_path.exists()

In [None]:
!cat {str(dataset_json_path)}

In [None]:
with open(dataset_json_path, 'r') as f:
    dataset_json = f.read()
    dataset_json = dataset_json.replace('INSERT_TITLE_HERE', f'commonlit-{cfg.model_name}-light').replace('INSERT_SLUG_HERE', f'commonlit-{cfg.model_name}-light')
    print(dataset_json)
with(open(dataset_json_path, 'w')) as f:
    f.write(dataset_json)

In [None]:
!rm -rf {MODELS_PATH/cfg.model_name}/best
!rm -rf {MODELS_PATH/cfg.model_name}/lm

In [None]:
!kaggle datasets create -p {MODELS_PATH/cfg.model_name}

In [None]:
!kaggle datasets version -p {MODELS_PATH/cfg.model_name} -m "Version with merges.txt" -d

In [None]:
state_dict = torch.load(str(MODELS_PATH/f'distilroberta-0/checkpoint-105/pytorch_model.bin'))

In [None]:
loaded_model = CommonLitModel()

In [None]:
loaded_model.load_state_dict(state_dict)