In [None]:
# !pip install optuna

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import gc, warnings, random, time, os

from pathlib import Path

from tqdm.notebook import tqdm

warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from transformers import AutoModel, AutoTokenizer, AutoConfig
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import seaborn as sns

import gc
gc.enable()

import optuna

### Folders and Dataframes

In [2]:
DATA_PATH = Path('/home/commonlit/data/')
assert DATA_PATH.exists()
MODELS_PATH = Path('/home/commonlit/models/')
if not MODELS_PATH.exists():
    os.mkdir(MODELS_PATH)
assert MODELS_PATH.exists()

In [3]:
train_df = pd.read_csv(DATA_PATH/'train-orig.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')
sample_df = pd.read_csv(DATA_PATH/'sample_submission.csv')

In [4]:
def remove_unnecessary(df):
    df.drop(df[df['target'] == 0].index, inplace=True)
    df.reset_index(drop=True, inplace=True)
    
remove_unnecessary(train_df)

In [5]:
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845
...,...,...,...,...,...,...
2828,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2829,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2830,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2831,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


### Config and Seeding

In [6]:
class Config(): 
    NUM_FOLDS = 6
    NUM_EPOCHS = 3
    BATCH_SIZE = 16
    MAX_LEN = 248
    EVAL_SCHEDULE = [(0.50, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1., 1)]
    MODEL_PATH = 'xlnet-large-cased'
    TOKENIZER_PATH = 'xlnet-large-cased'
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
#     DEVICE = "cpu"
    SEED = 1000
    NUM_WORKERS = 2
    MODEL_FOLDER = MODELS_PATH
    model_name = 'xlnet-large-cased'
    svm_kernels = ['rbf']
    svm_c = 5

cfg = Config()

In [7]:
if not cfg.MODEL_FOLDER.exists():
    os.mkdir(cfg.MODEL_FOLDER)

In [8]:
def set_random_seed(random_seed):
    random.seed(random_seed)
    np.random.seed(random_seed)
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    torch.backends.cudnn.deterministic = True

### Dataset

In [9]:
def add_bins(train_df, num_bins):
    train_df.loc[:, 'bins'] = pd.cut(train_df['target'], bins=num_bins, labels=False)
    return num_bins

In [10]:
add_bins(train_df, cfg.NUM_FOLDS)

6

In [11]:
train_df.groupby(['bins'])['target'].agg(['count', 'mean'])

Unnamed: 0_level_0,count,mean
bins,Unnamed: 1_level_1,Unnamed: 2_level_1
0,122,-3.125765
1,441,-2.270279
2,784,-1.41215
3,886,-0.548095
4,494,0.289716
5,106,1.070237


In [12]:
tokenizer = AutoTokenizer.from_pretrained(cfg.TOKENIZER_PATH)

In [13]:
with open('../data/tokenizer.vocab.txt', 'w') as f:
    for k, v in tokenizer.vocab.items():
        f.write(f'{k}: {v}\n')

In [14]:
pad_token = '______'

In [15]:
class CommonLitDataset(Dataset):
    def __init__(self, df, tokenizer, inference_only=False):
        super().__init__()
        self.df, self.inference_only = df, inference_only
        self.text = df['excerpt'].tolist()
        self.bins = df['bins']
        if not inference_only:
            self.target = torch.tensor(df['target'].to_numpy(), dtype = torch.float32)
        
#         tokenizer.add_special_tokens({'pad_token': pad_token})
#         assert tokenizer.pad_token == pad_token
        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding = 'max_length',
            max_length = cfg.MAX_LEN,
            truncation = True,
            return_attention_mask=True
        )
        self.tokenizer = tokenizer
        
    def __getitem__(self, index):        
        input_ids = torch.tensor(self.encoded['input_ids'][index])
        attention_mask = torch.tensor(self.encoded['attention_mask'][index])
        
        if self.inference_only:
            return {'input_ids': input_ids, 'attention_mask': attention_mask}
        else:
            target = self.target[index]
            return {'input_ids': input_ids, 'attention_mask': attention_mask, 'target': target}
    
    def __len__(self):
        return len(self.df)

In [16]:
sample_ds = CommonLitDataset(train_df, tokenizer)

### Model

In [17]:
class AttentionHead(nn.Module):
    
    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        
        self.hidden_layer = nn.Linear(in_features, hidden_dim)
        self.final_layer = nn.Linear(hidden_dim, num_targets)
        self.out_features = hidden_dim
        
    def forward(self, features):
        att = torch.tanh(self.hidden_layer(features))
        score = self.final_layer(att)
        attention_weights = torch.softmax(score, dim=1)
        return attention_weights

In [18]:
config = AutoConfig.from_pretrained(cfg.MODEL_PATH)

In [19]:
config.vocab_size, tokenizer.vocab_size

(32000, 32000)

In [20]:
from transformers import AutoModelForSequenceClassification

class CommonLitModel(nn.Module):
    def __init__(self):
        super(CommonLitModel, self).__init__()
        config = AutoConfig.from_pretrained(cfg.MODEL_PATH)
        config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7
        })
        self.transformer_model = AutoModelForSequenceClassification.from_pretrained(cfg.MODEL_PATH, config=config)
        self.attention = AttentionHead(config.hidden_size, 512, 1)
        self.regressor = nn.Linear(config.hidden_size, 1)
    
    def forward(self, input_ids, attention_mask):
        hidden_states = self.transformer_model(input_ids=input_ids, attention_mask=attention_mask)['hidden_states']
        last_layer_hidden_states = hidden_states[-1]
        weights = self.attention(last_layer_hidden_states)
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1) 
        return self.regressor(context_vector), context_vector

In [21]:
sample_model = CommonLitModel()

Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.bias', 'logits_proj.weight', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

In [22]:
import re

for i, (name, param) in enumerate(sample_model.named_parameters()):
    if(name.find('layer') > -1):
        layer_name = re.sub(r'.+(layer\.\d+).+', r'\1', name)

In [23]:
for i, (name, param) in enumerate(sample_model.named_parameters()):
    print(i, name, param.size())

0 transformer_model.transformer.mask_emb torch.Size([1, 1, 1024])
1 transformer_model.transformer.word_embedding.weight torch.Size([32000, 1024])
2 transformer_model.transformer.layer.0.rel_attn.q torch.Size([1024, 16, 64])
3 transformer_model.transformer.layer.0.rel_attn.k torch.Size([1024, 16, 64])
4 transformer_model.transformer.layer.0.rel_attn.v torch.Size([1024, 16, 64])
5 transformer_model.transformer.layer.0.rel_attn.o torch.Size([1024, 16, 64])
6 transformer_model.transformer.layer.0.rel_attn.r torch.Size([1024, 16, 64])
7 transformer_model.transformer.layer.0.rel_attn.r_r_bias torch.Size([16, 64])
8 transformer_model.transformer.layer.0.rel_attn.r_s_bias torch.Size([16, 64])
9 transformer_model.transformer.layer.0.rel_attn.r_w_bias torch.Size([16, 64])
10 transformer_model.transformer.layer.0.rel_attn.seg_embed torch.Size([2, 16, 64])
11 transformer_model.transformer.layer.0.rel_attn.layer_norm.weight torch.Size([1024])
12 transformer_model.transformer.layer.0.rel_attn.layer_

In [24]:
# sample_input_ids = torch.randint(0, 1000, [2, 248])
# sample_attention_mask = torch.randint(0, 1000, [2, 248])

In [25]:
sample_records = [sample_ds[i] for i in range(2)]

In [26]:
sample_records[0].keys()

dict_keys(['input_ids', 'attention_mask', 'target'])

In [27]:
sample_input_ids = torch.stack([r['input_ids'] for r in sample_records])
sample_attention_mask = torch.stack([r['attention_mask'] for r in sample_records])

In [28]:
sample_input_ids.shape, sample_attention_mask.shape

(torch.Size([2, 248]), torch.Size([2, 248]))

In [29]:
sample_input_ids

tensor([[    5,     5,     5,     5,     5,     5,     5,     5,     5,     5,
             5,     5,     5,     5,     5,     5,     5,     5,   311,    18,
           673,   104,  1061,    22,    18, 24843,    19,    36,  2037,    24,
           969,   111,  1318,  2295,     9,  2833,    20,    48,  3831,  2033,
            19,    36,    30,    24,  2403,  4525,     9,    32,  1331,    30,
          1972,    33,  2789,    13,  7876, 11330,    19,    50,  3514,    31,
         13337,    19,    57,    17, 21696,   782,    95,  9717,  3716,    21,
          5998,  6834,    23,    19,   115,    24,   525,  2789,   770,     9,
            32,  2355, 19163,    21, 29743,    23,    29,    54, 10045,    18,
           520,    19,    55,  6407,    68,    33, 10547,    21,    17, 26410,
            33,    17,  2853,  2378,    23,    20,  7307,    19,   115,  2789,
             9,  1551,  8703,  5903,    54,    72,  9446, 17345,    66,    31,
           107,    19,    21, 28037,  6915,    17,  

In [30]:
internal_out = sample_model.transformer_model(sample_input_ids, attention_mask=sample_attention_mask)

In [31]:
internal_out.keys()

odict_keys(['logits', 'mems', 'hidden_states'])

In [32]:
len(internal_out.hidden_states), internal_out.hidden_states[-1].shape

(25, torch.Size([2, 248, 1024]))

In [33]:
sample_res = sample_model(sample_input_ids, sample_attention_mask)

In [34]:
sample_res[0].shape, sample_res[1].shape

(torch.Size([2, 1]), torch.Size([2, 1024]))

In [35]:
torch.sum(torch.randn([8, 496, 768]), axis=1)

tensor([[-33.6125,  20.8116, -37.3507,  ...,  45.8287,   8.7434,  15.5695],
        [-10.1358,  -6.9110,   9.7339,  ...,  14.0870, -27.7199,  16.4197],
        [ 29.9244, -13.2548,   9.5534,  ...,  -1.6752,  41.7243,   2.7099],
        ...,
        [ -7.0634,  37.7444,  -5.1741,  ..., -28.5020, -33.9613,  13.2693],
        [ -5.2980,   9.7112, -13.2849,  ...,  -2.8648,  -6.6645,  19.3413],
        [ 22.5773,  42.2794,   5.8758,  ...,  22.9042,   1.5186,  31.2972]])

### Evaluation and Prediction

In [36]:
def eval_mse(model, data_loader):
    model.eval()
    mse_sum = 0
    mse_loss = nn.MSELoss(reduction='sum')
    
    with torch.no_grad():
        for batch_num, record in enumerate(data_loader):
            input_ids, attention_mask, target = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE), record['target'].to(cfg.DEVICE)
            pred, _ = model(input_ids, attention_mask)
            mse_sum += mse_loss(pred.flatten().cpu(), target.cpu())
            
    return mse_sum / len(data_loader.dataset)

In [37]:
def predict(model, data_loader):
    model.eval()
    result = []
    
    with torch.no_grad():
        for batch_num, record in tqdm(enumerate(data_loader), total=len(data_loader)):
            input_ids, attention_mask = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE)
            pred, _ = model(input_ids, attention_mask)
            result.extend(pred.flatten().to("cpu").tolist())
            
    return np.array(result)

In [38]:
sample_dl = DataLoader(sample_ds, shuffle=False, batch_size=16, num_workers=1)

### Optimizer and Sampler

In [39]:
def create_optimizer(model, base_lr=5e-5, last_lr=None):
    named_parameters = list(model.named_parameters())
    attention_param_start = 414
    regressor_param_start = 418
    roberta_parameters = named_parameters[:attention_param_start]
    attention_parameters = named_parameters[attention_param_start:regressor_param_start]
    regressor_parameters = named_parameters[regressor_param_start:]
    
    attention_group = [params for (name, params) in attention_parameters]
    regressor_group = [params for (name, params) in regressor_parameters]
    
    parameters = []
    if last_lr is not None:
        parameters.append({"params": attention_group, "lr": last_lr})
        parameters.append({"params": regressor_group, "lr": last_lr})
    else:
        parameters.append({"params": attention_group})
        parameters.append({"params": regressor_group})
        
    # Change on different models
    layer_low_threshold = 189
    layer_middle_threshold = 325
    
    for layer_num, (name, params) in enumerate(roberta_parameters):
        weight_decay = 0.0 if 'bias' in name else 0.01
        
        lr = base_lr / 2.5 # 2e-05
        if layer_num >= layer_middle_threshold:
            lr = base_lr / 0.5 # 1e-4
        elif layer_num >= layer_low_threshold:        
            lr = base_lr    
            
        parameters.append({"params": params,
                           "weight_decay": weight_decay,
                           "lr": lr})
        
    return AdamW(parameters)

In [40]:
sample_optimizer = create_optimizer(sample_model)

In [41]:
from torch.utils.data import Sampler,SequentialSampler,RandomSampler,SubsetRandomSampler
from collections import Counter

class WeightedSampler(Sampler):
    
    def __init__(self, dataset):
        
        self.indices = list(range(len(dataset)))
        self.num_samples = len(dataset)
        self.label_to_count = dict(Counter(dataset.bins))
        weights = [1/self.label_to_count[i] for i in dataset.bins]
        
        self.weights = torch.tensor(weights,dtype=torch.double)
        
    def __iter__(self):
        count = 0
        index = [self.indices[i] for i in torch.multinomial(self.weights, self.num_samples, replacement=True)]
        while count < self.num_samples:
            yield index[count]
            count += 1
    
    def __len__(self):
        return self.num_samples

### Training

In [42]:
def choose_eval_period(val_rmse):
    for rmse, period in cfg.EVAL_SCHEDULE:
        if val_rmse >= rmse:
            return period

In [43]:
def serialize_best(best_val_rmse, best_epoch, val_rmse, epoch, model, model_path):
    if not best_val_rmse or val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_epoch = epoch
        if not model_path.parent.exists():
            os.makedirs(model_path.parent)
        
#         torch.save(model.state_dict(), model_path)
        print(f"New best_val_rmse: {best_val_rmse:0.4}")
    else:       
        print(f"Still best_val_rmse: {best_val_rmse:0.4}",
              f"(from epoch {best_epoch})")
    return best_epoch, best_val_rmse

In [44]:
class Trainer():
    def __init__(self, scaler, model, model_path, train_loader, val_loader, optimizer, scheduler=None, num_epochs=cfg.NUM_EPOCHS):
        self.scaler, self.model, self.model_path, self.train_loader, self.val_loader, self.optimizer, self.scheduler, self.num_epochs = (
            scaler, model, model_path, train_loader, val_loader, optimizer, scheduler, num_epochs
        )
            
    def train(self):
        self.model.train()
        
        mse_loss = nn.MSELoss(reduction='mean')
        
        best_val_rmse = None
        best_epoch = 0
        step = 0
        last_eval_step = 0
        eval_period = cfg.EVAL_SCHEDULE[0][1]    

        start = time.time()
        val_rmse_list = []
        
        tbar = tqdm(range(self.num_epochs), total=self.num_epochs)
        for epoch in tbar:
            tbar.set_description(f'Epoch: {epoch}')
            val_rmse = None
            for batch_num, record in enumerate(self.train_loader):
                input_ids, attention_mask, target = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE), record['target'].to(cfg.DEVICE)
                
                self.optimizer.zero_grad()
                
                # Casts operations to mixed precision
                with torch.cuda.amp.autocast():
                    pred, _ = self.model(input_ids, attention_mask)
                    mse = mse_loss(pred.flatten(), target)
                    
                self.scaler.scale(mse).backward()
                self.scaler.step(self.optimizer)
                self.scaler.update()
                
#                 mse.backward()
#                 self.optimizer.step()
                
                if self.scheduler:
                    self.scheduler.step()
                    
                if step >= last_eval_step + eval_period:
                    elapsed_seconds = time.time() - start
                    num_steps = step - last_eval_step
                    print(f"\n{num_steps} steps took {elapsed_seconds:0.3} seconds")
                    last_eval_step = step
                    
                    val_rmse = np.sqrt(eval_mse(self.model, self.val_loader))
                    print(f"Epoch: {epoch} batch_num: {batch_num}", f"val_rmse: {val_rmse:0.4} ", end='')
                    
                    eval_period = choose_eval_period(val_rmse)
                    best_epoch, best_val_rmse = serialize_best(best_val_rmse, best_epoch, val_rmse, epoch, self.model, self.model_path)
                    val_rmse_list.append(val_rmse)
                    start = time.time()
                # Finish early on condition
                if epoch > 0 and best_val_rmse > 0.6 or (len(val_rmse_list) > 5 and np.array(val_rmse_list).mean() > 1.0):
                    return best_val_rmse
                
                step += 1
        return best_val_rmse

In [45]:
kfold = KFold(n_splits=cfg.NUM_FOLDS, random_state=cfg.SEED, shuffle=True)
splits = list(kfold.split(train_df))

### Optuna

In [46]:
del sample_model
gc.collect()
torch.cuda.empty_cache()

In [47]:
# Best results
# Fold 0: { 'base_lr': 0.0001190683694379101, 'last_lr': 0.00017987585986205585, 'epochs': 4 } Best value: 0.49271923303604126
# Fold 1: {'base_lr': 0.00012114635348406963, 'last_lr': 0.0005477206613438486, 'epochs': 4}. Best value:  0.45853328704833984
# Fold 2: {'base_lr': 5.24730490640746e-05, 'last_lr': 0.00020041362261812433, 'epochs': 4}   Best value:  0.49088865518569946
# Fold 3: {'base_lr': 6.108276630664184e-05, 'last_lr': 0.00011544056953737668, 'epochs': 4}. Best value:  0.4930591881275177
# Fold 4: {'base_lr': 0.0001717178883932075, 'last_lr': 0.00042448836147656634, 'epochs': 4}  Best value:  0.48955243825912476
# Fold 5: {'base_lr': 0.000135700916847811, 'last_lr': 0.0029640935672153, 'epochs': 4}.      Best value:  0.4688156247138977

In [52]:

fold = 0

def objective(trial):
    base_lr = trial.suggest_float("base_lr", 3e-5, 5e-4, log=True)
    last_lr = trial.suggest_float("last_lr", 8e-5, 5e-3, log=True)
    epochs = 4
    schedule_func = trial.suggest_categorical('schedule_func', [get_cosine_with_hard_restarts_schedule_with_warmup, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup])
    
    print(f'##### Using fold {fold}')
    print(f'##### Using base_lr {base_lr} last_lr {last_lr} epochs {epochs}')
    print(f'##### Using {schedule_func}')
    
    model_path = cfg.MODEL_FOLDER/f"{cfg.model_name.replace('/', '_')}_{fold + 1}/model_{fold + 1}.pth"
    
    set_random_seed(cfg.SEED + fold)
    
    tokenizer = AutoTokenizer.from_pretrained(cfg.TOKENIZER_PATH)
    
    train_indices, val_indices = splits[fold]
    train_dataset = CommonLitDataset(train_df.loc[train_indices], tokenizer)    
    val_dataset = CommonLitDataset(train_df.loc[val_indices], tokenizer)
    
    train_loader = DataLoader(train_dataset, batch_size=cfg.BATCH_SIZE,
                              drop_last=False, shuffle=True, num_workers=cfg.NUM_WORKERS)    
    val_loader = DataLoader(val_dataset, batch_size=cfg.BATCH_SIZE,
                            drop_last=False, shuffle=False, num_workers=cfg.NUM_WORKERS)
    
    model = CommonLitModel().to(cfg.DEVICE)
    
    optimizer = create_optimizer(model, base_lr=base_lr, last_lr=last_lr)
    
    scheduler = schedule_func(optimizer,
                                                num_training_steps=cfg.NUM_EPOCHS * len(train_loader), 
                                                num_warmup_steps=50)
    scaler = torch.cuda.amp.GradScaler() # fp16
    
    trainer = Trainer(scaler, model, model_path, train_loader, val_loader, optimizer, 
                      scheduler = scheduler, num_epochs = epochs)
    rmse_val = trainer.train()
    
    del trainer
    del model
    del tokenizer
    del scaler
    del optimizer
    del train_loader
    del val_loader
    torch.cuda.empty_cache()
    gc.collect()
    
    return rmse_val

In [None]:
for i in range(0, 3):
    fold = i
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20)
    print(" Best value: ", study.best_trial.value)
    print(" Best params: ")
    for key, value in study.best_trial.params.items():
        print(f"    {key}: {value}")

[32m[I 2021-07-27 17:09:27,284][0m A new study created in memory with name: no-name-f28c07a0-9004-4214-bd53-abc0ddfaed43[0m


##### Using fold 0
##### Using base_lr 0.00026072981385982097 last_lr 0.0025988624450275374 epochs 4
##### Using <function get_linear_schedule_with_warmup at 0x7fe1f186b280>


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.bias', 'logits_proj.weight', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.52 seconds
Epoch: 0 batch_num: 16 val_rmse: 2.671 New best_val_rmse: 2.671

16 steps took 6.47 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.044 New best_val_rmse: 1.044

16 steps took 6.5 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.084 Still best_val_rmse: 1.044 (from epoch 0)

16 steps took 6.48 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.206 Still best_val_rmse: 1.044 (from epoch 0)

16 steps took 6.47 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.33 Still best_val_rmse: 1.044 (from epoch 0)

16 steps took 6.48 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.112 Still best_val_rmse: 1.044 (from epoch 0)


[32m[I 2021-07-27 17:11:01,214][0m Trial 0 finished with value: 1.0441198348999023 and parameters: {'base_lr': 0.00026072981385982097, 'last_lr': 0.0025988624450275374, 'schedule_func': <function get_linear_schedule_with_warmup at 0x7fe1f186b280>}. Best is trial 0 with value: 1.0441198348999023.[0m



##### Using fold 0
##### Using base_lr 0.00013328109624069146 last_lr 0.0013247307057016205 epochs 4
##### Using <function get_cosine_schedule_with_warmup at 0x7fe1f186b310>


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.bias', 'logits_proj.weight', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.67 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.373 New best_val_rmse: 1.373

16 steps took 6.53 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8198 New best_val_rmse: 0.8198

16 steps took 6.54 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7715 New best_val_rmse: 0.7715

16 steps took 6.55 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.621 New best_val_rmse: 0.621

16 steps took 6.55 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5937 New best_val_rmse: 0.5937

16 steps took 6.54 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6484 Still best_val_rmse: 0.5937 (from epoch 0)

16 steps took 6.55 seconds
Epoch: 0 batch_num: 112 val_rmse: 1.11 Still best_val_rmse: 0.5937 (from epoch 0)

16 steps took 6.56 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7706 Still best_val_rmse: 0.5937 (from epoch 0)

16 steps took 6.54 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5872 New best_val_rmse: 0.5872

16 steps took 7.33 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.7151 Still best_val_rmse: 0.5872 (from epoc

[32m[I 2021-07-27 17:19:31,200][0m Trial 1 finished with value: 0.5012840628623962 and parameters: {'base_lr': 0.00013328109624069146, 'last_lr': 0.0013247307057016205, 'schedule_func': <function get_cosine_schedule_with_warmup at 0x7fe1f186b310>}. Best is trial 1 with value: 0.5012840628623962.[0m



##### Using fold 0
##### Using base_lr 0.0001515095861920755 last_lr 0.001985500783768097 epochs 4
##### Using <function get_cosine_with_hard_restarts_schedule_with_warmup at 0x7fe1f186b3a0>


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.bias', 'logits_proj.weight', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.59 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.387 New best_val_rmse: 1.387

16 steps took 6.55 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.19 New best_val_rmse: 1.19

16 steps took 6.54 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.736 New best_val_rmse: 0.736

16 steps took 6.49 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.157 Still best_val_rmse: 0.736 (from epoch 0)

16 steps took 6.5 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.224 Still best_val_rmse: 0.736 (from epoch 0)

16 steps took 6.5 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.1 Still best_val_rmse: 0.736 (from epoch 0)


[32m[I 2021-07-27 17:21:05,866][0m Trial 2 finished with value: 0.7360088229179382 and parameters: {'base_lr': 0.0001515095861920755, 'last_lr': 0.001985500783768097, 'schedule_func': <function get_cosine_with_hard_restarts_schedule_with_warmup at 0x7fe1f186b3a0>}. Best is trial 1 with value: 0.5012840628623962.[0m



##### Using fold 0
##### Using base_lr 5.0146723276165316e-05 last_lr 0.0010658486417691217 epochs 4
##### Using <function get_cosine_schedule_with_warmup at 0x7fe1f186b310>


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.bias', 'logits_proj.weight', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.72 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.653 New best_val_rmse: 1.653

16 steps took 6.6 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9268 New best_val_rmse: 0.9268

16 steps took 6.55 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7334 New best_val_rmse: 0.7334

16 steps took 6.53 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6508 New best_val_rmse: 0.6508

16 steps took 6.6 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.9067 Still best_val_rmse: 0.6508 (from epoch 0)

16 steps took 6.59 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6018 New best_val_rmse: 0.6018

16 steps took 6.6 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5902 New best_val_rmse: 0.5902

16 steps took 6.56 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6414 Still best_val_rmse: 0.5902 (from epoch 0)

16 steps took 6.56 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6285 Still best_val_rmse: 0.5902 (from epoch 0)

16 steps took 7.35 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5575 New best_val_rmse: 0.5575

16 steps t

[32m[I 2021-07-27 17:29:37,669][0m Trial 3 finished with value: 0.5305038094520569 and parameters: {'base_lr': 5.0146723276165316e-05, 'last_lr': 0.0010658486417691217, 'schedule_func': <function get_cosine_schedule_with_warmup at 0x7fe1f186b310>}. Best is trial 1 with value: 0.5012840628623962.[0m



##### Using fold 0
##### Using base_lr 0.0003178309138188082 last_lr 0.0005171568446463313 epochs 4
##### Using <function get_cosine_schedule_with_warmup at 0x7fe1f186b310>


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.bias', 'logits_proj.weight', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.59 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.328 New best_val_rmse: 1.328

16 steps took 6.54 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.557 Still best_val_rmse: 1.328 (from epoch 0)

16 steps took 6.58 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.178 New best_val_rmse: 1.178

16 steps took 6.54 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.061 New best_val_rmse: 1.061

16 steps took 6.53 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.313 Still best_val_rmse: 1.061 (from epoch 0)

16 steps took 6.55 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.095 Still best_val_rmse: 1.061 (from epoch 0)


[32m[I 2021-07-27 17:31:12,114][0m Trial 4 finished with value: 1.0610512495040894 and parameters: {'base_lr': 0.0003178309138188082, 'last_lr': 0.0005171568446463313, 'schedule_func': <function get_cosine_schedule_with_warmup at 0x7fe1f186b310>}. Best is trial 1 with value: 0.5012840628623962.[0m



##### Using fold 0
##### Using base_lr 0.00021275634427171412 last_lr 0.00021952950481681497 epochs 4
##### Using <function get_linear_schedule_with_warmup at 0x7fe1f186b280>


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.bias', 'logits_proj.weight', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.67 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.63 New best_val_rmse: 1.63

16 steps took 6.59 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.296 New best_val_rmse: 1.296

16 steps took 6.57 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7887 New best_val_rmse: 0.7887

16 steps took 6.54 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8423 Still best_val_rmse: 0.7887 (from epoch 0)

16 steps took 6.54 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6209 New best_val_rmse: 0.6209

16 steps took 6.6 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6666 Still best_val_rmse: 0.6209 (from epoch 0)

16 steps took 6.55 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6751 Still best_val_rmse: 0.6209 (from epoch 0)

16 steps took 6.55 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.615 New best_val_rmse: 0.615

16 steps took 6.64 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.614 New best_val_rmse: 0.614


[32m[I 2021-07-27 17:33:30,194][0m Trial 5 finished with value: 0.6140443086624146 and parameters: {'base_lr': 0.00021275634427171412, 'last_lr': 0.00021952950481681497, 'schedule_func': <function get_linear_schedule_with_warmup at 0x7fe1f186b280>}. Best is trial 1 with value: 0.5012840628623962.[0m



##### Using fold 0
##### Using base_lr 0.000208932441184545 last_lr 0.001504442525654463 epochs 4
##### Using <function get_cosine_with_hard_restarts_schedule_with_warmup at 0x7fe1f186b3a0>


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.bias', 'logits_proj.weight', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.7 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.947 New best_val_rmse: 1.947

16 steps took 6.55 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.106 New best_val_rmse: 1.106

16 steps took 6.55 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.158 Still best_val_rmse: 1.106 (from epoch 0)

16 steps took 6.54 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.14 Still best_val_rmse: 1.106 (from epoch 0)

16 steps took 6.51 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.393 Still best_val_rmse: 1.106 (from epoch 0)

16 steps took 6.51 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.108 Still best_val_rmse: 1.106 (from epoch 0)



[32m[I 2021-07-27 17:35:04,622][0m Trial 6 finished with value: 1.1064022779464722 and parameters: {'base_lr': 0.000208932441184545, 'last_lr': 0.001504442525654463, 'schedule_func': <function get_cosine_with_hard_restarts_schedule_with_warmup at 0x7fe1f186b3a0>}. Best is trial 1 with value: 0.5012840628623962.[0m


##### Using fold 0
##### Using base_lr 3.097867799812087e-05 last_lr 0.00013146474035657642 epochs 4
##### Using <function get_cosine_schedule_with_warmup at 0x7fe1f186b310>


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.bias', 'logits_proj.weight', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.54 seconds
Epoch: 0 batch_num: 16 val_rmse: 2.011 New best_val_rmse: 2.011

16 steps took 6.57 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.112 New best_val_rmse: 1.112

16 steps took 6.69 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7378 New best_val_rmse: 0.7378

16 steps took 6.56 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6768 New best_val_rmse: 0.6768

16 steps took 6.55 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.8331 Still best_val_rmse: 0.6768 (from epoch 0)

16 steps took 6.55 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6172 New best_val_rmse: 0.6172

16 steps took 6.56 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6264 Still best_val_rmse: 0.6172 (from epoch 0)

16 steps took 6.58 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5771 New best_val_rmse: 0.5771

16 steps took 6.58 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5648 New best_val_rmse: 0.5648

16 steps took 7.28 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5516 New best_val_rmse: 0.5516

16 steps took 6.57 seconds

In [55]:
%%time

for i in range(3, len(list(splits))):
    fold = i
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20)
    print(" Best value: ", study.best_trial.value)
    print(" Best params: ")
    for key, value in study.best_trial.params.items():
        print(f"    {key}: {value}")

[32m[I 2021-07-22 14:24:27,045][0m A new study created in memory with name: no-name-677fc894-e6f1-489f-bba5-60271810ee28[0m


##### Using fold 3
##### Using base_lr 0.00013674280465279492 last_lr 0.0007463327517034599 epochs 5


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 7.41 seconds
Epoch: 0 batch_num: 16 val_rmse: 2.2 New best_val_rmse: 2.2

16 steps took 6.48 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8643 New best_val_rmse: 0.8643

16 steps took 6.51 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6854 New best_val_rmse: 0.6854

16 steps took 6.53 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8669 Still best_val_rmse: 0.6854 (from epoch 0)

16 steps took 6.54 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7064 Still best_val_rmse: 0.6854 (from epoch 0)

16 steps took 6.57 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6116 New best_val_rmse: 0.6116

16 steps took 6.68 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7069 Still best_val_rmse: 0.6116 (from epoch 0)

16 steps took 6.57 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6129 Still best_val_rmse: 0.6116 (from epoch 0)

16 steps took 6.58 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.82 Still best_val_rmse: 0.6116 (from epoch 0)


[32m[I 2021-07-22 14:26:43,001][0m Trial 0 finished with value: 0.6116013526916504 and parameters: {'base_lr': 0.00013674280465279492, 'last_lr': 0.0007463327517034599, 'epochs': 5}. Best is trial 0 with value: 0.6116013526916504.[0m



##### Using fold 3
##### Using base_lr 0.00020254393403725305 last_lr 0.0006604802269671327 epochs 3


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 7.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 2.518 New best_val_rmse: 2.518

16 steps took 6.6 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9558 New best_val_rmse: 0.9558

16 steps took 6.58 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7423 New best_val_rmse: 0.7423

16 steps took 6.58 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.728 New best_val_rmse: 0.728

16 steps took 6.57 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7793 Still best_val_rmse: 0.728 (from epoch 0)

16 steps took 6.58 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6146 New best_val_rmse: 0.6146


[32m[I 2021-07-22 14:28:16,635][0m Trial 1 finished with value: 0.6145561337471008 and parameters: {'base_lr': 0.00020254393403725305, 'last_lr': 0.0006604802269671327, 'epochs': 3}. Best is trial 0 with value: 0.6116013526916504.[0m



##### Using fold 3
##### Using base_lr 0.00026969819179850266 last_lr 0.0012131152696478107 epochs 3


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 7.41 seconds
Epoch: 0 batch_num: 16 val_rmse: 2.31 New best_val_rmse: 2.31

16 steps took 6.55 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.284 New best_val_rmse: 1.284

16 steps took 6.59 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.207 New best_val_rmse: 1.207

16 steps took 6.56 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.097 New best_val_rmse: 1.097

16 steps took 6.6 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.031 New best_val_rmse: 1.031

16 steps took 6.53 seconds


[32m[I 2021-07-22 14:29:50,284][0m Trial 2 finished with value: 1.0311695337295532 and parameters: {'base_lr': 0.00026969819179850266, 'last_lr': 0.0012131152696478107, 'epochs': 3}. Best is trial 0 with value: 0.6116013526916504.[0m


Epoch: 0 batch_num: 96 val_rmse: 3.536 Still best_val_rmse: 1.031 (from epoch 0)

##### Using fold 3
##### Using base_lr 4.244562638646704e-05 last_lr 0.00036484650511324436 epochs 4


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.58 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.567 New best_val_rmse: 1.567

16 steps took 6.59 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7649 New best_val_rmse: 0.7649

16 steps took 6.58 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6699 New best_val_rmse: 0.6699

16 steps took 6.57 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7011 Still best_val_rmse: 0.6699 (from epoch 0)

16 steps took 6.6 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6042 New best_val_rmse: 0.6042

16 steps took 6.62 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5994 New best_val_rmse: 0.5994

16 steps took 6.59 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6721 Still best_val_rmse: 0.5994 (from epoch 0)

16 steps took 6.63 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5806 New best_val_rmse: 0.5806

16 steps took 6.62 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6881 Still best_val_rmse: 0.5806 (from epoch 0)

16 steps took 7.18 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5791 New best_val_rmse: 0.5791

16 steps

[32m[I 2021-07-22 14:38:17,159][0m Trial 3 finished with value: 0.524225115776062 and parameters: {'base_lr': 4.244562638646704e-05, 'last_lr': 0.00036484650511324436, 'epochs': 4}. Best is trial 3 with value: 0.524225115776062.[0m



##### Using fold 3
##### Using base_lr 0.0002437699640745575 last_lr 0.0004261877423621473 epochs 4


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.52 seconds
Epoch: 0 batch_num: 16 val_rmse: 3.337 New best_val_rmse: 3.337

16 steps took 6.54 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.048 New best_val_rmse: 1.048

16 steps took 6.54 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.07 Still best_val_rmse: 1.048 (from epoch 0)

16 steps took 6.55 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.178 Still best_val_rmse: 1.048 (from epoch 0)

16 steps took 6.53 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.044 New best_val_rmse: 1.044

16 steps took 6.57 seconds


[32m[I 2021-07-22 14:39:50,547][0m Trial 4 finished with value: 1.0441583395004272 and parameters: {'base_lr': 0.0002437699640745575, 'last_lr': 0.0004261877423621473, 'epochs': 4}. Best is trial 3 with value: 0.524225115776062.[0m


Epoch: 0 batch_num: 96 val_rmse: 1.101 Still best_val_rmse: 1.044 (from epoch 0)

##### Using fold 3
##### Using base_lr 0.00012866543806904285 last_lr 0.00294093536575603 epochs 5


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 7.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.881 New best_val_rmse: 1.881

16 steps took 6.57 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8812 New best_val_rmse: 0.8812

16 steps took 6.57 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7477 New best_val_rmse: 0.7477

16 steps took 6.56 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8286 Still best_val_rmse: 0.7477 (from epoch 0)

16 steps took 6.58 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6454 New best_val_rmse: 0.6454

16 steps took 6.61 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6356 New best_val_rmse: 0.6356

16 steps took 6.59 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7481 Still best_val_rmse: 0.6356 (from epoch 0)

16 steps took 6.59 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6025 New best_val_rmse: 0.6025

16 steps took 6.65 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.8019 Still best_val_rmse: 0.6025 (from epoch 0)


[32m[I 2021-07-22 14:42:06,608][0m Trial 5 finished with value: 0.6025357246398926 and parameters: {'base_lr': 0.00012866543806904285, 'last_lr': 0.00294093536575603, 'epochs': 5}. Best is trial 3 with value: 0.524225115776062.[0m



##### Using fold 3
##### Using base_lr 9.78622966176048e-05 last_lr 0.000594007307912072 epochs 3


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 7.45 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.297 New best_val_rmse: 1.297

16 steps took 6.56 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9661 New best_val_rmse: 0.9661

16 steps took 6.58 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6881 New best_val_rmse: 0.6881

16 steps took 6.58 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6476 New best_val_rmse: 0.6476

16 steps took 6.58 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7009 Still best_val_rmse: 0.6476 (from epoch 0)

16 steps took 6.58 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6777 Still best_val_rmse: 0.6476 (from epoch 0)

16 steps took 6.6 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.8833 Still best_val_rmse: 0.6476 (from epoch 0)

16 steps took 6.6 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7291 Still best_val_rmse: 0.6476 (from epoch 0)

16 steps took 6.6 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.7085 Still best_val_rmse: 0.6476 (from epoch 0)


[32m[I 2021-07-22 14:44:23,566][0m Trial 6 finished with value: 0.6475713849067688 and parameters: {'base_lr': 9.78622966176048e-05, 'last_lr': 0.000594007307912072, 'epochs': 3}. Best is trial 3 with value: 0.524225115776062.[0m



##### Using fold 3
##### Using base_lr 0.0003602540833722712 last_lr 0.0018127809391390109 epochs 4


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.43 seconds
Epoch: 0 batch_num: 16 val_rmse: 2.165 New best_val_rmse: 2.165

16 steps took 6.56 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.181 New best_val_rmse: 1.181

16 steps took 6.57 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.176 New best_val_rmse: 1.176

16 steps took 6.55 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.548 Still best_val_rmse: 1.176 (from epoch 0)

16 steps took 6.57 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.042 New best_val_rmse: 1.042

16 steps took 6.57 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.07 Still best_val_rmse: 1.042 (from epoch 0)


[32m[I 2021-07-22 14:45:56,899][0m Trial 7 finished with value: 1.0422102212905884 and parameters: {'base_lr': 0.0003602540833722712, 'last_lr': 0.0018127809391390109, 'epochs': 4}. Best is trial 3 with value: 0.524225115776062.[0m



##### Using fold 3
##### Using base_lr 4.6415807784583515e-05 last_lr 0.000316814913030945 epochs 5


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 7.48 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.433 New best_val_rmse: 1.433

16 steps took 6.57 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8469 New best_val_rmse: 0.8469

16 steps took 6.57 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6655 New best_val_rmse: 0.6655

16 steps took 6.57 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6905 Still best_val_rmse: 0.6655 (from epoch 0)

16 steps took 6.58 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6058 New best_val_rmse: 0.6058

16 steps took 6.61 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.626 Still best_val_rmse: 0.6058 (from epoch 0)

16 steps took 6.65 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7209 Still best_val_rmse: 0.6058 (from epoch 0)

16 steps took 6.59 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5918 New best_val_rmse: 0.5918

16 steps took 6.59 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6589 Still best_val_rmse: 0.5918 (from epoch 0)

16 steps took 7.14 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5979 Still best_val_rms

[32m[I 2021-07-22 14:56:35,514][0m Trial 8 finished with value: 0.516015350818634 and parameters: {'base_lr': 4.6415807784583515e-05, 'last_lr': 0.000316814913030945, 'epochs': 5}. Best is trial 8 with value: 0.516015350818634.[0m



##### Using fold 3
##### Using base_lr 0.0001790980718155451 last_lr 9.931083236563293e-05 epochs 4


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.47 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.762 New best_val_rmse: 1.762

16 steps took 6.56 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.101 New best_val_rmse: 1.101

16 steps took 6.57 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.059 New best_val_rmse: 1.059

16 steps took 6.58 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7186 New best_val_rmse: 0.7186

16 steps took 6.57 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.666 New best_val_rmse: 0.666

16 steps took 6.6 seconds


[32m[I 2021-07-22 14:58:11,958][0m Trial 9 finished with value: 0.6659549474716187 and parameters: {'base_lr': 0.0001790980718155451, 'last_lr': 9.931083236563293e-05, 'epochs': 4}. Best is trial 8 with value: 0.516015350818634.[0m


Epoch: 0 batch_num: 96 val_rmse: 0.7002 Still best_val_rmse: 0.666 (from epoch 0)

##### Using fold 3
##### Using base_lr 3.102510831375899e-05 last_lr 0.00014231526067270248 epochs 5


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 7.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.836 New best_val_rmse: 1.836

16 steps took 6.63 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9155 New best_val_rmse: 0.9155

16 steps took 6.57 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8068 New best_val_rmse: 0.8068

16 steps took 6.58 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6328 New best_val_rmse: 0.6328

16 steps took 6.57 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7069 Still best_val_rmse: 0.6328 (from epoch 0)

16 steps took 6.6 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6698 Still best_val_rmse: 0.6328 (from epoch 0)

16 steps took 6.6 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.636 Still best_val_rmse: 0.6328 (from epoch 0)

16 steps took 6.61 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6255 New best_val_rmse: 0.6255

16 steps took 6.6 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6862 Still best_val_rmse: 0.6255 (from epoch 0)


[32m[I 2021-07-22 15:00:28,294][0m Trial 10 finished with value: 0.6255457401275635 and parameters: {'base_lr': 3.102510831375899e-05, 'last_lr': 0.00014231526067270248, 'epochs': 5}. Best is trial 8 with value: 0.516015350818634.[0m



##### Using fold 3
##### Using base_lr 4.052642811586042e-05 last_lr 0.00022293895409840363 epochs 5


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 7.44 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.682 New best_val_rmse: 1.682

16 steps took 6.62 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7747 New best_val_rmse: 0.7747

16 steps took 6.57 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7187 New best_val_rmse: 0.7187

16 steps took 6.56 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6551 New best_val_rmse: 0.6551

16 steps took 6.58 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6904 Still best_val_rmse: 0.6551 (from epoch 0)

16 steps took 6.6 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7293 Still best_val_rmse: 0.6551 (from epoch 0)

16 steps took 6.57 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5945 New best_val_rmse: 0.5945

16 steps took 6.59 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5782 New best_val_rmse: 0.5782

16 steps took 6.62 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.7097 Still best_val_rmse: 0.5782 (from epoch 0)

16 steps took 7.22 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5791 Still best_val_rmse: 0.5782 (from e

[32m[I 2021-07-22 15:11:05,558][0m Trial 11 finished with value: 0.5196667313575745 and parameters: {'base_lr': 4.052642811586042e-05, 'last_lr': 0.00022293895409840363, 'epochs': 5}. Best is trial 8 with value: 0.516015350818634.[0m



##### Using fold 3
##### Using base_lr 6.068588484770048e-05 last_lr 0.00021184087471614598 epochs 5


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 7.49 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.04 New best_val_rmse: 1.04

16 steps took 6.55 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6841 New best_val_rmse: 0.6841

16 steps took 6.56 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7603 Still best_val_rmse: 0.6841 (from epoch 0)

16 steps took 6.55 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8245 Still best_val_rmse: 0.6841 (from epoch 0)

16 steps took 6.56 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5825 New best_val_rmse: 0.5825

16 steps took 6.58 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5428 New best_val_rmse: 0.5428

16 steps took 6.6 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5655 Still best_val_rmse: 0.5428 (from epoch 0)

16 steps took 6.6 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5388 New best_val_rmse: 0.5388

16 steps took 6.61 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5205 New best_val_rmse: 0.5205

16 steps took 7.14 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.6897 Still best_val_rmse: 0.5205 (from epoc

[32m[I 2021-07-22 15:22:51,672][0m Trial 12 finished with value: 0.49611586332321167 and parameters: {'base_lr': 6.068588484770048e-05, 'last_lr': 0.00021184087471614598, 'epochs': 5}. Best is trial 12 with value: 0.49611586332321167.[0m



##### Using fold 3
##### Using base_lr 7.101317396072361e-05 last_lr 0.0002051946814279808 epochs 5


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 7.57 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.146 New best_val_rmse: 1.146

16 steps took 6.6 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8374 New best_val_rmse: 0.8374

16 steps took 6.57 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7145 New best_val_rmse: 0.7145

16 steps took 6.59 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7251 Still best_val_rmse: 0.7145 (from epoch 0)

16 steps took 6.56 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6168 New best_val_rmse: 0.6168

16 steps took 6.59 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6866 Still best_val_rmse: 0.6168 (from epoch 0)

16 steps took 6.59 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6057 New best_val_rmse: 0.6057

16 steps took 6.59 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5453 New best_val_rmse: 0.5453

16 steps took 6.6 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5674 Still best_val_rmse: 0.5453 (from epoch 0)

16 steps took 7.2 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.589 Still best_val_rmse: 0.5453 (from epoc

[32m[I 2021-07-22 15:35:05,685][0m Trial 13 finished with value: 0.49442538619041443 and parameters: {'base_lr': 7.101317396072361e-05, 'last_lr': 0.0002051946814279808, 'epochs': 5}. Best is trial 13 with value: 0.49442538619041443.[0m



##### Using fold 3
##### Using base_lr 7.958738601878582e-05 last_lr 8.250917348407703e-05 epochs 5


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 7.47 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.271 New best_val_rmse: 1.271

16 steps took 6.56 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7439 New best_val_rmse: 0.7439

16 steps took 6.56 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7365 New best_val_rmse: 0.7365

16 steps took 6.56 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8225 Still best_val_rmse: 0.7365 (from epoch 0)

16 steps took 6.6 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6078 New best_val_rmse: 0.6078

16 steps took 6.6 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6447 Still best_val_rmse: 0.6078 (from epoch 0)

16 steps took 6.61 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5607 New best_val_rmse: 0.5607

16 steps took 6.6 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5597 New best_val_rmse: 0.5597

16 steps took 6.67 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5752 Still best_val_rmse: 0.5597 (from epoch 0)

16 steps took 7.26 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5607 Still best_val_rmse: 0.5597 (from epo

[32m[I 2021-07-22 15:47:08,171][0m Trial 14 finished with value: 0.496579110622406 and parameters: {'base_lr': 7.958738601878582e-05, 'last_lr': 8.250917348407703e-05, 'epochs': 5}. Best is trial 13 with value: 0.49442538619041443.[0m



##### Using fold 3
##### Using base_lr 7.371486452999384e-05 last_lr 0.00017425449514158598 epochs 5


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 7.45 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.189 New best_val_rmse: 1.189

16 steps took 6.54 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8729 New best_val_rmse: 0.8729

16 steps took 6.54 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6903 New best_val_rmse: 0.6903

16 steps took 6.55 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7388 Still best_val_rmse: 0.6903 (from epoch 0)

16 steps took 6.56 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5962 New best_val_rmse: 0.5962

16 steps took 6.57 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6426 Still best_val_rmse: 0.5962 (from epoch 0)

16 steps took 6.61 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6349 Still best_val_rmse: 0.5962 (from epoch 0)

16 steps took 6.62 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6007 Still best_val_rmse: 0.5962 (from epoch 0)

16 steps took 6.61 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5541 New best_val_rmse: 0.5541

16 steps took 7.19 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.6976 Still best_val_rm

[32m[I 2021-07-22 15:58:01,709][0m Trial 15 finished with value: 0.534666895866394 and parameters: {'base_lr': 7.371486452999384e-05, 'last_lr': 0.00017425449514158598, 'epochs': 5}. Best is trial 13 with value: 0.49442538619041443.[0m



##### Using fold 3
##### Using base_lr 6.108276630664184e-05 last_lr 0.00011544056953737668 epochs 4


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.55 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.048 New best_val_rmse: 1.048

16 steps took 6.56 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7279 New best_val_rmse: 0.7279

16 steps took 6.56 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7155 New best_val_rmse: 0.7155

16 steps took 6.56 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7595 Still best_val_rmse: 0.7155 (from epoch 0)

16 steps took 6.58 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7031 New best_val_rmse: 0.7031

16 steps took 6.59 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5437 New best_val_rmse: 0.5437

16 steps took 6.62 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.643 Still best_val_rmse: 0.5437 (from epoch 0)

16 steps took 6.62 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5378 New best_val_rmse: 0.5378

16 steps took 6.59 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5252 New best_val_rmse: 0.5252

16 steps took 7.22 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.7084 Still best_val_rmse: 0.5252 (from epoch 0)

16 steps

[32m[I 2021-07-22 16:07:13,880][0m Trial 16 finished with value: 0.4930591881275177 and parameters: {'base_lr': 6.108276630664184e-05, 'last_lr': 0.00011544056953737668, 'epochs': 4}. Best is trial 16 with value: 0.4930591881275177.[0m



##### Using fold 3
##### Using base_lr 5.801665314119398e-05 last_lr 0.00011042592029436529 epochs 4


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.51 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.139 New best_val_rmse: 1.139

16 steps took 6.57 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.759 New best_val_rmse: 0.759

16 steps took 6.61 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7591 Still best_val_rmse: 0.759 (from epoch 0)

16 steps took 6.57 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7948 Still best_val_rmse: 0.759 (from epoch 0)

16 steps took 6.63 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.592 New best_val_rmse: 0.592

16 steps took 6.58 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5457 New best_val_rmse: 0.5457

16 steps took 6.61 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5904 Still best_val_rmse: 0.5457 (from epoch 0)

16 steps took 6.61 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5438 New best_val_rmse: 0.5438

16 steps took 6.62 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5177 New best_val_rmse: 0.5177

16 steps took 7.15 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.7135 Still best_val_rmse: 0.5177 (from epoch 

[32m[I 2021-07-22 16:16:02,881][0m Trial 17 finished with value: 0.49643710255622864 and parameters: {'base_lr': 5.801665314119398e-05, 'last_lr': 0.00011042592029436529, 'epochs': 4}. Best is trial 16 with value: 0.4930591881275177.[0m



##### Using fold 3
##### Using base_lr 3.194618099250861e-05 last_lr 8.128233705048789e-05 epochs 3


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 7.56 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.848 New best_val_rmse: 1.848

16 steps took 6.58 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9074 New best_val_rmse: 0.9074

16 steps took 6.58 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8247 New best_val_rmse: 0.8247

16 steps took 6.59 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6383 New best_val_rmse: 0.6383

16 steps took 6.56 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7207 Still best_val_rmse: 0.6383 (from epoch 0)

16 steps took 6.59 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6428 Still best_val_rmse: 0.6383 (from epoch 0)

16 steps took 6.63 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5909 New best_val_rmse: 0.5909

16 steps took 6.6 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6098 Still best_val_rmse: 0.5909 (from epoch 0)

16 steps took 6.59 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6287 Still best_val_rmse: 0.5909 (from epoch 0)

16 steps took 7.16 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.6038 Still best_val_rms

[32m[I 2021-07-22 16:22:26,026][0m Trial 18 finished with value: 0.5293550491333008 and parameters: {'base_lr': 3.194618099250861e-05, 'last_lr': 8.128233705048789e-05, 'epochs': 3}. Best is trial 16 with value: 0.4930591881275177.[0m



##### Using fold 3
##### Using base_lr 9.772651821943555e-05 last_lr 0.00013610151307519143 epochs 4


[33m[W 2021-07-22 16:22:34,339][0m Trial 19 failed because of the following error: ValueError('Connection error, and we cannot find the requested files in the cached path. Please try again or make sure your Internet connection is on.')
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/optuna/_optimize.py", line 216, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-53-8b37909b4267>", line 15, in objective
    tokenizer = AutoTokenizer.from_pretrained(cfg.TOKENIZER_PATH)
  File "/opt/conda/lib/python3.8/site-packages/transformers/models/auto/tokenization_auto.py", line 445, in from_pretrained
    return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/transformers/tokenization_utils_base.py", line 1672, in from_pretrained
    resolved_vocab_files[file_id] = cached_path(
  File "/opt/conda/lib/python3.8/site-packages/transformers/file_utils.py", line

ValueError: Connection error, and we cannot find the requested files in the cached path. Please try again or make sure your Internet connection is on.

In [None]:
for i in range(4, len(list(splits))):
    fold = i
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20)
    print(" Best value: ", study.best_trial.value)
    print(" Best params: ")
    for key, value in study.best_trial.params.items():
        print(f"    {key}: {value}")

[32m[I 2021-07-22 16:25:58,833][0m A new study created in memory with name: no-name-b64f2479-cc61-4389-a987-26e39eeee01a[0m


##### Using fold 4
##### Using base_lr 0.00012455338478817077 last_lr 0.0007911697618105153 epochs 4


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.45 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.067 New best_val_rmse: 1.067

16 steps took 6.5 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.08 Still best_val_rmse: 1.067 (from epoch 0)

16 steps took 6.5 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6595 New best_val_rmse: 0.6595

16 steps took 6.57 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8245 Still best_val_rmse: 0.6595 (from epoch 0)

16 steps took 6.54 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6526 New best_val_rmse: 0.6526

16 steps took 6.56 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6227 New best_val_rmse: 0.6227

16 steps took 6.56 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6191 New best_val_rmse: 0.6191

16 steps took 6.57 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5593 New best_val_rmse: 0.5593

16 steps took 6.57 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5453 New best_val_rmse: 0.5453

16 steps took 7.15 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5415 New best_val_rmse: 0.5415

16 steps took 6.58 seconds
Ep

[32m[I 2021-07-22 16:35:01,351][0m Trial 0 finished with value: 0.4938439726829529 and parameters: {'base_lr': 0.00012455338478817077, 'last_lr': 0.0007911697618105153, 'epochs': 4}. Best is trial 0 with value: 0.4938439726829529.[0m



##### Using fold 4
##### Using base_lr 0.0002489252288528087 last_lr 0.0018404752185335377 epochs 5


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 7.59 seconds
Epoch: 0 batch_num: 16 val_rmse: 3.833 New best_val_rmse: 3.833

16 steps took 6.59 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.265 New best_val_rmse: 1.265

16 steps took 6.54 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.045 New best_val_rmse: 1.045

16 steps took 6.57 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.18 Still best_val_rmse: 1.045 (from epoch 0)

16 steps took 6.55 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.021 New best_val_rmse: 1.021

16 steps took 6.61 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.356 Still best_val_rmse: 1.021 (from epoch 0)



[32m[I 2021-07-22 16:36:40,713][0m Trial 1 finished with value: 1.0206865072250366 and parameters: {'base_lr': 0.0002489252288528087, 'last_lr': 0.0018404752185335377, 'epochs': 5}. Best is trial 0 with value: 0.4938439726829529.[0m


##### Using fold 4
##### Using base_lr 9.38235365621146e-05 last_lr 9.008888128281592e-05 epochs 5


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 7.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.902 New best_val_rmse: 1.902

16 steps took 6.59 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9186 New best_val_rmse: 0.9186

16 steps took 6.57 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9052 New best_val_rmse: 0.9052

16 steps took 6.57 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7886 New best_val_rmse: 0.7886

16 steps took 6.6 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6119 New best_val_rmse: 0.6119

16 steps took 6.62 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7626 Still best_val_rmse: 0.6119 (from epoch 0)

16 steps took 6.6 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6028 New best_val_rmse: 0.6028

16 steps took 6.61 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6147 Still best_val_rmse: 0.6028 (from epoch 0)

16 steps took 6.64 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6376 Still best_val_rmse: 0.6028 (from epoch 0)


[32m[I 2021-07-22 16:39:02,587][0m Trial 2 finished with value: 0.6027756929397583 and parameters: {'base_lr': 9.38235365621146e-05, 'last_lr': 9.008888128281592e-05, 'epochs': 5}. Best is trial 0 with value: 0.4938439726829529.[0m



##### Using fold 4
##### Using base_lr 0.0003672395297481787 last_lr 0.0013548633017898166 epochs 3


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 7.48 seconds
Epoch: 0 batch_num: 16 val_rmse: 7.37 New best_val_rmse: 7.37

16 steps took 6.53 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.062 New best_val_rmse: 1.062

16 steps took 6.53 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.142 Still best_val_rmse: 1.062 (from epoch 0)

16 steps took 6.52 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.27 Still best_val_rmse: 1.062 (from epoch 0)

16 steps took 6.56 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.117 Still best_val_rmse: 1.062 (from epoch 0)

16 steps took 6.53 seconds


[32m[I 2021-07-22 16:40:35,776][0m Trial 3 finished with value: 1.0616463422775269 and parameters: {'base_lr': 0.0003672395297481787, 'last_lr': 0.0013548633017898166, 'epochs': 3}. Best is trial 0 with value: 0.4938439726829529.[0m


Epoch: 0 batch_num: 96 val_rmse: 1.151 Still best_val_rmse: 1.062 (from epoch 0)

##### Using fold 4
##### Using base_lr 8.316774517963056e-05 last_lr 0.0007533735404761509 epochs 3


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 7.49 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.686 New best_val_rmse: 1.686

16 steps took 6.57 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9998 New best_val_rmse: 0.9998

16 steps took 6.57 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7348 New best_val_rmse: 0.7348

16 steps took 6.59 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.701 New best_val_rmse: 0.701

16 steps took 6.59 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6463 New best_val_rmse: 0.6463

16 steps took 6.64 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6436 New best_val_rmse: 0.6436

16 steps took 6.64 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6069 New best_val_rmse: 0.6069

16 steps took 6.6 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.598 New best_val_rmse: 0.598

16 steps took 6.59 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6288 Still best_val_rmse: 0.598 (from epoch 0)

16 steps took 7.19 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.6033 Still best_val_rmse: 0.598 (from epoch 0)

16 steps took 6.58 seconds
Epoc

[32m[I 2021-07-22 16:46:58,394][0m Trial 4 finished with value: 0.5069844126701355 and parameters: {'base_lr': 8.316774517963056e-05, 'last_lr': 0.0007533735404761509, 'epochs': 3}. Best is trial 0 with value: 0.4938439726829529.[0m



##### Using fold 4
##### Using base_lr 8.03800437091818e-05 last_lr 0.002631415142328459 epochs 3


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 7.53 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.245 New best_val_rmse: 1.245

16 steps took 6.56 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9155 New best_val_rmse: 0.9155

16 steps took 6.55 seconds
Epoch: 0 batch_num: 48 val_rmse: 3.788 Still best_val_rmse: 0.9155 (from epoch 0)

16 steps took 6.59 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.093 Still best_val_rmse: 0.9155 (from epoch 0)

16 steps took 6.55 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.9219 Still best_val_rmse: 0.9155 (from epoch 0)

16 steps took 6.62 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6371 New best_val_rmse: 0.6371



[32m[I 2021-07-22 16:48:31,724][0m Trial 5 finished with value: 0.6371252536773682 and parameters: {'base_lr': 8.03800437091818e-05, 'last_lr': 0.002631415142328459, 'epochs': 3}. Best is trial 0 with value: 0.4938439726829529.[0m


##### Using fold 4
##### Using base_lr 6.684301123331321e-05 last_lr 0.0030363691693253004 epochs 4


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.52 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.403 New best_val_rmse: 1.403

16 steps took 6.56 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8943 New best_val_rmse: 0.8943

16 steps took 6.57 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9561 Still best_val_rmse: 0.8943 (from epoch 0)

16 steps took 6.57 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6707 New best_val_rmse: 0.6707

16 steps took 6.58 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6376 New best_val_rmse: 0.6376

16 steps took 6.6 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.8012 Still best_val_rmse: 0.6376 (from epoch 0)

16 steps took 6.6 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7961 Still best_val_rmse: 0.6376 (from epoch 0)

16 steps took 6.65 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.9327 Still best_val_rmse: 0.6376 (from epoch 0)

16 steps took 6.59 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6692 Still best_val_rmse: 0.6376 (from epoch 0)


[32m[I 2021-07-22 16:50:48,635][0m Trial 6 finished with value: 0.6375647187232971 and parameters: {'base_lr': 6.684301123331321e-05, 'last_lr': 0.0030363691693253004, 'epochs': 4}. Best is trial 0 with value: 0.4938439726829529.[0m



##### Using fold 4
##### Using base_lr 3.880623870249541e-05 last_lr 0.00031391618161442236 epochs 5


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 7.53 seconds
Epoch: 0 batch_num: 16 val_rmse: 2.884 New best_val_rmse: 2.884

16 steps took 6.56 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9728 New best_val_rmse: 0.9728

16 steps took 6.57 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7719 New best_val_rmse: 0.7719

16 steps took 6.57 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6433 New best_val_rmse: 0.6433

16 steps took 6.57 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6212 New best_val_rmse: 0.6212

16 steps took 6.58 seconds


[32m[I 2021-07-22 16:52:21,801][0m Trial 7 finished with value: 0.5872001051902771 and parameters: {'base_lr': 3.880623870249541e-05, 'last_lr': 0.00031391618161442236, 'epochs': 5}. Best is trial 0 with value: 0.4938439726829529.[0m


Epoch: 0 batch_num: 96 val_rmse: 0.5872 New best_val_rmse: 0.5872

##### Using fold 4
##### Using base_lr 0.0003781921609267425 last_lr 0.0002729467903023369 epochs 4


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.43 seconds
Epoch: 0 batch_num: 16 val_rmse: 7.607 New best_val_rmse: 7.607

16 steps took 6.54 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.249 New best_val_rmse: 1.249

16 steps took 6.52 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.183 New best_val_rmse: 1.183

16 steps took 6.52 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.051 New best_val_rmse: 1.051

16 steps took 6.56 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.04 New best_val_rmse: 1.04

16 steps took 6.58 seconds


[32m[I 2021-07-22 16:53:55,268][0m Trial 8 finished with value: 1.0396034717559814 and parameters: {'base_lr': 0.0003781921609267425, 'last_lr': 0.0002729467903023369, 'epochs': 4}. Best is trial 0 with value: 0.4938439726829529.[0m


Epoch: 0 batch_num: 96 val_rmse: 1.055 Still best_val_rmse: 1.04 (from epoch 0)

##### Using fold 4
##### Using base_lr 0.00030774727212992784 last_lr 0.0029232340532154497 epochs 3


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 7.52 seconds
Epoch: 0 batch_num: 16 val_rmse: 5.963 New best_val_rmse: 5.963

16 steps took 6.51 seconds
Epoch: 0 batch_num: 32 val_rmse: 2.312 New best_val_rmse: 2.312

16 steps took 6.54 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.115 New best_val_rmse: 1.115

16 steps took 6.51 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.142 Still best_val_rmse: 1.115 (from epoch 0)

16 steps took 6.53 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.042 New best_val_rmse: 1.042

16 steps took 6.57 seconds


[32m[I 2021-07-22 16:55:29,041][0m Trial 9 finished with value: 1.0418716669082642 and parameters: {'base_lr': 0.00030774727212992784, 'last_lr': 0.0029232340532154497, 'epochs': 3}. Best is trial 0 with value: 0.4938439726829529.[0m


Epoch: 0 batch_num: 96 val_rmse: 1.088 Still best_val_rmse: 1.042 (from epoch 0)

##### Using fold 4
##### Using base_lr 0.0001717178883932075 last_lr 0.00042448836147656634 epochs 4


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.51 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.608 New best_val_rmse: 1.608

16 steps took 6.6 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.143 New best_val_rmse: 1.143

16 steps took 6.61 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9745 New best_val_rmse: 0.9745

16 steps took 6.57 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.796 New best_val_rmse: 0.796

16 steps took 6.59 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6682 New best_val_rmse: 0.6682

16 steps took 6.59 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6391 New best_val_rmse: 0.6391

16 steps took 6.63 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5981 New best_val_rmse: 0.5981

16 steps took 6.59 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5786 New best_val_rmse: 0.5786

16 steps took 6.59 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5935 Still best_val_rmse: 0.5786 (from epoch 0)

16 steps took 7.21 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5626 New best_val_rmse: 0.5626

16 steps took 6.59 seconds
Epoch: 1 batch_num:

[32m[I 2021-07-22 17:04:28,900][0m Trial 10 finished with value: 0.48955243825912476 and parameters: {'base_lr': 0.0001717178883932075, 'last_lr': 0.00042448836147656634, 'epochs': 4}. Best is trial 10 with value: 0.48955243825912476.[0m



##### Using fold 4
##### Using base_lr 0.000186065157334065 last_lr 0.0004248090651725791 epochs 4


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.47 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.799 New best_val_rmse: 1.799

16 steps took 6.56 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9812 New best_val_rmse: 0.9812

16 steps took 6.57 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9166 New best_val_rmse: 0.9166

16 steps took 6.57 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8259 New best_val_rmse: 0.8259

16 steps took 6.56 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7665 New best_val_rmse: 0.7665

16 steps took 6.58 seconds


[32m[I 2021-07-22 17:06:02,436][0m Trial 11 finished with value: 0.7665433883666992 and parameters: {'base_lr': 0.000186065157334065, 'last_lr': 0.0004248090651725791, 'epochs': 4}. Best is trial 10 with value: 0.48955243825912476.[0m


Epoch: 0 batch_num: 96 val_rmse: 0.9536 Still best_val_rmse: 0.7665 (from epoch 0)

##### Using fold 4
##### Using base_lr 0.00015692949034032473 last_lr 0.00014268196348590442 epochs 4


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.48 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.178 New best_val_rmse: 1.178

16 steps took 6.57 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8942 New best_val_rmse: 0.8942

16 steps took 6.62 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6972 New best_val_rmse: 0.6972

16 steps took 6.57 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7543 Still best_val_rmse: 0.6972 (from epoch 0)

16 steps took 6.58 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7288 Still best_val_rmse: 0.6972 (from epoch 0)

16 steps took 6.6 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.9509 Still best_val_rmse: 0.6972 (from epoch 0)

16 steps took 6.59 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6933 New best_val_rmse: 0.6933

16 steps took 6.6 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.8176 Still best_val_rmse: 0.6933 (from epoch 0)

16 steps took 6.6 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.7374 Still best_val_rmse: 0.6933 (from epoch 0)



[32m[I 2021-07-22 17:08:18,214][0m Trial 12 finished with value: 0.6933364868164062 and parameters: {'base_lr': 0.00015692949034032473, 'last_lr': 0.00014268196348590442, 'epochs': 4}. Best is trial 10 with value: 0.48955243825912476.[0m


##### Using fold 4
##### Using base_lr 0.0001732732522059111 last_lr 0.000741969827003011 epochs 4


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.49 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.795 New best_val_rmse: 1.795

16 steps took 6.62 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.038 New best_val_rmse: 1.038

16 steps took 6.55 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9851 New best_val_rmse: 0.9851

16 steps took 6.52 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.35 Still best_val_rmse: 0.9851 (from epoch 0)

16 steps took 6.52 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.054 Still best_val_rmse: 0.9851 (from epoch 0)

16 steps took 6.54 seconds


[32m[I 2021-07-22 17:09:52,310][0m Trial 13 finished with value: 0.9850828051567078 and parameters: {'base_lr': 0.0001732732522059111, 'last_lr': 0.000741969827003011, 'epochs': 4}. Best is trial 10 with value: 0.48955243825912476.[0m


Epoch: 0 batch_num: 96 val_rmse: 1.116 Still best_val_rmse: 0.9851 (from epoch 0)

##### Using fold 4
##### Using base_lr 0.00012436284032296463 last_lr 0.0011179313245251732 epochs 4


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.042 New best_val_rmse: 1.042

16 steps took 6.56 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8783 New best_val_rmse: 0.8783

16 steps took 6.55 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7238 New best_val_rmse: 0.7238

16 steps took 6.58 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8231 Still best_val_rmse: 0.7238 (from epoch 0)

16 steps took 6.63 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6483 New best_val_rmse: 0.6483

16 steps took 6.57 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7195 Still best_val_rmse: 0.6483 (from epoch 0)

16 steps took 6.59 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6436 New best_val_rmse: 0.6436

16 steps took 6.62 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7433 Still best_val_rmse: 0.6436 (from epoch 0)

16 steps took 6.64 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6006 New best_val_rmse: 0.6006


[32m[I 2021-07-22 17:12:08,791][0m Trial 14 finished with value: 0.6006467938423157 and parameters: {'base_lr': 0.00012436284032296463, 'last_lr': 0.0011179313245251732, 'epochs': 4}. Best is trial 10 with value: 0.48955243825912476.[0m



##### Using fold 4
##### Using base_lr 4.027865360251286e-05 last_lr 0.00047032769189387546 epochs 5


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 7.48 seconds
Epoch: 0 batch_num: 16 val_rmse: 2.785 New best_val_rmse: 2.785

16 steps took 6.56 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.004 New best_val_rmse: 1.004

16 steps took 6.59 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7524 New best_val_rmse: 0.7524

16 steps took 6.55 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6769 New best_val_rmse: 0.6769

16 steps took 6.59 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6885 Still best_val_rmse: 0.6769 (from epoch 0)

16 steps took 6.56 seconds


[32m[I 2021-07-22 17:13:42,152][0m Trial 15 finished with value: 0.6580143570899963 and parameters: {'base_lr': 4.027865360251286e-05, 'last_lr': 0.00047032769189387546, 'epochs': 5}. Best is trial 10 with value: 0.48955243825912476.[0m


Epoch: 0 batch_num: 96 val_rmse: 0.658 New best_val_rmse: 0.658

##### Using fold 4
##### Using base_lr 0.00022983699940706383 last_lr 0.0001426453208823319 epochs 4


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.5 seconds
Epoch: 0 batch_num: 16 val_rmse: 2.906 New best_val_rmse: 2.906

16 steps took 6.59 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.069 New best_val_rmse: 1.069

16 steps took 6.56 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8899 New best_val_rmse: 0.8899

16 steps took 6.58 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7522 New best_val_rmse: 0.7522

16 steps took 6.65 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6326 New best_val_rmse: 0.6326

16 steps took 6.62 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6528 Still best_val_rmse: 0.6326 (from epoch 0)


[32m[I 2021-07-22 17:15:15,895][0m Trial 16 finished with value: 0.6326446533203125 and parameters: {'base_lr': 0.00022983699940706383, 'last_lr': 0.0001426453208823319, 'epochs': 4}. Best is trial 10 with value: 0.48955243825912476.[0m



##### Using fold 4
##### Using base_lr 5.6335035163877976e-05 last_lr 0.004952718022594586 epochs 4


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.48 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.14 New best_val_rmse: 1.14

16 steps took 6.58 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8726 New best_val_rmse: 0.8726

16 steps took 6.56 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9114 Still best_val_rmse: 0.8726 (from epoch 0)

16 steps took 6.58 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7547 New best_val_rmse: 0.7547

16 steps took 6.63 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6733 New best_val_rmse: 0.6733

16 steps took 6.58 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7172 Still best_val_rmse: 0.6733 (from epoch 0)

16 steps took 6.66 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7779 Still best_val_rmse: 0.6733 (from epoch 0)

16 steps took 6.58 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6899 Still best_val_rmse: 0.6733 (from epoch 0)

16 steps took 6.58 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5819 New best_val_rmse: 0.5819

16 steps took 7.17 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5634 New best_val_rmse: 

[32m[I 2021-07-22 17:23:43,960][0m Trial 17 finished with value: 0.5104207992553711 and parameters: {'base_lr': 5.6335035163877976e-05, 'last_lr': 0.004952718022594586, 'epochs': 4}. Best is trial 10 with value: 0.48955243825912476.[0m



##### Using fold 4
##### Using base_lr 0.00012067145049904876 last_lr 0.00021114563956794053 epochs 3


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 7.53 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.195 New best_val_rmse: 1.195

16 steps took 6.58 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.025 New best_val_rmse: 1.025

16 steps took 6.57 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7846 New best_val_rmse: 0.7846

16 steps took 6.57 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.9852 Still best_val_rmse: 0.7846 (from epoch 0)

16 steps took 6.66 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6659 New best_val_rmse: 0.6659

16 steps took 6.61 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6361 New best_val_rmse: 0.6361

16 steps took 6.62 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6051 New best_val_rmse: 0.6051

16 steps took 6.6 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6584 Still best_val_rmse: 0.6051 (from epoch 0)

16 steps took 6.6 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6044 New best_val_rmse: 0.6044


[32m[I 2021-07-22 17:26:00,523][0m Trial 18 finished with value: 0.6043904423713684 and parameters: {'base_lr': 0.00012067145049904876, 'last_lr': 0.00021114563956794053, 'epochs': 3}. Best is trial 10 with value: 0.48955243825912476.[0m



##### Using fold 4
##### Using base_lr 0.00012182070076373223 last_lr 0.0006078521862666121 epochs 5


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 7.48 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.07 New best_val_rmse: 1.07

16 steps took 6.57 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.125 Still best_val_rmse: 1.07 (from epoch 0)

16 steps took 6.57 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6927 New best_val_rmse: 0.6927

16 steps took 6.65 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7789 Still best_val_rmse: 0.6927 (from epoch 0)

16 steps took 6.59 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.8107 Still best_val_rmse: 0.6927 (from epoch 0)

16 steps took 6.61 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.8272 Still best_val_rmse: 0.6927 (from epoch 0)

16 steps took 6.6 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6272 New best_val_rmse: 0.6272

16 steps took 6.61 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5659 New best_val_rmse: 0.5659

16 steps took 6.59 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.54 New best_val_rmse: 0.54

16 steps took 7.16 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5606 Still best_val_rmse: 0.54 (

[32m[I 2021-07-22 17:38:01,904][0m Trial 19 finished with value: 0.49262937903404236 and parameters: {'base_lr': 0.00012182070076373223, 'last_lr': 0.0006078521862666121, 'epochs': 5}. Best is trial 10 with value: 0.48955243825912476.[0m
[32m[I 2021-07-22 17:38:01,909][0m A new study created in memory with name: no-name-f1ea1a6e-5e38-4907-acf3-6e239ec55f99[0m



 Best value:  0.48955243825912476
 Best params: 
    base_lr: 0.0001717178883932075
    last_lr: 0.00042448836147656634
    epochs: 4
##### Using fold 5
##### Using base_lr 0.0002764477472207697 last_lr 0.00021564819034446844 epochs 3


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 7.53 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.86 New best_val_rmse: 1.86

16 steps took 6.56 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.092 New best_val_rmse: 1.092

16 steps took 6.54 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.06 New best_val_rmse: 1.06

16 steps took 6.55 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.9125 New best_val_rmse: 0.9125

16 steps took 6.58 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.102 Still best_val_rmse: 0.9125 (from epoch 0)

16 steps took 6.56 seconds


[32m[I 2021-07-22 17:39:35,458][0m Trial 0 finished with value: 0.9125391244888306 and parameters: {'base_lr': 0.0002764477472207697, 'last_lr': 0.00021564819034446844, 'epochs': 3}. Best is trial 0 with value: 0.9125391244888306.[0m


Epoch: 0 batch_num: 96 val_rmse: 1.02 Still best_val_rmse: 0.9125 (from epoch 0)

##### Using fold 5
##### Using base_lr 0.00024554936545913427 last_lr 0.0007753895451485905 epochs 3


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 7.58 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.715 New best_val_rmse: 1.715

16 steps took 6.56 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.003 New best_val_rmse: 1.003

16 steps took 6.57 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7058 New best_val_rmse: 0.7058

16 steps took 6.58 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7339 Still best_val_rmse: 0.7058 (from epoch 0)

16 steps took 6.6 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6272 New best_val_rmse: 0.6272

16 steps took 6.64 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.8411 Still best_val_rmse: 0.6272 (from epoch 0)

16 steps took 6.61 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.9506 Still best_val_rmse: 0.6272 (from epoch 0)

16 steps took 6.6 seconds
Epoch: 0 batch_num: 128 val_rmse: 1.012 Still best_val_rmse: 0.6272 (from epoch 0)

16 steps took 6.6 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.7588 Still best_val_rmse: 0.6272 (from epoch 0)


[32m[I 2021-07-22 17:41:52,072][0m Trial 1 finished with value: 0.6271501779556274 and parameters: {'base_lr': 0.00024554936545913427, 'last_lr': 0.0007753895451485905, 'epochs': 3}. Best is trial 1 with value: 0.6271501779556274.[0m



##### Using fold 5
##### Using base_lr 0.0002748663461976918 last_lr 0.0042910375069491755 epochs 4


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.728 New best_val_rmse: 1.728

16 steps took 6.58 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9309 New best_val_rmse: 0.9309

16 steps took 6.57 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9043 New best_val_rmse: 0.9043

16 steps took 6.58 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.9819 Still best_val_rmse: 0.9043 (from epoch 0)

16 steps took 6.55 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.066 Still best_val_rmse: 0.9043 (from epoch 0)

16 steps took 6.54 seconds


[32m[I 2021-07-22 17:43:25,930][0m Trial 2 finished with value: 0.904285192489624 and parameters: {'base_lr': 0.0002748663461976918, 'last_lr': 0.0042910375069491755, 'epochs': 4}. Best is trial 1 with value: 0.6271501779556274.[0m


Epoch: 0 batch_num: 96 val_rmse: 1.02 Still best_val_rmse: 0.9043 (from epoch 0)

##### Using fold 5
##### Using base_lr 7.652664002075421e-05 last_lr 0.00014979767491809303 epochs 3


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 7.58 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8646 New best_val_rmse: 0.8646

16 steps took 6.58 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6805 New best_val_rmse: 0.6805

16 steps took 6.56 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6246 New best_val_rmse: 0.6246

16 steps took 6.57 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7585 Still best_val_rmse: 0.6246 (from epoch 0)

16 steps took 6.58 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5748 New best_val_rmse: 0.5748

16 steps took 6.61 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7808 Still best_val_rmse: 0.5748 (from epoch 0)

16 steps took 6.61 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5781 Still best_val_rmse: 0.5748 (from epoch 0)

16 steps took 6.6 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6532 Still best_val_rmse: 0.5748 (from epoch 0)

16 steps took 6.59 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5639 New best_val_rmse: 0.5639

16 steps took 7.2 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5591 New best_val_rmse

[32m[I 2021-07-22 17:49:59,020][0m Trial 3 finished with value: 0.480750173330307 and parameters: {'base_lr': 7.652664002075421e-05, 'last_lr': 0.00014979767491809303, 'epochs': 3}. Best is trial 3 with value: 0.480750173330307.[0m



##### Using fold 5
##### Using base_lr 0.00047039367469524384 last_lr 0.0010511399095471956 epochs 5


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 7.53 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.18 New best_val_rmse: 1.18

16 steps took 6.55 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.429 Still best_val_rmse: 1.18 (from epoch 0)

16 steps took 6.53 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.04 New best_val_rmse: 1.04

16 steps took 6.55 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.081 Still best_val_rmse: 1.04 (from epoch 0)

16 steps took 6.52 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.091 Still best_val_rmse: 1.04 (from epoch 0)

16 steps took 6.57 seconds


[32m[I 2021-07-22 17:51:32,557][0m Trial 4 finished with value: 1.0404369831085205 and parameters: {'base_lr': 0.00047039367469524384, 'last_lr': 0.0010511399095471956, 'epochs': 5}. Best is trial 3 with value: 0.480750173330307.[0m


Epoch: 0 batch_num: 96 val_rmse: 1.103 Still best_val_rmse: 1.04 (from epoch 0)

##### Using fold 5
##### Using base_lr 0.000135700916847811 last_lr 0.0029640935672153 epochs 4


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.52 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.176 New best_val_rmse: 1.176

16 steps took 6.58 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8045 New best_val_rmse: 0.8045

16 steps took 6.63 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6749 New best_val_rmse: 0.6749

16 steps took 6.6 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6839 Still best_val_rmse: 0.6749 (from epoch 0)

16 steps took 6.62 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6362 New best_val_rmse: 0.6362

16 steps took 6.6 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.711 Still best_val_rmse: 0.6362 (from epoch 0)

16 steps took 6.59 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6578 Still best_val_rmse: 0.6362 (from epoch 0)

16 steps took 6.71 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.773 Still best_val_rmse: 0.6362 (from epoch 0)

16 steps took 6.62 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5746 New best_val_rmse: 0.5746

16 steps took 7.18 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.6094 Still best_val_rmse: 

[32m[I 2021-07-22 18:07:28,519][0m Trial 5 finished with value: 0.4688156247138977 and parameters: {'base_lr': 0.000135700916847811, 'last_lr': 0.0029640935672153, 'epochs': 4}. Best is trial 5 with value: 0.4688156247138977.[0m


Epoch: 3 batch_num: 147 val_rmse: 0.4695 Still best_val_rmse: 0.4688 (from epoch 3)

##### Using fold 5
##### Using base_lr 0.0003816244755730149 last_lr 0.0001760744534229041 epochs 3


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 7.57 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.765 New best_val_rmse: 1.765

16 steps took 6.56 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.583 New best_val_rmse: 1.583

16 steps took 6.56 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.313 New best_val_rmse: 1.313

16 steps took 6.57 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.288 New best_val_rmse: 1.288

16 steps took 6.58 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.9166 New best_val_rmse: 0.9166

16 steps took 6.58 seconds


[32m[I 2021-07-22 18:09:02,364][0m Trial 6 finished with value: 0.9166398644447327 and parameters: {'base_lr': 0.0003816244755730149, 'last_lr': 0.0001760744534229041, 'epochs': 3}. Best is trial 5 with value: 0.4688156247138977.[0m


Epoch: 0 batch_num: 96 val_rmse: 1.208 Still best_val_rmse: 0.9166 (from epoch 0)

##### Using fold 5
##### Using base_lr 8.995995358073548e-05 last_lr 0.0005913800163279911 epochs 5


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 7.62 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.7742 New best_val_rmse: 0.7742

16 steps took 6.57 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6992 New best_val_rmse: 0.6992

16 steps took 6.59 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6155 New best_val_rmse: 0.6155

16 steps took 6.6 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7417 Still best_val_rmse: 0.6155 (from epoch 0)

16 steps took 6.65 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6845 Still best_val_rmse: 0.6155 (from epoch 0)

16 steps took 6.61 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.864 Still best_val_rmse: 0.6155 (from epoch 0)

16 steps took 6.6 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5769 New best_val_rmse: 0.5769

16 steps took 6.6 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.8989 Still best_val_rmse: 0.5769 (from epoch 0)

16 steps took 6.61 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.8375 Still best_val_rmse: 0.5769 (from epoch 0)

16 steps took 7.2 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.6572 Sti

[32m[I 2021-07-22 18:32:35,727][0m Trial 7 finished with value: 0.46997010707855225 and parameters: {'base_lr': 8.995995358073548e-05, 'last_lr': 0.0005913800163279911, 'epochs': 5}. Best is trial 5 with value: 0.4688156247138977.[0m


Epoch: 4 batch_num: 147 val_rmse: 0.4725 Still best_val_rmse: 0.47 (from epoch 3)

##### Using fold 5
##### Using base_lr 0.00012020206263341346 last_lr 0.0005492564740784009 epochs 3


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 7.58 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.027 New best_val_rmse: 1.027

16 steps took 6.58 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8473 New best_val_rmse: 0.8473

16 steps took 6.56 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6328 New best_val_rmse: 0.6328

16 steps took 6.62 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6024 New best_val_rmse: 0.6024

16 steps took 6.59 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5739 New best_val_rmse: 0.5739

16 steps took 6.59 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.8438 Still best_val_rmse: 0.5739 (from epoch 0)

16 steps took 6.63 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6299 Still best_val_rmse: 0.5739 (from epoch 0)

16 steps took 6.59 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6472 Still best_val_rmse: 0.5739 (from epoch 0)

16 steps took 6.59 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5388 New best_val_rmse: 0.5388

16 steps took 7.19 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5286 New best_val_rmse: 0.5286

16 step

[32m[I 2021-07-22 18:39:13,829][0m Trial 8 finished with value: 0.48350340127944946 and parameters: {'base_lr': 0.00012020206263341346, 'last_lr': 0.0005492564740784009, 'epochs': 3}. Best is trial 5 with value: 0.4688156247138977.[0m



##### Using fold 5
##### Using base_lr 3.416461739342611e-05 last_lr 0.0021998765313380802 epochs 4


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.54 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.098 New best_val_rmse: 1.098

16 steps took 6.57 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8311 New best_val_rmse: 0.8311

16 steps took 6.57 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7495 New best_val_rmse: 0.7495

16 steps took 6.57 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8807 Still best_val_rmse: 0.7495 (from epoch 0)

16 steps took 6.59 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7183 New best_val_rmse: 0.7183

16 steps took 6.6 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7341 Still best_val_rmse: 0.7183 (from epoch 0)

16 steps took 6.62 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5591 New best_val_rmse: 0.5591

16 steps took 6.62 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6971 Still best_val_rmse: 0.5591 (from epoch 0)

16 steps took 6.6 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5719 Still best_val_rmse: 0.5591 (from epoch 0)

16 steps took 7.12 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5578 New best_val_rmse: 

[32m[I 2021-07-22 18:47:40,939][0m Trial 9 finished with value: 0.503010630607605 and parameters: {'base_lr': 3.416461739342611e-05, 'last_lr': 0.0021998765313380802, 'epochs': 4}. Best is trial 5 with value: 0.4688156247138977.[0m



##### Using fold 5
##### Using base_lr 4.1983500986909504e-05 last_lr 0.004778601048775113 epochs 4


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


16 steps took 7.61 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.112 New best_val_rmse: 1.112

16 steps took 6.58 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7084 New best_val_rmse: 0.7084

16 steps took 6.63 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7748 Still best_val_rmse: 0.7084 (from epoch 0)

16 steps took 6.59 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8988 Still best_val_rmse: 0.7084 (from epoch 0)

16 steps took 6.58 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7561 Still best_val_rmse: 0.7084 (from epoch 0)

16 steps took 6.6 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5986 New best_val_rmse: 0.5986

16 steps took 6.61 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6054 Still best_val_rmse: 0.5986 (from epoch 0)

16 steps took 6.61 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6619 Still best_val_rmse: 0.5986 (from epoch 0)

16 steps took 6.59 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6014 Still best_val_rmse: 0.5986 (from epoch 0)

16 steps took 7.18 seconds
Epoch: 1 batch_num: 12 v

[32m[I 2021-07-22 18:56:23,505][0m Trial 10 finished with value: 0.4966193735599518 and parameters: {'base_lr': 4.1983500986909504e-05, 'last_lr': 0.004778601048775113, 'epochs': 4}. Best is trial 5 with value: 0.4688156247138977.[0m



##### Using fold 5
##### Using base_lr 0.00010638643847946303 last_lr 0.00036673227466262124 epochs 5


Some weights of the model checkpoint at xlnet-large-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-large-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.weight', 'sequence_summary.summary.bias', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


16 steps took 7.48 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9047 New best_val_rmse: 0.9047

16 steps took 6.56 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.114 Still best_val_rmse: 0.9047 (from epoch 0)

16 steps took 6.6 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.651 New best_val_rmse: 0.651

16 steps took 6.6 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6169 New best_val_rmse: 0.6169

16 steps took 6.58 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7197 Still best_val_rmse: 0.6169 (from epoch 0)

16 steps took 6.58 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7152 Still best_val_rmse: 0.6169 (from epoch 0)

16 steps took 6.59 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.8469 Still best_val_rmse: 0.6169 (from epoch 0)

16 steps took 6.59 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5955 New best_val_rmse: 0.5955

16 steps took 6.69 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5545 New best_val_rmse: 0.5545

16 steps took 7.18 seconds
Epoch: 1 batch_num: 12 val_rmse: 0.5298 New best_val_rmse: 0

### Verify the model

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm

In [None]:
cfg.model_offset = 0
cfg.model_limit = 6
cfg.n_folds = 5
cfg.svm_kernels = ['rbf']
cfg.svm_c = 5

In [None]:
num_bins = int(np.ceil(np.log2(len(train_df))))
train_df['bins'] = pd.cut(train_df['target'], bins=num_bins, labels=False)
bins = train_df['bins'].values

In [None]:
%%time

inference_models = []
for i in range(1, cfg.NUM_FOLDS + 1):
    print(f'Model {i}')
    inference_model = CommonLitModel()
    inference_model = inference_model.cuda()
    inference_model.load_state_dict(torch.load(str(MODELS_PATH/f"{cfg.model_name.replace('/', '_')}_{i}/model_{i}.pth")))
    inference_model.eval();
    inference_models.append(inference_model)

In [None]:
from transformers import RobertaTokenizer

tokenizers = []
for i in range(1, cfg.NUM_FOLDS):
    tokenizer = RobertaTokenizer.from_pretrained(MODELS_PATH/f"{cfg.model_name.replace('/', '_')}_{i}")
    tokenizers.append(tokenizer)

In [None]:
def get_cls_embeddings(dl, transformer_model):
    cls_embeddings = []
    with torch.no_grad():
        for input_features in tqdm(dl, total=len(dl)):
            output, context_vector = transformer_model(input_features['input_ids'].cuda(), input_features['attention_mask'].cuda())
#             cls_embeddings.extend(output['last_hidden_state'][:,0,:].detach().cpu().numpy())
            embedding_out = context_vector.detach().cpu().numpy()
            cls_embeddings.extend(embedding_out)
    return np.array(cls_embeddings)

In [None]:
def rmse_score(X, y):
    return np.sqrt(mean_squared_error(X, y))

In [None]:
def convert_to_list(t):
    return t.flatten().long()

class CommonLitDataset(nn.Module):
    def __init__(self, text, test_id, tokenizer, max_len=128):
        self.excerpt = text
        self.test_id = test_id
        self.max_len = max_len
        self.tokenizer = tokenizer
    
    def __getitem__(self,idx):
        encode = self.tokenizer(self.excerpt[idx],
                                return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',
                                truncation=True)
        return {'input_ids': convert_to_list(encode['input_ids']),
                'attention_mask': convert_to_list(encode['attention_mask']),
                'id': self.test_id[idx]}
    
    def __len__(self):
        return len(self.excerpt)

In [None]:
def create_dl(df, tokenizer):
    text = df['excerpt'].values
    ids = df['id'].values
    ds = CommonLitDataset(text, ids, tokenizer, max_len=cfg.MAX_LEN)
    return DataLoader(ds, 
                      batch_size = cfg.BATCH_SIZE,
                      shuffle=False,
                      num_workers = 1,
                      pin_memory=True,
                      drop_last=False
                     )

In [None]:
train_df = pd.read_csv(DATA_PATH/'train-orig.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')
remove_unnecessary(train_df)

In [None]:
train_target_mean = train_df['target'].mean()
train_target_std = train_df['target'].std()
train_df['normalized_target'] = (train_df['target'] - train_target_mean) / train_target_std

In [None]:
%%time

train_target = train_df['normalized_target'].values

def calc_mean(scores):
    return np.mean(np.array(scores), axis=0)

final_scores = []
final_rmse = []
kernel_rmse_score_mean = []
final_kernel_predictions_means = []
for j, (inference_model, tokenizer) in enumerate(zip(inference_models, tokenizers)):
    print('Model', j)
    test_dl = create_dl(test_df, tokenizer)
    train_dl = create_dl(train_df, tokenizer)
    transformer_model = inference_model
    transformer_model.cuda()
    X = get_cls_embeddings(train_dl, transformer_model)
    
    y = train_target
    X_test = get_cls_embeddings(test_dl, transformer_model)
    
    kfold = StratifiedKFold(n_splits=cfg.NUM_FOLDS)
    scores = []
    rmse_scores = []
    kernel_predictions_means = []
    for kernel in cfg.svm_kernels:
        print('Kernel', kernel)
        kernel_scores = []
        kernel_rmse_scores = []
        kernel_predictions = []
        for k, (train_idx, valid_idx) in enumerate(kfold.split(X, bins)):

            print('Fold', k, train_idx.shape, valid_idx.shape)
            model = SVR(C=cfg.svm_c, kernel=kernel, gamma='auto')

            X_train, y_train = X[train_idx], y[train_idx]
            X_valid, y_valid = X[valid_idx], y[valid_idx]
            model.fit(X_train, y_train)
            prediction = model.predict(X_valid)
            kernel_predictions.append(prediction)
            kernel_rmse_scores.append(rmse_score(prediction, y_valid))
            print('rmse_score', kernel_rmse_scores[k])
            kernel_scores.append(model.predict(X_test))
        kernel_predictions_means.append(np.array([np.mean(kp) for kp in kernel_predictions]).mean())
        scores.append(calc_mean(kernel_scores))
        kernel_rmse_score = calc_mean(kernel_rmse_scores)
        kernel_rmse_score_mean.append(kernel_rmse_score)
        rmse_scores.append(kernel_rmse_score)
    final_kernel_predictions_means.append(kernel_predictions_means)
    final_scores.append(calc_mean(scores))
    final_rmse.append(calc_mean(rmse_scores))
print('FINAL RMSE score', np.mean(np.array(final_rmse)))

In [None]:
final_kernel_predictions_means

In [None]:
# (train_df['target'] - cfg.train_target_mean) / cfg.train_target_std
final_scores_normalized = np.array(final_scores) * train_target_std + train_target_mean

In [None]:
kernel_rmse_score_mean_array = np.array(kernel_rmse_score_mean)
kernel_rmse_score_mean_sum = np.sum(kernel_rmse_score_mean_array)
prop_losses = kernel_rmse_score_mean_array / kernel_rmse_score_mean_sum
prop_losses_sum = (1 - prop_losses).sum()
weights = (1 - prop_losses) / prop_losses_sum
weights

In [None]:
def calc_mean(scores, weights=weights):
    return np.average(np.array(scores), weights=weights, axis=0)

In [None]:
target_mean = train_df['target'].mean()
final_scores_flat = calc_mean(final_scores_normalized).flatten()
final_scores_mean = final_scores_flat.mean()
target_mean, np.array(final_scores_normalized).mean()
# (-0.9579984513405823, -0.8029817438292849)

In [None]:
final_scores_flat

In [None]:
mean_diff = target_mean - final_scores_mean
mean_diff, mean_diff / len(final_scores)

In [None]:
sample_df['target'] = final_scores_flat + mean_diff
# sample_df['target'] = len(final_scores) / np.sum(1 / np.array(final_scores), axis=0) # harmonic mean
sample_df

### Prepare Packaging

In [None]:
cfg.model_name

In [None]:
BEST_MODEL_FOLDER = MODELS_PATH/cfg.model_name/'best'
!rm -rf {BEST_MODEL_FOLDER}
!mkdir -p {BEST_MODEL_FOLDER}

In [None]:
BEST_MODEL_FOLDER

In [None]:
cfg.NUM_FOLDS

In [None]:
bestmodels = [MODELS_PATH/f'{cfg.model_name}_{i + 1}' for i in range(0, cfg.NUM_FOLDS)]

In [None]:
bestmodels

In [None]:
from shutil import copyfile

def normalize_name(path_name):
    return path_name.replace('', '')

for i, best_model in enumerate(bestmodels):
    print(f'Processing {i}th model')
    i = i + 1
    best_model_file = f'{best_model}/model_{i}.pth'
    if Path(best_model_file).exists():
        copyfile(best_model_file, f'{BEST_MODEL_FOLDER}/{i}_pytorch_model.bin')
        tokenizer_path = Path(BEST_MODEL_FOLDER/f'tokenizer-{i}')
        tokenizer_path.mkdir(parents=True, exist_ok=True)
        assert tokenizer_path.exists()

        tokenizer_json = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/tokenizer_config.json'))
        assert tokenizer_json.exists(), f'{tokenizer_json} does not exist'
        copyfile(tokenizer_json, tokenizer_path/'tokenizer.json')

        vocab_txt = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/vocab.json'))
        assert vocab_txt.exists(), f'{vocab_txt} does not exist'
        copyfile(vocab_txt, tokenizer_path/'vocab.json')

        merges = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/merges.txt'))
        assert merges.exists()
        copyfile(merges, tokenizer_path/'merges.txt')
    else:
        print(f'{best_model_file} is missing')

In [None]:
import shutil

shutil.make_archive(MODELS_PATH/cfg.model_name/'best_models', 'zip', BEST_MODEL_FOLDER)

In [None]:
!ls {MODELS_PATH/cfg.model_name}

In [None]:
!mv {MODELS_PATH}/{cfg.model_name}.yaml {MODELS_PATH/cfg.model_name}

In [None]:
transformer_model.transformer_model.save_pretrained(save_directory=f'{MODELS_PATH/cfg.model_name}/lm')

In [None]:
!du -h {MODELS_PATH/cfg.model_name}/*

In [None]:
shutil.make_archive(MODELS_PATH/cfg.model_name/'lm', 'zip', f'{MODELS_PATH/cfg.model_name}/lm')

In [None]:
!kaggle datasets init -p {MODELS_PATH/cfg.model_name}

In [None]:
dataset_json_path = Path(MODELS_PATH/cfg.model_name/'dataset-metadata.json')
assert dataset_json_path.exists()

In [None]:
!cat {str(dataset_json_path)}

In [None]:
with open(dataset_json_path, 'r') as f:
    dataset_json = f.read()
    dataset_json = dataset_json.replace('INSERT_TITLE_HERE', f'commonlit-{cfg.model_name}-light').replace('INSERT_SLUG_HERE', f'commonlit-{cfg.model_name}-light')
    print(dataset_json)
with(open(dataset_json_path, 'w')) as f:
    f.write(dataset_json)

In [None]:
!rm -rf {MODELS_PATH/cfg.model_name}/best
!rm -rf {MODELS_PATH/cfg.model_name}/lm

In [None]:
!kaggle datasets create -p {MODELS_PATH/cfg.model_name}

In [None]:
!kaggle datasets version -p {MODELS_PATH/cfg.model_name} -m "Version with merges.txt" -d

In [None]:
state_dict = torch.load(str(MODELS_PATH/f'distilroberta-0/checkpoint-105/pytorch_model.bin'))

In [None]:
loaded_model = CommonLitModel()

In [None]:
loaded_model.load_state_dict(state_dict)