In [None]:
# !pip install optuna

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import gc, warnings, random, time, os

from pathlib import Path

from tqdm.notebook import tqdm

warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from transformers import AutoModel, AutoTokenizer, AutoConfig
from transformers import get_cosine_schedule_with_warmup

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import seaborn as sns

import gc
gc.enable()

import optuna

### Folders and Dataframes

In [2]:
DATA_PATH = Path('/home/commonlit/data/')
assert DATA_PATH.exists()
MODELS_PATH = Path('/home/commonlit/models/')
if not MODELS_PATH.exists():
    os.mkdir(MODELS_PATH)
assert MODELS_PATH.exists()

In [3]:
train_df = pd.read_csv(DATA_PATH/'train-orig.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')
sample_df = pd.read_csv(DATA_PATH/'sample_submission.csv')

In [4]:
def remove_unnecessary(df):
    df.drop(df[df['target'] == 0].index, inplace=True)
    df.reset_index(drop=True, inplace=True)
    
remove_unnecessary(train_df)

In [5]:
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845
...,...,...,...,...,...,...
2828,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2829,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2830,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2831,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


### Config and Seeding

In [10]:
class Config(): 
    NUM_FOLDS = 6
    NUM_EPOCHS = 3
    BATCH_SIZE = 16
    MAX_LEN = 248
    EVAL_SCHEDULE = [(0.50, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1., 1)]
    MODEL_PATH = 'microsoft/prophetnet-large-uncased'
    TOKENIZER_PATH = 'microsoft/prophetnet-large-uncased'
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
#     DEVICE = "cpu"
    SEED = 1000
    NUM_WORKERS = 2
    MODEL_FOLDER = MODELS_PATH
    model_name = 'prophetnet-large-uncased'
    svm_kernels = ['rbf']
    svm_c = 5

cfg = Config()

In [11]:
if not cfg.MODEL_FOLDER.exists():
    os.mkdir(cfg.MODEL_FOLDER)

In [12]:
def set_random_seed(random_seed):
    random.seed(random_seed)
    np.random.seed(random_seed)
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    torch.backends.cudnn.deterministic = True

### Dataset

In [13]:
def add_bins(train_df, num_bins):
    train_df.loc[:, 'bins'] = pd.cut(train_df['target'], bins=num_bins, labels=False)
    return num_bins

In [14]:
add_bins(train_df, cfg.NUM_FOLDS)

6

In [15]:
train_df.groupby(['bins'])['target'].agg(['count', 'mean'])

Unnamed: 0_level_0,count,mean
bins,Unnamed: 1_level_1,Unnamed: 2_level_1
0,122,-3.125765
1,441,-2.270279
2,784,-1.41215
3,886,-0.548095
4,494,0.289716
5,106,1.070237


In [16]:
tokenizer = AutoTokenizer.from_pretrained(cfg.TOKENIZER_PATH)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1397.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231506.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=90.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=141.0, style=ProgressStyle(description_…




In [17]:
with open('../data/tokenizer.vocab.txt', 'w') as f:
    for k, v in tokenizer.vocab.items():
        f.write(f'{k}: {v}\n')

In [14]:
pad_token = '______'

In [18]:
class CommonLitDataset(Dataset):
    def __init__(self, df, tokenizer, inference_only=False):
        super().__init__()
        self.df, self.inference_only = df, inference_only
        self.text = df['excerpt'].tolist()
        self.bins = df['bins']
        if not inference_only:
            self.target = torch.tensor(df['target'].to_numpy(), dtype = torch.float32)
        
#         tokenizer.add_special_tokens({'pad_token': pad_token})
#         assert tokenizer.pad_token == pad_token
        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding = 'max_length',
            max_length = cfg.MAX_LEN,
            truncation = True,
            return_attention_mask=True
        )
        self.tokenizer = tokenizer
        
    def __getitem__(self, index):        
        input_ids = torch.tensor(self.encoded['input_ids'][index])
        attention_mask = torch.tensor(self.encoded['attention_mask'][index])
        
        if self.inference_only:
            return {'input_ids': input_ids, 'attention_mask': attention_mask}
        else:
            target = self.target[index]
            return {'input_ids': input_ids, 'attention_mask': attention_mask, 'target': target}
    
    def __len__(self):
        return len(self.df)

In [19]:
sample_ds = CommonLitDataset(train_df, tokenizer)

### Model

In [20]:
class AttentionHead(nn.Module):
    
    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        
        self.hidden_layer = nn.Linear(in_features, hidden_dim)
        self.final_layer = nn.Linear(hidden_dim, num_targets)
        self.out_features = hidden_dim
        
    def forward(self, features):
        att = torch.tanh(self.hidden_layer(features))
        score = self.final_layer(att)
        attention_weights = torch.softmax(score, dim=1)
        return attention_weights

In [21]:
config = AutoConfig.from_pretrained(cfg.MODEL_PATH)

In [22]:
config.vocab_size, tokenizer.vocab_size

(30522, 30522)

In [46]:
from transformers import ProphetNetEncoder

class CommonLitModel(nn.Module):
    def __init__(self):
        super(CommonLitModel, self).__init__()
        config = AutoConfig.from_pretrained(cfg.MODEL_PATH)
        config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7
        })
        self.transformer_model = ProphetNetEncoder.from_pretrained(cfg.MODEL_PATH, config=config)
        self.attention = AttentionHead(config.hidden_size, 512, 1)
        self.regressor = nn.Linear(config.hidden_size, 1)
    
    def forward(self, input_ids, attention_mask):
        hidden_states = self.transformer_model(input_ids=input_ids, attention_mask=attention_mask)['hidden_states']
        last_layer_hidden_states = hidden_states[-1]
        weights = self.attention(last_layer_hidden_states)
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1) 
        return self.regressor(context_vector), context_vector

In [47]:
sample_model = CommonLitModel()

Some weights of the model checkpoint at microsoft/prophetnet-large-uncased were not used when initializing ProphetNetEncoder: ['prophetnet.decoder.layers.6.self_attn.query_proj.weight', 'prophetnet.encoder.layers.0.feed_forward.output.bias', 'prophetnet.decoder.layers.3.cross_attn.query_proj.bias', 'prophetnet.decoder.layers.4.self_attn.relative_pos_embeddings.weight', 'prophetnet.encoder.layers.11.feed_forward.intermediate.weight', 'prophetnet.decoder.layers.6.self_attn.relative_pos_embeddings.bias', 'prophetnet.decoder.layers.7.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.8.self_attn.query_proj.weight', 'prophetnet.decoder.layers.3.cross_attn.key_proj.bias', 'prophetnet.encoder.layers.5.self_attn.value_proj.weight', 'prophetnet.decoder.layers.1.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.6.self_attn.key_proj.weight', 'prophetnet.decoder.layers.2.self_attn.relative_pos_embeddings.weight', 'prophetnet.decoder.layers.8.self_attn.value_proj.bias', 'prophetn

In [48]:
import re

for i, (name, param) in enumerate(sample_model.named_parameters()):
    if(name.find('layer') > -1):
        layer_name = re.sub(r'.+(layer\.\d+).+', r'\1', name)

In [49]:
for i, (name, param) in enumerate(sample_model.named_parameters()):
    print(i, name, param.size())

0 transformer_model.word_embeddings.weight torch.Size([30522, 1024])
1 transformer_model.position_embeddings.weight torch.Size([512, 1024])
2 transformer_model.embeddings_layer_norm.weight torch.Size([1024])
3 transformer_model.embeddings_layer_norm.bias torch.Size([1024])
4 transformer_model.layers.0.self_attn.key_proj.weight torch.Size([1024, 1024])
5 transformer_model.layers.0.self_attn.key_proj.bias torch.Size([1024])
6 transformer_model.layers.0.self_attn.value_proj.weight torch.Size([1024, 1024])
7 transformer_model.layers.0.self_attn.value_proj.bias torch.Size([1024])
8 transformer_model.layers.0.self_attn.query_proj.weight torch.Size([1024, 1024])
9 transformer_model.layers.0.self_attn.query_proj.bias torch.Size([1024])
10 transformer_model.layers.0.self_attn.out_proj.weight torch.Size([1024, 1024])
11 transformer_model.layers.0.self_attn.out_proj.bias torch.Size([1024])
12 transformer_model.layers.0.self_attn_layer_norm.weight torch.Size([1024])
13 transformer_model.layers.0.s

In [50]:
# sample_input_ids = torch.randint(0, 1000, [2, 248])
# sample_attention_mask = torch.randint(0, 1000, [2, 248])

In [51]:
sample_records = [sample_ds[i] for i in range(2)]

In [52]:
sample_records[0].keys()

dict_keys(['input_ids', 'attention_mask', 'target'])

In [53]:
sample_input_ids = torch.stack([r['input_ids'] for r in sample_records])
sample_attention_mask = torch.stack([r['attention_mask'] for r in sample_records])

In [54]:
sample_input_ids.shape, sample_attention_mask.shape

(torch.Size([2, 248]), torch.Size([2, 248]))

In [55]:
sample_input_ids

tensor([[ 2043,  1996,  2402,  2111,  2513,  2000,  1996, 14307,  1010,  2009,
          3591,  1037, 27873,  2904,  3311,  1012,  2612,  1997,  2019,  4592,
          3496,  1010,  2009,  2001,  1037,  3467,  5957,  1012,  1996,  2723,
          2001,  3139,  2007,  4586,  1011,  2317, 10683,  1010,  2025,  4201,
          2006, 15299,  1010,  2021, 19379, 21132,  2058, 18548,  1998,  2940,
         25384,  1010,  2066,  1037,  2613,  4586,  2492,  1012,  1996,  3365,
          9486,  1998, 16899,  2015,  2008,  2018,  7429,  1996,  2282,  1010,
          2020,  9898,  2098,  2007, 13724,  1998, 25259,  2007, 25252,  1997,
          6557,  1010,  2066,  4586,  1012,  2036,  6323,  6497,  2018,  2042,
          8217, 11867,  6657, 19859,  2006,  2068,  1010,  1998, 20332,  6121,
         24582, 20921,  5112,  2013,  1996,  5628,  1012,  2012,  2169,  2203,
          1997,  1996,  2282,  1010,  2006,  1996,  2813,  1010,  5112,  1037,
          3376,  4562,  1011,  3096, 20452,  1012,  

In [56]:
internal_out = sample_model.transformer_model(sample_input_ids, attention_mask=sample_attention_mask)

In [57]:
internal_out.keys()

odict_keys(['last_hidden_state', 'hidden_states'])

In [58]:
len(internal_out.hidden_states), internal_out.hidden_states[-1].shape

(13, torch.Size([2, 248, 1024]))

In [59]:
sample_res = sample_model(sample_input_ids, sample_attention_mask)

In [60]:
sample_res[0].shape, sample_res[1].shape

(torch.Size([2, 1]), torch.Size([2, 1024]))

In [61]:
torch.sum(torch.randn([8, 496, 768]), axis=1)

tensor([[ 32.8364,   0.3652,  13.6404,  ...,  19.4211,   6.2608,  19.6858],
        [-25.6278,  16.3897,  -6.3549,  ..., -12.3641, -18.1215,   9.7873],
        [ -8.5039,  -0.7417,   9.0848,  ..., -55.3892, -19.1044, -14.6381],
        ...,
        [ 25.1027, -13.0931,  19.1718,  ..., -38.2758,  22.2616,   5.6755],
        [  3.2056,  22.1658,  29.6394,  ..., -16.8787,  -4.0298,   3.4869],
        [ -1.5585,  11.8009,  -5.3453,  ...,   5.1722, -23.5612, -15.3783]])

### Evaluation and Prediction

In [62]:
def eval_mse(model, data_loader):
    model.eval()
    mse_sum = 0
    mse_loss = nn.MSELoss(reduction='sum')
    
    with torch.no_grad():
        for batch_num, record in enumerate(data_loader):
            input_ids, attention_mask, target = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE), record['target'].to(cfg.DEVICE)
            pred, _ = model(input_ids, attention_mask)
            mse_sum += mse_loss(pred.flatten().cpu(), target.cpu())
            
    return mse_sum / len(data_loader.dataset)

In [63]:
def predict(model, data_loader):
    model.eval()
    result = []
    
    with torch.no_grad():
        for batch_num, record in tqdm(enumerate(data_loader), total=len(data_loader)):
            input_ids, attention_mask = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE)
            pred, _ = model(input_ids, attention_mask)
            result.extend(pred.flatten().to("cpu").tolist())
            
    return np.array(result)

In [64]:
sample_dl = DataLoader(sample_ds, shuffle=False, batch_size=16, num_workers=1)

### Optimizer and Sampler

In [65]:
5e-5 / 2.5, 5e-5 / 0.5, 5e-5

(2e-05, 0.0001, 5e-05)

In [66]:
def create_optimizer(model, base_lr=5e-5, last_lr=None):
    named_parameters = list(model.named_parameters())
    attention_param_start = 196
    regressor_param_start = 200
    roberta_parameters = named_parameters[:attention_param_start]
    attention_parameters = named_parameters[attention_param_start:regressor_param_start]
    regressor_parameters = named_parameters[regressor_param_start:]
    
    attention_group = [params for (name, params) in attention_parameters]
    regressor_group = [params for (name, params) in regressor_parameters]
    
    parameters = []
    if last_lr is not None:
        parameters.append({"params": attention_group, "lr": last_lr})
        parameters.append({"params": regressor_group, "lr": last_lr})
    else:
        parameters.append({"params": attention_group})
        parameters.append({"params": regressor_group})
        
    # Change on different models
    layer_low_threshold = 132
    layer_middle_threshold = 70
    
    for layer_num, (name, params) in enumerate(roberta_parameters):
        weight_decay = 0.0 if 'bias' in name else 0.01
        
        lr = base_lr / 2.5 # 2e-05
        if layer_num >= layer_middle_threshold:
            lr = base_lr / 0.5 # 1e-4
        elif layer_num >= layer_low_threshold:        
            lr = base_lr    
            
        parameters.append({"params": params,
                           "weight_decay": weight_decay,
                           "lr": lr})
        
    return AdamW(parameters)

In [67]:
sample_optimizer = create_optimizer(sample_model)

In [68]:
from torch.utils.data import Sampler,SequentialSampler,RandomSampler,SubsetRandomSampler
from collections import Counter

class WeightedSampler(Sampler):
    
    def __init__(self, dataset):
        
        self.indices = list(range(len(dataset)))
        self.num_samples = len(dataset)
        self.label_to_count = dict(Counter(dataset.bins))
        weights = [1/self.label_to_count[i] for i in dataset.bins]
        
        self.weights = torch.tensor(weights,dtype=torch.double)
        
    def __iter__(self):
        count = 0
        index = [self.indices[i] for i in torch.multinomial(self.weights, self.num_samples, replacement=True)]
        while count < self.num_samples:
            yield index[count]
            count += 1
    
    def __len__(self):
        return self.num_samples

### Training

In [69]:
def choose_eval_period(val_rmse):
    for rmse, period in cfg.EVAL_SCHEDULE:
        if val_rmse >= rmse:
            return period

In [70]:
def serialize_best(best_val_rmse, best_epoch, val_rmse, epoch, model, model_path):
    if not best_val_rmse or val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_epoch = epoch
        if not model_path.parent.exists():
            os.makedirs(model_path.parent)
        
#         torch.save(model.state_dict(), model_path)
        print(f"New best_val_rmse: {best_val_rmse:0.4}")
    else:       
        print(f"Still best_val_rmse: {best_val_rmse:0.4}",
              f"(from epoch {best_epoch})")
    return best_epoch, best_val_rmse

In [71]:
class Trainer():
    def __init__(self, scaler, model, model_path, train_loader, val_loader, optimizer, scheduler=None, num_epochs=cfg.NUM_EPOCHS):
        self.scaler, self.model, self.model_path, self.train_loader, self.val_loader, self.optimizer, self.scheduler, self.num_epochs = (
            scaler, model, model_path, train_loader, val_loader, optimizer, scheduler, num_epochs
        )
            
    def train(self):
        self.model.train()
        
        mse_loss = nn.MSELoss(reduction='mean')
        
        best_val_rmse = None
        best_epoch = 0
        step = 0
        last_eval_step = 0
        eval_period = cfg.EVAL_SCHEDULE[0][1]    

        start = time.time()
        val_rmse_list = []
        
        tbar = tqdm(range(self.num_epochs), total=self.num_epochs)
        for epoch in tbar:
            tbar.set_description(f'Epoch: {epoch}')
            val_rmse = None
            for batch_num, record in enumerate(self.train_loader):
                input_ids, attention_mask, target = record['input_ids'].to(cfg.DEVICE), record['attention_mask'].to(cfg.DEVICE), record['target'].to(cfg.DEVICE)
                
                self.optimizer.zero_grad()
                
                # Casts operations to mixed precision
                with torch.cuda.amp.autocast():
                    pred, _ = self.model(input_ids, attention_mask)
                    mse = mse_loss(pred.flatten(), target)
                    
                self.scaler.scale(mse).backward()
                self.scaler.step(self.optimizer)
                self.scaler.update()
                
#                 mse.backward()
#                 self.optimizer.step()
                
                if self.scheduler:
                    self.scheduler.step()
                    
                if step >= last_eval_step + eval_period:
                    elapsed_seconds = time.time() - start
                    num_steps = step - last_eval_step
                    print(f"\n{num_steps} steps took {elapsed_seconds:0.3} seconds")
                    last_eval_step = step
                    
                    val_rmse = np.sqrt(eval_mse(self.model, self.val_loader))
                    print(f"Epoch: {epoch} batch_num: {batch_num}", f"val_rmse: {val_rmse:0.4} ", end='')
                    
                    eval_period = choose_eval_period(val_rmse)
                    best_epoch, best_val_rmse = serialize_best(best_val_rmse, best_epoch, val_rmse, epoch, self.model, self.model_path)
                    val_rmse_list.append(val_rmse)
                    start = time.time()
                # Finish early on condition
                if epoch > 0 and best_val_rmse > 0.6 or (len(val_rmse_list) > 5 and np.array(val_rmse_list).mean() > 1.0):
                    return best_val_rmse
                
                step += 1
        return best_val_rmse

In [72]:
kfold = KFold(n_splits=cfg.NUM_FOLDS, random_state=cfg.SEED, shuffle=True)
splits = list(kfold.split(train_df))

### Optuna

In [73]:
del sample_model
gc.collect()
torch.cuda.empty_cache()

In [74]:
# Best results
# Fold 0: 
# Fold 1: 
# Fold 2: 
# Fold 3: 
# Fold 4: 
# Fold 5: 

In [75]:

fold = 0

def objective(trial):
    base_lr = trial.suggest_float("base_lr", 3e-5, 5e-4, log=True)
    last_lr = trial.suggest_float("last_lr", 8e-5, 5e-3, log=True)
    
    print(f'##### Using fold {fold}')
    print(f'##### Using base_lr {base_lr} last_lr {last_lr}')
    
    model_path = cfg.MODEL_FOLDER/f"{cfg.model_name.replace('/', '_')}_{fold + 1}/model_{fold + 1}.pth"
    
    set_random_seed(cfg.SEED + fold)
    
    tokenizer = AutoTokenizer.from_pretrained(cfg.TOKENIZER_PATH)
    
    train_indices, val_indices = splits[fold]
    train_dataset = CommonLitDataset(train_df.loc[train_indices], tokenizer)    
    val_dataset = CommonLitDataset(train_df.loc[val_indices], tokenizer)
    
    train_loader = DataLoader(train_dataset, batch_size=cfg.BATCH_SIZE,
                              drop_last=False, shuffle=True, num_workers=cfg.NUM_WORKERS)    
    val_loader = DataLoader(val_dataset, batch_size=cfg.BATCH_SIZE,
                            drop_last=False, shuffle=False, num_workers=cfg.NUM_WORKERS)
    
    set_random_seed(cfg.SEED + fold)
    
    model = CommonLitModel().to(cfg.DEVICE)
    
    optimizer = create_optimizer(model, base_lr=base_lr, last_lr=last_lr)
    
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_training_steps=cfg.NUM_EPOCHS * len(train_loader), 
                                                num_warmup_steps=50)
    scaler = torch.cuda.amp.GradScaler()
    
    trainer = Trainer(scaler, model, model_path, train_loader, val_loader, optimizer, scheduler = scheduler)
    rmse_val = trainer.train()
    
    del trainer
    del model
    del tokenizer
    del scaler
    del optimizer
    del train_loader
    del val_loader
    torch.cuda.empty_cache()
    gc.collect()
    
    return rmse_val

In [76]:
for i in range(0, 3):
    fold = i
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20)
    print(" Best value: ", study.best_trial.value)
    print(" Best params: ")
    for key, value in study.best_trial.params.items():
        print(f"    {key}: {value}")

[32m[I 2021-07-21 16:43:54,788][0m A new study created in memory with name: no-name-a06fe2e6-81d5-4aa7-a535-a0df05f1d340[0m


##### Using fold 0
##### Using base_lr 0.0002878566540879815 last_lr 9.323704556966397e-05


Some weights of the model checkpoint at microsoft/prophetnet-large-uncased were not used when initializing ProphetNetEncoder: ['prophetnet.decoder.layers.6.self_attn.query_proj.weight', 'prophetnet.encoder.layers.0.feed_forward.output.bias', 'prophetnet.decoder.layers.3.cross_attn.query_proj.bias', 'prophetnet.decoder.layers.4.self_attn.relative_pos_embeddings.weight', 'prophetnet.encoder.layers.11.feed_forward.intermediate.weight', 'prophetnet.decoder.layers.6.self_attn.relative_pos_embeddings.bias', 'prophetnet.decoder.layers.7.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.8.self_attn.query_proj.weight', 'prophetnet.decoder.layers.3.cross_attn.key_proj.bias', 'prophetnet.encoder.layers.5.self_attn.value_proj.weight', 'prophetnet.decoder.layers.1.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.6.self_attn.key_proj.weight', 'prophetnet.decoder.layers.2.self_attn.relative_pos_embeddings.weight', 'prophetnet.decoder.layers.8.self_attn.value_proj.bias', 'prophetn

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 4.45 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.034 New best_val_rmse: 1.034

16 steps took 2.12 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.053 Still best_val_rmse: 1.034 (from epoch 0)

16 steps took 2.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.032 New best_val_rmse: 1.032

16 steps took 2.08 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.493 Still best_val_rmse: 1.032 (from epoch 0)

16 steps took 2.07 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.099 Still best_val_rmse: 1.032 (from epoch 0)

16 steps took 2.08 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.04 Still best_val_rmse: 1.032 (from epoch 0)



[32m[I 2021-07-21 16:44:43,610][0m Trial 0 finished with value: 1.0319627523422241 and parameters: {'base_lr': 0.0002878566540879815, 'last_lr': 9.323704556966397e-05}. Best is trial 0 with value: 1.0319627523422241.[0m


##### Using fold 0
##### Using base_lr 0.00024146991403739562 last_lr 8.219802954207824e-05


Some weights of the model checkpoint at microsoft/prophetnet-large-uncased were not used when initializing ProphetNetEncoder: ['prophetnet.decoder.layers.6.self_attn.query_proj.weight', 'prophetnet.encoder.layers.0.feed_forward.output.bias', 'prophetnet.decoder.layers.3.cross_attn.query_proj.bias', 'prophetnet.decoder.layers.4.self_attn.relative_pos_embeddings.weight', 'prophetnet.encoder.layers.11.feed_forward.intermediate.weight', 'prophetnet.decoder.layers.6.self_attn.relative_pos_embeddings.bias', 'prophetnet.decoder.layers.7.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.8.self_attn.query_proj.weight', 'prophetnet.decoder.layers.3.cross_attn.key_proj.bias', 'prophetnet.encoder.layers.5.self_attn.value_proj.weight', 'prophetnet.decoder.layers.1.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.6.self_attn.key_proj.weight', 'prophetnet.decoder.layers.2.self_attn.relative_pos_embeddings.weight', 'prophetnet.decoder.layers.8.self_attn.value_proj.bias', 'prophetn

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 3.59 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.157 New best_val_rmse: 1.157

16 steps took 2.13 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.062 New best_val_rmse: 1.062

16 steps took 2.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.16 Still best_val_rmse: 1.062 (from epoch 0)

16 steps took 2.09 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.047 New best_val_rmse: 1.047

16 steps took 2.13 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.06 Still best_val_rmse: 1.047 (from epoch 0)

16 steps took 2.09 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.046 New best_val_rmse: 1.046



[32m[I 2021-07-21 16:45:29,827][0m Trial 1 finished with value: 1.0462322235107422 and parameters: {'base_lr': 0.00024146991403739562, 'last_lr': 8.219802954207824e-05}. Best is trial 0 with value: 1.0319627523422241.[0m


##### Using fold 0
##### Using base_lr 5.780015916824462e-05 last_lr 0.000260312114748509


Some weights of the model checkpoint at microsoft/prophetnet-large-uncased were not used when initializing ProphetNetEncoder: ['prophetnet.decoder.layers.6.self_attn.query_proj.weight', 'prophetnet.encoder.layers.0.feed_forward.output.bias', 'prophetnet.decoder.layers.3.cross_attn.query_proj.bias', 'prophetnet.decoder.layers.4.self_attn.relative_pos_embeddings.weight', 'prophetnet.encoder.layers.11.feed_forward.intermediate.weight', 'prophetnet.decoder.layers.6.self_attn.relative_pos_embeddings.bias', 'prophetnet.decoder.layers.7.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.8.self_attn.query_proj.weight', 'prophetnet.decoder.layers.3.cross_attn.key_proj.bias', 'prophetnet.encoder.layers.5.self_attn.value_proj.weight', 'prophetnet.decoder.layers.1.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.6.self_attn.key_proj.weight', 'prophetnet.decoder.layers.2.self_attn.relative_pos_embeddings.weight', 'prophetnet.decoder.layers.8.self_attn.value_proj.bias', 'prophetn

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 3.61 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.414 New best_val_rmse: 1.414

16 steps took 2.12 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9853 New best_val_rmse: 0.9853

16 steps took 2.11 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8639 New best_val_rmse: 0.8639

16 steps took 2.11 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.067 Still best_val_rmse: 0.8639 (from epoch 0)

16 steps took 2.12 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7851 New best_val_rmse: 0.7851

16 steps took 2.11 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.74 New best_val_rmse: 0.74

16 steps took 2.13 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7639 Still best_val_rmse: 0.74 (from epoch 0)

16 steps took 2.13 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7802 Still best_val_rmse: 0.74 (from epoch 0)

16 steps took 2.14 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.7674 Still best_val_rmse: 0.74 (from epoch 0)


[32m[I 2021-07-21 16:46:30,206][0m Trial 2 finished with value: 0.7399754524230957 and parameters: {'base_lr': 5.780015916824462e-05, 'last_lr': 0.000260312114748509}. Best is trial 2 with value: 0.7399754524230957.[0m



##### Using fold 0
##### Using base_lr 3.876634743181945e-05 last_lr 0.0035502200498399835


Some weights of the model checkpoint at microsoft/prophetnet-large-uncased were not used when initializing ProphetNetEncoder: ['prophetnet.decoder.layers.6.self_attn.query_proj.weight', 'prophetnet.encoder.layers.0.feed_forward.output.bias', 'prophetnet.decoder.layers.3.cross_attn.query_proj.bias', 'prophetnet.decoder.layers.4.self_attn.relative_pos_embeddings.weight', 'prophetnet.encoder.layers.11.feed_forward.intermediate.weight', 'prophetnet.decoder.layers.6.self_attn.relative_pos_embeddings.bias', 'prophetnet.decoder.layers.7.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.8.self_attn.query_proj.weight', 'prophetnet.decoder.layers.3.cross_attn.key_proj.bias', 'prophetnet.encoder.layers.5.self_attn.value_proj.weight', 'prophetnet.decoder.layers.1.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.6.self_attn.key_proj.weight', 'prophetnet.decoder.layers.2.self_attn.relative_pos_embeddings.weight', 'prophetnet.decoder.layers.8.self_attn.value_proj.bias', 'prophetn

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 3.61 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.197 New best_val_rmse: 1.197

16 steps took 2.14 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8405 New best_val_rmse: 0.8405

16 steps took 2.12 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8052 New best_val_rmse: 0.8052

16 steps took 2.12 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8578 Still best_val_rmse: 0.8052 (from epoch 0)

16 steps took 2.12 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.9105 Still best_val_rmse: 0.8052 (from epoch 0)

16 steps took 2.11 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7607 New best_val_rmse: 0.7607

16 steps took 2.12 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7019 New best_val_rmse: 0.7019

16 steps took 2.13 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.8408 Still best_val_rmse: 0.7019 (from epoch 0)

16 steps took 2.13 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.7566 Still best_val_rmse: 0.7019 (from epoch 0)


[32m[I 2021-07-21 16:47:30,329][0m Trial 3 finished with value: 0.701943576335907 and parameters: {'base_lr': 3.876634743181945e-05, 'last_lr': 0.0035502200498399835}. Best is trial 3 with value: 0.701943576335907.[0m



##### Using fold 0
##### Using base_lr 0.0001114647388184985 last_lr 0.004873653853942579


Some weights of the model checkpoint at microsoft/prophetnet-large-uncased were not used when initializing ProphetNetEncoder: ['prophetnet.decoder.layers.6.self_attn.query_proj.weight', 'prophetnet.encoder.layers.0.feed_forward.output.bias', 'prophetnet.decoder.layers.3.cross_attn.query_proj.bias', 'prophetnet.decoder.layers.4.self_attn.relative_pos_embeddings.weight', 'prophetnet.encoder.layers.11.feed_forward.intermediate.weight', 'prophetnet.decoder.layers.6.self_attn.relative_pos_embeddings.bias', 'prophetnet.decoder.layers.7.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.8.self_attn.query_proj.weight', 'prophetnet.decoder.layers.3.cross_attn.key_proj.bias', 'prophetnet.encoder.layers.5.self_attn.value_proj.weight', 'prophetnet.decoder.layers.1.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.6.self_attn.key_proj.weight', 'prophetnet.decoder.layers.2.self_attn.relative_pos_embeddings.weight', 'prophetnet.decoder.layers.8.self_attn.value_proj.bias', 'prophetn

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 3.57 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.317 New best_val_rmse: 1.317

16 steps took 2.14 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.205 New best_val_rmse: 1.205

16 steps took 2.11 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.046 New best_val_rmse: 1.046

16 steps took 2.14 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.9878 New best_val_rmse: 0.9878

16 steps took 2.13 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.8583 New best_val_rmse: 0.8583

16 steps took 2.12 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7639 New best_val_rmse: 0.7639



[32m[I 2021-07-21 16:48:17,000][0m Trial 4 finished with value: 0.7638656497001648 and parameters: {'base_lr': 0.0001114647388184985, 'last_lr': 0.004873653853942579}. Best is trial 3 with value: 0.701943576335907.[0m


##### Using fold 0
##### Using base_lr 0.0002297609878584336 last_lr 0.0009720483444361926


Some weights of the model checkpoint at microsoft/prophetnet-large-uncased were not used when initializing ProphetNetEncoder: ['prophetnet.decoder.layers.6.self_attn.query_proj.weight', 'prophetnet.encoder.layers.0.feed_forward.output.bias', 'prophetnet.decoder.layers.3.cross_attn.query_proj.bias', 'prophetnet.decoder.layers.4.self_attn.relative_pos_embeddings.weight', 'prophetnet.encoder.layers.11.feed_forward.intermediate.weight', 'prophetnet.decoder.layers.6.self_attn.relative_pos_embeddings.bias', 'prophetnet.decoder.layers.7.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.8.self_attn.query_proj.weight', 'prophetnet.decoder.layers.3.cross_attn.key_proj.bias', 'prophetnet.encoder.layers.5.self_attn.value_proj.weight', 'prophetnet.decoder.layers.1.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.6.self_attn.key_proj.weight', 'prophetnet.decoder.layers.2.self_attn.relative_pos_embeddings.weight', 'prophetnet.decoder.layers.8.self_attn.value_proj.bias', 'prophetn

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 3.74 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.195 New best_val_rmse: 1.195

16 steps took 2.14 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.064 New best_val_rmse: 1.064

16 steps took 2.11 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.053 New best_val_rmse: 1.053

16 steps took 2.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.103 Still best_val_rmse: 1.053 (from epoch 0)

16 steps took 2.11 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.166 Still best_val_rmse: 1.053 (from epoch 0)

16 steps took 2.15 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.048 New best_val_rmse: 1.048



[32m[I 2021-07-21 16:49:03,453][0m Trial 5 finished with value: 1.048426866531372 and parameters: {'base_lr': 0.0002297609878584336, 'last_lr': 0.0009720483444361926}. Best is trial 3 with value: 0.701943576335907.[0m


##### Using fold 0
##### Using base_lr 5.631825215796122e-05 last_lr 0.00014888037471785754


Some weights of the model checkpoint at microsoft/prophetnet-large-uncased were not used when initializing ProphetNetEncoder: ['prophetnet.decoder.layers.6.self_attn.query_proj.weight', 'prophetnet.encoder.layers.0.feed_forward.output.bias', 'prophetnet.decoder.layers.3.cross_attn.query_proj.bias', 'prophetnet.decoder.layers.4.self_attn.relative_pos_embeddings.weight', 'prophetnet.encoder.layers.11.feed_forward.intermediate.weight', 'prophetnet.decoder.layers.6.self_attn.relative_pos_embeddings.bias', 'prophetnet.decoder.layers.7.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.8.self_attn.query_proj.weight', 'prophetnet.decoder.layers.3.cross_attn.key_proj.bias', 'prophetnet.encoder.layers.5.self_attn.value_proj.weight', 'prophetnet.decoder.layers.1.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.6.self_attn.key_proj.weight', 'prophetnet.decoder.layers.2.self_attn.relative_pos_embeddings.weight', 'prophetnet.decoder.layers.8.self_attn.value_proj.bias', 'prophetn

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 3.68 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.385 New best_val_rmse: 1.385

16 steps took 2.15 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9786 New best_val_rmse: 0.9786

16 steps took 2.12 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7991 New best_val_rmse: 0.7991

16 steps took 2.12 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.9005 Still best_val_rmse: 0.7991 (from epoch 0)

16 steps took 2.13 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.792 New best_val_rmse: 0.792

16 steps took 2.12 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.8482 Still best_val_rmse: 0.792 (from epoch 0)

16 steps took 2.13 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6957 New best_val_rmse: 0.6957

16 steps took 2.13 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7287 Still best_val_rmse: 0.6957 (from epoch 0)

16 steps took 2.14 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.7017 Still best_val_rmse: 0.6957 (from epoch 0)


[32m[I 2021-07-21 16:50:04,110][0m Trial 6 finished with value: 0.6957458257675171 and parameters: {'base_lr': 5.631825215796122e-05, 'last_lr': 0.00014888037471785754}. Best is trial 6 with value: 0.6957458257675171.[0m



##### Using fold 0
##### Using base_lr 0.0001429269050688748 last_lr 0.0004837707461063687


Some weights of the model checkpoint at microsoft/prophetnet-large-uncased were not used when initializing ProphetNetEncoder: ['prophetnet.decoder.layers.6.self_attn.query_proj.weight', 'prophetnet.encoder.layers.0.feed_forward.output.bias', 'prophetnet.decoder.layers.3.cross_attn.query_proj.bias', 'prophetnet.decoder.layers.4.self_attn.relative_pos_embeddings.weight', 'prophetnet.encoder.layers.11.feed_forward.intermediate.weight', 'prophetnet.decoder.layers.6.self_attn.relative_pos_embeddings.bias', 'prophetnet.decoder.layers.7.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.8.self_attn.query_proj.weight', 'prophetnet.decoder.layers.3.cross_attn.key_proj.bias', 'prophetnet.encoder.layers.5.self_attn.value_proj.weight', 'prophetnet.decoder.layers.1.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.6.self_attn.key_proj.weight', 'prophetnet.decoder.layers.2.self_attn.relative_pos_embeddings.weight', 'prophetnet.decoder.layers.8.self_attn.value_proj.bias', 'prophetn

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 3.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.469 New best_val_rmse: 1.469

16 steps took 2.14 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.174 New best_val_rmse: 1.174

16 steps took 2.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.025 New best_val_rmse: 1.025

16 steps took 2.12 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.026 Still best_val_rmse: 1.025 (from epoch 0)

16 steps took 2.12 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.06 Still best_val_rmse: 1.025 (from epoch 0)

16 steps took 2.12 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.046 Still best_val_rmse: 1.025 (from epoch 0)



[32m[I 2021-07-21 16:50:50,781][0m Trial 7 finished with value: 1.0254660844802856 and parameters: {'base_lr': 0.0001429269050688748, 'last_lr': 0.0004837707461063687}. Best is trial 6 with value: 0.6957458257675171.[0m


##### Using fold 0
##### Using base_lr 0.00033128641808505526 last_lr 0.0010281217826737118


Some weights of the model checkpoint at microsoft/prophetnet-large-uncased were not used when initializing ProphetNetEncoder: ['prophetnet.decoder.layers.6.self_attn.query_proj.weight', 'prophetnet.encoder.layers.0.feed_forward.output.bias', 'prophetnet.decoder.layers.3.cross_attn.query_proj.bias', 'prophetnet.decoder.layers.4.self_attn.relative_pos_embeddings.weight', 'prophetnet.encoder.layers.11.feed_forward.intermediate.weight', 'prophetnet.decoder.layers.6.self_attn.relative_pos_embeddings.bias', 'prophetnet.decoder.layers.7.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.8.self_attn.query_proj.weight', 'prophetnet.decoder.layers.3.cross_attn.key_proj.bias', 'prophetnet.encoder.layers.5.self_attn.value_proj.weight', 'prophetnet.decoder.layers.1.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.6.self_attn.key_proj.weight', 'prophetnet.decoder.layers.2.self_attn.relative_pos_embeddings.weight', 'prophetnet.decoder.layers.8.self_attn.value_proj.bias', 'prophetn

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 3.61 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.054 New best_val_rmse: 1.054

16 steps took 2.13 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.04 New best_val_rmse: 1.04

16 steps took 2.11 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.033 New best_val_rmse: 1.033

16 steps took 2.12 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.106 Still best_val_rmse: 1.033 (from epoch 0)

16 steps took 2.13 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.196 Still best_val_rmse: 1.033 (from epoch 0)

16 steps took 2.16 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.056 Still best_val_rmse: 1.033 (from epoch 0)



[32m[I 2021-07-21 16:51:37,608][0m Trial 8 finished with value: 1.0332621335983276 and parameters: {'base_lr': 0.00033128641808505526, 'last_lr': 0.0010281217826737118}. Best is trial 6 with value: 0.6957458257675171.[0m


##### Using fold 0
##### Using base_lr 0.00010814746949210281 last_lr 0.0001937584354085873


Some weights of the model checkpoint at microsoft/prophetnet-large-uncased were not used when initializing ProphetNetEncoder: ['prophetnet.decoder.layers.6.self_attn.query_proj.weight', 'prophetnet.encoder.layers.0.feed_forward.output.bias', 'prophetnet.decoder.layers.3.cross_attn.query_proj.bias', 'prophetnet.decoder.layers.4.self_attn.relative_pos_embeddings.weight', 'prophetnet.encoder.layers.11.feed_forward.intermediate.weight', 'prophetnet.decoder.layers.6.self_attn.relative_pos_embeddings.bias', 'prophetnet.decoder.layers.7.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.8.self_attn.query_proj.weight', 'prophetnet.decoder.layers.3.cross_attn.key_proj.bias', 'prophetnet.encoder.layers.5.self_attn.value_proj.weight', 'prophetnet.decoder.layers.1.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.6.self_attn.key_proj.weight', 'prophetnet.decoder.layers.2.self_attn.relative_pos_embeddings.weight', 'prophetnet.decoder.layers.8.self_attn.value_proj.bias', 'prophetn

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 3.69 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.402 New best_val_rmse: 1.402

16 steps took 2.11 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.507 Still best_val_rmse: 1.402 (from epoch 0)

16 steps took 2.13 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8946 New best_val_rmse: 0.8946

16 steps took 2.13 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.9425 Still best_val_rmse: 0.8946 (from epoch 0)

16 steps took 2.14 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.236 Still best_val_rmse: 0.8946 (from epoch 0)

16 steps took 2.13 seconds


[32m[I 2021-07-21 16:52:24,762][0m Trial 9 finished with value: 0.8597679734230042 and parameters: {'base_lr': 0.00010814746949210281, 'last_lr': 0.0001937584354085873}. Best is trial 6 with value: 0.6957458257675171.[0m


Epoch: 0 batch_num: 96 val_rmse: 0.8598 New best_val_rmse: 0.8598

##### Using fold 0
##### Using base_lr 3.011135839769023e-05 last_lr 0.00015521593662522676


Some weights of the model checkpoint at microsoft/prophetnet-large-uncased were not used when initializing ProphetNetEncoder: ['prophetnet.decoder.layers.6.self_attn.query_proj.weight', 'prophetnet.encoder.layers.0.feed_forward.output.bias', 'prophetnet.decoder.layers.3.cross_attn.query_proj.bias', 'prophetnet.decoder.layers.4.self_attn.relative_pos_embeddings.weight', 'prophetnet.encoder.layers.11.feed_forward.intermediate.weight', 'prophetnet.decoder.layers.6.self_attn.relative_pos_embeddings.bias', 'prophetnet.decoder.layers.7.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.8.self_attn.query_proj.weight', 'prophetnet.decoder.layers.3.cross_attn.key_proj.bias', 'prophetnet.encoder.layers.5.self_attn.value_proj.weight', 'prophetnet.decoder.layers.1.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.6.self_attn.key_proj.weight', 'prophetnet.decoder.layers.2.self_attn.relative_pos_embeddings.weight', 'prophetnet.decoder.layers.8.self_attn.value_proj.bias', 'prophetn

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 3.72 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8753 New best_val_rmse: 0.8753

16 steps took 2.15 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.05 Still best_val_rmse: 0.8753 (from epoch 0)

16 steps took 2.12 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7627 New best_val_rmse: 0.7627

16 steps took 2.15 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8496 Still best_val_rmse: 0.7627 (from epoch 0)

16 steps took 2.13 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.036 Still best_val_rmse: 0.7627 (from epoch 0)

16 steps took 2.13 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.736 New best_val_rmse: 0.736

16 steps took 2.14 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.8172 Still best_val_rmse: 0.736 (from epoch 0)

16 steps took 2.13 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6891 New best_val_rmse: 0.6891

16 steps took 2.12 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.9938 Still best_val_rmse: 0.6891 (from epoch 0)



[32m[I 2021-07-21 16:53:25,390][0m Trial 10 finished with value: 0.6891382336616516 and parameters: {'base_lr': 3.011135839769023e-05, 'last_lr': 0.00015521593662522676}. Best is trial 10 with value: 0.6891382336616516.[0m


##### Using fold 0
##### Using base_lr 3.1317678752992314e-05 last_lr 0.000158820110171061


Some weights of the model checkpoint at microsoft/prophetnet-large-uncased were not used when initializing ProphetNetEncoder: ['prophetnet.decoder.layers.6.self_attn.query_proj.weight', 'prophetnet.encoder.layers.0.feed_forward.output.bias', 'prophetnet.decoder.layers.3.cross_attn.query_proj.bias', 'prophetnet.decoder.layers.4.self_attn.relative_pos_embeddings.weight', 'prophetnet.encoder.layers.11.feed_forward.intermediate.weight', 'prophetnet.decoder.layers.6.self_attn.relative_pos_embeddings.bias', 'prophetnet.decoder.layers.7.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.8.self_attn.query_proj.weight', 'prophetnet.decoder.layers.3.cross_attn.key_proj.bias', 'prophetnet.encoder.layers.5.self_attn.value_proj.weight', 'prophetnet.decoder.layers.1.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.6.self_attn.key_proj.weight', 'prophetnet.decoder.layers.2.self_attn.relative_pos_embeddings.weight', 'prophetnet.decoder.layers.8.self_attn.value_proj.bias', 'prophetn

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 3.63 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8805 New best_val_rmse: 0.8805

16 steps took 2.15 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.112 Still best_val_rmse: 0.8805 (from epoch 0)

16 steps took 2.13 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8331 New best_val_rmse: 0.8331

16 steps took 2.14 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7255 New best_val_rmse: 0.7255

16 steps took 2.13 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.8897 Still best_val_rmse: 0.7255 (from epoch 0)

16 steps took 2.14 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6975 New best_val_rmse: 0.6975

16 steps took 2.13 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7098 Still best_val_rmse: 0.6975 (from epoch 0)

16 steps took 2.14 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6667 New best_val_rmse: 0.6667

16 steps took 2.14 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6826 Still best_val_rmse: 0.6667 (from epoch 0)


[32m[I 2021-07-21 16:54:26,051][0m Trial 11 finished with value: 0.6667493581771851 and parameters: {'base_lr': 3.1317678752992314e-05, 'last_lr': 0.000158820110171061}. Best is trial 11 with value: 0.6667493581771851.[0m



##### Using fold 0
##### Using base_lr 3.0600169801509264e-05 last_lr 0.0003888383268561251


Some weights of the model checkpoint at microsoft/prophetnet-large-uncased were not used when initializing ProphetNetEncoder: ['prophetnet.decoder.layers.6.self_attn.query_proj.weight', 'prophetnet.encoder.layers.0.feed_forward.output.bias', 'prophetnet.decoder.layers.3.cross_attn.query_proj.bias', 'prophetnet.decoder.layers.4.self_attn.relative_pos_embeddings.weight', 'prophetnet.encoder.layers.11.feed_forward.intermediate.weight', 'prophetnet.decoder.layers.6.self_attn.relative_pos_embeddings.bias', 'prophetnet.decoder.layers.7.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.8.self_attn.query_proj.weight', 'prophetnet.decoder.layers.3.cross_attn.key_proj.bias', 'prophetnet.encoder.layers.5.self_attn.value_proj.weight', 'prophetnet.decoder.layers.1.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.6.self_attn.key_proj.weight', 'prophetnet.decoder.layers.2.self_attn.relative_pos_embeddings.weight', 'prophetnet.decoder.layers.8.self_attn.value_proj.bias', 'prophetn

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 3.68 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8787 New best_val_rmse: 0.8787

16 steps took 2.13 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.121 Still best_val_rmse: 0.8787 (from epoch 0)

16 steps took 2.12 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7982 New best_val_rmse: 0.7982

16 steps took 2.17 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7568 New best_val_rmse: 0.7568

16 steps took 2.13 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.063 Still best_val_rmse: 0.7568 (from epoch 0)

16 steps took 2.11 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.8223 Still best_val_rmse: 0.7568 (from epoch 0)

16 steps took 2.14 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7741 Still best_val_rmse: 0.7568 (from epoch 0)

16 steps took 2.13 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.8672 Still best_val_rmse: 0.7568 (from epoch 0)

16 steps took 2.14 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6917 New best_val_rmse: 0.6917


[32m[I 2021-07-21 16:55:26,911][0m Trial 12 finished with value: 0.6917469501495361 and parameters: {'base_lr': 3.0600169801509264e-05, 'last_lr': 0.0003888383268561251}. Best is trial 11 with value: 0.6667493581771851.[0m



##### Using fold 0
##### Using base_lr 3.0144900564358662e-05 last_lr 0.0001271786893237739


Some weights of the model checkpoint at microsoft/prophetnet-large-uncased were not used when initializing ProphetNetEncoder: ['prophetnet.decoder.layers.6.self_attn.query_proj.weight', 'prophetnet.encoder.layers.0.feed_forward.output.bias', 'prophetnet.decoder.layers.3.cross_attn.query_proj.bias', 'prophetnet.decoder.layers.4.self_attn.relative_pos_embeddings.weight', 'prophetnet.encoder.layers.11.feed_forward.intermediate.weight', 'prophetnet.decoder.layers.6.self_attn.relative_pos_embeddings.bias', 'prophetnet.decoder.layers.7.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.8.self_attn.query_proj.weight', 'prophetnet.decoder.layers.3.cross_attn.key_proj.bias', 'prophetnet.encoder.layers.5.self_attn.value_proj.weight', 'prophetnet.decoder.layers.1.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.6.self_attn.key_proj.weight', 'prophetnet.decoder.layers.2.self_attn.relative_pos_embeddings.weight', 'prophetnet.decoder.layers.8.self_attn.value_proj.bias', 'prophetn

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 3.62 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8753 New best_val_rmse: 0.8753

16 steps took 2.14 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.046 Still best_val_rmse: 0.8753 (from epoch 0)

16 steps took 2.14 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7657 New best_val_rmse: 0.7657

16 steps took 2.12 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8523 Still best_val_rmse: 0.7657 (from epoch 0)

16 steps took 2.12 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.004 Still best_val_rmse: 0.7657 (from epoch 0)

16 steps took 2.14 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7213 New best_val_rmse: 0.7213

16 steps took 2.13 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7936 Still best_val_rmse: 0.7213 (from epoch 0)

16 steps took 2.14 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6712 New best_val_rmse: 0.6712

16 steps took 2.15 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6954 Still best_val_rmse: 0.6712 (from epoch 0)


[32m[I 2021-07-21 16:56:27,237][0m Trial 13 finished with value: 0.6712322235107422 and parameters: {'base_lr': 3.0144900564358662e-05, 'last_lr': 0.0001271786893237739}. Best is trial 11 with value: 0.6667493581771851.[0m



##### Using fold 0
##### Using base_lr 4.9849323716709556e-05 last_lr 8.034760636768166e-05


Some weights of the model checkpoint at microsoft/prophetnet-large-uncased were not used when initializing ProphetNetEncoder: ['prophetnet.decoder.layers.6.self_attn.query_proj.weight', 'prophetnet.encoder.layers.0.feed_forward.output.bias', 'prophetnet.decoder.layers.3.cross_attn.query_proj.bias', 'prophetnet.decoder.layers.4.self_attn.relative_pos_embeddings.weight', 'prophetnet.encoder.layers.11.feed_forward.intermediate.weight', 'prophetnet.decoder.layers.6.self_attn.relative_pos_embeddings.bias', 'prophetnet.decoder.layers.7.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.8.self_attn.query_proj.weight', 'prophetnet.decoder.layers.3.cross_attn.key_proj.bias', 'prophetnet.encoder.layers.5.self_attn.value_proj.weight', 'prophetnet.decoder.layers.1.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.6.self_attn.key_proj.weight', 'prophetnet.decoder.layers.2.self_attn.relative_pos_embeddings.weight', 'prophetnet.decoder.layers.8.self_attn.value_proj.bias', 'prophetn

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 3.66 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.21 New best_val_rmse: 1.21

16 steps took 2.15 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.011 New best_val_rmse: 1.011

16 steps took 2.12 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8091 New best_val_rmse: 0.8091

16 steps took 2.12 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8783 Still best_val_rmse: 0.8091 (from epoch 0)

16 steps took 2.14 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.8059 New best_val_rmse: 0.8059

16 steps took 2.11 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.8667 Still best_val_rmse: 0.8059 (from epoch 0)

16 steps took 2.13 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.8076 Still best_val_rmse: 0.8059 (from epoch 0)

16 steps took 2.13 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7087 New best_val_rmse: 0.7087

16 steps took 2.14 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6687 New best_val_rmse: 0.6687


[32m[I 2021-07-21 16:57:27,606][0m Trial 14 finished with value: 0.6686643362045288 and parameters: {'base_lr': 4.9849323716709556e-05, 'last_lr': 8.034760636768166e-05}. Best is trial 11 with value: 0.6667493581771851.[0m



##### Using fold 0
##### Using base_lr 5.5161998290280325e-05 last_lr 8.160289364513194e-05


Some weights of the model checkpoint at microsoft/prophetnet-large-uncased were not used when initializing ProphetNetEncoder: ['prophetnet.decoder.layers.6.self_attn.query_proj.weight', 'prophetnet.encoder.layers.0.feed_forward.output.bias', 'prophetnet.decoder.layers.3.cross_attn.query_proj.bias', 'prophetnet.decoder.layers.4.self_attn.relative_pos_embeddings.weight', 'prophetnet.encoder.layers.11.feed_forward.intermediate.weight', 'prophetnet.decoder.layers.6.self_attn.relative_pos_embeddings.bias', 'prophetnet.decoder.layers.7.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.8.self_attn.query_proj.weight', 'prophetnet.decoder.layers.3.cross_attn.key_proj.bias', 'prophetnet.encoder.layers.5.self_attn.value_proj.weight', 'prophetnet.decoder.layers.1.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.6.self_attn.key_proj.weight', 'prophetnet.decoder.layers.2.self_attn.relative_pos_embeddings.weight', 'prophetnet.decoder.layers.8.self_attn.value_proj.bias', 'prophetn

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 3.64 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.357 New best_val_rmse: 1.357

16 steps took 2.14 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9685 New best_val_rmse: 0.9685

16 steps took 2.13 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8439 New best_val_rmse: 0.8439

16 steps took 2.13 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8136 New best_val_rmse: 0.8136

16 steps took 2.13 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7972 New best_val_rmse: 0.7972

16 steps took 2.11 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.8151 Still best_val_rmse: 0.7972 (from epoch 0)

16 steps took 2.14 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6736 New best_val_rmse: 0.6736

16 steps took 2.13 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.8039 Still best_val_rmse: 0.6736 (from epoch 0)

16 steps took 2.13 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.8706 Still best_val_rmse: 0.6736 (from epoch 0)


[32m[I 2021-07-21 16:58:27,875][0m Trial 15 finished with value: 0.6736304759979248 and parameters: {'base_lr': 5.5161998290280325e-05, 'last_lr': 8.160289364513194e-05}. Best is trial 11 with value: 0.6667493581771851.[0m



##### Using fold 0
##### Using base_lr 7.483884628849969e-05 last_lr 0.00027518830472027223


Some weights of the model checkpoint at microsoft/prophetnet-large-uncased were not used when initializing ProphetNetEncoder: ['prophetnet.decoder.layers.6.self_attn.query_proj.weight', 'prophetnet.encoder.layers.0.feed_forward.output.bias', 'prophetnet.decoder.layers.3.cross_attn.query_proj.bias', 'prophetnet.decoder.layers.4.self_attn.relative_pos_embeddings.weight', 'prophetnet.encoder.layers.11.feed_forward.intermediate.weight', 'prophetnet.decoder.layers.6.self_attn.relative_pos_embeddings.bias', 'prophetnet.decoder.layers.7.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.8.self_attn.query_proj.weight', 'prophetnet.decoder.layers.3.cross_attn.key_proj.bias', 'prophetnet.encoder.layers.5.self_attn.value_proj.weight', 'prophetnet.decoder.layers.1.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.6.self_attn.key_proj.weight', 'prophetnet.decoder.layers.2.self_attn.relative_pos_embeddings.weight', 'prophetnet.decoder.layers.8.self_attn.value_proj.bias', 'prophetn

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 3.59 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.406 New best_val_rmse: 1.406

16 steps took 2.13 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8224 New best_val_rmse: 0.8224

16 steps took 2.13 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8501 Still best_val_rmse: 0.8224 (from epoch 0)

16 steps took 2.13 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.751 New best_val_rmse: 0.751

16 steps took 2.13 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.8314 Still best_val_rmse: 0.751 (from epoch 0)

16 steps took 2.11 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.112 Still best_val_rmse: 0.751 (from epoch 0)

16 steps took 2.13 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.7838 Still best_val_rmse: 0.751 (from epoch 0)

16 steps took 2.13 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.747 New best_val_rmse: 0.747

16 steps took 2.13 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.8228 Still best_val_rmse: 0.747 (from epoch 0)


[32m[I 2021-07-21 16:59:28,389][0m Trial 16 finished with value: 0.7469640970230103 and parameters: {'base_lr': 7.483884628849969e-05, 'last_lr': 0.00027518830472027223}. Best is trial 11 with value: 0.6667493581771851.[0m



##### Using fold 0
##### Using base_lr 4.4277086329381804e-05 last_lr 0.0009279335806268679


Some weights of the model checkpoint at microsoft/prophetnet-large-uncased were not used when initializing ProphetNetEncoder: ['prophetnet.decoder.layers.6.self_attn.query_proj.weight', 'prophetnet.encoder.layers.0.feed_forward.output.bias', 'prophetnet.decoder.layers.3.cross_attn.query_proj.bias', 'prophetnet.decoder.layers.4.self_attn.relative_pos_embeddings.weight', 'prophetnet.encoder.layers.11.feed_forward.intermediate.weight', 'prophetnet.decoder.layers.6.self_attn.relative_pos_embeddings.bias', 'prophetnet.decoder.layers.7.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.8.self_attn.query_proj.weight', 'prophetnet.decoder.layers.3.cross_attn.key_proj.bias', 'prophetnet.encoder.layers.5.self_attn.value_proj.weight', 'prophetnet.decoder.layers.1.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.6.self_attn.key_proj.weight', 'prophetnet.decoder.layers.2.self_attn.relative_pos_embeddings.weight', 'prophetnet.decoder.layers.8.self_attn.value_proj.bias', 'prophetn

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 3.66 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.12 New best_val_rmse: 1.12

16 steps took 2.15 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.071 New best_val_rmse: 1.071

16 steps took 2.14 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7532 New best_val_rmse: 0.7532

16 steps took 2.13 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7507 New best_val_rmse: 0.7507

16 steps took 2.13 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7682 Still best_val_rmse: 0.7507 (from epoch 0)

16 steps took 2.14 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.7305 New best_val_rmse: 0.7305

16 steps took 2.12 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.8008 Still best_val_rmse: 0.7305 (from epoch 0)

16 steps took 2.13 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.7995 Still best_val_rmse: 0.7305 (from epoch 0)

16 steps took 2.14 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.7287 New best_val_rmse: 0.7287


[32m[I 2021-07-21 17:00:29,083][0m Trial 17 finished with value: 0.7286903262138367 and parameters: {'base_lr': 4.4277086329381804e-05, 'last_lr': 0.0009279335806268679}. Best is trial 11 with value: 0.6667493581771851.[0m



##### Using fold 0
##### Using base_lr 9.11692304824071e-05 last_lr 0.001779649998969463


Some weights of the model checkpoint at microsoft/prophetnet-large-uncased were not used when initializing ProphetNetEncoder: ['prophetnet.decoder.layers.6.self_attn.query_proj.weight', 'prophetnet.encoder.layers.0.feed_forward.output.bias', 'prophetnet.decoder.layers.3.cross_attn.query_proj.bias', 'prophetnet.decoder.layers.4.self_attn.relative_pos_embeddings.weight', 'prophetnet.encoder.layers.11.feed_forward.intermediate.weight', 'prophetnet.decoder.layers.6.self_attn.relative_pos_embeddings.bias', 'prophetnet.decoder.layers.7.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.8.self_attn.query_proj.weight', 'prophetnet.decoder.layers.3.cross_attn.key_proj.bias', 'prophetnet.encoder.layers.5.self_attn.value_proj.weight', 'prophetnet.decoder.layers.1.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.6.self_attn.key_proj.weight', 'prophetnet.decoder.layers.2.self_attn.relative_pos_embeddings.weight', 'prophetnet.decoder.layers.8.self_attn.value_proj.bias', 'prophetn

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 3.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.359 New best_val_rmse: 1.359

16 steps took 2.14 seconds
Epoch: 0 batch_num: 32 val_rmse: 1.016 New best_val_rmse: 1.016

16 steps took 2.11 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9046 New best_val_rmse: 0.9046

16 steps took 2.12 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7934 New best_val_rmse: 0.7934

16 steps took 2.12 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.8078 Still best_val_rmse: 0.7934 (from epoch 0)

16 steps took 2.11 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.288 Still best_val_rmse: 0.7934 (from epoch 0)



[32m[I 2021-07-21 17:01:15,882][0m Trial 18 finished with value: 0.7933805584907532 and parameters: {'base_lr': 9.11692304824071e-05, 'last_lr': 0.001779649998969463}. Best is trial 11 with value: 0.6667493581771851.[0m


##### Using fold 0
##### Using base_lr 0.00016026748118707276 last_lr 0.00010651126202041471


Some weights of the model checkpoint at microsoft/prophetnet-large-uncased were not used when initializing ProphetNetEncoder: ['prophetnet.decoder.layers.6.self_attn.query_proj.weight', 'prophetnet.encoder.layers.0.feed_forward.output.bias', 'prophetnet.decoder.layers.3.cross_attn.query_proj.bias', 'prophetnet.decoder.layers.4.self_attn.relative_pos_embeddings.weight', 'prophetnet.encoder.layers.11.feed_forward.intermediate.weight', 'prophetnet.decoder.layers.6.self_attn.relative_pos_embeddings.bias', 'prophetnet.decoder.layers.7.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.8.self_attn.query_proj.weight', 'prophetnet.decoder.layers.3.cross_attn.key_proj.bias', 'prophetnet.encoder.layers.5.self_attn.value_proj.weight', 'prophetnet.decoder.layers.1.feed_forward.intermediate.bias', 'prophetnet.decoder.layers.6.self_attn.key_proj.weight', 'prophetnet.decoder.layers.2.self_attn.relative_pos_embeddings.weight', 'prophetnet.decoder.layers.8.self_attn.value_proj.bias', 'prophetn

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


16 steps took 3.6 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.458 New best_val_rmse: 1.458

16 steps took 2.13 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.9896 New best_val_rmse: 0.9896

16 steps took 2.12 seconds
Epoch: 0 batch_num: 48 val_rmse: 1.2 Still best_val_rmse: 0.9896 (from epoch 0)

16 steps took 2.12 seconds
Epoch: 0 batch_num: 64 val_rmse: 1.089 Still best_val_rmse: 0.9896 (from epoch 0)

16 steps took 2.13 seconds
Epoch: 0 batch_num: 80 val_rmse: 1.064 Still best_val_rmse: 0.9896 (from epoch 0)

16 steps took 2.12 seconds
Epoch: 0 batch_num: 96 val_rmse: 1.039 Still best_val_rmse: 0.9896 (from epoch 0)



[32m[I 2021-07-21 17:02:02,828][0m Trial 19 finished with value: 0.989554226398468 and parameters: {'base_lr': 0.00016026748118707276, 'last_lr': 0.00010651126202041471}. Best is trial 11 with value: 0.6667493581771851.[0m
[32m[I 2021-07-21 17:02:02,830][0m A new study created in memory with name: no-name-adbfd20b-3dd5-4e87-b52e-293a3f350f07[0m


 Best value:  0.6667493581771851
 Best params: 
    base_lr: 3.1317678752992314e-05
    last_lr: 0.000158820110171061
##### Using fold 1
##### Using base_lr 4.0393720359433316e-05 last_lr 0.002291879195268982


KeyboardInterrupt: 

In [None]:
%%time

for i in range(3, len(list(splits))):
    fold = i
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20)
    print(" Best value: ", study.best_trial.value)
    print(" Best params: ")
    for key, value in study.best_trial.params.items():
        print(f"    {key}: {value}")

In [None]:
for i in range(2, 3):
    fold = i
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=20)
    print(" Best value: ", study.best_trial.value)
    print(" Best params: ")
    for key, value in study.best_trial.params.items():
        print(f"    {key}: {value}")

### Verify the model

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm

In [None]:
cfg.model_offset = 0
cfg.model_limit = 6
cfg.n_folds = 5
cfg.svm_kernels = ['rbf']
cfg.svm_c = 5

In [None]:
num_bins = int(np.ceil(np.log2(len(train_df))))
train_df['bins'] = pd.cut(train_df['target'], bins=num_bins, labels=False)
bins = train_df['bins'].values

In [None]:
%%time

inference_models = []
for i in range(1, cfg.NUM_FOLDS + 1):
    print(f'Model {i}')
    inference_model = CommonLitModel()
    inference_model = inference_model.cuda()
    inference_model.load_state_dict(torch.load(str(MODELS_PATH/f"{cfg.model_name.replace('/', '_')}_{i}/model_{i}.pth")))
    inference_model.eval();
    inference_models.append(inference_model)

In [None]:
from transformers import RobertaTokenizer

tokenizers = []
for i in range(1, cfg.NUM_FOLDS):
    tokenizer = RobertaTokenizer.from_pretrained(MODELS_PATH/f"{cfg.model_name.replace('/', '_')}_{i}")
    tokenizers.append(tokenizer)

In [None]:
def get_cls_embeddings(dl, transformer_model):
    cls_embeddings = []
    with torch.no_grad():
        for input_features in tqdm(dl, total=len(dl)):
            output, context_vector = transformer_model(input_features['input_ids'].cuda(), input_features['attention_mask'].cuda())
#             cls_embeddings.extend(output['last_hidden_state'][:,0,:].detach().cpu().numpy())
            embedding_out = context_vector.detach().cpu().numpy()
            cls_embeddings.extend(embedding_out)
    return np.array(cls_embeddings)

In [None]:
def rmse_score(X, y):
    return np.sqrt(mean_squared_error(X, y))

In [None]:
def convert_to_list(t):
    return t.flatten().long()

class CommonLitDataset(nn.Module):
    def __init__(self, text, test_id, tokenizer, max_len=128):
        self.excerpt = text
        self.test_id = test_id
        self.max_len = max_len
        self.tokenizer = tokenizer
    
    def __getitem__(self,idx):
        encode = self.tokenizer(self.excerpt[idx],
                                return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',
                                truncation=True)
        return {'input_ids': convert_to_list(encode['input_ids']),
                'attention_mask': convert_to_list(encode['attention_mask']),
                'id': self.test_id[idx]}
    
    def __len__(self):
        return len(self.excerpt)

In [None]:
def create_dl(df, tokenizer):
    text = df['excerpt'].values
    ids = df['id'].values
    ds = CommonLitDataset(text, ids, tokenizer, max_len=cfg.MAX_LEN)
    return DataLoader(ds, 
                      batch_size = cfg.BATCH_SIZE,
                      shuffle=False,
                      num_workers = 1,
                      pin_memory=True,
                      drop_last=False
                     )

In [None]:
train_df = pd.read_csv(DATA_PATH/'train-orig.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')
remove_unnecessary(train_df)

In [None]:
train_target_mean = train_df['target'].mean()
train_target_std = train_df['target'].std()
train_df['normalized_target'] = (train_df['target'] - train_target_mean) / train_target_std

In [None]:
%%time

train_target = train_df['normalized_target'].values

def calc_mean(scores):
    return np.mean(np.array(scores), axis=0)

final_scores = []
final_rmse = []
kernel_rmse_score_mean = []
final_kernel_predictions_means = []
for j, (inference_model, tokenizer) in enumerate(zip(inference_models, tokenizers)):
    print('Model', j)
    test_dl = create_dl(test_df, tokenizer)
    train_dl = create_dl(train_df, tokenizer)
    transformer_model = inference_model
    transformer_model.cuda()
    X = get_cls_embeddings(train_dl, transformer_model)
    
    y = train_target
    X_test = get_cls_embeddings(test_dl, transformer_model)
    
    kfold = StratifiedKFold(n_splits=cfg.NUM_FOLDS)
    scores = []
    rmse_scores = []
    kernel_predictions_means = []
    for kernel in cfg.svm_kernels:
        print('Kernel', kernel)
        kernel_scores = []
        kernel_rmse_scores = []
        kernel_predictions = []
        for k, (train_idx, valid_idx) in enumerate(kfold.split(X, bins)):

            print('Fold', k, train_idx.shape, valid_idx.shape)
            model = SVR(C=cfg.svm_c, kernel=kernel, gamma='auto')

            X_train, y_train = X[train_idx], y[train_idx]
            X_valid, y_valid = X[valid_idx], y[valid_idx]
            model.fit(X_train, y_train)
            prediction = model.predict(X_valid)
            kernel_predictions.append(prediction)
            kernel_rmse_scores.append(rmse_score(prediction, y_valid))
            print('rmse_score', kernel_rmse_scores[k])
            kernel_scores.append(model.predict(X_test))
        kernel_predictions_means.append(np.array([np.mean(kp) for kp in kernel_predictions]).mean())
        scores.append(calc_mean(kernel_scores))
        kernel_rmse_score = calc_mean(kernel_rmse_scores)
        kernel_rmse_score_mean.append(kernel_rmse_score)
        rmse_scores.append(kernel_rmse_score)
    final_kernel_predictions_means.append(kernel_predictions_means)
    final_scores.append(calc_mean(scores))
    final_rmse.append(calc_mean(rmse_scores))
print('FINAL RMSE score', np.mean(np.array(final_rmse)))

In [None]:
final_kernel_predictions_means

In [None]:
# (train_df['target'] - cfg.train_target_mean) / cfg.train_target_std
final_scores_normalized = np.array(final_scores) * train_target_std + train_target_mean

In [None]:
kernel_rmse_score_mean_array = np.array(kernel_rmse_score_mean)
kernel_rmse_score_mean_sum = np.sum(kernel_rmse_score_mean_array)
prop_losses = kernel_rmse_score_mean_array / kernel_rmse_score_mean_sum
prop_losses_sum = (1 - prop_losses).sum()
weights = (1 - prop_losses) / prop_losses_sum
weights

In [None]:
def calc_mean(scores, weights=weights):
    return np.average(np.array(scores), weights=weights, axis=0)

In [None]:
target_mean = train_df['target'].mean()
final_scores_flat = calc_mean(final_scores_normalized).flatten()
final_scores_mean = final_scores_flat.mean()
target_mean, np.array(final_scores_normalized).mean()
# (-0.9579984513405823, -0.8029817438292849)

In [None]:
final_scores_flat

In [None]:
mean_diff = target_mean - final_scores_mean
mean_diff, mean_diff / len(final_scores)

In [None]:
sample_df['target'] = final_scores_flat + mean_diff
# sample_df['target'] = len(final_scores) / np.sum(1 / np.array(final_scores), axis=0) # harmonic mean
sample_df

### Prepare Packaging

In [None]:
cfg.model_name

In [None]:
BEST_MODEL_FOLDER = MODELS_PATH/cfg.model_name/'best'
!rm -rf {BEST_MODEL_FOLDER}
!mkdir -p {BEST_MODEL_FOLDER}

In [None]:
BEST_MODEL_FOLDER

In [None]:
cfg.NUM_FOLDS

In [None]:
bestmodels = [MODELS_PATH/f'{cfg.model_name}_{i + 1}' for i in range(0, cfg.NUM_FOLDS)]

In [None]:
bestmodels

In [None]:
from shutil import copyfile

def normalize_name(path_name):
    return path_name.replace('', '')

for i, best_model in enumerate(bestmodels):
    print(f'Processing {i}th model')
    i = i + 1
    best_model_file = f'{best_model}/model_{i}.pth'
    if Path(best_model_file).exists():
        copyfile(best_model_file, f'{BEST_MODEL_FOLDER}/{i}_pytorch_model.bin')
        tokenizer_path = Path(BEST_MODEL_FOLDER/f'tokenizer-{i}')
        tokenizer_path.mkdir(parents=True, exist_ok=True)
        assert tokenizer_path.exists()

        tokenizer_json = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/tokenizer_config.json'))
        assert tokenizer_json.exists(), f'{tokenizer_json} does not exist'
        copyfile(tokenizer_json, tokenizer_path/'tokenizer.json')

        vocab_txt = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/vocab.json'))
        assert vocab_txt.exists(), f'{vocab_txt} does not exist'
        copyfile(vocab_txt, tokenizer_path/'vocab.json')

        merges = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}_{i}/merges.txt'))
        assert merges.exists()
        copyfile(merges, tokenizer_path/'merges.txt')
    else:
        print(f'{best_model_file} is missing')

In [None]:
import shutil

shutil.make_archive(MODELS_PATH/cfg.model_name/'best_models', 'zip', BEST_MODEL_FOLDER)

In [None]:
!ls {MODELS_PATH/cfg.model_name}

In [None]:
!mv {MODELS_PATH}/{cfg.model_name}.yaml {MODELS_PATH/cfg.model_name}

In [None]:
transformer_model.transformer_model.save_pretrained(save_directory=f'{MODELS_PATH/cfg.model_name}/lm')

In [None]:
!du -h {MODELS_PATH/cfg.model_name}/*

In [None]:
shutil.make_archive(MODELS_PATH/cfg.model_name/'lm', 'zip', f'{MODELS_PATH/cfg.model_name}/lm')

In [None]:
!kaggle datasets init -p {MODELS_PATH/cfg.model_name}

In [None]:
dataset_json_path = Path(MODELS_PATH/cfg.model_name/'dataset-metadata.json')
assert dataset_json_path.exists()

In [None]:
!cat {str(dataset_json_path)}

In [None]:
with open(dataset_json_path, 'r') as f:
    dataset_json = f.read()
    dataset_json = dataset_json.replace('INSERT_TITLE_HERE', f'commonlit-{cfg.model_name}-light').replace('INSERT_SLUG_HERE', f'commonlit-{cfg.model_name}-light')
    print(dataset_json)
with(open(dataset_json_path, 'w')) as f:
    f.write(dataset_json)

In [None]:
!rm -rf {MODELS_PATH/cfg.model_name}/best
!rm -rf {MODELS_PATH/cfg.model_name}/lm

In [None]:
!kaggle datasets create -p {MODELS_PATH/cfg.model_name}

In [None]:
!kaggle datasets version -p {MODELS_PATH/cfg.model_name} -m "Version with merges.txt" -d

In [None]:
state_dict = torch.load(str(MODELS_PATH/f'distilroberta-0/checkpoint-105/pytorch_model.bin'))

In [None]:
loaded_model = CommonLitModel()

In [None]:
loaded_model.load_state_dict(state_dict)