In [1]:
# !pip install transformers

In [2]:
import os, gc, sys, time, collections, random
import numpy as np
import pandas as pd

from typing import Dict, Optional, Union, Any, List, Tuple

from pathlib import Path
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn

import torch.utils.data as D
from torch.utils.data.dataset import Dataset, IterableDataset
from torch.utils.data.dataloader import DataLoader

from transformers import DistilBertTokenizerFast
from transformers import DistilBertModel
from transformers import BertTokenizerFast
from transformers import BertModel
from transformers import Trainer
from transformers import AutoModel, AutoTokenizer
from transformers.data.processors.utils import InputFeatures
from transformers import TrainingArguments
from transformers.trainer_utils import EvalLoopOutput
from transformers.trainer import logging
from transformers.file_utils import is_torch_tpu_available, is_sagemaker_mp_enabled
from transformers.trainer_pt_utils import find_batch_size, nested_concat, nested_numpify, nested_truncate, nested_detach

### Folders and Dataframes

In [3]:
DATA_PATH = Path('/home/commonlit/data/')
assert DATA_PATH.exists()
MODELS_PATH = Path('/home/commonlit/models/')
if not MODELS_PATH.exists():
    os.mkdir(MODELS_PATH)
assert MODELS_PATH.exists()

In [4]:
!ls {DATA_PATH}

commonlitreadabilityprize.zip  sample_submission.csv  test.csv	train.csv


In [5]:
train_df = pd.read_csv(DATA_PATH/'train.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')
sample_df = pd.read_csv(DATA_PATH/'sample_submission.csv')

In [6]:
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845
...,...,...,...,...,...,...
2829,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2830,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2831,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2832,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


### Prepare Cross Validation

In [7]:
target = train_df['target'].to_numpy()

In [8]:
num_bins = int(np.floor(np.log2(len(train_df))) + 1)
train_df.loc[:, 'bins'] = pd.cut(train_df['target'], bins=num_bins, labels=False)

In [9]:
train_df[['target', 'bins']].groupby(['bins']).agg(['mean', 'count'])

Unnamed: 0_level_0,target,target
Unnamed: 0_level_1,mean,count
bins,Unnamed: 1_level_2,Unnamed: 2_level_2
0,-3.413097,43
1,-2.969369,79
2,-2.526589,172
3,-2.106393,269
4,-1.652726,366
5,-1.201502,418
6,-0.748738,481
7,-0.3098,405
8,0.130016,312
9,0.560407,183


In [10]:
kf = StratifiedKFold(n_splits=num_bins)

In [11]:
for i, (t_, v_) in enumerate(kf.split(X=train_df, y=train_df.bins.values)):
    train_df.loc[v_, 'kfold'] = i

In [12]:
train_df['kfold'] = train_df['kfold'].astype(np.uint8)

In [13]:
train_df = train_df.drop('bins', axis=1)

In [14]:
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,kfold
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009,0
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805,0
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676,0
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007,0
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845,0
...,...,...,...,...,...,...,...
2829,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900,11
2830,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648,11
2831,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866,11
2832,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128,11


In [15]:
bin_list = list(range(num_bins))
random.shuffle(bin_list)
bin_list

[10, 3, 6, 7, 9, 0, 1, 2, 8, 11, 5, 4]

### Metrics

In [16]:
def rmse_score(y_true, y_pred):
    return np.sqrt(np.mean((y_pred - y_true) ** 2))

def rmse_score_2(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [17]:
a = np.random.rand(10)
b = np.random.rand(10)

In [18]:
rmse_score(a, b), rmse_score_2(a, b)

(0.43627309883301485, 0.43627309883301485)

### Configuration

In [19]:
class CONFIG():
    model_name = 'bert'
    batch_size = 80
    max_len = 256
    save_dir = f'trained/{model_name}'
    num_workers = 2
    epochs = 20
    pretrained_transformers_model = f'{model_name}-base-uncased'
    split = 2

In [20]:
cfg = CONFIG()

### Prepare train test split

In [21]:
def create_split(fold = [1]):
    valid_df = train_df[train_df['kfold'].isin(fold)]
    valid_text = valid_df['excerpt'].values
    valid_target = valid_df['target'].values
    training_df = train_df[~train_df['kfold'].isin(fold)]
    train_text = training_df['excerpt'].values
    train_target = training_df['target'].values
    return train_text, train_target, valid_text, valid_target

In [22]:
train_text, train_target, valid_text, valid_target = create_split([cfg.split])
len(train_text), len(valid_text)

(2598, 236)

### Prepare Tokenizers

In [23]:
tokenizer = AutoTokenizer.from_pretrained(cfg.pretrained_transformers_model)
# Save the tokenizer so that you can download the files and move it to a Kaggle dataset.
tokenizer.save_pretrained(cfg.save_dir)

('trained/bert/tokenizer_config.json',
 'trained/bert/special_tokens_map.json',
 'trained/bert/vocab.txt',
 'trained/bert/added_tokens.json',
 'trained/bert/tokenizer.json')

In [24]:
encoded_dict = tokenizer(train_df['excerpt'].values[0],
                                return_tensors='pt',
                                max_length=cfg.max_len,
                                padding='max_length',
                                truncation=True)
decoded = tokenizer.decode(encoded_dict["input_ids"].squeeze())
decoded

'[CLS] when the young people returned to the ballroom, it presented a decidedly changed appearance. instead of an interior scene, it was a winter landscape. the floor was covered with snow - white canvas, not laid on smoothly, but rumpled over bumps and hillocks, like a real snow field. the numerous palms and evergreens that had decorated the room, were powdered with flour and strewn with tufts of cotton, like snow. also diamond dust had been lightly sprinkled on them, and glittering crystal icicles hung from the branches. at each end of the room, on the wall, hung a beautiful bear - skin rug. these rugs were for prizes, one for the girls and one for the boys. and this was the game. the girls were gathered at one end of the room and the boys at the other, and one end was called the north pole, and the other the south pole. each player was given a small flag which they were to plant on reaching the pole. this would have been an easy matter, but each traveller was obliged to wear snowsho

In [25]:
encoded_dict['input_ids'].shape

torch.Size([1, 256])

In [26]:
def convert_to_list(t):
    return t.flatten().long()

class CommonLitDataset(nn.Module):
    def __init__(self, text, target, tokenizer, max_len=128):
        self.excerpt = text
        self.target = target
        self.max_len = max_len
        self.tokenizer = tokenizer
    
    def __getitem__(self,idx):
        encode = self.tokenizer(self.excerpt[idx],
                                return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',
                                truncation=True)
        return InputFeatures(input_ids=convert_to_list(encode['input_ids']),
                      attention_mask=convert_to_list(encode['attention_mask']),
                      label=torch.tensor(self.target[idx]))
    
    def __len__(self):
        return len(self.excerpt)

In [27]:
def create_train_valid_ds(tokenizer, train_text, train_target, valid_text, valid_target):
    train_ds = CommonLitDataset(train_text, train_target, tokenizer, cfg.max_len)
    valid_ds = CommonLitDataset(valid_text, valid_target, tokenizer, cfg.max_len)
    return train_ds, valid_ds

In [28]:
# train_dl = D.DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, num_workers=cfg.num_workers)
# train_dl = D.DataLoader(valid_ds, batch_size=cfg.batch_size, shuffle=False, num_workers=cfg.num_workers)

In [29]:
# encode, target = next(iter(train_dl))

In [30]:
# encode.keys(), target.shape, encode['input_ids'].shape, encode['attention_mask'].shape

In [31]:
# encode['input_ids'][0].squeeze()

### Model

In [32]:
# You can use a Transformer model of your choice.
# transformer_model = DistilBertModel.from_pretrained(cfg.pretrained_transformers_model)
transformer_model = AutoModel.from_pretrained(cfg.pretrained_transformers_model)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [33]:
# transformer_out = transformer_model(input_ids=encode['input_ids'].squeeze(), attention_mask=encode['attention_mask'].squeeze())

In [34]:
# dict(transformer_out)['last_hidden_state'].shape

In [35]:
# torch.mean(transformer_out.last_hidden_state, axis=1).shape

In [36]:
# sample_layer = nn.Linear(768, 1)

In [37]:
# sample_layer(torch.mean(transformer_out.last_hidden_state, axis=1)).shape

In [38]:
from transformers import PreTrainedModel

class CommonLitModel(PreTrainedModel):
    def __init__(self):
        super(PreTrainedModel, self).__init__()
        self.transformer_model = AutoModel.from_pretrained(cfg.pretrained_transformers_model)
        self.drop = nn.Dropout(0.5)
        self.out = nn.Linear(768, 1)
        self.config = self.transformer_model.config
        
    def forward(self, input_ids, attention_mask):
        transformer_out = self.transformer_model(input_ids=input_ids.squeeze(), attention_mask=attention_mask.squeeze(), output_hidden_states=False)
        x = transformer_out.pooler_output
#         x = transformer_out.last_hidden_state[:, 0, :]
        x = self.drop(x)
        x = self.out(x)
        return x
    
    def floating_point_ops(self, inputs: Dict[str, Union[torch.Tensor, Any]]):
        """
        For models that inherit from :class:`~transformers.PreTrainedModel`, uses that method to compute the number of
        floating point operations for every backward + forward pass. If using another model, either implement such a
        method in the model or subclass and override this method.
        Args:
            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.
        Returns:
            :obj:`int`: The number of floating-point operations.
        """
        return 0

In [39]:
model = CommonLitModel()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [40]:
encoded_dict.input_ids.shape

torch.Size([1, 256])

In [41]:
transformer_model = transformer_model.cuda()
sample_out = transformer_model(encoded_dict.input_ids.cuda(), encoded_dict.attention_mask.cuda(), output_hidden_states=True)

In [42]:
train_ds, valid_ds = create_train_valid_ds(tokenizer, train_text, train_target, valid_text, valid_target)

In [43]:
encode = train_ds[0]

In [44]:
encode.attention_mask.unsqueeze(0).shape, encoded_dict.input_ids.shape

(torch.Size([1, 256]), torch.Size([1, 256]))

In [45]:
sample_out = transformer_model(encode.input_ids.unsqueeze(0).cuda(), encode.attention_mask.unsqueeze(0).cuda())

In [46]:
sample_out.pooler_output.shape

torch.Size([1, 768])

### Training

In [47]:
import wandb
wandb.init(project="commonlit")

[34m[1mwandb[0m: Currently logged in as: [33mgilf[0m (use `wandb login --relogin` to force relogin)


In [48]:
loss_fct = nn.MSELoss()

In [49]:
def create_training_args(fold):
    training_args = TrainingArguments(
        output_dir=str(MODELS_PATH/f'{cfg.model_name}-{fold}'),
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=cfg.batch_size,
        per_device_eval_batch_size=cfg.batch_size,
        num_train_epochs=cfg.epochs,
        logging_strategy="epoch",
        logging_first_step=True,
        save_steps=40000,
        fp16=True,
        evaluation_strategy="epoch",
        save_total_limit = 3,
        load_best_model_at_end=True,
        metric_for_best_model='mse',
        greater_is_better=False,
        gradient_accumulation_steps=2,
        learning_rate=5e-5
    )
    return training_args

In [50]:
training_args = create_training_args(cfg.split)

In [51]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    return {'mse': mean_squared_error(logits, labels), 'rmse': rmse_score_2(logits, labels)}

In [52]:
tokenizer = AutoTokenizer.from_pretrained(cfg.pretrained_transformers_model)

In [53]:
logger = logging.get_logger(__name__)

class CommonLitTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        input_ids = inputs.pop("input_ids")
        attention_mask = inputs.pop("attention_mask")
        outputs = model(input_ids, attention_mask)
        logits = outputs
        loss = loss_fct(logits.flatten(),
                        labels.float().flatten())
        zero_cat = torch.zeros([1, 1]).to(outputs.device)
        return (loss, torch.cat([zero_cat, outputs])) if return_outputs else loss

In [54]:
!rm -rf /home/commonlit/models/bert-*

In [55]:
%%time

from transformers import EarlyStoppingCallback

bin_step = 1
bestmodels = []
eval_rmses = []
for i in range(0, num_bins, bin_step):
    wandb.init()
    train_bins = bin_list[i:i+bin_step]
    print('train_bins', f'{i}: {train_bins}')
    tokenizer = AutoTokenizer.from_pretrained(cfg.pretrained_transformers_model)
    train_text, train_target, valid_text, valid_target = create_split([i])
    train_ds, valid_ds = create_train_valid_ds(tokenizer, train_text, train_target, valid_text, valid_target)
    training_args = create_training_args(i)
    model = CommonLitModel()
    trainer = CommonLitTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=valid_ds,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=9)]
    )
    trainer.train()
    trainer.save_model()
    print('training_args.output_dir', training_args.output_dir)
    tokenizer.save_pretrained(training_args.output_dir)
    result = trainer.evaluate()
    bestmodels.append(trainer.state.best_model_checkpoint)
    print('best_model_checkpoint', trainer.state.best_model_checkpoint)
    print('result', result)
    eval_rmses.append(result['eval_rmse'])

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

train_bins 0: [10]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch,Training Loss,Validation Loss,Mse,Rmse
0,1.2889,0.597303,0.597303,0.772854
1,0.5001,0.502688,0.502688,0.709005
2,0.3181,0.307054,0.307054,0.554124
3,0.223,0.493393,0.493393,0.702419
4,0.1518,0.41028,0.41028,0.640531
5,0.1044,0.583482,0.583481,0.76386
6,0.1248,0.646072,0.646072,0.803786
7,0.0859,0.371755,0.371755,0.609718
8,0.0555,0.408381,0.408382,0.639047
9,0.0474,0.46101,0.46101,0.678977


training_args.output_dir /home/commonlit/models/bert-0


best_model_checkpoint /home/commonlit/models/bert-0/checkpoint-48
result {'eval_loss': 0.307053804397583, 'eval_mse': 0.3070538341999054, 'eval_rmse': 0.5541244149208069, 'eval_runtime': 1.3412, 'eval_samples_per_second': 176.708, 'epoch': 11.97, 'eval_mem_cpu_alloc_delta': 0, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 0, 'eval_mem_gpu_peaked_delta': 944294400}


VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss,0.0339
train/learning_rate,2e-05
train/epoch,11.97
train/global_step,192.0
_runtime,278.0
_timestamp,1622196185.0
_step,26.0
eval/loss,0.30705
eval/mse,0.30705
eval/rmse,0.55412


0,1
train/loss,█▄▂▂▂▁▁▁▁▁▁▁▁
train/learning_rate,█▇▇▆▆▅▅▄▃▃▂▂▁
train/epoch,▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▆▆▆▆▇▇▇▇████
train/global_step,▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▆▆▆▆▇▇▇▇████
_runtime,▁▁▂▂▂▃▃▃▃▄▄▄▄▅▅▆▆▆▆▇▇▇▇████
_timestamp,▁▁▂▂▂▃▃▃▃▄▄▄▄▅▅▆▆▆▆▇▇▇▇████
_step,▁▁▂▂▂▂▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██
eval/loss,▇▅▁▅▃▇█▂▃▄▅▆▁
eval/mse,▇▅▁▅▃▇█▂▃▄▅▆▁
eval/rmse,▇▅▁▅▃▇█▃▃▅▅▆▁


train_bins 1: [3]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch,Training Loss,Validation Loss,Mse,Rmse
0,1.1874,0.672168,0.672168,0.819858
1,0.4885,0.351761,0.351761,0.593095
2,0.3576,0.320682,0.320682,0.566288
3,0.2465,0.382358,0.382358,0.618351
4,0.1739,0.360723,0.360723,0.600603
5,0.1801,0.449627,0.449627,0.670543
6,0.1357,0.377559,0.377559,0.614458
7,0.0898,0.311363,0.311363,0.557999
8,0.0726,0.368873,0.368873,0.607349
9,0.0637,0.373716,0.373716,0.611323


training_args.output_dir /home/commonlit/models/bert-1


best_model_checkpoint /home/commonlit/models/bert-1/checkpoint-192
result {'eval_loss': 0.30099478363990784, 'eval_mse': 0.30099475383758545, 'eval_rmse': 0.548629879951477, 'eval_runtime': 1.3526, 'eval_samples_per_second': 175.221, 'epoch': 19.97, 'eval_mem_cpu_alloc_delta': 0, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 0, 'eval_mem_gpu_peaked_delta': 944294400}


VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss,0.0215
train/learning_rate,0.0
train/epoch,19.97
train/global_step,320.0
_runtime,455.0
_timestamp,1622196645.0
_step,42.0
eval/loss,0.30099
eval/mse,0.30099
eval/rmse,0.54863


0,1
train/loss,█▅▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,██▇▇▇▆▆▆▅▅▅▄▄▃▃▃▂▂▂▁▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇█████
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇█████
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇█████
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇█████
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇██
eval/loss,█▂▁▃▂▄▂▁▂▂▁▁▃▂▁▂▂▂▂▂▁
eval/mse,█▂▁▃▂▄▂▁▂▂▁▁▃▂▁▂▂▂▂▂▁
eval/rmse,█▂▁▃▂▄▃▁▃▃▁▁▃▂▂▂▂▂▂▂▁


train_bins 2: [6]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch,Training Loss,Validation Loss,Mse,Rmse
0,1.1235,0.489318,0.489318,0.699513
1,0.4849,0.426611,0.426611,0.653155
2,0.3191,0.34239,0.34239,0.585141
3,0.2242,0.378172,0.378172,0.614957
4,0.1911,0.573299,0.573299,0.757165
5,0.1346,0.306037,0.306037,0.553206
6,0.115,0.388525,0.388525,0.623318
7,0.1042,0.431388,0.431389,0.656802
8,0.0829,0.398221,0.398221,0.631048
9,0.0748,0.351657,0.351657,0.593007


training_args.output_dir /home/commonlit/models/bert-2


best_model_checkpoint /home/commonlit/models/bert-2/checkpoint-96
result {'eval_loss': 0.3060372769832611, 'eval_mse': 0.3060373067855835, 'eval_rmse': 0.5532063841819763, 'eval_runtime': 1.3607, 'eval_samples_per_second': 173.441, 'epoch': 14.97, 'eval_mem_cpu_alloc_delta': -176816128, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 176816128, 'eval_mem_gpu_peaked_delta': 944294400}


VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss,0.0411
train/learning_rate,1e-05
train/epoch,14.97
train/global_step,240.0
_runtime,344.0
_timestamp,1622196993.0
_step,32.0
eval/loss,0.30604
eval/mse,0.30604
eval/rmse,0.55321


0,1
train/loss,█▄▂▂▂▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,██▇▇▆▆▅▅▄▄▃▃▂▂▁▁
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██████
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██████
_runtime,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
_timestamp,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
_step,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇███
eval/loss,▆▄▂▃█▁▃▄▃▂▃▂▃▃▄▁
eval/mse,▆▄▂▃█▁▃▄▃▂▃▂▃▃▄▁
eval/rmse,▆▄▂▃█▁▃▅▄▂▄▂▃▃▄▁


train_bins 3: [7]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch,Training Loss,Validation Loss,Mse,Rmse
0,1.0137,0.508218,0.508218,0.712894
1,0.5125,0.467039,0.467039,0.683403
2,0.3539,0.295093,0.295093,0.543225
3,0.2864,0.313196,0.313196,0.559639
4,0.1962,0.542101,0.542101,0.736275
5,0.1448,0.424731,0.424731,0.651714
6,0.0995,0.569742,0.569741,0.754812
7,0.0817,0.445117,0.445117,0.667171
8,0.0678,0.654498,0.654498,0.80901
9,0.0552,0.561809,0.561809,0.749539


training_args.output_dir /home/commonlit/models/bert-3


best_model_checkpoint /home/commonlit/models/bert-3/checkpoint-48
result {'eval_loss': 0.29509344696998596, 'eval_mse': 0.2950934171676636, 'eval_rmse': 0.5432249903678894, 'eval_runtime': 1.3628, 'eval_samples_per_second': 173.179, 'epoch': 11.97, 'eval_mem_cpu_alloc_delta': -154705920, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 154705920, 'eval_mem_gpu_peaked_delta': 944294400}


VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss,0.0397
train/learning_rate,2e-05
train/epoch,11.97
train/global_step,192.0
_runtime,278.0
_timestamp,1622197275.0
_step,26.0
eval/loss,0.29509
eval/mse,0.29509
eval/rmse,0.54322


0,1
train/loss,█▄▃▂▂▂▁▁▁▁▁▁▁
train/learning_rate,█▇▇▆▆▅▅▄▃▃▂▂▁
train/epoch,▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▆▆▆▆▇▇▇▇████
train/global_step,▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▆▆▆▆▇▇▇▇████
_runtime,▁▁▂▂▂▃▃▃▃▄▄▄▄▅▅▆▆▆▆▇▇▇▇████
_timestamp,▁▁▂▂▂▃▃▃▃▄▄▄▄▅▅▆▆▆▆▇▇▇▇████
_step,▁▁▂▂▂▂▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██
eval/loss,▅▄▁▁▆▄▆▄█▆▃▆▁
eval/mse,▅▄▁▁▆▄▆▄█▆▃▆▁
eval/rmse,▅▅▁▁▆▄▇▄█▆▄▆▁


train_bins 4: [9]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch,Training Loss,Validation Loss,Mse,Rmse
0,1.0609,0.392449,0.392449,0.626457
1,0.447,0.344281,0.344281,0.586754
2,0.2922,0.58311,0.58311,0.763616
3,0.2275,0.317434,0.317434,0.563413
4,0.1556,0.447778,0.447778,0.669162
5,0.1361,0.645342,0.645342,0.803332
6,0.107,0.421243,0.421243,0.649032
7,0.0834,0.595452,0.595452,0.771655
8,0.0937,0.360089,0.360088,0.600074
9,0.0573,0.444973,0.444974,0.667063


training_args.output_dir /home/commonlit/models/bert-4


best_model_checkpoint /home/commonlit/models/bert-4/checkpoint-64
result {'eval_loss': 0.31743407249450684, 'eval_mse': 0.31743407249450684, 'eval_rmse': 0.5634129047393799, 'eval_runtime': 1.3535, 'eval_samples_per_second': 174.363, 'epoch': 12.97, 'eval_mem_cpu_alloc_delta': 0, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 0, 'eval_mem_gpu_peaked_delta': 944294400}


VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss,0.0383
train/learning_rate,2e-05
train/epoch,12.97
train/global_step,208.0
_runtime,305.0
_timestamp,1622197585.0
_step,28.0
eval/loss,0.31743
eval/mse,0.31743
eval/rmse,0.56341


0,1
train/loss,█▄▂▂▂▁▁▁▁▁▁▁▁▁
train/learning_rate,█▇▇▆▆▅▅▄▄▃▃▂▂▁
train/epoch,▁▁▁▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
train/global_step,▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
_runtime,▁▁▁▂▂▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
_timestamp,▁▁▁▂▂▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
_step,▁▁▁▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇██
eval/loss,▃▂▇▁▄█▃▇▂▄▃▂▄▁
eval/mse,▃▂▇▁▄█▃▇▂▄▃▂▄▁
eval/rmse,▃▂▇▁▄█▃▇▂▄▃▂▄▁


train_bins 5: [0]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch,Training Loss,Validation Loss,Mse,Rmse
0,0.947,0.377731,0.377731,0.614598
1,0.4341,0.32067,0.32067,0.566277
2,0.2954,0.278443,0.278443,0.527677
3,0.2307,0.415549,0.415549,0.644631
4,0.1663,0.374867,0.374867,0.612264
5,0.1177,0.278159,0.278159,0.527408
6,0.1064,0.279811,0.279811,0.528972
7,0.0814,0.274138,0.274138,0.523582
8,0.068,0.431991,0.43199,0.65726
9,0.0746,0.380415,0.380416,0.616778


training_args.output_dir /home/commonlit/models/bert-5


best_model_checkpoint /home/commonlit/models/bert-5/checkpoint-128
result {'eval_loss': 0.2741376459598541, 'eval_mse': 0.2741377055644989, 'eval_rmse': 0.5235816240310669, 'eval_runtime': 1.3512, 'eval_samples_per_second': 174.657, 'epoch': 16.97, 'eval_mem_cpu_alloc_delta': -262144, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 262144, 'eval_mem_gpu_peaked_delta': 944294400}


VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss,0.0234
train/learning_rate,1e-05
train/epoch,16.97
train/global_step,272.0
_runtime,392.0
_timestamp,1622197981.0
_step,36.0
eval/loss,0.27414
eval/mse,0.27414
eval/rmse,0.52358


0,1
train/loss,█▄▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,██▇▇▆▆▆▅▅▄▄▃▃▃▂▂▁▁
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇██████
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇██████
_runtime,▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
_timestamp,▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
eval/loss,▆▃▁▇▅▁▁▁█▆▇▅▄▂▇▅▃▁
eval/mse,▆▃▁▇▅▁▁▁█▆▇▅▄▂▇▅▃▁
eval/rmse,▆▃▁▇▆▁▁▁█▆▇▅▄▂▇▆▃▁


train_bins 6: [1]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch,Training Loss,Validation Loss,Mse,Rmse
0,0.8834,0.361409,0.361409,0.601173
1,0.4216,0.295252,0.295252,0.543371
2,0.2875,0.302217,0.302217,0.549742
3,0.2328,0.473655,0.473655,0.688226
4,0.1825,0.219806,0.219806,0.468834
5,0.1413,0.249278,0.249278,0.499278
6,0.1184,0.513094,0.513094,0.716306
7,0.1012,0.258693,0.258692,0.508618
8,0.0852,0.255998,0.255998,0.505963
9,0.0673,0.46698,0.46698,0.683359


training_args.output_dir /home/commonlit/models/bert-6


best_model_checkpoint /home/commonlit/models/bert-6/checkpoint-80
result {'eval_loss': 0.2198057323694229, 'eval_mse': 0.2198057323694229, 'eval_rmse': 0.46883442997932434, 'eval_runtime': 1.3451, 'eval_samples_per_second': 175.446, 'epoch': 13.97, 'eval_mem_cpu_alloc_delta': 0, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 0, 'eval_mem_gpu_peaked_delta': 944294400}


VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss,0.0387
train/learning_rate,2e-05
train/epoch,13.97
train/global_step,224.0
_runtime,331.0
_timestamp,1622198317.0
_step,30.0
eval/loss,0.21981
eval/mse,0.21981
eval/rmse,0.46883


0,1
train/loss,█▅▃▂▂▂▁▁▁▁▁▁▁▁▁
train/learning_rate,██▇▇▆▆▅▅▄▄▃▃▂▂▁
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
_runtime,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
_timestamp,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇███
eval/loss,▄▃▃▇▁▂█▂▂▇▃▆▃▃▁
eval/mse,▄▃▃▇▁▂█▂▂▇▃▆▃▃▁
eval/rmse,▅▃▃▇▁▂█▂▂▇▃▆▃▃▁


train_bins 7: [2]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch,Training Loss,Validation Loss,Mse,Rmse
0,0.9521,0.288253,0.288253,0.536892
1,0.4282,0.288453,0.288453,0.537078
2,0.3266,0.28111,0.28111,0.530198
3,0.2429,0.314621,0.314621,0.560911
4,0.1781,0.455873,0.455873,0.675183
5,0.1235,0.293208,0.293208,0.541487
6,0.0931,0.350949,0.350949,0.592409
7,0.0729,0.28342,0.28342,0.532372
8,0.0554,0.324688,0.324688,0.569814
9,0.05,0.297372,0.297372,0.545318


training_args.output_dir /home/commonlit/models/bert-7


best_model_checkpoint /home/commonlit/models/bert-7/checkpoint-208
result {'eval_loss': 0.2610973119735718, 'eval_mse': 0.2610972821712494, 'eval_rmse': 0.5109767913818359, 'eval_runtime': 1.344, 'eval_samples_per_second': 175.6, 'epoch': 19.97, 'eval_mem_cpu_alloc_delta': 0, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 0, 'eval_mem_gpu_peaked_delta': 944294400}


VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss,0.0181
train/learning_rate,0.0
train/epoch,19.97
train/global_step,320.0
_runtime,461.0
_timestamp,1622198781.0
_step,42.0
eval/loss,0.2611
eval/mse,0.2611
eval/rmse,0.51098


0,1
train/loss,█▅▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,██▇▇▇▆▆▆▅▅▅▄▄▃▃▃▂▂▂▁▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇█████
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇█████
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇█████
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇█████
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇██
eval/loss,▂▂▂▃█▂▄▂▃▂▂▂▁▅▁▃▂▂▂▂▁
eval/mse,▂▂▂▃█▂▄▂▃▂▂▂▁▅▁▃▂▂▂▂▁
eval/rmse,▂▂▂▃█▂▄▂▄▂▃▂▁▅▁▃▂▂▂▃▁


train_bins 8: [8]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch,Training Loss,Validation Loss,Mse,Rmse
0,1.1577,0.416455,0.416455,0.645333
1,0.471,0.398478,0.398478,0.631251
2,0.346,0.323144,0.323144,0.568457
3,0.2648,0.293164,0.293164,0.541446
4,0.2163,0.55505,0.55505,0.745017
5,0.1553,0.339826,0.339826,0.582946
6,0.1349,0.438015,0.438015,0.661827
7,0.0904,0.544145,0.544145,0.737662
8,0.0958,0.592364,0.592364,0.769652
9,0.1023,0.564698,0.564698,0.751464


training_args.output_dir /home/commonlit/models/bert-8


best_model_checkpoint /home/commonlit/models/bert-8/checkpoint-64
result {'eval_loss': 0.2931639850139618, 'eval_mse': 0.2931639850139618, 'eval_rmse': 0.5414462089538574, 'eval_runtime': 1.3393, 'eval_samples_per_second': 176.209, 'epoch': 12.97, 'eval_mem_cpu_alloc_delta': 0, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 0, 'eval_mem_gpu_peaked_delta': 944294400}


VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss,0.0524
train/learning_rate,2e-05
train/epoch,12.97
train/global_step,208.0
_runtime,308.0
_timestamp,1622199095.0
_step,28.0
eval/loss,0.29316
eval/mse,0.29316
eval/rmse,0.54145


0,1
train/loss,█▄▂▂▂▁▁▁▁▁▁▁▁▁
train/learning_rate,██▇▆▆▅▅▄▄▃▃▂▂▁
train/epoch,▁▁▁▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
train/global_step,▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
_runtime,▁▁▁▂▂▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
_timestamp,▁▁▁▂▂▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
_step,▁▁▁▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇██
eval/loss,▄▃▂▁▇▂▄▆▇▇█▃▅▁
eval/mse,▄▃▂▁▇▂▄▆▇▇█▃▅▁
eval/rmse,▄▄▂▁▇▂▄▇▇▇█▄▅▁


train_bins 9: [11]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch,Training Loss,Validation Loss,Mse,Rmse
0,0.972,0.578082,0.578082,0.760317
1,0.4375,0.479791,0.479791,0.692669
2,0.3046,0.380459,0.380459,0.616814
3,0.2079,0.406177,0.406177,0.63732
4,0.1739,0.308677,0.308677,0.555587
5,0.1324,0.330026,0.330026,0.574479
6,0.1097,0.523641,0.523641,0.72363
7,0.0838,0.308934,0.308934,0.555818
8,0.0725,0.301592,0.301592,0.549174
9,0.0582,0.451022,0.451022,0.671581


training_args.output_dir /home/commonlit/models/bert-9


best_model_checkpoint /home/commonlit/models/bert-9/checkpoint-144
result {'eval_loss': 0.3015921413898468, 'eval_mse': 0.301592081785202, 'eval_rmse': 0.5491740107536316, 'eval_runtime': 1.3489, 'eval_samples_per_second': 174.956, 'epoch': 17.97, 'eval_mem_cpu_alloc_delta': 0, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 0, 'eval_mem_gpu_peaked_delta': 944294400}


VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss,0.0217
train/learning_rate,1e-05
train/epoch,17.97
train/global_step,288.0
_runtime,420.0
_timestamp,1622199520.0
_step,38.0
eval/loss,0.30159
eval/mse,0.30159
eval/rmse,0.54917


0,1
train/loss,█▅▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,██▇▇▆▆▆▅▅▅▄▄▃▃▃▂▂▁▁
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇██████
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇██████
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇█████
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇█████
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval/loss,█▆▃▄▁▂▇▁▁▅▂▂▃▃▃▄▃▃▁
eval/mse,█▆▃▄▁▂▇▁▁▅▂▂▃▃▃▄▃▃▁
eval/rmse,█▆▃▄▁▂▇▁▁▅▂▂▃▃▃▄▃▄▁


train_bins 10: [5]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch,Training Loss,Validation Loss,Mse,Rmse
0,0.918,0.540998,0.540998,0.735525
1,0.3936,0.738494,0.738494,0.859357
2,0.2953,0.353594,0.353594,0.594638
3,0.212,0.510102,0.510102,0.714214
4,0.1897,0.352614,0.352614,0.593813
5,0.1607,0.299924,0.299924,0.547653
6,0.1315,0.377753,0.377753,0.614616
7,0.0958,0.476691,0.476691,0.690428
8,0.0668,0.525158,0.525158,0.724678
9,0.0576,0.457778,0.457778,0.676593


training_args.output_dir /home/commonlit/models/bert-10


best_model_checkpoint /home/commonlit/models/bert-10/checkpoint-96
result {'eval_loss': 0.299924373626709, 'eval_mse': 0.299924373626709, 'eval_rmse': 0.5476534962654114, 'eval_runtime': 1.3425, 'eval_samples_per_second': 175.791, 'epoch': 14.97, 'eval_mem_cpu_alloc_delta': 0, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 0, 'eval_mem_gpu_peaked_delta': 944294400}


VBox(children=(Label(value=' 0.03MB of 0.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss,0.026
train/learning_rate,1e-05
train/epoch,14.97
train/global_step,240.0
_runtime,351.0
_timestamp,1622199876.0
_step,32.0
eval/loss,0.29992
eval/mse,0.29992
eval/rmse,0.54765


0,1
train/loss,█▄▂▂▂▂▁▁▁▁▁▁▁▁▁▁
train/learning_rate,██▇▇▆▆▅▅▄▄▃▃▂▂▁▁
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██████
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██████
_runtime,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
_timestamp,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
_step,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇███
eval/loss,▅█▂▄▂▁▂▄▅▄▃▄▄▃▃▁
eval/mse,▅█▂▄▂▁▂▄▅▄▃▄▄▃▃▁
eval/rmse,▅█▂▅▂▁▃▄▅▄▃▄▅▄▃▁


train_bins 11: [4]


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch,Training Loss,Validation Loss,Mse,Rmse
0,0.9866,0.485803,0.485803,0.696996
1,0.4457,0.468408,0.468408,0.684403
2,0.3243,0.348973,0.348973,0.59074
3,0.2323,0.379724,0.379724,0.616217
4,0.1675,0.372726,0.372726,0.610513
5,0.1329,0.467292,0.467292,0.683588
6,0.0931,0.359733,0.359733,0.599778
7,0.0974,0.582988,0.582988,0.763537
8,0.0745,0.371522,0.371522,0.609526
9,0.059,0.569263,0.569263,0.754495


training_args.output_dir /home/commonlit/models/bert-11


best_model_checkpoint /home/commonlit/models/bert-11/checkpoint-48
result {'eval_loss': 0.3489733636379242, 'eval_mse': 0.3489733636379242, 'eval_rmse': 0.5907396674156189, 'eval_runtime': 1.3419, 'eval_samples_per_second': 175.869, 'epoch': 11.97, 'eval_mem_cpu_alloc_delta': 0, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 0, 'eval_mem_gpu_peaked_delta': 944294400}
CPU times: user 1h 43min 11s, sys: 22min 13s, total: 2h 5min 25s
Wall time: 1h 11min 2s


In [56]:
trainer.train()

Epoch,Training Loss,Validation Loss,Mse,Rmse
0,0.2251,0.385473,0.385473,0.620865
1,0.1922,0.463153,0.463153,0.680553
2,0.1684,0.533111,0.533111,0.730144
3,0.1325,0.398889,0.398889,0.631576
4,0.113,0.527647,0.527647,0.726393
5,0.0997,0.486656,0.486656,0.697608
6,0.0879,0.504787,0.504787,0.710484
7,0.0786,0.470719,0.470719,0.68609
8,0.0811,0.471283,0.471283,0.686501
9,0.0762,0.471283,0.471283,0.686501


TrainOutput(global_step=160, training_loss=0.12613386586308478, metrics={'train_runtime': 225.1119, 'train_samples_per_second': 1.422, 'total_flos': 0, 'epoch': 9.97, 'train_mem_cpu_alloc_delta': 278528, 'train_mem_gpu_alloc_delta': 512, 'train_mem_cpu_peaked_delta': 187420672, 'train_mem_gpu_peaked_delta': 18555851264})

In [57]:
BEST_MODEL_FOLDER = MODELS_PATH/cfg.model_name/'best'
!rm -rf {BEST_MODEL_FOLDER}
!mkdir -p {BEST_MODEL_FOLDER}

In [58]:
from shutil import copyfile

for i, best_model in enumerate(bestmodels):
    best_model_file = f'{best_model}/pytorch_model.bin'
    if Path(best_model_file).exists():
        copyfile(best_model_file, f'{BEST_MODEL_FOLDER}/{i}_pytorch_model.bin')
        tokenizer_path = Path(BEST_MODEL_FOLDER/f'tokenizer-{i}')
        tokenizer_path.mkdir(parents=True, exist_ok=True)
        assert tokenizer_path.exists()

        tokenizer_json = Path(f'{MODELS_PATH/cfg.model_name}-{i}/tokenizer.json')
        assert tokenizer_json.exists()
        copyfile(tokenizer_json, tokenizer_path/'tokenizer.json')

        vocab_txt = Path(f'{MODELS_PATH/cfg.model_name}-{i}/vocab.txt')
        assert vocab_txt.exists()
        copyfile(vocab_txt, tokenizer_path/'vocab.txt')

        config_json = Path(f'{MODELS_PATH/cfg.model_name}-{i}/config.json')
        assert config_json.exists()
        copyfile(config_json, tokenizer_path/'config.json')
    else:
        print(f'{best_model_file} is missing')

/home/commonlit/models/bert-11/checkpoint-48/pytorch_model.bin is missing


In [59]:
bestmodels

['/home/commonlit/models/bert-0/checkpoint-48',
 '/home/commonlit/models/bert-1/checkpoint-192',
 '/home/commonlit/models/bert-2/checkpoint-96',
 '/home/commonlit/models/bert-3/checkpoint-48',
 '/home/commonlit/models/bert-4/checkpoint-64',
 '/home/commonlit/models/bert-5/checkpoint-128',
 '/home/commonlit/models/bert-6/checkpoint-80',
 '/home/commonlit/models/bert-7/checkpoint-208',
 '/home/commonlit/models/bert-8/checkpoint-64',
 '/home/commonlit/models/bert-9/checkpoint-144',
 '/home/commonlit/models/bert-10/checkpoint-96',
 '/home/commonlit/models/bert-11/checkpoint-48']

In [60]:
import shutil

shutil.make_archive(MODELS_PATH/cfg.model_name/'best_models', 'zip', BEST_MODEL_FOLDER)

'/home/commonlit/models/bert/best_models.zip'