In [None]:
# !pip install transformers

In [1]:
import os, gc, sys, time, collections, random
import numpy as np
import pandas as pd

from typing import Dict, Optional, Union, Any, List, Tuple

from pathlib import Path
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn

import torch.utils.data as D
from torch.utils.data.dataset import Dataset, IterableDataset
from torch.utils.data.dataloader import DataLoader

from transformers import DistilBertTokenizerFast
from transformers import DistilBertModel
from transformers import BertTokenizerFast
from transformers import BertModel
from transformers import Trainer
from transformers import AutoModel, AutoTokenizer, AutoConfig
from transformers.data.processors.utils import InputFeatures
from transformers import TrainingArguments
from transformers.trainer_utils import EvalLoopOutput
from transformers.trainer import logging
from transformers.file_utils import is_torch_tpu_available, is_sagemaker_mp_enabled
from transformers.trainer_pt_utils import find_batch_size, nested_concat, nested_numpify, nested_truncate, nested_detach

### Folders and Dataframes

In [2]:
# Input data must already be present; fail fast if the directory is missing.
DATA_PATH = Path('/home/commonlit/data/')
assert DATA_PATH.exists()
# Create the model output directory together with any missing parents.
# exist_ok=True removes the race between an existence check and the creation.
MODELS_PATH = Path('/home/commonlit/models/')
MODELS_PATH.mkdir(parents=True, exist_ok=True)
assert MODELS_PATH.exists()

In [3]:
!ls {DATA_PATH}

commonlit_lm		       sample_submission.csv  train-orig.csv
commonlitreadabilityprize.zip  test.csv		      train.csv


In [4]:
train_df = pd.read_csv(DATA_PATH/'train.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')
sample_df = pd.read_csv(DATA_PATH/'sample_submission.csv')

In [5]:
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
2,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
3,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
4,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
...,...,...,...,...,...,...
2836,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2837,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2838,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2839,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


In [6]:
train_df[train_df['id'] == '5127fb10f']['excerpt'].values

array(['The Battle of Waterloo was a battle that was fought mostly between French and British forces. Napoleon was crowned as Emperor of France in 1804. Then he launched many successful attacks on other countries in Europe. France soon had an empire that stretched from Spain to the Russian border. The only country that was still not captured was Great Britain. The Royal Navy had many ships, so invasion by France was not possible. However, Great Britain was not strong enough to stop Napoleon and his army from taking over most of mainland Europe.\nNapoleon seemed unstoppable until two separate campaigns caused his empire to fall apart. He gathered a huge army to invade and conquer Russia once and for all in 1812. However, he did not think that he would have very many difficulties and it turned out he did. His army was caught by the Russian winter and destroyed by the weather and lack of food.',
       'Napoleon was crowned as Emperor of France in 1804, and then launched the successful Na

In [7]:
train_df[train_df['id'] == '5127fb10f']['target']

257   -0.338548
258   -0.338548
Name: target, dtype: float64

### Prepare Cross Validation

In [8]:
target = train_df['target'].to_numpy()

In [9]:
# Number of bins grows with log2 of the row count: floor(log2(n)) + 1.
num_bins = int(1 + np.floor(np.log2(len(train_df))))
# Cut the continuous target into `num_bins` equal-width intervals and keep the
# integer interval id; it is used as the stratification label further down.
train_df.loc[:, 'bins'] = pd.cut(train_df['target'], num_bins, labels=False)

In [10]:
train_df[['target', 'bins']].groupby(['bins']).agg(['mean', 'count'])

Unnamed: 0_level_0,target,target
Unnamed: 0_level_1,mean,count
bins,Unnamed: 1_level_2,Unnamed: 2_level_2
0,-3.413097,43
1,-2.969369,79
2,-2.526589,172
3,-2.106393,269
4,-1.652726,366
5,-1.20115,419
6,-0.74879,483
7,-0.30957,408
8,0.130016,312
9,0.560802,184


In [11]:
kf = StratifiedKFold(n_splits=num_bins)

In [12]:
# kf.split yields *positional* row indices. Translate them into index labels
# before using .loc, so the assignment stays correct even when train_df does
# not carry a default 0..n-1 index (e.g. after filtering or deduplication).
for fold_id, (_, valid_pos) in enumerate(kf.split(X=train_df, y=train_df['bins'].values)):
    train_df.loc[train_df.index[valid_pos], 'kfold'] = fold_id

In [13]:
train_df['kfold'] = train_df['kfold'].astype(np.uint8)

In [14]:
train_df = train_df.drop('bins', axis=1)

In [15]:
train_df

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error,kfold
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009,0
1,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009,0
2,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805,0
3,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676,0
4,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007,0
...,...,...,...,...,...,...,...
2836,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900,11
2837,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648,11
2838,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866,11
2839,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128,11


In [16]:
# Shuffle the fold ids with a dedicated, seeded RNG so that a fresh
# Restart & Run All reproduces the same order without depending on (or
# disturbing) the global `random` state.
bin_list = list(range(num_bins))
random.Random(42).shuffle(bin_list)
bin_list

[4, 10, 9, 2, 0, 1, 5, 8, 7, 6, 11, 3]

### Metrics

In [17]:
def rmse_score(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Args:
        y_true: Ground-truth values (array-like).
        y_pred: Predicted values (array-like).

    Returns:
        The square root of the mean squared difference, as a NumPy scalar.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Inputs with the same number of elements but different shapes, such as
    # (N, 1) predictions against (N,) targets, would broadcast to an (N, N)
    # matrix and silently produce a wrong score. Compare them element-wise.
    if y_true.shape != y_pred.shape and y_true.size == y_pred.size:
        y_true = y_true.reshape(-1)
        y_pred = y_pred.reshape(-1)
    return np.sqrt(np.mean((y_pred - y_true) ** 2))

def rmse_score_2(y_true, y_pred):
    """Root-mean-squared error, computed via ``mean_squared_error``.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values.

    Returns:
        The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [18]:
a = np.random.rand(10)
b = np.random.rand(10)

In [19]:
rmse_score(a, b), rmse_score_2(a, b)

(0.3642612017514833, 0.3642612017514833)

### Configuration

In [20]:
class CONFIG:
    """Hyper-parameters and paths shared by the cells of this notebook."""

    # Identifier of the base checkpoint; also used to build output folder names.
    model_name = 'valhalla/distilbart-mnli-12-9'
    # Local directory the tokenizer, config and model weights are loaded from.
    pretrained_transformers_model = '/home/commonlit/models/distilbart/lm'
    # Relative directory the tokenizer files are written to.
    save_dir = 'trained/' + model_name

    # Sequences are padded / truncated to this many tokens.
    max_len = 256
    # Per-device batch size for both training and evaluation.
    batch_size = 16
    # Upper bound on the number of training epochs per fold.
    epochs = 30
    num_workers = 2

In [21]:
cfg = CONFIG()

### Prepare train/validation split

In [22]:
def create_split(fold=(1,)):
    """Split the global ``train_df`` into training and validation arrays.

    Args:
        fold: Fold id, or collection of fold ids, whose rows form the
            validation set. Every other row goes to the training set.

    Returns:
        Tuple ``(train_text, train_target, valid_text, valid_target)`` holding
        the ``excerpt`` and ``target`` column values of each part.
    """
    # Accept a bare fold id as well as a list/tuple of ids.
    if np.isscalar(fold):
        fold = [fold]
    # Build the membership mask once and reuse it for both halves.
    is_valid = train_df['kfold'].isin(list(fold))
    valid_df = train_df[is_valid]
    training_df = train_df[~is_valid]
    return (
        training_df['excerpt'].values,
        training_df['target'].values,
        valid_df['excerpt'].values,
        valid_df['target'].values,
    )

In [23]:
train_text, train_target, valid_text, valid_target = create_split([0])
len(train_text), len(valid_text)

(2604, 237)

### Prepare Tokenizers

In [24]:
tokenizer = AutoTokenizer.from_pretrained(cfg.pretrained_transformers_model)
# Save the tokenizer so that you can download the files and move it to a Kaggle dataset.
tokenizer.save_pretrained(cfg.save_dir)

('trained/valhalla/distilbart-mnli-12-9/tokenizer_config.json',
 'trained/valhalla/distilbart-mnli-12-9/special_tokens_map.json',
 'trained/valhalla/distilbart-mnli-12-9/vocab.json',
 'trained/valhalla/distilbart-mnli-12-9/merges.txt',
 'trained/valhalla/distilbart-mnli-12-9/added_tokens.json',
 'trained/valhalla/distilbart-mnli-12-9/tokenizer.json')

In [25]:
encoded_dict = tokenizer(train_df['excerpt'].values[0],
                                return_tensors='pt',
                                max_length=cfg.max_len,
                                padding='max_length',
                                truncation=True)
decoded = tokenizer.decode(encoded_dict["input_ids"].squeeze())
decoded

'<s>When the young people returned to the ballroom, it presented a decidedly changed appearance. Instead of an interior scene, it was a winter landscape.\nThe floor was covered with snow-white canvas, not laid on smoothly, but rumpled over bumps and hillocks, like a real snow field. The numerous palms and evergreens that had decorated the room, were powdered with flour and strewn with tufts of cotton, like snow. Also diamond dust had been lightly sprinkled on them, and glittering crystal icicles hung from the branches.\nAt each end of the room, on the wall, hung a beautiful bear-skin rug.\nThese rugs were for prizes, one for the girls and one for the boys. And this was the game.\nThe girls were gathered at one end of the room and the boys at the other, and one end was called the North Pole, and the other the South Pole. Each player was given a small flag which they were to plant on reaching the Pole.\nThis would have been an easy matter, but each traveller was obliged to wear snowshoes

In [26]:
encoded_dict['input_ids'].shape

torch.Size([1, 256])

In [27]:
def convert_to_list(t):
    """Collapse tensor ``t`` to one dimension and cast it to 64-bit integers.

    Despite the name, the result is a tensor, not a Python list.
    """
    flat = torch.flatten(t)
    return flat.to(torch.int64)

class CommonLitDataset(Dataset):
    """Map-style dataset that tokenizes one excerpt per item.

    This is a data container, not a network layer, so it derives from
    ``Dataset`` rather than ``nn.Module`` (whose initializer the original
    never called).

    Args:
        text: Indexable collection of excerpt strings.
        target: Indexable collection of regression targets, aligned with ``text``.
        tokenizer: Callable tokenizer invoked with ``return_tensors='pt'``.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, text, target, tokenizer, max_len=128):
        super().__init__()
        self.excerpt = text
        self.target = target
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and wrap the result in an ``InputFeatures``."""
        encode = self.tokenizer(self.excerpt[idx],
                                return_tensors='pt',
                                max_length=self.max_len,
                                padding='max_length',
                                truncation=True)
        # The tokenizer output is flattened to 1-D so that batching stacks
        # items along a new leading dimension.
        return InputFeatures(input_ids=convert_to_list(encode['input_ids']),
                             attention_mask=convert_to_list(encode['attention_mask']),
                             label=torch.tensor(self.target[idx]))

    def __len__(self):
        """Number of excerpts in the dataset."""
        return len(self.excerpt)

In [28]:
def create_train_valid_ds(tokenizer, train_text, train_target, valid_text, valid_target):
    """Wrap the training and validation arrays in ``CommonLitDataset`` objects.

    Both datasets share ``tokenizer`` and use ``cfg.max_len`` as sequence length.

    Returns:
        Tuple ``(train_ds, valid_ds)``.
    """
    splits = ((train_text, train_target), (valid_text, valid_target))
    train_ds, valid_ds = (
        CommonLitDataset(texts, targets, tokenizer, cfg.max_len)
        for texts, targets in splits
    )
    return train_ds, valid_ds

In [29]:
# train_dl = D.DataLoader(train_ds, batch_size=cfg.batch_size, shuffle=True, num_workers=cfg.num_workers)
# train_dl = D.DataLoader(valid_ds, batch_size=cfg.batch_size, shuffle=False, num_workers=cfg.num_workers)

In [30]:
# encode, target = next(iter(train_dl))

In [31]:
# encode.keys(), target.shape, encode['input_ids'].shape, encode['attention_mask'].shape

In [32]:
# encode['input_ids'][0].squeeze()

### Model

In [33]:
# You can use a Transformer model of your choice.
# transformer_model = DistilBertModel.from_pretrained(cfg.pretrained_transformers_model)
transformer_model = AutoModel.from_pretrained(cfg.pretrained_transformers_model)

Some weights of the model checkpoint at /home/commonlit/models/distilbart/lm were not used when initializing BartModel: ['final_logits_bias', 'lm_head.weight']
- This IS expected if you are initializing BartModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [34]:
# transformer_out = transformer_model(input_ids=encode['input_ids'].squeeze(), attention_mask=encode['attention_mask'].squeeze())

In [35]:
# dict(transformer_out)['last_hidden_state'].shape

In [36]:
# torch.mean(transformer_out.last_hidden_state, axis=1).shape

In [37]:
# sample_layer = nn.Linear(768, 1)

In [38]:
model_config = AutoConfig.from_pretrained(cfg.pretrained_transformers_model)

In [39]:
model_config

BartConfig {
  "_name_or_path": "valhalla/distilbart-mnli-12-9",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 9,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "extra_pos_embeddings": 2,
  "finetuning_task": "mnli",
  "force_bos_token_to_be_generated": false,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "contradiction",
    "1": "neutral",
    "2": "entailment"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id

In [40]:
from torch.nn import functional as F

In [41]:
from transformers import PreTrainedModel

class CommonLitModel(PreTrainedModel):
    """Pretrained transformer followed by a LayerNorm -> Dropout -> Linear head.

    The head maps the hidden state at sequence position 0 to one regression
    value per example. The backbone and its config are loaded from
    ``cfg.pretrained_transformers_model``.
    """

    def __init__(self):
        # Deliberately bypasses PreTrainedModel.__init__ and runs the next
        # initializer in the MRO instead; the config is attached manually below.
        super(PreTrainedModel, self).__init__()
        self.transformer_model = AutoModel.from_pretrained(cfg.pretrained_transformers_model)
        self.drop = nn.Dropout(0.5)
        self.config = AutoConfig.from_pretrained(cfg.pretrained_transformers_model)
        # The head must match the width of the hidden states. The previous
        # code used max_position_embeddings, which is a sequence-length limit
        # and only works when it happens to equal the hidden width.
        hidden_size = self.config.hidden_size
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.out = nn.Linear(hidden_size, 1)

    def _init_weights(self, module):
        """Initialize ``module`` with the standard deviation given by the config."""
        # Not every config names this value `initializer_range`; the BART
        # config printed above exposes `init_std`. Fall back accordingly.
        std = getattr(self.config, 'initializer_range', None)
        if std is None:
            std = getattr(self.config, 'init_std', 0.02)
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, attention_mask):
        """Return one regression value per example, shaped ``(batch, 1)``.

        Args:
            input_ids: Token ids whose last dimension is the sequence length.
            attention_mask: Mask with the same layout as ``input_ids``.
        """
        # Collapse everything but the sequence dimension into the batch
        # dimension. A bare squeeze() would also remove the batch dimension
        # when a batch contains a single example.
        input_ids = input_ids.reshape(-1, input_ids.shape[-1])
        attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1])
        transformer_out = self.transformer_model(input_ids=input_ids,
                                                 attention_mask=attention_mask,
                                                 output_hidden_states=False)
        # Use the hidden state at sequence position 0 as the pooled representation.
        x = transformer_out.last_hidden_state[:, 0, :]
        x = self.layer_norm(x)
        x = self.drop(x)
        x = self.out(x)
        return x

    def floating_point_ops(self, inputs: Dict[str, Union[torch.Tensor, Any]]):
        """Report the floating-point operation count for ``inputs``.

        No estimate is implemented for this wrapper, so the count is always 0.
        Presumably this replaces the inherited estimate that the training loop
        queries; confirm against the installed transformers version.

        Args:
            inputs: The inputs and targets of the model (unused).

        Returns:
            Always ``0``.
        """
        return 0

In [42]:
model = CommonLitModel()

Some weights of the model checkpoint at /home/commonlit/models/distilbart/lm were not used when initializing BartModel: ['final_logits_bias', 'lm_head.weight']
- This IS expected if you are initializing BartModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [43]:
encoded_dict.input_ids.shape

torch.Size([1, 256])

In [44]:
transformer_model = transformer_model.cuda()
sample_out = transformer_model(encoded_dict.input_ids.cuda(), encoded_dict.attention_mask.cuda(), output_hidden_states=True)

In [45]:
sample_out.keys()

odict_keys(['last_hidden_state', 'past_key_values', 'decoder_hidden_states', 'encoder_last_hidden_state', 'encoder_hidden_states'])

In [46]:
sample_out['last_hidden_state'].shape, sample_out['last_hidden_state'][:, 0].shape

(torch.Size([1, 256, 1024]), torch.Size([1, 1024]))

In [47]:
train_ds, valid_ds = create_train_valid_ds(tokenizer, train_text, train_target, valid_text, valid_target)

In [48]:
encode = train_ds[0]

In [49]:
encode.attention_mask.unsqueeze(0).shape, encoded_dict.input_ids.shape

(torch.Size([1, 256]), torch.Size([1, 256]))

In [50]:
sample_out = transformer_model(encode.input_ids.unsqueeze(0).cuda(), encode.attention_mask.unsqueeze(0).cuda())

### Training

In [51]:
import wandb

In [52]:
loss_fct = nn.MSELoss()

In [53]:
def create_training_args(fold):
    """Build the ``TrainingArguments`` used to train one fold.

    Args:
        fold: Identifier appended to the output directory name so that every
            fold writes its checkpoints to its own folder under ``MODELS_PATH``.

    Returns:
        A ``TrainingArguments`` instance configured from ``cfg``.
    """
    # '/' in the model name would create nested directories, so flatten it.
    run_name = cfg.model_name.replace('/', '_')
    options = dict(
        output_dir=str(MODELS_PATH / f"{run_name}-{fold}"),
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        # Batch sizes, epoch count and learning rate.
        per_device_train_batch_size=cfg.batch_size,
        per_device_eval_batch_size=cfg.batch_size,
        gradient_accumulation_steps=1,
        num_train_epochs=cfg.epochs,
        learning_rate=5e-5,
        fp16=True,
        # Log and evaluate once per epoch.
        logging_strategy="epoch",
        logging_first_step=True,
        evaluation_strategy="epoch",
        # Checkpointing and best-model selection (lower 'mse' is better).
        save_steps=40000,
        save_total_limit=3,
        load_best_model_at_end=True,
        metric_for_best_model='mse',
        greater_is_better=False,
    )
    return TrainingArguments(**options)

In [54]:
def compute_metrics(eval_pred):
    """Compute the evaluation metrics reported after each evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(predictions, labels)``.

    Returns:
        Dict with the mean squared error under ``'mse'`` and its square root
        under ``'rmse'``.
    """
    predictions, references = eval_pred
    mse = mean_squared_error(predictions, references)
    return {'mse': mse, 'rmse': np.sqrt(mse)}

In [55]:
tokenizer = AutoTokenizer.from_pretrained(cfg.pretrained_transformers_model)

In [56]:
logger = logging.get_logger(__name__)

class CommonLitTrainer(Trainer):
    """Trainer that scores the model's regression output with ``loss_fct``."""

    def compute_loss(self, model, inputs, return_outputs=False):
        """Run the model on one batch and return its loss.

        Args:
            model: Model called as ``model(input_ids, attention_mask)``.
            inputs: Batch dict; ``labels``, ``input_ids`` and ``attention_mask``
                are popped from it.
            return_outputs: When true, also return the padded predictions.

        Returns:
            ``loss``, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        targets = inputs.pop("labels")
        predictions = model(inputs.pop("input_ids"), inputs.pop("attention_mask"))
        loss = loss_fct(predictions.reshape(-1), targets.float().reshape(-1))
        if not return_outputs:
            return loss
        # A dummy zero row is prepended to the predictions. Presumably the
        # caller discards the first element of the returned outputs, so the
        # padding keeps every real prediction — TODO confirm against the
        # installed transformers version.
        padding = torch.zeros(1, 1).to(predictions.device)
        return loss, torch.cat([padding, predictions])

In [57]:
!rm -rf /home/commonlit/models/{cfg.model_name.replace('/', '_')}-*

In [58]:
%%time

from transformers import EarlyStoppingCallback

# Cross-validation driver: trains one model per group of `bin_step` folds and
# records the best checkpoint path and validation RMSE of each run.
bin_step = 1
bestmodels = []
eval_rmses = []
# The tokenizer does not depend on the fold, so load it once.
tokenizer = AutoTokenizer.from_pretrained(cfg.pretrained_transformers_model)
model = trainer = None
for i in range(0, num_bins, bin_step):
    # Release the previous fold's model and trainer before building new ones,
    # so two models are never resident at once. The names stay bound after
    # the loop, holding the objects of the last fold.
    model = trainer = None
    gc.collect()
    torch.cuda.empty_cache()

    train_bins = bin_list[i:i+bin_step]
    print('train_bins', f'{i}: {train_bins}')
    # Validate on exactly the folds announced above. The previous code always
    # passed [i], which ignored the shuffled order and would leave folds
    # uncovered whenever bin_step > 1.
    train_text, train_target, valid_text, valid_target = create_split(train_bins)
    train_ds, valid_ds = create_train_valid_ds(tokenizer, train_text, train_target, valid_text, valid_target)
    training_args = create_training_args(i)
    model = CommonLitModel()
    wandb.init(project=f"commonlit_{cfg.model_name.replace('/', '_')}")
    trainer = CommonLitTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=valid_ds,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=9)]
    )
    trainer.train()
    trainer.save_model()
    print('training_args.output_dir', training_args.output_dir)
    # Keep the tokenizer files next to the weights so the folder is self-contained.
    tokenizer.save_pretrained(training_args.output_dir)
    result = trainer.evaluate()
    bestmodels.append(trainer.state.best_model_checkpoint)
    print('best_model_checkpoint', trainer.state.best_model_checkpoint)
    print('result', result)
    eval_rmses.append(result['eval_rmse'])
    # Close this fold's run explicitly so the last run is finalized too.
    wandb.finish()

train_bins 0: [4]


Some weights of the model checkpoint at /home/commonlit/models/distilbart/lm were not used when initializing BartModel: ['final_logits_bias', 'lm_head.weight']
- This IS expected if you are initializing BartModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[34m[1mwandb[0m: Currently logged in as: [33mgilf[0m (use `wandb login --relogin` to force relogin)


Epoch,Training Loss,Validation Loss,Mse,Rmse
1,0.8217,0.406151,0.406151,0.6373
2,0.3536,0.524776,0.524776,0.724414
3,0.2157,0.352183,0.352183,0.59345
4,0.1482,0.400802,0.400802,0.633089
5,0.1003,0.312207,0.312207,0.558755
6,0.0821,0.552817,0.552817,0.743517
7,0.0693,0.321296,0.321296,0.56683
8,0.0525,0.33609,0.33609,0.579732
9,0.0456,0.491401,0.491401,0.701
10,0.0428,0.471864,0.471864,0.686924


training_args.output_dir /home/commonlit/models/valhalla_distilbart-mnli-12-9-0


best_model_checkpoint /home/commonlit/models/valhalla_distilbart-mnli-12-9-0/checkpoint-815
result {'eval_loss': 0.31220677495002747, 'eval_mse': 0.31220677495002747, 'eval_rmse': 0.5587546825408936, 'eval_runtime': 4.1369, 'eval_samples_per_second': 57.29, 'epoch': 14.0, 'eval_mem_cpu_alloc_delta': 8192, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 0, 'eval_mem_gpu_peaked_delta': 831624704}
train_bins 1: [10]


Some weights of the model checkpoint at /home/commonlit/models/distilbart/lm were not used when initializing BartModel: ['final_logits_bias', 'lm_head.weight']
- This IS expected if you are initializing BartModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


VBox(children=(Label(value=' 0.05MB of 0.05MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss,0.0255
train/learning_rate,3e-05
train/epoch,14.0
train/global_step,2282.0
_runtime,1291.0
_timestamp,1622812291.0
_step,30.0
eval/loss,0.31221
eval/mse,0.31221
eval/rmse,0.55875


0,1
train/loss,█▄▂▂▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,██▇▇▆▆▅▅▄▄▃▃▂▂▁
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
_runtime,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
_timestamp,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇███
eval/loss,▄▇▂▄▁█▁▂▆▆▂▂▂▅▁
eval/mse,▄▇▂▄▁█▁▂▆▆▂▂▂▅▁
eval/rmse,▄▇▂▄▁█▁▂▆▆▂▃▂▅▁


Epoch,Training Loss,Validation Loss,Mse,Rmse
1,1.054,0.40798,0.40798,0.638733
2,0.3583,0.384088,0.384088,0.619748
3,0.2215,0.290426,0.290426,0.538912
4,0.1609,0.605732,0.605732,0.778288
5,0.117,0.473139,0.473139,0.687851
6,0.0816,0.376189,0.376189,0.613342
7,0.0668,0.282795,0.282795,0.531785
8,0.0566,0.36334,0.36334,0.602777
9,0.0506,0.280559,0.280559,0.529678
10,0.0431,0.269287,0.269287,0.518928


training_args.output_dir /home/commonlit/models/valhalla_distilbart-mnli-12-9-1


best_model_checkpoint /home/commonlit/models/valhalla_distilbart-mnli-12-9-1/checkpoint-3260
result {'eval_loss': 0.2528814375400543, 'eval_mse': 0.2528814375400543, 'eval_rmse': 0.5028731822967529, 'eval_runtime': 4.1642, 'eval_samples_per_second': 56.913, 'epoch': 29.0, 'eval_mem_cpu_alloc_delta': 0, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 0, 'eval_mem_gpu_peaked_delta': 831624704}
train_bins 2: [9]


Some weights of the model checkpoint at /home/commonlit/models/distilbart/lm were not used when initializing BartModel: ['final_logits_bias', 'lm_head.weight']
- This IS expected if you are initializing BartModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


VBox(children=(Label(value=' 0.05MB of 0.05MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss,0.0078
train/learning_rate,0.0
train/epoch,29.0
train/global_step,4727.0
_runtime,2659.0
_timestamp,1622814959.0
_step,60.0
eval/loss,0.25288
eval/mse,0.25288
eval/rmse,0.50287


0,1
train/loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,███▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
eval/loss,▄▄▂█▅▃▂▃▂▁▂▁▁▁▁▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁
eval/mse,▄▄▂█▅▃▂▃▂▁▂▁▁▁▁▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁
eval/rmse,▄▄▂█▆▄▂▄▂▁▂▁▁▂▁▁▁▂▂▁▁▁▁▁▂▁▁▁▁▁


Epoch,Training Loss,Validation Loss,Mse,Rmse
1,1.1245,0.427618,0.427618,0.653925
2,0.3791,0.405077,0.405077,0.636457
3,0.2294,0.421815,0.421815,0.649473
4,0.1489,0.617894,0.617894,0.786062
5,0.1182,0.264701,0.264701,0.514491
6,0.088,0.333747,0.333747,0.577708
7,0.0738,0.382515,0.382515,0.618478
8,0.0536,0.30563,0.30563,0.552838
9,0.0523,0.296162,0.296162,0.544207
10,0.0483,0.391507,0.391507,0.625705


training_args.output_dir /home/commonlit/models/valhalla_distilbart-mnli-12-9-2


best_model_checkpoint /home/commonlit/models/valhalla_distilbart-mnli-12-9-2/checkpoint-815
result {'eval_loss': 0.2647012174129486, 'eval_mse': 0.2647012174129486, 'eval_rmse': 0.5144912004470825, 'eval_runtime': 4.1631, 'eval_samples_per_second': 56.928, 'epoch': 14.0, 'eval_mem_cpu_alloc_delta': 0, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 0, 'eval_mem_gpu_peaked_delta': 831624704}
train_bins 3: [2]


Some weights of the model checkpoint at /home/commonlit/models/distilbart/lm were not used when initializing BartModel: ['final_logits_bias', 'lm_head.weight']
- This IS expected if you are initializing BartModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


VBox(children=(Label(value=' 0.05MB of 0.05MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss,0.0306
train/learning_rate,3e-05
train/epoch,14.0
train/global_step,2282.0
_runtime,1289.0
_timestamp,1622816257.0
_step,30.0
eval/loss,0.2647
eval/mse,0.2647
eval/rmse,0.51449


0,1
train/loss,█▆▂▂▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,██▇▇▆▆▅▅▄▄▃▃▂▂▁
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
_runtime,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
_timestamp,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇███
eval/loss,▄▄▄█▁▂▃▂▂▄▂▃▂▂▁
eval/mse,▄▄▄█▁▂▃▂▂▄▂▃▂▂▁
eval/rmse,▅▄▄█▁▃▄▂▂▄▂▃▂▂▁


Epoch,Training Loss,Validation Loss,Mse,Rmse
1,1.0708,0.397298,0.397298,0.630316
2,0.3926,0.337266,0.337266,0.580746
3,0.2215,0.511511,0.511511,0.7152
4,0.1727,0.543167,0.543167,0.736998
5,0.1174,0.355499,0.355499,0.596237
6,0.0952,0.263255,0.263255,0.513084
7,0.0651,0.250535,0.250535,0.500534
8,0.0598,0.335457,0.335457,0.579187
9,0.05,0.353784,0.353784,0.594797
10,0.042,0.290083,0.290083,0.538594


training_args.output_dir /home/commonlit/models/valhalla_distilbart-mnli-12-9-3


best_model_checkpoint /home/commonlit/models/valhalla_distilbart-mnli-12-9-3/checkpoint-1141
result {'eval_loss': 0.2505345344543457, 'eval_mse': 0.2505345344543457, 'eval_rmse': 0.5005342364311218, 'eval_runtime': 4.1728, 'eval_samples_per_second': 56.797, 'epoch': 16.0, 'eval_mem_cpu_alloc_delta': 0, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 0, 'eval_mem_gpu_peaked_delta': 831624704}
train_bins 4: [0]


Some weights of the model checkpoint at /home/commonlit/models/distilbart/lm were not used when initializing BartModel: ['final_logits_bias', 'lm_head.weight']
- This IS expected if you are initializing BartModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


VBox(children=(Label(value=' 0.05MB of 0.05MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss,0.0234
train/learning_rate,2e-05
train/epoch,16.0
train/global_step,2608.0
_runtime,1474.0
_timestamp,1622817740.0
_step,34.0
eval/loss,0.25053
eval/mse,0.25053
eval/rmse,0.50053


0,1
train/loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,██▇▇▆▆▅▅▅▄▄▃▃▂▂▁▁
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██████
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██████
_runtime,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
_timestamp,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
eval/loss,▅▃▇█▄▁▁▃▃▂▃▃▃▂▃▂▁
eval/mse,▅▃▇█▄▁▁▃▃▂▃▃▃▂▃▂▁
eval/rmse,▅▃▇█▄▁▁▃▄▂▃▄▄▂▃▂▁


Epoch,Training Loss,Validation Loss,Mse,Rmse
1,1.1014,0.407902,0.407902,0.638672
2,0.354,0.814069,0.814069,0.902258
3,0.2288,0.28239,0.28239,0.531403
4,0.1585,0.347143,0.347143,0.589189
5,0.1276,0.241749,0.241749,0.491679
6,0.0888,0.262364,0.262364,0.512215
7,0.0668,0.273715,0.273715,0.523178
8,0.0574,0.298754,0.298754,0.546584
9,0.0527,0.289685,0.289685,0.538224
10,0.0457,0.437733,0.437733,0.661614


training_args.output_dir /home/commonlit/models/valhalla_distilbart-mnli-12-9-4


best_model_checkpoint /home/commonlit/models/valhalla_distilbart-mnli-12-9-4/checkpoint-815
result {'eval_loss': 0.24174857139587402, 'eval_mse': 0.24174857139587402, 'eval_rmse': 0.4916793406009674, 'eval_runtime': 4.1692, 'eval_samples_per_second': 56.845, 'epoch': 14.0, 'eval_mem_cpu_alloc_delta': 0, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 0, 'eval_mem_gpu_peaked_delta': 831624704}
train_bins 5: [1]


Some weights of the model checkpoint at /home/commonlit/models/distilbart/lm were not used when initializing BartModel: ['final_logits_bias', 'lm_head.weight']
- This IS expected if you are initializing BartModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


VBox(children=(Label(value=' 0.05MB of 0.05MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss,0.0269
train/learning_rate,3e-05
train/epoch,14.0
train/global_step,2282.0
_runtime,1287.0
_timestamp,1622819036.0
_step,30.0
eval/loss,0.24175
eval/mse,0.24175
eval/rmse,0.49168


0,1
train/loss,█▃▂▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,██▇▇▆▆▅▅▄▄▃▃▂▂▁
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
_runtime,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
_timestamp,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇███
eval/loss,▃█▁▂▁▁▁▂▂▃▁▂▁▁▁
eval/mse,▃█▁▂▁▁▁▂▂▃▁▂▁▁▁
eval/rmse,▄█▂▃▁▁▂▂▂▄▁▂▂▁▁


Epoch,Training Loss,Validation Loss,Mse,Rmse
1,1.1132,0.302302,0.302302,0.54982
2,0.3746,0.289528,0.289528,0.538078
3,0.2333,0.401804,0.401804,0.63388
4,0.1537,0.445543,0.445543,0.66749
5,0.107,0.253754,0.253754,0.50374
6,0.0894,0.295136,0.295136,0.543265
7,0.0765,0.301763,0.301763,0.54933
8,0.0553,0.350879,0.350879,0.59235
9,0.0548,0.238516,0.238516,0.488381
10,0.0454,0.342995,0.342995,0.585658


training_args.output_dir /home/commonlit/models/valhalla_distilbart-mnli-12-9-5


best_model_checkpoint /home/commonlit/models/valhalla_distilbart-mnli-12-9-5/checkpoint-2771
result {'eval_loss': 0.23296745121479034, 'eval_mse': 0.23296745121479034, 'eval_rmse': 0.4826670289039612, 'eval_runtime': 4.1553, 'eval_samples_per_second': 57.035, 'epoch': 26.0, 'eval_mem_cpu_alloc_delta': -262144, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 262144, 'eval_mem_gpu_peaked_delta': 831624704}
train_bins 6: [5]


Some weights of the model checkpoint at /home/commonlit/models/distilbart/lm were not used when initializing BartModel: ['final_logits_bias', 'lm_head.weight']
- This IS expected if you are initializing BartModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


VBox(children=(Label(value=' 0.05MB of 0.05MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss,0.0107
train/learning_rate,1e-05
train/epoch,26.0
train/global_step,4238.0
_runtime,2367.0
_timestamp,1622821413.0
_step,54.0
eval/loss,0.23297
eval/mse,0.23297
eval/rmse,0.48267


0,1
train/loss,█▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,██▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
_runtime,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
_timestamp,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
eval/loss,▃▃▇█▂▃▃▅▁▅▂▄▃▂▂▂▁▂▂▂▃▁▁▁▂▂▁
eval/mse,▃▃▇█▂▃▃▅▁▅▂▄▃▂▂▂▁▂▂▂▃▁▁▁▂▂▁
eval/rmse,▄▃▇█▂▃▄▅▁▅▂▄▃▂▂▂▁▂▂▂▃▁▁▁▂▃▁


Epoch,Training Loss,Validation Loss,Mse,Rmse
1,1.2911,1.443701,1.443701,1.201541
2,0.4009,0.217819,0.217819,0.46671
3,0.2578,0.327554,0.327554,0.572323
4,0.1845,0.195931,0.195931,0.442641
5,0.1247,0.305386,0.305386,0.552617
6,0.1032,0.258478,0.258478,0.508407
7,0.0708,0.240801,0.240801,0.490714
8,0.0588,0.358231,0.358231,0.598524
9,0.0504,0.19732,0.19732,0.444207
10,0.0439,0.212409,0.212409,0.460878


training_args.output_dir /home/commonlit/models/valhalla_distilbart-mnli-12-9-6


best_model_checkpoint /home/commonlit/models/valhalla_distilbart-mnli-12-9-6/checkpoint-2119
result {'eval_loss': 0.19326727092266083, 'eval_mse': 0.19326728582382202, 'eval_rmse': 0.4396217465400696, 'eval_runtime': 4.1676, 'eval_samples_per_second': 56.867, 'epoch': 22.0, 'eval_mem_cpu_alloc_delta': -66977792, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 66977792, 'eval_mem_gpu_peaked_delta': 831624704}
train_bins 7: [8]


Some weights of the model checkpoint at /home/commonlit/models/distilbart/lm were not used when initializing BartModel: ['final_logits_bias', 'lm_head.weight']
- This IS expected if you are initializing BartModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


VBox(children=(Label(value=' 0.05MB of 0.05MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss,0.0151
train/learning_rate,1e-05
train/epoch,22.0
train/global_step,3586.0
_runtime,2008.0
_timestamp,1622823429.0
_step,46.0
eval/loss,0.19327
eval/mse,0.19327
eval/rmse,0.43962


0,1
train/loss,█▄▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,██▇▇▇▆▆▆▅▅▅▅▄▄▄▃▃▃▂▂▂▁▁
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
_runtime,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
_timestamp,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
eval/loss,█▁▂▁▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/mse,█▁▂▁▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/rmse,█▁▂▁▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁


Epoch,Training Loss,Validation Loss,Mse,Rmse
1,1.0992,0.361428,0.361428,0.601189
2,0.3425,0.276389,0.276389,0.525727
3,0.2683,0.383329,0.383329,0.619136
4,0.1567,0.318419,0.318419,0.564287
5,0.1096,0.233274,0.233274,0.482985
6,0.0875,0.208624,0.208624,0.456754
7,0.0697,0.273953,0.273953,0.523405
8,0.0578,0.286662,0.286662,0.535408
9,0.0492,0.281239,0.281239,0.53032
10,0.0466,0.254545,0.254545,0.504525


training_args.output_dir /home/commonlit/models/valhalla_distilbart-mnli-12-9-7


best_model_checkpoint /home/commonlit/models/valhalla_distilbart-mnli-12-9-7/checkpoint-978
result {'eval_loss': 0.20862385630607605, 'eval_mse': 0.20862387120723724, 'eval_rmse': 0.45675361156463623, 'eval_runtime': 4.1554, 'eval_samples_per_second': 57.034, 'epoch': 15.0, 'eval_mem_cpu_alloc_delta': 0, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 0, 'eval_mem_gpu_peaked_delta': 831624704}
train_bins 8: [7]


Some weights of the model checkpoint at /home/commonlit/models/distilbart/lm were not used when initializing BartModel: ['final_logits_bias', 'lm_head.weight']
- This IS expected if you are initializing BartModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


VBox(children=(Label(value=' 0.05MB of 0.05MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss,0.0278
train/learning_rate,3e-05
train/epoch,15.0
train/global_step,2445.0
_runtime,1371.0
_timestamp,1622824808.0
_step,32.0
eval/loss,0.20862
eval/mse,0.20862
eval/rmse,0.45675


0,1
train/loss,█▄▂▂▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,██▇▇▆▆▅▅▄▄▃▃▂▂▁▁
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██████
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██████
_runtime,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
_timestamp,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
_step,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇███
eval/loss,▇▄█▅▂▁▄▄▄▃▂▂▂▂▂▁
eval/mse,▇▄█▅▂▁▄▄▄▃▂▂▂▂▂▁
eval/rmse,▇▄█▆▂▁▄▄▄▃▂▂▂▂▂▁


Epoch,Training Loss,Validation Loss,Mse,Rmse
1,1.0534,0.362739,0.362739,0.602278
2,0.3685,0.392976,0.392976,0.626878
3,0.231,0.416351,0.416351,0.645253
4,0.1641,0.321068,0.321068,0.566628
5,0.1239,0.321288,0.321288,0.566822
6,0.0965,0.273602,0.273602,0.52307
7,0.0809,0.343296,0.343296,0.585915
8,0.0622,0.467765,0.467765,0.683933
9,0.053,0.347668,0.347668,0.589634
10,0.0452,0.378757,0.378757,0.615432


training_args.output_dir /home/commonlit/models/valhalla_distilbart-mnli-12-9-8


best_model_checkpoint /home/commonlit/models/valhalla_distilbart-mnli-12-9-8/checkpoint-2119
result {'eval_loss': 0.2643921375274658, 'eval_mse': 0.2643921375274658, 'eval_rmse': 0.5141907334327698, 'eval_runtime': 4.1634, 'eval_samples_per_second': 56.925, 'epoch': 22.0, 'eval_mem_cpu_alloc_delta': 0, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 0, 'eval_mem_gpu_peaked_delta': 831624704}
train_bins 9: [6]


Some weights of the model checkpoint at /home/commonlit/models/distilbart/lm were not used when initializing BartModel: ['final_logits_bias', 'lm_head.weight']
- This IS expected if you are initializing BartModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


VBox(children=(Label(value=' 0.05MB of 0.05MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss,0.0129
train/learning_rate,1e-05
train/epoch,22.0
train/global_step,3586.0
_runtime,2002.0
_timestamp,1622826820.0
_step,46.0
eval/loss,0.26439
eval/mse,0.26439
eval/rmse,0.51419


0,1
train/loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,██▇▇▇▆▆▆▅▅▅▅▄▄▄▃▃▃▂▂▂▁▁
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
_runtime,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
_timestamp,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
eval/loss,▄▅▆▃▃▁▄█▄▅▂▂▁▁▂▃▄▃▂▁▂▃▁
eval/mse,▄▅▆▃▃▁▄█▄▅▂▂▁▁▂▃▄▃▂▁▂▃▁
eval/rmse,▅▆▆▃▃▁▄█▄▅▂▂▁▁▂▄▄▄▂▁▃▃▁


Epoch,Training Loss,Validation Loss,Mse,Rmse
1,0.8964,0.336719,0.336719,0.580275
2,0.3495,0.283872,0.283872,0.532796
3,0.2196,0.382977,0.382977,0.618851
4,0.1504,0.309515,0.309515,0.556341
5,0.1179,0.403884,0.403884,0.635519
6,0.0867,0.355286,0.355286,0.596058
7,0.0669,0.276818,0.276818,0.526135
8,0.0577,0.369419,0.369419,0.607799
9,0.0481,0.304568,0.304568,0.551877
10,0.0485,0.285782,0.285782,0.534586


training_args.output_dir /home/commonlit/models/valhalla_distilbart-mnli-12-9-9


best_model_checkpoint /home/commonlit/models/valhalla_distilbart-mnli-12-9-9/checkpoint-1141
result {'eval_loss': 0.27681806683540344, 'eval_mse': 0.27681809663772583, 'eval_rmse': 0.5261350274085999, 'eval_runtime': 4.138, 'eval_samples_per_second': 57.032, 'epoch': 16.0, 'eval_mem_cpu_alloc_delta': 0, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 0, 'eval_mem_gpu_peaked_delta': 831624704}
train_bins 10: [11]


Some weights of the model checkpoint at /home/commonlit/models/distilbart/lm were not used when initializing BartModel: ['final_logits_bias', 'lm_head.weight']
- This IS expected if you are initializing BartModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


VBox(children=(Label(value=' 0.05MB of 0.05MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss,0.0229
train/learning_rate,2e-05
train/epoch,16.0
train/global_step,2608.0
_runtime,1471.0
_timestamp,1622828299.0
_step,34.0
eval/loss,0.27682
eval/mse,0.27682
eval/rmse,0.52614


0,1
train/loss,█▆▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,██▇▇▆▆▅▅▅▄▄▃▃▂▂▁▁
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██████
train/global_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██████
_runtime,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
_timestamp,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
eval/loss,▄▁▆▂▇▅▁▅▂▁█▂▁▃▂▂▁
eval/mse,▄▁▆▂▇▅▁▅▂▁█▂▁▃▂▂▁
eval/rmse,▄▁▆▃▇▅▁▅▂▁█▂▁▄▂▂▁


Epoch,Training Loss,Validation Loss,Mse,Rmse
1,1.02,0.703058,0.703058,0.838486
2,0.3505,0.29764,0.29764,0.545564
3,0.2671,0.387953,0.387953,0.622859
4,0.1625,0.277614,0.277614,0.526891
5,0.1183,0.297521,0.297521,0.545455
6,0.0941,0.361552,0.361552,0.601292
7,0.069,0.331196,0.331196,0.575496
8,0.061,0.460011,0.460011,0.678241
9,0.0544,0.327841,0.327841,0.572574
10,0.0422,0.359667,0.359667,0.599722


training_args.output_dir /home/commonlit/models/valhalla_distilbart-mnli-12-9-10


best_model_checkpoint /home/commonlit/models/valhalla_distilbart-mnli-12-9-10/checkpoint-652
result {'eval_loss': 0.27761438488960266, 'eval_mse': 0.2776143550872803, 'eval_rmse': 0.5268912315368652, 'eval_runtime': 4.1582, 'eval_samples_per_second': 56.755, 'epoch': 13.0, 'eval_mem_cpu_alloc_delta': 0, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 0, 'eval_mem_gpu_peaked_delta': 831624704}
train_bins 11: [3]


Some weights of the model checkpoint at /home/commonlit/models/distilbart/lm were not used when initializing BartModel: ['final_logits_bias', 'lm_head.weight']
- This IS expected if you are initializing BartModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


VBox(children=(Label(value=' 0.05MB of 0.05MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
train/loss,0.0324
train/learning_rate,3e-05
train/epoch,13.0
train/global_step,2119.0
_runtime,1193.0
_timestamp,1622829501.0
_step,28.0
eval/loss,0.27761
eval/mse,0.27761
eval/rmse,0.52689


0,1
train/loss,█▅▂▂▁▁▁▁▁▁▁▁▁▁
train/learning_rate,█▇▇▆▆▅▅▄▄▃▃▂▂▁
train/epoch,▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
train/global_step,▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
_runtime,▁▁▁▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
_timestamp,▁▁▁▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇████
_step,▁▁▁▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇██
eval/loss,█▁▃▁▁▂▂▄▂▂▃▁▂▁
eval/mse,█▁▃▁▁▂▂▄▂▂▃▁▂▁
eval/rmse,█▁▃▁▁▃▂▄▂▃▃▁▂▁


Epoch,Training Loss,Validation Loss,Mse,Rmse
1,1.2762,0.549787,0.549787,0.741476
2,0.3527,0.482895,0.482895,0.694907
3,0.2101,0.711496,0.711496,0.843502
4,0.1563,0.366229,0.36623,0.605169
5,0.1063,0.367648,0.367648,0.60634
6,0.0831,0.361316,0.361316,0.601095
7,0.0732,0.584388,0.584388,0.764453
8,0.0668,0.339701,0.339701,0.582839
9,0.0488,0.44639,0.44639,0.668124
10,0.0437,0.366734,0.366734,0.605585


training_args.output_dir /home/commonlit/models/valhalla_distilbart-mnli-12-9-11


best_model_checkpoint /home/commonlit/models/valhalla_distilbart-mnli-12-9-11/checkpoint-1304
result {'eval_loss': 0.3397010266780853, 'eval_mse': 0.3397010266780853, 'eval_rmse': 0.582838773727417, 'eval_runtime': 4.1625, 'eval_samples_per_second': 56.696, 'epoch': 17.0, 'eval_mem_cpu_alloc_delta': 0, 'eval_mem_gpu_alloc_delta': 0, 'eval_mem_cpu_peaked_delta': 0, 'eval_mem_gpu_peaked_delta': 831624704}
CPU times: user 9h 57min 18s, sys: 2h 43s, total: 11h 58min 1s
Wall time: 5h 34min 26s


In [59]:
import shutil

# Folder that collects the best checkpoint of every fold. It is wiped and
# re-created so that files from a previous export never leak into the archive.
# Pure-Python calls are used instead of `!rm -rf` / `!mkdir -p` so the cell does
# not depend on a POSIX shell and is safe for paths containing spaces.
BEST_MODEL_FOLDER = MODELS_PATH/cfg.model_name/'best'
shutil.rmtree(BEST_MODEL_FOLDER, ignore_errors=True)  # a missing folder is fine, like `rm -rf`
BEST_MODEL_FOLDER.mkdir(parents=True, exist_ok=True)

In [60]:
# Average of the best per-fold validation RMSE values collected in `eval_rmses`.
'Mean best RMSE losses', np.array(eval_rmses).mean()

('Mean best RSME losses', 0.5081192329525948)

In [62]:
# Show the resolved export folder. Because `cfg.model_name` contains a '/',
# the folder is nested one level deeper (.../valhalla/distilbart-mnli-12-9/best).
BEST_MODEL_FOLDER

PosixPath('/home/commonlit/models/valhalla/distilbart-mnli-12-9/best')

In [68]:
from shutil import copyfile

# Files copied from each fold's output directory into its `tokenizer-{i}` folder.
# config.json is stored alongside the tokenizer files.
FOLD_ARTIFACTS = ('tokenizer.json', 'vocab.json', 'config.json')

def normalize_name(path_name):
    """Flatten the 'valhalla/' segment of a path string to 'valhalla_'.

    The model name contains a '/', while the per-fold output directories use
    the flattened form (e.g. `valhalla_distilbart-mnli-12-9-4`). Only the
    literal 'valhalla/' substring is rewritten; any other text is returned
    unchanged.

    Args:
        path_name: path rendered as a string.

    Returns:
        The string with every 'valhalla/' replaced by 'valhalla_'.
    """
    return path_name.replace('valhalla/', 'valhalla_')

# Gather, for every fold, the best checkpoint weights plus the tokenizer/config
# files into BEST_MODEL_FOLDER so the whole set can be archived in one go.
for i, best_model in enumerate(bestmodels):
    print(f'Processing {i}th model')
    best_model_file = f'{best_model}/pytorch_model.bin'
    if not Path(best_model_file).exists():
        # Best-effort: report the missing checkpoint and move on to the next fold.
        print(f'{best_model_file} is missing')
        continue

    copyfile(best_model_file, BEST_MODEL_FOLDER/f'{i}_pytorch_model.bin')

    # mkdir raises if the folder cannot be created, so no extra existence check is needed.
    tokenizer_path = BEST_MODEL_FOLDER/f'tokenizer-{i}'
    tokenizer_path.mkdir(parents=True, exist_ok=True)

    # Output directory of fold `i`, using the flattened model name.
    fold_dir = Path(normalize_name(f'{MODELS_PATH/cfg.model_name}-{i}'))
    for artifact in FOLD_ARTIFACTS:
        source_file = fold_dir/artifact
        assert source_file.exists(), f'{source_file} does not exist'
        copyfile(source_file, tokenizer_path/artifact)

Processing 0th model
Processing 1th model
Processing 2th model
Processing 3th model
Processing 4th model
Processing 5th model
Processing 6th model
Processing 7th model
Processing 8th model
Processing 9th model
Processing 10th model
Processing 11th model


In [69]:
import shutil

# Zip the contents of BEST_MODEL_FOLDER into `best_models.zip` next to it;
# the call returns the path of the archive it created.
shutil.make_archive(
    base_name=MODELS_PATH/cfg.model_name/'best_models',
    format='zip',
    root_dir=BEST_MODEL_FOLDER,
)

'/home/commonlit/models/valhalla/distilbart-mnli-12-9/best_models.zip'

In [None]:
# Read a saved checkpoint back from disk as a sanity check.
# map_location='cpu' lets a checkpoint saved on GPU load on a machine without one.
# NOTE(review): this path points at a `distilroberta-0` run, not at the distilbart
# folds exported above — confirm it is the checkpoint you intend to inspect.
state_dict = torch.load(str(MODELS_PATH/'distilroberta-0/checkpoint-105/pytorch_model.bin'), map_location='cpu')

In [None]:
# Build a fresh model instance to receive the checkpoint weights.
# CommonLitModel is defined elsewhere in this notebook (not visible in this section).
loaded_model = CommonLitModel()

In [None]:
# Copy the checkpoint tensors into the model.
# NOTE(review): assuming CommonLitModel is a torch.nn.Module, a key mismatch between
# the checkpoint and the model raises here — confirm the checkpoint matches the architecture.
loaded_model.load_state_dict(state_dict)