<a href="https://colab.research.google.com/github/cpptake/CommonLit/blob/main/pre_trained_roberta_solution_in_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Overview
This kernel is almost the same as [Lightweight Roberta solution in PyTorch](https://www.kaggle.com/andretugan/lightweight-roberta-solution-in-pytorch), but instead of "roberta-base", it starts from [Maunish's pre-trained model](https://www.kaggle.com/maunish/clrp-roberta-base).

Acknowledgments: some ideas were taken from kernels by [Torch](https://www.kaggle.com/rhtsingh) and [Maunish](https://www.kaggle.com/maunish).

ｺﾊﾞﾔｼNotebook

<https://www.kaggle.com/takeshikobayashi/pre-trained-roberta-solution-in-pytorch>

In [None]:
# Show the GPU (if any) attached to this runtime.
!nvidia-smi

Sat Jul  3 13:46:31 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   54C    P0    33W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip install -q kaggle
!mkdir /root/.kaggle
!cp /content/drive/MyDrive/Colab\ Notebooks/kaggle.json /root/.kaggle/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [None]:
# !mkdir /content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/clrp-roberta-base

In [None]:
# Install the Hugging Face library that provides the tokenizer, model and scheduler used below.
!pip install transformers
# NOTE(review): no visible cell in this notebook imports colorama - confirm it is still needed.
!pip install colorama



In [None]:
import os
import math
import random
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AdamW
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig
from transformers import get_cosine_schedule_with_warmup

from sklearn.model_selection import KFold,StratifiedKFold

import gc
gc.enable()

In [None]:
# ---- Cross-validation / training budget ------------------------------------
NUM_FOLDS = 5
NUM_EPOCHS = 1  # alternative value noted by the author: 3
BATCH_SIZE = 16

# Every excerpt is padded / truncated to this many tokens.
MAX_LEN = 248

# (val_rmse threshold, evaluation period in steps). The pairs are scanned in
# order and the first one whose threshold is <= the current val_rmse sets how
# many training steps pass before the next validation run.
EVAL_SCHEDULE = [
    (0.50, 16),
    (0.49, 8),
    (0.48, 4),
    (0.47, 2),
    (-1., 1),
]

# ---- Pre-trained weights and tokenizer (same directory on Drive) -----------
ROBERTA_PATH = "/content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/clrp-roberta-base/clrp_roberta_base"
TOKENIZER_PATH = "/content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/clrp-roberta-base/clrp_roberta_base"

# ---- Compute device: GPU when available, CPU otherwise ---------------------
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [None]:
def set_random_seed(random_seed):
    """Seed every random number generator this notebook touches.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU, the current
    CUDA device and all CUDA devices), sets the ``PYTHONHASHSEED``
    environment variable and turns on cuDNN's deterministic mode.
    """
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(random_seed)

    torch.backends.cudnn.deterministic = True

In [None]:
# Competition files, read from the mounted Drive.
train_df = pd.read_csv("/content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/train.csv")
test_df = pd.read_csv("/content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/test.csv")
submission_df = pd.read_csv("/content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/sample_submission.csv")

# Remove incomplete entries if any: rows whose target AND standard_error are
# both exactly 0. The index is renumbered afterwards so it is contiguous.
incomplete_rows = (train_df.target == 0) & (train_df.standard_error == 0)
train_df = train_df.loc[~incomplete_rows].reset_index(drop=True)

In [None]:
# Module-level tokenizer; LitDataset reads this global when it encodes excerpts.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

# Dataset

In [None]:
class LitDataset(Dataset):
    """Excerpt dataset that tokenizes all of its texts once, at construction.

    Uses the module-level ``tokenizer`` and ``MAX_LEN``. Items are
    ``(input_ids, attention_mask)`` when ``inference_only`` is true and
    ``(input_ids, attention_mask, target)`` otherwise.
    """

    def __init__(self, df, inference_only=False):
        super().__init__()

        self.df = df
        self.inference_only = inference_only
        self.text = df.excerpt.tolist()

        # Targets only exist for labelled data.
        if not inference_only:
            self.target = torch.tensor(df.target.values, dtype=torch.float32)

        # Pad / truncate every excerpt to exactly MAX_LEN tokens.
        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            max_length=MAX_LEN,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
        )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        features = tuple(
            torch.tensor(self.encoded[key][index])
            for key in ('input_ids', 'attention_mask')
        )
        if self.inference_only:
            return features
        return features + (self.target[index],)

# Model
The model is inspired by the one from [Maunish](https://www.kaggle.com/maunish/clrp-roberta-svm).

In [None]:
class LitModel(nn.Module):
    """Pre-trained encoder with an attention-pooling head and a linear regressor.

    The encoder is loaded from ROBERTA_PATH with hidden-state output enabled,
    hidden dropout set to 0.0 and layer_norm_eps set to 1e-7. The head sizes
    follow ``config.hidden_size`` instead of a hard-coded 768, so the module
    also builds for checkpoints with a different hidden width.
    """

    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(ROBERTA_PATH)
        config.update({"output_hidden_states": True,
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})

        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=config)

        # Width of one token's hidden state for the loaded checkpoint.
        hidden_size = config.hidden_size

        # Scores every token position, then normalizes the scores over the
        # sequence dimension (dim=1) so they sum to 1 per example.
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )

        # Maps the pooled context vector to a single prediction score.
        self.regressor = nn.Sequential(
            nn.Linear(hidden_size, 1)
        )

    def forward(self, input_ids, attention_mask):
        """Return one prediction per example, shape (batch, 1)."""
        roberta_output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask)

        # hidden_states holds the embedding output followed by one entry per
        # encoder layer; the last entry is the final encoder layer.
        last_layer_hidden_states = roberta_output.hidden_states[-1]

        # One weight per token position: (batch, seq_len, 1).
        # NOTE(review): attention_mask is only given to the encoder; this
        # softmax runs over every position, padding included. Left as is so
        # existing checkpoints keep producing the same predictions.
        weights = self.attention(last_layer_hidden_states)

        # Weighted average of the token states: (batch, hidden_size).
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)

        # Reduce the context vector to the prediction score.
        return self.regressor(context_vector)

In [None]:
def eval_mse(model, data_loader):
    """Evaluates the mean squared error of the |model| on |data_loader|.

    The model is put in eval mode and run without gradient tracking. Squared
    errors are summed over all batches and divided by the dataset size, so a
    smaller final batch is weighted correctly.
    """
    model.eval()
    squared_error_total = 0

    with torch.no_grad():
        for input_ids, attention_mask, target in data_loader:
            target = target.to(DEVICE)
            pred = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))

            batch_sse = nn.functional.mse_loss(pred.flatten(), target,
                                               reduction="sum")
            squared_error_total += batch_sse.item()

    return squared_error_total / len(data_loader.dataset)

In [None]:
def predict(model, data_loader):
    """Returns an np.array with predictions of the |model| on |data_loader|.

    The result has one entry per dataset item, in loader order. The model is
    put in eval mode and run without gradient tracking.
    """
    model.eval()

    num_samples = len(data_loader.dataset)
    result = np.zeros(num_samples)
    start = 0

    with torch.no_grad():
        for input_ids, attention_mask in data_loader:
            batch_pred = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))

            # Write this batch into its slot of the preallocated output.
            stop = start + batch_pred.shape[0]
            result[start:stop] = batch_pred.flatten().to("cpu")
            start = stop

    return result

In [None]:
def train(model, model_path, train_loader, val_loader,
          optimizer, scheduler=None, num_epochs=NUM_EPOCHS):
    """Fine-tune |model|, saving it whenever the validation RMSE improves.

    Validation runs every ``eval_period`` training steps. The period starts
    at ``EVAL_SCHEDULE[0][1]`` and, after each validation run, is reset to
    the period of the first ``(threshold, period)`` pair in EVAL_SCHEDULE
    whose threshold is <= the measured val_rmse.

    Args:
        model: network called as ``model(input_ids, attention_mask)``.
        model_path: file the best ``state_dict`` is written to.
        train_loader: yields ``(input_ids, attention_mask, target)`` batches.
        val_loader: loader handed to ``eval_mse``.
        optimizer: optimizer stepped once per batch.
        scheduler: optional LR scheduler stepped once per batch.
        num_epochs: number of passes over ``train_loader``.

    Returns:
        ``(best_val_rmse, pred_lists)``. ``best_val_rmse`` is None when no
        validation run happened. ``pred_lists`` has one entry per epoch;
        per-batch prediction collection is disabled, so each entry is an
        empty list.
    """
    best_val_rmse = None
    best_epoch = 0
    step = 0
    last_eval_step = 0
    eval_period = EVAL_SCHEDULE[0][1]

    # Kept only so the return shape stays (rmse, list-per-epoch).
    pred_list = []
    pred_lists = []

    start = time.time()

    for epoch in range(num_epochs):
        for batch_num, (input_ids, attention_mask, target) in enumerate(train_loader):
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            target = target.to(DEVICE)

            optimizer.zero_grad()

            # eval_mse() switches the model to eval mode, so re-enable
            # training mode on every batch.
            model.train()

            pred = model(input_ids, attention_mask)

            mse = nn.MSELoss(reduction="mean")(pred.flatten(), target)
            mse.backward()

            optimizer.step()
            if scheduler is not None:
                scheduler.step()

            if step >= last_eval_step + eval_period:
                # Evaluate the model on val_loader.
                elapsed_seconds = time.time() - start
                num_steps = step - last_eval_step
                print(f"\n{num_steps} steps took {elapsed_seconds:0.3} seconds")
                last_eval_step = step

                val_rmse = math.sqrt(eval_mse(model, val_loader))

                print(f"Epoch: {epoch} batch_num: {batch_num}",
                      f"val_rmse: {val_rmse:0.4}")

                # Evaluate more often as the validation score gets better.
                for rmse, period in EVAL_SCHEDULE:
                    if val_rmse >= rmse:
                        eval_period = period
                        break

                # `is None` rather than truthiness, so a best RMSE of exactly
                # 0.0 is not mistaken for "no evaluation yet".
                if best_val_rmse is None or val_rmse < best_val_rmse:
                    best_val_rmse = val_rmse
                    best_epoch = epoch
                    torch.save(model.state_dict(), model_path)
                    print(f"New best_val_rmse: {best_val_rmse:0.4}")
                else:
                    print(f"Still best_val_rmse: {best_val_rmse:0.4}",
                          f"(from epoch {best_epoch})")

                # Validation time is not counted in the next timing report.
                start = time.time()

            step += 1

        pred_lists.append(pred_list)

    return best_val_rmse, pred_lists

In [None]:
# pd_stack = train_df.copy()

# pd_stack = pd_stack.drop(columns = ['url_legal','license','excerpt','standard_error'])
# pd_stack['predict'] = 0


# pd_stack[pd_stack['id'] == 'c12129c31'] = 111
# pd_stack

In [None]:
def create_optimizer(model):
    """Build an AdamW optimizer with one parameter group per encoder tensor.

    Parameters are selected by their position in ``model.named_parameters()``:
    positions 0-196 are treated as encoder parameters, 199-202 as the
    attention head and 203+ as the regressor. Positions 197-198 are not
    handed to the optimizer at all.
    NOTE(review): presumably 197-198 are the encoder's pooler weight and
    bias, which forward() never uses - verify against the loaded model.

    Encoder tensors get a learning rate by position (2e-5 below 69, 5e-5 from
    69, 1e-4 from 133) and a weight decay of 0.0 when "bias" is in the name,
    else 0.01. The two head groups use AdamW's defaults.
    NOTE(review): 69 and 133 look like encoder-layer boundaries for a
    12-layer base model - confirm before reusing with another checkpoint.
    """
    all_named = list(model.named_parameters())

    encoder_named = all_named[:197]
    attention_params = [tensor for _, tensor in all_named[199:203]]
    regressor_params = [tensor for _, tensor in all_named[203:]]

    # Head groups first, then one group per encoder tensor, in model order.
    param_groups = [{"params": attention_params},
                    {"params": regressor_params}]

    for position, (name, tensor) in enumerate(encoder_named):
        if position >= 133:
            lr = 1e-4
        elif position >= 69:
            lr = 5e-5
        else:
            lr = 2e-5

        param_groups.append({
            "params": tensor,
            "weight_decay": 0.0 if "bias" in name else 0.01,
            "lr": lr,
        })

    return AdamW(param_groups)

In [None]:
# Sanity check: (rows, columns) of the training frame.
train_df.shape

(2833, 6)

In [None]:
# Free whatever earlier cells left behind before the folds allocate GPU memory.
gc.collect()

## For stacking output
# Copy of train_df without the listed columns, plus a zero-filled 'predict' column.
# NOTE(review): nothing in this cell writes to pd_stack afterwards.
pd_stack = train_df.copy()
pd_stack = pd_stack.drop(columns = ['url_legal','license','excerpt','standard_error'])
pd_stack['predict'] = 0
####

# NOTE(review): this module-level frame stays empty - nothing in this cell fills it.
hoge = pd.DataFrame()

SEED = 1000
list_val_rmse = []  # best validation RMSE of each finished fold
pred_lists = []

# Shuffled K-fold split, seeded so the folds are the same on every run.
kfold = KFold(n_splits=NUM_FOLDS, random_state=SEED, shuffle=True)

for fold, (train_indices, val_indices) in enumerate(kfold.split(train_df)):    
    print(f"\nFold {fold + 1}/{NUM_FOLDS}")
    # Relative path: the checkpoint is written to the current working directory.
    # NOTE(review): the Inference section loads model_N.pth from a Drive output
    # directory instead - confirm how the checkpoints get there.
    model_path = f"model_{fold + 1}.pth"
        
    # Each fold gets its own seed (SEED + fold).
    set_random_seed(SEED + fold)
    
    train_dataset = LitDataset(train_df.loc[train_indices])    
    val_dataset = LitDataset(train_df.loc[val_indices])    
        
    # drop_last=True on the training loader only; validation keeps every row.
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              drop_last=True, shuffle=True, num_workers=2)    
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                            drop_last=False, shuffle=False, num_workers=2)    
        
    # Re-seed right before the model is built.
    # NOTE(review): presumably so head initialization does not depend on how
    # much randomness the dataset / loader setup consumed - confirm.
    set_random_seed(SEED + fold)    
    
    model = LitModel().to(DEVICE)
    
    optimizer = create_optimizer(model)                        
    # Cosine decay over all training steps of this fold, after 50 warmup steps.
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_training_steps=NUM_EPOCHS * len(train_loader),
        num_warmup_steps=50)    
    
    # train() returns (best_val_rmse, pred_lists): `preds` is this fold's best
    # validation RMSE, and pred_lists is overwritten on every fold.
    preds,pred_lists = train(model, model_path, train_loader, val_loader, optimizer, scheduler=scheduler)
    
    list_val_rmse.append(preds)

    # Drop the model before the next fold builds a new one.
    del model
    gc.collect()
    
    # Printed inside the loop, so "Mean" is a running mean over the folds finished so far.
    print("\nPerformance estimates:")
    print(list_val_rmse)
    print("Mean:", np.array(list_val_rmse).mean())
    


Fold 1/5


Some weights of the model checkpoint at /content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/clrp-roberta-base/clrp_roberta_base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/clrp-roberta-base/clrp_roberta_base and a


16 steps took 6.88 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8982
New best_val_rmse: 0.8982

16 steps took 6.32 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6901
New best_val_rmse: 0.6901

16 steps took 6.32 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.7285
Still best_val_rmse: 0.6901 (from epoch 0)

16 steps took 6.32 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6206
New best_val_rmse: 0.6206

16 steps took 6.32 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5612
New best_val_rmse: 0.5612

16 steps took 6.32 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5853
Still best_val_rmse: 0.5612 (from epoch 0)

16 steps took 6.32 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.519
New best_val_rmse: 0.519

16 steps took 6.32 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5158
New best_val_rmse: 0.5158

Performance estimates:
[0.5158341249599311]
Mean: 0.5158341249599311

Fold 2/5


Some weights of the model checkpoint at /content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/clrp-roberta-base/clrp_roberta_base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/clrp-roberta-base/clrp_roberta_base and a


16 steps took 6.86 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.015
New best_val_rmse: 1.015

16 steps took 6.34 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7179
New best_val_rmse: 0.7179

16 steps took 6.32 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6887
New best_val_rmse: 0.6887

16 steps took 6.32 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6121
New best_val_rmse: 0.6121

16 steps took 6.33 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6337
Still best_val_rmse: 0.6121 (from epoch 0)

16 steps took 6.32 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.542
New best_val_rmse: 0.542

16 steps took 6.32 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5414
New best_val_rmse: 0.5414

16 steps took 6.33 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5203
New best_val_rmse: 0.5203

Performance estimates:
[0.5158341249599311, 0.5202513661652025]
Mean: 0.5180427455625668

Fold 3/5


Some weights of the model checkpoint at /content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/clrp-roberta-base/clrp_roberta_base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/clrp-roberta-base/clrp_roberta_base and a


16 steps took 6.86 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9905
New best_val_rmse: 0.9905

16 steps took 6.33 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8327
New best_val_rmse: 0.8327

16 steps took 6.32 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.791
New best_val_rmse: 0.791

16 steps took 6.33 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6647
New best_val_rmse: 0.6647

16 steps took 6.32 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.657
New best_val_rmse: 0.657

16 steps took 6.32 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.583
New best_val_rmse: 0.583

16 steps took 6.32 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5804
New best_val_rmse: 0.5804

16 steps took 6.33 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5458
New best_val_rmse: 0.5458

Performance estimates:
[0.5158341249599311, 0.5202513661652025, 0.5457709919746703]
Mean: 0.5272854943666013

Fold 4/5


Some weights of the model checkpoint at /content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/clrp-roberta-base/clrp_roberta_base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/clrp-roberta-base/clrp_roberta_base and a


16 steps took 6.86 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.085
New best_val_rmse: 1.085

16 steps took 6.33 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7577
New best_val_rmse: 0.7577

16 steps took 6.33 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6035
New best_val_rmse: 0.6035

16 steps took 6.33 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6662
Still best_val_rmse: 0.6035 (from epoch 0)

16 steps took 6.32 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6299
Still best_val_rmse: 0.6035 (from epoch 0)

16 steps took 6.33 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5688
New best_val_rmse: 0.5688

16 steps took 6.32 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5643
New best_val_rmse: 0.5643

16 steps took 6.33 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5345
New best_val_rmse: 0.5345

Performance estimates:
[0.5158341249599311, 0.5202513661652025, 0.5457709919746703, 0.5345460208272431]
Mean: 0.5291006259817618

Fold 5/5


Some weights of the model checkpoint at /content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/clrp-roberta-base/clrp_roberta_base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/clrp-roberta-base/clrp_roberta_base and a


16 steps took 6.86 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9232
New best_val_rmse: 0.9232

16 steps took 6.33 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7569
New best_val_rmse: 0.7569

16 steps took 6.33 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6911
New best_val_rmse: 0.6911

16 steps took 6.33 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6214
New best_val_rmse: 0.6214

16 steps took 6.32 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.5909
New best_val_rmse: 0.5909

16 steps took 6.33 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5567
New best_val_rmse: 0.5567

16 steps took 6.33 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5423
New best_val_rmse: 0.5423

16 steps took 6.32 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5124
New best_val_rmse: 0.5124

Performance estimates:
[0.5158341249599311, 0.5202513661652025, 0.5457709919746703, 0.5345460208272431, 0.5123893558932818]
Mean: 0.5257583719640657


In [None]:
# Debug: length of each entry of pred_lists. The training cell reassigns
# pred_lists on every fold, so this shows the value from the last fold only.

[len(v) for v in pred_lists]

[141]

In [None]:
# NOTE(review): the module-level `hoge` is created as an empty DataFrame in the
# training cell and no visible cell adds a 'pred' column to it, so this lookup
# appears to depend on stale kernel state - it will not reproduce the output
# below after Restart & Run All.
aaa = hoge['pred'][0]
aaa

[tensor([-0.2975, -0.2553, -0.2861, -0.2735, -0.3167, -0.3550, -0.2259, -0.3057,
         -0.3544, -0.3047, -0.2692, -0.2288, -0.2226, -0.2275, -0.2747, -0.2764],
        device='cuda:0', grad_fn=<ViewBackward>),
 tensor([-0.3647, -0.2856, -0.2904, -0.2996, -0.3314, -0.2874, -0.2392, -0.2823,
         -0.2933, -0.2674, -0.2512, -0.3316, -0.2344, -0.2427, -0.3171, -0.2846],
        device='cuda:0', grad_fn=<ViewBackward>),
 tensor([-0.3424, -0.3121, -0.3118, -0.3148, -0.3647, -0.2723, -0.3455, -0.3420,
         -0.3433, -0.3159, -0.2254, -0.2746, -0.2995, -0.3086, -0.3243, -0.3777],
        device='cuda:0', grad_fn=<ViewBackward>),
 tensor([-0.3713, -0.2858, -0.3152, -0.3297, -0.2515, -0.3494, -0.2898, -0.3975,
         -0.4229, -0.4976, -0.3583, -0.3149, -0.3341, -0.4082, -0.3028, -0.3944],
        device='cuda:0', grad_fn=<ViewBackward>),
 tensor([-0.3624, -0.4318, -0.4142, -0.3841, -0.4036, -0.3909, -0.4430, -0.3281,
         -0.4299, -0.3678, -0.3949, -0.3921, -0.4031, -0.4093, -0.3

In [None]:
# NOTE(review): `hoge` is created empty in the training cell and never filled
# by visible code; the shape shown below likely comes from an earlier session.
hoge.shape

(141, 2)

In [None]:
# Same check as the earlier train_df.shape cell: (rows, columns) of the training frame.
train_df.shape

(2833, 6)

In [None]:
# Despite the name, `preds` holds the best validation RMSE returned by train() for the last fold.
preds

0.5123893558932818

# Inference

In [None]:
# NOTE(review): the next cell builds test_dataset again with the same arguments, so this cell looks redundant.
test_dataset = LitDataset(test_df, inference_only=True)

In [None]:
# One row of test predictions per fold model (as many rows as folds were trained).
all_predictions = np.zeros((len(list_val_rmse), len(test_df)))

test_dataset = LitDataset(test_df, inference_only=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                         drop_last=False, shuffle=False, num_workers=2)

for index in range(len(list_val_rmse)):
    # NOTE(review): the training cell saves to "model_N.pth" in the working
    # directory, while this loads from the Drive output directory - confirm
    # these are the checkpoints you intend to use.
    model_path = f"/content/drive/MyDrive/CommonLit/output/pre-trained-roberta-solution-in-pytorch/model_{index + 1}.pth"
    print(f"\nUsing {model_path}")

    model = LitModel()
    # map_location remaps tensors saved on a GPU, so the checkpoint also
    # loads when DEVICE is "cpu".
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.to(DEVICE)

    all_predictions[index] = predict(model, test_loader)

    # Release the model before the next checkpoint is loaded.
    del model
    gc.collect()


Using /content/drive/MyDrive/CommonLit/output/pre-trained-roberta-solution-in-pytorch/model_1.pth


Some weights of the model checkpoint at /content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/clrp-roberta-base/clrp_roberta_base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/clrp-roberta-base/clrp_roberta_base and a


Using /content/drive/MyDrive/CommonLit/output/pre-trained-roberta-solution-in-pytorch/model_2.pth


Some weights of the model checkpoint at /content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/clrp-roberta-base/clrp_roberta_base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/clrp-roberta-base/clrp_roberta_base and a


Using /content/drive/MyDrive/CommonLit/output/pre-trained-roberta-solution-in-pytorch/model_3.pth


Some weights of the model checkpoint at /content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/clrp-roberta-base/clrp_roberta_base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/clrp-roberta-base/clrp_roberta_base and a


Using /content/drive/MyDrive/CommonLit/output/pre-trained-roberta-solution-in-pytorch/model_4.pth


Some weights of the model checkpoint at /content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/clrp-roberta-base/clrp_roberta_base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/clrp-roberta-base/clrp_roberta_base and a


Using /content/drive/MyDrive/CommonLit/output/pre-trained-roberta-solution-in-pytorch/model_5.pth


Some weights of the model checkpoint at /content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/clrp-roberta-base/clrp_roberta_base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /content/drive/MyDrive/CommonLit/input/commonlitreadabilityprize/clrp-roberta-base/clrp_roberta_base and a

In [None]:
# Ensemble: average the per-fold predictions for every test excerpt.
predictions = all_predictions.mean(axis=0)

# Item assignment always writes a real column. Attribute assignment
# (submission_df.target = ...) would only set a Python attribute if the
# "target" column were ever missing, and the CSV would be written unchanged.
submission_df["target"] = predictions
print(submission_df)
submission_df.to_csv("/content/drive/MyDrive/CommonLit/output/pre-trained-roberta-solution-in-pytorch/submission.csv", index=False)

In [None]:
# Raw per-model predictions: one row per fold model, one column per test excerpt.
all_predictions

array([[-0.4110876 , -0.74669015, -0.40717465, -2.30235672, -1.78312433,
        -1.41173363,  0.23158659],
       [-0.32000226, -0.51885808, -0.2224544 , -2.41730547, -1.62717474,
        -1.00956726,  0.30301461],
       [-0.52017486, -0.5785206 , -0.33318728, -2.56297708, -1.76609135,
        -1.57540727,  0.17660873],
       [-0.52818757, -0.60989404, -0.32330498, -2.55047774, -1.77477276,
        -1.29904222,  0.28325811],
       [-0.41792297, -0.70612031, -0.42490819, -2.6773386 , -1.92220187,
        -1.48353493,  0.19137977]])