In [1]:
import os
import math
import random
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AdamW
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig
from transformers import get_cosine_schedule_with_warmup,get_cosine_with_hard_restarts_schedule_with_warmup

from sklearn.model_selection import KFold

import gc
gc.enable()

In [2]:
NUM_FOLDS = 5
NUM_EPOCHS = 3
BATCH_SIZE = 12
MAX_LEN = 248
SEED = 1000
WORKERS = 4
EVAL_SCHEDULE = [(0.50, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1., 1)]
ROBERTA_LARGE_PATH = "../input/clrp-roberta-large/clrp_roberta_large"
TOKENIZER_LARGET_PATH = "../input/clrp-roberta-large/clrp_roberta_large"
ROBERTA_PATH = "/kaggle/input/roberta-base"
TOKENIZER_PATH = "/kaggle/input/roberta-base"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
tokenizer_base = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
tokenizer_large = AutoTokenizer.from_pretrained(TOKENIZER_LARGET_PATH)

In [4]:
def set_random_seed(random_seed):
    random.seed(random_seed)
    np.random.seed(random_seed)
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    torch.backends.cudnn.deterministic = True

In [5]:
train_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/train.csv")

# Remove incomplete entries if any.
train_df.drop(train_df[(train_df.target == 0) & (train_df.standard_error == 0)].index,
              inplace=True)
train_df.reset_index(drop=True, inplace=True)

test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
submission_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/sample_submission.csv")

# Dataset

In [6]:
class LitDataset(Dataset):
    def __init__(self, df, inference_only=False):
        super().__init__()

        self.df = df        
        self.inference_only = inference_only
        self.text = df.excerpt.tolist()
        #self.text = [text.replace("\n", " ") for text in self.text]
        
        if not self.inference_only:
            self.target = torch.tensor(df.target.values, dtype=torch.float32)        
    
        self.encoded = tokenizer_base.batch_encode_plus(
            self.text,
            padding = 'max_length',            
            max_length = MAX_LEN,
            truncation = True,
            return_attention_mask=True
        )        
 

    def __len__(self):
        return len(self.df)

    
    def __getitem__(self, index):        
        input_ids = torch.tensor(self.encoded['input_ids'][index])
        attention_mask = torch.tensor(self.encoded['attention_mask'][index])
        
        if self.inference_only:
            return (input_ids, attention_mask)            
        else:
            target = self.target[index]
            return (input_ids, attention_mask, target)

# Model
The model is inspired by the one from [Maunish](https://www.kaggle.com/maunish/clrp-roberta-svm).

In [7]:
class LitModel(nn.Module):
    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(ROBERTA_PATH)
        config.update({"output_hidden_states":True, 
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})                       
        
        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=config)  
            
        self.attention = nn.Sequential(            
            nn.Linear(768, 512),            
            nn.Tanh(),                       
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )        

        self.regressor = nn.Sequential(                        
            nn.Linear(768, 1)                        
        )
        

    def forward(self, input_ids, attention_mask):
        roberta_output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask)        

        # There are a total of 13 layers of hidden states.
        # 1 for the embedding layer, and 12 for the 12 Roberta layers.
        # We take the hidden states from the last Roberta layer.
        last_layer_hidden_states = roberta_output.hidden_states[-1]

        # The number of cells is MAX_LEN.
        # The size of the hidden state of each cell is 768 (for roberta-base).
        # In order to condense hidden states of all cells to a context vector,
        # we compute a weighted average of the hidden states of all cells.
        # We compute the weight of each cell, using the attention neural network.
        weights = self.attention(last_layer_hidden_states)
                
        # weights.shape is BATCH_SIZE x MAX_LEN x 1
        # last_layer_hidden_states.shape is BATCH_SIZE x MAX_LEN x 768        
        # Now we compute context_vector as the weighted average.
        # context_vector.shape is BATCH_SIZE x 768
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)        
        
        # Now we reduce the context vector to the prediction score.
        return self.regressor(context_vector)

In [8]:
class LitModelBig(nn.Module):
    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(ROBERTA_LARGE_PATH)
        config.update({"output_hidden_states":True, 
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})                       
        
        self.roberta = AutoModel.from_pretrained(ROBERTA_LARGE_PATH, config=config)  
            
        self.attention = nn.Sequential(            
            nn.Linear(1024, 512),            
            nn.Tanh(),                       
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )        

        self.regressor = nn.Sequential(                        
            nn.Linear(1024, 1)                        
        )
        

    def forward(self, input_ids, attention_mask):
        roberta_output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask)        

        # There are a total of 13 layers of hidden states.
        # 1 for the embedding layer, and 12 for the 12 Roberta layers.
        # We take the hidden states from the last Roberta layer.
        last_layer_hidden_states = roberta_output.hidden_states[-1]

        # The number of cells is MAX_LEN.
        # The size of the hidden state of each cell is 768 (for roberta-base).
        # In order to condense hidden states of all cells to a context vector,
        # we compute a weighted average of the hidden states of all cells.
        # We compute the weight of each cell, using the attention neural network.
        weights = self.attention(last_layer_hidden_states)
                
        # weights.shape is BATCH_SIZE x MAX_LEN x 1
        # last_layer_hidden_states.shape is BATCH_SIZE x MAX_LEN x 768        
        # Now we compute context_vector as the weighted average.
        # context_vector.shape is BATCH_SIZE x 768
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)        
        
        # Now we reduce the context vector to the prediction score.
        return self.regressor(context_vector)

In [9]:
def eval_mse(model, data_loader, file_indx):
    """Evaluates the mean squared error of the |model| on |data_loader|"""
    model.eval()            
    mse_sum = 0
    tempoof= pd.DataFrame(columns=['id','pred','target'])
    
    with torch.no_grad():
        
        for batch_num, (input_ids, attention_mask, target) in enumerate(data_loader):
            file_ids_batch_start = (batch_num) * BATCH_SIZE
            file_ids_batch_end = (batch_num) * BATCH_SIZE + BATCH_SIZE 
            input_ids = input_ids.to(DEVICE)

            attention_mask = attention_mask.to(DEVICE)                        
            target = target.to(DEVICE)
            
            pred = model(input_ids, attention_mask)                       
            
            numpy_preds = torch.reshape(pred, (-1,)).cpu()
            numpy_target = torch.reshape(target, (-1,)).cpu()

            
            ids = train_df.loc[file_indx].id[file_ids_batch_start:file_ids_batch_end].values
            preds = numpy_preds.cpu().detach().numpy()
            targets = numpy_target.cpu().detach().numpy()
            
            tmp = pd.DataFrame({"id":ids, 'pred' : preds, 'targets': targets})
            tempoof = pd.concat([tempoof,tmp])
            mse_sum += nn.MSELoss(reduction="sum")(pred.flatten(), target).item()
            

    return mse_sum / len(data_loader.dataset), tempoof

In [10]:
def predict(model, data_loader):
    """Returns an np.array with predictions of the |model| on |data_loader|"""
    model.eval()

    result = np.zeros(len(data_loader.dataset))    
    index = 0
    
    with torch.no_grad():
        for batch_num, (input_ids, attention_mask) in enumerate(data_loader):
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
                        
            pred = model(input_ids, attention_mask)                        

            result[index : index + pred.shape[0]] = pred.flatten().to("cpu")
            index += pred.shape[0]

    return result

In [11]:
oof = pd.DataFrame(columns=['id','target'])
oof.set_index('id', inplace=True)

### Commonlit-roberta-0467     

In [12]:
list_val_rmse = []

kfold = KFold(n_splits=NUM_FOLDS, random_state=SEED, shuffle=True)

model_name = 'roberta-0467'
for fold, (train_indices, val_indices) in enumerate(kfold.split(train_df)):    
    print(f"\nFold {fold + 1}/{NUM_FOLDS}")

    model_path = f"../input/commonlit-roberta-0467/model_{fold + 1}.pth"
    
    print(f"\nUsing {model_path}")
    
    set_random_seed(SEED + fold)
    
    
    train_dataset = LitDataset(train_df.loc[train_indices])  
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              drop_last=True, shuffle=True, num_workers=WORKERS)
    val_dataset = LitDataset(train_df.loc[val_indices])  
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,drop_last=False, shuffle=False, num_workers=WORKERS)    
    
    
    model = LitModel()
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))    
    model.to(DEVICE)
    

    crnt_val_scor,oof_tmp = eval_mse(model, val_loader,val_indices)
    val_rmse = math.sqrt(crnt_val_scor)
    oof = pd.concat([oof,oof_tmp])
    #oof.merge(oof_tmp, how='left',on='id')
oof.to_csv('oof_score_roberta-0467.csv',index=False)    



Fold 1/5

Using ../input/commonlit-roberta-0467/model_1.pth


Some weights of the model checkpoint at /kaggle/input/roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Fold 2/5

Using ../input/commonlit-roberta-0467/model_2.pth


Some weights of the model checkpoint at /kaggle/input/roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Fold 3/5

Using ../input/commonlit-roberta-0467/model_3.pth


Some weights of the model checkpoint at /kaggle/input/roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Fold 4/5

Using ../input/commonlit-roberta-0467/model_4.pth


Some weights of the model checkpoint at /kaggle/input/roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Fold 5/5

Using ../input/commonlit-roberta-0467/model_5.pth


Some weights of the model checkpoint at /kaggle/input/roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### commonlit-roberta-large

In [13]:
list_val_rmse = []

kfold = KFold(n_splits=NUM_FOLDS, random_state=SEED, shuffle=True)
oof = pd.DataFrame(columns=['id','target'])
oof.set_index('id', inplace=True)
model_name = 'roberta_large_self_trained'

for fold, (train_indices, val_indices) in enumerate(kfold.split(train_df)):
    print(f"\nFold {fold + 1}/{NUM_FOLDS}")

    model_path = f"../input/robertalarge/model_{fold + 1}.pth"
    
    print(f"\nUsing {model_path}")
    
    set_random_seed(SEED + fold)
    
    
    train_dataset = LitDataset(train_df.loc[train_indices])  
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              drop_last=True, shuffle=True, num_workers=WORKERS)
    val_dataset = LitDataset(train_df.loc[val_indices])  
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,drop_last=False, shuffle=False, num_workers=WORKERS)    
    
    
    model = LitModelBig()
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))    
    model.to(DEVICE)
    

    crnt_val_scor,oof_tmp = eval_mse(model, val_loader,val_indices)
    val_rmse = math.sqrt(crnt_val_scor)
    
    #oof.merge(oof_tmp, how='left',on='id')
    oof = pd.concat([oof,oof_tmp])
oof.to_csv('oof_score_roberta_large_self_trained.csv',index=False)        


Fold 1/5

Using ../input/robertalarge/model_1.pth


Some weights of the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN


Fold 2/5

Using ../input/robertalarge/model_2.pth


Some weights of the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN


Fold 3/5

Using ../input/robertalarge/model_3.pth


Some weights of the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN


Fold 4/5

Using ../input/robertalarge/model_4.pth


Some weights of the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN


Fold 5/5

Using ../input/robertalarge/model_5.pth


Some weights of the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN

In [14]:
test = train_df #test_df

from glob import glob
import os
import matplotlib.pyplot as plt
import json
from collections import defaultdict
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.optimizer import Optimizer
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import (
    Dataset, DataLoader, 
    SequentialSampler, RandomSampler
)
from transformers import RobertaConfig
from transformers import (
    get_cosine_schedule_with_warmup, 
    get_cosine_with_hard_restarts_schedule_with_warmup
)
from transformers import RobertaTokenizer
from transformers import RobertaModel
from IPython.display import clear_output
from tqdm import tqdm, trange

def convert_examples_to_features(data, tokenizer, max_len, is_test=False):
    data = data.replace('\n', '')
    tok = tokenizer.encode_plus(
        data, 
        max_length=max_len, 
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True
    )
    curr_sent = {}
    padding_length = max_len - len(tok['input_ids'])
    curr_sent['input_ids'] = tok['input_ids'] + ([0] * padding_length)
    curr_sent['token_type_ids'] = tok['token_type_ids'] + \
        ([0] * padding_length)
    curr_sent['attention_mask'] = tok['attention_mask'] + \
        ([0] * padding_length)
    return curr_sent

class DatasetRetriever(Dataset):
    def __init__(self, data, tokenizer, max_len, is_test=False):
        self.data = data
        if 'excerpt' in self.data.columns:
            self.excerpts = self.data.excerpt.values.tolist()
        else:
            self.excerpts = self.data.text.values.tolist()
        self.targets = self.data.target.values.tolist()
        self.tokenizer = tokenizer
        self.is_test = is_test
        self.max_len = max_len
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, item):
        if not self.is_test:
            excerpt, label = self.excerpts[item], self.targets[item]
            features = convert_examples_to_features(
                excerpt, self.tokenizer, 
                self.max_len, self.is_test
            )
            return {
                'input_ids':torch.tensor(features['input_ids'], dtype=torch.long),
                'token_type_ids':torch.tensor(features['token_type_ids'], dtype=torch.long),
                'attention_mask':torch.tensor(features['attention_mask'], dtype=torch.long),
                'label':torch.tensor(label, dtype=torch.double),
            }
        else:
            excerpt = self.excerpts[item]
            features = convert_examples_to_features(
                excerpt, self.tokenizer, 
                self.max_len, self.is_test
            )
            return {
                'input_ids':torch.tensor(features['input_ids'], dtype=torch.long),
                'token_type_ids':torch.tensor(features['token_type_ids'], dtype=torch.long),
                'attention_mask':torch.tensor(features['attention_mask'], dtype=torch.long),
            }

class CommonLitModel(nn.Module):
    def __init__(
        self, 
        model_name, 
        config,  
        multisample_dropout=False,
        output_hidden_states=False
    ):
        super(CommonLitModel, self).__init__()
        self.config = config
        self.roberta = RobertaModel.from_pretrained(
            model_name, 
            output_hidden_states=output_hidden_states
        )
        self.layer_norm = nn.LayerNorm(config.hidden_size)
        if multisample_dropout:
            self.dropouts = nn.ModuleList([
                nn.Dropout(0.5) for _ in range(5)
            ])
        else:
            self.dropouts = nn.ModuleList([nn.Dropout(0.3)])
        self.regressor = nn.Linear(config.hidden_size, 1)
        self._init_weights(self.layer_norm)
        self._init_weights(self.regressor)
 
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
 
    def forward(
        self, 
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None
    ):
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        sequence_output = outputs[1]
        sequence_output = self.layer_norm(sequence_output)
 
        # multi-sample dropout
        for i, dropout in enumerate(self.dropouts):
            if i == 0:
                logits = self.regressor(dropout(sequence_output))
            else:
                logits += self.regressor(dropout(sequence_output))
        
        logits /= len(self.dropouts)
 
        # calculate loss
        loss = None
        if labels is not None:
            loss_fn = torch.nn.MSELoss()
            logits = logits.view(-1).to(labels.dtype)
            loss = torch.sqrt(loss_fn(logits, labels.view(-1)))
        
        output = (logits,) + outputs[1:]
        return ((loss,) + output) if loss is not None else output

def make_model(model_name, num_labels=1):
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    config = RobertaConfig.from_pretrained(model_name)
    config.update({'num_labels':num_labels})
    model = CommonLitModel(model_name, config=config)
    return model, tokenizer

def make_loader(data, tokenizer, max_len, batch_size,is_test):
    
    test_dataset = DatasetRetriever(data, tokenizer, max_len, is_test=is_test)
    test_sampler = SequentialSampler(test_dataset)
    test_loader = DataLoader(
        test_dataset, 
        batch_size=batch_size, 
        sampler=test_sampler, 
        pin_memory=False, 
        drop_last=False, 
        num_workers=WORKERS
    )

    return test_loader

class Evaluator:
    def __init__(self, model, scalar=None,files_ids=None):
        self.model = model
        self.scalar = scalar
        self.files_ids = files_ids

    def evaluate(self, data_loader, tokenizer):
        preds = []
        self.model.eval()
        total_loss = 0
        
        tempoof= pd.DataFrame(columns=['id','pred','target'])
        
        with torch.no_grad():
            for batch_idx, batch_data in enumerate(data_loader):
                
                file_ids_batch_start = (batch_idx) * BATCH_SIZE
                file_ids_batch_end = (batch_idx) * BATCH_SIZE + BATCH_SIZE 
                
                input_ids, attention_mask, token_type_ids,labels = batch_data['input_ids'], \
                    batch_data['attention_mask'], batch_data['token_type_ids'], batch_data['label']
                
                input_ids, attention_mask, token_type_ids, labels = input_ids.cuda(), \
                    attention_mask.cuda(), token_type_ids.cuda(), labels.cuda()
                
                ids = train_df.loc[self.files_ids].id[file_ids_batch_start:file_ids_batch_end].values
                
                
                if self.scalar is not None:
                    with torch.cuda.amp.autocast():
                        outputs = self.model(
                            input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            labels=labels
                        )
                else:
                    outputs = self.model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids
                    )
                
                logits = outputs[0].detach().cpu().numpy().squeeze().tolist()
                
                preds += logits

                #numpy_preds = torch.reshape(preds, (-1,)).cpu()
                numpy_target = torch.reshape(labels, (-1,)).cpu()
                temp_preds = logits
                targets = numpy_target.cpu().detach().numpy()
                #print("IDS", ids)
                #print("Targets", targets)
                #print("Preds",temp_preds)
                
                tmp = pd.DataFrame({"id":ids, 'pred' : temp_preds, 'target': targets})
                tempoof = pd.concat([tempoof,tmp])
        return preds, tempoof

def config(fold, model_name, load_model_path):
    
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    
    max_len = 250
    batch_size = BATCH_SIZE

    model, tokenizer = make_model(model_name=model_name, num_labels=1)
    model.load_state_dict(
        torch.load(f'{load_model_path}/model{fold}.bin')
    )
    test_loader = make_loader(
        test, tokenizer, max_len=max_len,
        batch_size=batch_size,
        is_test=False
    )

    if torch.cuda.device_count() >= 1:
        print('Model pushed to {} GPU(s), type {}.'.format(
            torch.cuda.device_count(), 
            torch.cuda.get_device_name(0))
        )
        model = model.cuda() 
    else:
        raise ValueError('CPU training is not supported')

    # scaler = torch.cuda.amp.GradScaler()
    scaler = None
    return (
        model, tokenizer, 
        test_loader, scaler
    )

def run(fold=0, model_name=None, load_model_path=None, files_ids=None):
    
    model, tokenizer, test_loader, scaler = config(fold, model_name, load_model_path)
    
    evaluator = Evaluator(model, scaler, files_ids)

    test_time_list = []

    torch.cuda.synchronize()
    tic1 = time.time()
    
    preds, tempoof = evaluator.evaluate(test_loader, tokenizer)

    torch.cuda.synchronize()
    tic2 = time.time() 
    test_time_list.append(tic2 - tic1)
    
    del model, tokenizer, test_loader, scaler
    gc.collect()
    torch.cuda.empty_cache()
    
    return preds, tempoof

In [15]:
kfold = KFold(n_splits=NUM_FOLDS, random_state=SEED, shuffle=True)

oof = pd.DataFrame(columns=['id','target'])
oof.set_index('id', inplace=True)

model_names = ['commonlit-roberta-base-i','roberta-large-itptfit','commonlit-roberta-large-ii' ]
print("Using",model_names[0])
for fold, (train_indices, val_indices) in enumerate(kfold.split(train_df)):
    print(f"\nFold {fold + 1}/{NUM_FOLDS}")
    
    set_random_seed(SEED + fold)
    
    test = train_df.loc[val_indices]

    preds, tempoof = run(fold, '../input/roberta-base/', '../input/commonlit-roberta-base-i/',val_indices) 
    oof = pd.concat([oof,tempoof])

oof.to_csv(f'oof_{model_names[0]}.csv',index=False)

oof = pd.DataFrame(columns=['id','target'])
oof.set_index('id', inplace=True)

print("Using",model_names[1])
for fold, (train_indices, val_indices) in enumerate(kfold.split(train_df)):
    print(f"\nFold {fold + 1}/{NUM_FOLDS}")
    
    set_random_seed(SEED + fold)
    
    test = train_df.loc[val_indices]

    preds, tempoof = run(fold, '../input/clrp-roberta-large/clrp_roberta_large/', '../input/roberta-large-itptfit/',val_indices) 
    oof = pd.concat([oof,tempoof])

oof.to_csv(f'oof_{model_names[1]}.csv',index=False)

oof = pd.DataFrame(columns=['id','target'])
oof.set_index('id', inplace=True)

print("Using",model_names[2])
for fold, (train_indices, val_indices) in enumerate(kfold.split(train_df)):
    print(f"\nFold {fold + 1}/{NUM_FOLDS}")
    
    set_random_seed(SEED + fold)
    
    test = train_df.loc[val_indices]

    preds, tempoof = run(fold, '../input/clrp-roberta-large/clrp_roberta_large/', '../input/roberta-large-itptfit/',val_indices) 
    oof = pd.concat([oof,tempoof])

oof.to_csv(f'oof_{model_names[2]}.csv',index=False)

Using commonlit-roberta-base-i

Fold 1/5


Some weights of the model checkpoint at ../input/roberta-base/ were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.

Fold 2/5


Some weights of the model checkpoint at ../input/roberta-base/ were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.

Fold 3/5


Some weights of the model checkpoint at ../input/roberta-base/ were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.

Fold 4/5


Some weights of the model checkpoint at ../input/roberta-base/ were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.

Fold 5/5


Some weights of the model checkpoint at ../input/roberta-base/ were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Using roberta-large-itptfit

Fold 1/5


Some weights of the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large/ were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large/ and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRA

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.

Fold 2/5


Some weights of the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large/ were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large/ and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRA

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.

Fold 3/5


Some weights of the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large/ were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large/ and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRA

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.

Fold 4/5


Some weights of the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large/ were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large/ and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRA

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.

Fold 5/5


Some weights of the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large/ were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large/ and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRA

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Using commonlit-roberta-large-ii

Fold 1/5


Some weights of the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large/ were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large/ and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRA

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.

Fold 2/5


Some weights of the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large/ were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large/ and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRA

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.

Fold 3/5


Some weights of the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large/ were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large/ and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRA

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.

Fold 4/5


Some weights of the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large/ were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large/ and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRA

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.

Fold 5/5


Some weights of the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large/ were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-large/clrp_roberta_large/ and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRA

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.


In [16]:
oof

Unnamed: 0,target,id,pred
0,-1.054013,dd1000b26,-1.149684
1,0.247197,37c1b32fb,0.450214
2,-0.952325,0a43a07f1,-0.978375
3,-3.081337,c57b50918,-2.172103
4,0.245806,587502a70,0.441140
...,...,...,...
9,-1.680656,7382b7a7a,-1.853938
10,0.120458,9c5ff50d5,0.073447
11,0.747775,25f93b2f6,0.577809
0,0.189476,2c26db523,0.350102


In [17]:
#from sklearn.metrics import mean_squared_error
#mean_squared_error(oof.targets, oof['roberta-0467'],squared=False)