# Imports

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install transformers



In [None]:
import pandas as pd
import numpy as np

import random
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn

import os
from collections import defaultdict
from tqdm import tqdm_notebook

from transformers import AutoConfig, AutoTokenizer, AutoModel
import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

import torch.nn.init as init
import torch.nn.functional as F
from torch.nn import Parameter
from torch.autograd.function import InplaceFunction
import math

from torch.utils.data import Sampler, Dataset, DataLoader
import random

from more_itertools import chunked, flatten

In [None]:
%cd drive/MyDrive/CommonLit

/content/drive/MyDrive/CommonLit


# Get folds

In [None]:
df = pd.read_csv("train_folds.csv")

# Seed Everything

In [None]:
def seed_everything(seed=12):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(seed=12)

# Configuration

In [None]:
MAX_LEN = 256
EPOCHS = 4
ROBERTA_PATH = "roberta-large"
PRETRAIN_PATH = "Pretrain_CLRP_Roberta/pretrained_roberta_large.bin"
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
LEARNING_RATE = 2e-5
TOKENIZER = transformers.AutoTokenizer.from_pretrained(ROBERTA_PATH)

# Dataset

In [None]:
# class RobertaDatasetMW:
#     def __init__(self,df):
#         self.excerpt = df.excerpt.values
#         self.target = df.target.values

#     def __len__(self):
#         return len(self.excerpt)
    
#     def __getitem__(self,item):
#         excerpt = str(self.excerpt[item])
#         excerpt = " ".join(excerpt.split())
#         inputs = TOKENIZER(excerpt, add_special_tokens=True, max_length=MAX_LEN, padding=True, truncation=True)
        
#         ids = inputs["input_ids"]
#         mask = inputs["attention_mask"]
        
#         padding_len = MAX_LEN-len(ids)
#         ids = ids+([0]*padding_len)
#         mask = mask+([0]*padding_len)
 
#         return {"ids": ids,
#             "mask": mask,
#             "targets": torch.tensor(self.target[item], dtype=torch.float)}

In [None]:
class RobertaDataset:
    def __init__(self,df):
        self.excerpt = df.excerpt.values
        self.target = df.target.values

    def __len__(self):
        return len(self.excerpt)
    
    def __getitem__(self,item):
        excerpt = str(self.excerpt[item])
        excerpt = " ".join(excerpt.split())
        inputs = TOKENIZER(excerpt, add_special_tokens=True, max_length=MAX_LEN, padding=True, truncation=True)
        
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        
        padding_len = MAX_LEN-len(ids)
        ids = ids+([0]*padding_len)
        mask = mask+([0]*padding_len)
 
        return {"ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float)}

In [None]:
# def mask_collator(batch):
    
#     ids = [sample["ids"] for sample in batch]
#     mask = [sample["mask"] for sample in batch]
#     targets = [sample["targets"] for sample in batch]

#     mlm_probability = 0.05
    
#     input_ids = []
#     for sentence in ids:
#         r = np.random.uniform(0,1,len(sentence))
#         sentence = np.array(sentence)
#         sentence[(r < mlm_probability)&(sentence!=0)&(sentence!=4)&(sentence!=2)] = 50264
#         sentence = list(sentence)
#         input_ids.append(sentence)
    
#     return {"ids": torch.tensor(input_ids, dtype=torch.long),
#             "mask": torch.tensor(mask, dtype=torch.long),
#             "targets": torch.tensor(targets, dtype=torch.float)}

# Smart Batching

In [None]:
class RobertaDatasetSmart:
    def __init__(self,df):
        self.excerpt = df.excerpt.values
        self.target = df.target.values
        self.standard_error = df.standard_error.values

    def __len__(self):
        return len(self.excerpt)
    
    def __getitem__(self,item):
        excerpt = str(self.excerpt[item])
        excerpt = " ".join(excerpt.split())
        inputs = TOKENIZER(excerpt, add_special_tokens=True, max_length=MAX_LEN, padding=False, truncation=True)
        
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
 
        return {"ids": ids,
            "mask": mask,
            "targets": torch.tensor(self.target[item], dtype=torch.float)}

In [None]:
class SmartBatchingSampler(Sampler):
    def __init__(self, dataset, batch_size):
        self.sample_lengths = [len(seq["ids"]) for seq in dataset]
        self.sorted_input_ids = list(np.argsort(self.sample_lengths))
        self.batches = []
        self.len = len(dataset)

        while len(self.sorted_input_ids) > 0:  

            to_take = min(batch_size, len(self.sorted_input_ids))
            select = random.randint(0, len(self.sorted_input_ids) - to_take)
            batch = self.sorted_input_ids[select:(select + to_take)]
            self.batches.append(batch)
            del self.sorted_input_ids[select:select + to_take]
    
    def __iter__(self):

        yield from flatten(self.batches)

    def __len__(self):
        return self.len

In [None]:
def pad_collator(batch):
    
    ids = [sample["ids"] for sample in batch]
    mask = [sample["mask"] for sample in batch]
    targets = [sample["targets"] for sample in batch]
    
    max_len = max([len(text) for text in ids])
    
    input_ids = []
    attention_mask = []
    for i,m in zip(ids, mask):
        padding_len = max_len-len(i)
        input_ids.append(list(i)+([0]*padding_len))
        attention_mask.append(list(m)+([0]*padding_len))
    
    return {"ids": torch.tensor(input_ids, dtype=torch.long),
            "mask": torch.tensor(attention_mask, dtype=torch.long),
            "targets": torch.tensor(targets, dtype=torch.float)}

# Model Building

In [None]:
class RobertaModel(nn.Module):
    
    def __init__(self, model_type="attention"):
        super(RobertaModel,self).__init__()

        self.model_type = model_type
        
        self.config = AutoConfig.from_pretrained(ROBERTA_PATH)
        self.config.update({"output_hidden_states":True, 
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})   
        self.roberta = transformers.AutoModel.from_pretrained(PRETRAIN_PATH, config=self.config)

        if model_type == "attention":
            
            self.attention = nn.Sequential(            
            nn.Linear(1024, 256),            
            nn.Tanh(),                       
            nn.Linear(256, 1),
            nn.Softmax(dim=1)
            )   

            self.layer_norm1 = nn.LayerNorm(1024)
            self.linear1 = nn.Linear(1024, 256)
            self.linear2 = nn.Linear(256, 1)
            self.layer_norm2 = nn.LayerNorm(256)
                           
        elif model_type == "mean":
        
            self.layer_norm1 = nn.LayerNorm(1024)
            self.linear1 = nn.Linear(1024, 256)
            self.linear2 = nn.Linear(256, 1)
            self.layer_norm2 = nn.LayerNorm(256)

    def freeze(self):
        for child in self.roberta.children():
            for param in child.parameters():
                param.requires_grad = False

    def unfreeze(self):
        for child in self.roberta.children():
            for param in child.parameters():
                param.requires_grad = True
        
    def forward(self, ids, mask, loss_fn = None, targets = None):

        if self.model_type == "mean":

            outputs = self.roberta(ids, mask)
            last_hidden_state = outputs[0]
            input_mask_expanded = mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
            sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
            sum_mask = input_mask_expanded.sum(1)
            sum_mask = torch.clamp(sum_mask, min=1e-9)
            mean_embeddings = sum_embeddings / sum_mask
            norm_mean_embeddings = self.layer_norm1(mean_embeddings)
            logits = self.linear1(norm_mean_embeddings)
            logits = self.linear2(self.layer_norm2(logits))

        elif self.model_type=="attention":

            roberta_output = self.roberta(input_ids=ids,
                                  attention_mask=mask)        
            last_layer_hidden_states = roberta_output.last_hidden_state
            weights = self.attention(last_layer_hidden_states)
            context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)
            norm_context_vector = self.layer_norm1(context_vector)
            logits = self.linear1(norm_context_vector)
            logits = self.linear2(self.layer_norm2(logits)) 

        if targets is not None:

            loss = torch.sqrt(loss_fn(logits.view(-1),targets.view(-1)))
            return loss, logits

        else:

            return logits

# Evaluation Scheduler

In [None]:
def evaluate(EVAL_STEPS,valid_interval, valid_loss,train_loss, final_train_loss, index, best_loss, epoch):
    
        print(f"Epoch:{epoch}| Batch {index} | Train Loss:{train_loss.avg()} | Validation loss:{valid_loss}")
        if (valid_loss < best_loss):
            
            for rmse, steps in EVAL_STEPS:
                if valid_loss > rmse:
                    valid_interval = steps
                    break
              
            print(f"Validation loss decreased from {best_loss} to {valid_loss}.")
            final_train_loss = train_loss.avg()
            best_loss = valid_loss
            torch.save(model.state_dict(),f'Models/CodeRobertaLargeAttentionNorm3/model{fold}.bin')

            
        return valid_interval, best_loss, final_train_loss

# Training Function

In [None]:
def train_fn(train_dataloader, valid_dataloader, model, optimizer, device, scheduler):
    

    EVAL_STEPS = [(0.50,80),(0.49,40), (0.48, 40), (-1., 40)]
    valid_interval = EVAL_STEPS[0][1]
    best_loss = np.inf
    final_train_loss = None
    accumulation_steps = 4
    lr_schedule = [5e-5, 2e-5, 5e-6, 2e-6]
    # lr_schedule = [4e-5, 2e-5, 1e-5, 5e-6,2.5e-6]

    for epoch in range(EPOCHS):
      
        train_loss = AvgCounter()
        lr = lr_schedule[epoch]
        optimizer = scheduler(optimizer,lr)

        for index, d in tqdm_notebook(enumerate(train_dataloader), total=len(train_dataloader)):
  
            ids = d["ids"]
            mask = d["mask"]
            targets = d["targets"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            model.train()
            loss, outputs = model(ids=ids, mask=mask, loss_fn = loss_fn, targets = targets)
            
            train_loss.update(loss.item(), len(d))
            loss = loss / accumulation_steps 
            loss.backward()

            if index % accumulation_steps == 0:             
                optimizer.step() 
                # scheduler.step()                           
                optimizer.zero_grad()

            if (index % valid_interval == 0) | ((len(train_dataloader)-index) == 1):

                valid_loss = eval_fn(valid_dataloader,model,device)
                
                valid_interval, best_loss, final_train_loss = evaluate(EVAL_STEPS,valid_interval, valid_loss,train_loss, final_train_loss, index, best_loss, epoch )
            
    return final_train_loss, best_loss

# Evaluation Function

In [None]:
def eval_fn(data_loader, model, device):
    model.eval()
    valid_loss = AvgCounter()
    
    with torch.no_grad():
        for bi, d in enumerate(data_loader):
            ids = d["ids"]
            mask = d["mask"]
            targets = d["targets"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            loss, outputs = model(ids=ids, mask=mask, loss_fn = loss_fn, targets = targets)
            
            valid_loss.update(loss.item(), len(d))
            
    return valid_loss.avg()

# AvgCounter

In [None]:
class AvgCounter:
    def __init__(self):
        self.reset()
        
    def update(self, loss, n_samples):
        self.loss += loss * n_samples
        self.n_samples += n_samples
        
    def avg(self):
        return self.loss / self.n_samples
    
    def reset(self):
        self.loss = 0
        self.n_samples = 0

# Useful Functions

In [None]:
# create pytorch dataloader
def create_dataloader(df, fold, smart = True):
    
    train = df[df.kfold!=fold].reset_index(drop=True)
    valid = df[df.kfold==fold].reset_index(drop=True)

    if smart:
        train_dataset = RobertaDatasetSmart(train)
        valid_dataset = RobertaDatasetSmart(valid)

        sampler_train = SmartBatchingSampler(train_dataset, TRAIN_BATCH_SIZE)
        sampler_valid = SmartBatchingSampler(valid_dataset, VALID_BATCH_SIZE)

        train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size= TRAIN_BATCH_SIZE, sampler = sampler_train, collate_fn=pad_collator)
        valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size= VALID_BATCH_SIZE, sampler = sampler_valid, collate_fn=pad_collator)

    else:
        train_dataset = RobertaDataset(train)
        valid_dataset = RobertaDataset(valid)
    
        sampler = torch.utils.data.RandomSampler(train_dataset)

        train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size= TRAIN_BATCH_SIZE, sampler = sampler)
        valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size= VALID_BATCH_SIZE)

    return train_dataloader, valid_dataloader

In [None]:
def create_model(device):

    model = RobertaModel().to(device)
    model.roberta.embeddings.requires_grad_(False)
    
    return model

In [None]:
# create the optimizer
def create_optimizer(model):
    named_parameters = list(model.named_parameters()) 
    no_decay = ['bias', 'gamma', 'beta']   
    
    parameters = []
    lr = 3e-5
    regressor_lr = 2e-5
    for layer in range(23,-1,-1):
        layer_params = {
          'params': [
                      p for n,p in model.named_parameters() if not any(nd in n for nd in no_decay) \
                      and (f'encoder.layer.{layer}.' in n)
                      ],
          'lr': lr
      }
        parameters.append(layer_params)

        lr *= 0.975

    regressor_params = {
      'params': [p for n,p in model.named_parameters() if "roberta" not in n],
      'lr': regressor_lr
    }
    parameters.append(regressor_params)

    return AdamW(parameters)

In [None]:
# create the optimizer
def create_optimizer_simple(model):
    parameters = []
    no_decay = ['bias', 'gamma', 'beta'] 

    no_decay_parameters = {
        'params': [
                      p for n,p in model.named_parameters() if any(nd in n for nd in no_decay) 
                      ],
          'wd': 0.00
    }

    decay_parameters = {
        'params': [
                      p for n,p in model.named_parameters() if not any(nd in n for nd in no_decay) 
                      ]
    }
    
    parameters.append(no_decay_parameters)
    parameters.append(decay_parameters)

    return AdamW(parameters)

In [None]:
# create scheduler
def create_scheduler(optimizer, num_warmup_steps, num_train_steps, scheduler_name = "get_cosine_schedule_with_warmup" ):

    if scheduler_name == "get_linear_schedule_with_warmup":
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
        
    elif scheduler_name == "get_cosine_schedule_with_warmup":
        scheduler = get_cosine_schedule_with_warmup(optimizer,num_training_steps=num_train_steps,num_warmup_steps=50) 
        
    else:
        raise Exception(f"Unknown scheduler: {scheduler_name}")

    return scheduler

In [None]:
def scheduler(optimizer,lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return optimizer

# Training

In [None]:
loss_fn=nn.MSELoss()
loss=defaultdict(list)
results_val = {}
results_train = {}
for fold in range(5):

    seed_everything(12)
    
    device = torch.device("cuda")
    model = create_model(device)

    print("################################")
    print(f"Training Fold {fold}")
    print("################################")

    train_dataloader, valid_dataloader = create_dataloader(df, fold, smart = False)
    num_train_steps = len(train_dataloader) * EPOCHS

    optimizer = create_optimizer(model)
    # scheduler = create_scheduler(optimizer, num_warmup_steps = 0, num_train_steps = num_train_steps )

    seed_everything(12)
    
    results_train[fold], results_val[fold] = train_fn(train_dataloader,valid_dataloader, model, optimizer, device, scheduler)

print("################################")
print("RESULTS")
print("################################")
cv_val = np.mean([results_val[i] for i in range(5)])
cv_train = np.mean([results_train[i] for i in range(5)])
print(f"Results of cross validation for seed 42: Train : {cv_train}, Val : {cv_val}")

Some weights of the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_large.bin were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_large.bin and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probab

################################
Training Fold 0
################################


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:0| Batch 0 | Train Loss:1.2114845514297485 | Validation loss:2.6666462857958297
Validation loss decreased from inf to 2.6666462857958297.
Epoch:0| Batch 80 | Train Loss:0.9440634934990494 | Validation loss:0.7946985557045735
Validation loss decreased from 2.6666462857958297 to 0.7946985557045735.
Epoch:0| Batch 160 | Train Loss:0.7959773105493984 | Validation loss:0.6087920002534356
Validation loss decreased from 0.7946985557045735 to 0.6087920002534356.
Epoch:0| Batch 240 | Train Loss:0.7199942431261925 | Validation loss:0.5048203724370876
Validation loss decreased from 0.6087920002534356 to 0.5048203724370876.
Epoch:0| Batch 283 | Train Loss:0.6945749181257167 | Validation loss:0.4902101541069192
Validation loss decreased from 0.5048203724370876 to 0.4902101541069192.



HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:1| Batch 0 | Train Loss:0.34480157494544983 | Validation loss:0.487276908167651
Validation loss decreased from 0.4902101541069192 to 0.487276908167651.
Epoch:1| Batch 40 | Train Loss:0.40558303056693656 | Validation loss:0.4766013267594324
Validation loss decreased from 0.487276908167651 to 0.4766013267594324.
Epoch:1| Batch 80 | Train Loss:0.3996189703911911 | Validation loss:0.552078450229806
Epoch:1| Batch 120 | Train Loss:0.40035585024632697 | Validation loss:0.48991662389795543
Epoch:1| Batch 160 | Train Loss:0.4025356905615848 | Validation loss:0.5005925335514714
Epoch:1| Batch 200 | Train Loss:0.40118577668619393 | Validation loss:0.4968637219617065
Epoch:1| Batch 240 | Train Loss:0.40635620892295204 | Validation loss:0.4734403923363753
Validation loss decreased from 0.4766013267594324 to 0.4734403923363753.
Epoch:1| Batch 280 | Train Loss:0.40371336742864383 | Validation loss:0.4809839536606426
Epoch:1| Batch 283 | Train Loss:0.40375598262943013 | Validation loss:0.480983

HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:2| Batch 0 | Train Loss:0.2725757658481598 | Validation loss:0.47954842951935783
Epoch:2| Batch 40 | Train Loss:0.2637985795736313 | Validation loss:0.47449339716367317
Epoch:2| Batch 80 | Train Loss:0.26306414089085145 | Validation loss:0.48178456303939016
Epoch:2| Batch 120 | Train Loss:0.26283121626239175 | Validation loss:0.4671718408943902
Validation loss decreased from 0.4734403923363753 to 0.4671718408943902.
Epoch:2| Batch 160 | Train Loss:0.2631094564469705 | Validation loss:0.46419436725932106
Validation loss decreased from 0.4671718408943902 to 0.46419436725932106.
Epoch:2| Batch 200 | Train Loss:0.2629429470766243 | Validation loss:0.4628282600725201
Validation loss decreased from 0.46419436725932106 to 0.4628282600725201.
Epoch:2| Batch 240 | Train Loss:0.25952838401453127 | Validation loss:0.47328151573597543
Epoch:2| Batch 280 | Train Loss:0.2552495640023111 | Validation loss:0.479981983841305
Epoch:2| Batch 283 | Train Loss:0.25510352780439066 | Validation loss:0.

HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:3| Batch 0 | Train Loss:0.15003250539302826 | Validation loss:0.48109605530617944
Epoch:3| Batch 40 | Train Loss:0.19090421679543285 | Validation loss:0.4644282586138013
Epoch:3| Batch 80 | Train Loss:0.19263819347561142 | Validation loss:0.46787302678739523
Epoch:3| Batch 120 | Train Loss:0.1946536760069122 | Validation loss:0.4639560492105887
Epoch:3| Batch 160 | Train Loss:0.19550566162381852 | Validation loss:0.4664485238387551
Epoch:3| Batch 200 | Train Loss:0.19802109178025923 | Validation loss:0.46863799720582827
Epoch:3| Batch 240 | Train Loss:0.19605210135957513 | Validation loss:0.4675988972606793
Epoch:3| Batch 280 | Train Loss:0.19723840704804213 | Validation loss:0.462762604716798
Validation loss decreased from 0.4628282600725201 to 0.462762604716798.
Epoch:3| Batch 283 | Train Loss:0.19727907634117234 | Validation loss:0.462762604716798



Some weights of the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_large.bin were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_large.bin and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probab

################################
Training Fold 1
################################


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:0| Batch 0 | Train Loss:1.0997647047042847 | Validation loss:2.1406705010105185
Validation loss decreased from inf to 2.1406705010105185.
Epoch:0| Batch 80 | Train Loss:0.913148871542495 | Validation loss:0.6751042054572576
Validation loss decreased from 2.1406705010105185 to 0.6751042054572576.
Epoch:0| Batch 160 | Train Loss:0.7579767611456214 | Validation loss:0.6208080871843956
Validation loss decreased from 0.6751042054572576 to 0.6208080871843956.
Epoch:0| Batch 240 | Train Loss:0.6999593898963137 | Validation loss:0.5345298895533656
Validation loss decreased from 0.6208080871843956 to 0.5345298895533656.
Epoch:0| Batch 283 | Train Loss:0.6769148186898567 | Validation loss:0.5415182147227543



HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:1| Batch 0 | Train Loss:0.4882616698741913 | Validation loss:0.5594259047172438
Epoch:1| Batch 80 | Train Loss:0.4074048688750208 | Validation loss:0.5608306770593348
Epoch:1| Batch 160 | Train Loss:0.4026092411198231 | Validation loss:0.49811580777168274
Validation loss decreased from 0.5345298895533656 to 0.49811580777168274.
Epoch:1| Batch 200 | Train Loss:0.4001018941698976 | Validation loss:0.5932513269740092
Epoch:1| Batch 240 | Train Loss:0.39806320527529815 | Validation loss:0.48124676753937357
Validation loss decreased from 0.49811580777168274 to 0.48124676753937357.
Epoch:1| Batch 280 | Train Loss:0.3980703544521247 | Validation loss:0.5490343239105923
Epoch:1| Batch 283 | Train Loss:0.3981013815115455 | Validation loss:0.5490343239105923



HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:2| Batch 0 | Train Loss:0.4988097548484802 | Validation loss:0.5464842537759056
Epoch:2| Batch 40 | Train Loss:0.31363941256592914 | Validation loss:0.4791783713538882
Validation loss decreased from 0.48124676753937357 to 0.4791783713538882.
Epoch:2| Batch 80 | Train Loss:0.29575919580680354 | Validation loss:0.47548390400241797
Validation loss decreased from 0.4791783713538882 to 0.47548390400241797.
Epoch:2| Batch 120 | Train Loss:0.2791795061885818 | Validation loss:0.4920696502840015
Epoch:2| Batch 160 | Train Loss:0.27118217218551577 | Validation loss:0.4842730720697994
Epoch:2| Batch 200 | Train Loss:0.2661223907153405 | Validation loss:0.4683315252334299
Validation loss decreased from 0.47548390400241797 to 0.4683315252334299.
Epoch:2| Batch 240 | Train Loss:0.2617250190607245 | Validation loss:0.47404167202996533
Epoch:2| Batch 280 | Train Loss:0.2607091024411955 | Validation loss:0.471807459920225
Epoch:2| Batch 283 | Train Loss:0.2606735138458685 | Validation loss:0.471

HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:3| Batch 0 | Train Loss:0.17906200885772705 | Validation loss:0.47168405102172367
Epoch:3| Batch 40 | Train Loss:0.17466845381550672 | Validation loss:0.4747373558266062
Epoch:3| Batch 80 | Train Loss:0.1863052804528931 | Validation loss:0.4740339393766833
Epoch:3| Batch 120 | Train Loss:0.18898799085666326 | Validation loss:0.47908552998388315
Epoch:3| Batch 160 | Train Loss:0.18715844196932657 | Validation loss:0.47240262803897054
Epoch:3| Batch 200 | Train Loss:0.18751103853556647 | Validation loss:0.4727373309958149
Epoch:3| Batch 240 | Train Loss:0.18839329654250403 | Validation loss:0.48109659321711096
Epoch:3| Batch 280 | Train Loss:0.18897822740557355 | Validation loss:0.4756355969838693
Epoch:3| Batch 283 | Train Loss:0.18908849602538935 | Validation loss:0.4756355969838693



Some weights of the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_large.bin were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_large.bin and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probab

################################
Training Fold 2
################################


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:0| Batch 0 | Train Loss:0.9143547415733337 | Validation loss:1.3293246089572637
Validation loss decreased from inf to 1.3293246089572637.
Epoch:0| Batch 80 | Train Loss:0.8278189349321672 | Validation loss:0.7008025096335881
Validation loss decreased from 1.3293246089572637 to 0.7008025096335881.
Epoch:0| Batch 160 | Train Loss:0.7544807339121836 | Validation loss:0.5799414924752544
Validation loss decreased from 0.7008025096335881 to 0.5799414924752544.
Epoch:0| Batch 240 | Train Loss:0.6951382906481438 | Validation loss:0.68671526287643
Epoch:0| Batch 283 | Train Loss:0.6735309648366881 | Validation loss:0.5485064280284963
Validation loss decreased from 0.5799414924752544 to 0.5485064280284963.



HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:1| Batch 0 | Train Loss:0.522756814956665 | Validation loss:0.5669304967766077
Epoch:1| Batch 80 | Train Loss:0.45007880216027485 | Validation loss:0.5567955387310243
Epoch:1| Batch 160 | Train Loss:0.44220072077298017 | Validation loss:0.5207762585979112
Validation loss decreased from 0.5485064280284963 to 0.5207762585979112.
Epoch:1| Batch 240 | Train Loss:0.4317102337650244 | Validation loss:0.5136511103368141
Validation loss decreased from 0.5207762585979112 to 0.5136511103368141.
Epoch:1| Batch 283 | Train Loss:0.423831102556326 | Validation loss:0.48095972689104755
Validation loss decreased from 0.5136511103368141 to 0.48095972689104755.



HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:2| Batch 0 | Train Loss:0.2305557280778885 | Validation loss:0.48023634918139013
Validation loss decreased from 0.48095972689104755 to 0.48023634918139013.
Epoch:2| Batch 40 | Train Loss:0.3078844667207904 | Validation loss:0.47747362562468354
Validation loss decreased from 0.48023634918139013 to 0.47747362562468354.
Epoch:2| Batch 80 | Train Loss:0.2954292989071505 | Validation loss:0.479216777522799
Epoch:2| Batch 120 | Train Loss:0.28638309363491277 | Validation loss:0.4910407670786683
Epoch:2| Batch 160 | Train Loss:0.28213508445653857 | Validation loss:0.4756601561962719
Validation loss decreased from 0.47747362562468354 to 0.4756601561962719.
Epoch:2| Batch 200 | Train Loss:0.28230682549191943 | Validation loss:0.47109575716542523
Validation loss decreased from 0.4756601561962719 to 0.47109575716542523.
Epoch:2| Batch 240 | Train Loss:0.2807930751831205 | Validation loss:0.4977966914294471
Epoch:2| Batch 280 | Train Loss:0.2781820987679356 | Validation loss:0.47142362384728

HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:3| Batch 0 | Train Loss:0.371002733707428 | Validation loss:0.4727779723389048
Epoch:3| Batch 40 | Train Loss:0.2239573984974768 | Validation loss:0.47936352478786254
Epoch:3| Batch 80 | Train Loss:0.2154884006322166 | Validation loss:0.4721798044694981
Epoch:3| Batch 120 | Train Loss:0.2135546948175785 | Validation loss:0.47607729573484875
Epoch:3| Batch 160 | Train Loss:0.20970721788102795 | Validation loss:0.47087232831498266
Validation loss decreased from 0.47109575716542523 to 0.47087232831498266.
Epoch:3| Batch 200 | Train Loss:0.20796531865104514 | Validation loss:0.4790122616039196
Epoch:3| Batch 240 | Train Loss:0.20496797577103143 | Validation loss:0.4699995817852692
Validation loss decreased from 0.47087232831498266 to 0.4699995817852692.
Epoch:3| Batch 280 | Train Loss:0.20575035816834067 | Validation loss:0.4710980706231695
Epoch:3| Batch 283 | Train Loss:0.20533641842259487 | Validation loss:0.4710980706231695



Some weights of the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_large.bin were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_large.bin and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probab

################################
Training Fold 3
################################


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:0| Batch 0 | Train Loss:0.886536717414856 | Validation loss:0.9690399610660445
Validation loss decreased from inf to 0.9690399610660445.
Epoch:0| Batch 80 | Train Loss:0.9098551770051321 | Validation loss:0.6568610063740905
Validation loss decreased from 0.9690399610660445 to 0.6568610063740905.
Epoch:0| Batch 160 | Train Loss:0.7644360243163494 | Validation loss:0.5637827983624498
Validation loss decreased from 0.6568610063740905 to 0.5637827983624498.
Epoch:0| Batch 240 | Train Loss:0.7240517589561177 | Validation loss:0.5750460280498988
Epoch:0| Batch 283 | Train Loss:0.6991358703710664 | Validation loss:0.6044432361361006



HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:1| Batch 0 | Train Loss:0.520781934261322 | Validation loss:0.6771744607200085
Epoch:1| Batch 80 | Train Loss:0.4714160900057098 | Validation loss:0.550112913192158
Validation loss decreased from 0.5637827983624498 to 0.550112913192158.
Epoch:1| Batch 160 | Train Loss:0.4336942740663979 | Validation loss:0.5053768157958984
Validation loss decreased from 0.550112913192158 to 0.5053768157958984.
Epoch:1| Batch 240 | Train Loss:0.4248874847201391 | Validation loss:0.47644574604403805
Validation loss decreased from 0.5053768157958984 to 0.47644574604403805.
Epoch:1| Batch 280 | Train Loss:0.421206502144447 | Validation loss:0.48830723762512207
Epoch:1| Batch 283 | Train Loss:0.4201805032789707 | Validation loss:0.48830723762512207



HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:2| Batch 0 | Train Loss:0.1988549679517746 | Validation loss:0.48837073881861187
Epoch:2| Batch 40 | Train Loss:0.2758866463129113 | Validation loss:0.49040178702750675
Epoch:2| Batch 80 | Train Loss:0.28760347294586674 | Validation loss:0.47345033392939767
Validation loss decreased from 0.47644574604403805 to 0.47345033392939767.
Epoch:2| Batch 120 | Train Loss:0.2814387147091637 | Validation loss:0.4878964757835361
Epoch:2| Batch 160 | Train Loss:0.27123253061349345 | Validation loss:0.4748697562116972
Epoch:2| Batch 200 | Train Loss:0.27126140629325934 | Validation loss:0.4685283750295639
Validation loss decreased from 0.47345033392939767 to 0.4685283750295639.
Epoch:2| Batch 240 | Train Loss:0.26949099417296685 | Validation loss:0.4788933303994192
Epoch:2| Batch 280 | Train Loss:0.26701214930765144 | Validation loss:0.4752907937681171
Epoch:2| Batch 283 | Train Loss:0.2667502057804188 | Validation loss:0.4752907937681171



HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:3| Batch 0 | Train Loss:0.17357400059700012 | Validation loss:0.4729470341977939
Epoch:3| Batch 40 | Train Loss:0.18940551251899906 | Validation loss:0.4872306075314401
Epoch:3| Batch 80 | Train Loss:0.196770784203653 | Validation loss:0.46655712958792567
Validation loss decreased from 0.4685283750295639 to 0.46655712958792567.
Epoch:3| Batch 120 | Train Loss:0.19955730173459724 | Validation loss:0.477546224921522
Epoch:3| Batch 160 | Train Loss:0.19680738814684176 | Validation loss:0.47187413506104914
Epoch:3| Batch 200 | Train Loss:0.19402516132860043 | Validation loss:0.47510125729399666
Epoch:3| Batch 240 | Train Loss:0.19505104898664466 | Validation loss:0.4826004074912676
Epoch:3| Batch 280 | Train Loss:0.1931990207938537 | Validation loss:0.4682022142158428
Epoch:3| Batch 283 | Train Loss:0.1930246205974213 | Validation loss:0.4682022142158428



Some weights of the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_large.bin were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_large.bin and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probab

################################
Training Fold 4
################################


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:0| Batch 0 | Train Loss:1.2632033824920654 | Validation loss:1.7960365809185403
Validation loss decreased from inf to 1.7960365809185403.
Epoch:0| Batch 80 | Train Loss:0.921869702913143 | Validation loss:0.8076204439284096
Validation loss decreased from 1.7960365809185403 to 0.8076204439284096.
Epoch:0| Batch 160 | Train Loss:0.8158502930439777 | Validation loss:0.6962344554108633
Validation loss decreased from 0.8076204439284096 to 0.6962344554108633.
Epoch:0| Batch 240 | Train Loss:0.7619119459662694 | Validation loss:0.5961767756183383
Validation loss decreased from 0.6962344554108633 to 0.5961767756183383.
Epoch:0| Batch 283 | Train Loss:0.749963512722875 | Validation loss:0.7076437433420772



HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:1| Batch 0 | Train Loss:0.796697735786438 | Validation loss:0.6620561178301422
Epoch:1| Batch 80 | Train Loss:0.48644929314837043 | Validation loss:0.532252387891353
Validation loss decreased from 0.5961767756183383 to 0.532252387891353.
Epoch:1| Batch 160 | Train Loss:0.46125422390351384 | Validation loss:0.5176122774120787
Validation loss decreased from 0.532252387891353 to 0.5176122774120787.
Epoch:1| Batch 240 | Train Loss:0.4518219552841424 | Validation loss:0.4987496187989141
Validation loss decreased from 0.5176122774120787 to 0.4987496187989141.
Epoch:1| Batch 280 | Train Loss:0.4453432482014347 | Validation loss:0.505252515136356
Epoch:1| Batch 283 | Train Loss:0.44434184497091134 | Validation loss:0.505252515136356



HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:2| Batch 0 | Train Loss:0.2097598761320114 | Validation loss:0.5065597721808394
Epoch:2| Batch 40 | Train Loss:0.3112239223427889 | Validation loss:0.4983170580276301
Validation loss decreased from 0.4987496187989141 to 0.4983170580276301.
Epoch:2| Batch 80 | Train Loss:0.29714255807576356 | Validation loss:0.49372119949737064
Validation loss decreased from 0.4983170580276301 to 0.49372119949737064.
Epoch:2| Batch 120 | Train Loss:0.28627878907790855 | Validation loss:0.4887319507313446
Validation loss decreased from 0.49372119949737064 to 0.4887319507313446.
Epoch:2| Batch 160 | Train Loss:0.2799621434500499 | Validation loss:0.48842068121466836
Validation loss decreased from 0.4887319507313446 to 0.48842068121466836.
Epoch:2| Batch 200 | Train Loss:0.27664878282380934 | Validation loss:0.49050488732230496
Epoch:2| Batch 240 | Train Loss:0.27474991915384267 | Validation loss:0.48898016021285257
Epoch:2| Batch 280 | Train Loss:0.2746205714885875 | Validation loss:0.49567294582514

HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:3| Batch 0 | Train Loss:0.1588321477174759 | Validation loss:0.4933177950516553
Epoch:3| Batch 40 | Train Loss:0.18788623955191636 | Validation loss:0.4869333022916821
Validation loss decreased from 0.48842068121466836 to 0.4869333022916821.
Epoch:3| Batch 80 | Train Loss:0.19758802302825598 | Validation loss:0.48910302868191624
Epoch:3| Batch 120 | Train Loss:0.1951939058143738 | Validation loss:0.488304358552879
Epoch:3| Batch 160 | Train Loss:0.1933629433884754 | Validation loss:0.4882665824302485
Epoch:3| Batch 200 | Train Loss:0.19739693864735205 | Validation loss:0.4873061539001868
Epoch:3| Batch 240 | Train Loss:0.2025403302023767 | Validation loss:0.4982002226399704
Epoch:3| Batch 280 | Train Loss:0.201863402189203 | Validation loss:0.48867174668211333
Epoch:3| Batch 283 | Train Loss:0.2018513053944203 | Validation loss:0.48867174668211333

################################
RESULTS
################################
Results of cross validation for seed 42: Train : 0.21059715

In [None]:
# I added double layer norm to attention 
# Results of cross validation for seed 42: Train : 0.22953509619398332, Val : 0.4700026281702686 LB 0.474
# Now I add accumulation step = 4
# Results of cross validation for seed 42: Train : 0.23496394569678217, Val : 0.4683334995323504 LB 0.466
# Change to 768 in the attention layer
# Results of cross validation for seed 42: Train : 0.21059715945799667, Val : 0.47091682872302093 LB ? 