# Imports

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install transformers



In [None]:
import pandas as pd
import numpy as np

import random
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn

import os
from collections import defaultdict
from tqdm import tqdm_notebook

from transformers import AutoConfig, AutoTokenizer, AutoModel
import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

import torch.nn.init as init
import torch.nn.functional as F
from torch.nn import Parameter
from torch.autograd.function import InplaceFunction
import math

from torch.utils.data import Sampler, Dataset, DataLoader
import random

from more_itertools import chunked, flatten

In [None]:
%cd drive/MyDrive/CommonLit

/content/drive/MyDrive/CommonLit


# Get folds

In [None]:
df = pd.read_csv("train_folds.csv")

# Seed Everything

In [None]:
def seed_everything(seed=12):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(seed=12)

# Configuration

In [None]:
MAX_LEN = 256
EPOCHS = 4
ROBERTA_PATH = "roberta-large"
PRETRAIN_PATH = "Pretrain_CLRP_Roberta/pretrained_roberta_large.bin"
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
LEARNING_RATE = 2e-5
TOKENIZER = transformers.AutoTokenizer.from_pretrained(ROBERTA_PATH)

# Dataset

In [None]:
class RobertaDataset:
    def __init__(self,df):
        self.excerpt = df.excerpt.values
        self.target = df.target.values

    def __len__(self):
        return len(self.excerpt)
    
    def __getitem__(self,item):
        excerpt = str(self.excerpt[item])
        excerpt = " ".join(excerpt.split())
        inputs = TOKENIZER(excerpt, add_special_tokens=True, max_length=MAX_LEN, padding=True, truncation=True)
        
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        
        padding_len = MAX_LEN-len(ids)
        ids = ids+([0]*padding_len)
        mask = mask+([0]*padding_len)
 
        return {"ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float)}

# Model Building

In [None]:
class RobertaModel(nn.Module):
    
    def __init__(self, model_type="mean"):
        super(RobertaModel,self).__init__()

        self.model_type = model_type
        
        self.config = AutoConfig.from_pretrained(ROBERTA_PATH)
        self.config.update({"output_hidden_states":True, 
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 0.0})   
        self.roberta = transformers.AutoModel.from_pretrained(PRETRAIN_PATH, config=self.config)

        if model_type == "attention":
            
            self.attention = nn.Sequential(            
            nn.Linear(1024, 256),            
            nn.Tanh(),                       
            nn.Linear(256, 1),
            nn.Softmax(dim=1)
            )   

            self.linear = (nn.Linear(1024, 1))
                           
        elif model_type == "mean":
        
            self.layer_norm1 = nn.LayerNorm(1024)
            self.linear1 = nn.Linear(1024, 768)
            self.linear2 = nn.Linear(768, 1)
            self.layer_norm2 = nn.LayerNorm(768)

    def freeze(self):
        for child in self.roberta.children():
            for param in child.parameters():
                param.requires_grad = False

    def unfreeze(self):
        for child in self.roberta.children():
            for param in child.parameters():
                param.requires_grad = True
        
    def forward(self, ids, mask, loss_fn = None, targets = None):

        if self.model_type == "mean":

            outputs = self.roberta(ids, mask)
            last_hidden_state = outputs[0]
            input_mask_expanded = mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
            sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
            sum_mask = input_mask_expanded.sum(1)
            sum_mask = torch.clamp(sum_mask, min=1e-9)
            mean_embeddings = sum_embeddings / sum_mask
            norm_mean_embeddings = self.layer_norm1(mean_embeddings)
            logits = self.linear1(norm_mean_embeddings)
            logits = self.linear2(self.layer_norm2(logits))

        elif self.model_type=="attention":

            roberta_output = self.roberta(input_ids=ids,
                                  attention_mask=mask)        
            last_layer_hidden_states = roberta_output.last_hidden_state
            weights = self.attention(last_layer_hidden_states)
            context_vector = torch.sum(weights * last_layer_hidden_states, dim=1) 
            logits = self.linear(context_vector)

        if targets is not None:

            loss = torch.sqrt(loss_fn(logits.view(-1),targets.view(-1)))
            return loss, logits

        else:

            return logits

# Evaluation Scheduler

In [None]:
def evaluate(EVAL_STEPS,valid_interval, valid_loss,train_loss, final_train_loss, index, best_loss, epoch):
    
        print(f"Epoch:{epoch}| Batch {index} | Train Loss:{train_loss.avg()} | Validation loss:{valid_loss}")
        if (valid_loss < best_loss):
            
            for rmse, steps in EVAL_STEPS:
                if valid_loss > rmse:
                    valid_interval = steps
                    break
              
            print(f"Validation loss decreased from {best_loss} to {valid_loss}.")
            final_train_loss = train_loss.avg()
            best_loss = valid_loss
            torch.save(model.state_dict(),f'Models/CodeRobertaLargeNoEps/model{fold}.bin')

            
        return valid_interval, best_loss, final_train_loss

# Training Function

In [None]:
def train_fn(train_dataloader, valid_dataloader, model, optimizer, device, scheduler):
    

    EVAL_STEPS = [(0.50,80),(0.49,60), (0.48, 40), (-1., 40)]
    valid_interval = EVAL_STEPS[0][1]
    best_loss = np.inf
    final_train_loss = None
    accumulation_steps = 4
    lr_schedule = [5e-5, 2e-5, 5e-6, 2e-6]
    # lr_schedule = [4e-5, 2e-5, 1e-5, 5e-6,2.5e-6]

    for epoch in range(EPOCHS):
      
        train_loss = AvgCounter()
        lr = lr_schedule[epoch]
        optimizer = scheduler(optimizer,lr)

        for index, d in tqdm_notebook(enumerate(train_dataloader), total=len(train_dataloader)):
  
            ids = d["ids"]
            mask = d["mask"]
            targets = d["targets"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            model.train()
            loss, outputs = model(ids=ids, mask=mask, loss_fn = loss_fn, targets = targets)
            
            train_loss.update(loss.item(), len(d))
            loss = loss / accumulation_steps 
            loss.backward()

            if index % accumulation_steps == 0:             
                optimizer.step() 
                # scheduler.step()                           
                optimizer.zero_grad()

            if (index % valid_interval == 0) | ((len(train_dataloader)-index) == 1):

                valid_loss = eval_fn(valid_dataloader,model,device)
                
                valid_interval, best_loss, final_train_loss = evaluate(EVAL_STEPS,valid_interval, valid_loss,train_loss, final_train_loss, index, best_loss, epoch )
            
    return final_train_loss, best_loss

# Evaluation Function

In [None]:
def eval_fn(data_loader, model, device):
    model.eval()
    valid_loss = AvgCounter()
    
    with torch.no_grad():
        for bi, d in enumerate(data_loader):
            ids = d["ids"]
            mask = d["mask"]
            targets = d["targets"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            loss, outputs = model(ids=ids, mask=mask, loss_fn = loss_fn, targets = targets)
            
            valid_loss.update(loss.item(), len(d))
            
    return valid_loss.avg()

# AvgCounter

In [None]:
class AvgCounter:
    def __init__(self):
        self.reset()
        
    def update(self, loss, n_samples):
        self.loss += loss * n_samples
        self.n_samples += n_samples
        
    def avg(self):
        return self.loss / self.n_samples
    
    def reset(self):
        self.loss = 0
        self.n_samples = 0

# Useful Functions

In [None]:
# create pytorch dataloader
def create_dataloader(df, fold, smart = True):
    
    train = df[df.kfold!=fold].reset_index(drop=True)
    valid = df[df.kfold==fold].reset_index(drop=True)

    if smart:
        train_dataset = RobertaDatasetSmart(train)
        valid_dataset = RobertaDatasetSmart(valid)

        sampler_train = SmartBatchingSampler(train_dataset, TRAIN_BATCH_SIZE)
        sampler_valid = SmartBatchingSampler(valid_dataset, VALID_BATCH_SIZE)

        train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size= TRAIN_BATCH_SIZE, sampler = sampler_train, collate_fn=pad_collator)
        valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size= VALID_BATCH_SIZE, sampler = sampler_valid, collate_fn=pad_collator)

    else:
        train_dataset = RobertaDataset(train)
        valid_dataset = RobertaDataset(valid)
    
        sampler = torch.utils.data.RandomSampler(train_dataset)

        train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size= TRAIN_BATCH_SIZE, sampler = sampler  )
        valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size= VALID_BATCH_SIZE)

    return train_dataloader, valid_dataloader

In [None]:
def create_model(device):

    model = RobertaModel().to(device)
    
    return model

In [None]:
# create the optimizer
def create_optimizer(model):
    named_parameters = list(model.named_parameters()) 
    no_decay = ['bias', 'gamma', 'beta']   
    
    parameters = []
    lr = 3e-5
    regressor_lr = 2e-5
    for layer in range(23,-1,-1):
        layer_params = {
          'params': [
                      p for n,p in model.named_parameters() if not any(nd in n for nd in no_decay) \
                      and (f'encoder.layer.{layer}.' in n)
                      ],
          'lr': lr
      }
        parameters.append(layer_params)

        lr *= 0.975

    regressor_params = {
      'params': [p for n,p in model.named_parameters() if "roberta" not in n],
      'lr': regressor_lr
    }

    parameters.append(regressor_params)

    regressor_params = {
      'params': [
                      p for n,p in model.named_parameters() if not any(nd in n for nd in no_decay) \
                      and (f'roberta.embeddings' in n)
                      ],
      'lr': regressor_lr
    }
    parameters.append(regressor_params)

    return AdamW(parameters)

In [None]:
# create scheduler
def create_scheduler(optimizer, num_warmup_steps, num_train_steps, scheduler_name = "get_cosine_schedule_with_warmup" ):

    if scheduler_name == "get_linear_schedule_with_warmup":
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
        
    elif scheduler_name == "get_cosine_schedule_with_warmup":
        scheduler = get_cosine_schedule_with_warmup(optimizer,num_training_steps=num_train_steps,num_warmup_steps=50) 
        
    else:
        raise Exception(f"Unknown scheduler: {scheduler_name}")

    return scheduler

In [None]:
def scheduler(optimizer,lr):
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return optimizer

# Training

In [None]:
loss_fn=nn.MSELoss()
loss=defaultdict(list)
results_val = {}
results_train = {}
for fold in range(5):

    seed_everything(12)
    
    device = torch.device("cuda")
    model = create_model(device)

    print("################################")
    print(f"Training Fold {fold}")
    print("################################")

    train_dataloader, valid_dataloader = create_dataloader(df, fold, smart = False)
    num_train_steps = len(train_dataloader) * EPOCHS

    optimizer = create_optimizer(model)
    # scheduler = create_scheduler(optimizer, num_warmup_steps = 0, num_train_steps = num_train_steps )

    seed_everything(12)
    
    results_train[fold], results_val[fold] = train_fn(train_dataloader,valid_dataloader, model, optimizer, device, scheduler)

print("################################")
print("RESULTS")
print("################################")
cv_val = np.mean([results_val[i] for i in range(5)])
cv_train = np.mean([results_train[i] for i in range(5)])
print(f"Results of cross validation for seed 12: Train : {cv_train}, Val : {cv_val}") # I always used seed 12

Some weights of the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_large.bin were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_large.bin and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probab

################################
Training Fold 0
################################


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:0| Batch 0 | Train Loss:0.7076603174209595 | Validation loss:1.6521922467460095
Validation loss decreased from inf to 1.6521922467460095.
Epoch:0| Batch 80 | Train Loss:0.8113821549915973 | Validation loss:0.7384189463837046
Validation loss decreased from 1.6521922467460095 to 0.7384189463837046.
Epoch:0| Batch 160 | Train Loss:0.7309419739505519 | Validation loss:0.6017998343622181
Validation loss decreased from 0.7384189463837046 to 0.6017998343622181.
Epoch:0| Batch 240 | Train Loss:0.6824526216108274 | Validation loss:0.5486489819808745
Validation loss decreased from 0.6017998343622181 to 0.5486489819808745.
Epoch:0| Batch 283 | Train Loss:0.6669465970300453 | Validation loss:0.5277437735611284
Validation loss decreased from 0.5486489819808745 to 0.5277437735611284.



HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:1| Batch 0 | Train Loss:0.4974327087402344 | Validation loss:0.5049597679729193
Validation loss decreased from 0.5277437735611284 to 0.5049597679729193.
Epoch:1| Batch 80 | Train Loss:0.3879103347843076 | Validation loss:0.5332624364486882
Epoch:1| Batch 160 | Train Loss:0.3824170788993006 | Validation loss:0.48577200023221295
Validation loss decreased from 0.5049597679729193 to 0.48577200023221295.
Epoch:1| Batch 200 | Train Loss:0.38069388359340267 | Validation loss:0.5066926789955354
Epoch:1| Batch 240 | Train Loss:0.3815836522465425 | Validation loss:0.49179812573211296
Epoch:1| Batch 280 | Train Loss:0.3843456796267703 | Validation loss:0.5126626128461998
Epoch:1| Batch 283 | Train Loss:0.3845754325599738 | Validation loss:0.5126626128461998



HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:2| Batch 0 | Train Loss:0.2555971145629883 | Validation loss:0.5129252636516598
Epoch:2| Batch 40 | Train Loss:0.2407602079030944 | Validation loss:0.48796599851527683
Epoch:2| Batch 80 | Train Loss:0.24006395722612922 | Validation loss:0.48358645195692357
Validation loss decreased from 0.48577200023221295 to 0.48358645195692357.
Epoch:2| Batch 120 | Train Loss:0.2341556853252994 | Validation loss:0.47788897618441517
Validation loss decreased from 0.48358645195692357 to 0.47788897618441517.
Epoch:2| Batch 160 | Train Loss:0.23149577787388925 | Validation loss:0.48105190967170286
Epoch:2| Batch 200 | Train Loss:0.22995754746507055 | Validation loss:0.4714326955063242
Validation loss decreased from 0.47788897618441517 to 0.4714326955063242.
Epoch:2| Batch 240 | Train Loss:0.22934270727436573 | Validation loss:0.46949476549323177
Validation loss decreased from 0.4714326955063242 to 0.46949476549323177.
Epoch:2| Batch 280 | Train Loss:0.23012181798334225 | Validation loss:0.471155799

HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:3| Batch 0 | Train Loss:0.19775594770908356 | Validation loss:0.4713605814416644
Epoch:3| Batch 40 | Train Loss:0.1697894689149973 | Validation loss:0.4785980200263816
Epoch:3| Batch 80 | Train Loss:0.17308638769167442 | Validation loss:0.4722188719561402
Epoch:3| Batch 120 | Train Loss:0.1689995420742626 | Validation loss:0.4733644791052375
Epoch:3| Batch 160 | Train Loss:0.16954725145460656 | Validation loss:0.4760307258283588
Epoch:3| Batch 200 | Train Loss:0.1689322257775869 | Validation loss:0.47193366289138794
Epoch:3| Batch 240 | Train Loss:0.17086002170359446 | Validation loss:0.4760834270379913
Epoch:3| Batch 280 | Train Loss:0.1708765352456383 | Validation loss:0.47907712929685353
Epoch:3| Batch 283 | Train Loss:0.17069052963871773 | Validation loss:0.47907712929685353



Some weights of the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_large.bin were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_large.bin and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probab

################################
Training Fold 1
################################


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:0| Batch 0 | Train Loss:0.8624992966651917 | Validation loss:1.215126206337566
Validation loss decreased from inf to 1.215126206337566.
Epoch:0| Batch 80 | Train Loss:0.8325814483342348 | Validation loss:0.598919831321273
Validation loss decreased from 1.215126206337566 to 0.598919831321273.
Epoch:0| Batch 160 | Train Loss:0.723447381894781 | Validation loss:0.5502879991078041
Validation loss decreased from 0.598919831321273 to 0.5502879991078041.
Epoch:0| Batch 240 | Train Loss:0.6814016007288858 | Validation loss:0.5795343809984099
Epoch:0| Batch 283 | Train Loss:0.6631879326953015 | Validation loss:0.5126204595599376
Validation loss decreased from 0.5502879991078041 to 0.5126204595599376.



HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:1| Batch 0 | Train Loss:0.40184837579727173 | Validation loss:0.5077774673700333
Validation loss decreased from 0.5126204595599376 to 0.5077774673700333.
Epoch:1| Batch 80 | Train Loss:0.37857605424928076 | Validation loss:0.5561486335707383
Epoch:1| Batch 160 | Train Loss:0.38805207989600876 | Validation loss:0.4991651993822044
Validation loss decreased from 0.5077774673700333 to 0.4991651993822044.
Epoch:1| Batch 180 | Train Loss:0.3891274851659385 | Validation loss:0.49792610108852386
Validation loss decreased from 0.4991651993822044 to 0.49792610108852386.
Epoch:1| Batch 240 | Train Loss:0.39191569363180534 | Validation loss:0.5089700654358931
Epoch:1| Batch 283 | Train Loss:0.3923497961860308 | Validation loss:0.48260061149026307
Validation loss decreased from 0.49792610108852386 to 0.48260061149026307.



HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:2| Batch 0 | Train Loss:0.18602171540260315 | Validation loss:0.48046331359467037
Validation loss decreased from 0.48260061149026307 to 0.48046331359467037.
Epoch:2| Batch 40 | Train Loss:0.2312888906496327 | Validation loss:0.4736556593381183
Validation loss decreased from 0.48046331359467037 to 0.4736556593381183.
Epoch:2| Batch 80 | Train Loss:0.2457384255565243 | Validation loss:0.47102875293980184
Validation loss decreased from 0.4736556593381183 to 0.47102875293980184.
Epoch:2| Batch 120 | Train Loss:0.24555043746863514 | Validation loss:0.48274661810465264
Epoch:2| Batch 160 | Train Loss:0.2431661495881051 | Validation loss:0.48633627047840977
Epoch:2| Batch 200 | Train Loss:0.242476646431643 | Validation loss:0.48053263589529926
Epoch:2| Batch 240 | Train Loss:0.24238047968302526 | Validation loss:0.4783144987804789
Epoch:2| Batch 280 | Train Loss:0.24114390937246885 | Validation loss:0.478531173837017
Epoch:2| Batch 283 | Train Loss:0.24024855062155656 | Validation loss:

HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:3| Batch 0 | Train Loss:0.18438643217086792 | Validation loss:0.4785375739906875
Epoch:3| Batch 40 | Train Loss:0.17751744298673258 | Validation loss:0.4810113640318454
Epoch:3| Batch 80 | Train Loss:0.17578052205068093 | Validation loss:0.47565999669088443
Epoch:3| Batch 120 | Train Loss:0.17192983498011738 | Validation loss:0.47701064528713766
Epoch:3| Batch 160 | Train Loss:0.17514169512328154 | Validation loss:0.4829584311851313
Epoch:3| Batch 200 | Train Loss:0.17370702659905846 | Validation loss:0.47577447496669395
Epoch:3| Batch 240 | Train Loss:0.1738470813792771 | Validation loss:0.4807583295123678
Epoch:3| Batch 280 | Train Loss:0.1750040429236626 | Validation loss:0.4772388299166317
Epoch:3| Batch 283 | Train Loss:0.17484213784337044 | Validation loss:0.4772388299166317



Some weights of the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_large.bin were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_large.bin and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probab

################################
Training Fold 2
################################


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:0| Batch 0 | Train Loss:1.010431170463562 | Validation loss:1.358408812066199
Validation loss decreased from inf to 1.358408812066199.
Epoch:0| Batch 80 | Train Loss:0.8191278882232713 | Validation loss:0.6797250908025554
Validation loss decreased from 1.358408812066199 to 0.6797250908025554.
Epoch:0| Batch 160 | Train Loss:0.7562767979891404 | Validation loss:0.6345247475194259
Validation loss decreased from 0.6797250908025554 to 0.6345247475194259.
Epoch:0| Batch 240 | Train Loss:0.7002484679098446 | Validation loss:0.6528056184170952
Epoch:0| Batch 283 | Train Loss:0.6756291055238582 | Validation loss:0.5173266030533213
Validation loss decreased from 0.6345247475194259 to 0.5173266030533213.



HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:1| Batch 0 | Train Loss:0.4655999541282654 | Validation loss:0.5158079220375544
Validation loss decreased from 0.5173266030533213 to 0.5158079220375544.
Epoch:1| Batch 80 | Train Loss:0.43943336973955605 | Validation loss:0.6278873942267726
Epoch:1| Batch 160 | Train Loss:0.45506932813188306 | Validation loss:0.6003325942536475
Epoch:1| Batch 240 | Train Loss:0.4547116723050715 | Validation loss:0.5277907634285134
Epoch:1| Batch 283 | Train Loss:0.4471415855305296 | Validation loss:0.5071575969037875
Validation loss decreased from 0.5158079220375544 to 0.5071575969037875.



HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:2| Batch 0 | Train Loss:0.36897706985473633 | Validation loss:0.5044519418561962
Validation loss decreased from 0.5071575969037875 to 0.5044519418561962.
Epoch:2| Batch 80 | Train Loss:0.3200125605971725 | Validation loss:0.482143082878959
Validation loss decreased from 0.5044519418561962 to 0.482143082878959.
Epoch:2| Batch 120 | Train Loss:0.3151139940850991 | Validation loss:0.5205151119702299
Epoch:2| Batch 160 | Train Loss:0.31260972132223735 | Validation loss:0.47414944037585194
Validation loss decreased from 0.482143082878959 to 0.47414944037585194.
Epoch:2| Batch 200 | Train Loss:0.3133458944398965 | Validation loss:0.47375352659695585
Validation loss decreased from 0.47414944037585194 to 0.47375352659695585.
Epoch:2| Batch 240 | Train Loss:0.3097304256625195 | Validation loss:0.5045812969476404
Epoch:2| Batch 280 | Train Loss:0.308657537814349 | Validation loss:0.471026964170832
Validation loss decreased from 0.47375352659695585 to 0.471026964170832.
Epoch:2| Batch 283 |

HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:3| Batch 0 | Train Loss:0.19304391741752625 | Validation loss:0.47331411192114925
Epoch:3| Batch 40 | Train Loss:0.24580236070039796 | Validation loss:0.47881307043659854
Epoch:3| Batch 80 | Train Loss:0.24912666317857343 | Validation loss:0.46951942838413613
Validation loss decreased from 0.471026964170832 to 0.46951942838413613.
Epoch:3| Batch 120 | Train Loss:0.2437483831501204 | Validation loss:0.4776994986853129
Epoch:3| Batch 160 | Train Loss:0.23862480626713414 | Validation loss:0.4727218050352285
Epoch:3| Batch 200 | Train Loss:0.23890508674270478 | Validation loss:0.477605047989899
Epoch:3| Batch 240 | Train Loss:0.24184762924538608 | Validation loss:0.47244666516780853
Epoch:3| Batch 280 | Train Loss:0.24293261946732464 | Validation loss:0.47707848880492465
Epoch:3| Batch 283 | Train Loss:0.24294763551631443 | Validation loss:0.47707848880492465



Some weights of the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_large.bin were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_large.bin and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probab

################################
Training Fold 3
################################


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:0| Batch 0 | Train Loss:1.0791959762573242 | Validation loss:1.3853414738681955
Validation loss decreased from inf to 1.3853414738681955.
Epoch:0| Batch 80 | Train Loss:0.8272557449929508 | Validation loss:0.6484480743676844
Validation loss decreased from 1.3853414738681955 to 0.6484480743676844.
Epoch:0| Batch 160 | Train Loss:0.7211102541559231 | Validation loss:0.5802848020070036
Validation loss decreased from 0.6484480743676844 to 0.5802848020070036.
Epoch:0| Batch 240 | Train Loss:0.6860107207570333 | Validation loss:0.5795477567843987
Validation loss decreased from 0.5802848020070036 to 0.5795477567843987.
Epoch:0| Batch 283 | Train Loss:0.6665791025988652 | Validation loss:0.5922002867913582



HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:1| Batch 0 | Train Loss:0.5650491714477539 | Validation loss:0.5916469600838674
Epoch:1| Batch 80 | Train Loss:0.4312630420849647 | Validation loss:0.5243520379905969
Validation loss decreased from 0.5795477567843987 to 0.5243520379905969.
Epoch:1| Batch 160 | Train Loss:0.40499229766197087 | Validation loss:0.4799972048527758
Validation loss decreased from 0.5243520379905969 to 0.4799972048527758.
Epoch:1| Batch 200 | Train Loss:0.4016747361985012 | Validation loss:0.5183429596289783
Epoch:1| Batch 240 | Train Loss:0.3981092801603539 | Validation loss:0.4777985326421093
Validation loss decreased from 0.4799972048527758 to 0.4777985326421093.
Epoch:1| Batch 280 | Train Loss:0.3942488671621818 | Validation loss:0.4687999898279217
Validation loss decreased from 0.4777985326421093 to 0.4687999898279217.
Epoch:1| Batch 283 | Train Loss:0.3932316263376827 | Validation loss:0.4687999898279217



HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:2| Batch 0 | Train Loss:0.33261990547180176 | Validation loss:0.4673892637373696
Validation loss decreased from 0.4687999898279217 to 0.4673892637373696.
Epoch:2| Batch 40 | Train Loss:0.2606367348897748 | Validation loss:0.47532477995879213
Epoch:2| Batch 80 | Train Loss:0.2617100872743277 | Validation loss:0.4775586243666394
Epoch:2| Batch 120 | Train Loss:0.25660360770777235 | Validation loss:0.4638326717934138
Validation loss decreased from 0.4673892637373696 to 0.4638326717934138.
Epoch:2| Batch 160 | Train Loss:0.2554637606006972 | Validation loss:0.4571227662160363
Validation loss decreased from 0.4638326717934138 to 0.4571227662160363.
Epoch:2| Batch 200 | Train Loss:0.2507705833410742 | Validation loss:0.4634958740691064
Epoch:2| Batch 240 | Train Loss:0.2457619462765104 | Validation loss:0.4584468032272769
Epoch:2| Batch 280 | Train Loss:0.24489327496268995 | Validation loss:0.46913403377566537
Epoch:2| Batch 283 | Train Loss:0.24464979547430094 | Validation loss:0.4691

HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:3| Batch 0 | Train Loss:0.10082424432039261 | Validation loss:0.467007513197375
Epoch:3| Batch 40 | Train Loss:0.17710344784143495 | Validation loss:0.46084149454680967
Epoch:3| Batch 80 | Train Loss:0.17978678832267536 | Validation loss:0.4565871616064663
Validation loss decreased from 0.4571227662160363 to 0.4565871616064663.
Epoch:3| Batch 120 | Train Loss:0.1850499764146391 | Validation loss:0.4716263365157893
Epoch:3| Batch 160 | Train Loss:0.1856736226805619 | Validation loss:0.4584699273109436
Epoch:3| Batch 200 | Train Loss:0.18407055830110364 | Validation loss:0.46284751132340496
Epoch:3| Batch 240 | Train Loss:0.18522831400338544 | Validation loss:0.4618119924840793
Epoch:3| Batch 280 | Train Loss:0.1853112712157791 | Validation loss:0.45701837140909385
Epoch:3| Batch 283 | Train Loss:0.18496394956248327 | Validation loss:0.45701837140909385



Some weights of the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_large.bin were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_large.bin and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probab

################################
Training Fold 4
################################


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:0| Batch 0 | Train Loss:1.2930927276611328 | Validation loss:1.265312889931907
Validation loss decreased from inf to 1.265312889931907.
Epoch:0| Batch 80 | Train Loss:0.8215533207964014 | Validation loss:0.7354658535668548
Validation loss decreased from 1.265312889931907 to 0.7354658535668548.
Epoch:0| Batch 160 | Train Loss:0.7347561787374272 | Validation loss:0.7050184624295839
Validation loss decreased from 0.7354658535668548 to 0.7050184624295839.
Epoch:0| Batch 240 | Train Loss:0.686449088125308 | Validation loss:0.5525530329052831
Validation loss decreased from 0.7050184624295839 to 0.5525530329052831.
Epoch:0| Batch 283 | Train Loss:0.6675378975104278 | Validation loss:0.5468806840584312
Validation loss decreased from 0.5525530329052831 to 0.5468806840584312.



HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:1| Batch 0 | Train Loss:0.5113670229911804 | Validation loss:0.5617832495293147
Epoch:1| Batch 80 | Train Loss:0.41836675725601336 | Validation loss:0.522825215064304
Validation loss decreased from 0.5468806840584312 to 0.522825215064304.
Epoch:1| Batch 160 | Train Loss:0.4071887458518425 | Validation loss:0.5029745227854017
Validation loss decreased from 0.522825215064304 to 0.5029745227854017.
Epoch:1| Batch 240 | Train Loss:0.3986223547785114 | Validation loss:0.5018246112994744
Validation loss decreased from 0.5029745227854017 to 0.5018246112994744.
Epoch:1| Batch 283 | Train Loss:0.39728344121659304 | Validation loss:0.492489153230694
Validation loss decreased from 0.5018246112994744 to 0.492489153230694.



HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:2| Batch 0 | Train Loss:0.16038724780082703 | Validation loss:0.4913192613863609
Validation loss decreased from 0.492489153230694 to 0.4913192613863609.
Epoch:2| Batch 60 | Train Loss:0.2390093678822283 | Validation loss:0.4868868272489225
Validation loss decreased from 0.4913192613863609 to 0.4868868272489225.
Epoch:2| Batch 80 | Train Loss:0.2371081292261312 | Validation loss:0.48926026048794596
Epoch:2| Batch 120 | Train Loss:0.23879925758878062 | Validation loss:0.48390443392202886
Validation loss decreased from 0.4868868272489225 to 0.48390443392202886.
Epoch:2| Batch 160 | Train Loss:0.24325920261951708 | Validation loss:0.4859177455515929
Epoch:2| Batch 200 | Train Loss:0.23908830011513696 | Validation loss:0.4861511225011987
Epoch:2| Batch 240 | Train Loss:0.23871735804926805 | Validation loss:0.4800774509638128
Validation loss decreased from 0.48390443392202886 to 0.4800774509638128.
Epoch:2| Batch 280 | Train Loss:0.23741460962757946 | Validation loss:0.4801594770290482

HBox(children=(FloatProgress(value=0.0, max=284.0), HTML(value='')))

Epoch:3| Batch 0 | Train Loss:0.1288483887910843 | Validation loss:0.4825835276237676
Epoch:3| Batch 40 | Train Loss:0.15192920805477514 | Validation loss:0.48153174993857534
Epoch:3| Batch 80 | Train Loss:0.1729275909838853 | Validation loss:0.4825236073262255
Epoch:3| Batch 120 | Train Loss:0.17500811713663014 | Validation loss:0.4869215662630511
Epoch:3| Batch 160 | Train Loss:0.1688942051498416 | Validation loss:0.4799799986288581
Validation loss decreased from 0.4800774509638128 to 0.4799799986288581.
Epoch:3| Batch 200 | Train Loss:0.16901029848414867 | Validation loss:0.47886615886654654
Validation loss decreased from 0.4799799986288581 to 0.47886615886654654.
Epoch:3| Batch 240 | Train Loss:0.17174281979064723 | Validation loss:0.48108336518348105
Epoch:3| Batch 280 | Train Loss:0.17128354993738312 | Validation loss:0.4774508830946936
Validation loss decreased from 0.47886615886654654 to 0.4774508830946936.
Epoch:3| Batch 283 | Train Loss:0.1715512820716265 | Validation loss:0.

In [None]:
# Results of cross validation for seed 42: Train : 0.3348806785935321, Val : 0.48983509208954557 LB 0.489
# Results of cross validation for seed 42: Train : 0.2606250740913959, Val : 0.4757214925658535 LB 0.468
# Results of cross validation for seed 42: Train : 0.20288117347075163, Val : 0.4683537180574847 LB 0.464 Change the shape to 768
# increase accumulation steps from 4 to 8
# Results of cross validation for seed 42: Train : 0.29265708032267856, Val : 0.4810768190404059
# Same accumulation step but proper learning rate !
# Results of cross validation for seed 42: Train : 0.23795280918271736, Val : 0.4706005893122982 LB ?
# change layer norm eps 1e-2
# Results of cross validation for seed 12: Train : 0.2127049908399834, Val : 0.46800723113644294
# no layer norm in roberta
# Results of cross validation for seed 12: Train : 0.2150556268539044, Val : 0.46881619830366594