# Imports

In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install transformers



In [None]:
import pandas as pd
import numpy as np

import random
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn

import os
from collections import defaultdict
from tqdm import tqdm_notebook

from transformers import AutoConfig, AutoTokenizer, AutoModel
import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

import torch.nn.init as init
import torch.nn.functional as F
from torch.nn import Parameter
from torch.autograd.function import InplaceFunction
import math

from torch.utils.data import Sampler, Dataset, DataLoader
import random

from more_itertools import chunked, flatten

In [None]:
%cd drive/MyDrive/CommonLit

/content/drive/MyDrive/CommonLit


# Get folds

In [None]:
# Training data read from the current working directory. Later cells read the columns
# `excerpt`, `target`, `standard_error` and `kfold` from this frame.
df = pd.read_csv("train_folds.csv")

# Seed Everything

In [None]:
def seed_everything(seed=12):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, ``PYTHONHASHSEED``, NumPy and PyTorch (CPU and
    all CUDA devices) and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark mode lets cuDNN pick algorithms by timing them, which can differ
    # between runs; it must be off for the deterministic flag above to be meaningful.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=12)

# Configuration

In [None]:
# Maximum number of tokens per excerpt; passed to the tokenizer as `max_length`.
MAX_LEN = 256
# Number of passes over the training data performed by train_fn.
EPOCHS = 3
# Model name passed to `from_pretrained` for the tokenizer and the model config.
ROBERTA_PATH = "roberta-base"
# Path passed to `AutoModel.from_pretrained` inside RobertaModel (relative to the working directory).
PRETRAIN_PATH = "Pretrain_CLRP_Roberta/pretrained_roberta_base.bin"
# Batch sizes used by create_dataloader for the training and validation loaders.
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 8
# NOTE(review): not referenced by the cells visible here; train_fn sets learning
# rates from its own per-epoch list.
LEARNING_RATE = 2e-5
# Shared tokenizer instance used by the dataset classes below.
TOKENIZER = transformers.AutoTokenizer.from_pretrained(ROBERTA_PATH)

# Dataset

In [None]:
# class RobertaDatasetMW:
#     def __init__(self,df):
#         self.excerpt = df.excerpt.values
#         self.target = df.target.values

#     def __len__(self):
#         return len(self.excerpt)
    
#     def __getitem__(self,item):
#         excerpt = str(self.excerpt[item])
#         excerpt = " ".join(excerpt.split())
#         inputs = TOKENIZER(excerpt, add_special_tokens=True, max_length=MAX_LEN, padding=True, truncation=True)
        
#         ids = inputs["input_ids"]
#         mask = inputs["attention_mask"]
        
#         padding_len = MAX_LEN-len(ids)
#         ids = ids+([0]*padding_len)
#         mask = mask+([0]*padding_len)
 
#         return {"ids": ids,
#             "mask": mask,
#             "targets": torch.tensor(self.target[item], dtype=torch.float)}

In [None]:
class RobertaDataset:
    """Map-style dataset that yields fixed-length tensors for each excerpt.

    Every item is tokenized with the module-level ``TOKENIZER`` and padded or
    truncated to exactly ``MAX_LEN`` tokens.

    Padding is delegated to the tokenizer (``padding="max_length"``) so the
    padded positions carry the tokenizer's own pad token id. The previous
    manual padding appended id 0, which for roberta-base is the ``<s>`` token
    rather than ``<pad>`` (id 1).
    """

    def __init__(self,df):
        """Store the `excerpt` and `target` columns of ``df`` as arrays."""
        self.excerpt = df.excerpt.values
        self.target = df.target.values

    def __len__(self):
        """Return the number of excerpts."""
        return len(self.excerpt)

    def __getitem__(self,item):
        """Return a dict with ``ids``, ``mask`` (long tensors of length MAX_LEN) and ``targets`` (float tensor)."""
        # Collapse every run of whitespace (including newlines) into one space.
        excerpt = " ".join(str(self.excerpt[item]).split())
        inputs = TOKENIZER(excerpt, add_special_tokens=True, max_length=MAX_LEN,
                           padding="max_length", truncation=True)

        return {"ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float)}

In [None]:
# def mask_collator(batch):
    
#     ids = [sample["ids"] for sample in batch]
#     mask = [sample["mask"] for sample in batch]
#     targets = [sample["targets"] for sample in batch]

#     mlm_probability = 0.05
    
#     input_ids = []
#     for sentence in ids:
#         r = np.random.uniform(0,1,len(sentence))
#         sentence = np.array(sentence)
#         sentence[(r < mlm_probability)&(sentence!=0)&(sentence!=4)&(sentence!=2)] = 50264
#         sentence = list(sentence)
#         input_ids.append(sentence)
    
#     return {"ids": torch.tensor(input_ids, dtype=torch.long),
#             "mask": torch.tensor(mask, dtype=torch.long),
#             "targets": torch.tensor(targets, dtype=torch.float)}

# Smart Batching

In [None]:
class RobertaDatasetSmart:
    """Dataset variant that returns un-padded token lists.

    Items keep their natural length (truncated to ``MAX_LEN``); padding is
    left to the collate function used by the data loader.
    """

    def __init__(self,df):
        """Copy the `excerpt`, `target` and `standard_error` columns of ``df`` onto the instance."""
        for column in ("excerpt", "target", "standard_error"):
            setattr(self, column, getattr(df, column).values)

    def __len__(self):
        """Number of excerpts held by the dataset."""
        return len(self.excerpt)

    def __getitem__(self,item):
        """Tokenize one excerpt and return ``ids``/``mask`` as plain lists plus a float ``targets`` tensor."""
        raw_text = str(self.excerpt[item])
        # Normalise whitespace: any run of blanks/newlines becomes a single space.
        normalized = " ".join(raw_text.split())
        encoded = TOKENIZER(normalized, add_special_tokens=True, max_length=MAX_LEN, padding=False, truncation=True)

        sample = dict(ids=encoded["input_ids"], mask=encoded["attention_mask"])
        sample["targets"] = torch.tensor(self.target[item], dtype=torch.float)
        return sample

In [None]:
class SmartBatchingSampler(Sampler):
    """Sampler that groups samples of similar token length into batches.

    Indices are sorted by ``len(sample["ids"])``; batches are then formed by
    repeatedly removing a random contiguous slice of up to ``batch_size``
    indices from the sorted list. The batches are built once in ``__init__``,
    so every pass over the sampler yields the same order.

    Args:
        dataset: Sized iterable whose items are dicts with an ``"ids"`` sequence.
        batch_size: Positive number of indices per batch.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (the batching loop
            would otherwise never terminate).
    """

    def __init__(self, dataset, batch_size):
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        self.sample_lengths = [len(seq["ids"]) for seq in dataset]
        self.sorted_input_ids = list(np.argsort(self.sample_lengths))
        self.batches = []
        self.len = len(dataset)

        while len(self.sorted_input_ids) > 0:
            # Only the final batch can be shorter than batch_size: it takes
            # everything that is left and ends the loop.
            to_take = min(batch_size, len(self.sorted_input_ids))
            select = random.randint(0, len(self.sorted_input_ids) - to_take)
            batch = self.sorted_input_ids[select:(select + to_take)]
            self.batches.append(batch)
            del self.sorted_input_ids[select:select + to_take]

    def __iter__(self):
        # Yield indices batch after batch as one flat stream.
        for batch in self.batches:
            yield from batch

    def __len__(self):
        return self.len

In [None]:
def pad_collator(batch, pad_token_id=None):
    """Collate variable-length samples by padding to the longest one in the batch.

    Args:
        batch: List of dicts with ``"ids"`` and ``"mask"`` sequences and a
            scalar ``"targets"`` value.
        pad_token_id: Id appended to ``ids`` at padded positions. When omitted,
            the pad id of the module-level ``TOKENIZER`` is used; the previous
            hard-coded 0 is not the pad id of the RoBERTa vocabulary.

    Returns:
        Dict with long tensors ``ids`` and ``mask`` of shape
        (len(batch), longest sequence) and a float tensor ``targets``.
    """
    if pad_token_id is None:
        pad_token_id = TOKENIZER.pad_token_id

    ids = [sample["ids"] for sample in batch]
    mask = [sample["mask"] for sample in batch]
    targets = [sample["targets"] for sample in batch]

    max_len = max(len(text) for text in ids)

    input_ids = []
    attention_mask = []
    for i,m in zip(ids, mask):
        padding_len = max_len-len(i)
        input_ids.append(list(i)+([pad_token_id]*padding_len))
        # Padded positions are always masked out with 0.
        attention_mask.append(list(m)+([0]*padding_len))

    return {"ids": torch.tensor(input_ids, dtype=torch.long),
            "mask": torch.tensor(attention_mask, dtype=torch.long),
            "targets": torch.tensor(targets, dtype=torch.float)}

# Model Building

In [None]:
class RobertaModel(nn.Module):
    """Encoder with a pooling head that regresses one value per excerpt.

    Two pooling strategies are supported, chosen by ``model_type``:

    * ``"attention"`` - a learned attention over the token positions of the
      last hidden state; padded positions (``mask == 0``) receive no weight.
    * ``"mean"`` - a mask-aware mean over the last hidden state.

    Head sizes are derived from ``config.hidden_size`` instead of hard-coded
    widths, so the "mean" head now matches the encoder loaded here.

    NOTE(review): checkpoints trained before padding was masked in the
    attention head will produce slightly different outputs on padded inputs.

    Args:
        model_type: ``"attention"`` (default) or ``"mean"``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
    """

    def __init__(self, model_type="attention"):
        super(RobertaModel,self).__init__()

        if model_type not in ("attention", "mean"):
            raise ValueError(f"Unknown model_type: {model_type}")
        self.model_type = model_type

        self.config = AutoConfig.from_pretrained(ROBERTA_PATH)
        self.config.update({"output_hidden_states":True,
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})
        self.roberta = transformers.AutoModel.from_pretrained(PRETRAIN_PATH, config=self.config)

        hidden_size = self.config.hidden_size

        if model_type == "attention":
            # The trailing Softmax normalises over dim=1, the token axis.
            self.attention = nn.Sequential(
                nn.Linear(hidden_size, 256),
                nn.Tanh(),
                nn.Linear(256, 1),
                nn.Softmax(dim=1)
            )

        # Regression head shared by both pooling strategies.
        self.layer_norm1 = nn.LayerNorm(hidden_size)
        self.linear1 = nn.Linear(hidden_size, 256)
        self.linear2 = nn.Linear(256, 1)
        self.layer_norm2 = nn.LayerNorm(256)

    def freeze(self):
        """Disable gradients for every encoder parameter."""
        for param in self.roberta.parameters():
            param.requires_grad = False

    def unfreeze(self):
        """Enable gradients for every encoder parameter."""
        for param in self.roberta.parameters():
            param.requires_grad = True

    def forward(self, ids, mask, loss_fn = None, targets = None):
        """Run the encoder and the pooling head.

        Args:
            ids: Token ids, one row per sample.
            mask: Attention mask with the same shape as ``ids`` (0 marks padding).
            loss_fn: Callable applied to flattened predictions and targets;
                required when ``targets`` is given.
            targets: Optional regression targets.

        Returns:
            ``logits`` when ``targets`` is None, otherwise ``(loss, logits)``
            where ``loss`` is the square root of ``loss_fn``'s value.

        Raises:
            ValueError: If ``targets`` is given without ``loss_fn``.
        """
        if self.model_type == "mean":

            outputs = self.roberta(ids, mask)
            last_hidden_state = outputs[0]
            input_mask_expanded = mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
            sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
            sum_mask = input_mask_expanded.sum(1)
            # Guard against division by zero for rows without any real token.
            sum_mask = torch.clamp(sum_mask, min=1e-9)
            mean_embeddings = sum_embeddings / sum_mask
            norm_mean_embeddings = self.layer_norm1(mean_embeddings)
            logits = self.linear1(norm_mean_embeddings)
            logits = self.linear2(self.layer_norm2(logits))

        elif self.model_type=="attention":

            roberta_output = self.roberta(input_ids=ids,
                                  attention_mask=mask)
            last_layer_hidden_states = roberta_output.last_hidden_state
            # Compute the pre-softmax scores, push padded positions to the most
            # negative finite value, then apply the head's own Softmax so that
            # padding gets (numerically) zero attention weight.
            scores = self.attention[:-1](last_layer_hidden_states)
            scores = scores.masked_fill(mask.unsqueeze(-1) == 0, torch.finfo(scores.dtype).min)
            weights = self.attention[-1](scores)
            context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)
            norm_context_vector = self.layer_norm1(context_vector)
            logits = self.linear1(norm_context_vector)
            logits = self.linear2(self.layer_norm2(logits))

        if targets is not None:
            if loss_fn is None:
                raise ValueError("loss_fn must be provided when targets are given")

            loss = torch.sqrt(loss_fn(logits.view(-1),targets.view(-1)))
            return loss, logits

        return logits

# Evaluation Scheduler

In [None]:
def evaluate(EVAL_STEPS,valid_interval, valid_loss,train_loss, final_train_loss, index, best_loss, epoch):
    """Log one validation result and checkpoint the model when it improved.

    When ``valid_loss`` is lower than ``best_loss`` the validation interval is
    re-derived from ``EVAL_STEPS`` (first entry whose threshold is below the
    new loss), the running train loss is recorded and the weights of the
    module-level ``model`` are saved under a name built from the module-level
    ``fold``.

    Returns:
        Tuple ``(valid_interval, best_loss, final_train_loss)``, updated only
        when the validation loss improved.
    """
    print(f"Epoch:{epoch}| Batch {index} | Train Loss:{train_loss.avg()} | Validation loss:{valid_loss}")

    improved = valid_loss < best_loss
    if not improved:
        return valid_interval, best_loss, final_train_loss

    # Keep the current interval if no threshold lies below the new loss.
    valid_interval = next(
        (steps for rmse, steps in EVAL_STEPS if valid_loss > rmse),
        valid_interval,
    )

    print(f"Validation loss decreased from {best_loss} to {valid_loss}.")
    final_train_loss = train_loss.avg()
    checkpoint_path = f'Models/CodeRobertaBaseAttentionNorm/model{fold}.bin'
    torch.save(model.state_dict(), checkpoint_path)

    return valid_interval, valid_loss, final_train_loss

# Training Function

In [None]:
def train_fn(train_dataloader, valid_dataloader, model, optimizer, device, scheduler):
    """Train ``model`` for ``EPOCHS`` epochs with periodic validation.

    Args:
        train_dataloader: Loader yielding dicts with ``ids``, ``mask``, ``targets``.
        valid_dataloader: Loader passed to ``eval_fn`` at every validation step.
        model: Module called as ``model(ids=, mask=, loss_fn=, targets=)`` and
            returning ``(loss, outputs)``.
        optimizer: Optimizer whose learning rate is reset at each epoch.
        device: Device the batch tensors are moved to.
        scheduler: Callable ``scheduler(optimizer, lr)`` returning the optimizer
            to use for the epoch.

    Returns:
        Tuple ``(final_train_loss, best_loss)`` as last reported by ``evaluate``.
    """
    # (validation-loss threshold, batches between validations)
    EVAL_STEPS = [(0.50,20),(0.49,10), (0.48, 10), (-1., 5)]
    valid_interval = EVAL_STEPS[0][1]
    best_loss = np.inf
    final_train_loss = None
    accumulation_steps = 1
    # lr_schedule = [5e-5, 2e-5, 5e-6, 2e-6]
    lr_schedule = [4e-5, 2e-5, 1e-5, 5e-6,2.5e-6]
    n_batches = len(train_dataloader)

    for epoch in range(EPOCHS):

        train_loss = AvgCounter()
        # Reuse the last entry when EPOCHS exceeds the schedule length
        # instead of raising IndexError.
        lr = lr_schedule[min(epoch, len(lr_schedule) - 1)]
        optimizer = scheduler(optimizer,lr)

        for index, d in tqdm_notebook(enumerate(train_dataloader), total=n_batches):

            ids = d["ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)
            targets = d["targets"].to(device, dtype=torch.float)

            # eval_fn switches the model to eval mode, so re-enable training every step.
            model.train()
            loss, _ = model(ids=ids, mask=mask, loss_fn = loss_fn, targets = targets)

            # Weight by the number of samples in the batch; len(d) would be the
            # number of keys in the batch dict.
            train_loss.update(loss.item(), ids.size(0))
            loss = loss / accumulation_steps
            loss.backward()

            is_last_batch = index == n_batches - 1

            # Step after every `accumulation_steps` batches, and flush any
            # remaining gradients at the end of the epoch.
            if (index + 1) % accumulation_steps == 0 or is_last_batch:
                optimizer.step()
                # scheduler.step()
                optimizer.zero_grad()

            if index % valid_interval == 0 or is_last_batch:

                valid_loss = eval_fn(valid_dataloader,model,device)

                valid_interval, best_loss, final_train_loss = evaluate(EVAL_STEPS,valid_interval, valid_loss,train_loss, final_train_loss, index, best_loss, epoch )

    return final_train_loss, best_loss

# Evaluation Function

In [None]:
def eval_fn(data_loader, model, device):
    """Compute the validation loss of ``model`` over ``data_loader``.

    The model is switched to eval mode and run without gradient tracking.

    Args:
        data_loader: Loader yielding dicts with ``ids``, ``mask``, ``targets``.
        model: Module called as ``model(ids=, mask=, loss_fn=, targets=)`` and
            returning ``(loss, outputs)``.
        device: Device the batch tensors are moved to.

    Returns:
        Mean of the per-batch losses, weighted by the number of samples in
        each batch.
    """
    model.eval()
    valid_loss = AvgCounter()

    with torch.no_grad():
        for d in data_loader:
            ids = d["ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)
            targets = d["targets"].to(device, dtype=torch.float)

            loss, _ = model(ids=ids, mask=mask, loss_fn = loss_fn, targets = targets)

            # Weight by the batch size; len(d) would be the number of keys in
            # the batch dict and would over-weight a short final batch.
            valid_loss.update(loss.item(), ids.size(0))

    return valid_loss.avg()

# AvgCounter

In [None]:
class AvgCounter:
    """Running weighted average of a loss value.

    ``loss`` accumulates ``value * weight`` and ``n_samples`` accumulates the
    weights; ``avg()`` is their ratio.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the accumulated sum and the weight total."""
        self.loss, self.n_samples = 0, 0

    def update(self, loss, n_samples):
        """Add ``loss`` weighted by ``n_samples`` to the running totals."""
        weighted = loss * n_samples
        self.loss += weighted
        self.n_samples += n_samples

    def avg(self):
        """Return the accumulated sum divided by the accumulated weight."""
        total, count = self.loss, self.n_samples
        return total / count

# Useful Functions

In [None]:
# Build the train/validation data loaders for one fold
def create_dataloader(df, fold, smart = True):
    """Split ``df`` on its ``kfold`` column and wrap both parts in data loaders.

    Rows whose ``kfold`` equals ``fold`` form the validation set, all other
    rows the training set.

    Args:
        df: Frame holding the excerpts, targets and a ``kfold`` column.
        fold: Fold value held out for validation.
        smart: When true, use length-grouped sampling with per-batch padding;
            otherwise use fixed-length items with a random training sampler.

    Returns:
        Tuple ``(train_dataloader, valid_dataloader)``.
    """
    make_loader = torch.utils.data.DataLoader

    train = df[df.kfold!=fold].reset_index(drop=True)
    valid = df[df.kfold==fold].reset_index(drop=True)

    if not smart:
        train_dataset = RobertaDataset(train)
        valid_dataset = RobertaDataset(valid)
        shuffler = torch.utils.data.RandomSampler(train_dataset)
        return (
            make_loader(train_dataset, batch_size=TRAIN_BATCH_SIZE, sampler=shuffler),
            make_loader(valid_dataset, batch_size=VALID_BATCH_SIZE),
        )

    train_dataset = RobertaDatasetSmart(train)
    valid_dataset = RobertaDatasetSmart(valid)

    loaders = []
    for dataset, batch_size in ((train_dataset, TRAIN_BATCH_SIZE), (valid_dataset, VALID_BATCH_SIZE)):
        length_sampler = SmartBatchingSampler(dataset, batch_size)
        loaders.append(
            make_loader(dataset, batch_size=batch_size, sampler=length_sampler, collate_fn=pad_collator)
        )

    train_dataloader, valid_dataloader = loaders
    return train_dataloader, valid_dataloader

In [None]:
def create_model(device):
    """Build a ``RobertaModel`` on ``device`` with its embedding layer frozen.

    Args:
        device: Device the model is moved to.

    Returns:
        The model, with gradients disabled for ``roberta.embeddings``.
    """
    network = RobertaModel()
    network = network.to(device)
    network.roberta.embeddings.requires_grad_(False)
    return network

In [None]:
# create the optimizer
def create_optimizer(model):
    """Build AdamW with a layer-wise decaying learning rate for the encoder.

    The top encoder layer starts at 3e-5 and each layer below is multiplied by
    0.975; all parameters whose name does not contain ``"roberta"`` (the
    regression head) use 2e-5.

    Parameters whose name matches ``no_decay`` (e.g. biases) were previously
    filtered out of the encoder groups and ended up in no group at all, so the
    optimizer never updated them. They are now added to a per-layer group with
    ``weight_decay`` 0.0.

    Args:
        model: Module exposing ``named_parameters()``; the number of encoder
            layers is read from ``model.config.num_hidden_layers`` when
            available and falls back to 12.

    Returns:
        The configured ``AdamW`` optimizer.
    """
    named_parameters = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']

    def _is_no_decay(name):
        return any(nd in name for nd in no_decay)

    n_layers = getattr(getattr(model, "config", None), "num_hidden_layers", 12)

    parameters = []
    lr = 3e-5
    regressor_lr = 2e-5
    for layer in range(n_layers - 1, -1, -1):
        # The trailing dot keeps layer 1 from also matching layers 10 and 11.
        layer_tag = f'encoder.layer.{layer}.'
        layer_named = [(n, p) for n, p in named_parameters if layer_tag in n]

        parameters.append({
            'params': [p for n, p in layer_named if not _is_no_decay(n)],
            'lr': lr
        })

        no_decay_params = [p for n, p in layer_named if _is_no_decay(n)]
        if no_decay_params:
            parameters.append({
                'params': no_decay_params,
                'lr': lr,
                'weight_decay': 0.0
            })

        lr *= 0.975

    regressor_params = {
      'params': [p for n,p in named_parameters if "roberta" not in n],
      'lr': regressor_lr
    }
    parameters.append(regressor_params)

    return AdamW(parameters)

In [None]:
# create the optimizer (single learning rate, decay / no-decay split)
def create_optimizer_simple(model):
    """Build AdamW with two parameter groups: no-decay and decay.

    Parameters whose name contains one of ``no_decay`` go into a group with
    ``weight_decay`` 0.0; every other parameter uses the optimizer defaults.
    The group option must be spelled ``weight_decay`` - the previous ``wd``
    key is not an option the optimizer reads, so it had no effect.

    Args:
        model: Module exposing ``named_parameters()``.

    Returns:
        The configured ``AdamW`` optimizer.
    """
    no_decay = ['bias', 'gamma', 'beta']
    named_parameters = list(model.named_parameters())

    no_decay_parameters = {
        'params': [p for n, p in named_parameters if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }

    decay_parameters = {
        'params': [p for n, p in named_parameters if not any(nd in n for nd in no_decay)]
    }

    parameters = [no_decay_parameters, decay_parameters]

    return AdamW(parameters)

In [None]:
# create scheduler
def create_scheduler(optimizer, num_warmup_steps, num_train_steps, scheduler_name = "get_cosine_schedule_with_warmup" ):
    """Create a warmup learning-rate scheduler by name.

    Args:
        optimizer: Optimizer the schedule is attached to.
        num_warmup_steps: Number of warmup steps. This argument is now honoured;
            it used to be ignored in favour of hard-coded values (0 for the
            linear schedule, 50 for the cosine schedule).
        num_train_steps: Total number of training steps.
        scheduler_name: ``"get_linear_schedule_with_warmup"`` or
            ``"get_cosine_schedule_with_warmup"`` (default).

    Returns:
        The scheduler built by the matching ``transformers`` helper.

    Raises:
        ValueError: If ``scheduler_name`` is not one of the supported names.
    """
    if scheduler_name == "get_linear_schedule_with_warmup":
        return get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_train_steps,
        )

    if scheduler_name == "get_cosine_schedule_with_warmup":
        return get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_train_steps,
        )

    # ValueError is a subclass of Exception, so existing handlers keep working.
    raise ValueError(f"Unknown scheduler: {scheduler_name}")

In [None]:
def scheduler(optimizer,lr):
    """Set ``lr`` as the learning rate of every parameter group.

    Any per-group learning rate configured earlier is overwritten.

    Returns:
        The same ``optimizer`` object, modified in place.
    """
    for group in optimizer.param_groups:
        group.update(lr=lr)
    return optimizer

# Training

In [None]:
# Cross-validated training: one fresh model per fold, best validation loss kept per fold.
SEED = 12
N_FOLDS = 5

loss_fn=nn.MSELoss()
loss=defaultdict(list)
results_val = {}
results_train = {}

# Fall back to the CPU when no GPU is available instead of failing on "cuda".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# `model` and `fold` must stay module-level names: evaluate() reads both when
# it saves a checkpoint.
for fold in range(N_FOLDS):

    seed_everything(SEED)

    model = create_model(device)

    print("################################")
    print(f"Training Fold {fold}")
    print("################################")

    train_dataloader, valid_dataloader = create_dataloader(df, fold, smart = False)
    num_train_steps = len(train_dataloader) * EPOCHS

    optimizer = create_optimizer(model)
    # scheduler = create_scheduler(optimizer, num_warmup_steps = 0, num_train_steps = num_train_steps )

    # Re-seed so the training loop starts from the same RNG state for every fold.
    seed_everything(SEED)

    results_train[fold], results_val[fold] = train_fn(train_dataloader,valid_dataloader, model, optimizer, device, scheduler)

print("################################")
print("RESULTS")
print("################################")
cv_val = np.mean([results_val[i] for i in range(N_FOLDS)])
cv_train = np.mean([results_train[i] for i in range(N_FOLDS)])
# Report the seed that was actually used (the message used to say 42).
print(f"Results of cross validation for seed {SEED}: Train : {cv_train}, Val : {cv_val}")

Some weights of the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_base.bin were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_base.bin and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able

################################
Training Fold 0
################################


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=71.0), HTML(value='')))

Epoch:0| Batch 0 | Train Loss:0.7923513650894165 | Validation loss:1.1315773750694704
Validation loss decreased from inf to 1.1315773750694704.
Epoch:0| Batch 20 | Train Loss:0.7022720291501 | Validation loss:0.5726539438039484
Validation loss decreased from 1.1315773750694704 to 0.5726539438039484.
Epoch:0| Batch 40 | Train Loss:0.6623189914517287 | Validation loss:0.5725836304711623
Validation loss decreased from 0.5726539438039484 to 0.5725836304711623.
Epoch:0| Batch 60 | Train Loss:0.632723626543264 | Validation loss:0.5162458125974091
Validation loss decreased from 0.5725836304711623 to 0.5162458125974091.
Epoch:0| Batch 70 | Train Loss:0.6240275900968364 | Validation loss:0.5838566611350422



HBox(children=(FloatProgress(value=0.0, max=71.0), HTML(value='')))

Epoch:1| Batch 0 | Train Loss:0.47334906458854675 | Validation loss:0.5670962514172138
Epoch:1| Batch 20 | Train Loss:0.4471422476427896 | Validation loss:0.5006726581445882
Validation loss decreased from 0.5162458125974091 to 0.5006726581445882.
Epoch:1| Batch 40 | Train Loss:0.43455057173240474 | Validation loss:0.4870706771461057
Validation loss decreased from 0.5006726581445882 to 0.4870706771461057.
Epoch:1| Batch 50 | Train Loss:0.4266965541185117 | Validation loss:0.46860667769338044
Validation loss decreased from 0.4870706771461057 to 0.46860667769338044.
Epoch:1| Batch 55 | Train Loss:0.4259349544133459 | Validation loss:0.47772178095830997
Epoch:1| Batch 60 | Train Loss:0.42672585854764844 | Validation loss:0.4776248298060726
Epoch:1| Batch 65 | Train Loss:0.4257525679740039 | Validation loss:0.4775362335880038
Epoch:1| Batch 70 | Train Loss:0.42493186660215887 | Validation loss:0.49636086674643237



HBox(children=(FloatProgress(value=0.0, max=71.0), HTML(value='')))

Epoch:2| Batch 0 | Train Loss:0.28441858291625977 | Validation loss:0.48755747940338834
Epoch:2| Batch 5 | Train Loss:0.3246716956297557 | Validation loss:0.4772646043082358
Epoch:2| Batch 10 | Train Loss:0.31300704045729205 | Validation loss:0.474333631950365
Epoch:2| Batch 15 | Train Loss:0.3108386155217886 | Validation loss:0.47060252893978444
Epoch:2| Batch 20 | Train Loss:0.307386447985967 | Validation loss:0.46617568987356106
Validation loss decreased from 0.46860667769338044 to 0.46617568987356106.
Epoch:2| Batch 25 | Train Loss:0.30324855332191175 | Validation loss:0.4723271754845767
Epoch:2| Batch 30 | Train Loss:0.3003646803479041 | Validation loss:0.46995916878673394
Epoch:2| Batch 35 | Train Loss:0.2966213615404235 | Validation loss:0.4840830085982739
Epoch:2| Batch 40 | Train Loss:0.3018399629651046 | Validation loss:0.5073697558591064
Epoch:2| Batch 45 | Train Loss:0.3023525359837905 | Validation loss:0.47986521141629823
Epoch:2| Batch 50 | Train Loss:0.30227304615226447 

Some weights of the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_base.bin were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_base.bin and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able

################################
Training Fold 1
################################


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=71.0), HTML(value='')))

Epoch:0| Batch 0 | Train Loss:0.9456509351730347 | Validation loss:1.0269526965181592
Validation loss decreased from inf to 1.0269526965181592.
Epoch:0| Batch 20 | Train Loss:0.7040430122897738 | Validation loss:0.7170228219368089
Validation loss decreased from 1.0269526965181592 to 0.7170228219368089.
Epoch:0| Batch 40 | Train Loss:0.6663441432685386 | Validation loss:0.5394638562706154
Validation loss decreased from 0.7170228219368089 to 0.5394638562706154.
Epoch:0| Batch 60 | Train Loss:0.6289182842754927 | Validation loss:0.47770893342897924
Validation loss decreased from 0.5394638562706154 to 0.47770893342897924.
Epoch:0| Batch 65 | Train Loss:0.6228836214903629 | Validation loss:0.5244309268367122
Epoch:0| Batch 70 | Train Loss:0.6162984136124732 | Validation loss:0.51649886285755



HBox(children=(FloatProgress(value=0.0, max=71.0), HTML(value='')))

Epoch:1| Batch 0 | Train Loss:0.4579852819442749 | Validation loss:0.4884379033891248
Epoch:1| Batch 5 | Train Loss:0.461474284529686 | Validation loss:0.512942224321231
Epoch:1| Batch 10 | Train Loss:0.4488692717118697 | Validation loss:0.4728953013537635
Validation loss decreased from 0.47770893342897924 to 0.4728953013537635.
Epoch:1| Batch 15 | Train Loss:0.44598596170544624 | Validation loss:0.47613012203028504
Epoch:1| Batch 20 | Train Loss:0.4342355373359862 | Validation loss:0.48109807032094876
Epoch:1| Batch 25 | Train Loss:0.43429407248130214 | Validation loss:0.46412412948171855
Validation loss decreased from 0.4728953013537635 to 0.46412412948171855.
Epoch:1| Batch 30 | Train Loss:0.43022657690509675 | Validation loss:0.46896119793535956
Epoch:1| Batch 35 | Train Loss:0.4266153507762485 | Validation loss:0.46070065863535437
Validation loss decreased from 0.46412412948171855 to 0.46070065863535437.
Epoch:1| Batch 40 | Train Loss:0.42409316214119513 | Validation loss:0.462059

HBox(children=(FloatProgress(value=0.0, max=71.0), HTML(value='')))

Epoch:2| Batch 0 | Train Loss:0.33816802501678467 | Validation loss:0.4723286360082492
Epoch:2| Batch 5 | Train Loss:0.33157842606306076 | Validation loss:0.4920391215824745
Epoch:2| Batch 10 | Train Loss:0.3248330368237062 | Validation loss:0.45775271301538173
Validation loss decreased from 0.46070065863535437 to 0.45775271301538173.
Epoch:2| Batch 15 | Train Loss:0.30732125975191593 | Validation loss:0.4687765425359699
Epoch:2| Batch 20 | Train Loss:0.30375570200738455 | Validation loss:0.4580667638023135
Epoch:2| Batch 25 | Train Loss:0.30386013365708864 | Validation loss:0.46332107148539853
Epoch:2| Batch 30 | Train Loss:0.30899027134141616 | Validation loss:0.4796342110969651
Epoch:2| Batch 35 | Train Loss:0.3085673612852891 | Validation loss:0.474544367320101
Epoch:2| Batch 40 | Train Loss:0.3071570741694148 | Validation loss:0.4675201970926473
Epoch:2| Batch 45 | Train Loss:0.30681930093661597 | Validation loss:0.47204032246495636
Epoch:2| Batch 50 | Train Loss:0.306951803903953

Some weights of the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_base.bin were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_base.bin and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able

################################
Training Fold 2
################################


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=71.0), HTML(value='')))

Epoch:0| Batch 0 | Train Loss:1.0584193468093872 | Validation loss:1.0341821220559133
Validation loss decreased from inf to 1.0341821220559133.
Epoch:0| Batch 20 | Train Loss:0.7059010394981929 | Validation loss:0.635661610415284
Validation loss decreased from 1.0341821220559133 to 0.635661610415284.
Epoch:0| Batch 40 | Train Loss:0.6590849262912098 | Validation loss:0.5463583393835686
Validation loss decreased from 0.635661610415284 to 0.5463583393835686.
Epoch:0| Batch 60 | Train Loss:0.6222408619083342 | Validation loss:0.6356387961078698
Epoch:0| Batch 70 | Train Loss:0.6133832965098637 | Validation loss:0.5820295071937669



HBox(children=(FloatProgress(value=0.0, max=71.0), HTML(value='')))

Epoch:1| Batch 0 | Train Loss:0.5412919521331787 | Validation loss:0.5794928094870607
Epoch:1| Batch 20 | Train Loss:0.44679054617881775 | Validation loss:0.5573476895060337
Epoch:1| Batch 40 | Train Loss:0.44457350707635646 | Validation loss:0.5338764612523603
Validation loss decreased from 0.5463583393835686 to 0.5338764612523603.
Epoch:1| Batch 60 | Train Loss:0.4396584967120749 | Validation loss:0.5233924600859763
Validation loss decreased from 0.5338764612523603 to 0.5233924600859763.
Epoch:1| Batch 70 | Train Loss:0.43223227623482824 | Validation loss:0.49763073530835167
Validation loss decreased from 0.5233924600859763 to 0.49763073530835167.



HBox(children=(FloatProgress(value=0.0, max=71.0), HTML(value='')))

Epoch:2| Batch 0 | Train Loss:0.3622843325138092 | Validation loss:0.48694887169649903
Validation loss decreased from 0.49763073530835167 to 0.48694887169649903.
Epoch:2| Batch 10 | Train Loss:0.33712476762858307 | Validation loss:0.4793195342513877
Validation loss decreased from 0.48694887169649903 to 0.4793195342513877.
Epoch:2| Batch 15 | Train Loss:0.32804011926054955 | Validation loss:0.5153447855526293
Epoch:2| Batch 20 | Train Loss:0.33347817829677034 | Validation loss:0.48702097883526707
Epoch:2| Batch 25 | Train Loss:0.3295140272149673 | Validation loss:0.5150200770774358
Epoch:2| Batch 30 | Train Loss:0.33082574461736985 | Validation loss:0.4934559477047181
Epoch:2| Batch 35 | Train Loss:0.32811399300893146 | Validation loss:0.48290796284104737
Epoch:2| Batch 40 | Train Loss:0.3233114640887191 | Validation loss:0.49984086525272314
Epoch:2| Batch 45 | Train Loss:0.3217801639567251 | Validation loss:0.48707336481188385
Epoch:2| Batch 50 | Train Loss:0.32440049917090175 | Valida

Some weights of the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_base.bin were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_base.bin and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able

################################
Training Fold 3
################################


HBox(children=(FloatProgress(value=0.0, max=71.0), HTML(value='')))

Epoch:0| Batch 0 | Train Loss:0.9990240931510925 | Validation loss:1.0267089676689094
Validation loss decreased from inf to 1.0267089676689094.
Epoch:0| Batch 20 | Train Loss:0.7194385117008573 | Validation loss:0.5992325997688401
Validation loss decreased from 1.0267089676689094 to 0.5992325997688401.
Epoch:0| Batch 40 | Train Loss:0.6740186178102726 | Validation loss:0.526514277491771
Validation loss decreased from 0.5992325997688401 to 0.526514277491771.
Epoch:0| Batch 60 | Train Loss:0.6359398003484382 | Validation loss:0.7246886151777187
Epoch:0| Batch 70 | Train Loss:0.6345498746549579 | Validation loss:0.5124035645538653
Validation loss decreased from 0.526514277491771 to 0.5124035645538653.



HBox(children=(FloatProgress(value=0.0, max=71.0), HTML(value='')))

Epoch:1| Batch 0 | Train Loss:0.39480799436569214 | Validation loss:0.5238974291673848
Epoch:1| Batch 20 | Train Loss:0.4393762747446696 | Validation loss:0.5292337282862462
Epoch:1| Batch 40 | Train Loss:0.4255430036928596 | Validation loss:0.4827947717317393
Validation loss decreased from 0.5124035645538653 to 0.4827947717317393.
Epoch:1| Batch 50 | Train Loss:0.4238975305183261 | Validation loss:0.4883542430232948
Epoch:1| Batch 60 | Train Loss:0.424133653523492 | Validation loss:0.46690969483953126
Validation loss decreased from 0.4827947717317393 to 0.46690969483953126.
Epoch:1| Batch 65 | Train Loss:0.42387614105687 | Validation loss:0.4670727427156878
Epoch:1| Batch 70 | Train Loss:0.42055229401924243 | Validation loss:0.47110713010942434



HBox(children=(FloatProgress(value=0.0, max=71.0), HTML(value='')))

Epoch:2| Batch 0 | Train Loss:0.27288201451301575 | Validation loss:0.4665902233879331
Validation loss decreased from 0.46690969483953126 to 0.4665902233879331.
Epoch:2| Batch 5 | Train Loss:0.29292092720667523 | Validation loss:0.47111454585068663
Epoch:2| Batch 10 | Train Loss:0.29601034251126374 | Validation loss:0.4671824597556826
Epoch:2| Batch 15 | Train Loss:0.29884185642004013 | Validation loss:0.47276288082062357
Epoch:2| Batch 20 | Train Loss:0.2964006761709849 | Validation loss:0.46703094244003296
Epoch:2| Batch 25 | Train Loss:0.29522363726909345 | Validation loss:0.478384170313956
Epoch:2| Batch 30 | Train Loss:0.29942586104716024 | Validation loss:0.4636119553740595
Validation loss decreased from 0.4665902233879331 to 0.4636119553740595.
Epoch:2| Batch 35 | Train Loss:0.29846804589033127 | Validation loss:0.45983786398256327
Validation loss decreased from 0.4636119553740595 to 0.45983786398256327.
Epoch:2| Batch 40 | Train Loss:0.30061492105809656 | Validation loss:0.4660

Some weights of the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_base.bin were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at Pretrain_CLRP_Roberta/pretrained_roberta_base.bin and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able

################################
Training Fold 4
################################


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=71.0), HTML(value='')))

Epoch:0| Batch 0 | Train Loss:0.9600926637649536 | Validation loss:0.9083432294953038
Validation loss decreased from inf to 0.9083432294953038.
Epoch:0| Batch 20 | Train Loss:0.6892463451340085 | Validation loss:0.6863762568420088
Validation loss decreased from 0.9083432294953038 to 0.6863762568420088.
Epoch:0| Batch 40 | Train Loss:0.6605728443075971 | Validation loss:0.6892816642640343
Epoch:0| Batch 60 | Train Loss:0.648099846038662 | Validation loss:0.5911960114895458
Validation loss decreased from 0.6863762568420088 to 0.5911960114895458.
Epoch:0| Batch 70 | Train Loss:0.6502224798773376 | Validation loss:0.6461040029223536



HBox(children=(FloatProgress(value=0.0, max=71.0), HTML(value='')))

Epoch:1| Batch 0 | Train Loss:0.6819481253623962 | Validation loss:0.6054006903104379
Epoch:1| Batch 20 | Train Loss:0.4768948640142168 | Validation loss:0.5407709685009969
Validation loss decreased from 0.5911960114895458 to 0.5407709685009969.
Epoch:1| Batch 40 | Train Loss:0.4600063969449299 | Validation loss:0.5245160590594923
Validation loss decreased from 0.5407709685009969 to 0.5245160590594923.
Epoch:1| Batch 60 | Train Loss:0.4508063216678432 | Validation loss:0.500753102587982
Validation loss decreased from 0.5245160590594923 to 0.500753102587982.
Epoch:1| Batch 70 | Train Loss:0.44708911843702825 | Validation loss:0.4942208255680514
Validation loss decreased from 0.500753102587982 to 0.4942208255680514.



HBox(children=(FloatProgress(value=0.0, max=71.0), HTML(value='')))

Epoch:2| Batch 0 | Train Loss:0.2829974591732025 | Validation loss:0.49228657446276974
Validation loss decreased from 0.4942208255680514 to 0.49228657446276974.
Epoch:2| Batch 10 | Train Loss:0.3462645411491394 | Validation loss:0.4921416069420291
Validation loss decreased from 0.49228657446276974 to 0.4921416069420291.
Epoch:2| Batch 20 | Train Loss:0.3373187255291712 | Validation loss:0.5029395810315307
Epoch:2| Batch 30 | Train Loss:0.33363915066565236 | Validation loss:0.48390723240207617
Validation loss decreased from 0.4921416069420291 to 0.48390723240207617.
Epoch:2| Batch 40 | Train Loss:0.34048883944022945 | Validation loss:0.48677360927554925
Epoch:2| Batch 50 | Train Loss:0.3353884436336218 | Validation loss:0.4819319103385361
Validation loss decreased from 0.48390723240207617 to 0.4819319103385361.
Epoch:2| Batch 60 | Train Loss:0.33522490546351574 | Validation loss:0.4848184203597861
Epoch:2| Batch 70 | Train Loss:0.3343609542074338 | Validation loss:0.5023762545955013

##

In [None]:
# I added double layer norm to attention 
# Results of cross validation for seed 42: Train : 0.22953509619398332, Val : 0.4700026281702686 LB 0.474
# Next, I added accumulation steps = 4
# Results of cross validation for seed 42: Train : 0.23496394569678217, Val : 0.4683334995323504 LB 0.466
# Changed to 768 in the attention layer
# Results of cross validation for seed 42: Train : 0.21059715945799667, Val : 0.47091682872302093 LB ? 


# For roberta-base, it seems that a proper LR works better! I should try funnel large and deberta. Try electra pretraining with masked LM.
# Results of cross validation for seed 42: Train : 0.32064014839244187, Val : 0.4690035422922859