### Imports

In [2]:
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools

import scipy as sp
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
from torch.optim.swa_utils import AveragedModel, update_bn, SWALR
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
import warnings
warnings.filterwarnings("ignore")

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### CFG

In [30]:
class CFG:
    """Experiment configuration, used as a plain namespace of class attributes.

    Several attributes are overwritten after the class is created (for example
    ``epochs`` in debug mode, ``max_len`` for long-context backbones, and
    ``tokenizer`` / ``swa_start`` / ``eval_step`` during setup and training).
    """
    # Backbone checkpoint. Other checkpoints that were tried:
    # funnel-transformer/medium, funnel-transformer/large, google/bigbird-roberta-base,
    # google/bigbird-roberta-large, roberta-large, roberta-base, microsoft/deberta-v3-base,
    # microsoft/deberta-v3-large, google/electra-base-discriminator,
    # google/electra-large-discriminator, xlm-roberta-base, xlm-roberta-large,
    # xlnet-base-cased, xlnet-large-cased
    model="microsoft/deberta-v3-base"
    # Checkpoint name with every "/" and "-" removed; used to build the output directory.
    model_name="".join(("-".join(model.split("/"))).split("-"))
    gradient_checkpointing=False
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps_rate=0.0  # fraction of the total training steps used for warmup
    apex=True  # enables torch.cuda.amp autocast and the GradScaler
    epochs=1
    encoder_lr=5e-5
    decoder_lr=5e-4
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=8
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    grad_clipping=True
    max_grad_norm=1000
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed=42
    n_fold=4
    trn_fold=range(n_fold)  # folds that are actually trained
    num_workers=8
    eval_method="epoch" # step, epoch
    eval_step=30
    reinit_layers=1  # number of top encoder layers to re-initialize before fine-tuning
    init_weight="normal" # xavier_uniform, xavier_normal, kaiming_uniform, kaiming_normal, orthogonal, normal
    pooling="mean" # mean, attention, cls, concat
    llrd="normal" # grouped, normal, none
    llr_decay=0.8
    msd_num=5  # number of dropout samples averaged when msd is enabled
    msd=False
    train=True
    swa=False
    swa_start_ratio=0.75  # fraction of the total steps after which SWA takes over
    swa_lr=1e-4
    # Epochs remaining after SWA starts. The parentheses matter: without them the
    # expression is 1 - swa_start_ratio * epochs, which turns negative for epochs >= 2.
    anneal_epochs=int((1 - swa_start_ratio) * epochs)
    anneal_strategy='cos'
    fgm=False
    unscale=True
    wandb=False
    debug=False
    pseudo=False

# Debug runs override the epoch count configured in CFG.
if CFG.debug:
  CFG.epochs=2
    
# Input data directory, and the per-experiment output directory whose name is
# built from the backbone name and the pooling mode.
DATA_DIR = "/content/drive/MyDrive/Kaggle Training Results/English Language Learning/data/"
OUTPUT_DIR = f"/content/drive/MyDrive/Kaggle Training Results/English Language Learning/trained/{CFG.model_name}{CFG.pooling}pooling/"

### Helper Functions

In [32]:
def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    Args:
        y_trues: 2-D array-like of targets, indexed ``[row, column]``.
        y_preds: 2-D array-like of predictions with the same shape as ``y_trues``.

    Returns:
        ``(mcrmse_score, scores)`` where ``scores`` is the list of per-column RMSE
        values and ``mcrmse_score`` is their mean.

    Raises:
        ValueError: if the inputs are not 2-D or their shapes differ.
    """
    # Computed with NumPy rather than sklearn's `mean_squared_error(..., squared=False)`:
    # that keyword is deprecated and later removed in scikit-learn, so this keeps the
    # metric working across versions. float64 avoids precision loss for half/single inputs.
    y_trues = np.asarray(y_trues, dtype=np.float64)
    y_preds = np.asarray(y_preds, dtype=np.float64)
    if y_trues.ndim != 2 or y_trues.shape != y_preds.shape:
        raise ValueError(
            f"expected two 2-D arrays of equal shape, got {y_trues.shape} and {y_preds.shape}"
        )
    per_column_rmse = np.sqrt(np.mean((y_trues - y_preds) ** 2, axis=0))
    scores = [float(score) for score in per_column_rmse]
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    """Return ``(mcrmse_score, per_column_scores)`` by delegating to ``MCRMSE``."""
    return MCRMSE(y_trues, y_preds)

def cv_split(func):
    """Call the splitter factory ``func`` with shuffling on and the fold count / seed from CFG."""
    return func(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)

def seed_everything(seed = 42):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) and switch cuDNN to deterministic mode.

    Args:
        seed: integer seed applied to every generator; also exported as PYTHONHASHSEED.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The individual generators are independent, so they can be seeded in one pass.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    
# Seed all random number generators once, using the configured seed.
seed_everything(seed=CFG.seed)

### Load Data

In [33]:
# Read the competition CSV files from DATA_DIR.
train = pd.read_csv(DATA_DIR + "train.csv")
test = pd.read_csv(DATA_DIR + "test.csv")
submission = pd.read_csv(DATA_DIR + "sample_submission.csv")

# Show the shape and the first rows of each frame as a quick sanity check.
print(f"train.shape: {train.shape}")
display(train.head())
print(f"test.shape: {test.shape}")
display(test.head())
print(f"submission.shape: {submission.shape}")
display(submission.head())

train.shape: (3911, 8)


Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


test.shape: (3, 2)


Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


submission.shape: (3, 7)


Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,3.0,3.0,3.0,3.0,3.0,3.0
1,000BAD50D026,3.0,3.0,3.0,3.0,3.0,3.0
2,00367BB2546B,3.0,3.0,3.0,3.0,3.0,3.0


### Tokenizer

In [34]:
# Load the tokenizer that matches the configured backbone, expose it through CFG
# (prepare_input reads cfg.tokenizer), and store a copy next to the trained models.
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
CFG.tokenizer = tokenizer
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


('/content/drive/MyDrive/Kaggle Training Results/English Language Learning/trained/microsoftdebertav3basemeanpooling/tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/Kaggle Training Results/English Language Learning/trained/microsoftdebertav3basemeanpooling/tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/Kaggle Training Results/English Language Learning/trained/microsoftdebertav3basemeanpooling/tokenizer/spm.model',
 '/content/drive/MyDrive/Kaggle Training Results/English Language Learning/trained/microsoftdebertav3basemeanpooling/tokenizer/added_tokens.json',
 '/content/drive/MyDrive/Kaggle Training Results/English Language Learning/trained/microsoftdebertav3basemeanpooling/tokenizer/tokenizer.json')

### CV Split

In [35]:
# Multilabel-stratified K-fold split over the six target columns.
Fold = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
df = train.copy()
# One-hot encode every target score so it can be passed as the multilabel target `y`.
y = pd.get_dummies(data=df[CFG.target_cols], columns=CFG.target_cols)
for n, (train_index, val_index) in enumerate(Fold.split(X=train, y=y)):
        # Rows in the validation part of split n are assigned to fold n.
        # NOTE(review): val_index is positional; this relies on `train` having the default RangeIndex.
        train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())
# Persist the fold assignment (the DataFrame index is written as the first CSV column).
train.to_csv(DATA_DIR + "folded_train.csv")

fold
0    979
1    979
2    972
3    981
dtype: int64

### Dataset

In [36]:
# BigBird, DeBERTa and Longformer checkpoints get a longer maximum sequence length
# than the CFG default.
if "bigbird" in CFG.model or "deberta" in CFG.model or "longformer" in CFG.model:
    CFG.max_len = 1428
print(f"max_len: {CFG.max_len}")

max_len: 1428


In [37]:
def prepare_input(cfg, text):
    """Tokenize one text into fixed-length model inputs.

    Args:
        cfg: configuration object exposing ``tokenizer`` (an object with
            ``encode_plus``) and ``max_len``.
        text: the text to encode.

    Returns:
        The tokenizer output with every value converted to a ``torch.long`` tensor.
        Sequences are truncated and padded to ``cfg.max_len``.
    """
    inputs = cfg.tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        # Read the length from the cfg that was passed in rather than the global CFG,
        # so the function honours whatever configuration the caller supplies.
        max_length=cfg.max_len,
        # `padding="max_length"` is the supported spelling of the deprecated
        # `pad_to_max_length=True`.
        padding="max_length",
        truncation=True,
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    """Dataset that yields ``(tokenized_inputs, target_tensor)`` pairs from a DataFrame.

    The frame must provide a ``full_text`` column and the columns listed in
    ``cfg.target_cols``.
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['full_text'].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, i):
        text, target = self.texts[i], self.labels[i]
        encoded = prepare_input(self.cfg, text)
        return encoded, torch.tensor(target, dtype=torch.float)
    

def collate(inputs):
    """Trim every tensor in the batch to the longest unpadded sequence.

    The longest sequence is taken from the row sums of ``attention_mask``. The
    mapping is modified in place and returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs

### Pooling

In [38]:
class MeanPooling(nn.Module):
    """Average the token embeddings over the positions where the attention mask is 1."""

    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so masked tokens contribute zero.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_sum = (last_hidden_state * mask).sum(1)
        # Clamp the token count so a fully masked row divides by 1e-9 instead of zero.
        token_count = torch.clamp(mask.sum(1), min=1e-9)
        return masked_sum / token_count

class AttentionPooling(nn.Module):
    """Weighted sum of token embeddings using learned, mask-aware attention weights."""

    def __init__(self, in_dim):
        super().__init__()
        # Scoring network: one scalar score per token.
        scorer_layers = [
            nn.Linear(in_dim, in_dim),
            nn.LayerNorm(in_dim),
            nn.GELU(),
            nn.Linear(in_dim, 1),
        ]
        self.attention = nn.Sequential(*scorer_layers)

    def forward(self, last_hidden_state, attention_mask):
        scores = self.attention(last_hidden_state).float()
        # Masked positions get -inf so softmax assigns them zero weight.
        padded = (attention_mask == 0).unsqueeze(-1)
        scores = scores.masked_fill(padded, float('-inf'))
        weights = torch.softmax(scores, 1)
        return (weights * last_hidden_state).sum(dim=1)

### FGM

In [39]:
class FGM():
    """Fast Gradient Method: adversarial perturbation of embedding weights.

    ``attack`` shifts every matching parameter by ``epsilon * grad / ||grad||`` and
    remembers the clean weights; ``restore`` puts the clean weights back.
    """

    def __init__(self, model):
        self.model = model
        self.backup = {}  # parameter name -> clean copy of its data

    def attack(self, epsilon = 1., emb_name = 'word_embeddings'):
        """Perturb trainable parameters whose name contains ``emb_name``.

        Parameters without a gradient, or whose gradient norm is zero or
        non-finite, are left untouched.
        """
        for name, param in self.model.named_parameters():
            if not (param.requires_grad and emb_name in name):
                continue
            if param.grad is None:
                # Nothing has been back-propagated into this parameter yet.
                continue
            norm = torch.norm(param.grad)
            if norm == 0 or not torch.isfinite(norm):
                # Dividing by this norm would produce NaN/inf weights.
                continue
            self.backup[name] = param.data.clone()
            param.data.add_(epsilon * param.grad / norm)

    def restore(self, emb_name = 'word_embeddings'):
        """Restore the clean weights of every perturbed parameter matching ``emb_name``."""
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name and name in self.backup:
                # Entries are removed one by one; the backup must not be cleared inside
                # the loop, otherwise parameters visited later can never be restored.
                param.data = self.backup.pop(name)

### Model

In [40]:
class Model(nn.Module):
    """Transformer backbone followed by a pooling step and a linear regression head.

    ``cfg.pooling`` selects the sentence representation: ``"mean"``, ``"attention"``,
    ``"cls"``, or anything else for the concatenation of all three projected back to
    ``hidden_size``. With ``cfg.msd`` enabled, each representation is the average over
    ``cfg.msd_num`` dropout samples of the hidden states.
    """

    # In-place initializers selectable through cfg.init_weight; 'normal' is handled
    # separately because it needs the backbone's initializer_range.
    _INITIALIZERS = {
        'xavier_uniform': nn.init.xavier_uniform_,
        'xavier_normal': nn.init.xavier_normal_,
        'kaiming_uniform': nn.init.kaiming_uniform_,
        'kaiming_normal': nn.init.kaiming_normal_,
        'orthogonal': nn.init.orthogonal_,
    }
    _SINGLE_POOLINGS = ("mean", "attention", "cls")

    def __init__(self, cfg, config_path=None, pretrained=False):
        """
        Args:
            cfg: configuration object (reads ``model``, ``gradient_checkpointing``,
                ``init_weight``, ``pooling``, ``msd``, ``msd_num`` and, when present,
                ``target_cols``).
            config_path: optional path to a backbone config saved with ``torch.save``;
                when omitted the config is fetched for ``cfg.model`` with all dropout
                probabilities set to zero.
            pretrained: load pretrained backbone weights instead of building the
                backbone from the config alone.
        """
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
        else:
            # NOTE(review): torch.load unpickles arbitrary objects; only load config
            # files produced by this pipeline.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # One output per target column; falls back to the previous fixed width of 6
        # when the cfg does not list target columns.
        num_targets = len(getattr(cfg, 'target_cols', ())) or 6

        self.pool = MeanPooling()
        self.attention = AttentionPooling(self.config.hidden_size)
        self.high_dropout = nn.Dropout(p=0.5)
        self.concat_pool = nn.Linear(self.config.hidden_size*3, self.config.hidden_size)
        self.fc = nn.Linear(self.config.hidden_size, num_targets)
        self._init_weights(self.fc)
        self._init_weights(self.concat_pool)

    def _init_weights(self, module):
        """Initialize a Linear / Embedding / LayerNorm module according to cfg.init_weight."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            scheme = self.cfg.init_weight
            if scheme == 'normal':
                module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            elif scheme in self._INITIALIZERS:
                self._INITIALIZERS[scheme](module.weight.data)
            # Unknown schemes leave the weights as they are (same as before).
            if isinstance(module, nn.Linear):
                if module.bias is not None:
                    module.bias.data.zero_()
            elif module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def _pooled_feature(self, kind, hidden_states, attention_mask):
        """Pool ``hidden_states`` with one of the single pooling modes."""
        if kind == "mean":
            return self.pool(hidden_states, attention_mask)
        if kind == "attention":
            return self.attention(hidden_states, attention_mask)
        # "cls": representation of the first token only
        return hidden_states[:, 0, :]

    def feature(self, inputs):
        """Run the backbone and return the pooled sentence representation."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs.last_hidden_state
        attention_mask = inputs['attention_mask']

        # Only the representation(s) that are actually returned are computed; the
        # concat mode needs all three, in the order mean, attention, cls.
        if self.cfg.pooling in self._SINGLE_POOLINGS:
            kinds = (self.cfg.pooling,)
        else:
            kinds = self._SINGLE_POOLINGS

        pooled = []
        for kind in kinds:
            if self.cfg.msd:
                # Multi-sample dropout: average the pooled output over several
                # independently dropped-out copies of the hidden states.
                samples = [
                    self._pooled_feature(kind, self.high_dropout(last_hidden_states), attention_mask)
                    for _ in range(self.cfg.msd_num)
                ]
                pooled.append(torch.mean(torch.stack(samples, dim=0), dim=0))
            else:
                pooled.append(self._pooled_feature(kind, last_hidden_states, attention_mask))

        if len(pooled) == 1:
            return pooled[0]
        # Concatenate the three representations and project back to hidden_size.
        return self.concat_pool(torch.cat(pooled, dim=-1))

    def forward(self, inputs):
        """Return the regression outputs for a batch of tokenized inputs."""
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

### Train

In [41]:
def re_initializing_layer(model, config, layer_num):
    """Re-initialize the last ``layer_num`` encoder layers of the backbone.

    Args:
        model: wrapper whose backbone layers live at ``model.model.encoder.layer``.
        config: object providing ``initializer_range`` (used by the 'normal' scheme).
        layer_num: number of layers, counted from the top, to re-initialize.
            Values <= 0 leave the model untouched.

    Returns:
        The same ``model`` object.
    """
    if layer_num <= 0:
        # `layer[-0:]` is the full list, so without this guard a request for zero
        # layers would wipe every pretrained encoder layer.
        return model

    print(f"reinitializing last {layer_num} layers")
    scheme = CFG.init_weight
    initializers = {
        'xavier_uniform': nn.init.xavier_uniform_,
        'xavier_normal': nn.init.xavier_normal_,
        'kaiming_uniform': nn.init.kaiming_uniform_,
        'kaiming_normal': nn.init.kaiming_normal_,
        'orthogonal': nn.init.orthogonal_,
    }
    for module in model.model.encoder.layer[-layer_num:].modules():
        if isinstance(module, (nn.Linear, nn.Embedding)):
            if scheme == 'normal':
                module.weight.data.normal_(mean=0.0, std=config.initializer_range)
            elif scheme in initializers:
                initializers[scheme](module.weight.data)
            # Unknown schemes leave the weights untouched (same as before).
            if isinstance(module, nn.Linear):
                if module.bias is not None:
                    module.bias.data.zero_()
            elif module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    return model

def train_func(global_step, batch, model, criterion, optimizer, scheduler, scaler, fgm, swa_model, swa_scheduler, device):
    """Run one optimization step on ``batch``.

    Args:
        global_step: 1-based step counter; compared with ``CFG.swa_start`` to decide
            which scheduler to advance when SWA is enabled.
        batch: ``(inputs, labels)`` as produced by the training DataLoader.
        model, criterion, optimizer, scheduler: the usual training objects.
        scaler: ``GradScaler`` used for every backward pass and the optimizer step.
        fgm: adversarial helper, only used when ``CFG.fgm`` is set.
        swa_model, swa_scheduler: only used when ``CFG.swa`` is set.
        device: device the batch is moved to.

    Returns:
        ``(yhat, y, loss)``: predictions, labels and the (possibly rescaled) loss.

    Note:
        Gradients are zeroed and the optimizer is stepped on every call, so
        ``CFG.gradient_accumulation_steps`` only rescales the loss here.
    """
    x, y = batch
    x = collate(x)
    for k, v in x.items():
        x[k] = v.to(device)
    y = y.to(device)
    optimizer.zero_grad()
    with torch.cuda.amp.autocast(enabled=CFG.apex):
        yhat = model(x)
        loss = criterion(yhat, y)
    if CFG.gradient_accumulation_steps > 1:
        loss = loss / CFG.gradient_accumulation_steps
    scaler.scale(loss).backward()

    if CFG.fgm:
        # The adversarial pass runs while the gradients are still scaled, and its loss
        # goes through the scaler as well, so clean and adversarial gradients share one
        # scale and are unscaled/clipped together below. The perturbation direction
        # grad / ||grad|| does not depend on the scale factor.
        fgm.attack()
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(x)
            loss_adv = criterion(y_preds, y)
        if CFG.gradient_accumulation_steps > 1:
            loss_adv = loss_adv / CFG.gradient_accumulation_steps
        scaler.scale(loss_adv).backward()
        fgm.restore()

    # Clipping must see unscaled gradients, so unscale whenever clipping is on.
    if CFG.unscale or CFG.grad_clipping:
        scaler.unscale_(optimizer)
    if CFG.grad_clipping:
        torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)

    scaler.step(optimizer)
    scaler.update()

    if not CFG.swa or global_step < CFG.swa_start:
        scheduler.step()
    else:
        # Past the SWA start step: fold the current weights into the running average
        # and let the SWA scheduler drive the learning rate.
        swa_model.update_parameters(model)
        swa_scheduler.step()

    return yhat, y, loss

def val(val_loader, model, criterion, device):
    """Evaluate ``model`` on ``val_loader``.

    The model is switched to eval mode for the duration of the call and put back into
    whatever mode it was in before, so a validation pass in the middle of an epoch does
    not leave dropout disabled for the remaining training steps.

    Args:
        val_loader: DataLoader yielding ``(inputs, labels)`` batches.
        model: module called as ``model(inputs)``.
        criterion: loss function.
        device: device the batches are moved to.

    Returns:
        ``(mean_loss, predictions)``: the loss averaged over batches and the
        concatenated predictions as a NumPy array.
    """
    count = 0
    mean_loss = 0
    preds = []

    was_training = model.training
    model.eval()
    try:
        with torch.inference_mode():
            for batch in val_loader:
                count += 1
                x, y = batch
                x = collate(x)
                for k, v in x.items():
                    x[k] = v.to(device)
                y = y.to(device)
                yhat = model(x)
                loss = criterion(yhat, y)
                # Same rescaling as in train_func — presumably so the two reported
                # losses stay comparable.
                if CFG.gradient_accumulation_steps > 1:
                    loss = loss / CFG.gradient_accumulation_steps
                preds.append(yhat.to('cpu').numpy())
                mean_loss += loss
    finally:
        model.train(was_training)

    mean_loss = mean_loss/count
    predictions = np.concatenate(preds)
    return mean_loss, predictions

def train_and_eval(batch, val_loader, val_labels, val_folds, global_step, best_score, model, criterion, optimizer, scheduler, progress_bar, scaler, fgm, swa_model, swa_scheduler, device, fold):
    """Run one training step and, every ``CFG.eval_step`` steps, a validation pass.

    When validation improves on ``best_score`` the predictions are written into the
    ``pred_<target>`` columns of ``val_folds`` and, outside debug mode, the model
    weights are saved under OUTPUT_DIR.

    Returns:
        ``(best_score, val_folds, training_loss)``.
    """
    yhat, y, training_loss = train_func(global_step, batch, model, criterion, optimizer, scheduler, scaler, fgm, swa_model, swa_scheduler, device)
    # get_score expects (targets, predictions), the same order as the validation call below.
    training_score, training_scores = get_score(y.cpu().detach().numpy(), yhat.cpu().detach().numpy())
    progress_bar.update(1)

    if global_step % CFG.eval_step == 0:
        val_loss, predictions = val(val_loader, model, criterion, device)
        val_score, val_scores = get_score(val_labels, predictions)
        print("=" * 30)
        print(f"step {global_step} | training loss: {training_loss} | training score: {training_score} | validation loss: {val_loss} | validation score: {val_score}")
        if val_score < best_score:
            best_score = val_score
            val_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions
            if not CFG.debug:
                print(f"saving best model with score: {best_score}")
                # The checkpoint name encodes fold, LLRD mode, msd on/off and the init scheme.
                msd_tag = "msd" if CFG.msd else "nomsd"
                torch.save(model.state_dict(), OUTPUT_DIR + f"modelfold{fold + 1}{CFG.llrd}llrd{msd_tag}{CFG.init_weight}.pth")
        print("=" * 30)
        print()

    # NOTE(review): `wandb` is not imported in the visible part of this file; enabling
    # CFG.wandb requires importing and initializing it first.
    if CFG.wandb and not CFG.debug:
        wandb.log(
            {f"[fold{fold}] training loss": training_loss}
        )

    return best_score, val_folds, training_loss

def train_and_eval_epoch(train_loader, val_loader, val_labels, val_folds, best_score, model, criterion, optimizer, epoch, scheduler, fgm, swa_model, swa_scheduler, progress_bar, device, global_step, fold):
    """Train for one epoch, validating on the schedule applied by ``train_and_eval``.

    A fresh ``GradScaler`` is created for each epoch. ``epoch`` is accepted for
    interface compatibility and is not used.

    Returns:
        ``(global_step, val_folds, best_score, mean_training_loss)``.
    """
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    mean_training_loss = 0
    for batch in train_loader:
        global_step += 1
        best_score, val_folds, training_loss = train_and_eval(batch, val_loader, val_labels, val_folds, global_step, best_score, model, criterion, optimizer, scheduler, progress_bar, scaler, fgm, swa_model, swa_scheduler, device, fold)
        # Accumulate a detached value: summing the live loss tensor would chain every
        # step's autograd history into the running total.
        step_loss = training_loss.detach() if torch.is_tensor(training_loss) else training_loss
        mean_training_loss += step_loss
    mean_training_loss = mean_training_loss / len(train_loader)
    return global_step, val_folds, best_score, mean_training_loss


def train_loop(train, fold):
    """Train and validate one cross-validation fold.

    Args:
        train: DataFrame with ``full_text``, the target columns and a ``fold`` column.
        fold: index of the fold held out for validation.

    Returns:
        The validation rows of ``train``; ``pred_<target>`` columns hold the
        predictions of the best checkpoint whenever a validation pass improved the score.

    Side effects:
        Writes the backbone config and the best checkpoint to OUTPUT_DIR, and
        overwrites ``CFG.swa_start`` (and ``CFG.eval_step`` for epoch-wise evaluation).
    """
    print(f"============== fold: {fold + 1} training ==============")
    train_folds = train[train['fold'] != fold].reset_index(drop=True)
    if CFG.pseudo:
        # Pseudo-labelled rows are added to the training part only (fold = -1).
        pseudo = pd.read_csv("/content/drive/MyDrive/Kaggle Training Results/English Language Learning/trained/pseudo.csv", index_col=0)
        pseudo["fold"] = [-1] * len(pseudo)
        train_folds = pd.concat([train_folds, pseudo])
    val_folds = train[train['fold'] == fold].reset_index(drop=True)
    val_labels = val_folds[CFG.target_cols].values

    train_dataset = TrainDataset(CFG, train_folds)
    val_dataset = TrainDataset(CFG, val_folds)

    train_loader = DataLoader(
        train_dataset,
        batch_size=CFG.batch_size,
        shuffle=True,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=True
        )
    val_loader = DataLoader(
        val_dataset,
        batch_size=CFG.batch_size * 2,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False
        )

    reinit_layers = CFG.reinit_layers
    model = Model(CFG, pretrained=True)

    if reinit_layers > 0 and "funnel" not in CFG.model:
        model = re_initializing_layer(model, model.config, reinit_layers)
    torch.save(model.config, OUTPUT_DIR+"config.pth")
    model.to(device)

    def get_optimizer_params_groupedllrd(model, encoder_lr, decoder_lr, weight_decay=0.0):
        """Three learning-rate bands over the encoder layers plus one group for the head.

        NOTE(review): the head group uses encoder_lr*10 (decoder_lr is not used), and
        backbone parameters whose names match none of the layer groups are not handed
        to the optimizer.
        """
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        total_layers = int(model.config.num_hidden_layers)
        group_layers = int(total_layers/4)
        group1 = [f"layer.{i}." for i in range(group_layers)]
        group2 = [f"layer.{i}." for i in range(group_layers, group_layers*2)]
        group3 = [f"layer.{i}." for i in range(group_layers*2, total_layers)]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],
             'lr': encoder_lr/3, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],
             'lr': encoder_lr*3.5, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group1)],
             'lr': encoder_lr/3, 'weight_decay': 0.0},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group2)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay) and any(nd in n for nd in group3)],
             'lr': encoder_lr*3.5, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': encoder_lr*10, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    def get_optimizer_params_llrd(model, encoder_lr, decoder_lr, weight_decay, llr_decay):
        """Layer-wise learning-rate decay: the top encoder layer gets encoder_lr and
        every layer below it (down to the embeddings) gets the previous rate times
        llr_decay; the head gets decoder_lr.

        NOTE(review): only `embeddings` and the entries of `encoder.layer` are
        enumerated; any other backbone parameters are not handed to the optimizer.
        """
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if "model" not in n],
                "lr": decoder_lr,
                "weight_decay": 0.0,
            }
        ]
        layers = [model.model.embeddings] + list(model.model.encoder.layer)
        layers.reverse()
        lr = encoder_lr
        for layer in layers:
            optimizer_grouped_parameters += [
                {
                    "params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)],
                    "weight_decay": weight_decay,
                    "lr": lr,
                },
                {
                    "params": [p for n, p in layer.named_parameters() if any (nd in n for nd in no_decay)],
                    "weight_decay": 0.0,
                    "lr": lr,
                }
            ]
            lr *= llr_decay
        return optimizer_grouped_parameters

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        """One learning rate for the whole backbone and one for the head."""
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    if CFG.llrd == "grouped":
        optimizer_parameters = get_optimizer_params_groupedllrd(model,
                                                                encoder_lr=CFG.encoder_lr,
                                                                decoder_lr=CFG.decoder_lr,
                                                                weight_decay=CFG.weight_decay)
    elif CFG.llrd == "normal":
        optimizer_parameters = get_optimizer_params_llrd(model,
                                                         encoder_lr=CFG.encoder_lr,
                                                         decoder_lr=CFG.decoder_lr,
                                                         weight_decay=CFG.weight_decay,
                                                         llr_decay=CFG.llr_decay)
    else:
        optimizer_parameters = get_optimizer_params(model,
                                                    encoder_lr=CFG.encoder_lr,
                                                    decoder_lr=CFG.decoder_lr,
                                                    weight_decay=CFG.weight_decay)

    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    def get_scheduler(cfg, optimizer, num_train_steps):
        """Build the per-step learning-rate schedule named by cfg.scheduler."""
        num_warmup_steps = int(cfg.num_warmup_steps_rate * num_train_steps)
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        else:
            # Previously an unknown name surfaced as an UnboundLocalError.
            raise ValueError(f"unsupported scheduler: {cfg.scheduler!r}")
        return scheduler

    num_steps_total = CFG.epochs * len(train_loader)

    # FGM
    fgm = FGM(model)

    # SWA: AveragedModel deep-copies the network, so it is only built when SWA is
    # actually enabled (train_func touches these objects only in that case).
    swa_model = None
    swa_scheduler = None
    if CFG.swa:
        swa_model = AveragedModel(model)
        swa_scheduler = SWALR(
            optimizer,
            swa_lr=CFG.swa_lr,
            anneal_strategy=CFG.anneal_strategy
        )
    CFG.swa_start = CFG.swa_start_ratio * num_steps_total

    if CFG.eval_method == "epoch":
        # Validate once per epoch, on the last training step.
        CFG.eval_step = len(train_loader)

    scheduler = get_scheduler(CFG, optimizer, int(num_steps_total))
    criterion = nn.SmoothL1Loss(reduction='mean')

    progress_bar_train = tqdm(range(num_steps_total))
    global_step = 0
    best_score = np.inf

    for epoch in range(CFG.epochs):
        global_step, val_folds, best_score, mean_training_loss = train_and_eval_epoch(train_loader, val_loader, val_labels, val_folds, best_score, model, criterion, optimizer, epoch, scheduler, fgm, swa_model, swa_scheduler, progress_bar_train, device, global_step, fold)
        print(f"best score after epoch {epoch + 1} : {best_score} | training loss: {mean_training_loss}")

    if CFG.swa:
        update_bn(train_loader, swa_model, device=device)
        mean_loss, predictions = val(val_loader, swa_model, criterion, device)
        # get_score returns (mean score, per-column scores); only the mean is compared.
        swa_score, _ = get_score(val_labels, predictions)
        print(f"SWA score: {swa_score}")
        if swa_score < best_score:
            print(f"saving model with score: {swa_score}")
            # Keep the out-of-fold predictions in sync with the checkpoint on disk.
            val_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions
            # Save the averaged network itself (`.module`) so the file has the same
            # keys as a regular checkpoint written by train_and_eval.
            msd_tag = "msd" if CFG.msd else "nomsd"
            torch.save(swa_model.module.state_dict(), OUTPUT_DIR + f"modelfold{fold + 1}{CFG.llrd}llrd{msd_tag}{CFG.init_weight}.pth")

    torch.cuda.empty_cache()
    gc.collect()
    return val_folds

In [42]:
# Train every configured fold and collect the out-of-fold (OOF) validation frames.
if CFG.train:
    oof_df = pd.DataFrame()
    for fold in CFG.trn_fold:
        _oof_df = train_loop(train, fold)
        oof_df = pd.concat([oof_df, _oof_df])
        print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
        oof_df = oof_df.reset_index(drop=True)
        # Outside debug mode the cumulative OOF frame is re-saved after each fold.
        if not CFG.debug:
            oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl', protocol = 4)



Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


reinitializing last 1 layers


  0%|          | 0/366 [00:00<?, ?it/s]

step 366 | training loss: 0.16973647475242615 | training score: 0.5615291595458984 | validation loss: 0.11985740065574646 | validation score: 0.4935551053332832
saving best model with score: 0.4935551053332832

best score after epoch 1 : 0.4935551053332832 | training loss: 0.21864551305770874
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


reinitializing last 1 layers


  0%|          | 0/366 [00:00<?, ?it/s]

step 366 | training loss: 0.1444847583770752 | training score: 0.5284966826438904 | validation loss: 0.11282975226640701 | validation score: 0.47561079273025114
saving best model with score: 0.47561079273025114

best score after epoch 1 : 0.47561079273025114 | training loss: 0.180236354470253
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


reinitializing last 1 layers


  0%|          | 0/367 [00:00<?, ?it/s]

step 367 | training loss: 0.1549607515335083 | training score: 0.5432860851287842 | validation loss: 0.1263970285654068 | validation score: 0.5060120220619165
saving best model with score: 0.5060120220619165

best score after epoch 1 : 0.5060120220619165 | training loss: 0.2148781269788742
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


reinitializing last 1 layers


  0%|          | 0/366 [00:00<?, ?it/s]

step 366 | training loss: 0.12016137689352036 | training score: 0.48696354031562805 | validation loss: 0.1208503320813179 | validation score: 0.49383887790481107
saving best model with score: 0.49383887790481107

best score after epoch 1 : 0.49383887790481107 | training loss: 0.20390519499778748
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


In [43]:
if CFG.wandb:
    # Weights & Biases logging was enabled in the config: close the run.
    wandb.finish()

In [44]:
# Score the accumulated fold predictions against the ground-truth targets.
# Prediction columns follow the "pred_<target>" naming convention.
oof_pred_cols = [f"pred_{c}" for c in CFG.target_cols]
oof_targets = oof_df[CFG.target_cols].values
oof_preds = oof_df[oof_pred_cols].values

cv = get_score(oof_targets, oof_preds)
print(cv)

(0.49239353943354386, [0.5206828528015025, 0.476696090454622, 0.46493775960974687, 0.49219390925825496, 0.5185058621330008, 0.4813447623441358])
