### Imports

In [None]:
# !pip install transformers datasets sentencepiece optuna ray opendatasets
# !nvidia-smi

In [None]:
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools

import scipy as sp
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
import warnings
warnings.filterwarnings("ignore")

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### CFG

In [None]:
class CFG:
    """Experiment configuration, used as a plain namespace of class attributes."""
    wandb=False
    competition=''
    _wandb_kernel=''
    debug=False  # when True, epochs and trn_fold are shrunk right after this class
    apex=True  # toggles torch.cuda.amp autocast / GradScaler in the training step
    model="microsoft/deberta-v3-base"  # checkpoint name for tokenizer and backbone
    gradient_checkpointing=True
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5  # passed to the cosine schedule
    num_warmup_steps=0
    epochs=8
    encoder_lr=5e-6  # learning rate for the backbone parameter groups
    decoder_lr=5e-6  # learning rate for the non-backbone parameter group
    min_lr=1e-6
    eps=1e-6  # AdamW epsilon
    betas=(0.9, 0.999)  # AdamW betas
    batch_size=8  # training batch size; validation uses twice this
    num_workers=0  # DataLoader worker processes (0 = load in the main process)
    max_len=512  # overwritten later from the longest tokenized training text
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_cols=[]  # label column names; must be filled in before training
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    train=True
    
# Debug runs are cut down to two epochs on the first fold only.
if CFG.debug:
    CFG.epochs, CFG.trn_fold = 2, [0]

# Path prefixes that are string-concatenated with file names further down.
DATA_DIR = ""
OUTPUT_DIR = ""

### Helper Functions

In [None]:
def get_score(y_hat, y):
    """Return the mean column-wise RMSE (MCRMSE) between predictions and targets.

    The original placeholder returned ``None``, which made every
    ``mean_score += score`` in the training/validation loops raise.
    MCRMSE is used as the default because the training loop treats a lower
    score as better; swap the body if the task is scored differently.

    Args:
        y_hat: Predictions, array-like or tensor of shape (n,) or (n, targets).
        y: Ground truth with the same number of elements as ``y_hat``.

    Returns:
        A Python float: RMSE computed per target column, then averaged.
    """
    # detach/cpu/float make this safe for half-precision GPU tensors that
    # still carry autograd history.
    preds = torch.as_tensor(y_hat).detach().float().cpu()
    targets = torch.as_tensor(y).detach().float().cpu()
    if preds.ndim < 2:
        preds = preds.reshape(-1, 1)
    targets = targets.reshape(preds.shape)
    per_column_rmse = torch.sqrt(((preds - targets) ** 2).mean(dim=0))
    return per_column_rmse.mean().item()

def cv_split(func):
    """Instantiate the splitter factory *func* with the configured fold settings.

    *func* is called with ``n_splits``, ``shuffle`` and ``random_state``
    keyword arguments and its result is returned unchanged.
    """
    options = dict(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
    return func(**options)

def seed_everything(seed):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
    
# Seed every RNG up front with the configured seed.
seed_everything(seed=CFG.seed)

### Load Data

In [None]:
# Read the three CSV files that live under DATA_DIR.
train = pd.read_csv(DATA_DIR + "train.csv")
test = pd.read_csv(DATA_DIR + "test.csv")
submission = pd.read_csv(DATA_DIR + "sample_submission.csv")

# Show the shape and the first rows of each frame, in load order.
for _label, _frame in (("train", train), ("test", test), ("submission", submission)):
    print(f"{_label}.shape: {_frame.shape}")
    display(_frame.head())

### CV Split

In [None]:
# Build the fold splitter. The original passed None (which cv_split then tried
# to call) and iterated over an undefined `Fold` name. The targets passed to
# split() are a multi-column frame, so the multilabel stratifier imported at
# the top of the file is used.
splitter = cv_split(MultilabelStratifiedKFold)
for n, (train_index, val_index) in enumerate(splitter.split(train, train[CFG.target_cols])):
    # Tag each row with the fold in which it serves as validation data.
    train.loc[val_index, 'fold'] = int(n)
# .loc assignment on a new column yields floats; store fold ids as ints.
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

### Tokenizer

In [None]:
# Load the tokenizer for the configured checkpoint, expose it on CFG as well,
# and write a copy under OUTPUT_DIR.
CFG.tokenizer = tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')

### Dataset

In [None]:
# Tokenize every training text (without special tokens) to find the longest one.
tk0 = tqdm(train['full_text'].fillna("").values, total=len(train))
lengths = [
    len(tokenizer(text, add_special_tokens=False)['input_ids'])
    for text in tk0
]
CFG.max_len = max(lengths) + 2 # cls & sep
print(f"max_len: {CFG.max_len}")

In [None]:
def prepare_input(cfg, text):
    """Tokenize *text* into fixed-length long tensors.

    Args:
        cfg: Config object providing ``tokenizer`` and ``max_len``.
        text: The raw string to encode.

    Returns:
        The tokenizer's output mapping, with every value converted to a
        ``torch.long`` tensor padded/truncated to ``cfg.max_len``.
    """
    # Use the cfg that was passed in (the original read the global CFG) and the
    # current padding API instead of the deprecated `pad_to_max_length` flag.
    inputs = cfg.tokenizer(
        text,
        return_tensors=None,
        add_special_tokens=True,
        max_length=cfg.max_len,
        padding="max_length",
        truncation=True,
    )
    for key in list(inputs.keys()):
        inputs[key] = torch.tensor(inputs[key], dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    """Dataset yielding ``(tokenized_inputs, label_tensor)`` pairs from a frame.

    Named ``TrainDataset`` because that is the name the training loop
    instantiates; the original ``class Dataset(Dataset)`` also shadowed the
    imported torch base class.

    Args:
        cfg: Config object providing ``target_cols`` (and whatever
            ``prepare_input`` reads from it).
        df: DataFrame with a ``full_text`` column and the target columns.
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        # Missing texts become empty strings, as in the max-length scan.
        self.texts = df['full_text'].fillna("").values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label
    

def collate(inputs):
    """Trim every tensor in *inputs* to the longest attended sequence.

    The cut-off is the largest per-row sum of ``attention_mask``. *inputs* is
    modified in place and also returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    inputs.update({key: tensor[:, :longest] for key, tensor in inputs.items()})
    return inputs

### Model

In [None]:
# Load the pretrained encoder once at module level; Model reads this global.
backbone = AutoModel.from_pretrained(CFG.model)

In [None]:
class Pooling(nn.Module):
    """Masked mean pooling over the sequence dimension.

    The original ``forward`` took no ``self`` and could not be called as a
    module. Mean pooling is provided as a default; replace the body to use a
    different pooling strategy.
    """

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average the token vectors whose mask entry is non-zero.

        Args:
            last_hidden_state: Tensor of shape (batch, seq_len, hidden).
            attention_mask: Tensor of shape (batch, seq_len); non-zero marks
                tokens to include.

        Returns:
            Tensor of shape (batch, hidden).
        """
        mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
        summed = (last_hidden_state * mask).sum(dim=1)
        # Token counts are whole numbers, so clamping at 1 only affects
        # fully-masked rows, where it avoids a 0/0 NaN (also safe in fp16).
        counts = mask.sum(dim=1).clamp(min=1.0)
        return summed / counts

In [None]:
class Model(nn.Module):
    """Transformer encoder with a linear regression head.

    Changes from the original stub:
    * ``forward`` now takes ``self`` and the batch, so the module is callable.
    * The constructor accepts ``(cfg, pretrained=...)`` and exposes ``config``
      and ``model``, which is how the training loop uses this class.
    * Each instance trains its own copy of the module-level ``backbone``
      instead of sharing one set of weights across all instances/folds.

    Args:
        cfg: Config object; falls back to the global ``CFG`` when omitted.
        pretrained: When True, start from a copy of the pretrained
            ``backbone``; otherwise build the same architecture with freshly
            initialised weights.
    """

    def __init__(self, cfg=None, pretrained=True):
        super().__init__()
        self.cfg = CFG if cfg is None else cfg
        if pretrained:
            self.model = copy.deepcopy(backbone)
        else:
            self.model = AutoModel.from_config(backbone.config)
        self.config = self.model.config
        if getattr(self.cfg, "gradient_checkpointing", False):
            self.model.gradient_checkpointing_enable()
        # One output per target column.
        self.fc = nn.Linear(self.config.hidden_size, len(self.cfg.target_cols))

    @property
    def backbone(self):
        """Alias for ``self.model``, kept for code that used the old name."""
        return self.model

    def forward(self, inputs):
        """Encode a batch and return one prediction per target column.

        Args:
            inputs: Mapping of tokenizer outputs; must contain
                ``attention_mask`` plus whatever the encoder expects.

        Returns:
            Tensor of shape (batch, len(cfg.target_cols)).
        """
        hidden = self.model(**inputs).last_hidden_state
        # Masked mean over the sequence dimension; the clamp avoids 0/0 on a
        # fully-masked row.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
        return self.fc(pooled)

### Training

In [None]:
def train_epoch(train_loader, model, criterion, optimizer, epoch, scheduler, progress_bar, device):
    """Run one training epoch and return ``(mean_loss, mean_score)``.

    Renamed from ``train``: that name rebound the ``train`` DataFrame to this
    function, and the training loop calls ``train_epoch``.

    Args:
        train_loader: Iterable of ``(inputs, labels)`` batches.
        model: Module called as ``model(inputs)``.
        criterion: Loss called as ``criterion(predictions, labels)``.
        optimizer: Optimizer stepped once per batch.
        epoch: Current epoch index (unused; kept for the caller's signature).
        scheduler: LR scheduler stepped once per batch.
        progress_bar: Object whose ``update(1)`` is called once per batch.
        device: Device the batch is moved to before the forward pass.

    Returns:
        Tuple of floats averaged over batches; both are NaN when the loader
        yields no batches.
    """
    total_loss = 0.0
    total_score = 0.0
    count = 0

    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    for x, y in train_loader:
        count += 1
        # Drop padding columns no row attends to, then move the batch to the
        # target device (the original left it on the CPU).
        x = collate(x)
        x = x.to(device) if hasattr(x, "to") else {k: v.to(device) for k, v in x.items()}
        y = y.to(device)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            yhat = model(x)
            loss = criterion(yhat, y)
        score = get_score(yhat.detach(), y)
        # Accumulate plain floats so no autograd history is kept alive.
        total_loss += loss.item()
        total_score += float(score)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        scheduler.step()
        progress_bar.update(1)

    if count == 0:
        return float("nan"), float("nan")
    return total_loss / count, total_score / count

def val_epoch(val_loader, model, criterion, epoch, progress_bar, device):
    """Evaluate the model on a loader and return ``(mean_loss, mean_score)``.

    Renamed from ``val`` to match the name the training loop calls.

    Args:
        val_loader: Iterable of ``(inputs, labels)`` batches.
        model: Module called as ``model(inputs)``.
        criterion: Loss called as ``criterion(predictions, labels)``.
        epoch: Current epoch index (unused; kept for the caller's signature).
        progress_bar: Object whose ``update(1)`` is called once per batch.
        device: Device the batch is moved to before the forward pass.

    Returns:
        Tuple of floats averaged over batches; both are NaN when the loader
        yields no batches.
    """
    total_loss = 0.0
    total_score = 0.0
    count = 0

    model.eval()
    with torch.inference_mode():
        for x, y in val_loader:
            count += 1
            # Drop padding columns no row attends to, then move the batch to
            # the target device (the original left it on the CPU).
            x = collate(x)
            x = x.to(device) if hasattr(x, "to") else {k: v.to(device) for k, v in x.items()}
            y = y.to(device)
            yhat = model(x)
            total_loss += criterion(yhat, y).item()
            total_score += float(get_score(yhat, y))
            progress_bar.update(1)

    if count == 0:
        return float("nan"), float("nan")
    return total_loss / count, total_score / count

def _get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    """Build AdamW parameter groups: encoder with decay, encoder without, head."""
    no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")
    encoder_params = list(model.model.named_parameters())
    return [
        {'params': [p for n, p in encoder_params if not any(nd in n for nd in no_decay)],
         'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': [p for n, p in encoder_params if any(nd in n for nd in no_decay)],
         'lr': encoder_lr, 'weight_decay': 0.0},
        # Prefix match, so a head parameter whose name merely contains "model"
        # is not silently left out of the optimizer.
        {'params': [p for n, p in model.named_parameters() if not n.startswith("model.")],
         'lr': decoder_lr, 'weight_decay': 0.0},
    ]


def _get_scheduler(cfg, optimizer, num_train_steps):
    """Return the warmup scheduler selected by ``cfg.scheduler``."""
    if cfg.scheduler == 'linear':
        return get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    if cfg.scheduler == 'cosine':
        return get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps,
            num_cycles=cfg.num_cycles
        )
    # The original fell through and hit an unbound local here.
    raise ValueError(f"unknown scheduler: {cfg.scheduler!r} (expected 'linear' or 'cosine')")


def train_loop(train, fold):
    """Train and validate one cross-validation fold, saving the best model.

    Args:
        train: DataFrame with a ``fold`` column, ``full_text`` and the target
            columns.
        fold: Fold id held out for validation; all other rows are trained on.

    Side effects:
        Writes ``config.pth`` and ``modelfold{fold + 1}.pth`` under
        ``OUTPUT_DIR``. The model file is rewritten whenever the mean
        validation score improves (lower is better).
    """
    print(f"============== fold: {fold + 1} training ==============")

    train_folds = train[train['fold'] != fold].reset_index(drop=True)
    val_folds = train[train['fold'] == fold].reset_index(drop=True)

    train_dataset = TrainDataset(CFG, train_folds)
    val_dataset = TrainDataset(CFG, val_folds)

    # Fall back to the DataLoader default when the config has no num_workers.
    num_workers = getattr(CFG, "num_workers", 0)
    train_loader = DataLoader(
        train_dataset,
        batch_size=CFG.batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=True,
        drop_last=True
        )
    val_loader = DataLoader(
        val_dataset,
        batch_size=CFG.batch_size * 2,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True,
        drop_last=False
        )

    model = Model(CFG, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+"config.pth")
    model.to(device)

    optimizer_parameters = _get_optimizer_params(model,
                                                 encoder_lr=CFG.encoder_lr,
                                                 decoder_lr=CFG.decoder_lr,
                                                 weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # The scheduler is stepped once per batch, so it spans every batch of
    # every epoch.
    num_steps_total = CFG.epochs * len(train_loader)
    num_steps_val = len(val_loader)

    scheduler = _get_scheduler(CFG, optimizer, num_steps_total)
    criterion = nn.SmoothL1Loss(reduction='mean')

    progress_bar_train = tqdm(range(num_steps_total))

    best_score = np.inf

    for epoch in range(CFG.epochs):
        mean_loss_train, mean_score_train = train_epoch(train_loader, model, criterion, optimizer, epoch, scheduler, progress_bar_train, device)
        print("================================================================")
        print(f"mean training loss at epoch {epoch + 1}: {mean_loss_train}")
        # Was `{epoch}`, i.e. one behind every other epoch number printed here.
        print(f"mean training score at epoch {epoch + 1}: {mean_score_train}")

        print(f"============== epoch: {epoch + 1} evaluating ==============")
        progress_bar_val = tqdm(range(num_steps_val))
        mean_loss_val, mean_score_val = val_epoch(val_loader, model, criterion, epoch, progress_bar_val, device)
        progress_bar_val.close()
        print(f"mean validation loss at epoch {epoch + 1}: {mean_loss_val}")
        print(f"mean validation score at epoch {epoch + 1}: {mean_score_val}")

        if mean_score_val < best_score:
            best_score = mean_score_val
            print(f"saving best model with score: {best_score}")
            # NOTE: this pickles the whole module, so loading it requires the
            # same class definitions to be importable.
            torch.save(model, OUTPUT_DIR + f"modelfold{fold + 1}.pth")
        print("================================================================")
        print(" ")
    progress_bar_train.close()

    # Drop the references first so the collector and the CUDA cache can
    # actually release this fold's memory.
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Train each configured fold. The original referenced an undefined `fold`
# and ignored both CFG.train and CFG.trn_fold.
if CFG.train:
    for fold in CFG.trn_fold:
        train_loop(train, fold)