## Import and EDA

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import scipy as sp
from datasets import Dataset,DatasetDict

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam,SGD,AdamW

from sklearn.model_selection import StratifiedGroupKFold
import shutil
import random,os,math, time
from tqdm.auto import tqdm

import gc

import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
# Notebook magic: set TOKENIZERS_PARALLELISM in the kernel's environment.
%env TOKENIZERS_PARALLELISM =true

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

In [None]:
class CFG:
    """Experiment configuration, read as ``CFG.<name>`` by the rest of the notebook."""
    # Passed to seed_everything() and used as random_state for the pre-split shuffle.
    seed = 42
    debug = False
    # When True the __main__ cell runs train_loop for every fold in trn_fold.
    train = True
    # n_splits for StratifiedGroupKFold.
    n_fold = 4
    # Indices of the folds that are actually trained.
    trn_fold = [3]
    # Enables torch.cuda.amp autocast and GradScaler in train_fn.
    apex = True
    # Progress is printed every print_freq batches.
    print_freq = 100
    # Output width of the model's final nn.Linear.
    target_size = 1
    # DataLoader worker count.
    num_workers = 4
    scheduler = 'linear' # supported values: 'linear', 'cosine'
    # When True the LR scheduler is stepped after every optimizer step.
    batch_scheduler = True
    # Only passed to the cosine schedule.
    num_cycles = 0.5
    num_warmup_steps = 0 # other values tried: 50 100 500 1000
    epochs = 4
    # Learning rate for the transformer encoder parameter groups.
    encoder_lr = 4e-6 # other values tried: 5e-6 8e-6 9e-6
    # Learning rate for every parameter outside the encoder.
    decoder_lr = 4e-6
    # Not referenced anywhere in the visible part of the notebook.
    min_lr = 1e-6
    # AdamW eps / betas.
    eps = 1e-6
    betas = (0.9, 0.999)
    batch_size = 32
    # Probability for the model's fc_dropout layer.
    fc_dropout = 0.15 # other values tried: 0, 0.15, 0.3
    # Initial value only; overwritten later from measured token lengths.
    max_len = 512
    # Applied to encoder weights whose names do not match the no-decay list.
    weight_decay = 0.01
    criterion = "BCE"   # supported values: "BCE", "MSE"
    gradient_accumulation_steps =1
    # Threshold passed to clip_grad_norm_ in train_fn.
    max_grad_norm = 1000
    # Alternative checkpoints that were tried:
    # model_nm = '../input/deberta-v3-large/deberta-v3-large'
    # model_nm = 'anferico/bert-for-patents'
    # model_nm = '../input/bert-for-patents/bert-for-patents'
#     model_nm = 'microsoft/deberta-base'
    # Name/path handed to AutoTokenizer / AutoConfig / AutoModel; also used in the checkpoint file name.
    model_nm = 'google/electra-large-discriminator'

In [None]:
# Notebook-level switches used by the cells below.
# Include the CPC title text in the model input.
use_title = True
# Prefix for every artifact written by the notebook (tokenizer, config, log, checkpoints, OOF pickle).
output_dir = "./"
# Separator string inserted between fields when use_sep is True.
sep = "[SEP]"
# False selects the 'TEXT1: ...; ANC1: ...' prompt layout instead of separator-joined fields.
use_sep = False
# When False only the first 10000 rows are kept.
use_full_dataset = True
# Not referenced anywhere in the visible part of the notebook.
use_cross_val = True

In [None]:
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Also exports ``PYTHONHASHSEED`` with the same value.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Each of these takes the seed as its only argument.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
    
# Seed all RNGs once, up front, with the configured seed.
seed_everything(seed=CFG.seed)

In [None]:
# Directory holding the competition CSV files.
path = Path('../input/us-patent-phrase-to-phrase-matching')
# Offline install of the `datasets` package from a local directory.
# NOTE(review): `datasets` is already imported in the first cell, so this install
# runs after that import — confirm the intended cell order.
! pip install --no-index --find-links ../input/huggingface-datasets datasets -q
df = pd.read_csv(path/'train.csv')
# CPC code table; every column is read as a string.
titles = pd.read_csv('../input/cpc-codes/titles.csv', dtype=str)
# Attach the columns of `titles` to each row by matching context == code
# (merge defaults to an inner join, so rows without a matching code are dropped).
df = df.merge(titles, left_on='context', right_on='code')

# Give every distinct anchor an integer id (`anchor_val`); it is used later as
# the group key for the cross-validation split.
df_anchors = pd.DataFrame(df['anchor'].unique(),columns = ['anchor'])
df_anchors['anchor_val'] = np.arange(len(df_anchors))

df = df.merge(df_anchors, left_on='anchor', right_on='anchor')
df.head()

In [None]:
# Assemble the text fed to the model. Two switches pick one of four layouts:
# use_title adds the CPC title, use_sep joins the fields with `sep` instead of
# the 'TEXTn:' / 'ANC1:' prompt labels.
if use_title and use_sep:
    df['input'] = df.context + sep + df.title + sep + df.target + sep + df.anchor
elif use_title:
    df['input'] = 'TEXT1: ' + df.context + ';TEXT2: ' + df.title + '; TEXT3: ' + df.target + '; ANC1: ' + df.anchor
elif use_sep:
    df['input'] = df.context + sep + df.target + sep + df.anchor
else:
    df['input'] = 'TEXT1: ' + df.context + '; TEXT2: ' + df.target + '; ANC1: ' + df.anchor

# Optionally work on the first 10000 rows only.
if not use_full_dataset:
    df = df.iloc[:10000]

###################Tokenizer################################
# Load the tokenizer that matches the configured checkpoint and store a copy
# under the output directory.
tokz = AutoTokenizer.from_pretrained(CFG.model_nm)
tokz.save_pretrained(output_dir+'tokenizer/')

In [None]:
# Token counts of each text column, measured without special tokens.
len_anchor = [len(tokz(text, add_special_tokens=False)['input_ids']) for text in df['anchor'].values]
len_target = [len(tokz(text, add_special_tokens=False)['input_ids']) for text in df['target'].values]
len_title = [len(tokz(text, add_special_tokens=False)['input_ids']) for text in df['title'].values]
# Token counts of the fully assembled inputs, special tokens included.
len_input = [len(tokz(text, add_special_tokens=True)['input_ids']) for text in df['input'].values]

# The component-wise bound does not count the context code or any prompt
# labels / separators that are part of df['input'], so on its own it can be
# shorter than the longest real input. Padding to a length below that would
# yield tensors of unequal size. Take whichever bound is larger.
CFG.max_len = max(
    max(len_anchor)+ max(len_target)+ max(len_title)+ 5, #(for safety)
    max(len_input),
)
print(max(len_anchor))
print(max(len_target))
print(max(len_title))
print(CFG.max_len)

In [None]:
###################Dataset################################
def tok_func(x):
    """Tokenize one input string into ``torch.long`` tensors of length ``CFG.max_len``.

    Args:
        x: the text to encode.

    Returns:
        The tokenizer's output mapping with every value converted to a
        ``torch.long`` tensor.
    """
    inputs = tokz(x,
                  add_special_tokens=True,
                  max_length=CFG.max_len,
                  padding="max_length",
                  # Without truncation an input longer than max_len keeps its
                  # full length, and samples of different sizes cannot be
                  # stacked into a batch.
                  truncation=True,
                  return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    """Dataset yielding ``(tokenized_input, score)`` pairs from a dataframe.

    The dataframe must provide an ``'input'`` column (text) and a ``'score'``
    column (label).
    """

    def __init__(self, df):
        # Extract both columns as arrays once, up front.
        input_column, score_column = df['input'], df['score']
        self.texts = input_column.values
        self.labels = score_column.values

    def __len__(self):
        """Number of samples."""
        return len(self.labels)

    def __getitem__(self, item):
        """Return the tokenized text and its label as a float tensor."""
        score = torch.tensor(self.labels[item], dtype=torch.float)
        return tok_func(self.texts[item]), score

In [None]:
# Splitter that stratifies on the label and takes a group key per row.
cv = StratifiedGroupKFold(n_splits=CFG.n_fold)
# Shuffle all rows reproducibly before splitting.
df = df.sample(frac =1 , random_state = CFG.seed)
# Stratification labels: the score scaled by 100 and cast to int.
scores = (df.score*100).astype(int)
idxs = np.arange(len(df))
# One (train_positions, valid_positions) pair per fold; anchor_val is the
# group key, and the positions refer to the shuffled df.
folds = list(cv.split(idxs,scores,df.anchor_val))

def get_fold(folds, fold_num):
    """Return the ``(train, valid)`` pair stored at position ``fold_num`` of ``folds``."""
    train_part, valid_part = folds[fold_num]
    return (train_part, valid_part)

In [None]:
class CustomModel(nn.Module):
    """Transformer encoder followed by a masked-mean-pooling regression head.

    ``forward`` averages the encoder's last hidden states over the positions
    selected by the attention mask, applies a LayerNorm and projects the
    result to ``cfg.target_size`` values. ``feature`` provides an alternative,
    attention-weighted pooling of the same hidden states.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        """Build the encoder and the head layers.

        Args:
            cfg: configuration object; ``model_nm``, ``fc_dropout`` and
                ``target_size`` are read from it.
            config_path: optional path to a config object stored with
                ``torch.save``. When ``None`` the config is obtained with
                ``AutoConfig.from_pretrained(cfg.model_nm)``.
            pretrained: when True the encoder is created with
                ``AutoModel.from_pretrained``; otherwise it is instantiated
                from the config alone.
        """
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model_nm, output_hidden_states=True)
        else:
            # NOTE(review): torch.load unpickles the file, so only point this
            # at config files you created yourself.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model_nm, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        # NOTE(review): fc_dropout is constructed but neither forward() nor
        # feature() applies it.
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)
        self._init_weights(self.fc)
        # Scores every position and normalises the scores over dim 1; only
        # feature() uses it.
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        # _init_weights only acts on Linear / Embedding / LayerNorm modules,
        # so calling it on the Sequential container itself would do nothing.
        # apply() visits every sub-module and therefore reaches the two
        # Linear layers inside.
        self.attention.apply(self._init_weights)
        # NOTE(review): `linear` is constructed but not used by forward() or
        # feature().
        self.linear = nn.Linear(self.config.hidden_size, 1)

    def _init_weights(self, module):
        """Initialise a single module in place.

        Linear and Embedding weights are drawn from a normal distribution with
        standard deviation ``config.initializer_range``; Linear biases and the
        embedding row at ``padding_idx`` are zeroed; LayerNorm is reset to
        weight 1 / bias 0. Any other module type is left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Return the attention-weighted sum of the last hidden states over dim 1.

        Args:
            inputs: mapping of keyword arguments for the encoder.
        """
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        # feature = torch.mean(last_hidden_states, 1)
        weights = self.attention(last_hidden_states)
        feature = torch.sum(weights * last_hidden_states, dim=1)
        return feature

    def forward(self, inputs):
        """Run the encoder and the mean-pooling head.

        Args:
            inputs: mapping of keyword arguments for the encoder; must contain
                ``"attention_mask"``.

        Returns:
            The output of the final ``fc`` layer.
        """
        outputs = self.model(**inputs)
        last_hidden_state = outputs[0]
        # Broadcast the mask to the hidden-state shape so masked-out positions
        # contribute zero to the sum.
        input_mask_expanded = inputs["attention_mask"].unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        # Guard against division by zero when a row has an all-zero mask.
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        out = sum_embeddings / sum_mask

        out = self.layer_norm1(out)
        output = self.fc(out)
        return output

In [None]:
###################HELPERS################################
class AverageMeter(object):
    """Tracks the latest value plus the running weighted sum, count and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as seen ``n`` times and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count
        
def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``.

    The seconds part is truncated to a whole number.
    """
    minutes = math.floor(s / 60)
    leftover = s - minutes * 60
    return f"{minutes}m {int(leftover)}s"

def timeSince(since, percent):
    """Return ``'<elapsed>(remain <remaining>)'`` for a task started at ``since``.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work completed so far; the total duration is
            extrapolated as ``elapsed / percent``.
    """
    elapsed = time.time() - since
    remaining = elapsed / percent - elapsed
    return '{}(remain {})'.format(asMinutes(elapsed), asMinutes(remaining))

def get_score(y_true, y_pred):
    """Return the Pearson correlation coefficient between ``y_true`` and ``y_pred``."""
    # Import the function explicitly: `import scipy as sp` by itself does not
    # guarantee that the `stats` submodule is loaded, so `sp.stats` only works
    # when something else has already imported it.
    from scipy.stats import pearsonr

    return pearsonr(y_true, y_pred)[0]

def get_logger(filename=output_dir+'train'):
    """Return a logger that writes bare messages to the console and to ``<filename>.log``.

    Args:
        filename: path of the log file without the ``.log`` extension.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger hands back the same object on every call, so re-running this
    # cell used to stack another pair of handlers and print each message
    # several times. Detach and close whatever is already attached first.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()
    formatter = Formatter("%(message)s")
    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger


LOGGER = get_logger()

In [None]:
###################Train ,Validation,Inference function################################
def train_fn(fold, train_loader,model, criterion,optimizer, epoch,scheduler,device):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        fold: fold index (accepted for interface compatibility; not used here).
        train_loader: iterable of ``(inputs, labels)`` batches, where
            ``inputs`` is a mapping of tensors.
        model: module called as ``model(inputs)``.
        criterion: loss called on predictions and labels, both viewed as
            ``(-1, 1)``.
        optimizer: optimizer stepped through the gradient scaler.
        epoch: zero-based epoch index, used in the progress output only.
        scheduler: LR scheduler, stepped after each optimizer step when
            ``CFG.batch_scheduler`` is set.
        device: device the batch tensors are moved to.

    Returns:
        The sample-weighted average loss over the pass.
    """
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled = CFG.apex)
    losses = AverageMeter()
    start = time.time()
    num_batches = len(train_loader)
    # Only defined after the first optimizer step; NaN until then.
    grad_norm = float('nan')

    for step,(inputs,labels) in enumerate(train_loader):
        for k,v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled = CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds.view(-1,1),labels.view(-1,1))
        if CFG.gradient_accumulation_steps >1:
            loss = loss/CFG.gradient_accumulation_steps
        losses.update(loss.item(),batch_size)
        scaler.scale(loss).backward()
        if (step+1) %CFG.gradient_accumulation_steps == 0:
            # Gradients are still multiplied by the loss scale at this point.
            # Unscale them once per optimizer step so the clipping threshold
            # (and the reported norm) refer to the true gradient magnitudes.
            scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),CFG.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if CFG.batch_scheduler:
                scheduler.step()

        if step% CFG.print_freq == 0 or step ==(num_batches-1):
            print('EPOCH : [{0}][{1}/{2}] '
                 'Elapsed : {remain:s} '
                 'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                 'Grad: {grad_norm: .4f} '
                 'LR: {lr:.8f}'
                 .format(epoch+1,step,num_batches,
                        remain =timeSince(start,float(step+1)/num_batches),
                        loss = losses,
                        grad_norm = float(grad_norm),
                        # get_last_lr() is the supported accessor outside of
                        # scheduler.step().
                        lr = scheduler.get_last_lr()[0]))
    return losses.avg

def valid_fn(valid_loader, model,criterion,device):
    """Evaluate ``model`` on ``valid_loader`` without gradient tracking.

    Args:
        valid_loader: iterable of ``(inputs, labels)`` batches, where
            ``inputs`` is a mapping of tensors.
        model: module called as ``model(inputs)``.
        criterion: loss called on predictions and labels, both viewed as
            ``(-1, 1)``.
        device: device the batch tensors are moved to.

    Returns:
        ``(average_loss, predictions)`` where ``predictions`` is a flat NumPy
        array of sigmoid-transformed model outputs in loader order.
    """
    model.eval()
    losses = AverageMeter()
    preds = []
    start = time.time()
    num_batches = len(valid_loader)

    for step, (inputs,labels) in enumerate(valid_loader):
        for k,v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds.view(-1,1), labels.view(-1,1))
        # Scaled the same way as the training loss so both are comparable.
        if CFG.gradient_accumulation_steps>1:
            loss = loss/ CFG.gradient_accumulation_steps
        losses.update(loss.item(),batch_size)
        # NOTE(review): the sigmoid is applied regardless of CFG.criterion —
        # confirm this is intended when the criterion is "MSE".
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        if step % CFG.print_freq ==0 or step == (num_batches-1):
            print('EVAL: [{0}/{1}] '
                 'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f} ({loss.avg:.4f})'
                 .format(step, num_batches,
                        loss = losses,
                        remain = timeSince(start, float(step+1)/num_batches)))
    # Stack the per-batch arrays, then flatten to one value per row in a
    # single vectorised step.
    predictions = np.concatenate(preds).reshape(-1)
    return losses.avg,predictions

def inference_fn(test_loader,model,device):
    """Run ``model`` over ``test_loader`` and collect sigmoid-transformed outputs.

    Args:
        test_loader: iterable of input batches (mappings of tensors, no labels).
        model: module called as ``model(inputs)``; it is moved to ``device``.
        device: device used for the model and the batch tensors.

    Returns:
        NumPy array with the per-batch outputs concatenated along axis 0.
    """
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader,total = len(test_loader))

    for inputs in tk0:
        # `.items()` iterates the (name, tensor) pairs; the mapping has no
        # `.item()` method, so the previous spelling raised AttributeError.
        for k,v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [None]:
###################Train loop################################
def train_loop(df,folds,fold):
    """Train and validate one cross-validation fold.

    Args:
        df: dataframe with at least the ``'input'`` and ``'score'`` columns.
        folds: sequence of ``(train_positions, valid_positions)`` pairs.
        fold: index into ``folds`` of the split to run.

    Returns:
        A copy of the fold's validation rows with an added ``'pred'`` column
        holding the predictions of the best-scoring epoch.

    Raises:
        ValueError: if ``CFG.scheduler`` or ``CFG.criterion`` is not one of
            the supported names.
        RuntimeError: if no epoch produced a score that could be ranked.
    """

    LOGGER.info(f"-----------fold : {fold} training -----------")

    train_idx,val_idx = get_fold(folds,fold)
    train_folds = df.iloc[train_idx]
    # Work on an independent copy: a 'pred' column is added to it below.
    valid_folds = df.iloc[val_idx].copy()

    valid_labels = valid_folds['score'].values

    train_dataset = TrainDataset( train_folds)
    valid_dataset = TrainDataset( valid_folds)

    train_loader = DataLoader(train_dataset,
                             batch_size = CFG.batch_size,
                             shuffle = True,
                             num_workers = CFG.num_workers, pin_memory = True, drop_last = True)
    valid_loader = DataLoader(valid_dataset,
                             batch_size = CFG.batch_size,
                             shuffle = False,
                             num_workers = CFG.num_workers, pin_memory = True, drop_last = False)

    model = CustomModel(CFG,config_path = None, pretrained = True)
    torch.save(model.config, output_dir+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay = 0.0):
        """Split the parameters into three optimizer groups.

        Encoder parameters whose names match ``no_decay`` get no weight decay,
        the remaining encoder parameters get ``weight_decay``, and everything
        whose name does not contain "model" uses ``decoder_lr`` without decay.
        """
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params' : [p for n,p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
            'lr': encoder_lr,'weight_decay' : weight_decay},
            {'params' : [p for n,p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
            "lr": encoder_lr, 'weight_decay' : 0.0},
            {'params' : [p for n,p in model.named_parameters() if "model" not in n],
            'lr' : decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                               encoder_lr = CFG.encoder_lr,
                                               decoder_lr = CFG.decoder_lr,
                                               weight_decay = CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr= CFG.encoder_lr, eps = CFG.eps, betas = CFG.betas)

    def get_scheduler(cfg, optimizer,num_train_steps):
        """Create the warmup schedule named by ``cfg.scheduler``."""
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps = cfg.num_warmup_steps, num_training_steps = num_train_steps)
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps = cfg.num_warmup_steps, num_training_steps = num_train_steps, num_cycles = cfg.num_cycles)
        else:
            raise ValueError(f"Unsupported CFG.scheduler: {cfg.scheduler!r}")

        return scheduler

    # The scheduler advances once per optimizer step, so count those: the
    # loader already reflects drop_last, and one optimizer step is taken per
    # `gradient_accumulation_steps` batches.
    steps_per_epoch = len(train_loader) // CFG.gradient_accumulation_steps
    num_train_steps = steps_per_epoch * CFG.epochs
    scheduler = get_scheduler(CFG,optimizer,num_train_steps)

    if CFG.criterion == "BCE":
        criterion =  nn.BCEWithLogitsLoss(reduction = 'mean')
    elif CFG.criterion == "MSE":
        criterion = nn.MSELoss(reduction = 'mean')
    else:
        raise ValueError(f"Unsupported CFG.criterion: {CFG.criterion!r}")

    # Start below any reachable correlation so the first valid epoch is always
    # kept, and hold the best predictions in memory instead of re-reading the
    # checkpoint (which could be missing or left over from an earlier run).
    best_score = float('-inf')
    best_predictions = None

    for epoch in range(CFG.epochs):
        start_time = time.time()

        avg_loss = train_fn(fold,train_loader,model,criterion,optimizer,epoch,scheduler,device)

        avg_val_loss, predictions = valid_fn(valid_loader,model,criterion,device)
        print(predictions.shape)
        print(valid_labels.shape)
        score = get_score(valid_labels,predictions)

        elapsed = time.time()-start_time

        LOGGER.info(f'EPOCH {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss:{avg_val_loss:.4f} time:{elapsed:.0f}s')
        LOGGER.info(f'EPOCH {epoch+1} - SCORE: {score:.4f}')

        if best_score <score:
            best_score = score
            best_predictions = predictions
            LOGGER.info(f'EPOCH {epoch+1} - Save best score: {best_score: .4f} Model')
            torch.save({'model' : model.state_dict(),
                       'predictions': predictions},
                      output_dir + f"{CFG.model_nm.replace('/','-')}_fold{fold}_best.pth")

    if best_predictions is None:
        raise RuntimeError(
            f"fold {fold}: no epoch produced a rankable score, so no checkpoint was saved")

    valid_folds['pred'] = best_predictions

    # Drop the references first; otherwise the cached GPU memory is still in
    # use and empty_cache() has nothing to release.
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()

    return valid_folds

In [None]:
if __name__ == '__main__':
    def get_result(oof_df):
        """Log the Pearson score of the 'pred' column against the 'score' column."""
        labels = oof_df['score'].values
        preds = oof_df['pred'].values
        score = get_score(labels,preds)
        LOGGER.info(f'Score: {score:<.4f}')

    if CFG.train:
        # Collect the per-fold frames and concatenate once at the end rather
        # than growing a dataframe that starts out empty.
        fold_frames = []
        for fold in CFG.trn_fold:
            _oof_df = train_loop(df,folds,fold)
            fold_frames.append(_oof_df)
            LOGGER.info(f"----------fold:{fold} result----------")
            get_result(_oof_df)
        if fold_frames:
            oof_df = pd.concat(fold_frames).reset_index(drop = True)
            LOGGER.info("======== CV ========")
            get_result(oof_df)
            oof_df.to_pickle(output_dir+'oof_df.pkl')
        else:
            # Nothing was trained, so there is nothing to score or save.
            LOGGER.warning("CFG.trn_fold is empty - no out-of-fold predictions produced")