# Directory settings

In [None]:
import os
# Competition data location and the directory that receives the log file.
INPUT_DIR = '../input/us-patent-phrase-to-phrase-matching/'
OUTPUT_DIR = './'
# exist_ok removes the check-then-create race and makes re-running the cell a no-op.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
class CFG_1:
    """Shared inference settings.

    The ensemble loop overwrites `seed`, `path`, `fc_dropout`, `max_len`,
    `model` and `config_path` for each model and also attaches a `tokenizer`
    attribute; the empty strings below are placeholders until then.
    """
    # Values used unchanged for every model.
    num_workers = 4
    batch_size = 32
    target_size = 1
    n_fold = 4
    trn_fold = [0, 1, 2, 3]
    seed = 42

    # Per-model placeholders (example values in the trailing comments).
    path = ""         # "../input/uspp-full-models/"
    config_path = ""  # path + 'seed_777_config.pth'
    model = ""        # "microsoft/deberta-v3-large"
    fc_dropout = ""   # 0.2
    max_len = ""      # 140

In [None]:
# Switches that select how the `input` text column is assembled:
# use_title adds the CPC title, use_sep joins fields with `sep` instead of labels.
use_title, use_sep = True, False
# Separator string placed between fields when use_sep is enabled.
sep = "[s]"

def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch generators and request deterministic cuDNN.

    Args:
        seed: Integer seed handed to every generator; also exported as
            the PYTHONHASHSEED environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The individual generators are independent, so they can be seeded in one pass.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True

# Library

In [None]:
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import shutil
import string
import pickle
import random
import joblib
import itertools
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Utils

In [None]:
def get_score(y_true, y_pred):
    """Return the Pearson correlation between reference values and predictions.

    Args:
        y_true: Array-like of reference values.
        y_pred: Array-like of predicted values.

    Returns:
        The correlation statistic, i.e. the first element of the result of
        scipy.stats.pearsonr.
    """
    # Import the function explicitly: `import scipy as sp` on its own does not
    # guarantee that the `stats` submodule has been loaded.
    from scipy.stats import pearsonr
    return pearsonr(y_true, y_pred)[0]


def get_logger(filename=OUTPUT_DIR+'train'):
    """Build the module logger that writes to the console and to a log file.

    Args:
        filename: Log file path without extension; ".log" is appended.

    Returns:
        The logger named after this module, set to INFO, with exactly one
        stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger hands back the same object on every call, so re-running this
    # function would otherwise stack handlers and emit every message repeatedly.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

# Data Loading

In [None]:
test = pd.read_csv(INPUT_DIR+'test.csv')
submission = pd.read_csv(INPUT_DIR+'sample_submission.csv')

# Join each test row with the row of titles.csv whose `code` equals its `context`.
# NOTE(review): merge defaults to an inner join, so a context without a matching
# code would drop that test row — confirm every context is covered.
titles = pd.read_csv('../input/cpc-codes/titles.csv', dtype=str)
test = test.merge(titles, left_on='context', right_on='code')

# Assemble the text fed to the tokenizer. The literal labels and separators are
# part of the model input and are reproduced exactly as written.
if use_title and use_sep:
    test['input'] = (
        test.context + sep + test.title + sep + test.target + sep + test.anchor
    )
elif use_title:
    test['input'] = (
        'TEXT1: ' + test.context + ';TEXT2: ' + test.title
        + '; TEXT3: ' + test.target + '; ANC1: ' + test.anchor
    )
elif use_sep:
    test['input'] = test.context + sep + test.target + sep + test.anchor
else:
    test['input'] = (
        'TEXT1: ' + test.context + '; TEXT2: ' + test.target + '; ANC1: ' + test.anchor
    )

In [None]:
def tok_func(x):
    """Tokenize one text into fixed-length long tensors.

    Reads the tokenizer and maximum length currently stored on CFG_1, so the
    result depends on which model the ensemble loop has configured.

    Args:
        x: Text to encode.

    Returns:
        The tokenizer output with every field converted to a torch.long tensor.
    """
    inputs = CFG_1.tokenizer(x,
                  add_special_tokens=True,
                  max_length=CFG_1.max_len,
                  padding="max_length",
                  # Padding alone never shortens a text: without truncation an
                  # input longer than max_len yields a longer tensor than its
                  # neighbours and the batch can no longer be stacked.
                  truncation=True,
                  return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs

class TestDataset(Dataset):
    """Dataset over the `input` column of a dataframe, tokenized on access."""

    def __init__(self, df):
        # Only the prepared text column is retained.
        self.texts = df['input'].values

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the tokenized tensors for the text at position `item`."""
        text = self.texts[item]
        return tok_func(text)

In [None]:
###################Model################################
class CustomModel_Attn(nn.Module):
    """Transformer backbone with attention pooling and a linear head.

    Args:
        cfg: Settings object. `fc_dropout` and `target_size` are always read;
            the backbone name (`model_nm`, falling back to `model`) is read
            only when the backbone has to be resolved by name.
        config_path: Optional path to a saved backbone config loaded with
            torch.load. When None the config is fetched by backbone name.
        pretrained: If True, load pretrained backbone weights by name;
            otherwise build the backbone from the config alone.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(
                self._backbone_name(cfg), output_hidden_states=True)
        else:
            # NOTE: torch.load unpickles the file; only load trusted configs.
            self.config = torch.load(config_path)

        if pretrained:
            self.model = AutoModel.from_pretrained(
                self._backbone_name(cfg), config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)
        self._init_weights(self.fc)
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1))
        # _init_weights only recognises leaf layers; passing the Sequential
        # itself matched no branch and left its Linear layers untouched, so
        # visit every submodule instead.
        self.attention.apply(self._init_weights)

    @staticmethod
    def _backbone_name(cfg):
        """Return the backbone identifier stored on cfg.

        CFG_1 in this notebook exposes `model` rather than `model_nm`, so both
        attribute names are accepted; `model_nm` wins when present.
        """
        name = getattr(cfg, 'model_nm', None)
        return name if name is not None else cfg.model

    def _init_weights(self, module):
        """Re-initialise a Linear, Embedding or LayerNorm layer in place.

        Any other module type is left unchanged.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Return the attention-weighted sum of the backbone's first output.

        Args:
            inputs: Mapping of keyword arguments passed to the backbone.
        """
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        # Softmax runs over dim 1 and the weighted states are summed over dim 1
        # (assumes a (batch, seq, hidden) layout — TODO confirm).
        weights = self.attention(last_hidden_states)
        feature = torch.sum(weights * last_hidden_states, dim=1)
        return feature

    def forward(self, inputs):
        """Return the linear head output for the pooled feature.

        No activation is applied here.
        """
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

In [None]:
###################Model################################
class CustomModel_Cls(nn.Module):
    """Transformer backbone with a linear head on the position-0 hidden state.

    Args:
        cfg: Settings object. `fc_dropout` and `target_size` are always read;
            the backbone name (`model_nm`, falling back to `model`) is read
            only when the backbone has to be resolved by name.
        config_path: Optional path to a saved backbone config loaded with
            torch.load. When None the config is fetched by backbone name.
        pretrained: If True, load pretrained backbone weights by name;
            otherwise build the backbone from the config alone.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(
                self._backbone_name(cfg), output_hidden_states=True)
        else:
            # NOTE: torch.load unpickles the file; only load trusted configs.
            self.config = torch.load(config_path)

        if pretrained:
            self.model = AutoModel.from_pretrained(
                self._backbone_name(cfg), config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        self.dropout = nn.Dropout(cfg.fc_dropout)
        self.linear = nn.Linear(self.config.hidden_size, self.cfg.target_size)
        self._init_weights(self.linear)

    @staticmethod
    def _backbone_name(cfg):
        """Return the backbone identifier stored on cfg.

        CFG_1 in this notebook exposes `model` rather than `model_nm`, so both
        attribute names are accepted; `model_nm` wins when present.
        """
        name = getattr(cfg, 'model_nm', None)
        return name if name is not None else cfg.model

    def _init_weights(self, module):
        """Re-initialise a Linear, Embedding or LayerNorm layer in place.

        Any other module type is left unchanged.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, inputs):
        """Return the linear head output for the position-0 hidden state.

        Args:
            inputs: Mapping of keyword arguments passed to the backbone.
        """
        output = self.model(**inputs)
        # Index 0 along the second axis of `last_hidden_state` is used as the
        # sequence summary.
        cls_output = output['last_hidden_state'][:, 0, :]
        dropout_output = self.dropout(cls_output)
        linear_output = self.linear(dropout_output)
        return linear_output

In [None]:
class CustomModel_Ch(nn.Module):
    """Transformer backbone with masked mean pooling, LayerNorm and a linear head.

    `forward` uses only the backbone, `layer_norm1` and `fc`. The dropout,
    attention block, `linear` layer and `feature` method are defined but not
    used by `forward` (presumably retained so saved checkpoints still match
    the module's parameter names — verify before removing).

    Args:
        cfg: Settings object. `fc_dropout` and `target_size` are always read;
            the backbone name (`model_nm`, falling back to `model`) is read
            only when the backbone has to be resolved by name.
        config_path: Optional path to a saved backbone config loaded with
            torch.load. When None the config is fetched by backbone name.
        pretrained: If True, load pretrained backbone weights by name;
            otherwise build the backbone from the config alone.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(
                self._backbone_name(cfg), output_hidden_states=True)
        else:
            # NOTE: torch.load unpickles the file; only load trusted configs.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(
                self._backbone_name(cfg), config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)
        self._init_weights(self.fc)
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        self.layer_norm1 = nn.LayerNorm(self.config.hidden_size)
        # _init_weights only recognises leaf layers; passing the Sequential
        # itself matched no branch and left its Linear layers untouched, so
        # visit every submodule instead.
        self.attention.apply(self._init_weights)
        self.linear = nn.Linear(self.config.hidden_size, 1)

    @staticmethod
    def _backbone_name(cfg):
        """Return the backbone identifier stored on cfg.

        CFG_1 in this notebook exposes `model` rather than `model_nm`, so both
        attribute names are accepted; `model_nm` wins when present.
        """
        name = getattr(cfg, 'model_nm', None)
        return name if name is not None else cfg.model

    def _init_weights(self, module):
        """Re-initialise a Linear, Embedding or LayerNorm layer in place.

        Any other module type is left unchanged.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Return the attention-weighted sum of the backbone's first output.

        Not called by `forward`.
        """
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        weights = self.attention(last_hidden_states)
        feature = torch.sum(weights * last_hidden_states, dim=1)
        return feature

    def forward(self, inputs):
        """Return the head output for the mask-weighted mean of the hidden states.

        Args:
            inputs: Mapping of backbone keyword arguments; must contain
                "attention_mask".
        """
        outputs = self.model(**inputs)
        last_hidden_state = outputs[0]
        # Broadcast the mask to the hidden-state shape so positions with a zero
        # mask contribute nothing to the sum.
        input_mask_expanded = inputs["attention_mask"].unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        # The lower bound keeps the division defined when a row's mask is all zero.
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        out = sum_embeddings / sum_mask

        out = self.layer_norm1(out)
        output = self.fc(out)
        return output

# Inference

In [None]:
def inference_fn(test_loader, model, device):
    """Run the model over every batch and return sigmoid-activated outputs.

    Args:
        test_loader: Sized iterable of batches; each batch is a mapping of tensors.
        model: Module called with the batch mapping; switched to eval mode.
        device: Device the model and every batch tensor are moved to.

    Returns:
        NumPy array of the per-batch outputs concatenated along the first axis.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    progress = tqdm(test_loader, total=len(test_loader))
    with torch.no_grad():
        for batch in progress:
            # Move the tensors in place so the batch keeps its mapping type.
            for name, tensor in list(batch.items()):
                batch[name] = tensor.to(device)
            scores = torch.sigmoid(model(batch))
            batch_outputs.append(scores.cpu().numpy())
    return np.concatenate(batch_outputs)

In [None]:
# Shuffling is off and no batch is dropped, so predictions come back in the row
# order of `test`.
test_dataset = TestDataset(test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG_1.batch_size,
    shuffle=False,
    num_workers=CFG_1.num_workers,
    pin_memory=True,
    drop_last=False,
)

In [None]:
# Per-model settings keyed by experiment id. Each entry supplies the values
# copied onto CFG_1 before that model is loaded:
#   max_len / fc_dropout / seed - tokenizer length, head dropout, RNG seed
#   model_nm  - tag used to pick the model class
#   model_pth - directory holding the tokenizer, config.pth and fold weights
#   model     - backbone name; also used to build the weight file names
model_dict = {
    1.01: {
        'max_len': 126,
        'fc_dropout': 0.2,
        'seed': 777,
        'model_nm': "bfp_attn",
        'model_pth': "../input/usppm-101-bfp-attn/zip/",
        'model': '../input/bert-for-patents/bert-for-patents',
    },
    1.07: {
        'max_len': 126,
        'fc_dropout': 0.2,
        'seed': 123,
        'model_nm': "bfp_attn",
        'model_pth': "../input/usppm-107/zip/",
        'model': 'anferico/bert-for-patents',
    },
    2.01: {
        'max_len': 140,
        'fc_dropout': 0,
        'seed': 42,
        'model_nm': "deberta_cls",
        'model_pth': "../input/201-usppm-deberta-cls/",
        'model': 'microsoft/deberta-v3-large',
    },
    3.01: {
        'max_len': 140,
        'fc_dropout': 0.2,
        'seed': 42,
        'model_nm': "deberta_attn",
        'model_pth': "../input/usppm-301-deberta-attn/",
        'model': 'microsoft/deberta-v3-large',
    },
    5.01: {
        'max_len': 140,
        'fc_dropout': 0.2,
        'seed': 42,
        'model_nm': "deberta_ch",
        'model_pth': "../input/501-deberta-ch/",
        'model': 'microsoft/deberta-v3-large',
    },
    5.02: {
        'max_len': 140,
        'fc_dropout': 0.2,
        'seed': 42,
        'model_nm': "deberta_ch",
        'model_pth': "../input/502-deberta-ch/",
        'model': 'microsoft/deberta-v3-large',
    },
    6.09: {
        'max_len': 117,
        'fc_dropout': 0.15,
        'seed': 42,
        'model_nm': "electra_ch",
        'model_pth': "../input/usppm-609-electra/",
        'model': 'google/electra-large-discriminator',
    },
}

In [None]:
predictions_list = []

######
### Keep the model order identical to the OOF weighting list.
######

model_list = [2.01,1.01,6.09,5.02,1.07,3.01]
weight_list =[0.435, 0.205, 0.195, 0.390, 0.090]
# The first model starts the blend; every later model needs exactly one weight.
# Raised explicitly because assert statements are removed under `python -O`.
if len(model_list) != len(weight_list) + 1:
    raise ValueError(
        f"expected {len(model_list) - 1} blend weights for {len(model_list)} models, "
        f"got {len(weight_list)}"
    )

# Tags are tested in this order; the first one contained in model_nm wins.
model_classes = (
    ("attn", CustomModel_Attn),
    ("cls", CustomModel_Cls),
    ("ch", CustomModel_Ch),
)

for m in model_list:
    settings = model_dict[m]

    CFG_1.seed = settings['seed']
    seed_everything(seed=CFG_1.seed)

    # Point the shared config (read by tok_func and the model classes) at this model.
    model_nm = settings['model_nm']
    CFG_1.path = settings['model_pth']
    CFG_1.fc_dropout = settings['fc_dropout']
    CFG_1.max_len = settings['max_len']
    CFG_1.model = settings['model']
    CFG_1.tokenizer = AutoTokenizer.from_pretrained(CFG_1.path+'tokenizer/')
    CFG_1.config_path=CFG_1.path+'config.pth'

    # Resolve the class once per model and fail with a clear message when the
    # tag is unknown, instead of hitting an undefined `model` further down.
    model_class = next(
        (candidate for tag, candidate in model_classes if tag in model_nm), None)
    if model_class is None:
        raise ValueError(f"no model class registered for model_nm={model_nm!r}")

    model_predictions = []
    for fold in CFG_1.trn_fold:
        model = model_class(CFG_1, config_path=CFG_1.config_path, pretrained=False)
        # Load the fold weights on the CPU; inference_fn moves the model to `device`.
        state = torch.load(CFG_1.path+f"{CFG_1.model.replace('/', '-')}_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))
        model.load_state_dict(state['model'])
        prediction = inference_fn(test_loader, model, device)
        model_predictions.append(prediction)
        # Release the model before the next fold is built.
        del model, state, prediction; gc.collect()
        torch.cuda.empty_cache()
    # Average the folds of this model.
    model_predictions_mean = np.mean(model_predictions, axis=0)
    predictions_list.append(model_predictions_mean)

# Sequential blend: each weight mixes the next model into the running result.
md2 = predictions_list[0]
for weight, model_prediction in zip(weight_list, predictions_list[1:]):
    md2 = weight * model_prediction + (1 - weight) * md2

# Submission

In [None]:
# Store the blended predictions next to their ids and write the submission file.
submission_columns = ['id', 'score']
test['score'] = md2
test[submission_columns].to_csv('submission.csv', index=False)