# U.S. Patent Phrase to Phrase Matching
This is a notebook for the [U.S. Patent Phrase to Phrase Matching competition](https://www.kaggle.com/competitions/us-patent-phrase-to-phrase-matching) based on:
* [Jeremy Howard's notebook](https://www.kaggle.com/code/jhoward/iterate-like-a-grandmaster)
* [Nayak Roshan's notebook](https://www.kaggle.com/code/nayakroshan/uspppm-pytorch-lightning)
* [Y. Nakama's notebook](https://www.kaggle.com/code/yasufuminakama/pppm-deberta-v3-large-baseline-w-w-b-train/notebook)
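
This notebook covers the inference step only: it loads two fine-tuned models (BERT for Patents and DeBERTa-v3-large), predicts on the test set with each, and blends the two sets of predictions into a submission. The training step is [here](https://www.kaggle.com/code/edmundtang/uspppm-01-training/edit/run/97244433).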

## Libraries

In [1]:
import gc
import re
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
from scipy import stats
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm

import torch
print(f"torch.__version__: {torch.__version__}")
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForSequenceClassification, BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

torch.__version__: 1.11.0
tokenizers.__version__: 0.12.1
transformers.__version__: 4.18.0
env: TOKENIZERS_PARALLELISM=true


## Configuration (CFG)

In [2]:
class CFG1:
    """Inference settings for the BERT-for-Patents checkpoint."""

    # Selects the 'bert' branch of CustomModel.
    model_type = 'bert'
    model_name = 'anferico/bert-for-patents'

    # Dataset directory that holds this model's config, tokenizer and weights.
    path = '../input/patentbert01/'
    config_path = f'{path}config.pth'

    # DataLoader settings.
    batch_size, num_workers = 16, 4

    # Length every tokenised input is padded to.
    max_len = 150

    # Not read by CustomModel's 'bert' branch, which hard-codes its own
    # dropout rate and output width.
    fc_dropout = 0.20
    target_size = 1
    
class CFG2:
    """Inference settings for the DeBERTa-v3-large checkpoint."""

    # Selects the 'deberta' branch of CustomModel.
    model_type = 'deberta'
    model_name = 'microsoft/deberta-v3-large'

    # Dataset directory that holds this model's config, tokenizer and weights.
    path = '../input/debertav3large02/'
    config_path = f'{path}config.pth'

    # DataLoader settings.
    batch_size, num_workers = 16, 4

    # Length every tokenised input is padded to.
    max_len = 150

    # Dropout rate and output width of the regression head built by
    # CustomModel's 'deberta' branch.
    fc_dropout = 0.20
    target_size = 1

In [3]:
import os

iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if iskaggle:
    !pip install -q datasets
import datasets
from datasets import load_dataset, Dataset, DatasetDict
INPUT_DIR = '../input/us-patent-phrase-to-phrase-matching/'
OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

[0m

In [4]:
def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) with the same value.

    Also exports the seed as ``PYTHONHASHSEED`` and switches cuDNN to its
    deterministic mode.

    Args:
        seed: integer seed handed to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything()

## Data Loading

In [5]:
# Read the competition's test rows and the submission template that will
# later receive the predicted scores.
test_df, submission = (
    pd.read_csv(INPUT_DIR + csv_name)
    for csv_name in ('test.csv', 'sample_submission.csv')
)
print(f"test.shape: {test_df.shape}")
print(f"submission.shape: {submission.shape}")
display(test_df.head())
display(submission.head())

test.shape: (36, 4)
submission.shape: (36, 2)


Unnamed: 0,id,anchor,target,context
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02
1,09e418c93a776564,adjust gas flow,altering gas flow,F23
2,36baf228038e314b,lower trunnion,lower locating,B60
3,1f37ead645e7f0c8,cap component,upper portion,D06
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04


Unnamed: 0,id,score
0,4112d61851461f60,0
1,09e418c93a776564,0
2,36baf228038e314b,0
3,1f37ead645e7f0c8,0
4,71a5b6ad068d531f,0


In [6]:
def get_cpc_texts(exact_prefix=False):
    """Map each CPC context code (e.g. ``'G02'``) to "<section title>. <class title>".

    Context codes are collected from the file names found in
    ``../input/cpc-data/CPCSchemeXML202105``. Titles are read from the
    per-section text files in ``../input/cpc-data/CPCTitleList202202``, where a
    title line is matched as the code, two tab characters, then the title.

    Args:
        exact_prefix: how the leading code and tabs are removed from a matched
            title line.

            * ``False`` (default) keeps the historical ``line.lstrip(pattern)``
              call. ``str.lstrip`` treats its argument as a *set of characters*,
              so any leading title characters that also occur in the code are
              removed as well (section ``C``'s "CHEMISTRY; METALLURGY" comes
              out as "HEMISTRY; METALLURGY").
            * ``True`` removes exactly the code and the tab separator.

            NOTE(review): the default is left unchanged because the checkpoints
            used by this notebook were presumably fine-tuned on text produced
            by the historical behaviour - confirm against the training
            notebook before switching it on.

    Returns:
        dict mapping context code -> "<section title>. <class title>".

    Raises:
        IndexError: if a section file has no line for the section letter or
            for one of the collected context codes.
    """
    def strip_code(line, pattern, code):
        # Remove the "<code><tab><tab>" prefix from a matched title line.
        if exact_prefix:
            return line[len(code):].lstrip('\t')
        return line.lstrip(pattern)

    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    code_pattern = r'[A-Z]\d+'
    # Set comprehension flattens and de-duplicates in one pass (the previous
    # sum(list_of_lists, []) re-copied the accumulator on every step).
    contexts = sorted({
        code
        for file_name in os.listdir('../input/cpc-data/CPCSchemeXML202105')
        for code in re.findall(code_pattern, file_name)
    })

    results = {}
    for cpc in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'Y']:
        with open(f'../input/cpc-data/CPCTitleList202202/cpc-section-{cpc}_20220201.txt') as f:
            s = f.read()
        pattern = f'{cpc}\t\t.+'
        cpc_result = strip_code(re.findall(pattern, s)[0], pattern, cpc)
        for context in (c for c in contexts if c[0] == cpc):
            pattern = f'{context}\t\t.+'
            class_title = strip_code(re.findall(pattern, s)[0], pattern, context)
            results[context] = cpc_result + ". " + class_title
    return results

# Build the code -> title lookup, keep a copy in the output directory, and
# add the title text that corresponds to each row's `context` code.
cpc_texts = get_cpc_texts()
torch.save(cpc_texts, f"{OUTPUT_DIR}cpc_texts.pth")
context_codes = test_df['context']
test_df['context_text'] = context_codes.map(cpc_texts)
display(test_df.head())

Unnamed: 0,id,anchor,target,context,context_text
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,PHYSICS. OPTICS
1,09e418c93a776564,adjust gas flow,altering gas flow,F23,MECHANICAL ENGINEERING; LIGHTING; HEATING; WEA...
2,36baf228038e314b,lower trunnion,lower locating,B60,PERFORMING OPERATIONS; TRANSPORTING. VEHICLES ...
3,1f37ead645e7f0c8,cap component,upper portion,D06,TEXTILES; PAPER. TREATMENT OF TEXTILES OR THE ...
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04,ELECTRICITY. ELECTRIC COMMUNICATION TECHNIQUE


In [7]:
# Load the tokenizer stored with each checkpoint, save a copy to the output
# directory, and attach it to the matching config so prepare_input() can
# reach it through `cfg.tokenizer`.
tokz1 = AutoTokenizer.from_pretrained(CFG1.path + 'tokenizer')
tokz1.save_pretrained(OUTPUT_DIR+'tokenizer1/')
CFG1.tokenizer = tokz1

tokz2 = AutoTokenizer.from_pretrained(CFG2.path + 'tokenizer')
tokz2.save_pretrained(OUTPUT_DIR+'tokenizer2/')
CFG2.tokenizer = tokz2

# A single `sep` string is used to build the text that is fed to BOTH models,
# so the two tokenizers must agree on their separator token. Fail loudly here
# rather than silently feeding one model the other's separator.
if tokz1.sep_token != tokz2.sep_token:
    raise ValueError(
        f"Tokenizers disagree on the separator token: "
        f"{tokz1.sep_token!r} vs {tokz2.sep_token!r}"
    )
sep = tokz1.sep_token


In [8]:
# Text scored by the models: anchor, target and CPC title text joined by the
# tokenizer's separator token.
input_parts = (test_df['anchor'], test_df['target'], test_df['context_text'])
test_df['inputs'] = input_parts[0] + sep + input_parts[1] + sep + input_parts[2]
display(test_df.head())

Unnamed: 0,id,anchor,target,context,context_text,inputs
0,4112d61851461f60,opc drum,inorganic photoconductor drum,G02,PHYSICS. OPTICS,opc drum[SEP]inorganic photoconductor drum[SEP...
1,09e418c93a776564,adjust gas flow,altering gas flow,F23,MECHANICAL ENGINEERING; LIGHTING; HEATING; WEA...,adjust gas flow[SEP]altering gas flow[SEP]MECH...
2,36baf228038e314b,lower trunnion,lower locating,B60,PERFORMING OPERATIONS; TRANSPORTING. VEHICLES ...,lower trunnion[SEP]lower locating[SEP]PERFORMI...
3,1f37ead645e7f0c8,cap component,upper portion,D06,TEXTILES; PAPER. TREATMENT OF TEXTILES OR THE ...,cap component[SEP]upper portion[SEP]TEXTILES; ...
4,71a5b6ad068d531f,neural stimulation,artificial neural network,H04,ELECTRICITY. ELECTRIC COMMUNICATION TECHNIQUE,neural stimulation[SEP]artificial neural netwo...


## Build Dataset

In [9]:
def prepare_input(cfg, text):
    """Tokenise one text into fixed-length ``torch.long`` tensors.

    Args:
        cfg: object exposing ``tokenizer`` (called with the text and the
            keyword arguments below) and ``max_len``.
        text: the string to encode.

    Returns:
        The mapping returned by the tokenizer, with every value converted to a
        ``torch.long`` tensor.
    """
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           # Padding alone only guarantees a MINIMUM length.
                           # Without truncation a text longer than max_len
                           # yields a longer tensor, which cannot be stacked
                           # with the other items of its batch.
                           truncation=True,
                           return_offsets_mapping=False)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs

class TestDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenises one test text per item.

    The base class is spelled out in full on purpose: an earlier cell runs
    ``from datasets import ... Dataset``, which rebinds the bare name
    ``Dataset`` away from the PyTorch class. ``__init__`` below never calls a
    parent constructor, so inheriting from ``datasets.Dataset`` would leave
    that base class uninitialised.

    Args:
        cfg: config object forwarded to ``prepare_input`` (it reads
            ``cfg.tokenizer`` and ``cfg.max_len``).
        ds: any object whose ``ds['inputs']`` is an indexable, sized sequence
            of strings.
    """

    def __init__(self, cfg, ds):
        self.cfg = cfg
        self.inputs = ds['inputs']

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.inputs)

    def __getitem__(self, item):
        """Return the tokenised tensors for the text at position ``item``."""
        inputs = prepare_input(self.cfg, self.inputs[item])
        return inputs

In [10]:
def _make_test_loader(cfg, dataset):
    """Wrap ``dataset`` in an in-order DataLoader that keeps every row."""
    return DataLoader(dataset,
                      batch_size=cfg.batch_size, shuffle=False,
                      num_workers=cfg.num_workers, pin_memory=True, drop_last=False)


# Both models read the same rows; each gets its own dataset so the texts are
# tokenised with that model's tokenizer.
test_ds = Dataset.from_pandas(test_df)

test_dataset1 = TestDataset(CFG1, test_ds)
test_loader1 = _make_test_loader(CFG1, test_dataset1)

test_dataset2 = TestDataset(CFG2, test_ds)
test_loader2 = _make_test_loader(CFG2, test_dataset2)

## Load Model

In [11]:
class CustomModel(nn.Module):
    """Transformer backbone followed by a single-output regression head.

    Two layouts are supported, chosen by ``cfg.model_type``:

    * ``'deberta'``: learned attention pooling over the last hidden states,
      then dropout (``cfg.fc_dropout``) and a linear layer with
      ``cfg.target_size`` outputs.
    * ``'bert'``: unweighted average of the last hidden states over the
      sequence axis, then dropout (fixed at 0.5) and a linear layer with one
      output.

    Attribute names (``model``, ``fc``, ``attention``, ``head``, ...) are part
    of the checkpoint format read by ``load_state_dict`` and must not change.

    Args:
        cfg: config object providing ``model_type`` and ``model_name``; the
            'deberta' layout also reads ``fc_dropout`` and ``target_size``.
        config_path: file holding a saved backbone config. When ``None`` the
            config is fetched by ``cfg.model_name`` instead.
        pretrained: if true, load pretrained backbone weights by
            ``cfg.model_name``; otherwise build the backbone from the config
            only (weights are expected to be loaded afterwards).

    Raises:
        ValueError: if ``cfg.model_type`` is not one of the supported layouts.
    """

    _SUPPORTED_MODEL_TYPES = ('deberta', 'bert')

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        # Reject unknown layouts up front; previously such a model was built
        # without any head and only failed later inside forward().
        if cfg.model_type not in self._SUPPORTED_MODEL_TYPES:
            raise ValueError(
                f"Unsupported model_type {cfg.model_type!r}; "
                f"expected one of {self._SUPPORTED_MODEL_TYPES}"
            )

        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model_name, output_hidden_states=True)
        else:
            # NOTE(review): torch.load unpickles the file, which can execute
            # arbitrary code - only point this at trusted files.
            self.config = torch.load(config_path)

        # The backbone is built the same way for both layouts.
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model_name, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)

        if cfg.model_type == 'deberta':
            self.fc_dropout = nn.Dropout(cfg.fc_dropout)
            self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)
            self._init_weights(self.fc)
            # Produces one weight per position, normalised over the sequence
            # axis (dim=1).
            self.attention = nn.Sequential(
                nn.Linear(self.config.hidden_size, 512),
                nn.Tanh(),
                nn.Linear(512, 1),
                nn.Softmax(dim=1)
            )
            # NOTE(review): _init_weights only acts on Linear / Embedding /
            # LayerNorm instances, so this call on an nn.Sequential changes
            # nothing. Left untouched so initialisation is unchanged;
            # `self.attention.apply(self._init_weights)` would be needed to
            # reach the inner layers.
            self._init_weights(self.attention)
        else:  # 'bert'
            self.head = nn.Linear(self.config.hidden_size, 1, bias=True)
            self.dropout = nn.Dropout(0.5)

    def _init_weights(self, module):
        """Initialise ``module`` in place if it is a Linear, Embedding or LayerNorm."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, inputs):
        """Return the head's raw (pre-sigmoid) output for a batch.

        Args:
            inputs: mapping passed to the backbone as keyword arguments.

        Returns:
            Tensor produced by the final linear layer.

        Raises:
            ValueError: if ``cfg.model_type`` is not a supported layout.
        """
        if self.cfg.model_type == 'deberta':
            outputs = self.model(**inputs)
            last_hidden_states = outputs[0]
            # Attention pooling: weighted sum of hidden states over dim 1.
            weights = self.attention(last_hidden_states)
            feats = torch.sum(weights * last_hidden_states, dim=1)
            output = self.fc(self.fc_dropout(feats))
        elif self.cfg.model_type == 'bert':
            feats = self.model(**inputs)
            # Plain average over dim 1; every position counts equally (no
            # attention-mask weighting is applied here).
            feats = torch.sum(feats[0], 1)/feats[0].shape[1]
            feats = self.dropout(feats)
            output = self.head(feats)
        else:
            raise ValueError(f"Unsupported model_type {self.cfg.model_type!r}")
        return output

In [12]:
def inference_fn(test_loader, model, device):
    """Score every batch of ``test_loader`` with ``model``.

    The model is switched to eval mode and moved to ``device``. Each batch (a
    mapping of tensors) has its values moved to ``device`` in place, is passed
    to the model without gradient tracking, and the model output is squashed
    with a sigmoid.

    Args:
        test_loader: sized iterable yielding mappings of tensors.
        model: module called as ``model(batch)``.
        device: device to run on.

    Returns:
        NumPy array with the per-batch sigmoid outputs concatenated along the
        first axis.
    """
    model.eval()
    model.to(device)
    batch_scores = []
    with torch.no_grad():
        for inputs in tqdm(test_loader, total=len(test_loader)):
            for key in list(inputs):
                inputs[key] = inputs[key].to(device)
            logits = model(inputs)
            batch_scores.append(logits.sigmoid().to('cpu').numpy())
    return np.concatenate(batch_scores)

## Make Predictions

In [13]:
# Run each fine-tuned model over its own test loader. The same steps were
# previously written out twice; looping keeps the two runs identical and also
# releases the second model, which used to stay in memory afterwards.
predictions = []

for cfg, loader in ((CFG1, test_loader1), (CFG2, test_loader2)):
    model = CustomModel(cfg, config_path=cfg.config_path, pretrained=False)
    # NOTE(review): torch.load unpickles the file, which can execute arbitrary
    # code - only load checkpoints from trusted sources.
    state = torch.load(cfg.path + f"{cfg.model_name.replace('/', '-')}_best.pth",
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    predictions.append(inference_fn(loader, model, device))

    # Drop the references first, then collect, then hand cached GPU blocks
    # back, so the next model starts with as much free memory as possible.
    del model, state
    gc.collect()
    torch.cuda.empty_cache()

# One column per model (0 = CFG1, 1 = CFG2), one row per test example; each
# cell holds that model's length-1 prediction array for the row.
pred_list = [list(model_preds) for model_preds in predictions]
pred_df = pd.DataFrame(pred_list).transpose()

  0%|          | 0/3 [00:00<?, ?it/s]

Exception ignored in: <function _ConnectionBase.__del__ at 0x7ff0362b04d0>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 132, in __del__
    self._close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor


  0%|          | 0/3 [00:00<?, ?it/s]

In [14]:
# Weight given to column 0 of pred_df; column 1 receives the remainder.
wt = 0.4

# Blend the two models row by row, then snap each blended score to the
# nearest multiple of 0.25 (multiply by 4, round, divide by 4).
preds_ensemble = []
for first_pred, second_pred in zip(pred_df[0], pred_df[1]):
    blended = first_pred[0]*wt + second_pred[0]*(1-wt)
    preds_ensemble.append(np.round(4 * blended)/4)

display(preds_ensemble)

[0.5,
 0.75,
 0.5,
 0.25,
 0.0,
 0.5,
 0.5,
 0.0,
 0.25,
 1.0,
 0.25,
 0.25,
 0.75,
 0.75,
 0.75,
 0.25,
 0.25,
 0.0,
 0.5,
 0.25,
 0.25,
 0.25,
 0.25,
 0.25,
 0.5,
 0.0,
 0.0,
 0.0,
 0.0,
 0.5,
 0.25,
 0.0,
 0.75,
 0.5,
 0.25,
 0.25]

In [15]:
# Put the blended scores into the submission template and write the two
# columns of the submission file.
submission['score'] = preds_ensemble
display(submission.head())
submission_columns = ['id', 'score']
submission_out = submission[submission_columns]
submission_out.to_csv('submission.csv', index=False)

Unnamed: 0,id,score
0,4112d61851461f60,0.5
1,09e418c93a776564,0.75
2,36baf228038e314b,0.5
3,1f37ead645e7f0c8,0.25
4,71a5b6ad068d531f,0.0
