<a href="https://www.kaggle.com/code/prokaggler/uspppm-inference?scriptVersionId=95325606" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
%config IPCompleter.use_jedi=False

In [2]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
import random
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoConfig, AutoTokenizer, get_linear_schedule_with_warmup,TrainingArguments, Trainer
from datasets import load_metric
from transformers import BertModel
import torch.nn as nn
from torch.optim import Adam
from tqdm import tqdm

In [3]:
class Config:
    # data
    train_csv = '../input/us-patent-phrase-to-phrase-matching/train.csv'
    test_cssv = '../input/us-patent-phrase-to-phrase-matching/test.csv'
    sub_csv = '../input/us-patent-phrase-to-phrase-matching/sample_submission.csv'
    
    # model
    model = 'anferico/bert-for-patents'
    
    max_len = 32
    num_epoch = 2
    batch_size = 64
    epochs = 7
    lr = 1e-6
    
    train = False

In [4]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

In [5]:
if torch.cuda.is_available():     
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
print( 'device set to =>', device)

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB
device set to => cuda


In [6]:
train_df = pd.read_csv( Config.train_csv )
test_df = pd.read_csv( Config.test_cssv )
sub_df = pd.read_csv( Config.sub_csv )

In [7]:
train_df.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


In [8]:
score_map = dict(zip( range(5), ['0.00', '0.25', '0.50', '0.75', '1.00']))
inverse_score_map = dict(zip( [0.00, 0.25, 0.50, 0.75, 1.00],range(5) ))

In [9]:
train = pd.DataFrame()
train['text_input'] = train_df['anchor']+ '[sep]' + train_df['target'] + '[sep]' + train_df['context']
train['label'] = train_df['score'].map( inverse_score_map)
train.head()

Unnamed: 0,text_input,label
0,abatement[sep]abatement of pollution[sep]A47,2
1,abatement[sep]act of abating[sep]A47,3
2,abatement[sep]active catalyst[sep]A47,1
3,abatement[sep]eliminating process[sep]A47,2
4,abatement[sep]forest region[sep]A47,0


In [10]:
class PatentTrainDataset(Dataset):
    def __init__( self, text_input, labels, tokenizer):
        self.text_input = text_input
        self.labels = labels
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len( self.text_input)
        
    def __getitem__(self, idx):
        
        text_data = self.tokenizer.encode_plus(
            self.text_input[ idx ],
            add_special_tokens = True,
            pad_to_max_length = True,
            return_attention_mask = True,
            max_length = Config.max_len,
        )
        input_ids =text_data[ 'input_ids' ]
        masks = text_data['attention_mask']
        labels = self.labels[ idx ]
        return {
            'input_ids': torch.tensor( input_ids, dtype=torch.long),
            'labels': torch.tensor( labels, dtype=torch.long),
            'masks': torch.tensor( masks, dtype=torch.long )
        }

In [11]:
if Config.train:
    tokenizer = AutoTokenizer.from_pretrained( Config.model)   

In [12]:
# torch.save({
#             'tokeniker': tokenizer,
           
#             }, './tokenizer.pt')

In [13]:
train_samples = int(train.shape[0] * 0.9)
train_data = train.iloc[:train_samples,:]
val_data = train.iloc[train_samples:, :]
print( 'train samples =>', len( train_data))
print( 'validation samples =>', len(val_data))

train samples => 32825
validation samples => 3648


In [14]:
class PatentModel(nn.Module):
    def __init__( self, dropout):
        super(PatentModel, self).__init__()
        self.bert = BertModel.from_pretrained( Config.model)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(1024,5)
        self.relu = nn.ReLU()
        
    def forward( self, input_id, mask):
        _, pooled_data = self.bert(  input_ids= input_id, attention_mask=mask,return_dict=False )
        dropuout_output = self.dropout( pooled_data )
        linear_output = self.linear( dropuout_output)
        final_layer = self.relu( linear_output )
        return final_layer

In [15]:
class PhraseModel(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.model_name = model_name

        config = AutoConfig.from_pretrained(model_name)
        config.update(
            {
                "output_hidden_states": True,
                "add_pooling_layer": True,
                "num_labels": 1,
            }
        )
        self.transformer = AutoModel.from_pretrained(model_name, config=config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.output = nn.Linear(config.hidden_size, 1)

    def forward(self, ids, mask, token_type_ids):
        transformer_out = self.transformer(ids, mask, token_type_ids)
        output = transformer_out.pooler_output
        output = self.dropout(output)
        output = self.output(output)
        return output, 0, {}

In [16]:
val_data.reset_index(inplace=True)
val_data.drop( ['index'], axis=1,inplace=True) #.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [17]:
def train( model, train, val, lr=1e-6, epochs=3):
    
    train_patent_ds= PatentTrainDataset(
                        text_input= train.text_input,
                        labels = train.label,
                        tokenizer = tokenizer
    )

    val_patent_ds = PatentTrainDataset(
                        text_input= val.text_input,
                        labels = val.label,
                        tokenizer = tokenizer
    )
    train_dl = DataLoader( train_patent_ds, batch_size=Config.batch_size, shuffle=True)
    val_dl = DataLoader( val_patent_ds, batch_size=Config.batch_size, )
    
    
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam( model.parameters(), lr = lr)
    if torch.cuda.is_available():
        model = model.cuda()
        criterion = criterion.cuda()
        
    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0
        for item in tqdm(train_dl):
            train_label = item['labels'].to(device)
            mask = item['masks'].to(device)
            input_id = item['input_ids'].squeeze(1).to(device)
            output = model(input_id, mask)
            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()
                
            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

        with torch.no_grad():

            for item in val_dl:

                val_label = item['labels'].to(device)
                mask = item['masks'].to(device)
                input_id = item['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()
                    
                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc
        save_path =  'bert.pt'  #f'bert_{epoch_num}.pt'
        torch.save({
            'epoch': epoch_num,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
#             'loss': LOSS,
            }, save_path)    
        print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
        print( 'model saved to =>', save_path)

In [18]:

if Config.train:
    model = PatentModel(dropout=0.5)
    train(model, train_data, val_data, Config.lr, Config.epochs)


In [19]:
# !zip usppmbert_v1.zip bert.pt

In [20]:
# torch.save(
# {
#     'model' : model
# }, 'USPPPM_v7.pt')

In [21]:
model_dict = torch.load( '../input/uppsm-v1/USPPPM_v7.pt')
model = model_dict['model']
tokenizer_dict = torch.load('../input/uppsm-v1/tokenizer.pt')
tokenizer = tokenizer_dict['tokeniker']

In [22]:
# saved_model_path = '../input/uppsm-v1/usppmbert_v1/bert.pt'
# model_dict = torch.load(saved_model_path)
# tokenizer_dict = torch.load('../input/uppsm-v1/tokenizer.pt')
# tokenizer = tokenizer_dict['tokeniker']

# model = PatentModel(dropout=0.5)
# model.load_state_dict(model_dict['model_state_dict']) 

In [23]:
class PatentTestDataset( Dataset ):
    def __init__( self, anchor, target, context, tokenizer, max_len ):
        self.anchor = anchor
        self.target = target
        self.context = context
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__( self ):
        return len( self.anchor )
    
    def __getitem__( self, idx ):
        anchor = self.anchor[idx ]
        target = self.target[ idx ]
        context = self.context[ idx ]
        
        encoded_data = self.tokenizer.encode_plus(
                    context+ ' ' + anchor,
                    target,
                    padding = 'max_length',
                    max_length = self.max_len,
                    truncation=True,
                    return_attention_mask = True,
        )
        
        input_ids = encoded_data["input_ids"]
        attention_mask = encoded_data["attention_mask"]
        token_type_ids = encoded_data["token_type_ids"]
        
        return {
            'input_ids' : torch.tensor( input_ids, dtype= torch.long),
            'attention_mask': torch.tensor( attention_mask, dtype=torch.long),
            'token_type': torch.tensor( token_type_ids, dtype=torch.long),
        }

In [24]:
def predict(model, test_data):

    test = PatentTestDataset( 
                    anchor = test_data.anchor.values,
                    target = test_data.target.values,
                    context = test_data.context.values,
                    tokenizer = tokenizer,
                    max_len = Config.max_len
    )

    test_dataloader = DataLoader(test, batch_size=32)

    if torch.cuda.is_available():
        model = model.cuda()
    score_list = []
    with torch.no_grad():

        for item in test_dataloader:

            mask = item['attention_mask'].to(device)
            input_id = item['input_ids'].squeeze(1).to(device)
            output = model(input_id, mask)
            preds = output.argmax(dim=1)
            score_list.append(  preds )

    print( 'test generated =>' , len( score_list))
    return score_list



In [25]:
scores = predict(model, test_df)

test generated => 2


In [26]:
scorelist =[]
for item in scores:
    scorelist.append(item.cpu().numpy())
preds = np.hstack(scorelist)
sub_df['preds'] =preds

In [27]:
sub_df.head()

Unnamed: 0,id,score,preds
0,4112d61851461f60,0,2
1,09e418c93a776564,0,2
2,36baf228038e314b,0,2
3,1f37ead645e7f0c8,0,2
4,71a5b6ad068d531f,0,1


In [28]:
sub_df['score']= sub_df.preds.astype(float).map(score_map)

In [29]:
sub_df[['id', 'score']].to_csv('submission.csv', index=False)