In [4]:
from transformers import Trainer, TrainingArguments, AutoModel, AutoModelForSequenceClassification, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
import funcy as f
import random
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from datasets import load_dataset, load_metric
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import matplotlib.pyplot as plt
import copy
import torch.optim as optim
import random
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset, load_metric

import pytorch_lightning as pl
import util
from util import *

In [5]:
# Read the train and test splits. Both files are line-delimited JSON records
# (gzip-compressed, judging by the ".gz" suffix), so they share reader options.
training_set, testing_set = (
    pd.read_json(path, lines=True, orient="records")
    for path in ("training_set.json.gz", "testing_set.json.gz")
)

In [6]:
# Sanity check: display (rows, columns) of the loaded training frame.
training_set.shape

(972, 110)

In [7]:
# Average number of generated sentence pairs per source row. Both numbers are
# copied by hand from `.shape` outputs shown in this notebook: 17603 rows in
# `training_triplets`, 972 rows in `training_set`.
17603/972

18.11008230452675

In [3]:
# All tags from `all_tiers_100` except "PersonalizedProduct", in sorted order.
# `sorted` already returns a fresh list, so no extra `list(...)` is needed.
# NOTE(review): `all_tiers_100` is not defined in this notebook — presumably it
# arrives via `from util import *`; confirm.
subset = sorted(set(all_tiers_100).difference(["PersonalizedProduct"]))

In [4]:
import funcy as f
from random import shuffle

@f.collecting
def create_examples(row, num_neg=2):
    """Build ``(text_a, text_b, label)`` sentence-pair examples from one row.

    The first example pairs the row's abstract with its claims (label True).
    Then, separately for the abstract and for the claims, every tag in the
    module-level ``subset`` is visited in random order:

    * a truthy ``row[tag]`` yields ``(text, "Tagged as <name>.", True)``;
    * a falsy ``row[tag]`` yields the same pair with label False, but only for
      the first ``num_neg`` such tags per text (all of them when ``num_neg``
      is None).

    Args:
        row: Record exposing ``abstract`` and ``claims`` attributes and
            indexable by tag name (e.g. a DataFrame row).
        num_neg: Maximum number of negative tag examples per text, or None
            for no limit.

    Returns:
        list of tuples; ``f.collecting`` gathers the yielded items into a list.

    Note:
        ``tier_translations`` is read from the enclosing module.
        NOTE(review): presumably it maps tag names to display names and comes
        from ``util`` — confirm.
    """
    abstract = row.abstract
    claims = row.claims
    yield (abstract, claims, True)
    for text in (abstract, claims):
        # Shuffle a private copy: shuffling `subset` itself would reorder the
        # shared module-level list as a hidden side effect of every call.
        tags = list(subset)
        shuffle(tags)
        neg_count = 0
        for tag in tags:
            if row[tag]:
                # Positives are always emitted, regardless of `num_neg`.
                yield (text, f"Tagged as {tier_translations[tag]}.", True)
            elif num_neg is None or neg_count < num_neg:
                neg_count += 1
                yield (text, f"Tagged as {tier_translations[tag]}.", False)

In [5]:
# `create_examples` picks its negatives with `random.shuffle`; seed Python's
# RNG first so a fresh Restart & Run All rebuilds the same training pairs.
random.seed(42)

# One list of (sentence1, sentence2, label) tuples per source row, flattened
# to one tuple per row of the new frame, with exact duplicates removed.
training_triplets = pd.DataFrame(
    training_set.apply(f.partial(create_examples, num_neg=3), axis=1).explode().tolist()
).drop_duplicates()
training_triplets.columns = ["sentence1", "sentence2", "label"]
# reset_index() restores a 0..n-1 index and keeps the pre-deduplication index
# as an extra "index" column (hence 4 columns in the shape).
training_triplets = training_triplets.reset_index()
training_triplets.shape

(17603, 4)

In [6]:
# `create_examples` picks its negatives with `random.shuffle`; seed Python's
# RNG first so the evaluation pairs are identical on every fresh run and
# validation metrics stay comparable between runs.
random.seed(42)

# Same construction as the training pairs, but with at most 2 negatives per text.
testing_triplets = pd.DataFrame(
    testing_set.apply(f.partial(create_examples, num_neg=2), axis=1).explode().tolist()
).drop_duplicates()
testing_triplets.columns = ["sentence1", "sentence2", "label"]
# reset_index() restores a 0..n-1 index and keeps the pre-deduplication index
# as an extra "index" column (hence 4 columns in the shape).
testing_triplets = testing_triplets.reset_index()
testing_triplets.shape

(4105, 4)

In [7]:
# Checkpoint identifier used as the default `bert_model` of `CustomDataset`,
# where it is passed to `AutoTokenizer.from_pretrained`.
# NOTE(review): presumably a local directory holding BERT-for-Patents weights — confirm.
model_name = "bertForPatents"

In [8]:
# triplets = pd.read_parquet("triples.parquet")
# triplets.reset_index(inplace=True)

# test_triplets = pd.read_parquet("testing_triples.parquet")
# test_triplets.reset_index(inplace=True)

In [9]:
class CustomDataset(Dataset):
    """Sentence-pair dataset that tokenizes DataFrame rows on access.

    Args:
        data: pandas DataFrame with ``sentence1`` and ``sentence2`` columns,
            plus a ``label`` column when ``with_labels`` is True.
        maxlen: Length every encoded pair is padded or truncated to.
        with_labels: Whether ``__getitem__`` also returns the label.
        bert_model: Identifier passed to ``AutoTokenizer.from_pretrained``.
    """

    def __init__(self, data, maxlen=512, with_labels=True, bert_model=model_name):

        self.data = data  # pandas dataframe
        # Initialize the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model)

        self.maxlen = maxlen
        self.with_labels = with_labels

    def __len__(self):
        """Return the number of rows (sentence pairs) in the frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Encode the pair at position ``index``.

        Returns:
            ``(token_ids, attn_masks, token_type_ids, label)`` when
            ``with_labels`` is True, otherwise the first three items. The
            tensors have their leading batch dimension of size 1 squeezed away.
        """
        # Samplers hand out positions 0..len-1, so look rows up by position
        # (.iloc) rather than by index label; this also works for frames whose
        # index is not a clean RangeIndex (filtered, shuffled, deduplicated).
        sent1 = self.data.sentence1.iloc[index]
        sent2 = self.data.sentence2.iloc[index]

        # Tokenize the pair of sentences to get token ids, attention masks and token type ids
        encoded_pair = self.tokenizer(sent1, sent2,
                                      padding='max_length',  # Pad to max_length
                                      truncation=True,  # Truncate to max_length
                                      max_length=self.maxlen,
                                      return_tensors='pt')  # Return torch.Tensor objects

        token_ids = encoded_pair['input_ids'].squeeze(0)  # tensor of token ids
        attn_masks = encoded_pair['attention_mask'].squeeze(0)  # "0" for padded positions, "1" elsewhere
        token_type_ids = encoded_pair['token_type_ids'].squeeze(0)  # "0" for 1st-sentence tokens, "1" for 2nd-sentence tokens

        if self.with_labels:  # True if the dataset has labels
            # np.int64 instead of the `np.long` alias, which is deprecated in
            # NumPy 1.20 and missing from NumPy 1.24; the constructor also
            # accepts plain Python bools, unlike `.astype`.
            label = np.int64(self.data.label.iloc[index])
            return token_ids, attn_masks, token_type_ids, label
        else:
            return token_ids, attn_masks, token_type_ids

In [10]:
from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR

In [11]:
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds PyTorch (CPU and all CUDA devices), NumPy and Python's ``random``
    with ``seed``, switches cuDNN to deterministic / non-benchmark mode, and
    stores the seed in the ``PYTHONHASHSEED`` environment variable.
    """
    # PyTorch generators first (CPU, then every CUDA device).
    for torch_seeder in (torch.manual_seed, torch.cuda.manual_seed_all):
        torch_seeder(seed)

    # Trade cuDNN autotuning for run-to-run determinism.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # NumPy's legacy global generator, then Python's own.
    for seeder in (np.random.seed, random.seed):
        seeder(seed)

    os.environ['PYTHONHASHSEED'] = str(seed)
    

class SentencePairClassifier(pl.LightningModule):
    """Pretrained encoder plus a single-logit head for binary sentence-pair classification.

    The module owns its data: ``train_dataloader`` / ``val_dataloader`` wrap
    the given DataFrames in ``CustomDataset``. Training uses AdamW with a
    linear learning-rate warmup applied in ``optimizer_step``.

    Args:
        training_dataset: DataFrame of training pairs, passed to ``CustomDataset``.
        testing_dataset: DataFrame of validation pairs, passed to ``CustomDataset``.
        lr_warmup_steps: Number of optimizer steps over which the learning
            rate is ramped linearly up to ``learning_rate``.
        bert_model: Identifier passed to ``AutoModel.from_pretrained`` and to
            the datasets' tokenizer.
        learning_rate: Peak (post-warmup) learning rate.
        freeze_bert: If True, only the classification layer is trained.
        batch_size: Batch size of both dataloaders.
        seed: Seed passed to ``set_seed``; ``None`` (or ``False``) skips seeding.
        num_workers: Worker processes per dataloader.
    """

    def __init__(self,
                 training_dataset,
                 testing_dataset,
                 lr_warmup_steps=1000,
                 bert_model="bertForPatents",
                 learning_rate=3e-5,
                 freeze_bert=False,
                 batch_size=16,
                 seed=42,
                 num_workers=32):
        super(SentencePairClassifier, self).__init__()

        # Test against None/False explicitly: a plain truthiness check would
        # silently skip seeding for the perfectly valid seed 0.
        if seed is not None and seed is not False:
            set_seed(seed)

        self.lr_warmup_steps = lr_warmup_steps
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.num_workers = num_workers
        # Remembered so the dataloaders tokenize with the same checkpoint the
        # encoder was loaded from.
        self.bert_model_name = bert_model
        self.training_dataset = training_dataset
        self.testing_dataset = testing_dataset

        self.bert_layer = AutoModel.from_pretrained(bert_model, gradient_checkpointing=True)
        # Read the width from the loaded encoder instead of hard-coding 1024,
        # so checkpoints of other sizes get a correctly shaped head.
        hidden_size = self.bert_layer.config.hidden_size

        # Freeze bert layers and only train the classification layer weights
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        # Classification layer: one logit per sentence pair
        self.cls_layer = nn.Linear(hidden_size, 1)

        self.dropout = nn.Dropout(p=0.1)

        # NOTE(review): `pl.metrics` exists only in older Lightning releases;
        # confirm before upgrading the environment.
        self.train_accuracy = pl.metrics.Accuracy()
        self.val_accuracy = pl.metrics.Accuracy()

    def train_dataloader(self):
        """Return a shuffled loader over the training pairs."""
        dataset = CustomDataset(self.training_dataset, bert_model=self.bert_model_name)
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers)

    def val_dataloader(self):
        """Return an unshuffled loader over the validation pairs."""
        dataset = CustomDataset(self.testing_dataset, bert_model=self.bert_model_name)
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers)

    def optimizer_step(self, optimizer, *args, **kwargs):
        """Apply linear learning-rate warmup, then step and zero the optimizer.

        While ``global_step < lr_warmup_steps`` the learning rate of every
        parameter group is set to ``learning_rate * (global_step + 1) /
        lr_warmup_steps`` and logged as ``learning_rate``.

        NOTE(review): this override never invokes an ``optimizer_closure``;
        some Lightning releases run the training step inside that closure —
        confirm against the installed version before upgrading.
        """
        if self.trainer.global_step < self.lr_warmup_steps:
            lr_scale = min(1., float(self.trainer.global_step + 1) / float(self.lr_warmup_steps))
            lr = lr_scale * self.learning_rate
            self.log('learning_rate', lr, on_step=True, on_epoch=False)
            for pg in optimizer.param_groups:
                pg['lr'] = lr

        optimizer.step()
        optimizer.zero_grad()

    def configure_optimizers(self):
        """Return an AdamW optimizer over all parameters at ``learning_rate``."""
        return torch.optim.AdamW(params=self.parameters(), lr=self.learning_rate)

    def forward(self, input_ids, attn_masks, token_type_ids):
        """Return one raw (pre-sigmoid) logit per input pair.

        Assumes the encoder exposes a pooled output as its second result, as
        BERT models do.
        """
        outputs = self.bert_layer(input_ids=input_ids,
                                  attention_mask=attn_masks,
                                  token_type_ids=token_type_ids)
        # Index instead of tuple-unpacking: integer indexing works both for
        # plain tuples and for `ModelOutput` objects, whereas unpacking a
        # `ModelOutput` iterates over its keys.
        pooler_output = outputs[1]
        logits = self.cls_layer(self.dropout(pooler_output))
        return logits

    def training_step(self, batch, batch_idx):
        """Compute and log the loss and batch accuracy; return the loss."""
        seq, attn_masks, token_type_ids, labels = batch
        logits = self.forward(seq, attn_masks, token_type_ids)
        loss = self.criterion(logits.squeeze(-1), labels.float())
        self.log('train_loss', loss, on_epoch=False, on_step=True, prog_bar=True)
        # torch.sigmoid replaces the deprecated F.sigmoid.
        probs = torch.sigmoid(logits).squeeze(-1)
        self.log('train_acc_step', self.train_accuracy(probs, labels), on_step=True, on_epoch=False, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the validation loss and accuracy, aggregated per epoch."""
        seq, attn_masks, token_type_ids, labels = batch
        logits = self.forward(seq, attn_masks, token_type_ids)
        loss = self.criterion(logits.squeeze(-1), labels.float())
        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        probs = torch.sigmoid(logits).squeeze(-1)
        self.log('val_accuracy', self.val_accuracy(probs, labels), on_step=False, on_epoch=True, prog_bar=True)

    def criterion(self, y_pred, y_true):
        """Binary cross-entropy on raw logits (mean over the batch)."""
        # Functional form: same loss, without building a module on every call.
        return F.binary_cross_entropy_with_logits(y_pred, y_true)
        
    
        

In [12]:
# Keep only the single best checkpoint, ranked by validation loss.
callbacks = [
    pl.callbacks.ModelCheckpoint(save_top_k=1, monitor='val_loss'),
    # Early stopping is currently disabled:
    # pl.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=3, )
]

model = SentencePairClassifier(
    training_dataset=training_triplets,
    testing_dataset=testing_triplets,
)

trainer = pl.Trainer(
    # hardware / numerics
    gpus=1,
    precision=32,
    # schedule: 5 epochs, gradients accumulated over 4 batches per optimizer
    # step, validation run every 10% of a training epoch
    max_epochs=5,
    accumulate_grad_batches=4,
    val_check_interval=0.10,
    # logging
    log_every_n_steps=1,
    flush_logs_every_n_steps=10,
    callbacks=callbacks,
    # auto_scale_batch_size=True,
)
# trainer.tune(model)
trainer.fit(model)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type      | Params
---------------------------------------------
0 | bert_layer     | BertModel | 344 M 
1 | cls_layer      | Linear    | 1.0 K 
2 | dropout        | Dropout   | 0     
3 | train_accuracy | Accuracy  | 0     
4 | val_accuracy   | Accuracy  | 0     


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…






1