In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
from torch import nn
from sklearn.feature_extraction.text import CountVectorizer
from transformers import BertTokenizer
from transformers import BertModel
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import f1_score
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import torch.nn.functional as F
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
from util import *
import torch.nn.functional as F
from datetime import datetime
import pprint
import os
import torch
from torch import nn
import torch.nn.functional as F
import pytorch_lightning as pl

In [2]:
# Root logger: INFO level, every record prefixed with a second-resolution timestamp.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

In [3]:
# Both splits are stored as gzip-compressed JSON-lines files (one record per line).
_JSONL_READ_OPTIONS = {"lines": True, "orient": "records"}

training_set = pd.read_json("training_set.json.gz", **_JSONL_READ_OPTIONS)
testing_set = pd.read_json("testing_set.json.gz", **_JSONL_READ_OPTIONS)

In [4]:
# Alphabetically ordered label names: everything in `all_tiers_100`
# (star-imported from `util`) except "PersonalizedProduct".
subset = sorted(set(all_tiers_100) - {"PersonalizedProduct"})

In [5]:
# Directory holding the pretrained checkpoint. The original notebook hard-coded
# one developer's home directory; that path is kept as the default so existing
# runs are unaffected, but it can now be overridden without editing the
# notebook by exporting BERT_FOR_PATENTS_DIR.
model_name = os.environ.get(
    "BERT_FOR_PATENTS_DIR",
    "/home/martin/IdeaProjects/phenetics/bertForPatents/",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): `base_model` is not referenced by any later cell visible here
# (BasicModel loads its own copy of the encoder) — confirm before removing.
base_model = AutoModel.from_pretrained(model_name, gradient_checkpointing=True)

In [6]:
# Collapse the per-label indicator columns listed in `subset` into a single
# `labels` column holding one list of ints per row, for both splits.
for _split_frame in (training_set, testing_set):
    _split_frame['labels'] = _split_frame[subset].astype(int).values.tolist()

In [7]:
def tokenize(tokenizer, text, max_len):
    """Encode one text into fixed-length tensors.

    Args:
        tokenizer: object exposing a Hugging Face style ``encode_plus`` method.
        text: value to encode; it is converted with ``str()`` and runs of
            whitespace (including newlines) are collapsed to single spaces.
        max_len: length the tokenizer is asked to truncate/pad the sequence to.

    Returns:
        dict with keys ``input_ids``, ``attention_mask`` and ``token_type_ids``,
        each a 1-D ``torch.long`` tensor built from the tokenizer's output.
    """
    # Normalise whitespace so layout differences do not change tokenisation.
    normalized = " ".join(str(text).split())

    encoded = tokenizer.encode_plus(
        normalized,
        None,  # no second segment
        truncation=True,
        add_special_tokens=True,
        max_length=max_len,
        # `pad_to_max_length=True` is deprecated; this is the documented
        # replacement with the same effect.
        padding="max_length",
        return_token_type_ids=True,
    )

    return {
        "input_ids": torch.tensor(encoded["input_ids"], dtype=torch.long),
        "attention_mask": torch.tensor(encoded["attention_mask"], dtype=torch.long),
        "token_type_ids": torch.tensor(encoded["token_type_ids"], dtype=torch.long),
    }

In [8]:
# --- Run configuration -------------------------------------------------------
MAX_LEN_CLAIMS = 512      # max_len passed to tokenize() for the claims text
MAX_LEN_ABSTRACT = 160    # max_len passed to tokenize() for the abstract text
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
EPOCHS = 100
LEARNING_RATE = 1e-5
SEED = 17
PRED_THRES = 0.4
ACCUM_STEPS = 8
NUM_LABELS = len(subset)  # one classifier output per label name in `subset`

# SEED used to be declared but never applied, so DataLoader shuffling, dropout
# and the classifier initialisation differed between runs. Seed the Python,
# NumPy and torch RNGs once, before any model or loader is created.
pl.seed_everything(SEED)

# Timestamped per-run log directory.
logdir="/var/patentmark/logdir/fit2/" + datetime.now().strftime("%Y%m%d-%H%M%S")

In [9]:
class MultiLabelDataset(Dataset):
    """Map-style dataset yielding tokenized abstract/claims pairs with labels.

    Args:
        dataframe: pandas DataFrame with ``claims``, ``abstract`` and ``labels``
            columns; each ``labels`` cell holds a sequence of per-label values.
        tokenizer: tokenizer forwarded to the module-level ``tokenize`` helper.
    """

    def __init__(self, dataframe, tokenizer):
        self.tokenizer = tokenizer
        self.data = dataframe

        self.claims = dataframe["claims"]
        self.abstracts = dataframe["abstract"]

        self.labels = dataframe["labels"]

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the sample at *position* ``index``.

        Returns:
            dict with ``abstract`` and ``claims`` (each the dict produced by
            ``tokenize``) and ``labels`` (a ``torch.float`` tensor).
        """
        # Use the tokenizer handed to the constructor; the previous code
        # silently ignored it and read the global `tokenizer` instead.
        # `.iloc` gives positional access, matching the 0..len-1 indices a
        # sampler produces even when the DataFrame index is not a RangeIndex
        # (e.g. after filtering or a train/test split).
        abstract = tokenize(self.tokenizer, self.abstracts.iloc[index], max_len=MAX_LEN_ABSTRACT)
        claims = tokenize(self.tokenizer, self.claims.iloc[index], MAX_LEN_CLAIMS)
        labels = torch.tensor(np.array(self.labels.iloc[index]), dtype=torch.float)
        return {"abstract": abstract,
                "claims": claims,
                'labels': labels}
    
# Wrap each split in a MultiLabelDataset sharing the same tokenizer.
training_dataset = MultiLabelDataset(dataframe=training_set, tokenizer=tokenizer)
testing_dataset = MultiLabelDataset(dataframe=testing_set, tokenizer=tokenizer)

In [10]:
from sklearn.metrics import label_ranking_average_precision_score

In [11]:
from pytorch_lightning.loggers import TensorBoardLogger

# TensorBoard event files for this experiment go under
# /var/patentmark/tb_logs/basicModel.
logger = TensorBoardLogger(
    save_dir='/var/patentmark/tb_logs',
    name='basicModel',
)

class BasicModel(torch.nn.Module):
    """Two-input classifier sharing one pretrained text encoder.

    The abstract and the claims are each run through the same encoder; the
    position-0 vector of each encoding is concatenated, passed through dropout
    and projected to ``NUM_LABELS`` raw scores (no sigmoid is applied here).
    """

    def __init__(self):
        super().__init__()
        self.text_embedder = AutoModel.from_pretrained(model_name, gradient_checkpointing=True)

        # Width of a single text embedding, read off the encoder's pooler layer;
        # the classifier sees two of them side by side.
        single_width = self.text_embedder.pooler.dense.out_features

        self.dropout1 = nn.Dropout(0.1)
        self.classifier = nn.Linear(single_width * 2, NUM_LABELS)

    def _first_position_embedding(self, encoded):
        """Encode one tokenized input and keep the vector at sequence position 0."""
        encoder_output = self.text_embedder(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
        )
        # Element 0 of the encoder output, sliced to position 0 along dim 1
        # (presumably the [CLS] hidden state — confirm against the encoder).
        return encoder_output[0][:, 0]

    def forward(self, abstract, claims):
        """Return one row of ``NUM_LABELS`` scores per sample.

        Args:
            abstract: mapping with ``input_ids`` and ``attention_mask`` tensors.
            claims: mapping with ``input_ids`` and ``attention_mask`` tensors.
        """
        per_text = [self._first_position_embedding(part) for part in (abstract, claims)]
        features = torch.cat(per_text, 1)
        return self.classifier(self.dropout1(features))

class BasicSystem(pl.LightningModule):
    """Lightning wrapper that trains ``BasicModel`` as a multi-label classifier.

    Loss is ``BCEWithLogitsLoss`` on the raw model outputs; label-ranking
    average precision (LRAP) is logged alongside it for every batch.

    Args:
        batch_size: batch size of the training DataLoader.
        learning_rate: learning rate handed to AdamW.
    """

    def __init__(self, batch_size=4, learning_rate = 1e-05):
        super().__init__()
        self.model = BasicModel()
        self.loss_function = torch.nn.BCEWithLogitsLoss()
        self.learning_rate = learning_rate
        self.batch_size = batch_size

    def train_dataloader(self):
        """Shuffled loader over the module-level ``training_dataset``."""
        return DataLoader(training_dataset, batch_size=self.batch_size, shuffle=True, num_workers=0)

    def val_dataloader(self):
        """Ordered loader over the module-level ``testing_dataset`` (fixed batch size 4)."""
        return DataLoader(testing_dataset, batch_size=4, shuffle=False, num_workers=0)

    def forward(self, abstract, claims):
        """Return per-label probabilities (sigmoid of the model's raw scores)."""
        # torch.sigmoid replaces the deprecated torch.nn.functional.sigmoid.
        predictions = torch.sigmoid(self.model(abstract=abstract, claims=claims))
        return predictions

    def _shared_step(self, batch, stage):
        """Compute loss and LRAP for one batch and log both under ``<stage>_*``.

        Args:
            batch: dict with ``abstract``, ``claims`` and ``labels`` entries.
            stage: metric-name prefix, ``'train'`` or ``'val'``.

        Returns:
            The BCE-with-logits loss tensor for the batch.
        """
        labels = batch['labels']
        logits = self.model(abstract=batch['abstract'], claims=batch['claims'])
        loss = self.loss_function(logits, labels)

        # scikit-learn works on NumPy arrays, so detach and move to CPU first.
        labels_np = labels.detach().cpu().numpy()
        scores_np = torch.sigmoid(logits).detach().cpu().numpy()
        lrap = label_ranking_average_precision_score(labels_np, scores_np)

        self.log(f'{stage}_loss', loss, prog_bar=True)
        self.log(f'{stage}_lrap', lrap, prog_bar=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Log ``train_loss`` / ``train_lrap`` and return the loss to optimise."""
        return self._shared_step(batch, 'train')

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` / ``val_lrap``; returns nothing, as before."""
        self._shared_step(batch, 'val')

    def configure_optimizers(self):
        """AdamW over the wrapped model's parameters at ``self.learning_rate``."""
        optimizer = torch.optim.AdamW(params = self.model.parameters(), lr=self.learning_rate)
        return optimizer
        

In [12]:
# Lightning module with its constructor defaults (batch_size=4, learning_rate=1e-05).
model = BasicSystem()

# NOTE(review): the config cell defines ACCUM_STEPS = 8 and EPOCHS = 100, but
# neither is passed here — gradients are accumulated over 4 batches and no
# epoch limit is set explicitly. Confirm which values are intended.
trainer = pl.Trainer(
    logger=logger,
    gpus=1,
    accumulate_grad_batches=4,
)
trainer.fit(model)

GPU available: True, used: True
2020-12-02 20:13:59 - GPU available: True, used: True
TPU available: False, using: 0 TPU cores
2020-12-02 20:13:59 - TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
2020-12-02 20:13:59 - LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type              | Params
----------------------------------------------------
0 | model         | BasicModel        | 344 M 
1 | loss_function | BCEWithLogitsLoss | 0     
2020-12-02 20:14:01 - 
  | Name          | Type              | Params
----------------------------------------------------
0 | model         | BasicModel        | 344 M 
1 | loss_function | BCEWithLogitsLoss | 0     


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




1