In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
from torch import nn
from sklearn.feature_extraction.text import CountVectorizer
from transformers import BertTokenizer
from transformers import BertModel
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from catalyst import dl
from catalyst import dl, utils
from ignite.engine import Engine, Events
from ignite.metrics import Accuracy, Loss, RunningAverage
from ignite.handlers import ModelCheckpoint, EarlyStopping
from ignite.contrib.handlers import ProgressBar
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss
from sklearn.metrics import f1_score
import warnings
from ignite.utils import convert_tensor
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import torch.nn.functional as F
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
from util import *
import torch.nn.functional as F
from pytorch_metric_learning import miners, losses
import catalyst.contrib as contrib
from datetime import datetime
import pprint
from ignite.engine import _prepare_batch
from ignite.engine import create_supervised_trainer

In [2]:
# Configure root logging: INFO level, "YYYY-MM-DD HH:MM:SS - message" lines.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

In [3]:
# Both splits are JSON-lines files (one record per line); pandas infers the
# gzip compression from the ".gz" suffix.
training_set, testing_set = (
    pd.read_json(path, lines=True, orient="records")
    for path in ("training_set.json.gz", "testing_set.json.gz")
)

In [4]:
# Target label columns: every name in `all_tiers_100` except
# "PersonalizedProduct", in sorted order.
# NOTE(review): `all_tiers_100` is not defined in this notebook; presumably it
# arrives via `from util import *` — confirm.
subset = sorted(set(all_tiers_100) - {"PersonalizedProduct"})

In [5]:
# Directory of the pretrained checkpoint used for both the tokenizer and the encoder.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere without changing it.
model_name = "/home/martin/IdeaProjects/phenetics/bertForPatents/"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): `base_model` is not referenced again in the visible cells
# (BasicModel loads its own copy of the encoder), so this load looks redundant.
base_model = AutoModel.from_pretrained(model_name, gradient_checkpointing=True)

In [6]:
# Collapse the per-label indicator columns listed in `subset` into a single
# 'labels' column that holds one list of ints per row.
training_set['labels'] = training_set.loc[:, subset].astype(int).to_numpy().tolist()
testing_set['labels'] = testing_set.loc[:, subset].astype(int).to_numpy().tolist()

In [7]:
def tokenize(tokenizer, text, max_len):
    """Encode a single text into fixed-length model input tensors.

    Args:
        tokenizer: object exposing a Hugging Face style ``encode_plus`` method.
        text: value to encode; it is coerced with ``str()`` and runs of
            whitespace are collapsed to single spaces before encoding.
        max_len: length the encoding is truncated and padded to.

    Returns:
        dict with the keys ``input_ids``, ``attention_mask`` and
        ``token_type_ids``; each value is a ``torch.long`` tensor built from
        the matching ``encode_plus`` entry.
    """
    text = " ".join(str(text).split())

    inputs = tokenizer.encode_plus(
        text,
        None,
        truncation=True,
        add_special_tokens=True,
        max_length=max_len,
        # `pad_to_max_length=True` is deprecated; this is its replacement.
        padding="max_length",
        return_token_type_ids=True,
    )

    return {
        key: torch.tensor(inputs[key], dtype=torch.long)
        for key in ("input_ids", "attention_mask", "token_type_ids")
    }

In [8]:
# --- Run configuration -------------------------------------------------------
MAX_LEN_CLAIMS = 512      # token length the claims text is truncated/padded to
MAX_LEN_ABSTRACT = 160    # token length the abstract text is truncated/padded to
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
EPOCHS = 100              # NOTE(review): not referenced in the visible cells (the run passes max_epochs=5)
LEARNING_RATE = 1e-5
SEED = 17
PRED_THRES = 0.4          # NOTE(review): not referenced in the visible cells
ACCUM_STEPS = 8           # the optimizer steps once every ACCUM_STEPS iterations
NUM_LABELS = len(subset)  # one output unit per target label

# SEED was declared but never applied. Seed the generators before the model
# and the shuffling data loader are created so runs are repeatable.
torch.manual_seed(SEED)
np.random.seed(SEED)

device = utils.get_device()
# NOTE(review): absolute, machine-specific log directory; `logdir` is not
# referenced in the visible cells.
logdir = "/var/patentmark/logdir/fit2/" + datetime.now().strftime("%Y%m%d-%H%M%S")

In [9]:
class MultiLabelDataset(Dataset):
    """Dataset yielding tokenized abstract/claims pairs with multi-label targets.

    Relies on the module-level ``tokenize`` helper and on the
    ``MAX_LEN_ABSTRACT`` / ``MAX_LEN_CLAIMS`` constants.
    """

    def __init__(self, dataframe, tokenizer):
        """Keep references to the frame, its text/label columns and the tokenizer.

        Args:
            dataframe: frame with ``claims``, ``abstract`` and ``labels`` columns.
            tokenizer: tokenizer handed to ``tokenize`` for every item.
        """
        self.tokenizer = tokenizer
        self.data = dataframe

        self.claims = dataframe.claims
        self.abstracts = dataframe.abstract

        self.labels = dataframe.labels

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the encoded item at *position* ``index``.

        Returns:
            dict with ``abstract`` and ``claims`` (the outputs of ``tokenize``)
            and ``labels`` (a ``torch.float`` tensor of the row's label list).
        """
        # Use the tokenizer this dataset was built with, not whichever global
        # happens to be named `tokenizer`. Index by position (.iloc) because
        # samplers produce positions in 0..len-1, which only coincide with
        # index labels when the frame has a default RangeIndex.
        abstract = tokenize(self.tokenizer, self.abstracts.iloc[index], MAX_LEN_ABSTRACT)
        claims = tokenize(self.tokenizer, self.claims.iloc[index], MAX_LEN_CLAIMS)
        labels = torch.tensor(self.labels.iloc[index], dtype=torch.float)
        return {"abstract": abstract,
                "claims": claims,
                'labels': labels}
    
# Wrap each split in a dataset that tokenizes rows on access.
training_dataset = MultiLabelDataset(training_set, tokenizer)
testing_dataset = MultiLabelDataset(testing_set, tokenizer)

# Loader settings: only the training split is shuffled; num_workers=0 keeps
# batch construction in the main process.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

training_loader = DataLoader(training_dataset, **train_params)
testing_loader = DataLoader(testing_dataset, **test_params)

In [10]:
class BasicModel(torch.nn.Module):
    """Multi-label classifier over an (abstract, claims) text pair.

    Both texts go through the same pretrained encoder; the first-token vectors
    of the two encodings are concatenated and fed through dropout, a dense
    ELU layer and a linear output layer, followed by a sigmoid.
    """

    def __init__(self):
        super().__init__()
        self.text_embedder = AutoModel.from_pretrained(model_name, gradient_checkpointing=True)

        # forward() concatenates two first-token hidden states, so the input
        # width is twice the encoder's hidden size. Read it from the config
        # rather than from the pooler head, which forward() never uses.
        total_embedding_size = self.text_embedder.config.hidden_size * 2
        output_size = NUM_LABELS
        bottleneck = 768

        self.dropout1 = nn.Dropout(0.1)
        self.dense1 = nn.Linear(total_embedding_size, bottleneck)
        self.classifier = nn.Linear(bottleneck, output_size)

    def _first_token_embedding(self, encoded):
        """Run the shared encoder and return the position-0 vector of its first output."""
        outputs = self.text_embedder(input_ids=encoded["input_ids"],
                                     attention_mask=encoded["attention_mask"])
        return outputs[0][:, 0]

    def forward(self, abstract, claims):
        """Return per-label probabilities for a batch.

        Args:
            abstract: mapping with ``input_ids`` and ``attention_mask`` tensors.
            claims: mapping with ``input_ids`` and ``attention_mask`` tensors.

        Returns:
            Tensor of sigmoid outputs with one column per classifier unit.
        """
        abstract_emb = self._first_token_embedding(abstract)
        claim_emb = self._first_token_embedding(claims)

        x = torch.cat((abstract_emb, claim_emb), 1)
        x = self.dropout1(x)
        x = F.elu(self.dense1(x))
        x = self.classifier(x)
        # torch.sigmoid replaces the deprecated F.sigmoid; the values are identical.
        return torch.sigmoid(x)

# Build the classifier, then move its parameters onto the selected device.
model = BasicModel()
model = model.to(device)

In [11]:
# AdamW over every parameter of the model.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
# BasicModel.forward already ends in a sigmoid, so the loss takes
# probabilities (BCELoss) rather than raw logits.
loss_function = nn.BCELoss()

In [12]:
from ignite.metrics import Accuracy, Precision, Recall, Loss

In [None]:
def basic_train_step(engine, batch):
    """Run one training iteration with gradient accumulation.

    Args:
        engine: the ignite engine driving training; ``engine.state.iteration``
            decides when the optimizer steps.
        batch: mapping with ``abstract``, ``claims`` and ``labels`` entries.

    Returns:
        The unscaled loss of this batch as a Python float.
    """
    model.train()

    abstract = convert_tensor(batch['abstract'], device=device)
    claims = convert_tensor(batch['claims'], device=device)
    labels = convert_tensor(batch['labels'], device=device)

    predictions = model(abstract=abstract, claims=claims)
    loss = loss_function(predictions, labels)
    # Gradients add up across the accumulation window. Dividing by ACCUM_STEPS
    # makes the applied update the average of the window, not its sum.
    (loss / ACCUM_STEPS).backward()

    if engine.state.iteration % ACCUM_STEPS == 0:
        optimizer.step()
        optimizer.zero_grad()

    # Report the unscaled loss so logged values stay comparable to validation loss.
    return loss.item()

def basic_validation_step(engine, batch):
    """Score one validation batch without tracking gradients.

    Switches the model to eval mode, moves the batch entries to ``device``
    and returns ``(predictions, labels)``.
    """
    model.eval()
    abstract, claims, labels = (
        convert_tensor(batch[key], device=device)
        for key in ('abstract', 'claims', 'labels')
    )

    with torch.no_grad():
        return model(abstract=abstract, claims=claims), labels
    
def thresholded_output_transform(output, threshold=0.5):
    """Binarize predicted probabilities for threshold-based metrics.

    Args:
        output: ``(y_pred, y)`` pair as returned by the validation step.
        threshold: probabilities strictly greater than this value become 1,
            everything else 0. The default of 0.5 reproduces the previous
            ``torch.round`` behaviour for probabilities in [0, 1], including
            an exact 0.5 mapping to 0.

    Returns:
        ``(y_pred_binary, y)``, where ``y_pred_binary`` has the dtype of
        ``y_pred`` and ``y`` is passed through untouched.
    """
    y_pred, y = output
    y_pred = (y_pred > threshold).to(y_pred.dtype)
    return y_pred, y

# Training engine: calls basic_train_step once per batch.
trainer = Engine(basic_train_step)

# The step function returns the batch loss as a float, so the identity
# transform feeds it straight into a running average exposed as 'loss'.
RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')
# Progress bar for the training run, showing the running 'loss' metric.
pbar = ProgressBar(persist=True, bar_format="")
pbar.attach(trainer, ['loss'])

# Number of training iterations between two log lines.
log_interval = 100


@trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
def log_training_loss(trainer):
    """Print the current epoch and the most recent step output (the batch loss)."""
    state = trainer.state
    print("Epoch[{}] Loss: {:.2f}".format(state.epoch, state.output))

# Evaluation engine: calls basic_validation_step, which yields (predictions, labels).
evaluator = Engine(basic_validation_step)
# Predictions are binarized by thresholded_output_transform before being compared.
# NOTE(review): ignite's multilabel Accuracy appears to count a sample as correct
# only when every label matches, which would fit the logged val_accuracy of 0.0 — confirm.
Accuracy(output_transform=thresholded_output_transform, is_multilabel=True).attach(evaluator, 'val_accuracy')
# Same loss function as training, evaluated on the raw (un-thresholded) predictions.
Loss(loss_function).attach(evaluator, 'val_loss')

# This rebinds `pbar`: from here on the name refers to the evaluator's bar,
# while the trainer's bar stays attached but is no longer reachable by name.
pbar = ProgressBar(persist=False, bar_format="")
pbar.attach(evaluator)
# Precision(output_transform=thresholded_output_transform, is_multilabel=True, average=True).attach(evaluator, 'val_precision')
# #Accuracy(output_transform=thresholded_output_transform, is_multilabel=True, labelwise=True).attach(evaluator, 'val_label_acc')

from sklearn.metrics import label_ranking_average_precision_score

@trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
def log_validation_results(engine):
    """Evaluate on the test loader and log the resulting metrics.

    Fires every ``log_interval`` training iterations.

    Args:
        engine: the training engine; only ``engine.state.epoch`` is read.
    """
    # Announce the evaluation before it starts. Previously the message was
    # emitted only after the whole validation pass had finished.
    pbar.log_message("Running validation")
    evaluator.run(testing_loader)

    # Local name chosen so it does not shadow the imported `metrics` module.
    val_metrics = evaluator.state.metrics
    pbar.log_message(
        "Val Results - Epoch: {} \nMetrics\n{}"
        .format(engine.state.epoch, pprint.pformat(val_metrics)))

# Start training. NOTE(review): max_epochs is hard-coded to 5 here while the
# configuration cell defines EPOCHS = 100, which is not used — confirm which is intended.
trainer.run(training_loader, max_epochs=5)

2020-12-04 08:34:59 - Engine run starting with max_epochs=5.


HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

2020-12-04 08:37:10 - Engine run starting with max_epochs=1.


Epoch[1] Loss: 0.67


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

2020-12-04 08:37:35 - Epoch[1] Complete. Time taken: 00:00:25
2020-12-04 08:37:35 - Engine run complete. Time taken: 00:00:25


Running validation
Val Results - Epoch: 1 
Metrics
{'val_accuracy': 0.0, 'val_loss': 0.5321308877242446}


2020-12-04 08:39:50 - Engine run starting with max_epochs=1.


Epoch[1] Loss: 0.49


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

2020-12-04 08:40:15 - Epoch[1] Complete. Time taken: 00:00:25
2020-12-04 08:40:15 - Engine run complete. Time taken: 00:00:25


Running validation
Val Results - Epoch: 1 
Metrics
{'val_accuracy': 0.0, 'val_loss': 0.5325767942416815}


2020-12-04 08:41:14 - Epoch[1] Complete. Time taken: 00:06:14





HBox(children=(FloatProgress(value=0.0, max=243.0), HTML(value='')))

2020-12-04 08:42:29 - Engine run starting with max_epochs=1.


Epoch[2] Loss: 0.43


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

2020-12-04 08:42:54 - Epoch[1] Complete. Time taken: 00:00:25
2020-12-04 08:42:54 - Engine run complete. Time taken: 00:00:25


Running validation
Val Results - Epoch: 2 
Metrics
{'val_accuracy': 0.0, 'val_loss': 0.5320584902792801}


In [None]:


# #train_evaluator = Engine(validation_step)
# test_evaluator = Engine(validation_step)

# #Accuracy(is_multilabel=True, output_transform=thresholded_output_transform).attach(train_evaluator, 'train_accuracy')
# #Loss(classifier_function).attach(train_evaluator, 'train_loss')
# Accuracy(is_multilabel=True, output_transform=thresholded_output_transform).attach(test_evaluator, 'val_accuracy')
# Loss(classifier_function).attach(test_evaluator, 'val_loss')






# def log_validation_results(engine):
#     validation_evaluator.run(valid_iterator)
#     metrics = validation_evaluator.state.metrics
#     avg_accuracy = metrics['accuracy']
#     avg_bce = metrics['bce']
#     pbar.log_message(
#         "Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
#         .format(engine.state.epoch, avg_accuracy, avg_bce))
#     pbar.n = pbar.last_print_n = 0

In [None]:


# log_interval = 10
# @trainer.on(Events.ITERATION_COMPLETED(every=log_interval))
# def log_validation_results(trainer):
#     print("running validation")
#     evaluator.run(testing_loader)
#     metrics = evaluator.state.metrics
#     print("Validation Results - Epoch: {}  Avg accuracy: {:.2f}"
#           .format(trainer.state.epoch, metrics["accuracy"]))
