In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
from torch import nn
from sklearn.feature_extraction.text import CountVectorizer
from transformers import BertTokenizer
from transformers import BertModel
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import f1_score
import warnings
warnings.simplefilter('ignore')
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import joblib
from sklearn import metrics
import torch.nn.functional as F
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
from util import *
import torch.nn.functional as F
from datetime import datetime
import pprint
import os
import torch
from torch import nn
import torch.nn.functional as F
import pytorch_lightning as pl
import util
#from catalyst.metrics.functional import process_multilabel_components
#from catalyst.metrics import multi_label_accuracy
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.core.decorators import auto_move_data
from sklearn.metrics import label_ranking_average_precision_score, accuracy_score, f1_score
#from ignite.utils import convert_tensor
#from pytorch_metric_learning.miners import MultiSimilarityMiner
#from pytorch_metric_learning.losses import TripletMarginLoss
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Emit INFO-level (and above) log records prefixed with a second-resolution timestamp.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

In [3]:
# Alphabetically sorted tag names from `all_tiers_100`, without "PersonalizedProduct".
subset = sorted(set(all_tiers_100) - {"PersonalizedProduct"})

In [4]:
import funcy as f
from random import shuffle

@f.collecting
def create_examples(row, num_neg=2):
    """Build ``(text, statement, label)`` sentence-pair examples for one patent row.

    The first example pairs the abstract with the claims and is always positive.
    After that, for the abstract and then for the claims, every tag in ``subset``
    is visited in random order and turned into the statement
    ``"Tagged as <translation>."``: a positive example when ``row[tag]`` is
    truthy, otherwise a negative one.

    Args:
        row: Object exposing ``abstract`` and ``claims`` attributes and
            supporting ``row[tag]`` for every tag in ``subset``.
        num_neg: Maximum number of negative examples emitted per text.
            ``None`` emits a negative for every tag the row does not carry.

    Returns:
        list of ``(str, str, bool)`` tuples (collected by ``f.collecting``).
    """
    abstract = row.abstract
    claims = row.claims
    yield (abstract, claims, True)

    # Shuffle a private copy. Shuffling the module-level `subset` in place
    # reorders a list that other code (e.g. the column selection that builds
    # the positive-class weights) reads after this function has run.
    tags = list(subset)
    for text in (abstract, claims):
        neg_count = 0
        shuffle(tags)
        for tag in tags:
            if row[tag]:
                yield (text, f"Tagged as {tier_translations[tag]}.", True)
            elif num_neg is None or neg_count < num_neg:
                neg_count += 1
                yield (text, f"Tagged as {tier_translations[tag]}.", False)
                


class PairDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one sentence pair per item.

    The base class is spelled out as ``torch.utils.data.Dataset`` because the
    bare name ``Dataset`` is imported from more than one package in this file.

    Args:
        data: pandas DataFrame with ``sentence1`` and ``sentence2`` columns and,
            when ``with_labels`` is true, a ``label`` column.
        tokenizer: Callable tokenizer accepting two texts plus ``padding``,
            ``truncation``, ``max_length`` and ``return_tensors`` keywords and
            returning ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        maxlen: Length every encoded pair is padded / truncated to.
        with_labels: Whether ``__getitem__`` also returns the label.
    """

    def __init__(self, data, tokenizer, maxlen=512, with_labels=True):
        self.data = data  # pandas dataframe
        self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.with_labels = with_labels

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(token_ids, attn_masks, token_type_ids[, label])`` for one pair.

        ``index`` is treated as a row position, so the frame does not need a
        0..n-1 index.
        """
        sent1 = self.data.sentence1.iloc[index]
        sent2 = self.data.sentence2.iloc[index]

        # Tokenize the pair to get token ids, attention masks and token type ids.
        encoded_pair = self.tokenizer(sent1, sent2,
                                      padding='max_length',  # Pad to max_length
                                      truncation=True,  # Truncate to max_length
                                      max_length=self.maxlen,
                                      return_tensors='pt')  # Return torch.Tensor objects

        # return_tensors='pt' adds a leading batch dimension of size 1; drop it.
        token_ids = encoded_pair['input_ids'].squeeze(0)
        attn_masks = encoded_pair['attention_mask'].squeeze(0)
        token_type_ids = encoded_pair['token_type_ids'].squeeze(0)

        if not self.with_labels:
            return token_ids, attn_masks, token_type_ids

        # np.long was a deprecated alias that NumPy 1.24 removed; use an
        # explicit 64-bit integer instead.
        label = np.int64(self.data.label.iloc[index])
        return token_ids, attn_masks, token_type_ids, label
        
    
class ConcatDataset(torch.utils.data.Dataset):
    """Zip several datasets: item ``i`` is the tuple of every member's item ``i``."""

    def __init__(self, *datasets):
        self.datasets = datasets

    def __len__(self):
        # Only indices valid for every member are exposed, i.e. the shortest length.
        return min(map(len, self.datasets))

    def __getitem__(self, i):
        items = [dataset[i] for dataset in self.datasets]
        return tuple(items)

In [5]:
class PatentDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding tokenized abstract/claims plus side features.

    The base class is spelled out as ``torch.utils.data.Dataset`` because the
    bare name ``Dataset`` is imported from more than one package in this file.

    Args:
        dataframe: pandas DataFrame with ``claims``, ``abstract``, ``labels``,
            ``embedded_cpc`` and ``citation_based_embedding`` columns.
        tokenizer: Tokenizer exposing ``encode_plus``.
        abstract_max_len: Token length abstracts are padded / truncated to.
        claims_max_len: Token length claims are padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, abstract_max_len=160, claims_max_len=512):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.claims_max_len = claims_max_len
        self.abstract_max_len = abstract_max_len

        self.claims = dataframe.claims
        self.abstracts = dataframe.abstract
        self.labels = dataframe.labels

    def tokenize(self, text, max_len):
        """Encode ``text`` into fixed-length ``long`` tensors.

        Whitespace runs are collapsed to single spaces before encoding.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        """
        text = " ".join(str(text).split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=max_len,
            # `pad_to_max_length=True` is deprecated; this is the replacement
            # spelling and matches what PairDataset passes.
            padding='max_length',
            return_token_type_ids=True
        )

        return {
            "input_ids": torch.tensor(inputs['input_ids'], dtype=torch.long),
            "attention_mask": torch.tensor(inputs['attention_mask'], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
        }

    def __len__(self):
        """Number of patents."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the encoded texts, side embeddings and labels of one patent.

        ``index`` is treated as a row position, so the frame does not need a
        0..n-1 index.
        """
        abstract = self.tokenize(self.abstracts.iloc[index], max_len=self.abstract_max_len)
        claims = self.tokenize(self.claims.iloc[index], max_len=self.claims_max_len)
        embedded_cpc = torch.tensor(np.array(self.data.embedded_cpc.iloc[index]), dtype=torch.float)
        citation_emb = torch.tensor(np.array(self.data.citation_based_embedding.iloc[index]), dtype=torch.float)

        labels = torch.tensor(self.labels.iloc[index])
        return {"abstract": abstract,
                "claims": claims,
                'embedded_cpc': embedded_cpc,
                'citation_emb': citation_emb,
                'labels': labels}
    


In [6]:
citation_embeddings = pd.read_parquet("/var/patentmark/citation_based_embeddings.parquet")


def _load_split(path, embeddings):
    """Read a gzipped JSON-lines split and attach the citation embeddings.

    Rows are matched on the split's ``publication_number`` column against the
    index of ``embeddings``. The join is an inner join, so rows without a
    matching embedding are dropped. The result gets a fresh index and the
    previous one is kept as a column.
    """
    records = pd.read_json(path, lines=True, orient="records")
    joined = records.merge(
        embeddings,
        how="inner",
        left_on="publication_number",
        right_index=True,
    )
    return joined.reset_index()


training_set = _load_split("training_set.json.gz", citation_embeddings)
testing_set = _load_split("testing_set.json.gz", citation_embeddings)

In [7]:
from sklearn.metrics import classification_report, f1_score

In [8]:
import torch.nn.functional as F
def linear_combination(x, y, epsilon):
    """Blend two values: weight ``epsilon`` on ``x`` and ``1 - epsilon`` on ``y``."""
    weighted_x = epsilon * x
    weighted_y = (1 - epsilon) * y
    return weighted_x + weighted_y

def reduce_loss(loss, reduction='mean'):
    """Apply ``reduction`` to ``loss``.

    ``'mean'`` returns ``loss.mean()``, ``'sum'`` returns ``loss.sum()``, and any
    other value returns ``loss`` untouched.
    """
    if reduction == 'mean':
        return loss.mean()
    if reduction == 'sum':
        return loss.sum()
    return loss

class LabelSmoothingCrossEntropy(nn.Module):
    """Weighted multi-label BCE blended with a term computed from the raw logits.

    Args:
        pos_weights: Per-class positive weights forwarded to
            ``F.binary_cross_entropy_with_logits`` as ``pos_weight``; ``None``
            disables the weighting.
        epsilon: Weight of the logit-sum term; the BCE term gets ``1 - epsilon``.
        reduction: ``'mean'``, ``'sum'`` or anything else for no reduction.
    """

    def __init__(self, pos_weights, epsilon:float=0.1, reduction='mean'):
        super().__init__()
        self.epsilon = epsilon
        self.reduction = reduction
        self.pos_weights = pos_weights

    def forward(self, preds, target):
        """Return ``epsilon * smoothing / n + (1 - epsilon) * bce`` for logits ``preds``."""
        n = preds.size()[-1]
        # NOTE(review): this term sums raw logits (the log_softmax step of the
        # multi-class formulation is disabled), so it is not bounded below and
        # rewards large positive logits for every class -- confirm this is the
        # intended smoothing for the multi-label case.
        #log_preds = F.log_softmax(preds, dim=-1)
        loss = reduce_loss(-preds.sum(dim=-1), self.reduction)

        # Follow the device of the logits instead of assuming "cuda:0", so the
        # loss also works on CPU and on other GPUs.
        pos_weight = self.pos_weights
        if pos_weight is not None:
            pos_weight = pos_weight.to(preds.device)

        nll = F.binary_cross_entropy_with_logits(preds, target, pos_weight=pos_weight, reduction=self.reduction)
        return linear_combination(loss/n, nll, self.epsilon)

In [9]:
from pytorch_lightning.metrics import functional as FM

def set_seed(seed):
    """Seed torch, numpy and ``random`` and request deterministic cuDNN behaviour."""
    # torch: default generator plus every CUDA device.
    for torch_seeder in (torch.manual_seed, torch.cuda.manual_seed_all):
        torch_seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # numpy and the standard library.
    for seeder in (np.random.seed, random.seed):
        seeder(seed)

    os.environ['PYTHONHASHSEED'] = str(seed)

class BasicSystem(pl.LightningModule):
    """Jointly trains a multi-label patent classifier and a sentence-pair classifier.

    Both heads share one pretrained text encoder:

    * the patent head concatenates the pooled encodings of abstract and claims
      with the CPC-code embedding and the citation embedding and maps them to
      one logit per label;
    * the pair head maps the pooled encoding of a sentence pair to one logit.

    The training loss is the sum of both heads' losses.

    Args:
        batch_size: Batch size of the train and validation dataloaders.
        internal_embedding_size, classifier_dropout, preclassifier_size,
            preclassifier_dropout: Stored on the instance; not read by any
            active code in this class.
        embedding_dropout: Dropout probability applied to the encoder features.
        lr_warmup_steps: Stored; only referenced by the disabled
            ``optimizer_step`` override.
        model_name: Name or path given to ``AutoModel`` / ``AutoTokenizer``.
        gradient_checkpointing: Forwarded to ``AutoModel.from_pretrained``.
        learning_rate: Learning rate of the AdamW optimizer.
        seed: Seed passed to ``set_seed``; ``None`` skips seeding.
    """

    def __init__(self,
                 batch_size=16,
                 internal_embedding_size=512,
                 classifier_dropout=0,
                 preclassifier_size = 256,
                 preclassifier_dropout = 0,
                 embedding_dropout= 0.2,
                 lr_warmup_steps=200,
                 model_name="bertForPatents/",
                 gradient_checkpointing=True,
                 learning_rate = 1e-5,
                 seed=42,
                 ):
        super().__init__()
        self.model_name = model_name
        # Private copy: the module-level `subset` list is shuffled in place by
        # create_examples(), which would otherwise reorder the label columns
        # used for the positive-class weights.
        self.subset = list(subset)
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.preclassifier_dropout = preclassifier_dropout
        self.preclassifier_size = preclassifier_size
        self.classifier_dropout = classifier_dropout
        self.embedding_dropout = embedding_dropout
        self.internal_embedding_size = internal_embedding_size
        self.gradient_checkpointing = gradient_checkpointing
        # `if seed:` would silently skip seeding for the valid seed 0.
        if seed is not None:
            set_seed(seed)
        self.lr_warmup_steps = lr_warmup_steps


    def prepare_data(self):
        """Build the sentence-pair table and load the tokenizer and CPC embeddings."""
        self.training_set = training_set
        self.testing_set = testing_set
        # One list of (sentence1, sentence2, label) examples per patent, flattened
        # into a single frame; num_neg=None keeps every negative example.
        training_triplets = pd.DataFrame(self.training_set.apply(f.partial(create_examples, num_neg=None), axis=1).explode().tolist()).drop_duplicates()
        training_triplets.columns = ["sentence1", "sentence2", "label"]
        self.training_triplets = training_triplets.reset_index()
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # Raw float32 file interpreted as rows of 32 values.
        self.cpc_embeddings = np.fromfile("/home/martin/patentmark/cpc.node2vec.emb.32d.bin", dtype=np.float32).reshape((-1,32))
        self.cpc_labelizer = joblib.load('./node2id.joblib')
        self.cpc_lookup = {c: n for n, c in enumerate(self.cpc_labelizer.classes_)}
        # Number of patents per sentence-pair example (printed for information).
        self.ratio = float(self.training_set.shape[0]) / float(self.training_triplets.shape[0])
        print(self.ratio)

    @f.collecting
    def convert_cpc_codes(self, codes):
        """Return the ids of the codes present in ``cpc_lookup``; unknown codes are skipped."""
        for code in codes:
            if code in self.cpc_lookup:
                yield self.cpc_lookup[code]

    def embed_cpc_codes(self,codes):
        """Average the embeddings of the known CPC codes; zeros when none is known."""
        embedding = np.zeros(32)
        converted = self.convert_cpc_codes(codes)

        if not converted:
            return embedding

        for code_id in converted:
            embedding = embedding + self.cpc_embeddings[code_id]

        return embedding / len(converted)


    def setup(self,stage):
        """Encode labels, embed CPC codes, build the datasets and (once) the layers."""
        training_labels = self.training_set[self.subset].apply(util.array_labels, axis=1)
        testing_labels = self.testing_set[self.subset].apply(util.array_labels, axis=1)
        all_labels = np.concatenate((training_labels, testing_labels))

        # Fitted on train + test so both splits share one label vocabulary.
        self.label_encoder = MultiLabelBinarizer()
        self.label_encoder.fit(all_labels)
        self.output_size = len(self.label_encoder.classes_)

        # np.float was an alias of the builtin float (float64) and has been
        # removed from NumPy; spell the dtype out.
        self.training_set['labels'] = self.label_encoder.transform(training_labels).astype(np.float64).tolist()
        self.testing_set['labels'] = self.label_encoder.transform(testing_labels).astype(np.float64).tolist()

        self.training_set['embedded_cpc'] = self.training_set.cpc_codes.apply(self.embed_cpc_codes)
        self.testing_set['embedded_cpc'] = self.testing_set.cpc_codes.apply(self.embed_cpc_codes)

        # NOTE(review): ConcatDataset indexes both members with the same i and
        # is as long as the shorter one, so only the leading rows of the pair
        # table (as many as there are patents) are ever drawn -- confirm intended.
        self.label_dataset = PairDataset(self.training_triplets, tokenizer=self.tokenizer)
        self.patent_dataset = PatentDataset(self.training_set, self.tokenizer)
        self.training_dataset = ConcatDataset(self.label_dataset, self.patent_dataset)

        self.testing_dataset = PatentDataset(self.testing_set, self.tokenizer)

        # Build the layers only on the first call: setup() runs again for later
        # stages, and rebuilding would replace trained weights with fresh ones.
        if getattr(self, "text_embedder", None) is None:
            self.setup_embedder()
            self.setup_classifier()
            self.setup_binary_classifier()


    def setup_embedder(self):
        """Load the pretrained text encoder and create the feature dropout layer."""
        self.text_embedder = AutoModel.from_pretrained(self.model_name, gradient_checkpointing=self.gradient_checkpointing)
#         for param in self.text_embedder.base_model.parameters():
#             param.requires_grad=False

        # Take the width from the loaded model's config; fall back to the
        # previously hard-coded 1024 when the config has no `hidden_size`.
        self.embedding_size = getattr(self.text_embedder.config, "hidden_size", 1024)
        self.embedding_dropout_layer = nn.Dropout(self.embedding_dropout)
        #self.embedding_layer = nn.Linear(full_embedding_size, self.internal_embedding_size)


    def setup_binary_classifier(self):
        """Create the single-logit head used for sentence pairs."""
        self.binary_classifier_layer = nn.Linear(self.embedding_size, 1)

    def setup_classifier(self):
        """Create the multi-label head and its loss with inverse-frequency weights."""
        # Input = abstract encoding + claims encoding + 32-d CPC embedding + 32
        # (presumably the citation embedding width -- TODO confirm).
        self.classifier_layer = nn.Linear(self.embedding_size * 2 + 32 + 32, self.output_size)
        # Weight of each label = 1 / (share of training rows carrying it).
        # float32 and device-agnostic: no hard-coded 'cuda:0' here.
        # NOTE(review): the weights follow the column order of `self.subset`,
        # the targets follow `label_encoder.classes_`; verify both orders agree.
        label_frequency = self.training_set[self.subset].sum() / self.training_set.shape[0]
        pos_weights = torch.tensor((1 / label_frequency).values, dtype=torch.float)
        self.loss_function = self.get_loss_function(pos_weights)


    def classify_patent(self, abstract, claims, embedded_cpc, citation_emb):
        """Return the label logits for a batch of patents."""
        abstract_emb = self.text_embedder(input_ids=abstract["input_ids"], attention_mask=abstract["attention_mask"])
        abstract_emb = abstract_emb[1]  # second encoder output (pooled representation)

        claim_emb = self.text_embedder(input_ids=claims["input_ids"], attention_mask=claims["attention_mask"])
        claim_emb = claim_emb[1]

        x = torch.cat((abstract_emb, claim_emb, embedded_cpc, citation_emb), 1)

        if self.embedding_dropout > 0:
            x = self.embedding_dropout_layer(x)

        x = F.elu(x)
        x = self.classifier_layer(x)

        return x

    @auto_move_data
    def get_loss_function(self, pos_weights):
        """Return the loss module used for the multi-label head."""
        #return nn.BCEWithLogitsLoss(pos_weight=pos_weights)
        return LabelSmoothingCrossEntropy(pos_weights)

    @auto_move_data
    def forward(self, abstract, claims, embedded_cpc, citation_emb):
        """Inference entry point; same computation as ``classify_patent``."""
        x = self.classify_patent(abstract, claims, embedded_cpc, citation_emb)
        return x

    def pair_forward(self, input_ids, attn_masks, token_type_ids):
        """Return one logit per encoded sentence pair."""
        encoder_output = self.text_embedder(input_ids=input_ids,
                                            attention_mask=attn_masks,
                                            token_type_ids=token_type_ids)
        # Index instead of tuple-unpacking: indexing works for tuple outputs and
        # for dict-style model outputs, and matches classify_patent().
        pooler_output = encoder_output[1]
        logits = self.binary_classifier_layer(self.embedding_dropout_layer(pooler_output))
        return logits

    def train_dataloader(self):
        """Shuffled loader over the zipped (pair, patent) training dataset."""
        return DataLoader(self.training_dataset, batch_size=self.batch_size, shuffle=True, num_workers=32)

    def val_dataloader(self):
        """Unshuffled loader over the testing patents."""
        return DataLoader(self.testing_dataset, batch_size=self.batch_size, shuffle=False, num_workers=32)

    def test_dataloader(self):
        """Same loader as validation."""
        return self.val_dataloader()

    def training_step(self, concat_batch, batch_idx):
        """Return the summed patent-classification and sentence-pair loss."""
        pair_batch, batch = concat_batch

        abstract = batch['abstract']
        claims = batch['claims']
        embedded_cpc = batch['embedded_cpc']
        citation_emb = batch['citation_emb']
        labels = batch['labels']

        logits = self.classify_patent(abstract, claims, embedded_cpc, citation_emb)
        class_loss = self.loss_function(logits.squeeze(-1), labels.float()) #@* self.ratio

        seq, attn_masks, token_type_ids, pair_labels = pair_batch
        pair_logits = self.pair_forward(seq, attn_masks, token_type_ids)
        pair_loss = nn.BCEWithLogitsLoss()(pair_logits.squeeze(-1), pair_labels.float())

        loss = class_loss + pair_loss

        self.log("class_loss", class_loss, prog_bar=True)
        self.log("pair_loss", pair_loss, prog_bar=True)
        self.log("train_loss", loss, prog_bar=True)

        return loss


    def validation_step(self, batch, batch_idx):
        """Log the validation loss and return ``(predictions, labels)`` as bool lists."""
        abstract = batch['abstract']
        claims = batch['claims']
        embedded_cpc = batch['embedded_cpc']
        labels = batch['labels']
        citation_emb = batch['citation_emb']

        logits = self.classify_patent(abstract, claims, embedded_cpc, citation_emb)
        loss = self.loss_function(logits.squeeze(-1), labels.float())
        self.log("val_loss", loss, prog_bar=True)

        # A label is predicted when its sigmoid probability reaches 0.5.
        # torch.sigmoid replaces the deprecated F.sigmoid; builtin bool replaces
        # the removed np.bool alias.
        predictions = (torch.sigmoid(logits).squeeze(-1).cpu().detach().numpy() >= 0.5).tolist()
        labels = labels.cpu().detach().numpy().astype(bool).tolist()

        return predictions, labels

    def validation_epoch_end(self, outputs):
        """Log sample-averaged F1 over the epoch and print a per-label report."""
        predictions = list(f.cat(x[0] for x in outputs))
        labels = list(f.cat(x[1] for x in outputs))
        self.log("val_f1", f1_score(labels, predictions, average="samples"), prog_bar=True)
        print(classification_report(labels, predictions, target_names=self.label_encoder.classes_, digits=4))

#     def optimizer_step(self, optimizer, *args, **kwargs):
#         if self.trainer.global_step < self.lr_warmup_steps:
#             lr_scale = min(1., float(self.trainer.global_step + 1) / float(self.lr_warmup_steps))
#             lr = lr_scale * self.learning_rate
#             self.log('learning_rate', lr)
#             for pg in optimizer.param_groups:
#                 pg['lr'] = lr

#         optimizer.step()
#         optimizer.zero_grad()

    def configure_optimizers(self):
        """AdamW over all parameters with the configured learning rate."""
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)

        

In [10]:
# System with its default hyper-parameters.
model = BasicSystem()

# Single-GPU trainer that logs on every step and flushes the logs every 10 steps.
# Options tried before and currently switched off:
#   callbacks=[early_stopping] with
#       EarlyStopping('val_f1', mode="max", verbose=True, patience=10) or
#       EarlyStopping('loss', mode="min", verbose=True, patience=3)
#   overfit_batches=5, precision=16, auto_scale_batch_size=True,
#   auto_lr_find=True, val_check_interval=100000, limit_val_batches=0.0
trainer = pl.Trainer(
    gpus=1,
    log_every_n_steps=1,
    flush_logs_every_n_steps=10,
)


GPU available: True, used: True
2020-12-15 09:18:25 - GPU available: True, used: True
TPU available: False, using: 0 TPU cores
2020-12-15 09:18:25 - TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
2020-12-15 09:18:25 - LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


In [11]:
#trainer.tune(model)

In [12]:
# Train the system. No dataloaders are passed here because BasicSystem
# provides them itself (train_dataloader / val_dataloader).
trainer.fit(model)

0.025049991893206506



  | Name                    | Type                       | Params
-----------------------------------------------------------------------
0 | text_embedder           | BertModel                  | 344 M 
1 | embedding_dropout_layer | Dropout                    | 0     
2 | classifier_layer        | Linear                     | 46 K  
3 | loss_function           | LabelSmoothingCrossEntropy | 0     
4 | binary_classifier_layer | Linear                     | 1 K   
2020-12-15 09:18:33 - 
  | Name                    | Type                       | Params
-----------------------------------------------------------------------
0 | text_embedder           | BertModel                  | 344 M 
1 | embedding_dropout_layer | Dropout                    | 0     
2 | classifier_layer        | Linear                     | 46 K  
3 | loss_function           | LabelSmoothingCrossEntropy | 0     
4 | binary_classifier_layer | Linear                     | 1 K   


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

                                               precision    recall  f1-score   support

                        Analysis and Modeling     0.0000    0.0000    0.0000        13
           Analysis and Modeling: 3D Modeling     0.0000    0.0000    0.0000        10
                            Anatomical Target     0.6875    1.0000    0.8148        22
           Anatomical Target: Lower Extremity     0.5333    0.5333    0.5333        15
     Anatomical Target: Lower Extremity - Hip     0.0000    0.0000    0.0000         5
    Anatomical Target: Lower Extremity - Knee     0.3125    1.0000    0.4762        10
                     Anatomical Target: Torso     0.0625    1.0000    0.1176         2
             Anatomical Target: Torso - Spine     0.0000    0.0000    0.0000         1
           Anatomical Target: Upper Extremity     0.1724    1.0000    0.2941         5
Anatomical Target: Upper Extremity - Shoulder     0.1562    1.0000    0.2703         5
                                      Imag

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

                                               precision    recall  f1-score   support

                        Analysis and Modeling     0.3333    1.0000    0.5000        78
           Analysis and Modeling: 3D Modeling     0.2778    1.0000    0.4348        65
                            Anatomical Target     0.6752    1.0000    0.8061       158
           Anatomical Target: Lower Extremity     0.4615    1.0000    0.6316       108
     Anatomical Target: Lower Extremity - Hip     0.0000    0.0000    0.0000        38
    Anatomical Target: Lower Extremity - Knee     0.3333    1.0000    0.5000        78
                     Anatomical Target: Torso     0.0000    0.0000    0.0000        34
             Anatomical Target: Torso - Spine     0.0000    0.0000    0.0000        20
           Anatomical Target: Upper Extremity     0.0000    0.0000    0.0000        29
Anatomical Target: Upper Extremity - Shoulder     0.0897    1.0000    0.1647        21
                                      Imag

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

                                               precision    recall  f1-score   support

                        Analysis and Modeling     0.3333    1.0000    0.5000        78
           Analysis and Modeling: 3D Modeling     0.2778    1.0000    0.4348        65
                            Anatomical Target     0.6752    1.0000    0.8061       158
           Anatomical Target: Lower Extremity     0.4615    1.0000    0.6316       108
     Anatomical Target: Lower Extremity - Hip     0.0000    0.0000    0.0000        38
    Anatomical Target: Lower Extremity - Knee     0.3333    1.0000    0.5000        78
                     Anatomical Target: Torso     0.0000    0.0000    0.0000        34
             Anatomical Target: Torso - Spine     0.0000    0.0000    0.0000        20
           Anatomical Target: Upper Extremity     0.0000    0.0000    0.0000        29
Anatomical Target: Upper Extremity - Shoulder     0.0897    1.0000    0.1647        21
                                      Imag

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

                                               precision    recall  f1-score   support

                        Analysis and Modeling     0.3333    1.0000    0.5000        78
           Analysis and Modeling: 3D Modeling     0.2778    1.0000    0.4348        65
                            Anatomical Target     0.6752    1.0000    0.8061       158
           Anatomical Target: Lower Extremity     0.4615    1.0000    0.6316       108
     Anatomical Target: Lower Extremity - Hip     0.0000    0.0000    0.0000        38
    Anatomical Target: Lower Extremity - Knee     0.3333    1.0000    0.5000        78
                     Anatomical Target: Torso     0.0000    0.0000    0.0000        34
             Anatomical Target: Torso - Spine     0.0905    1.0000    0.1660        20
           Anatomical Target: Upper Extremity     0.0000    0.0000    0.0000        29
Anatomical Target: Upper Extremity - Shoulder     0.0897    1.0000    0.1647        21
                                      Imag

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

                                               precision    recall  f1-score   support

                        Analysis and Modeling     0.3333    1.0000    0.5000        78
           Analysis and Modeling: 3D Modeling     0.2778    1.0000    0.4348        65
                            Anatomical Target     0.6752    1.0000    0.8061       158
           Anatomical Target: Lower Extremity     0.4615    1.0000    0.6316       108
     Anatomical Target: Lower Extremity - Hip     0.0000    0.0000    0.0000        38
    Anatomical Target: Lower Extremity - Knee     0.3333    1.0000    0.5000        78
                     Anatomical Target: Torso     0.0000    0.0000    0.0000        34
             Anatomical Target: Torso - Spine     0.0833    0.8000    0.1509        20
           Anatomical Target: Upper Extremity     0.0000    0.0000    0.0000        29
Anatomical Target: Upper Extremity - Shoulder     0.0946    1.0000    0.1728        21
                                      Imag

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

                                               precision    recall  f1-score   support

                        Analysis and Modeling     0.3333    1.0000    0.5000        78
           Analysis and Modeling: 3D Modeling     0.2778    1.0000    0.4348        65
                            Anatomical Target     0.6752    1.0000    0.8061       158
           Anatomical Target: Lower Extremity     0.4615    1.0000    0.6316       108
     Anatomical Target: Lower Extremity - Hip     0.0000    0.0000    0.0000        38
    Anatomical Target: Lower Extremity - Knee     0.3333    1.0000    0.5000        78
                     Anatomical Target: Torso     0.0000    0.0000    0.0000        34
             Anatomical Target: Torso - Spine     0.0841    0.4500    0.1417        20
           Anatomical Target: Upper Extremity     0.0000    0.0000    0.0000        29
Anatomical Target: Upper Extremity - Shoulder     0.0976    0.9524    0.1770        21
                                      Imag

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

                                               precision    recall  f1-score   support

                        Analysis and Modeling     0.3333    1.0000    0.5000        78
           Analysis and Modeling: 3D Modeling     0.2778    1.0000    0.4348        65
                            Anatomical Target     0.6752    1.0000    0.8061       158
           Anatomical Target: Lower Extremity     0.4615    1.0000    0.6316       108
     Anatomical Target: Lower Extremity - Hip     0.0000    0.0000    0.0000        38
    Anatomical Target: Lower Extremity - Knee     0.3333    1.0000    0.5000        78
                     Anatomical Target: Torso     0.0000    0.0000    0.0000        34
             Anatomical Target: Torso - Spine     0.0909    0.5000    0.1538        20
           Anatomical Target: Upper Extremity     0.0000    0.0000    0.0000        29
Anatomical Target: Upper Extremity - Shoulder     0.1105    0.9048    0.1969        21
                                      Imag

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

                                               precision    recall  f1-score   support

                        Analysis and Modeling     0.3333    1.0000    0.5000        78
           Analysis and Modeling: 3D Modeling     0.2778    1.0000    0.4348        65
                            Anatomical Target     0.6752    1.0000    0.8061       158
           Anatomical Target: Lower Extremity     0.4615    1.0000    0.6316       108
     Anatomical Target: Lower Extremity - Hip     0.0000    0.0000    0.0000        38
    Anatomical Target: Lower Extremity - Knee     0.3333    1.0000    0.5000        78
                     Anatomical Target: Torso     0.0000    0.0000    0.0000        34
             Anatomical Target: Torso - Spine     0.1429    0.3000    0.1935        20
           Anatomical Target: Upper Extremity     0.0000    0.0000    0.0000        29
Anatomical Target: Upper Extremity - Shoulder     0.0980    0.4762    0.1626        21
                                      Imag

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

                                               precision    recall  f1-score   support

                        Analysis and Modeling     0.3333    1.0000    0.5000        78
           Analysis and Modeling: 3D Modeling     0.2778    1.0000    0.4348        65
                            Anatomical Target     0.6752    1.0000    0.8061       158
           Anatomical Target: Lower Extremity     0.4615    1.0000    0.6316       108
     Anatomical Target: Lower Extremity - Hip     0.0000    0.0000    0.0000        38
    Anatomical Target: Lower Extremity - Knee     0.3333    1.0000    0.5000        78
                     Anatomical Target: Torso     0.0000    0.0000    0.0000        34
             Anatomical Target: Torso - Spine     0.0822    0.3000    0.1290        20
           Anatomical Target: Upper Extremity     0.0000    0.0000    0.0000        29
Anatomical Target: Upper Extremity - Shoulder     0.1075    0.9524    0.1932        21
                                      Imag

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

                                               precision    recall  f1-score   support

                        Analysis and Modeling     0.3333    1.0000    0.5000        78
           Analysis and Modeling: 3D Modeling     0.2778    1.0000    0.4348        65
                            Anatomical Target     0.6752    1.0000    0.8061       158
           Anatomical Target: Lower Extremity     0.4615    1.0000    0.6316       108
     Anatomical Target: Lower Extremity - Hip     0.0000    0.0000    0.0000        38
    Anatomical Target: Lower Extremity - Knee     0.3333    1.0000    0.5000        78
                     Anatomical Target: Torso     0.0000    0.0000    0.0000        34
             Anatomical Target: Torso - Spine     0.0879    0.4000    0.1441        20
           Anatomical Target: Upper Extremity     0.0000    0.0000    0.0000        29
Anatomical Target: Upper Extremity - Shoulder     0.1129    0.6667    0.1931        21
                                      Imag

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

                                               precision    recall  f1-score   support

                        Analysis and Modeling     0.3333    1.0000    0.5000        78
           Analysis and Modeling: 3D Modeling     0.2802    1.0000    0.4377        65
                            Anatomical Target     0.6752    1.0000    0.8061       158
           Anatomical Target: Lower Extremity     0.4615    1.0000    0.6316       108
     Anatomical Target: Lower Extremity - Hip     0.0000    0.0000    0.0000        38
    Anatomical Target: Lower Extremity - Knee     0.3348    1.0000    0.5016        78
                     Anatomical Target: Torso     0.0000    0.0000    0.0000        34
             Anatomical Target: Torso - Spine     0.0879    0.4000    0.1441        20
           Anatomical Target: Upper Extremity     0.1429    0.0690    0.0930        29
Anatomical Target: Upper Extremity - Shoulder     0.1049    0.7143    0.1829        21
                                      Imag

1

In [None]:
                                                precision    recall  f1-score   support

                        Analysis and Modeling     0.3333    1.0000    0.5000        78
           Analysis and Modeling: 3D Modeling     0.2778    1.0000    0.4348        65
                            Anatomical Target     0.6752    1.0000    0.8061       158
           Anatomical Target: Lower Extremity     0.4615    1.0000    0.6316       108
     Anatomical Target: Lower Extremity - Hip     0.0000    0.0000    0.0000        38
    Anatomical Target: Lower Extremity - Knee     0.3333    1.0000    0.5000        78
                     Anatomical Target: Torso     0.0000    0.0000    0.0000        34
             Anatomical Target: Torso - Spine     0.1429    0.3000    0.1935        20
           Anatomical Target: Upper Extremity     0.0000    0.0000    0.0000        29
Anatomical Target: Upper Extremity - Shoulder     0.0980    0.4762    0.1626        21
                                      Imaging     0.5494    1.0000    0.7091       128
                                  Imaging: CT     0.2393    1.0000    0.3862        56
                                 Imaging: MRI     0.2393    1.0000    0.3862        56
                          Imaging: Ultrasound     0.1129    0.2188    0.1489        32
                                Manufacturing     0.3291    1.0000    0.4952        77
        Manufacturing: Additive Manufacturing     0.0000    0.0000    0.0000        35
           Personalized Product: Guide or Jig     0.4829    1.0000    0.6513       113
                Personalized Product: Implant     0.5214    1.0000    0.6854       122
                         Specification of Use     0.3291    1.0000    0.4952        77
                Specification of Use: Disease     0.3750    0.1000    0.1579        30
      Specification of Use: Joint Replacement     0.1845    0.7381    0.2952        42
                              Surgical Method     0.3256    0.3590    0.3415        39

                                    micro avg     0.3673    0.8266    0.5086      1436
                                    macro avg     0.2732    0.6451    0.3628      1436
                                 weighted avg     0.3710    0.8266    0.4957      1436
                                  samples avg     0.3667    0.8372    0.4867      1436

In [14]:
                  precision    recall  f1-score   support

                        Analysis and Modeling       0.35      1.00      0.51        84
           Analysis and Modeling: 3D Modeling       0.31      0.97      0.47        71
                            Anatomical Target       0.67      1.00      0.81       164
           Anatomical Target: Lower Extremity       0.47      1.00      0.63       113
     Anatomical Target: Lower Extremity - Hip       0.16      1.00      0.28        40
    Anatomical Target: Lower Extremity - Knee       0.34      1.00      0.50        82
                     Anatomical Target: Torso       0.00      0.00      0.00        35
             Anatomical Target: Torso - Spine       0.00      0.00      0.00        21
           Anatomical Target: Upper Extremity       0.13      0.97      0.22        31
Anatomical Target: Upper Extremity - Shoulder       0.00      0.00      0.00        23
                                      Imaging       0.55      1.00      0.71       133
                                  Imaging: CT       0.00      0.00      0.00        59
                                 Imaging: MRI       0.00      0.00      0.00        59
                          Imaging: Ultrasound       0.00      0.00      0.00        32
                                Manufacturing       0.34      1.00      0.51        83
        Manufacturing: Additive Manufacturing       0.00      0.00      0.00        38
           Personalized Product: Guide or Jig       0.49      1.00      0.66       120
                Personalized Product: Implant       0.51      1.00      0.68       124
                         Specification of Use       0.33      1.00      0.49        79
                Specification of Use: Disease       0.00      0.00      0.00        30
      Specification of Use: Joint Replacement       0.21      0.91      0.34        44
                              Surgical Method       0.00      0.00      0.00        40

                                    micro avg       0.38      0.77      0.51      1505
                                    macro avg       0.22      0.58      0.31      1505
                                 weighted avg       0.34      0.77      0.46      1505
                                  samples avg       0.38      0.80      0.49      1505

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 5)

In [None]:
# Disabled call: un-commenting this would invoke trainer.test().
# NOTE(review): `trainer` is not defined in this part of the notebook
# (presumably a pytorch_lightning Trainer built earlier) - confirm before re-enabling.
#trainer.test()

In [None]:
# Disabled helper, kept for reference; the commented-out evaluation cell that
# follows calls it.
# stack(listOfDicts) collapses a list of dicts into a single dict: for every key
# of the FIRST dict it collects d[key] from each dict in the list and combines
# the collected values with torch.stack.
# As written it would raise IndexError on an empty list, and KeyError if any
# later dict lacks one of the first dict's keys.
# def stack(listOfDicts):
#     initDict = listOfDicts[0]
#     finalDict = {}
#     for key in initDict.keys():
#         tensors = tuple(d[key] for d in listOfDicts)
#         finalDict[key] = torch.stack(tensors)
#     return finalDict

In [None]:
# Disabled manual evaluation on `testing_set`, kept for reference.
# Steps, if re-enabled:
#   1. tokenize every abstract and every claims text, then stack the per-row
#      results (needs the `stack` helper, which is also commented out in this
#      notebook);
#   2. call model.forward with the stacked abstracts and claims;
#   3. binarize the detached predictions with a 0.5 threshold;
#   4. print a classification_report against the `subset` columns of testing_set.
# NOTE(review): `tokenize`, `tokenizer`, `model`, `testing_set`, MAX_LEN_ABSTRACT
# and MAX_LEN_CLAIMS are not defined in this part of the notebook - confirm they
# exist before re-enabling.
# abstracts = stack(list(tokenize(tokenizer, x, max_len=MAX_LEN_ABSTRACT) for x in testing_set.abstract))

# claims = stack(list(tokenize(tokenizer, x, max_len=MAX_LEN_CLAIMS) for x in testing_set.claims))

# predictions = model.forward(abstract=abstracts, claims=claims)

# binarized = predictions.detach().numpy() > 0.5

# from sklearn.metrics import classification_report
# testing_labels = testing_set[subset]
# print(classification_report(testing_labels, binarized))