In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

import torch
from torch import nn

from transformers import BertTokenizer
from transformers import BertModel
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from catalyst import dl
from catalyst import dl, utils

import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging

In [2]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    return np.mean(acc_list)

In [3]:
from util import *

In [4]:
training_set = pd.read_json("training_set.json.gz", lines=True, orient="records")
testing_set = pd.read_json("testing_set.json.gz", lines=True, orient="records")

In [5]:
subset = list(set(all_tiers_100)-set(["PersonalizedProduct"]))
subset

['SpecificationofUse_Disease',
 'AnatomicalTarget',
 'AnalysisAndModeling',
 'Imaging_Ultrasound',
 'AnalysisAndModeling_3DModeling',
 'AnatomicalTarget_UpperExtremity_Shoulder',
 'AnatomicalTarget_Torso',
 'AnatomicalTarget_LowerExtremity_Hip',
 'Imaging',
 'Manufacturing',
 'SpecificationofUse_JointReplacement',
 'Imaging_CT',
 'Imaging_MRI',
 'AnatomicalTarget_UpperExtremity',
 'PersonalizedProduct_Guide/Jig',
 'PersonalizedProduct_Implant',
 'AnatomicalTarget_LowerExtremity',
 'AnatomicalTarget_Torso_Spine',
 'Manufacturing_AdditiveManufacturing',
 'SurgicalMethod',
 'SpecificationofUse',
 'AnatomicalTarget_LowerExtremity_Knee']

In [6]:
training_set['labels']=training_set[subset].astype(int).values.tolist()
testing_set['labels']=testing_set[subset].astype(int).values.tolist()

In [7]:
# training_set['label'] = training_set.AnalysisAndModeling.astype(int)
# testing_set['label'] = testing_set.AnalysisAndModeling.astype(int)

In [8]:
#training_set.label

In [9]:
#training_data = Dataset.from_pandas(training_set, split="training")
#testing_data = Dataset.from_pandas(testing_set, split="testing")

In [10]:
cpc_embeddings = np.fromfile("/home/martin/cpc.emb.verse.32d.bin", dtype=np.float32).reshape((-1,32))


In [11]:
cpc_embeddings.shape

(164296, 32)

In [12]:
import joblib
cpc_labelizer = joblib.load('./node2id.joblib')
cpc_lookup = {c: n for n, c in enumerate(cpc_labelizer.classes_)}

In [13]:
@f.collecting
def convert_cpc_codes(codes):
    for code in codes:
        if code in cpc_lookup:
            yield cpc_lookup[code]

In [14]:
def embed_cpc_codes(codes):
    embedding = np.zeros(32)
    converted = convert_cpc_codes(codes)
    
    if not converted:
        return embedding
    
    for code_id in converted:
        embedding = embedding + cpc_embeddings[code_id]
        
    return embedding / len(converted)

In [15]:
training_set['embedded_cpc'] = training_set.cpc_codes.apply(embed_cpc_codes)
training_set.embedded_cpc

0      [0.47922264933586123, -0.5535850822925568, 1.5...
1      [0.5486629103811888, -0.5560514299342265, 1.22...
2      [0.7154346669421476, -0.4854765569462496, 1.37...
3      [0.6616662045319875, -0.46471265455087024, 1.3...
4      [0.6568996906280518, -0.39169512952075286, 1.2...
                             ...                        
967    [0.595665054661887, -0.3463691904076508, 1.523...
968    [0.9643049397889305, -0.1920595151536605, 1.77...
969    [0.9868654266551689, -0.15657623001822718, 1.8...
970    [1.0199734419584274, 0.16987256426364183, 1.72...
971    [1.1024012953042983, -0.17339173816144465, 1.8...
Name: embedded_cpc, Length: 972, dtype: object

In [16]:
testing_set['embedded_cpc'] = testing_set.cpc_codes.apply(embed_cpc_codes)
testing_set.embedded_cpc

0      [0.8762110610802968, -0.04536154021819432, 1.7...
1      [0.9439048937388829, 0.09446692040988378, 1.71...
2      [0.5234275807936987, -0.4350042740503947, 1.25...
3      [0.5594762146472931, -0.5152973771095276, 1.43...
4      [0.8534082993865013, -0.30192091688513756, 1.1...
                             ...                        
238    [0.4795035521189372, -0.7007373770078024, 1.32...
239    [0.7032414277394613, -0.6353410681088766, 1.00...
240    [0.8008655309677124, -0.24568895250558853, 1.4...
241    [0.8711005724393405, -0.49474271673422593, 1.1...
242    [0.727506908774376, -0.5190484967082739, 1.258...
Name: embedded_cpc, Length: 243, dtype: object

In [17]:
MAX_LEN_CLAIMS = 512
MAX_LEN_ABSTRACT = 160
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
EPOCHS = 100
LEARNING_RATE = 1e-5
SEED = 17
PRED_THRES = 0.4
ACCUM_STEPS = 8

In [18]:
#model_name = "allenai/longformer-base-4096"
#model_name = "albert-base-v2"
model_name = "/home/martin/IdeaProjects/phenetics/bertForPatents/"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
#cpc_coder = CountVectorizer(analyzer=cpc_split, min_df=5)

In [20]:
training_set['citing'] = training_set[['citations', 'cited_by']].apply(
        lambda row: list(set(row['citations']+row['cited_by'])), axis=1)

In [21]:
testing_set['citing'] = testing_set[['citations', 'cited_by']].apply(
        lambda row: list(set(row['citations']+row['cited_by'])), axis=1)

In [22]:
training_set['people'] = training_set[['assignees', 'inventors']].apply(lambda row: list(set(row['assignees']+row['inventors'])), axis=1)

In [23]:
testing_set['people'] = testing_set[['assignees', 'inventors']].apply(lambda row: list(set(row['assignees']+row['inventors'])), axis=1)

In [24]:
def format(t):
    CORP_TYPES = set(
        [
            "INC",
            "LLC" "CORP",
            "KK",
            "SA",
            "SRL",
            "LTD",
            "NL",
            "PTY",
            "AG",
            "GMBH",
            "KG",
            "OG",
            "LIMITED",
            "SARL",
            "BM",
            "PLC",
            "LP",
            "IP",
            "DBA",
            "CORP",
            "CO",
        ]
    )

    tokenized = strip_non_alphanum(strip_punctuation(t)).upper().split(" ")
    cleaned = [t for t in tokenized if t not in CORP_TYPES]
    return "".join(cleaned)


people_coder = CountVectorizer(analyzer=lambda x: map(format, x), min_df=2)

In [25]:
citing_coder = CountVectorizer(analyzer=lambda x: x, min_df=4)

In [26]:
citing_coder.fit(training_set.citing)
len(citing_coder.vocabulary_)

2076

In [27]:
people_coder.fit(training_set.people)

CountVectorizer(analyzer=<function <lambda> at 0x7f806801bd30>, min_df=2)

In [28]:
len(people_coder.vocabulary_)

506

In [29]:
#cpc_coder.fit(training_set.cpc_codes)

In [30]:
#len(cpc_coder.vocabulary_)

In [31]:
#training_set['cpc_vec'] = list(cpc_coder.transform(training_set.cpc_codes).todense())
#testing_set['cpc_vec'] = list(cpc_coder.transform(testing_set.cpc_codes).todense())
training_set['people_vec'] = list(people_coder.transform(training_set.people).todense())
testing_set['people_vec'] = list(people_coder.transform(testing_set.people).todense())
training_set['citing_vec'] = list(citing_coder.transform(training_set.citing).todense())
testing_set['citing_vec'] = list(citing_coder.transform(testing_set.citing).todense())

In [32]:
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer):
        self.tokenizer = tokenizer
        self.data = dataframe
        
        self.claims = dataframe.claims
        self.abstracts = dataframe.abstract
        
        self.targets = self.data.labels

    def __len__(self):
        return len(self.data)

    def tokenize(self, text, max_len, prefix=""):
        text = str(text)
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            f"input_ids": torch.tensor(ids, dtype=torch.long),
            f"attention_mask": torch.tensor(mask, dtype=torch.long),
            f"token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
        }
    
            
    def __getitem__(self, index):
        
        abstract = self.tokenize(self.abstracts[index], max_len=MAX_LEN_ABSTRACT, prefix="abstract_")
        claims = self.tokenize(self.claims[index], MAX_LEN_CLAIMS, prefix="claims_")
        
        #cpcs = torch.tensor(np.array(self.data.cpc_vec[index].values), dtype=torch.float)
        #people = torch.tensor(np.array(self.data.people_vec[index].values), dtype=torch.float)
        #citing = torch.tensor(np.array(self.data.citing_vec[index].values), dtype=torch.float)
        embedded_cpc = torch.tensor(np.array(self.data.embedded_cpc[index]), dtype=torch.float)
        return {"abstract": abstract, 
                "claims": claims, 
                
                #'cpcs': cpcs,
                #'people': people,
                #'citing': citing,
                 'embedded_cpc': embedded_cpc,
                'targets': torch.tensor(self.targets[index], dtype=torch.float)}

In [33]:
training_dataset = MultiLabelDataset(training_set, tokenizer)
testing_dataset = MultiLabelDataset(testing_set, tokenizer)

In [34]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_dataset, **train_params)
testing_loader = DataLoader(testing_dataset, **test_params)

In [35]:
loaders = {"train": training_loader, "valid": testing_loader}

In [36]:
base_model = AutoModel.from_pretrained(model_name, gradient_checkpointing=True)

In [37]:
base_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(39859, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [38]:
base_model.pooler.dense.out_features

1024

In [39]:
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.text_embedder = AutoModel.from_pretrained(model_name, gradient_checkpointing=True)
        #self.cpc_embedder = torch.nn.EmbeddingBag.from_pretrained(torch.from_numpy(cpc_embeddings))
        
        self.dropout = torch.nn.Dropout(0.1)
        self.preclassifier = torch.nn.Linear(self.text_embedder.pooler.dense.out_features*2+32, 768)
        self.classifier = torch.nn.Linear(768, len(subset))
            
    def forward(self, abstract, claims, embedded_cpc):
        
        abstract_emb = self.text_embedder(input_ids=abstract["input_ids"], attention_mask=abstract["attention_mask"])
        abstract_emb = abstract_emb[0][:, 0]
        
        claim_emb = self.text_embedder(input_ids=claims["input_ids"], attention_mask=claims["attention_mask"])
        claim_emb = claim_emb[0][:, 0]
    
        all_emb = torch.cat((abstract_emb, claim_emb, embedded_cpc), 1)
        x = self.preclassifier(all_emb)
        x = torch.nn.Sigmoid()(x)
        x = self.dropout(x)
        x = self.classifier(x)
        
        return x

model = Model()

In [None]:
# class Model(torch.nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.text_embedder = AutoModel.from_pretrained(model_name, gradient_checkpointing=True)
#         #self.cpc_embedder = torch.nn.EmbeddingBag.from_pretrained(torch.from_numpy(cpc_embeddings))
#         self.dropout = torch.nn.Dropout(0.3)
#         self.classifier = torch.nn.Linear(self.text_embedder.pooler.dense.out_features*2+32, len(subset))
#         self.sigmoid = nn.Sigmoid()
        
#         #self.pre_classifier = torch.nn.Linear(base_model.pooler.dense.out_features*2+32, 768)
#         #self.cpc_embedder = torch.nn.Linear(len(cpc_coder.vocabulary_), 16)
#         #self.people_embedder = torch.nn.Linear(len(people_coder.vocabulary_), 16)
#         #self.citing_embedder = torch.nn.Linear(len(citing_coder.vocabulary_), 16)
        
#     def forward(self, abstract, claims, embedded_cpc):
        
#         abstract_emb = self.text_embedder(input_ids=abstract["input_ids"], attention_mask=abstract["attention_mask"])
#         abstract_emb = abstract_emb[0][:, 0]
        
#         claim_emb = self.text_embedder(input_ids=claims["input_ids"], attention_mask=claims["attention_mask"])
#         claim_emb = claim_emb[0][:, 0]
    
#         #cpc_emb = self.cpc_embedder(encoded_cpcs)
#         #people_emb = self.people_embedder(people)
#         #citing_emb = self.citing_embedder(citing)
        
#         #linear_emb = torch.cat((cpc_emb, people_emb, citing_emb), 1)
#         all_emb = torch.cat((abstract_emb, claim_emb, embedded_cpc), 1)
        
#         dropout = self.dropout(all_emb)
#         output = self.sigmoid(self.classifier(dropout))
#         return output

# model = Model()

In [40]:
model

Model(
  (text_embedder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(39859, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwi

In [41]:
device = utils.get_device()

In [42]:
from datetime import datetime
logdir="logdir/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")

In [43]:
%load_ext tensorboard

In [44]:
%tensorboard --logdir logdir/fit/

Reusing TensorBoard on port 6006 (pid 23398), started 3 days, 12:09:34 ago. (Use '!kill 23398' to kill it.)

In [45]:
import catalyst.contrib as contrib

In [46]:
#criterion = torch.nn.BCEWithLogitsLoss()
criterion = contrib.nn.criterion.LovaszLossMultiLabel()
optimizer = torch.optim.AdamW(params =  model.parameters(), lr=LEARNING_RATE)
scheduler = contrib.nn.OneCycleLRWithWarmup(optimizer, num_steps=500, lr_range=(1e-4, 1e-8), init_lr=1e-8, warmup_fraction=0.2)
#scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [2])
#scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
#lrfinder = dl.LRFinder(final_lr=1)

runner = dl.SupervisedRunner(input_key=("abstract", "claims", "embedded_cpc"))
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=loaders,
    logdir=logdir,
    num_epochs=EPOCHS,
    callbacks=[
               dl.MultiLabelAccuracyCallback(threshold=PRED_THRES, activation="None"),
               dl.EarlyStoppingCallback(patience=3, metric="loss", minimize=True),
               dl.TensorboardLogger(),
               #dl.CheckpointCallback(),
               dl.OptimizerCallback(accumulation_steps=ACCUM_STEPS),
               dl.ValidationManagerCallback(),
               ],
               #dl.MetricManagerCallback(num_classes=len(subset), )],
    
    fp16=True,
    verbose=True
)

1/100 * Epoch (train):   0% 0/243 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


1/100 * Epoch (train): 100% 243/243 [05:27<00:00,  1.35s/it, loss=1.129, lr=1.010e-06, momentum=0.899, multi_label_accuracy=0.761]
1/100 * Epoch (valid): 100% 61/61 [00:24<00:00,  2.46it/s, loss=1.027, multi_label_accuracy=0.788]
[2020-11-28 21:32:28,386] 
1/100 * Epoch 1 (_base): lr=1.010e-06 | momentum=0.8990
1/100 * Epoch 1 (train): loss=1.1131 | lr=5.120e-07 | momentum=0.8995 | multi_label_accuracy=0.6734
1/100 * Epoch 1 (valid): loss=1.0549 | multi_label_accuracy=0.7095
2/100 * Epoch (train): 100% 243/243 [05:26<00:00,  1.34s/it, loss=1.117, lr=2.010e-06, momentum=0.898, multi_label_accuracy=0.705]
2/100 * Epoch (valid): 100% 61/61 [00:24<00:00,  2.46it/s, loss=0.919, multi_label_accuracy=0.803]
[2020-11-28 21:38:45,285] 
2/100 * Epoch 2 (_base): lr=2.010e-06 | momentum=0.8980
2/100 * Epoch 2 (train): loss=1.0547 | lr=1.512e-06 | momentum=0.8985 | multi_label_accuracy=0.7187
2/100 * Epoch 2 (valid): loss=0.9900 | multi_label_accuracy=0.7356
3/100 * Epoch (train): 100% 243/243 [05:

In [None]:
logdir

In [47]:
predictions = np.vstack(list(map(
    lambda x: x["logits"].cpu().numpy(), 
    runner.predict_loader(loader=loaders["valid"], resume=f"logdir/fit/20201128-212612/checkpoints/train.6.pth")
)))

In [48]:
binary_predictions = torch.sigmoid(torch.from_numpy(predictions)) > 0.5

In [49]:
from sklearn.metrics import *
print(classification_report(testing_set[subset].astype(int), binary_predictions, target_names=subset))

                                          precision    recall  f1-score   support

              SpecificationofUse_Disease       0.00      0.00      0.00        30
                        AnatomicalTarget       0.67      1.00      0.81       164
                     AnalysisAndModeling       0.36      0.96      0.53        84
                      Imaging_Ultrasound       0.00      0.00      0.00        32
          AnalysisAndModeling_3DModeling       0.31      0.72      0.44        71
AnatomicalTarget_UpperExtremity_Shoulder       0.00      0.00      0.00        23
                  AnatomicalTarget_Torso       0.00      0.00      0.00        35
     AnatomicalTarget_LowerExtremity_Hip       0.00      0.00      0.00        40
                                 Imaging       0.55      1.00      0.71       133
                           Manufacturing       0.34      0.51      0.41        83
     SpecificationofUse_JointReplacement       0.12      0.23      0.16        44
               

In [None]:
precision    recall  f1-score   support

     SpecificationofUse_JointReplacement       0.21      0.32      0.25        44
                                 Imaging       0.55      1.00      0.71       133
                          SurgicalMethod       0.00      0.00      0.00        40
     Manufacturing_AdditiveManufacturing       0.00      0.00      0.00        38
                      Imaging_Ultrasound       0.00      0.00      0.00        32
                             Imaging_MRI       0.34      0.20      0.26        59
AnatomicalTarget_UpperExtremity_Shoulder       0.00      0.00      0.00        23
              SpecificationofUse_Disease       0.00      0.00      0.00        30
             PersonalizedProduct_Implant       0.51      1.00      0.68       124
                           Manufacturing       0.34      0.90      0.49        83
         AnatomicalTarget_UpperExtremity       0.00      0.00      0.00        31
                     AnalysisAndModeling       0.36      0.96      0.52        84
         AnatomicalTarget_LowerExtremity       0.47      1.00      0.63       113
                      SpecificationofUse       0.34      0.99      0.50        79
                        AnatomicalTarget       0.67      1.00      0.81       164
           PersonalizedProduct_Guide/Jig       0.49      1.00      0.66       120
            AnatomicalTarget_Torso_Spine       0.00      0.00      0.00        21
                              Imaging_CT       0.29      0.31      0.30        59
          AnalysisAndModeling_3DModeling       0.30      0.93      0.46        71
    AnatomicalTarget_LowerExtremity_Knee       0.34      0.78      0.48        82
                  AnatomicalTarget_Torso       0.00      0.00      0.00        35
     AnatomicalTarget_LowerExtremity_Hip       0.00      0.00      0.00        40

                               micro avg       0.43      0.71      0.54      1505
                               macro avg       0.24      0.47      0.31      1505
                            weighted avg       0.36      0.71      0.47      1505
                             samples avg       0.43      0.74      0.52      1505

In [None]:
# Longformer base (claims + abstract)
                                            precision    recall  f1-score   support

                     AnalysisAndModeling       0.35      1.00      0.51        84
                        AnatomicalTarget       0.67      1.00      0.81       164
            AnatomicalTarget_Torso_Spine       0.00      0.00      0.00        21
     AnatomicalTarget_LowerExtremity_Hip       0.00      0.00      0.00        40
                             Imaging_MRI       0.00      0.00      0.00        59
                                 Imaging       0.55      1.00      0.71       133
                           Manufacturing       0.34      0.99      0.50        83
             PersonalizedProduct_Implant       0.51      1.00      0.68       124
              SpecificationofUse_Disease       0.00      0.00      0.00        30
                      SpecificationofUse       0.34      0.89      0.49        79
     SpecificationofUse_JointReplacement       0.00      0.00      0.00        44
                  AnatomicalTarget_Torso       0.00      0.00      0.00        35
         AnatomicalTarget_UpperExtremity       0.00      0.00      0.00        31
                      Imaging_Ultrasound       0.00      0.00      0.00        32
                              Imaging_CT       0.32      0.25      0.28        59
          AnalysisAndModeling_3DModeling       0.28      0.80      0.42        71
                          SurgicalMethod       0.00      0.00      0.00        40
AnatomicalTarget_UpperExtremity_Shoulder       0.00      0.00      0.00        23
    AnatomicalTarget_LowerExtremity_Knee       0.34      1.00      0.51        82
           PersonalizedProduct_Guide/Jig       0.49      1.00      0.66       120
         AnatomicalTarget_LowerExtremity       0.47      1.00      0.63       113
     Manufacturing_AdditiveManufacturing       0.00      0.00      0.00        38

                               micro avg       0.44      0.69      0.54      1505
                               macro avg       0.21      0.45      0.28      1505
                            weighted avg       0.34      0.69      0.45      1505
                             samples avg       0.44      0.73      0.52      1505

In [None]:
    #Albert base w/ 256 length sequences (claims + abstract)                
    
    precision    recall  f1-score   support

         AnatomicalTarget_LowerExtremity       0.47      1.00      0.63       113
     Manufacturing_AdditiveManufacturing       0.67      0.05      0.10        38
                                 Imaging       0.55      1.00      0.71       133
                          SurgicalMethod       0.00      0.00      0.00        40
AnatomicalTarget_UpperExtremity_Shoulder       0.18      0.13      0.15        23
              SpecificationofUse_Disease       0.00      0.00      0.00        30
    AnatomicalTarget_LowerExtremity_Knee       0.45      0.40      0.43        82
                      SpecificationofUse       0.35      0.95      0.52        79
         AnatomicalTarget_UpperExtremity       0.00      0.00      0.00        31
            AnatomicalTarget_Torso_Spine       0.00      0.00      0.00        21
             PersonalizedProduct_Implant       0.51      1.00      0.68       124
                     AnalysisAndModeling       0.38      0.65      0.48        84
          AnalysisAndModeling_3DModeling       0.33      0.68      0.44        71
                  AnatomicalTarget_Torso       0.00      0.00      0.00        35
     SpecificationofUse_JointReplacement       0.18      0.68      0.28        44
                        AnatomicalTarget       0.67      1.00      0.81       164
                           Manufacturing       0.32      0.87      0.47        83
                             Imaging_MRI       0.26      0.15      0.19        59
                      Imaging_Ultrasound       0.20      0.12      0.15        32
                              Imaging_CT       0.32      0.34      0.33        59
     AnatomicalTarget_LowerExtremity_Hip       0.14      0.03      0.04        40
           PersonalizedProduct_Guide/Jig       0.50      1.00      0.66       120

                               micro avg       0.43      0.67      0.52      1505
                               macro avg       0.29      0.46      0.32      1505
                            weighted avg       0.39      0.67      0.47      1505
                             samples avg       0.43      0.70      0.51      1505

In [None]:
hamming_loss(testing_set[subset], binary_predictions)