In [2]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

import torch
from torch import nn

from transformers import BertTokenizer
from transformers import BertModel
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from catalyst import dl
from catalyst import dl, utils

import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging

import fastai

In [3]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    return np.mean(acc_list)

In [4]:
from util import *

In [5]:
training_set = pd.read_json("training_set.json.gz", lines=True, orient="records")
testing_set = pd.read_json("testing_set.json.gz", lines=True, orient="records")

In [6]:
subset = list(set(all_tiers_100)-set(["PersonalizedProduct"]))
subset

['AnatomicalTarget_Torso',
 'Manufacturing',
 'Imaging_Ultrasound',
 'AnatomicalTarget',
 'PersonalizedProduct_Guide/Jig',
 'AnalysisAndModeling_3DModeling',
 'Manufacturing_AdditiveManufacturing',
 'AnatomicalTarget_Torso_Spine',
 'AnatomicalTarget_LowerExtremity_Hip',
 'PersonalizedProduct_Implant',
 'Imaging_CT',
 'AnatomicalTarget_LowerExtremity_Knee',
 'SpecificationofUse',
 'AnalysisAndModeling',
 'AnatomicalTarget_UpperExtremity',
 'Imaging',
 'SpecificationofUse_Disease',
 'SpecificationofUse_JointReplacement',
 'AnatomicalTarget_UpperExtremity_Shoulder',
 'SurgicalMethod',
 'AnatomicalTarget_LowerExtremity',
 'Imaging_MRI']

In [7]:
training_set['labels']=training_set[subset].astype(int).values.tolist()
testing_set['labels']=testing_set[subset].astype(int).values.tolist()

In [8]:
# training_set['label'] = training_set.AnalysisAndModeling.astype(int)
# testing_set['label'] = testing_set.AnalysisAndModeling.astype(int)

In [9]:
#training_set.label

In [10]:
#training_data = Dataset.from_pandas(training_set, split="training")
#testing_data = Dataset.from_pandas(testing_set, split="testing")

In [11]:
MAX_LEN = 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
EPOCHS = 100
LEARNING_RATE = 1e-5
SEED = 17
PRED_THRES = 0.4
ACCUM_STEPS = 4

In [12]:
#model_name = "allenai/longformer-base-4096"
model_name = "albert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [13]:
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        
        self.claims = dataframe.claims
        self.abstracts = dataframe.abstract
        
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def tokenize(self, text, prefix=""):
        text = str(text)
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            f"input_ids": torch.tensor(ids, dtype=torch.long),
            f"attention_mask": torch.tensor(mask, dtype=torch.long),
            f"token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
        }
    
            
    def __getitem__(self, index):
        
        abstract = self.tokenize(self.abstracts[index], prefix="abstract_")
        claims = self.tokenize(self.claims[index], prefix="claims_")
        
        return {"abstract": abstract, "claims": claims, 'targets': torch.tensor(self.targets[index], dtype=torch.float)}

In [14]:
training_dataset = MultiLabelDataset(training_set, tokenizer, MAX_LEN)
testing_dataset = MultiLabelDataset(testing_set, tokenizer, MAX_LEN)

In [14]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

data = fastai.DataLoaders

training_loader = DataLoader(training_dataset, **train_params)
testing_loader = DataLoader(testing_dataset, **test_params)

In [15]:
loaders = {"train": training_loader, "valid": testing_loader}

In [16]:
base_model = AutoModel.from_pretrained(model_name)#, gradient_checkpointing=True)

In [17]:
base_model

AlbertModel(
  (embeddings): AlbertEmbeddings(
    (word_embeddings): Embedding(30000, 128, padding_idx=0)
    (position_embeddings): Embedding(512, 128)
    (token_type_embeddings): Embedding(2, 128)
    (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0, inplace=False)
  )
  (encoder): AlbertTransformer(
    (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
    (albert_layer_groups): ModuleList(
      (0): AlbertLayerGroup(
        (albert_layers): ModuleList(
          (0): AlbertLayer(
            (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (attention): AlbertAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (attention_dropout): Dropout(p=0, inplace=False)
      

In [18]:
base_model.pooler.out_features

768

In [19]:
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = AutoModel.from_pretrained(model_name)
        self.pre_classifier = torch.nn.Linear(base_model.pooler.out_features*2, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, len(subset))

    def forward(self, abstract, claims ):
        abstract_emb = self.l1(input_ids=abstract["input_ids"], attention_mask=abstract["attention_mask"])
        abstract_emb = abstract_emb[0][:, 0]
        claim_emb = self.l1(input_ids=claims["input_ids"], attention_mask=claims["attention_mask"])
        claim_emb = claim_emb[0][:, 0]
        
        text_emb = torch.cat((abstract_emb, claim_emb), 1)
        
        pooler = self.pre_classifier(text_emb)
        pooler = torch.nn.Sigmoid()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

model = Model()

In [20]:
device = utils.get_device()

In [21]:
from datetime import datetime
logdir="logdir/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")

In [22]:
%load_ext tensorboard

In [23]:
%tensorboard --logdir logdir/fit/

Reusing TensorBoard on port 6006 (pid 23398), started 0:00:54 ago. (Use '!kill 23398' to kill it.)

In [24]:
import catalyst.contrib as contrib

In [25]:
#criterion = torch.nn.BCEWithLogitsLoss()
criterion = contrib.nn.criterion.LovaszLossMultiLabel()
optimizer = torch.optim.AdamW(params =  model.parameters(), lr=LEARNING_RATE)
#scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [2])
#scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
#lrfinder = dl.LRFinder(final_lr=1)

runner = dl.SupervisedRunner(input_key=("abstract", "claims"))
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    #scheduler=scheduler,
    loaders=loaders,
    logdir=logdir,
    num_epochs=EPOCHS,
    callbacks=[
               dl.MultiLabelAccuracyCallback(threshold=PRED_THRES, activation="None"),
               dl.EarlyStoppingCallback(patience=2, metric="loss", minimize=True),
               #dl.TensorboardLogger(),
               #dl.CheckpointCallback(),
               dl.OptimizerCallback(accumulation_steps=ACCUM_STEPS),
               dl.ValidationManagerCallback()],
               #lrfinder],
               #dl.MetricManagerCallback(num_classes=len(subset), ),
    
    fp16=True,
    verbose=True
)

1/100 * Epoch (train):   0% 0/122 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


1/100 * Epoch (train): 100% 122/122 [01:04<00:00,  1.89it/s, loss=1.052, multi_label_accuracy=0.773]
1/100 * Epoch (valid): 100% 31/31 [00:07<00:00,  3.99it/s, loss=0.991, multi_label_accuracy=0.788]
[2020-11-25 09:18:50,020] 
1/100 * Epoch 1 (_base): lr=1.000e-05 | momentum=0.9000
1/100 * Epoch 1 (train): loss=1.0735 | multi_label_accuracy=0.7140
1/100 * Epoch 1 (valid): loss=1.0081 | multi_label_accuracy=0.7402
2/100 * Epoch (train): 100% 122/122 [01:07<00:00,  1.80it/s, loss=1.063, multi_label_accuracy=0.670]
2/100 * Epoch (valid): 100% 31/31 [00:08<00:00,  3.68it/s, loss=0.981, multi_label_accuracy=0.773]
[2020-11-25 09:20:06,553] 
2/100 * Epoch 2 (_base): lr=1.000e-05 | momentum=0.9000
2/100 * Epoch 2 (train): loss=1.0491 | multi_label_accuracy=0.7322
2/100 * Epoch 2 (valid): loss=1.0025 | multi_label_accuracy=0.7403
3/100 * Epoch (train): 100% 122/122 [01:07<00:00,  1.81it/s, loss=0.999, multi_label_accuracy=0.750]
3/100 * Epoch (valid): 100% 31/31 [00:08<00:00,  3.86it/s, loss=0

In [27]:
predictions = np.vstack(list(map(
    lambda x: x["logits"].cpu().numpy(), 
    runner.predict_loader(loader=loaders["valid"], resume=f"logdir/fit/20201125-091731/checkpoints/train.2.pth")
)))

In [28]:
testing_set[subset].head()

Unnamed: 0,AnatomicalTarget_UpperExtremity,AnalysisAndModeling,Manufacturing,AnalysisAndModeling_3DModeling,SpecificationofUse_Disease,AnatomicalTarget_LowerExtremity_Knee,AnatomicalTarget_LowerExtremity_Hip,PersonalizedProduct_Implant,PersonalizedProduct_Guide/Jig,AnatomicalTarget,...,AnatomicalTarget_Torso_Spine,AnatomicalTarget_LowerExtremity,Imaging,SpecificationofUse_JointReplacement,Imaging_Ultrasound,SurgicalMethod,SpecificationofUse,Imaging_MRI,Manufacturing_AdditiveManufacturing,AnatomicalTarget_UpperExtremity_Shoulder
0,False,True,True,False,False,True,False,True,False,True,...,False,True,False,False,False,False,False,False,False,False
1,False,True,False,True,False,False,True,False,True,True,...,False,True,True,False,False,False,False,False,False,False
2,False,False,False,False,False,True,False,False,True,True,...,False,True,False,False,False,True,False,False,False,False
3,False,False,False,False,True,False,False,True,False,False,...,False,False,True,False,True,False,True,True,False,False
4,False,True,True,False,False,False,True,True,False,True,...,False,True,True,False,False,False,False,True,False,False


In [29]:
binary_predictions = torch.sigmoid(torch.from_numpy(predictions)) > 0.5

In [30]:
from sklearn.metrics import *
print(classification_report(testing_set[subset].astype(int), binary_predictions, target_names=subset))

                                          precision    recall  f1-score   support

         AnatomicalTarget_UpperExtremity       0.00      0.00      0.00        31
                     AnalysisAndModeling       0.35      0.98      0.51        84
                           Manufacturing       0.34      1.00      0.51        83
          AnalysisAndModeling_3DModeling       0.31      0.87      0.46        71
              SpecificationofUse_Disease       0.00      0.00      0.00        30
    AnatomicalTarget_LowerExtremity_Knee       0.34      0.73      0.47        82
     AnatomicalTarget_LowerExtremity_Hip       0.20      0.25      0.22        40
             PersonalizedProduct_Implant       0.51      1.00      0.68       124
           PersonalizedProduct_Guide/Jig       0.49      1.00      0.66       120
                        AnatomicalTarget       0.67      1.00      0.81       164
                              Imaging_CT       0.24      0.98      0.39        59
               

In [None]:
hamming_loss(testing_set[subset], binary_predictions)