In [1]:
import json
class obj:
    """Attribute-style view over a dictionary.

    Passed as the ``object_hook`` of ``json.loads`` in this notebook so that
    configuration values can be read as ``args.key`` instead of ``args["key"]``.
    """

    def __init__(self, dict1):
        """Copy every key/value pair of ``dict1`` onto the instance as attributes."""
        self.__dict__.update(dict1)

    def __repr__(self):
        """Render the stored attributes so that logging the object is informative.

        Without this, logging an instance only prints the default
        ``<__main__.obj object at 0x...>`` text.
        """
        fields = ", ".join(f"{key}={value!r}" for key, value in vars(self).items())
        return f"{type(self).__name__}({fields})"
# Experiment configuration. The dictionary is round-tripped through JSON with
# `obj` as the object hook, so the settings are read as attributes (args.device, ...).
args = json.loads(
    json.dumps({
        # Number of candidates retrieved per mention during training
        "candidates": 20,
        # Torch device the encoder and label tensors are moved to
        "device": "mps",
        # Directory used for checkpoints and the prediction file
        "output_dir": "tmp/local",
        # Data locations (dev, train and test all point at the same directory here)
        "dev_dir": "datasets/development/processed_dev",
        "train_dir": "datasets/development/processed_dev",
        "test_dir": "datasets/development/processed_dev",
        "dictionary_path": "datasets/development/dev_dictionary.txt",
        # Tokenisation / encoder settings
        "max_length": 25,
        "model_name_or_path": "dmis-lab/biobert-base-cased-v1.1",
        # Optimisation settings
        "batch_size": 16,
        "epochs": 1,
        # One of: 'nll', 'similarity_nll', 'mse', 'mse5'
        "loss_fn": "similarity_nll",
        "contextualized": False,
        "similarity_type": "log",
    }),
    object_hook=obj,
)
vars(args)

import os
import time
from importlib import reload

import pandas as pd
import torch
from tqdm import tqdm
from transformers import (
    AutoModel,
    AutoTokenizer
)

# Local modules
from src.candidateDataset import CandidateDataset
from src.rerankNet import RerankNet
from src.umls import Umls
import src.utils as utils

  assert(sum([len(ixs)!=2 for ixs in token_ixs]), f"Offsets not lining up for mention in {file}")


In [2]:
# ---- Run setup: timer, logging, seed, pretrained encoder and tokenizer ----
start = time.time()
LOGGER = utils.init_logging()
LOGGER.info(args)
utils.init_seed(42)
bert = AutoModel.from_pretrained(args.model_name_or_path).to(args.device)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

# ---- Loss function ----
# Map the configured name to the attribute in `utils`. The attribute is looked
# up only after the name is validated, so only the selected loss is touched.
loss_fn_attr = {
    'nll': 'marginal_nll',
    'similarity_nll': 'similarity_nll',
    'mse': 'mse_loss',
    'mse5': 'mse5_loss',
}.get(args.loss_fn)
if loss_fn_attr is None:
    raise Exception(f"Invalid loss function {args.loss_fn}")
loss_fn = getattr(utils, loss_fn_attr)

# ---- Model ----
model = RerankNet(encoder=bert, tokenizer=tokenizer, device=args.device)

# ---- UMLS data ----
umls = Umls('umls/processed')
LOGGER.info("UMLS data loaded")

# ---- Dictionary ----
dictionary = utils.load_dictionary(args.dictionary_path)
LOGGER.info("Dictionary loaded")

# ---- Training data ----
train_mentions = utils.load_mentions(args.train_dir)
train_set = CandidateDataset(
    train_mentions,
    dictionary,
    model.tokenizer,
    args.max_length,
    args.candidates,
    args.similarity_type,
    umls,
)
train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=args.batch_size,
    shuffle=True,
)

# ---- Dev data (used for validation after each epoch) ----
dev_mentions = utils.load_mentions(args.dev_dir)
LOGGER.info("Mentions loaded")

07/17/2022 08:32:36 AM: [ <__main__.obj object at 0x105a05ac0> ]
Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.1 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
07/17/2022 08:32:40 AM: [ UMLS data loaded 

In [3]:
# Training loop
def _retrieve_candidates(mentions, topk):
    """Fetch the top-`topk` dictionary candidates for `mentions` with the model's current encoder."""
    return utils.get_topk_candidates(
        dict_names=list(dictionary[:,0]),
        mentions=mentions,
        tokenizer=model.tokenizer,
        encoder=model.encoder,
        max_length=args.max_length,
        device=args.device,
        topk=topk)


epoch_results = pd.DataFrame([], columns=['acc@1','acc@5','umls_similarity', 'max_acc@1'])
for epoch in range(args.epochs):
    # ---------------- Candidate generation ----------------
    train_candidate_idxs = _retrieve_candidates(train_mentions, topk=args.candidates)

    # Attach the fresh candidates to the training dataset
    train_set.set_candidate_idxs(train_candidate_idxs)
    max_acc1 = train_set.max_acc1()
    LOGGER.info('Epoch {}: max possible acc@1 = {}'.format(epoch,max_acc1))

    # ------------------------ Train ------------------------
    # Teach the encoder to rank the retrieved candidates
    model.train()
    batch_losses = []
    progress = tqdm(train_loader, total=len(train_loader), desc=f'Training epoch {epoch}')
    for batch_x, batch_y in progress:
        model.optimizer.zero_grad()
        batch_pred = model(batch_x)
        loss = loss_fn(batch_pred, batch_y.to(args.device))
        loss.backward()
        model.optimizer.step()
        batch_losses.append(loss.item())

    # Mean loss over the epoch; the epsilon keeps the division defined when no batch ran
    train_steps = len(batch_losses)
    train_loss = sum(batch_losses) / (train_steps + 1e-9)
    LOGGER.info('Epoch {}: loss/train_per_epoch={}/{}'.format(epoch,train_loss,epoch))

    # ----------------------- Evaluate -----------------------
    # Only the top five candidates are needed to score dev performance
    dev_candidate_idxs = _retrieve_candidates(dev_mentions, topk=5)

    # Record and log dev performance for this epoch
    results = utils.evaluate(dev_mentions, dictionary[dev_candidate_idxs], umls)
    epoch_results.loc[epoch] = (results['acc1'], results['acc5'], results['umls_similarity'], max_acc1)
    LOGGER.info("Epoch {}: acc@1={}".format(epoch,results['acc1']))
    LOGGER.info("Epoch {}: acc@5={}".format(epoch,results['acc5']))
    LOGGER.info("Epoch {}: umls_similarity={}".format(epoch,results['umls_similarity']))

Bulk embedding...: 100%|██████████| 1/1 [00:01<00:00,  1.88s/it]
Bulk embedding...: 100%|██████████| 1/1 [00:00<00:00,  3.35it/s]
07/17/2022 08:32:49 AM: [ Epoch 0: max possible acc@1 = 0.875 ]
Training epoch 0:   0%|          | 0/1 [00:04<?, ?it/s]


NotImplementedError: The operator 'aten::logical_and.out' is not current implemented for the MPS device. If you want this op to be added in priority during the prototype phase of this feature, please comment on https://github.com/pytorch/pytorch/issues/77764. As a temporary fix, you can set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU as a fallback for this op. WARNING: this will be slower than running natively on MPS.

In [6]:
# Evaluate on test data using best training model
start = time.time()

# Pick the epoch with the highest dev UMLS similarity. `argmax` is positional;
# rows are added with the epoch number (0, 1, ...) as label, so position == epoch.
best_epoch = epoch_results.umls_similarity.argmax()
train_model_path = os.path.join(args.output_dir, "checkpoint_{}".format(best_epoch))
LOGGER.info(f'Loading epoch {best_epoch} model from {train_model_path}')
# NOTE(review): no cell visible in this notebook writes `checkpoint_<epoch>`;
# confirm where the checkpoint is saved before relying on a fresh run.

# Load training model
train_bert = AutoModel.from_pretrained(train_model_path).to(args.device)
train_tokenizer = AutoTokenizer.from_pretrained(train_model_path)

# Load test mentions
test_mentions = utils.load_mentions(args.test_dir)

# Predict topk=5 candidates
candidate_idxs = utils.get_topk_candidates(
        dict_names=list(dictionary[:,0]), 
        mentions=test_mentions, 
        tokenizer=train_tokenizer, 
        encoder=train_bert, 
        max_length=args.max_length, 
        device=args.device, 
        topk=5, # Only need top five candidates to evaluate performance
        doc_dir=None) # Update to allow contextualized embeddings

# Log performance
results = utils.evaluate(test_mentions, dictionary[candidate_idxs], umls)
LOGGER.info("Test result: acc@1={}".format(results['acc1']))
LOGGER.info("Test result: acc@5={}".format(results['acc5']))
LOGGER.info("Test result: umls_similarity={}".format(results['umls_similarity']))

LOGGER.info('Prediction time: ' + utils.format_time(start,time.time()))

# Write output. `exist_ok=True` creates the directory when missing and is a
# no-op when it already exists, replacing the separate existence check.
os.makedirs(args.output_dir, exist_ok=True)
output_file = os.path.join(args.output_dir,"predictions_eval.json")
with open(output_file, 'w') as f:
    json.dump(results, f, indent=2)

07/17/2022 07:40:22 AM: [ Loading epoch 0 model from tmp/local/checkpoint_0 ]
100%|██████████| 2/2 [00:00<00:00, 1731.75it/s]
Bulk embedding...: 100%|██████████| 1/1 [00:17<00:00, 17.72s/it]
Bulk embedding...: 100%|██████████| 1/1 [00:04<00:00,  4.73s/it]
07/17/2022 07:41:00 AM: [ Test result: acc@1=0.875 ]
07/17/2022 07:41:00 AM: [ Test result: acc@5=0.875 ]
07/17/2022 07:41:00 AM: [ Test result: umls_similarity=0.9166666666666666 ]
07/17/2022 07:41:00 AM: [ Prediction time: 0 hours 0 minutes 38 seconds ]


In [397]:
# Re-import the local utils module and reload it in place so that edits made to
# its source file are picked up without restarting the kernel.
import src.utils as utils
reload(utils)

  # Check all annotations were fixed


<module 'src.utils' from '/Users/evan/code/thesis/src/utils.py'>

In [4]:
# Capture the prediction and label tensors assigned in the training loop
# (`batch_pred`, `batch_y`) as inputs for experimenting with loss functions.
score, target = batch_pred, batch_y.to(args.device)

def similarity_nll(score, target):
    """Negative log of the predicted similarity relative to the best achievable similarity.

    Args:
        score: Candidate scores; softmax is taken over the last dimension.
        target: Similarity of each candidate, same shape as ``score``.

    Returns:
        Scalar tensor: the mean over rows of ``-log(predicted / best)``, where
        ``predicted`` is the softmax-weighted similarity of the row and ``best``
        is the largest similarity among the row's candidates.
    """
    # Probability assigned to every candidate
    candidate_probs = score.softmax(dim=-1)

    # Probability-weighted similarity per row, and the best similarity on offer
    weighted = (candidate_probs * target).sum(dim=1)
    best = target.max(dim=1).values

    # Keep both strictly positive so the ratio and the log stay finite even
    # when every candidate in a row has zero similarity
    floor, ceiling = 1e-9, 1
    weighted = weighted.clamp(min=floor, max=ceiling)
    best = best.clamp(min=floor, max=ceiling)

    # Score against the best achievable similarity rather than against 1
    return -(weighted / best).log().mean()
# Evaluate the notebook-local loss on the captured score/target tensors
similarity_nll(score, target)


  nonzero_finite_vals = torch.masked_select(tensor_view, torch.isfinite(tensor_view) & tensor_view.ne(0))


tensor(2.0818, device='mps:0', grad_fn=<NegBackward0>)

In [67]:
# NOTE(review): in the visible cells `pred_similarity` is only assigned inside
# similarity_nll, so this cell presumably relies on a global left over from a
# cell that is no longer in the notebook — confirm before re-running.
print(pred_similarity)
-torch.log(pred_similarity).mean()

tensor([0.0211, 0.1000, 0.0500, 0.0500, 0.0500, 0.1500, 0.4000, 1.0000],
       device='mps:0', grad_fn=<DivBackward0>)


tensor(2.2454, device='mps:0', grad_fn=<NegBackward0>)

In [54]:
# Evaluate the marginal NLL loss from utils on the global score/target tensors
utils.marginal_nll(score, target)

  assert(sum([len(ixs)!=2 for ixs in token_ixs]), f"Offsets not lining up for mention in {file}")
  assert(sum([len(ixs)!=2 for ixs in token_ixs]), f"Offsets not lining up for mention in {file}")


NotImplementedError: The operator 'aten::index.Tensor' is not current implemented for the MPS device. If you want this op to be added in priority during the prototype phase of this feature, please comment on https://github.com/pytorch/pytorch/issues/77764. As a temporary fix, you can set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU as a fallback for this op. WARNING: this will be slower than running natively on MPS.

In [8]:
def mse_loss(score, target):
    """MSE between the best available candidate similarity and the similarity of the top prediction.

    Args:
        score: (mentions, candidates) float tensor of candidate scores.
        target: (mentions, candidates) float tensor with the similarity of each candidate.

    Returns:
        0-dim tensor with the mean squared error between, for each mention,
        the similarity of the highest-scoring candidate and the maximum
        similarity among its candidates.

    The forward value is the hard top-1 similarity. Selecting it with ``argmax``
    is not differentiable and reads its values from ``target`` only, so by
    itself it gives no gradient with respect to ``score``. A straight-through
    estimator is therefore used: the value comes from the hard selection, the
    gradient from the softmax-weighted similarity.
    """
    # Hard selection: similarity of the highest-scoring candidate per mention.
    # squeeze(1) removes only the gathered column, so a batch holding a single
    # mention keeps its shape (1,) instead of collapsing to a 0-dim tensor.
    top_ixs = score.argmax(dim=1, keepdim=True)
    hard_similarity = torch.gather(target, 1, top_ixs).squeeze(1)

    # Differentiable surrogate: expected similarity under the softmax over candidates
    soft_similarity = torch.sum(torch.softmax(score, dim=-1) * target, dim=1)

    # Straight-through: numerically equal to the hard value, with the gradient of the surrogate
    predicted_similarity = soft_similarity + (hard_similarity - soft_similarity).detach()

    # Find max similarity for each mention of the available candidates
    expected_similarity = torch.max(target, dim=1).values
    return torch.nn.functional.mse_loss(predicted_similarity, expected_similarity)

# Evaluate the loss on the batch tensors assigned in the training loop
mse_loss(batch_pred, batch_y.to(args.device))

tensor(0.1250, device='mps:0', grad_fn=<MseLossBackward0>)

In [141]:
# Step-by-step version of the MSE loss on the tensors assigned in the training loop.
score, target = batch_pred, batch_y.to(args.device)
# Index of the highest-scoring candidate per row (integer indices, not differentiable)
pred_ixs = score.argmax(dim=1)
# The gathered values come from `target`, not `score`; requires_grad_() then
# marks the result itself as requiring gradients.
predicted_similarity = torch.gather(target, 1, pred_ixs.unsqueeze_(dim=1)).squeeze().requires_grad_()
predicted_similarity # was it correct?

# Largest similarity among each row's candidates
expected_similarity = torch.max(target, dim=1).values # was the correct CUI available?
torch.nn.functional.mse_loss(expected_similarity, predicted_similarity)

tensor(0.2500, grad_fn=<MseLossBackward0>)

In [None]:
"""
The network fails to train when using the following loss function.
The network is able to successfully train when using other loss functions, so 
this function appears to be the most likely point of failure.
"""

def mse_loss(score, target):
    """
    Calculates MSE loss between max similarity of the candidates and similarity of top prediction.

    Inputs:
        score: torch.Size([<mentions>, <candidates>]) Float tensor resulting from matrix multiplication of mention and dictionary embeddings
        target: torch.Size([<mentions>, <candidates>]) Similarity score (0,1] between candidates and gold CUIs for each mention.

    Returns:
        0-dim tensor with the mean squared error over mentions.

    Why training failed with the earlier form of this function (the symptom
    described in the note preceding this definition): the predicted similarity
    was gathered from `target` at the `argmax` of `score` and then marked with
    `requires_grad_()`. `argmax` is not differentiable and the gathered values
    do not depend on `score` through any differentiable operation, so
    `backward()` only filled the gradient of that new leaf tensor and never
    reached `score` or the model parameters. The version below keeps the same
    forward value but takes its gradient from the softmax-weighted similarity
    (straight-through estimator).
    """
    # Find the similarity score between the top prediction and gold CUI for each mention.
    # squeeze(1) removes only the gathered column, so a batch holding a single
    # mention keeps its shape (1,) instead of collapsing to a 0-dim tensor.
    top_ixs = score.argmax(dim=1, keepdim=True)
    hard_similarity = torch.gather(target, 1, top_ixs).squeeze(1)

    # Differentiable surrogate: expected similarity under the softmax over candidates
    soft_similarity = torch.sum(torch.softmax(score, dim=-1) * target, dim=1)

    # Straight-through: numerically equal to the hard value, with the gradient of the surrogate
    predicted_similarity = soft_similarity + (hard_similarity - soft_similarity).detach()

    # Find max similarity score for each mention of the available candidates
    expected_similarity = torch.max(target, dim=1).values

    # Calculate MSE between the similarity score from the top prediction and the highest candidate similarity
    return torch.nn.functional.mse_loss(predicted_similarity, expected_similarity)