In [1]:
import json
class obj:
    """Attribute-style view of a dictionary.

    Each key of the dictionary passed to the constructor becomes an instance
    attribute, so ``obj({"lr": 1e-5}).lr`` returns ``1e-5``. The class is
    used as a ``json.loads`` ``object_hook`` to turn the settings dictionary
    into an object with attribute access.
    """

    def __init__(self, dict1):
        """Copy every key/value pair of ``dict1`` onto the instance.

        Args:
            dict1: Mapping (or iterable of key/value pairs) whose entries
                become instance attributes.
        """
        self.__dict__.update(dict1)

    def __repr__(self):
        """Return ``obj(key=value, ...)`` listing every stored attribute.

        Without this, logging the settings object only shows the default
        ``<__main__.obj object at 0x...>`` representation.
        """
        fields = ", ".join(f"{key}={value!r}" for key, value in vars(self).items())
        return f"{type(self).__name__}({fields})"
# Run settings, first collected as a plain dictionary.
args = dict(
    candidates=20,
    device="cpu",
    output_dir="tmp/local",
    # All three splits point at the same development data in this configuration.
    dev_dir="datasets/development/processed_dev",
    train_dir="datasets/development/processed_dev",
    test_dir="datasets/development/processed_dev",
    dictionary_path="datasets/development/dev_dictionary.txt",
    max_length=25,
    model_name_or_path="dmis-lab/biobert-base-cased-v1.1",
    batch_size=16,
    epochs=1,
    loss_fn="sce",
    lr=1e-5,
    contextualized=False,
    similarity_type="log",
)
# Round-trip through JSON so the decoder's object_hook wraps the dictionary in
# `obj`, giving attribute access (args.lr, args.device, ...).
args = json.loads(json.dumps(args), object_hook=obj)
vars(args)

import os
import time
from importlib import reload

import pandas as pd
import torch
from tqdm import tqdm
from transformers import (
    AutoModel,
    AutoTokenizer
)

# Local modules
from src.candidateDataset import CandidateDataset
from src.rerankNet import RerankNet
from src.umls import Umls
import src.utils as utils

  from .autonotebook import tqdm as notebook_tqdm
  assert(sum([len(ixs)!=2 for ixs in token_ixs]), f"Offsets not lining up for mention in {file}")


In [2]:
# Initialize: wall-clock timer, logger, RNG seed, pretrained encoder and tokenizer
start = time.time()
LOGGER = utils.init_logging()
LOGGER.info(args)
utils.init_seed(42)
bert = AutoModel.from_pretrained(args.model_name_or_path).to(args.device)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

# Set loss function: reject unknown names first, then pick between the two supported losses
if args.loss_fn not in ('ce', 'sce'):
    raise Exception(f"Invalid loss function {args.loss_fn}")
loss_fn = utils.cross_entropy if args.loss_fn == 'ce' else utils.similarity_cross_entropy

# Build model
model = RerankNet(
    encoder=bert,
    tokenizer=tokenizer,
    lr=args.lr,
    device=args.device,
)

# Load UMLS data
umls = Umls('umls/processed')
LOGGER.info("UMLS data loaded")

# Load dictionary
dictionary = utils.load_dictionary(args.dictionary_path)
LOGGER.info("Dictionary loaded")

# Load training data
train_mentions = utils.load_mentions(args.train_dir)

# TODO: Make this a configuration option if it works
# Drop training records where:
#     1) The exact mention is mapped to multiple CUIs in the dictionary
#     2) The exact mention is mapped to CUI that is not the gold CUI
# Since encodings are non-contextualized, the network will always predict
# the highest similarity between exact mention matches. These training
# examples will only confuse the training.
name_cuis = utils.load_name_cuis(dictionary)
# The first two columns of each mention row are passed on as (name, cui).
consistent_mask = [
    utils.check_consistent(name_cuis, *name_and_cui)
    for name_and_cui in train_mentions[:, :2]
]
LOGGER.info(f"Dropping {len(train_mentions)-sum(consistent_mask)} out of {len(train_mentions)} training records because of inconsistent exact mappings between annotation and dictionary CUIs")
train_mentions = train_mentions[consistent_mask]

train_set = CandidateDataset(
    train_mentions,
    dictionary,
    model.tokenizer,
    args.max_length,
    args.candidates,
    args.similarity_type,
    umls,
)
train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=args.batch_size,
    shuffle=True,
)

# Load dev data for validation
dev_mentions = utils.load_mentions(args.dev_dir)
LOGGER.info("Mentions loaded")

07/18/2022 02:11:03 PM: [ <__main__.obj object at 0x000002400C5A0310> ]
Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.1 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
07/18/2022 02:11:08 PM: [ UMLS data 

In [3]:
# Training loop
# Collects one row of dev metrics per epoch, indexed by the epoch number.
epoch_results = pd.DataFrame([], columns=['acc@1','acc@5','umls_similarity', 'max_acc@1'])
for epoch in range(args.epochs):
    ############## Candidate Generation ##############
    # Regenerate candidates with the current encoder at the start of every epoch.
    train_candidate_idxs = utils.get_topk_candidates(
        dict_names=list(dictionary[:,0]),
        mentions=train_mentions,
        tokenizer=model.tokenizer,
        encoder=model.encoder,
        max_length=args.max_length,
        device=args.device,
        topk=args.candidates)

    # Add candidates to training dataset
    train_set.set_candidate_idxs(train_candidate_idxs)
    max_acc1 = train_set.max_acc1()
    LOGGER.info('Epoch {}: max possible acc@1 = {}'.format(epoch,max_acc1))

    ###################### Train ######################
    # Train encoder to properly rank candidates
    train_loss = 0
    train_steps = 0
    model.train()
    # tqdm reads the total from len(train_loader); the batch index was never used.
    for batch_x, batch_y in tqdm(train_loader, desc=f'Training epoch {epoch}'):
        model.optimizer.zero_grad()
        batch_pred = model(batch_x)
        loss = loss_fn(batch_pred, batch_y.to(args.device))
        loss.backward()
        model.optimizer.step()
        train_loss += loss.item()
        train_steps += 1

    # The small epsilon keeps the division defined when the loader yields no batches.
    train_loss = train_loss / (train_steps + 1e-9)
    LOGGER.info('Epoch {}: loss/train_per_epoch={}/{}'.format(epoch,train_loss,epoch))

    ################### Checkpoint ###################
    # The test-evaluation cell loads "checkpoint_<epoch>" from args.output_dir via
    # AutoModel/AutoTokenizer.from_pretrained, so each epoch's weights must be
    # written here or that cell fails on a fresh kernel.
    # NOTE(review): assumes model.encoder / model.tokenizer are the Hugging Face
    # objects handed to RerankNet (they expose save_pretrained) - confirm.
    checkpoint_dir = os.path.join(args.output_dir, "checkpoint_{}".format(epoch))
    model.encoder.save_pretrained(checkpoint_dir)
    model.tokenizer.save_pretrained(checkpoint_dir)
    LOGGER.info('Epoch {}: saved checkpoint to {}'.format(epoch, checkpoint_dir))

    #################### Evaluate ####################
    # Get candidates on dev dataset
    dev_candidate_idxs = utils.get_topk_candidates(
        dict_names=list(dictionary[:,0]),
        mentions=dev_mentions,
        tokenizer=model.tokenizer,
        encoder=model.encoder,
        max_length=args.max_length,
        device=args.device,
        topk=5) # Only need top five candidates to evaluate performance

    # Log performance on dev after each epoch
    results = utils.evaluate(dev_mentions, dictionary[dev_candidate_idxs], umls)
    epoch_results.loc[epoch] = (results['acc1'], results['acc5'], results['umls_similarity'], max_acc1)
    LOGGER.info("Epoch {}: acc@1={}".format(epoch,results['acc1']))
    LOGGER.info("Epoch {}: acc@5={}".format(epoch,results['acc5']))
    LOGGER.info("Epoch {}: umls_similarity={}".format(epoch,results['umls_similarity']))

Bulk embedding...: 100%|██████████| 1/1 [00:01<00:00,  1.88s/it]
Bulk embedding...: 100%|██████████| 1/1 [00:00<00:00,  3.35it/s]
07/17/2022 08:32:49 AM: [ Epoch 0: max possible acc@1 = 0.875 ]
Training epoch 0:   0%|          | 0/1 [00:04<?, ?it/s]


NotImplementedError: The operator 'aten::logical_and.out' is not current implemented for the MPS device. If you want this op to be added in priority during the prototype phase of this feature, please comment on https://github.com/pytorch/pytorch/issues/77764. As a temporary fix, you can set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU as a fallback for this op. WARNING: this will be slower than running natively on MPS.

In [6]:
# Evaluate on test data using best training model
start = time.time()
# epoch_results is indexed by epoch number, so idxmax() returns the epoch label
# directly. The float cast is defensive: the frame is created empty, which can
# leave the metric columns with object dtype.
best_epoch = epoch_results.umls_similarity.astype(float).idxmax()
train_model_path = os.path.join(args.output_dir, "checkpoint_{}".format(best_epoch))
LOGGER.info(f'Loading epoch {best_epoch} model from {train_model_path}')

# Load training model
train_bert = AutoModel.from_pretrained(train_model_path).to(args.device)
train_tokenizer = AutoTokenizer.from_pretrained(train_model_path)

# Load test mentions
test_mentions = utils.load_mentions(args.test_dir)

# Predict topk=5 candidates
candidate_idxs = utils.get_topk_candidates(
    dict_names=list(dictionary[:,0]),
    mentions=test_mentions,
    tokenizer=train_tokenizer,
    encoder=train_bert,
    max_length=args.max_length,
    device=args.device,
    topk=5, # Only need top five candidates to evaluate performance
    doc_dir=None) # Update to allow contextualized embeddings

# Log performance
results = utils.evaluate(test_mentions, dictionary[candidate_idxs], umls)
LOGGER.info("Test result: acc@1={}".format(results['acc1']))
LOGGER.info("Test result: acc@5={}".format(results['acc5']))
LOGGER.info("Test result: umls_similarity={}".format(results['umls_similarity']))

LOGGER.info('Prediction time: ' + utils.format_time(start,time.time()))

# Write output
# exist_ok=True replaces the separate existence check and is safe to re-run.
os.makedirs(args.output_dir, exist_ok=True)
output_file = os.path.join(args.output_dir,"predictions_eval.json")
with open(output_file, 'w') as f:
    json.dump(results, f, indent=2)

07/17/2022 07:40:22 AM: [ Loading epoch 0 model from tmp/local/checkpoint_0 ]
100%|██████████| 2/2 [00:00<00:00, 1731.75it/s]
Bulk embedding...: 100%|██████████| 1/1 [00:17<00:00, 17.72s/it]
Bulk embedding...: 100%|██████████| 1/1 [00:04<00:00,  4.73s/it]
07/17/2022 07:41:00 AM: [ Test result: acc@1=0.875 ]
07/17/2022 07:41:00 AM: [ Test result: acc@5=0.875 ]
07/17/2022 07:41:00 AM: [ Test result: umls_similarity=0.9166666666666666 ]
07/17/2022 07:41:00 AM: [ Prediction time: 0 hours 0 minutes 38 seconds ]


In [397]:
# Development helper: re-import the local utils module and reload it so that
# edits to its source file take effect without restarting the kernel.
import src.utils as utils
reload(utils)

  # Check all annotations were fixed


<module 'src.utils' from '/Users/evan/code/thesis/src/utils.py'>

In [2]:
# Load the n2c2 dictionary and training mentions. This rebinds `dictionary` and
# `train_mentions`, replacing whatever an earlier cell loaded under those names.
dictionary = utils.load_dictionary('datasets/n2c2/mrconso_dictionary.txt')
train_mentions = utils.load_mentions('datasets/n2c2/processed_train')

100%|██████████| 996820/996820 [00:00<00:00, 1497267.80it/s]
100%|██████████| 50/50 [00:03<00:00, 15.94it/s]


In [3]:
# Report and drop training mentions that utils.check_consistent rejects against
# the dictionary's name -> CUI lookup.
name_cuis = utils.load_name_cuis(dictionary)
# The first two columns of each mention row are passed on as (name, cui).
consistent_mask = [
    utils.check_consistent(name_cuis, *name_and_cui)
    for name_and_cui in train_mentions[:, :2]
]
print(f"Dropping {len(train_mentions)-sum(consistent_mask)} out of {len(train_mentions)} training records because of inconsistent exact mappings between annotation and dictionary CUIs")
# Keep only the rows whose mask entry is truthy.
train_mentions = train_mentions[consistent_mask]

Dropping 433 out of 6428 training records because of inconsistent exact mappings between annotation and dictionary CUIs


In [None]:
# Hand-entered count tables: one row per (alias, in_cohort) pair, with the count
# stored in the `omicron_84` column. The same layout is used for both groups.
covid_data = [
    ("tinnitus", 0, 775837),
    ("tinnitus", 1, 444),
]
covid = pd.DataFrame(
    data=covid_data,
    columns=["alias", "in_cohort", "omicron_84"],
)

control_data = [
    ("tinnitus", 0, 6221709),
    ("tinnitus", 1, 5596),
]
control = pd.DataFrame(
    data=control_data,
    columns=["alias", "in_cohort", "omicron_84"],
)

# Trailing bare expression: the value a notebook cell would render.
covid

In [None]:
from scipy.stats import chi2_contingency
# Chi-square test per condition, comparing the covid and control count tables.
# One (condition, chi2, p) row is collected per iteration so the summary frame
# covers every condition in the list, not only the last one processed.
condition_stats = []
for condition in ['tinnitus']:
    # Align the two tables on (alias, in_cohort); the suffixes separate the two
    # count columns, which are then renamed to plain 'covid' / 'control'.
    df = pd.merge(
        covid[covid.alias == condition],
        control[control.alias == condition],
        on=['alias', 'in_cohort'],
        suffixes=['covid', 'control'],
    )
    df.columns = ['alias', 'in_cohort', 'covid', 'control']
    display(df)
    # After the transpose, rows are (covid, control) and columns follow the
    # merged rows (one per in_cohort value).
    data = df[['covid', 'control']].values.T
    chi2, p, dof, ex = chi2_contingency(data)
    print('chi2', chi2)
    print('p', p)
    print('ex', ex)
    condition_stats.append((condition, chi2, p))
pd.DataFrame(condition_stats)

In [43]:
from scipy.stats import fisher_exact

# Fisher's exact test on `data`, which is whatever table the chi-square cell's
# loop last assigned; that cell must have been run first.
oddsr, pval = fisher_exact(data)
# Bare tuple so the notebook displays (odds ratio, p-value).
oddsr, pval

(1.5716488844156822, 1.941220769980683e-22)