## **ACL-BioNLP'19 - MEDIQA 2019 Shared Task**
Authors: Gonzalo Recio and Jana Reventós 

## Models 

Question object and Questions & Answer class:

In [24]:
from scipy.stats import spearmanr
from sklearn.metrics import accuracy_score
from xml.dom.minidom import parse, parseString
from nltk import tokenize as tk
import nltk
import numpy as np
import unicodedata
import re
import os
import csv

class Question(object):
    def __init__(self, q_id, q, a_ids, a, r, s, l):
        self.question_id = q_id
        self.question = q
        self.answer_ids = a_ids
        self.answers = a
        self.reference_rank = r
        self.system_rank = s
        self.labels = l
    
    def __str__(self):
        return f"{self.question}\n  {self.answers}\n  {self.reference_rank}\n  {self.system_rank}\n  {self.labels}"
    
    def __repr__(self):
        return str(self)

class QuestionsAndAnswers(list):
    def __init__(self, dataset='Train'):
        ''' dataset = {Train,Test,Validation} '''
        list.__init__(self)
        self.PATH = 'MEDIQA2019_datasets/MEDIQA_Task3_QA/'
        p = self.read_dataset(dataset)
        self.extend(self.read_dataset(dataset))
        
        self.references = [np.array(q.reference_rank) for q in self]
        self.labels = [np.array(q.labels) for q in self]


    def preprocess_text(self, text):
        s = unicodedata.normalize("NFKD", text.lower())
        return re.sub(r'\[\d\]', '', s)

    def get_answers(self, answers):
        # return np.array((map(lambda ans: preprocess_text(ans.getElementsByTagName('AnswerText')[0].firstChild.nodeValue), answers)))
        answs, answs_ids, rank, chiqa, y = [], [], [], [], []
        for answer in answers:
            ans = self.preprocess_text(answer.getElementsByTagName('AnswerText')[0].firstChild.nodeValue)
            a_id = answer.getAttribute('AID')
            reference = int(answer.getAttribute('ReferenceRank'))
            system = int(answer.getAttribute('SystemRank'))
            label = answer.getAttribute('ReferenceScore')
            answs.append(ans); answs_ids.append(a_id); rank.append(reference); chiqa.append(system); y.append(int(label in ['3','4']))
        return answs, answs_ids, rank, chiqa, y
    
    def get_system_ranks(self):
          return [q.system_rank for q in self]
        
    def get_reference_ranks(self):
          return [q.reference_rank for q in self]
        
    def get_labels(self):
          return [q.labels for q in self]

    def read_dataset(self, dataset='Train'):
        i = 0
        indx2id = []
        QA, QA2 = [], []  # QA2 has also system ranks from ChiQA
        if dataset == 'Test': dataset = 'TestSet-wLabels'
        for filename in os.listdir(self.PATH):
            if not filename.endswith('.xml') or dataset not in filename: continue
            tree = parse(self.PATH + filename)
            questions = tree.getElementsByTagName('Question')
            for question in questions:
                qelem = question.getElementsByTagName('QuestionText')
                q, q_id = self.preprocess_text(qelem[0].firstChild.nodeValue), question.getAttribute('QID')
                # print(q) # --> questions
                answers = question.getElementsByTagName('Answer')
                answers_list, a_ids, rank, system, labels = self.get_answers(answers)
                QA.append([q,answers_list, rank, labels])
                question = Question(q_id=q_id, q=q, a_ids=a_ids, a=answers_list, r=rank, s=system, l=labels)
                # QA2.append([q,answers_list, rank, system, labels])
                QA2.append(question)
                indx2id.append(q_id); i+=1;
                # break
        return QA2

    def output_predictions(self, predictions, labels, file=''):
        assert len(predictions) == len(self)
        print('question_id,answer_id,label')
        with open(f'task3/sample_submission_round_2_{file}.csv', mode='w') as csv_file:
            for i, p in enumerate(predictions):
                q_id = self[i].question_id
                answers = self[i].answer_ids
                assert len(p) == len(answers), f'{len(p)} != {len(answers)}'
                # order = np.array(a)[np.argsort(p)]
                p = self.normalize_sequence(p)
                order = np.array(answers)[np.argsort(p)]
                # order = np.array(answers)[np.array(p)-1]
                lab = labels[i]
                ordered_lab = np.array(lab)[np.argsort(p)]
                if file == '':
                    
                    for a_id, l in zip(order,ordered_lab):
                        print(f"{q_id},{a_id},{int(l)}")
                else:
                    for a_id, l in zip(order,ordered_lab):
                        csv_file.write(f"{q_id},{a_id},{int(l)}\n")
            
    def normalize_sequence(self, seq):
        seq = np.array(seq)
        a = np.argsort(seq)
        seq[a] = list(range(1,len(seq)+1))
        return seq

    def accuracy(self, predictions):
        
        '''
        Compute accuracy incorrect answers (label values of 0) of the model predictions  
        '''
        
        # Model predictions:
        preds = np.concatenate(predictions)
        # preds = np.concatenate(predictions==0)
        # idx_preds_incorr_answ = [if predictions [i] == 0for i in range(len(pred))]
        
        # Ground truth labels:
        true  = np.concatenate(self.labels) 
        #labels_true_incorrect_answ = [self.labels[index] for index in idx_preds_incorr_answ]
        #true  = np.concatenate(labels_true_incorrect_answ) 
        
        assert len(preds) == len(true), f"{len(preds)}, {len(true)}"
        return accuracy_score(true, preds)

    def precision(self, predictions):
        '''
        Number of correct ranked answers divided by the total number of retrieved answers for an specific question.
        '''
        precisions = []
        num_answers = []
        for i in range(len(predictions)):
            labels = self.labels[i]
            p = self.normalize_sequence([x for j,x in enumerate(predictions[i]) if labels[j]==1])
            r = self.normalize_sequence([x for j,x in enumerate(self.references[i]) if labels[j]==1])
            if len(p) == 0:
                print(predictions[i])
            correct = sum([a == b for a,b in zip(p, r)])
            # for a,b in zip(p, r)
            # num_answers.append(len(p))
            precisions.append(correct/len(p))
        return np.mean(precisions)
        # return np.average(np.array(precisions), weights=num_answers)

    def mean_spearmanr(self, predictions):
        '''
        Penalizes the differences (d) on predicted rank and true rank
        '''
        assert len(predictions) == len(self.references)
        count, total = 0, 0
        preds, refs = [], []
        for i in range(len(predictions)):
            labels = self.labels[i]
            assert len(predictions[i]) == len(labels), f"{predictions}, {labels}"
            p = [x for j,x in enumerate(predictions[i]) if labels[j]==1]
            r = [x for j,x in enumerate(self.references[i]) if labels[j]==1]
            preds += p; refs += r
            if len(r) == 1:
                total += 1
                count += 1
            elif len(r) == 0:
                continue
            else:
                total += 1
                count += spearmanr(p, r)[0]
        return spearmanr(preds, refs)[0]
        # return count/total

    def mean_reciprocal_rank(self, predicted):
        '''
        Evaluates any process that produces a list of possible responses to a sample of queries, ordered by probability of correctness
        '''
        rs = []
        for k, (a, b) in enumerate(zip(predicted, self.references)):
            res = np.array(a)[np.argsort(b)]
            labels = self[k].labels
            res = [r if labels[i]==1 else 100 for i,r in enumerate(res)]
            rs.append([int(i==min(res)) for i in res])  # sets 1 in first ranked answer
        rs = (np.asarray(r).nonzero()[0] for r in rs)
        return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])
    
    

In [25]:
QA = QuestionsAndAnswers(dataset = 'Train') 
QA_val = QuestionsAndAnswers(dataset = 'Validation')
QA_test = QuestionsAndAnswers(dataset = 'Test') 

In [26]:
system_ranks = QA_test.get_system_ranks()
reference_ranks = [q.reference_rank for q in QA_test]
labels = [q.labels for q in QA_test]
system_labels = [np.ones(len(l)) for l in labels]

In [27]:
# QA.output_predictions(reference_ranks, labels)
# QA.output_predictions(system_ranks, system_labels)

In [28]:
QA_test.output_predictions(system_ranks, system_labels, file='test2')

question_id,answer_id,label


In [29]:
import evaluator

def evaluate(filename):
    for task in [3]:
        print(f"Testing Task (Round-2) : {task}")
        answer_file_path = f"task{task}/ground_truth_round_2.csv"
        _client_payload = {}
        _client_payload["submission_file_path"] = f"task{task}/sample_submission_round_2_{filename}.csv"

        # Instaiate a dummy context
        _context = {}
        # Instantiate an evaluator
        aicrowd_evaluator = evaluator.MediqaEvaluator(answer_file_path, task=task, round=2)
        # Evaluate
        result = aicrowd_evaluator._evaluate(_client_payload, _context)
        print(result)

evaluate('test2')

Testing Task (Round-2) : 3
Ground truth: task3/ground_truth_round_2.csv
Submission file: task3/sample_submission_round_2_test2.csv
{'score_acc': 0.5167118337850045, 'score_secondary_spearman': 0.3149635036496349, 'meta': {'MRR': 0.895, 'Precision': 0.5167118337850045}}


Baseline

Testing Task (Round-2) : 3
{'score_acc': 0.5167118337850045, 'score_secondary_spearman': 0.3149635036496349, 'meta': {'MRR': 0.895, 'Precision': 0.5167118337850045}}

# Models

#### BioBERT

In [44]:
import torch
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModel


# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
# % matplotlib inline

# Load pre-trained model tokenizer (vocabulary)
# tokenizer = BertTokenizer.from_pretrained("dmis-lab/biobert-large-cased-v1.1")
# tokenizer = BertTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
tokenizer = BertTokenizer.from_pretrained("biobert/biobert_v1.1_pubmed")
#tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")


# model = BertModel.from_pretrained('dmis-lab/biobert-large-cased-v1.1',
#                                   output_hidden_states = True, # Whether the model returns all hidden-states.
#                                   )

model = BertModel.from_pretrained("biobert/biobert_v1.1_pubmed")

OSError: Can't load config for 'biobert/biobert_v1.1_pubmed'. Make sure that:

- 'biobert/biobert_v1.1_pubmed' is a correct model identifier listed on 'https://huggingface.co/models'

- or 'biobert/biobert_v1.1_pubmed' is the correct path to a directory containing a config.json file



In [32]:
def get_bert_sentence_embedding(sentence):
    marked_text = "[CLS] " + sentence + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)

    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)

        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        CLS = outputs[0][0]
        hidden_states = outputs[2]

    token_embeddings = torch.stack(hidden_states, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    # print(len(hidden_states.shape))
    token_vecs = hidden_states[-2][0]

    # Calculate the average of all n token vectors.
    sentence_embedding = torch.mean(token_vecs, dim=0)


    return sentence_embedding

def get_CLS(sentence):
    marked_text = "[CLS] " + sentence + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)

    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    outputs = model(tokens_tensor, segments_tensors)
    CLS = outputs[0][:,0,:]
    return CLS


def get_full_sentence_embedding(sentence):
    embeddings = []
    e = 0
    max_size = 1024#512
    for i in range(int(len(sentence)/max_size)+1):
#         print(i, max_size*(i+1), len(sentence)/max_size)
#         e = get_bert_sentence_embedding(sentence[i*max_size:max_size*(i+1)])
        e = get_CLS(sentence[i*max_size:max_size*(i+1)])
#         print(e)
        embeddings.append(e)
    embedding = torch.mean(torch.stack(embeddings), dim=0)
    print(embedding)
    return embedding

In [36]:
K = 7
q = get_bert_sentence_embedding(QA[K].question)
ans = [get_full_sentence_embedding(a) for a in QA[K].answers] # 512 is the maximum length

NameError: name 'model' is not defined

In [34]:
from scipy.spatial.distance import cosine
print('Label,Rank,Similarity')
for i,a in enumerate(ans):
    sim = 1-cosine(q.detach(), a.detach())
    print(QA.labels[K][i], QA.references[K][i], sim)

Label,Rank,Similarity


NameError: name 'ans' is not defined

In [35]:
marked_text = "[CLS] " + QA[K].question + " [SEP]"
tokenized_text = tokenizer.tokenize(marked_text, add_special_tokens=True)
print(tokenized_text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
segments_ids = [1] * len(tokenized_text)
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
a = model(tokens_tensor)

NameError: name 'K' is not defined

In [None]:
a[0][:,0,:][0]

In [None]:
# train_features, test_features, train_labels, test_labels = train_test_split(features, labels)
# lr_clf = LogisticRegression()
# lr_clf.fit(train_features, train_labels)

#### Train and test Datasets

In [37]:
flatten   = lambda t: [item for sublist in t for item in sublist]

sentences = [[q.question, a]  for q in QA for a in q.answers]
labels    = flatten([q.labels for q in QA])

sentences_val = [[q.question, a]  for q in QA_val for a in q.answers]
labels_val    = flatten([q.labels for q in QA_val]) 

sentences_test = [[q.question, a]  for q in QA_test for a in q.answers]
labels_test    = flatten([q.labels for q in QA_test]) 

In [38]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

BATCH_SIZE = 8
BATCH_SIZE_TEST = 64
max_len_seq = 512

class MEDIQA_Dataset(Dataset):
    def __init__(self, X, y, transform=None):
        self.X = []
        self.y = np.array(y)
        for q, a in X:
            _q = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("[CLS] question " + q + " [SEP]"))[:max_len_seq]
            _q += [0]*(max_len_seq-len(_q))
            _a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("[CLS] answer " + a + " [SEP]"))[:max_len_seq]
            _a += [0]*(max_len_seq-len(_a))
            self.X.append([_q, _a])
        self.X = np.array(self.X)
        
    def __len__(self):
        return self.X.shape[0]
    
    def __getitem__(self, index):
        score = torch.FloatTensor([self.y[index]])
    
        q = torch.LongTensor(self.X[index][0])
        a = torch.LongTensor(self.X[index][1])
        
        return score, q, a

# Create train dataset
train_dataset = MEDIQA_Dataset(X=sentences, y=labels)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Create validation dataset
val_dataset = MEDIQA_Dataset(X=sentences_val, y=labels_val)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
# Create test dataset
test_dataset = MEDIQA_Dataset(X=sentences_test, y=labels_test)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE_TEST, shuffle=False)

In [None]:
# len(sentences)
# max_len_seq = 0
# lenghts = []
# for q,a in sentences:
#     lenghts.append(len(a))
#     if len(a)>max_len_seq:
#         max_len_seq = len(a)
# max_len_seq
# sorted(lenghts, reverse=True)

In [39]:
import torch.nn as nn
from torch.nn import functional as F

class MEDIQA_Model(nn.Module):
    def __init__(self):
        super(MEDIQA_Model, self).__init__()
        self.bert = BertModel.from_pretrained('biobert/biobert_v1.1_pubmed')
        #self.bert = AutoModel.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')
        modules = [self.bert.embeddings, *self.bert.encoder.layer[:-3]] #Replace 5 by what you want
        for module in modules:
            for param in module.parameters():
                param.requires_grad = False
        self.linear1 = nn.Linear(2*self.bert.config.hidden_size, 512)
        self.linear2 = nn.Linear(512, 128)
        self.linear3 = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()


    def forward(self, q, a):
        
#         _, pooled_output = self.bert(tokens, output_all=False)
#         print(q.shape, a.shape)

        CLS1 = self.get_CLS(q)
#       q_vect = self.get_sentence_vectors(q)

        CLS2 = self.get_CLS(a)
#       a_vect = self.get_sentence_vectors(a)
#       print('CLS:', CLS1.shape, CLS2.shape)
#       print('Q vect:', q_vect.shape,a_vect.shape)
        x = torch.cat([CLS1, CLS2], dim=1)
#         print('concat:', x.shape)
        x = F.dropout(x, 0.1)
        x = self.linear1(x)
        x = nn.LeakyReLU(0.1)(x)
        x = F.dropout(x, 0.2)
        x = self.linear2(x)
        x = nn.LeakyReLU(0.1)(x)
        x = F.dropout(x, 0.2)
        x = self.linear3(x)
        prob = self.sigmoid(x)
        return prob, CLS1, CLS2
        #return prob, q_vect, a_vect



    def get_CLS(self, indexed_tokens):
        # Map the token strings to their vocabulary indeces.
#         indexed_tokens = tokenizer.convert_tokens_to_ids(tokeniEARLY_STOPPINGtext)
#         segments_ids = [1] * len(indexed_tokens)
        tokens_tensor = indexed_tokens
#         segments_tensors = torch.tensor([segments_ids])
        outputs = self.bert(tokens_tensor)
        CLS = outputs[0][:,0,:]
        return CLS
    
    def get_sentence_vectors(self, indexed_tokens):
       
        # Token tensor of the sentence       
        tokens_tensor = indexed_tokens
        
        # Model
        hidden_states = self.bert(tokens_tensor)
        
        # Token vectors from BERT
        token_vecs = hidden_states[-2][0]
        
        # Calculate the average of all token vectors.
        sentence_embedding = torch.mean(token_vecs, dim=0)
        #sentence_embedding.size()
        return sentence_embedding
    

    def get_full_sentence_embedding(self, sentence):
        embeddings = []
        e = 0
        max_size = 1024#512
        for i in range(int(len(sentence)/max_size)+1):
    #         print(i, max_size*(i+1), len(sentence)/max_size)
    #         e = get_bert_sentence_embedding(sentence[i*max_size:max_size*(i+1)])
            e = self.get_CLS(sentence[i*max_size:max_size*(i+1)])
    #         print(e)
            embeddings.append(e)
        embedding = torch.mean(torch.stack(embeddings), dim=0)
        print(embedding)
        return embedding


In [10]:
import torch
cpu = torch.device('cpu')
cuda = torch.device('cuda')
device = cuda if torch.cuda.is_available() else cpu

In [11]:
# del bert_clf
# torch.cuda.empty_cache()

In [12]:
import gc
gc.collect()

702

In [13]:
print(torch.cuda.memory_allocated())
print(torch.cuda.max_memory_allocated())

AssertionError: Torch not compiled with CUDA enabled

In [15]:
bert_clf = MEDIQA_Model()
bert_clf = bert_clf.to(device)

NameError: name 'BertModel' is not defined

In [None]:
# len(bert_clf.bert.encoder.layer)
# bert_clf.bert.config.hidden_size

In [None]:
from sklearn.metrics import accuracy_score
def get_test_acc(model, data_loader, true_labels, return_probs_and_labels=False):
    model.eval()
    pred_probs = []
    with torch.no_grad():
        for s,q,a in tqdm.tqdm(data_loader):
            logits, _, _ = model(q.to(device),a.to(device))
            pred_probs.extend(logits.to('cpu'))
        pred_probs  = np.array([x.item() for x in pred_probs])
        pred_labels = (pred_probs > 0.5).astype(np.int16)
    acc = accuracy_score(true_labels, pred_labels)
    if return_probs_and_labels:
        return acc, pred_probs, pred_labels
    else:
        return acc
    

In [None]:
import tqdm
# bert_clf = MEDIQA_Model()
# bert_clf = bert_clf.to(device)
optimizer = torch.optim.Adam(bert_clf.parameters(), lr=8e-5)
bert_clf.train()
EPOCHS = 200
EARLY_STOPPING = 3
loss_func = nn.BCELoss()
def ranking_loss(x1, x2, y):
    def dist(a,b):
        cos = nn.CosineSimilarity(dim=1)
        return 1-cos(a,b)
    margin = torch.tensor(0.5, requires_grad=True).to(device)
    loss = y*dist(x1,x2) + (1-y)*torch.max(torch.tensor(0.0, requires_grad=True).to(device), margin - dist(x1,x2))
    return torch.mean(loss)
train_losses, test_losses, val_losses, val_accs, test_accs = [], [], [], [], []
for epoch_num in range(EPOCHS):
    losses = []
    for step_num, batch_data in enumerate(train_loader):
        y_true, questions, answers = batch_data #tuple(t.to(device) for t in batch_data)
        if questions.shape != answers.shape: continue
        logits, CLS1, CLS2 = bert_clf(questions.to(device), answers.to(device))
        # logits, q_vec, a_vect = bert_clf(questions.to(device), answers.to(device))
        # loss = loss_func(logits, y_true.to(device))
        loss = 0.5*ranking_loss(CLS1, CLS2, y_true.to(device)) + 0.5*loss_func(logits, y_true.to(device))

        
        bert_clf.zero_grad()
        loss.backward()
        print('step', loss.item(), end="\r")
        losses.append(loss.item())
        optimizer.step()
        del y_true
        del questions
        del answers
        torch.cuda.empty_cache()
    
    print()
    print(f'Epoch {epoch_num+1} mean loss:', np.mean(losses))
    train_losses.append(np.mean(losses))
    val_acc, val_probs_labels, _   = get_test_acc(bert_clf, val_loader,  labels_val,  return_probs_and_labels=True)
    test_acc, test_probs_labels, _ = get_test_acc(bert_clf, test_loader, labels_test, return_probs_and_labels=True)
    val_accs.append(val_acc)
    test_accs.append(test_acc)
    
    val_loss  = loss_func(torch.from_numpy(val_probs_labels),  torch.from_numpy(np.array(labels_val, dtype=np.double))).item()
    test_loss = loss_func(torch.from_numpy(test_probs_labels), torch.from_numpy(np.array(labels_test, dtype=np.double))).item()
    val_losses.append(val_loss)
    test_losses.append(test_loss)
    
    print(f'Val acc: ', val_acc, '   Val loss: ', val_loss)
    print(f'Test acc:', test_acc, '  Test loss:', test_loss)
    print()
    
    if len(test_accs) <= 1 or test_acc > max(test_accs[:-1]):
        torch.save(bert_clf.state_dict(), 'checkpoints/model')
    if len(test_accs) > EARLY_STOPPING and test_accs[-(EARLY_STOPPING+1)] > max(test_accs[-EARLY_STOPPING:]):
        print('Early stopping')
        # recover best execution
        model = MEDIQA_Model()
        model.load_state_dict(torch.load('checkpoints/model'))
        break

In [None]:
import matplotlib.pyplot as plt

plt.plot(train_losses, label='train loss')
plt.plot(test_losses, label='test loss')
plt.plot(test_accs, label='test acc')
plt.plot(val_losses, label='val loss')
plt.plot(val_accs, label='val acc')
plt.legend()
plt.savefig('figures/biobert_loss')
plt.show()

In [None]:
# Save model
torch.save(model.state_dict(), 'models/mediqa_model_clinicalbert_finetune_0_layer')

In [None]:
# Load model
# model = MEDIQA_Model()
# model.load_state_dict(torch.load(PATH))
# model.eval()
model = MEDIQA_Model()
model.load_state_dict(torch.load('checkpoints/model'))


In [None]:
acc, probs, y_pred = get_test_acc(model.to(device), test_loader, labels_test, return_probs_and_labels=True)

In [None]:
def get_ranking_predictions(probs, y):
    rankings  = []
    entailed = []
    i_start  = 0
    for i, q in enumerate(QA_test):
        rankings.append(1- np.array(probs[i_start:i_start+len(q.answers)]))
        entailed.append(y[i_start:i_start+len(q.answers)])
        i_start += len(q.answers)
        assert len(rankings[i] == len(QA_test[i].answer_ids))
        assert len(entailed[i] == len(QA_test[i].answers))
    return rankings, entailed

In [None]:
ranking_pred, labels_pred = get_ranking_predictions(probs, y_pred)
QA_test.output_predictions(ranking_pred, labels_pred, file='test_biobert2')

In [None]:
evaluate('test_biobert2')

In [None]:
bert_clf.eval()
pred_labels = []
bert_clf.to(device)
with torch.no_grad():
    for s,q,a in tqdm.tqdm(test_loader):
        logits = bert_clf(q.to(device),a.to(device))
        pred_labels.extend(logits.to('cpu'))
    pred_labels = np.array([x.item() for x in pred_labels])
    pred = (pred_labels > 0.5).astype(np.int16)
pred

In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(labels_test, pred)

In [None]:
torch.cat([torch.tensor([1,2,3]), torch.tensor([1,2,3])], dim=0)

In [None]:
torch.tensor([1,2,3]).shape

In [None]:
for b in train_loader:
    print(b)

In [None]:
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine

cpu = torch.device('cpu')
cuda = torch.device('cuda')
device = cuda if torch.cuda.is_available() else cpu

model_names = [
    'bert-base-nli-stsb-mean-tokens',
    'bert-large-nli-stsb-mean-tokens',
    'roberta-base-nli-stsb-mean-tokens',
    'roberta-large-nli-stsb-mean-tokens',
    'distilbert-base-nli-stsb-mean-tokens'
]

for name in model_names:
    model = SentenceTransformer(name)
    model = model.to(cuda)

    representations_a = []
    representations_b = []

    with torch.no_grad():
        for (sent_a, sent_b) in tqdm(test_sentences, desc='Embedding Sentences', ncols=800):
            sentences_embeddings = model.encode([sent_a, sent_b])
            representations_a.append(sentences_embeddings[0])
            representations_b.append(sentences_embeddings[1])

    obtained_scores = []
    for idx, (repr_a, repr_b) in enumerate(zip(representations_a, representations_b)):
        score = 1 - cosine(repr_a, repr_b)
        obtained_scores.append(score)

    corr_score = pearsonr(test_scores[:len(obtained_scores)], obtained_scores)[0]
    print(f'{name}: {corr_score}')

#### BioELMo

In this section we will use BioELMo which is a biomedical version of embeddings from language model (ELMo), pre-trained on PubMed abstracts. 
Pre-training uses 10M recent PubMed abstracts (2.46B tokens in total), and BioELMo achieves an averaged forward and backward perplexity of 31.37 on a held-out test set. 
BioELMo encodes biomedical entity-type and relational information pretty well as shown in the paper [Probing Biomedical Embeddings from Language Models](https://arxiv.org/abs/1904.02181)

BioELMo is used as the same way as ELMO. Source code:
https://docs.allennlp.org/v1.0.0rc5/tutorials/how_to/elmo/
https://github.com/allenai/allennlp/blob/main/allennlp/modules/elmo.py


In [2]:
! pip freeze grep dataclasses
! pip uninstall -y dataclasses

alabaster==0.7.12
allennlp==1.3.0
anaconda-client==1.7.2
anaconda-navigator==1.10.0
anaconda-project==0.8.3
applaunchservices==0.2.1
appnope @ file:///opt/concourse/worker/volumes/live/0291c9e1-4b15-459f-623e-2770f55be269/volume/appnope_1594338395037/work
appscript @ file:///opt/concourse/worker/volumes/live/50ca4c96-3090-40bb-6981-3a6114ed0af4/volume/appscript_1594840187551/work
argh==0.26.2
argon2-cffi @ file:///opt/concourse/worker/volumes/live/59af29ac-4890-416e-7ab7-794f8d6f7ecd/volume/argon2-cffi_1596828548321/work
asn1crypto @ file:///tmp/build/80754af9/asn1crypto_1596577642040/work
astroid @ file:///opt/concourse/worker/volumes/live/21fd14a9-2a7e-484b-7394-5a9912cdcf80/volume/astroid_1592498459180/work
astropy==4.0.2
async-generator==1.10
atomicwrites==1.4.0
attrs @ file:///tmp/build/80754af9/attrs_1604765588209/work
autopep8 @ file:///tmp/build/80754af9/autopep8_1596578164842/work
Babel @ file:///tmp/build/80754af9/babel_1605108370292/work
backcall==0.2.0
ba

In [14]:
! pip install allennlp --upgrade

Requirement already up-to-date: allennlp in /Users/jreventos/opt/anaconda3/lib/python3.8/site-packages (1.3.0)


In [17]:
import sys
print(sys.version)

3.7.4 (v3.7.4:e09359112e, Jul  8 2019, 14:54:52) 
[Clang 6.0 (clang-600.0.57)]


In [18]:
import torch
from allennlp.allennlp.modules.elmo import Elmo, batch_to_ids


ModuleNotFoundError: No module named 'allennlp.version'

In [12]:
cpu = torch.device('cpu')
cuda = torch.device('cuda')
device = cuda if torch.cuda.is_available() else cpu

In [13]:
from allennlp.modules.elmo import Elmo
# elmo = Elmo(
#     options_file='https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json', 
#     weight_file='https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5',
#     num_output_representations=3,
#     dropout=0
    
# )

bioelmo = Elmo(
    options_file='bioelmo/biomed_elmo_options.json', 
    weight_file='bioelmo/biomed_elmo_weights.hdf5',
    num_output_representations=3,
    dropout=0
)
bioelmo = bioelmo.to(device)

ModuleNotFoundError: No module named 'allennlp.modules'

In [None]:
sentences = [['First', 'sentence', '.'], ['Another', '.'], ["I", "ate", "a", "carrot", "for", "breakfast"]]
character_ids = batch_to_ids(sentences).to(device)
embeddings = bioelmo(character_ids)
character_ids

In [None]:
def get_elmo_embedding(sentence):
    tokens = nltk.word_tokenize(sentence)
#     print(tokens)
    sentences = [tokens]
    character_ids = batch_to_ids(sentences).to(device)
    return bioelmo(character_ids)['elmo_representations'][2].mean(dim=0).mean(dim=0)
    

In [None]:
embeddings['elmo_representations'][0].mean(dim=0).mean(dim=0)

In [None]:
K = 0
q = get_elmo_embedding(QA[K].question)
ans = [get_elmo_embedding(a) for a in QA[K].answers]

In [None]:
from scipy.spatial.distance import cosine
print('Label,Rank,Similarity')
for i,a in enumerate(ans):
    sim = 1-cosine(q.detach().cpu(), a.detach().cpu())
    print(QA.labels[K][i], QA.references[K][i], sim)

In [None]:
QA[0].labels

In [None]:
flatten   = lambda t: [item for sublist in t for item in sublist]

sentences = [[q.question, a]  for q in QA for a in q.answers]
labels    = flatten([q.labels for q in QA])

sentences_val = [[q.question, a]  for q in QA_val for a in q.answers]
labels_val    = flatten([q.labels for q in QA_val])

sentences_test = [[q.question, a]  for q in QA_test for a in q.answers]
labels_test    = flatten([q.labels for q in QA_test]) 

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

BATCH_SIZE = 8
BATCH_SIZE_TEST = 64
max_len_seq = 512

class MEDIQA_Dataset2(Dataset):
    def __init__(self, X, y, transform=None):
        self.X = []
        self.y = np.array(y)
        for q, a in X:
            _q = batch_to_ids([nltk.word_tokenize(q)])
            _a = batch_to_ids([nltk.word_tokenize(a)])
            self.X.append([_q, _a])
#         self.X = np.array(self.X)
        
    def __len__(self):
#         return self.X.shape[0]
        return len(self.X)
    
    def __getitem__(self, index):
        score = torch.FloatTensor([self.y[index]])
    
        q = torch.LongTensor(self.X[index][0])
        a = torch.LongTensor(self.X[index][1])
        
        return score, q, a

# Create train dataset
train_dataset = MEDIQA_Dataset2(X=sentences, y=labels)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Create train dataset
val_dataset = MEDIQA_Dataset2(X=sentences_val, y=labels_val)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Create test dataset
test_dataset = MEDIQA_Dataset2(X=sentences_test, y=labels_test)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE_TEST, shuffle=False)

In [None]:
import torch.nn as nn
from torch.nn import functional as F

class MEDIQA_Model_bioELMo(nn.Module):
    def __init__(self):
        super(MEDIQA_Model_bioELMo, self).__init__()
        self.bioelmo = Elmo(
            options_file='bioelmo/biomed_elmo_options.json', 
            weight_file='bioelmo/biomed_elmo_weights.hdf5',
            num_output_representations=3,
            dropout=0
        )
        for param in self.bioelmo.parameters():
            param.requires_grad = False
#         self.bioelmo = bioelmo.to(device)
#         modules = [self.bert.embeddings, *self.bert.encoder.layer[:-1]] #Replace 5 by what you want
#         for module in modules:
#             for param in module.parameters():
#                 param.requires_grad = False
        self.linear1 = nn.Linear(2*1024, 128)
        self.linear2 = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()


    def forward(self, q, a):
        
#         _, pooled_output = self.bert(tokens, output_all=False)
#         print(q.shape, a.shape)
        q_emb = self.get_elmo_embedding(q)
        a_emb = self.get_elmo_embedding(a)
#         print('CLS:', CLS1.shape, CLS2.shape)
        x = torch.cat([q_emb, a_emb], dim=0)
#         print('concat:', x.shape)
        x = F.dropout(x, 0.2)
        x = self.linear1(x)
        x = nn.LeakyReLU(0.1)(x)
        x = F.dropout(x, 0.1)
        x = self.linear2(x)
        prob = self.sigmoid(x)
        return prob, q_emb, a_emb


    def get_elmo_embedding(self, sentence):
#         tokens = nltk.word_tokenize(sentence)
#     print(tokens)
#         sentences = [tokens]
#         character_ids = batch_to_ids(sentence).to(device)
        return self.bioelmo(sentence)['elmo_representations'][2].mean(dim=0).mean(dim=0)

In [None]:
bioelmo_clf = MEDIQA_Model_bioELMo()
bioelmo_clf = bioelmo_clf.to(device)

In [None]:
from sklearn.metrics import accuracy_score
def get_test_acc(model, return_probs_and_labels=False):
    model.eval()
    pred_probs = []
    with torch.no_grad():
        for s,(q,a) in tqdm.tqdm(zip(test_dataset.y, test_dataset.X)):
            logits, _, _ = model(q.to(device),a.to(device))
            pred_probs.extend(logits.to('cpu'))
        pred_probs  = np.array([x.item() for x in pred_probs])
        pred_labels = (pred_probs > 0.5).astype(np.int16)
    acc = accuracy_score(labels_test, pred_labels)
    if return_probs_and_labels:
        return acc, pred_probs, pred_labels
    else:
        return acc

In [None]:
# Train
def batch(iterable, n=1):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]

import tqdm
import random
# bert_clf = MEDIQA_Model()
# bert_clf = bert_clf.to(device)
optimizer = torch.optim.Adam(bioelmo_clf.parameters(), lr=1e-3)
bioelmo_clf.train()
EPOCHS = 100
loss_func = nn.BCELoss()

train_losses, test_losses, test_acc = [], [], []
N = len(train_dataset.y)
for epoch_num in range(EPOCHS):
    losses = []
    
    for step_num, batch_data in enumerate(list(zip(train_dataset.y, train_dataset.X))):
        y_true, (questions, answers) = batch_data #tuple(t.to(device) for t in batch_data)
        logits, CLS1, CLS2 = bioelmo_clf(questions.to(device), answers.to(device))
        y_true = torch.from_numpy(np.array([y_true], dtype=np.float32))
        loss = loss_func(logits, y_true.to(device))
        
        bioelmo_clf.zero_grad()
        loss.backward()
        print(f'step {step_num}/{N}', loss.item(), end="\r")
        losses.append(loss.item())
        optimizer.step()
        del y_true
        del questions
        del answers
        torch.cuda.empty_cache()
    print()
    print(f'Epoch {epoch_num+1}:', np.mean(losses))
    train_losses.append(np.mean(losses))
#     acc = get_test_acc(bioelmo_clf)
    acc, probs_labels, _ = get_test_acc(bioelmo_clf, return_probs_and_labels=True)
    test_acc.append(acc)
    test_loss = loss_func(torch.from_numpy(probs_labels), torch.from_numpy(np.array(labels_test, dtype=np.double))).item()
    test_losses.append(test_loss)
    test_acc.append(acc)
    print(f'Test acc:', acc, '  Test loss:', test_loss)
    print()
    
    if len(test_acc) > 5 and test_acc[-6] > max(test_acc[-5:]):
        print('Early stopping')
        break