In [192]:
from scipy.stats import spearmanr
from sklearn.metrics import accuracy_score
from xml.dom.minidom import parse, parseString
from nltk import tokenize as tk
import nltk
import numpy as np
import unicodedata
import re
import os
import csv

class Question(object):
    def __init__(self, q_id, q, a_ids, a, r, s, l):
        self.question_id = q_id
        self.question = q
        self.answer_ids = a_ids
        self.answers = a
        self.reference_rank = r
        self.system_rank = s
        self.labels = l
    
    def __str__(self):
        return f"{self.question}\n  {self.answers}\n  {self.reference_rank}\n  {self.system_rank}\n  {self.labels}"
    
    def __repr__(self):
        return str(self)

class QuestionsAndAnswers(list):
    def __init__(self, dataset='Train'):
        ''' dataset = {Train,Test,Validation} '''
        list.__init__(self)
        self.PATH = 'MEDIQA2019_datasets/MEDIQA_Task3_QA/'
        p = self.read_dataset(dataset)
        self.extend(self.read_dataset(dataset))
        
        self.references = [np.array(q.reference_rank) for q in self]
        self.labels = [np.array(q.labels) for q in self]


    def preprocess_text(self, text):
        s = unicodedata.normalize("NFKD", text.lower())
        return re.sub(r'\[\d\]', '', s)

    def get_answers(self, answers):
        # return np.array((map(lambda ans: preprocess_text(ans.getElementsByTagName('AnswerText')[0].firstChild.nodeValue), answers)))
        answs, answs_ids, rank, chiqa, y = [], [], [], [], []
        for answer in answers:
            ans = self.preprocess_text(answer.getElementsByTagName('AnswerText')[0].firstChild.nodeValue)
            a_id = answer.getAttribute('AID')
            reference = int(answer.getAttribute('ReferenceRank'))
            system = int(answer.getAttribute('SystemRank'))
            label = answer.getAttribute('ReferenceScore')
            answs.append(ans); answs_ids.append(a_id); rank.append(reference); chiqa.append(system); y.append(int(label in ['3','4']))
        return answs, answs_ids, rank, chiqa, y
    
    def get_system_ranks(self):
          return [q.system_rank for q in self]
        
    def get_reference_ranks(self):
          return [q.reference_rank for q in self]
        
    def get_labels(self):
          return [q.labels for q in self]

    def read_dataset(self, dataset='Train'):
        i = 0
        indx2id = []
        QA, QA2 = [], []  # QA2 has also system ranks from ChiQA
        if dataset == 'Test': dataset = 'TestSet-wLabels'
        for filename in os.listdir(self.PATH):
            if not filename.endswith('.xml') or dataset not in filename: continue
            tree = parse(self.PATH + filename)
            questions = tree.getElementsByTagName('Question')
            for question in questions:
                qelem = question.getElementsByTagName('QuestionText')
                q, q_id = self.preprocess_text(qelem[0].firstChild.nodeValue), question.getAttribute('QID')
                # print(q) # --> questions
                answers = question.getElementsByTagName('Answer')
                answers_list, a_ids, rank, system, labels = self.get_answers(answers)
                QA.append([q,answers_list, rank, labels])
                question = Question(q_id=q_id, q=q, a_ids=a_ids, a=answers_list, r=rank, s=system, l=labels)
                # QA2.append([q,answers_list, rank, system, labels])
                QA2.append(question)
                indx2id.append(q_id); i+=1;
                # break
        return QA2

    def output_predictions(self, predictions, labels, file=''):
        assert len(predictions) == len(self)
        print('question_id,answer_id,label')
        with open(f'task3/sample_submission_round_2_{file}.csv', mode='w') as csv_file:
            for i, p in enumerate(predictions):
                q_id = QA[i].question_id
                answers = QA[i].answer_ids
                # order = np.array(a)[np.argsort(p)]
                order = np.array(answers)[np.argsort(p)]
                # order = np.array(answers)[np.array(p)-1]
                lab = labels[i]
                ordered_lab = np.array(lab)[np.argsort(p)]
                if file == '':
                    
                    for a_id, l in zip(order,ordered_lab):
                        print(f"{q_id},{a_id},{int(l)}")
                else:
                    for a_id, l in zip(order,ordered_lab):
                        csv_file.write(f"{q_id},{a_id},{int(l)}\n")
            
    def normalize_sequence(self, seq):
        seq = np.array(seq)
        a = np.argsort(seq)
        seq[a] = list(range(1,len(seq)+1))
        return seq

    def accuracy(self, predictions):
        preds = np.concatenate(predictions)
        true  = np.concatenate(self.labels) 
        assert len(preds) == len(true), f"{len(preds)}, {len(true)}"
        return accuracy_score(true, preds)

    def precision(self, predictions):
        precisions = []
        num_answers = []
        for i in range(len(predictions)):
            labels = self.labels[i]
            p = self.normalize_sequence([x for j,x in enumerate(predictions[i]) if labels[j]==1])
            r = self.normalize_sequence([x for j,x in enumerate(self.references[i]) if labels[j]==1])
            if len(p) == 0:
                print(predictions[i])
            correct = sum([a == b for a,b in zip(p, r)])
            # for a,b in zip(p, r)
            # num_answers.append(len(p))
            precisions.append(correct/len(p))
        return np.mean(precisions)
        # return np.average(np.array(precisions), weights=num_answers)

    def mean_spearmanr(self, predictions):
        assert len(predictions) == len(self.references)
        count, total = 0, 0
        preds, refs = [], []
        for i in range(len(predictions)):
            labels = self.labels[i]
            assert len(predictions[i]) == len(labels), f"{predictions}, {labels}"
            p = [x for j,x in enumerate(predictions[i]) if labels[j]==1]
            r = [x for j,x in enumerate(self.references[i]) if labels[j]==1]
            preds += p; refs += r
            if len(r) == 1:
                total += 1
                count += 1
            elif len(r) == 0:
                continue
            else:
                total += 1
                count += spearmanr(p, r)[0]
        return spearmanr(preds, refs)[0]
        # return count/total

    def mean_reciprocal_rank(self, predicted):
        rs = []
        for k, (a, b) in enumerate(zip(predicted, self.references)):
            res = np.array(a)[np.argsort(b)]
            labels = QA[k].labels
            res = [r if labels[i]==1 else 100 for i,r in enumerate(res)]
            rs.append([int(i==min(res)) for i in res])  # sets 1 in first ranked answer
        rs = (np.asarray(r).nonzero()[0] for r in rs)
        return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])
    
    

In [86]:
QA = QuestionsAndAnswers(dataset = 'Test') 

In [87]:
system_ranks = QA.get_system_ranks()
reference_ranks = [q.reference_rank for q in QA]
labels = [q.labels for q in QA]
system_labels = [np.ones(len(l)) for l in labels]

In [88]:
# QA.output_predictions(reference_ranks, labels)
# QA.output_predictions(system_ranks, system_labels)

In [89]:
QA.output_predictions(system_ranks, system_labels, file='test2')

question_id,answer_id,label


In [90]:
import evaluator

for task in [3]:
    print("Testing Task (Round-2) : {}".format(task))
    answer_file_path = "task{}/ground_truth_round_2.csv".format(task)
    _client_payload = {}
    _client_payload["submission_file_path"] = "task{}/sample_submission_round_2_test2.csv".format(task)

    # Instaiate a dummy context
    _context = {}
    # Instantiate an evaluator
    aicrowd_evaluator = evaluator.MediqaEvaluator(answer_file_path, task=task, round=2)
    # Evaluate
    result = aicrowd_evaluator._evaluate(_client_payload, _context)
    print(result)

Testing Task (Round-2) : 3
Ground truth: task3/ground_truth_round_2.csv
Submission file: task3/sample_submission_round_2_test2.csv
{'score_acc': 0.5167118337850045, 'score_secondary_spearman': 0.3149635036496349, 'meta': {'MRR': 0.895, 'Precision': 0.5167118337850045}}


Baseline

Testing Task (Round-2) : 3
{'score_acc': 0.5167118337850045, 'score_secondary_spearman': 0.3149635036496349, 'meta': {'MRR': 0.895, 'Precision': 0.5167118337850045}}

# Models

#### BioBERT

In [91]:
import torch
from transformers import BertTokenizer, BertModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
# % matplotlib inline

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained("dmis-lab/biobert-large-cased-v1.1")
model = BertModel.from_pretrained('dmis-lab/biobert-large-cased-v1.1',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

In [210]:
def get_bert_sentence_embedding(sentence):
    marked_text = "[CLS] " + sentence + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)

    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)

        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        hidden_states = outputs[2]

    token_embeddings = torch.stack(hidden_states, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    # print(len(hidden_states.shape))
    token_vecs = hidden_states[-2][0]

    # Calculate the average of all n token vectors.
    sentence_embedding = torch.mean(token_vecs, dim=0)


    return sentence_embedding

def get_full_sentence_embedding(sentence):
    embeddings = []
    e = 0
    max_size = 1024#512
    for i in range(int(len(sentence)/max_size)+1):
#         print(i, max_size*(i+1), len(sentence)/max_size)
        e = get_bert_sentence_embedding(sentence[i*max_size:max_size*(i+1)])
#         print(e)
        embeddings.append(e)
    embedding = torch.mean(torch.stack(embeddings), dim=0)
    print(embedding)
    return embedding

In [211]:
K = 7
q = get_bert_sentence_embedding(QA[K].question)
ans = [get_full_sentence_embedding(a) for a in QA[K].answers] # 512 is the maximum length

tensor([ 0.0133,  0.2374,  0.0653,  ..., -0.0812,  0.2875,  0.0144])
tensor([0.1322, 0.2805, 0.0596,  ..., 0.1509, 0.1216, 0.2607])
tensor([-0.0759,  0.2243,  0.0005,  ..., -0.0341,  0.0171,  0.0987])
tensor([-0.0166,  0.0915,  0.0515,  ...,  0.1757,  0.0220,  0.1903])
tensor([-0.0189,  0.0753,  0.0633,  ...,  0.0618,  0.0875,  0.1337])
tensor([0.0053, 0.1166, 0.0481,  ..., 0.0850, 0.0591, 0.1093])
tensor([-0.0180,  0.2393,  0.1346,  ...,  0.0838,  0.1778,  0.2939])
tensor([ 0.0107,  0.2480,  0.1055,  ..., -0.1089,  0.2255,  0.2621])


In [212]:
from scipy.spatial.distance import cosine
print('Label,Rank,Similarity')
for i,a in enumerate(ans):
    sim = 1-cosine(q, a)
    print(QA.labels[K][i], QA.references[K][i], sim)

Label,Rank,Similarity
1 2 0.96265709400177
0 8 0.9442784786224365
1 1 0.9374319911003113
0 7 0.940800130367279
0 6 0.9403217434883118
1 4 0.9442769289016724
0 5 0.9455589056015015
1 3 0.9560166001319885


#### BioELMo

https://docs.allennlp.org/v1.0.0rc5/tutorials/how_to/elmo/

https://github.com/allenai/allennlp/blob/main/allennlp/modules/elmo.py

In [161]:
from allennlp.modules.elmo import Elmo, batch_to_ids

In [162]:
# ! pip install allennlp
# ! pip install allennlp-models

In [163]:
from allennlp.modules.elmo import Elmo
elmo = Elmo(
    options_file='https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json', 
    weight_file='https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5',
    num_output_representations=3,
    dropout=0
    
)

bioelmo = Elmo(
    options_file='bioelmo/biomed_elmo_options.json', 
    weight_file='bioelmo/biomed_elmo_weights.hdf5',
    num_output_representations=3,
    dropout=0
)

In [176]:
sentences = [['First', 'sentence', '.'], ['Another', '.'], ["I", "ate", "a", "carrot", "for", "breakfast"]]
character_ids = batch_to_ids(sentences)
embeddings = bioelmo(character_ids)


In [203]:
def get_elmo_embedding(sentence):
    tokens = nltk.word_tokenize(sentence)
#     print(tokens)
    sentences = [tokens]
    character_ids = batch_to_ids(sentences)
    return bioelmo(character_ids)['elmo_representations'][2].mean(dim=0).mean(dim=0)
    

In [194]:
embeddings['elmo_representations'][0].mean(dim=0).mean(dim=0)

tensor([-0.2065, -0.2165,  0.1309,  ...,  0.0792,  0.1241,  0.0947],
       grad_fn=<MeanBackward1>)

In [206]:
K = 2
q = get_elmo_embedding(QA[K].question)
ans = [get_elmo_embedding(a) for a in QA[K].answers]

In [207]:
from scipy.spatial.distance import cosine
print('Label,Rank,Similarity')
for i,a in enumerate(ans):
    sim = 1-cosine(q.detach(), a.detach())
    print(QA.labels[K][i], QA.references[K][i], sim)

Label,Rank,Similarity
0 4 0.7380754351615906
1 1 0.7956207394599915
1 2 0.7879546284675598
0 5 0.7838222980499268
1 3 0.714128315448761
0 6 0.7684296369552612
0 7 0.6464023590087891
0 9 0.6179548501968384
0 10 0.6289558410644531
0 8 0.7213234305381775
