## **ACL-BioNLP'19 - MEDIQA 2019 Shared Task**
Authors: Gonzalo Recio and Jana Reventós 

## Pre - processing steps 

Connect with MyDrive

In [None]:
! ls "My Drive/HLE Final Project"
%cd "My Drive/HLE Final Project/data"

ls: cannot access 'My Drive/HLE Final Project': No such file or directory
[Errno 2] No such file or directory: 'My Drive/HLE Final Project/data'
/gdrive


In [None]:
! ls


biobert
MEDIQA2019-Task3-QA-TestSet-wLabels.xml
MEDIQA2019-Task3-QA-TrainingSet1-LiveQAMed.xml
MEDIQA2019-Task3-QA-TrainingSet2-Alexa.xml
MEDIQA2019-Task3-QA-ValidationSet.xml
MEDIQA_Task3_QA_TestSet.xml
QA_Task3_README.txt


In [2]:
PATH = 'MEDIQA2019_datasets/MEDIQA_Task3_QA'
PATH_EXTRA = 'MedQuAD/'

In [3]:
from xml.dom.minidom import parse, parseString
from nltk import tokenize as tk
import os
import numpy as np

## Read data
Read the data files from the MEDIQA task 3. 


In [4]:
import unicodedata
import re

def preprocess_text(text):
    '''
    Return the normal form for the Unicode string unistring. 
    :param text: string 
    :return: unicoded string
    '''
    s = unicodedata.normalize("NFKD", text.lower())
    return re.sub(r'\[\d\]', '', s)

def get_answers(answers):
    '''
    Return the QA answer with their corresponding true rank, ChiQA rank and reference score. 
    a) SystemRank: corresponds to CHiQA's rank. 
    b) ReferenceRank: corresponds to the correct rank. 
    c) ReferenceScore: is an additional score that we provide only in the training and validation sets, and that corresponds to the manual judgment/rating of the answer [4: Excellent, 3: Correct but Incomplete, 2: Related, 1: Incorrect].  

    :param answers: 
    :return: list of answers, reference rank, system rank and reference score 
    '''
    # return np.array((map(lambda ans: preprocess_text(ans.getElementsByTagName('AnswerText')[0].firstChild.nodeValue), answers)))
    answs, rank, chiqa, y = [], [], [], []
    for answer in answers:
        ans = preprocess_text(answer.getElementsByTagName('AnswerText')[0].firstChild.nodeValue)
        reference = int(answer.getAttribute('ReferenceRank'))
        system = int(answer.getAttribute('SystemRank'))
        label = answer.getAttribute('ReferenceScore')
        answs.append(ans); rank.append(reference); chiqa.append(system); y.append(int(label in ['3','4']))
    return answs, rank, chiqa, y
                                

In [33]:
i = 0
indx2id = []
QA, QA2 = [], []  # QA2 has also system ranks from ChiQA
for filename in os.listdir(PATH_EXTRA + '/'):
    # i += 1
    if any(s in filename for s in ('CDC', 'SeniorHealth',)):
        print(filename)
        dirname = PATH_EXTRA + '/' + filename
        for file in os.listdir(dirname):
            fullname = dirname + '/' + file
            print(fullname)
            tree = parse(fullname)
            questions = tree.getElementsByTagName('QAPair')
            Q, QT, T, A = [], [], [], []
            QTypes = {}
            for question in questions:
                qelem = question.getElementsByTagName('Question')
                q, qid = preprocess_text(qelem[0].firstChild.nodeValue), question.getAttribute('qid')
                qtype = qelem[0].getAttribute('qtype')
                
                a = question.getElementsByTagName('Answer')[0].firstChild.nodeValue
                # print(q, qtype, a) # --> questions
                if qtype not in QTypes:
                    QTypes[qtype] = {'q': q, 'a': [a]}
                else:
                    QTypes[qtype]['a'].append(a)
                Q.append(q); QT.append(q + qtype)

            assert len(set(Q)) == len(set(QT)), 'Error'
            for qtype in QTypes:
                q = QTypes[qtype]['q']
                # positive examples
                ans = QTypes[qtype]['a']
                QA.append([q,ans, [1]*len(ans), [1]*len(ans)])
                # negative examples
                for qtype_other in QTypes:
                    if qtype_other != qtype:
                        ans_wrong = QTypes[qtype_other]['a']
                        QA.append([q,ans_wrong, 
                                   [int(len(ans))+1]*len(ans_wrong), 
                                   [0]*len(ans_wrong)])
            break
print(i)
print(len(QA))
QA

7_SeniorHealth_QA
MedQuAD/7_SeniorHealth_QA/0000001.xml
9_CDC_QA
MedQuAD/9_CDC_QA/0000001.xml
25
74


[['what is (are) age-related macular degeneration ?',
  ['Age-related macular degeneration, also known as AMD, is an eye disease that affects the macula, a part of the retina. The retina sends light from the eye to the brain, and the macula allows you to see fine detail. AMD Blurs Central Vision AMD blurs the sharp central vision you need for straight-ahead activities such as reading, sewing, and driving. AMD causes no pain. How AMD Progresses In some cases, AMD advances so slowly that people notice little change in their vision. In others, the disease progresses faster and may lead to a loss of vision in both eyes. AMD is a common eye condition among people age 50 and older. It is a leading cause of vision loss in older adults. Two Forms of AMD There are two forms of age-related macular degeneration -- dry and wet.',
   'Age-related macular degeneration, or AMD, is a disease that blurs the sharp, central vision you need for straight-ahead activities such as reading, sewing, and drivin

In [4]:
# Make an array of tuples that for each question contains the retrieved answers ranking and labels. 
i = 0
indx2id = []
QA, QA2 = [], []  # QA2 has also system ranks from ChiQA
for filename in os.listdir(PATH + '/'):
    #i += 1
    print(filename)
    if not filename.endswith('.xml') or 'Training' not in filename: continue
    fullname = os.path.join('data/Train', filename)
    tree = parse(PATH + '/' + filename)
    questions = tree.getElementsByTagName('Question')
    for question in questions:
        qelem = question.getElementsByTagName('QuestionText')
        q, qid = preprocess_text(qelem[0].firstChild.nodeValue), question.getAttribute('QID')
        # print(q) # --> questions
        answers = question.getElementsByTagName('Answer')
        answers_list, rank, system, labels = get_answers(answers)
        QA.append([q,answers_list, rank, labels])
        QA2.append([q,answers_list, rank, system, labels])
        indx2id.append(qid); i+=1;
        # break
print(i)
print(len(QA))

MEDIQA2019-Task3-QA-TestSet-wLabels.xml
Task3_README.txt
MEDIQA2019-Task3-QA-ValidationSet.xml
.Rapp.history
MEDIQA2019-Task3-QA-TestSet.xml
MEDIQA2019-Task3-QA-TrainingSet2-Alexa.xml
MEDIQA2019-Task3-QA-TrainingSet1-LiveQAMed.xml
208
208


QA is an array of tuples <Question, [Answers], [Ranking], [Labels] >. In the next section we will display an example: 

In [6]:
question_num = 200
QA[question_num]

['ischemic syncope stroke diagonses. define?',
  'what is ischemic stroke?: a stroke is a medical emergency. there are two types - ischemic and hemorrhagic. ischemic stroke is the most common type. it is usually caused by a blood clot that blocks or plugs a blood vessel in the brain. this keeps blood from flowing to the brain. within minutes, brain cells begin to die. another cause is stenosis, or narrowing of the artery. this can happen because of atherosclerosis, a disease in which plaque builds up inside your arteries. transient ischemic attacks (tias) occur when the blood supply to the brain is interrupted briefly. having a tia can mean you are at risk for having a more serious stroke. symptoms of stroke are - sudden numbness or weakness of the face, arm or leg (especially on one side of the body) - sudden confusion, trouble speaking or understanding speech - sudden trouble seeing in one or both eyes - sudden trouble walking, dizziness, loss of balance or coordination - sudden seve

In [7]:
import unicodedata
import re

#s=QA[201][1][0]
ranked_answ = []
print('Question:',QA[question_num][0])
for i in range(len(QA[question_num][1])):
    answ = QA[question_num][1]
    rank = QA[question_num][2]
    ranked_answ.append((int(rank[i]),answ[i]))

ranked_answ = sorted(ranked_answ, key=lambda x: x[0])
for i in range(len(ranked_answ)):
    print(ranked_answ[i][0],'-',unicodedata.normalize("NFKD", ranked_answ[i][1]))

#s=unicodedata.normalize("NFKD", s)
#re.sub(r'\[\d\]', '', s)

Question: ischemic syncope stroke diagonses. define?
1 - what is ischemic stroke?: a stroke is a medical emergency. there are two types - ischemic and hemorrhagic. ischemic stroke is the most common type. it is usually caused by a blood clot that blocks or plugs a blood vessel in the brain. this keeps blood from flowing to the brain. within minutes, brain cells begin to die. another cause is stenosis, or narrowing of the artery. this can happen because of atherosclerosis, a disease in which plaque builds up inside your arteries. transient ischemic attacks (tias) occur when the blood supply to the brain is interrupted briefly. having a tia can mean you are at risk for having a more serious stroke. symptoms of stroke are - sudden numbness or weakness of the face, arm or leg (especially on one side of the body) - sudden confusion, trouble speaking or understanding speech - sudden trouble seeing in one or both eyes - sudden trouble walking, dizziness, loss of balance or coordination - sudd

## Play with BERT based models

### BERT: 

In [8]:
import torch
from transformers import BertTokenizer, BertModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
# % matplotlib inline

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained("dmis-lab/biobert-large-cased-v1.1")

In [9]:

# Choose a question that we want embeddings for:
text = QA[13][0] 
# Introduce special tokens for BERT
marked_text = "[CLS] " + text #+ " [SEP]"

# Tokenize our sentence with the BERT tokenizer.
tokenized_text = tokenizer.tokenize(marked_text)

In [10]:
# Print tokenized text
tokenized_text

['[CLS]', 'what', 'are', 'the', 'treatments', 'of', 'stroke', '?']

In [11]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )
model = BertModel.from_pretrained('dmis-lab/biobert-large-cased-v1.1',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(58996, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [12]:
tokenized_text = tokenizer.tokenize(marked_text)

print('Tokenized sentence:\n',tokenized_text)
# Map the token strings to their vocabulary indeces.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
print('Indexed tokens:\n',indexed_tokens)

Tokenized sentence:
 ['[CLS]', 'what', 'are', 'the', 'treatments', 'of', 'stroke', '?']
Indexed tokens:
 [101, 1184, 1132, 1103, 14115, 1104, 6625, 136]


In [13]:
# Mark each of the 22 tokens as belonging to sentence "1".
segments_ids = [1] * len(tokenized_text)
print('Segments ids:\n', segments_ids)


Segments ids:
 [1, 1, 1, 1, 1, 1, 1, 1]


In [16]:
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [17]:
# Run the text through BERT, and collect all of the hidden states produced
# from all 12 layers. 
with torch.no_grad():

    outputs = model(tokens_tensor, segments_tensors)

    # Evaluating the model will return a different number of objects based on 
    # how it's  configured in the `from_pretrained` call earlier. In this case, 
    # becase we set `output_hidden_states = True`, the third item will be the 
    # hidden states from all layers. See the documentation for more details:
    # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
    hidden_states = outputs[2]

In [18]:
token_embeddings = torch.stack(hidden_states, dim=0)
token_embeddings = torch.squeeze(token_embeddings, dim=1)

token_embeddings.size()

torch.Size([25, 8, 1024])

In [19]:
token_vecs = hidden_states[-2][0]

# Calculate the average of all 22 token vectors.
sentence_embedding = torch.mean(token_vecs, dim=0)

In [20]:
def get_bert_sentence_embedding(sentence):
    '''
    Get embeddings of a specific sentence with the BERT model
    :param sentence: string
    :return: sentence embeddings 
    '''
    marked_text = "[CLS] " + sentence + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)

    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)

        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        hidden_states = outputs[2]

    token_embeddings = torch.stack(hidden_states, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    # print(len(hidden_states.shape))
    token_vecs = hidden_states[-2][0]

    # Calculate the average of all n token vectors.
    sentence_embedding = torch.mean(token_vecs, dim=0)


    return sentence_embedding

In [24]:
# Question embeddings
print('Question:', QA[1][0])
q = get_bert_sentence_embedding(QA[1][0])
print('Question embedding:', q)

# Answer embeddings
print('\nAnswers embeddings:')
ans = [get_bert_sentence_embedding(a[:512]) for a in QA[1][1]] # 512 is the maximum length
for i in range(len(QA[1][1])):
    print('Answer:',QA[1][1][i])
    print('Embedding:',ans[i])


Question: what causes flu?
Question embedding: tensor([-0.1005,  0.1654,  0.1703,  ...,  0.0799,  0.4777,  0.2435])

Answers embeddings:
Answer: your baby and the flu (information): flu symptoms in infants and toddlers the flu is an infection of the nose, throat, and (sometimes) lungs. call your baby’s health care provider if you notice any of the following signs: - acting tired and cranky much of the time and not feeding well - cough - diarrhea and vomiting - has a fever or feels feverish (if no thermometer available) - runny nose how is the flu treated in babies? children younger than 2 years old will often need to be treated with medicine that fights off the flu virus. this is called antiviral medicine. the medicine works best if started within 48 hours after symptoms begin, if possible. oseltamivir (tamiflu) in liquid form will likely be used. although this drug is not approved for use in children younger than 1 year of age, serious side effects are quite rare. after talking about 

In [29]:
from scipy.spatial.distance import cosine
K = 1
print('Question:',QA[K][0])
print('\nAnwsers scores:')
print('Label,Rank,Similarity')
for i,a in enumerate(ans):
    sim = 1-cosine(q, a)
    print(QA[K][3][i], QA[K][2][i], sim)



Question: what causes flu?

Ansers scores:
Label,Rank,Similarity
0 7 0.9175547957420349
1 1 0.9207508563995361
0 8 0.9164698719978333
1 3 0.9207927584648132
1 2 0.922890305519104
0 5 0.913017988204956
0 6 0.9139269590377808
0 4 0.9235448837280273


## BIO-BERT

 BioBERT is a biomedical language representation model designed for biomedical text mining tasks such as biomedical named entity recognition, relation extraction, question answering, etc. 
 BioBERT is a BERT based model referenced in the paper: [BioBERT: a pre-trained biomedical language representation model for biomedical text mining](https://academic.oup.com/bioinformatics/article/36/4/1234/5566506) 

In [30]:
! git clone https://github.com/dmis-lab/biobert.git

Cloning into 'biobert'...
remote: Enumerating objects: 48, done.[K
remote: Counting objects:   2% (1/48)[Kremote: Counting objects:   4% (2/48)[Kremote: Counting objects:   6% (3/48)[Kremote: Counting objects:   8% (4/48)[Kremote: Counting objects:  10% (5/48)[Kremote: Counting objects:  12% (6/48)[Kremote: Counting objects:  14% (7/48)[Kremote: Counting objects:  16% (8/48)[Kremote: Counting objects:  18% (9/48)[Kremote: Counting objects:  20% (10/48)[Kremote: Counting objects:  22% (11/48)[Kremote: Counting objects:  25% (12/48)[Kremote: Counting objects:  27% (13/48)[Kremote: Counting objects:  29% (14/48)[Kremote: Counting objects:  31% (15/48)[Kremote: Counting objects:  33% (16/48)[Kremote: Counting objects:  35% (17/48)[Kremote: Counting objects:  37% (18/48)[Kremote: Counting objects:  39% (19/48)[Kremote: Counting objects:  41% (20/48)[Kremote: Counting objects:  43% (21/48)[Kremote: Counting objects:  45% (22/48)[Kremote: Countin

In [33]:
! cd biobert

In [36]:
! cd biobert

In [None]:
# Install requirements
! pip install -r biobert/requirements.txt

Collecting tensorflow-gpu==1.15.2
[?25l  Downloading https://files.pythonhosted.org/packages/32/ca/58e40e5077fa2a92004f398d705a288e958434f123938f4ce75ffe25b64b/tensorflow_gpu-1.15.2-cp36-cp36m-manylinux2010_x86_64.whl (411.0MB)
[K     |████████████████████████████████| 411.0MB 41kB/s 
Collecting pandas==0.23
[?25l  Downloading https://files.pythonhosted.org/packages/69/ec/8ff0800b8594691759b78a42ccd616f81e7099ee47b167eb9bbd502c02b9/pandas-0.23.0-cp36-cp36m-manylinux1_x86_64.whl (11.7MB)
[K     |████████████████████████████████| 11.7MB 51.2MB/s 
Collecting gast==0.2.2
  Downloading https://files.pythonhosted.org/packages/4e/35/11749bf99b2d4e3cceb4d55ca22590b0d7c2c62b9de38ac4a4a7f4687421/gast-0.2.2.tar.gz
Collecting tensorflow-estimator==1.15.1
[?25l  Downloading https://files.pythonhosted.org/packages/de/62/2ee9cd74c9fa2fa450877847ba560b260f5d0fb70ee0595203082dafcc9d/tensorflow_estimator-1.15.1-py2.py3-none-any.whl (503kB)
[K     |████████████████████████████████| 512kB 43.2MB/s 


In [None]:
# Download datasets
! ./download.sh

BIOBERT_DATA not set; downloading to default path ('data').
--2020-11-22 16:26:15--  https://docs.google.com/uc?export=download&confirm=SwVU&id=1cGqvAm9IZ_86C4Mj7Zf-w9CFilYVDl8j
Resolving docs.google.com (docs.google.com)... 74.125.20.113, 74.125.20.139, 74.125.20.101, ...
Connecting to docs.google.com (docs.google.com)|74.125.20.113|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-08-bk-docs.googleusercontent.com/docs/securesc/uga5fjll8m55msqajnbmp1tthn7a8guk/6vfb0snrjac6o3rgr889j8677odh5flt/1606062375000/13799006341648886493/08359205414467609423Z/1cGqvAm9IZ_86C4Mj7Zf-w9CFilYVDl8j?e=download [following]
--2020-11-22 16:26:15--  https://doc-08-bk-docs.googleusercontent.com/docs/securesc/uga5fjll8m55msqajnbmp1tthn7a8guk/6vfb0snrjac6o3rgr889j8677odh5flt/1606062375000/13799006341648886493/08359205414467609423Z/1cGqvAm9IZ_86C4Mj7Zf-w9CFilYVDl8j?e=download
Resolving doc-08-bk-docs.googleusercontent.com (doc-08-bk-docs.googleusercontent.c

In [None]:
# Download biobert_v1.1_pubmed and save int into the Colab Content
! export BIOBERT_DIR=./biobert_v1.1_pubmed
! echo $BIOBERT_DIR





In [None]:
! ls -ls

total 313
 4 drwx------ 2 root root  4096 Nov 21 17:42 biocodes
15 -rw------- 1 root root 15237 Nov 21 17:42 create_pretraining_data.py
 1 -rw------- 1 root root   903 Nov 21 17:42 download.sh
14 -rw------- 1 root root 13898 Nov 21 17:42 extract_features.py
 4 drwx------ 2 root root  4096 Nov 21 17:42 figs
 1 -rw------- 1 root root   562 Nov 21 17:42 __init__.py
12 -rw------- 1 root root 12060 Nov 21 17:42 LICENSE
38 -rw------- 1 root root 38084 Nov 21 17:42 modeling.py
 9 -rw------- 1 root root  9191 Nov 21 17:42 modeling_test.py
 7 -rw------- 1 root root  6258 Nov 21 17:42 optimization.py
 2 -rw------- 1 root root  1721 Nov 21 17:42 optimization_test.py
13 -rw------- 1 root root 13150 Nov 21 17:42 README.md
 1 -rw------- 1 root root   294 Nov 21 17:42 requirements.txt
34 -rw------- 1 root root 34783 Nov 21 17:42 run_classifier.py
27 -rw------- 1 root root 26953 Nov 21 17:42 run_ner.py
19 -rw------- 1 root root 18667 Nov 21 17:42 run_pretraining.py
46 -rw------- 1 root root 46789 Nov 

In [None]:
! ls

BioASQ			    modeling.py		  run_pretraining.py
biocodes		    modeling_test.py	  run_qa.py
create_pretraining_data.py  optimization.py	  run_re.py
download.sh		    optimization_test.py  sample_text.txt
extract_features.py	    README.md		  tf_metrics.py
figs			    requirements.txt	  tokenization.py
__init__.py		    run_classifier.py	  tokenization_test.py
LICENSE			    run_ner.py


In [None]:
! export QA_DIR=./BioASQ/
! export OUTPUT_DIR=./qa_outputs/


In [None]:
! export OUTPUT_DIR=./qa_outputs

In [None]:
! mkdir -p $OUTPUT_DIR

mkdir: missing operand
Try 'mkdir --help' for more information.


In [None]:
! python run_qa.py --do_train=True --do_predict=True --vocab_file=$BIOBERT_DIR/vocab.txt --bert_config_file=$BIOBERT_DIR/bert_config.json --init_checkpoint=$BIOBERT_DIR/model.ckpt-1000000 --max_seq_length=384 --train_batch_size=12 --learning_rate=5e-6 --doc_stride=128 --num_train_epochs=5.0 --do_lower_case=False --train_file=$QA_DIR/BioASQ-train-factoid-4b.json --predict_file=$QA_DIR/BioASQ-test-factoid-4b-1.json --output_dir=$OUTPUT_DIR





W1121 09:50:52.776824 140584916920192 module_wrapper.py:139] From run_qa.py:1134: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.


W1121 09:50:52.777034 140584916920192 module_wrapper.py:139] From run_qa.py:1134: The name tf.logging.INFO is deprecated. Please use tf.compat.v1.logging.INFO instead.


W1121 09:50:52.777210 140584916920192 module_wrapper.py:139] From /content/biobert/modeling.py:92: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.

Traceback (most recent call last):
  File "run_qa.py", line 1290, in <module>
    tf.app.run()
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/platform/app.py", line 40, in run
    _run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
  File "/usr/local/lib/python3.6/dist-packages/absl/app.py", line 300, in run
    _run_main(main, args)
  File "/usr/local/lib/python3.6/dist-packages/absl/app.py", line 251, in _run_main
  

# Metrics
The metrics for the MEDIQA 2019 challenge are..
* Accuracy 
* Precision: Number of correct ranked answers divided by the total number of retrieved answers for an specific question.
* Mean reciprocal rank (MRR): Evaluates any process that produces a list of possible responses to a sample of queries, ordered by probability of correctness 
* Spearman's Rank Correlation Coefficient (Spearman’s rho): Penalizes the differences (d) on predicted rank and true rank 

In [None]:
def mean_reciprocal_rank(rs):
    '''
    Evaluates any process that produces a list of possible responses to a sample of queries, ordered by probability of correctness. 
    :param rs: sentence rank
    :return: MRR score
    '''
    rs = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])

In [None]:
# rank, entail = QA[0][2:]
reference_ranks = [np.array(q[2]) for q in QA2]
system_ranks = [np.array(q[3]) for q in QA2]

NameError: ignored

In [None]:
predicted = []
for a, b in zip(reference_ranks, system_ranks):
    res = np.array(a)[np.argsort(b)]
    predicted.append([int(i==min(res)) for i in res])

In [None]:
mean_reciprocal_rank(predicted)

0.5924920634920635

In [None]:
def calc_hit_rank(prediction, reference):
    for i, p in enumerate(prediction):
        if reference[p-1] == 1:
            return i+1
    print(prediction)
    print(reference)
    raise ValueError('No reference!')

def mean_reciprocal_rank(predictions, references):
    '''
    Evaluates any process that produces a list of possible responses to a sample of queries, ordered by probability of correctness.
    :param predictions:  model prediction ranks
    :param references: sentence reference ranks
    :return: MRR score
    '''
    assert len(predictions) == len(references)
    ranks = []
    for p, c in zip(predictions, references):
        rank = calc_hit_rank(p, c)
        ranks.append(1.0 / rank)
    return sum(ranks) * 1.0 / len(ranks) 

In [None]:
reference_ranks2 = [(np.array(arr) == 1).astype(np.int64) for arr in reference_ranks]
system_ranks2 = [(np.array(arr) == 1).astype(np.int64) for arr in system_ranks]
mean_reciprocal_rank(system_ranks, reference_ranks)

0.5924920634920635

In [None]:
from scipy.stats import spearmanr

def mean_spearmanr(predictions, references):
    '''
    Penalizes the differences (d) on predicted rank and true rank
    :param predictions: model prediction ranks
    :param references: sentence reference ranks
    :return: Spearmanr score
    '''
    count = 0
    for i in range(len(predictions)):
        count += spearmanr(predictions[i], references[i])[0]

    return count/len(system_ranks)

In [None]:
i = 0
indx2id = []
QA, QA2 = [], []  # QA2 has also system ranks from ChiQA
for filename in os.listdir('./'):
    # i += 1
    print(filename)
    if not filename.endswith('.xml') or 'Labels' not in filename: continue
    fullname = os.path.join('data/Test', filename)
    tree = parse(filename)
    questions = tree.getElementsByTagName('Question')
    for question in questions:
        qelem = question.getElementsByTagName('QuestionText')
        q, qid = preprocess_text(qelem[0].firstChild.nodeValue), question.getAttribute('QID')
        # print(q) # --> questions
        answers = question.getElementsByTagName('Answer')
        answers_list, rank, system, labels = get_answers(answers)
        QA.append([q,answers_list, rank, labels])
        QA2.append([q,answers_list, rank, system, labels])
        indx2id.append(qid); i+=1;
        # break
len(QA2)

NameError: ignored

In [None]:
reference_ranks = [np.array(q[2]) for q in QA2]
system_ranks = [np.array(q[3]) for q in QA2]

In [None]:
mean_spearmanr(system_ranks, reference_ranks)

0.3631053391053391

In [None]:
predicted = []
for a, b in zip(reference_ranks, system_ranks):
    res = np.array(a)[np.argsort(b)]
    predicted.append([int(i==min(res)) for i in res])

In [None]:
def mean_reciprocal_rank(rs):
    rs = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])
mean_reciprocal_rank(predicted)

0.6025846560846562

In [26]:
from scipy.stats import spearmanr
from sklearn.metrics import accuracy_score
import unicodedata
import re

class Question(object):
    def __init__(self, q_id, q, a_ids, a, r, s, l):
        self.question_id = q_id
        self.question = q
        self.answer_ids = a_ids
        self.answers = a
        self.reference_rank = r
        self.system_rank = s
        self.labels = l
    
    def __str__(self):
        return f"{self.question}\n  {self.answers}\n  {self.reference_rank}\n  {self.system_rank}\n  {self.labels}"
    
    def __repr__(self):
        return str(self)

class QuestionsAndAnswers(list):
    def __init__(self, dataset='Train'):
        ''' dataset = {Train,Test,Validation} '''
        list.__init__(self)
        p = self.read_dataset(dataset)
        self.extend(self.read_dataset(dataset))
        self.references = [np.array(q.reference_rank) for q in self]
        self.labels = [np.array(q.labels) for q in self]


    def preprocess_text(self, text):
        s = unicodedata.normalize("NFKD", text.lower())
        return re.sub(r'\[\d\]', '', s)

    def get_answers(self, answers):
        # return np.array((map(lambda ans: preprocess_text(ans.getElementsByTagName('AnswerText')[0].firstChild.nodeValue), answers)))
        answs, answs_ids, rank, chiqa, y = [], [], [], [], []
        for answer in answers:
            ans = self.preprocess_text(answer.getElementsByTagName('AnswerText')[0].firstChild.nodeValue)
            a_id = answer.getAttribute('AID')
            reference = int(answer.getAttribute('ReferenceRank'))
            system = int(answer.getAttribute('SystemRank'))
            label = answer.getAttribute('ReferenceScore')
            answs.append(ans); answs_ids.append(a_id); rank.append(reference); chiqa.append(system); y.append(int(label in ['3','4']))
        return answs, answs_ids, rank, chiqa, y

    def read_dataset(self, dataset='Train'):
        i = 0
        indx2id = []
        QA, QA2 = [], []  # QA2 has also system ranks from ChiQA
        if dataset == 'Test': dataset = 'TestSet-wLabels'
        for filename in os.listdir('./'):
            if not filename.endswith('.xml') or dataset not in filename: continue
            tree = parse(filename)
            questions = tree.getElementsByTagName('Question')
            for question in questions:
                qelem = question.getElementsByTagName('QuestionText')
                q, q_id = self.preprocess_text(qelem[0].firstChild.nodeValue), question.getAttribute('QID')
                # print(q) # --> questions
                answers = question.getElementsByTagName('Answer')
                answers_list, a_ids, rank, system, labels = self.get_answers(answers)
                QA.append([q,answers_list, rank, labels])
                question = Question(q_id=q_id, q=q, a_ids=a_ids, a=answers_list, r=rank, s=system, l=labels)
                # QA2.append([q,answers_list, rank, system, labels])
                QA2.append(question)
                indx2id.append(q_id); i+=1;
                # break
        return QA2
    
    def output_predictions(self, predictions, labels):
        assert len(predictions) == len(self)
        print('question_id,answer_id,label')
        for i, p in enumerate(predictions):
            q_id = QA[i].question_id
            answers = QA[i].answer_ids
            # order = np.array(a)[np.argsort(p)]
            order = np.array(answers)[np.array(p)-1]
            lab = labels[i]
            for a_id, l in zip(order,lab):
                print(f"{q_id},{a_id},{int(l)}")
            
    def normalize_sequence(self, seq):
        seq = np.array(seq)
        a = np.argsort(seq)
        seq[a] = list(range(1,len(seq)+1))
        return seq

    def accuracy(self, predictions):
        '''
        Compute accuracy incorrect answers (label values of 0) of the model predictions  
        '''
    
        # Model predictions:
        preds = np.concatenate(predictions)
        # preds = np.concatenate(predictions==0)
        # idx_preds_incorr_answ = [if predictions [i] == 0for i in range(len(pred))]
        
        # Ground truth labels:
        true  = np.concatenate(self.labels) 
        #labels_true_incorrect_answ = [self.labels[index] for index in idx_preds_incorr_answ]
        #true  = np.concatenate(labels_true_incorrect_answ) 
        
        assert len(preds) == len(true), f"{len(preds)}, {len(true)}"
        return accuracy_score(true, preds)

    def precision(self, predictions):
        '''
        Number of correct ranked answers divided by the total number of retrieved answers for an specific question.
        '''
        precisions = []
        num_answers = []
        for i in range(len(predictions)):
            labels = self.labels[i]
            p = self.normalize_sequence([x for j,x in enumerate(predictions[i]) if labels[j]==1])
            r = self.normalize_sequence([x for j,x in enumerate(self.references[i]) if labels[j]==1])
            if len(p) == 0:
                print(predictions[i])
            correct = sum([a == b for a,b in zip(p, r)])
            # for a,b in zip(p, r)
            # num_answers.append(len(p))
            precisions.append(correct/len(p))
        return np.mean(precisions)
        # return np.average(np.array(precisions), weights=num_answers)

    def mean_spearmanr(self, predictions):
        '''
        Penalizes the differences (d) on predicted rank and true rank
        '''
        
        assert len(predictions) == len(self.references)
        count, total = 0, 0
        preds, refs = [], []
        for i in range(len(predictions)):
            labels = self.labels[i]
            assert len(predictions[i]) == len(labels), f"{predictions}, {labels}"
            p = [x for j,x in enumerate(predictions[i]) if labels[j]==1]
            r = [x for j,x in enumerate(self.references[i]) if labels[j]==1]
            preds += p; refs += r
            if len(r) == 1:
                total += 1
                count += 1
            elif len(r) == 0:
                continue
            else:
                total += 1
                count += spearmanr(p, r)[0]
        return spearmanr(preds, refs)[0]
        # return count/total

    def mean_reciprocal_rank(self, predicted):
        '''
        Evaluates any process that produces a list of possible responses to a sample of queries, ordered by probability of correctness
        '''
        rs = []
        for k, (a, b) in enumerate(zip(predicted, self.references)):
            res = np.array(a)[np.argsort(b)]
            labels = QA[k].labels
            res = [r if labels[i]==1 else 100 for i,r in enumerate(res)]
            rs.append([int(i==min(res)) for i in res])  # sets 1 in first ranked answer
        rs = (np.asarray(r).nonzero()[0] for r in rs)
        return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])
    
    

In [27]:
def normalize_sequence(seq):
    seq = np.array(seq)
    a = np.argsort(seq)
    seq[a] = list(range(1,len(seq)+1))
    return seq
normalize_sequence([55, 55,2,1])

array([3, 4, 2, 1])

In [32]:
QA = QuestionsAndAnswers(dataset = 'Test') 

In [33]:
len(QA)

0

In [29]:
QA[0].question_id

IndexError: list index out of range

In [None]:
system_ranks = [q.system_rank for q in QA]
reference_ranks = [q.reference_rank for q in QA]
labels = [q.labels for q in QA]

In [None]:
spearmanr(np.concatenate(system_ranks), np.concatenate(reference_ranks))
spearmanr([7, 9, 1], [2, 3, 1])

SpearmanrResult(correlation=1.0, pvalue=0.0)

In [30]:
QA.mean_reciprocal_rank(system_ranks)

NameError: name 'system_ranks' is not defined

In [None]:
QA.mean_spearmanr(system_ranks)

0.4492904692300491

In [None]:
system_labels = [np.ones(len(l)) for l in labels]
QA.accuracy(system_labels)

0.5167118337850045

In [None]:
QA.precision(system_ranks)

0.4866481481481482

In [None]:
QA.output_predictions(reference_ranks, system_labels)

question_id,answer_id,label
1,1_Answer6,1
1,1_Answer8,1
1,1_Answer7,1
1,1_Answer2,1
1,1_Answer4,1
1,1_Answer3,1
1,1_Answer1,1
2,2_Answer1,1
2,2_Answer2,1
2,2_Answer4,1
2,2_Answer3,1
3,3_Answer5,1
3,3_Answer2,1
3,3_Answer3,1
3,3_Answer6,1
3,3_Answer4,1
3,3_Answer7,1
3,3_Answer8,1
3,3_Answer10,1
3,3_Answer11,1
3,3_Answer9,1
5,5_Answer6,1
5,5_Answer4,1
5,5_Answer5,1
5,5_Answer7,1
5,5_Answer1,1
5,5_Answer2,1
5,5_Answer9,1
6,6_Answer8,1
6,6_Answer9,1
6,6_Answer2,1
6,6_Answer5,1
6,6_Answer6,1
6,6_Answer4,1
6,6_Answer3,1
6,6_Answer7,1
6,6_Answer1,1
7,7_Answer2,1
7,7_Answer1,1
7,7_Answer4,1
7,7_Answer3,1
7,7_Answer6,1
7,7_Answer5,1
7,7_Answer7,1
8,8_Answer1,1
8,8_Answer2,1
8,8_Answer3,1
10,10_Answer2,1
10,10_Answer8,1
10,10_Answer1,1
10,10_Answer7,1
10,10_Answer6,1
10,10_Answer4,1
10,10_Answer5,1
10,10_Answer3,1
12,12_Answer5,1
12,12_Answer1,1
12,12_Answer2,1
12,12_Answer3,1
12,12_Answer4,1
13,13_Answer1,1
13,13_Answer2,1
13,13_Answer4,1
13,13_Answer3,1
14,14_Answer2,1
14,14_Answer1,1
15,15_An

In [None]:
lengths = []
for q in QA:
    for a in q.answers:
        lengths.append(len(a))
        print(len(a))

In [31]:
import pandas as pd
import numpy as np
import scipy
import scipy.stats

class MediqaEvaluator:
    def __init__(self, answer_file_path, task=1, round=1):
        """
        `round` : Holds the round for which the evaluation is being done.
        can be 1, 2...upto the number of rounds the challenge has.
        Different rounds will mostly have different ground truth files.
        """
        self.answer_file_path = answer_file_path
        self.round = round
        self.task = task

    def _evaluate(self, client_payload, _context={}):
        if self.task == 1:
            return self._evaluate_task_1(client_payload, _context)
        elif self.task == 2:
            return self._evaluate_task_2(client_payload, _context)
        elif self.task == 3:
            return self._evaluate_task_3(client_payload, _context)


    def _evaluate_task_1(self, client_payload, _context={}):
        """
        `client_payload` will be a dict with (atleast) the following keys :
          - submission_file_path : local file path of the submitted file
          - aicrowd_submission_id : A unique id representing the submission
          - aicrowd_participant_id : A unique id for participant/team submitting (if enabled)
        """
        submission_file_path = client_payload["submission_file_path"]

        # Result file format: pair_id,label (csv file)

        col_names = ['pair_id', 'label']

        submission = pd.read_csv(submission_file_path, header=None, names=col_names)
        gold_truth = pd.read_csv(self.answer_file_path, header=None, names=col_names)

        # Drop duplicates except for the first occurrence.
        submission = submission.drop_duplicates(['pair_id'])

        submission.label = submission.label.astype(str)
        gold_truth.label = gold_truth.label.astype(str)

        submission['entry'] = submission.apply(lambda x: '_'.join(x), axis=1)
        gold_truth['entry'] = gold_truth.apply(lambda x: '_'.join(x), axis=1)

        s1 = submission[submission['entry'].isin(gold_truth['entry'])]

        accuracy = s1.size / gold_truth.size

        _result_object = {
            "score": accuracy,
            "score_secondary" : 0.0
        }
        return _result_object

    def _evaluate_task_2(self, client_payload, _context={}):
        """
        `client_payload` will be a dict with (atleast) the following keys :
          - submission_file_path : local file path of the submitted file
          - aicrowd_submission_id : A unique id representing the submission
          - aicrowd_participant_id : A unique id for participant/team submitting (if enabled)
        """
        submission_file_path = client_payload["submission_file_path"]

        # Result file format: pair_id,label (csv file)

        col_names = ['pair_id', 'label']

        submission = pd.read_csv(submission_file_path, header=None, names=col_names, dtype={'pair_id': str, "label": str})
        gold_truth = pd.read_csv(self.answer_file_path, header=None, names=col_names, dtype={'pair_id': str, "label": str})

        # Drop duplicates except for the first occurrence.
        submission = submission.drop_duplicates(['pair_id'])

        submission.label = submission.label.astype(str)
        gold_truth.label = gold_truth.label.astype(str)

        submission['entry'] = submission.apply(lambda x: '_'.join(x), axis=1)
        gold_truth['entry'] = gold_truth.apply(lambda x: '_'.join(x), axis=1)

        s1 = submission[submission['entry'].isin(gold_truth['entry'])]

        accuracy = s1.size / gold_truth.size

        _result_object = {
            "score": accuracy,
            "score_secondary" : 0.0
        }

        return _result_object

    def _evaluate_task_3(self, client_payload, _context={}):
        """
        `client_payload` will be a dict with (atleast) the following keys :
          - submission_file_path : local file path of the submitted file
          - aicrowd_submission_id : A unique id representing the submission
          - aicrowd_participant_id : A unique id for participant/team submitting (if enabled)
        """
        submission_file_path = client_payload["submission_file_path"]

        # Result file format: q_id,a_id,label{0/1}

        col_names = ['question_id','answer_id', 'label']

        submission = pd.read_csv(submission_file_path, header=None, names=col_names)
        gold_truth = pd.read_csv(self.answer_file_path, header=None, names=col_names)

        # Drop duplicates except for the first occurrence.
        submission = submission.drop_duplicates(['question_id', 'answer_id'])

        submission.label = submission.label.astype(str)
        gold_truth.label = gold_truth.label.astype(str)

        submission['entry'] = submission.apply(lambda x: '_'.join(map(str,x)), axis=1)
        gold_truth['entry'] = gold_truth.apply(lambda x: '_'.join(map(str,x)), axis=1)

        s1 = submission[submission['entry'].isin(gold_truth['entry'])]

        accuracy = s1.size / gold_truth.size

        question_ids = []
        correct_answers = {}
        for index, row in gold_truth.iterrows():
            qid = row['question_id']

            if qid not in question_ids:
                question_ids.append(qid)

            if row['label'] == '1':
                if qid not in correct_answers:
                    correct_answers[qid] = []

                correct_answers[qid].append(row['answer_id'])

        Pr = 0.
        spearman = 0.
        pv = 0.
        predictedPositive = 0.
        correctPredictedPositive = 0.
        mrr = 0.
        sp_nan_ignoredQs = 0

        for qid in question_ids:
            submitted_correct_answers = []
            index = 1
            first = True
            for _, row in submission[submission['question_id']==qid].iterrows():
                aid = row['answer_id']
                if row['label'] == '1':
                    predictedPositive += 1
                    if aid in correct_answers[qid]:
                        correctPredictedPositive += 1
                        submitted_correct_answers.append(aid)
                        if first:
                            mrr += 1. / index
                            first=False

                index += 1
            matched_gold_subset = []

            for x in correct_answers[qid]:
                if x in submitted_correct_answers:
                    matched_gold_subset.append(x)

            rho, p_value = scipy.stats.spearmanr(submitted_correct_answers, matched_gold_subset)
            if np.isnan(rho):
                rho = 0.0
                sp_nan_ignoredQs += 1
            spearman += rho
            pv += p_value

        question_nb = len(question_ids)
        q_nb_spearman = question_nb - sp_nan_ignoredQs
        spearman = spearman / q_nb_spearman
        Pr = correctPredictedPositive / predictedPositive
        mrr = mrr / question_nb

        if np.isnan(spearman):
            spearman = 0.0

        _result_object = {
            "score": accuracy,
            "score_secondary": spearman,
            "meta" : {
                "MRR": mrr,
                "Precision": Pr
            }
        }
        return _result_object


# Test Tasks 1,2,3
for task in []:
    print("Testing Task (Round-1) : {}".format(task))
    answer_file_path = "data/task{}/ground_truth_round_2.csv".format(task)
    _client_payload = {}
    _client_payload["submission_file_path"] = "data/task{}/sample_submission_round_2.csv".format(task)

    # Instaiate a dummy context
    _context = {}
    # Instantiate an evaluator
    aicrowd_evaluator = MediqaEvaluator(answer_file_path, task=task)
    # Evaluate
    result = aicrowd_evaluator._evaluate(_client_payload, _context)
    print(result)

# Test Tasks 1,2,3 - Round -2
for task in [3]:
    print("Testing Task (Round-2) : {}".format(task))
    answer_file_path = "data/task{}/ground_truth_round_2.csv".format(task)
    _client_payload = {}
    _client_payload["submission_file_path"] = "data/task{}/sample_submission_round_2.csv".format(task)

    # Instaiate a dummy context
    _context = {}
    # Instantiate an evaluator
    aicrowd_evaluator = MediqaEvaluator(answer_file_path, task=task, round=2)
    # Evaluate
    result = aicrowd_evaluator._evaluate(_client_payload, _context)
    print(result)

Testing Task (Round-2) : 3


FileNotFoundError: [Errno 2] File data/task3/sample_submission_round_2.csv does not exist: 'data/task3/sample_submission_round_2.csv'

### **BioELMo**

In [None]:
! pip install tensorflow-gpu==1.2 h5py

Collecting tensorflow-gpu==1.2
[?25l  Downloading https://files.pythonhosted.org/packages/cb/4d/c9c4da41c6d7b9a4949cb9e53c7032d7d9b7da0410f1226f7455209dd962/tensorflow_gpu-1.2.0-cp36-cp36m-manylinux1_x86_64.whl (89.5MB)
[K     |████████████████████████████████| 89.5MB 33kB/s 
Collecting markdown==2.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/ac/99/288a81a38526a42c98b5b9832c6e339ca8d5dd38b19a53abfac7c8037c7f/Markdown-2.2.0.tar.gz (236kB)
[K     |████████████████████████████████| 245kB 41.3MB/s 
Collecting html5lib==0.9999999
[?25l  Downloading https://files.pythonhosted.org/packages/ae/ae/bcb60402c60932b32dfaf19bb53870b29eda2cd17551ba5639219fb5ebf9/html5lib-0.9999999.tar.gz (889kB)
[K     |████████████████████████████████| 890kB 42.1MB/s 
Collecting backports.weakref==1.0rc1
  Downloading https://files.pythonhosted.org/packages/6a/f7/ae34b6818b603e264f26fe7db2bd07850ce331ce2fde74b266d61f4a2d87/backports.weakref-1.0rc1-py3-none-any.whl
Collecting bleach==1.5.0
  

In [None]:
! python setup.py install

python3: can't open file 'setup.py': [Errno 2] No such file or directory


In [None]:
! git clone https://github.com/allenai/bilm-tf.git

Cloning into 'bilm-tf'...
remote: Enumerating objects: 292, done.[K
remote: Total 292 (delta 0), reused 0 (delta 0), pack-reused 292[K
Receiving objects: 100% (292/292), 588.40 KiB | 782.00 KiB/s, done.
Resolving deltas: 100% (137/137), done.


In [None]:
import os
import pandas as pd
from collections import Counter

[0m[01;34mbilm-tf[0m/  [01;34msample_data[0m/


In [None]:
data_train = pd.read_csv('/content/bilm-tf/vocabulary.txt', sep=" ", header=None)
print("Shape of training data = ", data_train.shape)
data_train[0]

Shape of training data =  (1000003, 1)


0                       <S>
1                      </S>
2                     <UNK>
3                        of
4                       the
                 ...       
999998                SgcE6
999999     cubilin-mediated
1000000       syndrome/drug
1000001              11,214
1000002       artery.RESULT
Name: 0, Length: 1000003, dtype: object

In [None]:
if not os.path.exists("/content/bilm-tf/train"):
    os.makedirs("/content/bilm-tf/train")
 
for i in range(0,data_train.shape[0],6):
    text = "\n".join(data_train[0][i:i+6].tolist())
    fp = open("/content/bilm-tf/train/"+str(i)+".txt","w")
    fp.write(text)
    fp.close()

TypeError: ignored