In [1]:
from transformers import BertTokenizer, TFBertForQuestionAnswering
import tensorflow as tf

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Tokenizer for the `dmis-lab/biobert-base-cased-v1.1-squad` checkpoint
# (the same checkpoint name the question-answering model is loaded from).
tokenizer = BertTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1-squad")

In [3]:
# Question-answering model loaded from the same checkpoint name as the tokenizer.
# from_pt=True: the TensorFlow model is initialised from the checkpoint's PyTorch
# weights (see the log message printed by this cell).
model = TFBertForQuestionAnswering.from_pretrained("dmis-lab/biobert-base-cased-v1.1-squad", from_pt=True)

All PyTorch model weights were used when initializing TFBertForQuestionAnswering.

All the weights of TFBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.


In [4]:
# Example question and context passage used to try out the QA model on a single input.
question = "What reduces risk of Covid-19?"
text = """
Coronavirus (COVID-19) can make anyone seriously ill. But for some people, the risk is higher.
At some point during the COVID-19 pandemic you may have been told you were at high risk of getting seriously ill from COVID-19 (sometimes called clinically vulnerable or clinically extremely vulnerable). You may also have been advised to stay at home (shield).
For most people at high risk from COVID-19, vaccination has significantly reduced this risk. You can follow the same advice as everyone else on how to avoid catching and spreading COVID-19.
Some people continue to be at high risk from COVID-19, despite vaccination.
"""

In [5]:
# Encode the (question, context) pair as TensorFlow tensors.
inputs = tokenizer(question, text, return_tensors="tf")
# Run the QA model; `outputs.start_logits` / `outputs.end_logits` are read in the next cell.
outputs = model(**inputs)

In [6]:
# Pick the highest-scoring start and end token positions for the first
# (and only) sequence in the batch.
answer_start_index = int(tf.argmax(outputs.start_logits[0], axis=-1))
answer_end_index = int(tf.argmax(outputs.end_logits[0], axis=-1))

In [7]:
# Token ids of the predicted answer span in the first sequence (end index is inclusive).
predict_answer_tokens = inputs["input_ids"][0][answer_start_index : answer_end_index + 1]

In [8]:
# Convert the selected token ids back into text; the string is shown as the cell output.
tokenizer.decode(predict_answer_tokens)

'vaccination'

Potential approach for cosine-similarity search (the steps below use BERT embeddings rather than TF-IDF vectors):
1. Extraction of titles from all JSON documents.
2. Getting the list of embeddings for all the titles using BERT.
3. Finding the embedding for the input query using BERT
4. Using cosine similarity, find the title embeddings most similar to the embedding of the input query. This generates the list of titles that are similar to the input query.

This could later be extended to search the body text as well. Recommend starting with a small amount of self-generated text to test the approach.

In [9]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [41]:
# Load only the first 1000 rows of the CSV (presumably to keep the experiment quick).
data = pd.read_csv("Data/clean_pmc.csv", nrows=1000)

In [42]:
# Keep only the rows that actually have an abstract to query.
not_null_data = data.dropna(subset=['abstract'])

In [43]:
# (rows, columns) remaining after removing rows without an abstract.
not_null_data.shape

(662, 9)

In [13]:
def get_answer(question, text, max_length=512):
    """Extract the most likely answer span for ``question`` from ``text``.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        question: The question to ask.
        text: The context passage to search for an answer.
        max_length: Maximum number of tokens (question + context + special
            tokens) passed to the model. Defaults to 512, the token limit
            noted in this notebook. Longer contexts are truncated instead of
            failing inside the model.

    Returns:
        The decoded answer string. This is ``''`` when the predicted end
        position precedes the predicted start position (empty span).
    """
    # Truncate only the context (second sequence) so the question is always
    # kept intact; inputs that already fit are encoded exactly as before.
    inputs = tokenizer(
        question,
        text,
        return_tensors="tf",
        truncation="only_second",
        max_length=max_length,
    )
    outputs = model(**inputs)

    # Highest-scoring start/end positions for the single sequence in the batch.
    answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
    answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])

    # The end index is inclusive, hence the +1.
    predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
    predicted_answer = tokenizer.decode(predict_answer_tokens)

    return predicted_answer

In [51]:
question = "What are the risk factors of Covid-19?"

# Collect (row index, predicted answer) pairs for every abstract that is
# short enough to be passed to the model.
qa_results = []
for index, abstract in tqdm(not_null_data['abstract'].items(), total=not_null_data.shape[0]):

    # Skip abstracts of 475 tokens or more (presumably to leave room for the
    # question and special tokens within the model's input limit).
    if len(tokenizer.tokenize(abstract)) >= 475:
        continue

    predicted_answer = get_answer(question, abstract)

    # Discard non-answers: an empty span or a bare [CLS] token.
    if predicted_answer not in ('[CLS]', ''):
        qa_results.append((index, predicted_answer))

100%|██████████| 662/662 [43:11<00:00,  3.91s/it]  


In [65]:
import joblib
# Persist the (index, answer) pairs to disk; presumably so the extraction loop
# above (about 43 minutes per its progress bar) does not have to be re-run.
joblib.dump(qa_results, 'outputs/bert_qa_results_500.pkl')

['outputs/bert_qa_results_500.pkl']

In [66]:
# Reload the saved results; the value is only displayed, not assigned to a variable.
# NOTE(review): joblib files are pickle-based, so only load files from a trusted source.
joblib.load('outputs/bert_qa_results_500.pkl')

In [55]:
# Keywords looked for in the extracted answers when tallying risk-factor mentions.
candidate_risk_factors = [
    'age',
    'gender',
    'sex',
    'pneumonia',
    'obesity',
    'weight',
    'diabetes',
    'smoking',
    'cardiovascular',
    'location',
    'contact',
    'asthma',
    'down\'s syndrome',
    'cancer',
    'sickle cell',
]

In [71]:
import re

# Match each candidate as a whole word/phrase, ignoring case. A plain substring
# test (`'age' in answer`) also fires inside unrelated words such as
# "percentage" or "stage", which inflates the counts, and it misses
# capitalised mentions such as "Age".
risk_factor_patterns = [
    (risk_factor, re.compile(r'\b' + re.escape(risk_factor) + r'\b', re.IGNORECASE))
    for risk_factor in candidate_risk_factors
]

# One entry per (answer, risk factor) match, in answer order and then candidate order.
risk_factor_list = []
for _, predicted_answer in qa_results:
    for risk_factor, pattern in risk_factor_patterns:
        if pattern.search(predicted_answer):
            risk_factor_list.append(risk_factor)
print(risk_factor_list)

['age', 'asthma', 'age', 'age', 'age', 'contact', 'age', 'pneumonia', 'age', 'age', 'age', 'age', 'age', 'age', 'pneumonia', 'age', 'age', 'age', 'age', 'sex', 'age', 'age', 'cancer', 'age', 'age', 'asthma', 'age']


In [72]:
from collections import Counter
# Count how many times each risk factor appears in risk_factor_list.
Counter(risk_factor_list)

Counter({'age': 20,
         'asthma': 2,
         'contact': 1,
         'pneumonia': 2,
         'sex': 1,
         'cancer': 1})

**Next steps**:

- Take subsample of data (first 5 rows) and try extracting answers. Use abstracts rather than body text to overcome 512 token limit.
- Write code to truncate abstracts to 512 tokens when they are longer than that.
- Recreate plan to extract most commonly mentioned risk factors (may have to adjust list of risk factors to match returns from sample data).
- Plan how to return candidate papers once BERT methodology (above steps) is determined.