In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd /content/drive/My Drive/Data Science/Covid-19

/content/drive/My Drive/Data Science/Covid-19


In [3]:
%%time
filepath = 'processed_data/raw_data_comm_use_subset_df.pkl'

import pickle
with open(filepath, 'rb') as f:
    data = pickle.load(f)

CPU times: user 496 ms, sys: 549 ms, total: 1.05 s
Wall time: 10.6 s


In [4]:
from src.text_preprocessing import nltk_NLP, spacy_NLP, STOP_WORDS, text_preprocess
spacy_tokenizer = spacy_NLP('en_core_web_sm').tokenize_API()
nlp_tokenizer = nltk_NLP().tokenize_API()

# from nltk.stem.porter import PorterStemmer
# from nltk.stem.wordnet import WordNetLemmatizer
# nlp_custom_tokenizer = nltk_NLP(stemming=PorterStemmer, lemmatisation=WordNetLemmatizer).custom_API()

text_prep = lambda text: text_preprocess(text, tokenizer=spacy_tokenizer, stopwords=STOP_WORDS)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import pickle
folder = 'processed_data'
filename = 'raw_data_comm_use_subset_corpus'
with open(f'./{folder}/{filename}.pkl', 'rb') as f:
    corpus = pickle.load(f) 

In [6]:
%%time
from src.tf_idf import sklearn_TFIDF
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

sk_tfidf = sklearn_TFIDF()
sk_tfidf.tfidf_corpus(corpus)

Conduct TFIDF for individual documents: 100%|██████████| 9315/9315 [00:37<00:00, 248.20it/s]

CPU times: user 54 s, sys: 850 ms, total: 54.8 s
Wall time: 54.8 s





In [7]:
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

QA_MODEL = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
QA_TOKENIZER = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
QA_MODEL.to(torch_device)
# QA_MODEL.eval()

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-12,

In [None]:
#tensorflow USE
from os.path import isdir
from os import mkdir
import os

# if not isdir('./tensorflow_USE'): mkdir('./tensorflow_USE') # If folder doesn't exist, create folder
# os.environ["TFHUB_CACHE_DIR"] = './tensorflow_USE' # Point to cache location

import tensorflow as tf
import tensorflow_hub as hub
def embed_useT():
    based_url = module='https://tfhub.dev/google/universal-sentence-encoder-large/'
    if tf.__version__.split('.',1)[0] == '2':
        return lambda x: hub.KerasLayer(based_url+'4')(x)['outputs']
    else:
        with tf.Graph().as_default():
            sentences = tf.compat.v1.placeholder(dtype=tf.string)
            embed = hub.Module(module_url+'3')
            embed_input = embed(sentences)
            session = tf.train.MonitoredSession()
        return lambda x: session.run(embed_input, {sentences: x})

embed_fn = embed_useT()

In [None]:
from src.covid_19_BERT import reconstructText
from src.helper import chunks

def Bert_SQuAD_predict(question, doc):
    seq_ids = QA_TOKENIZER.encode(question, doc)

    doc_tokens = doc.split()
    
    num_split = int(np.ceil(len(seq_ids)*1.1/256))
    if num_split-1 > 0:
        length_words = len(doc_tokens)
        group_num = length_words//num_split
        overlap = int(group_num*1.2//2)
        
        doc_partition = [
            ' '.join(doc_tokens[start:end])
            for start, end in chunks(length_words, group_num, overlap)
        ]
        doc_part_seq_ids = [QA_TOKENIZER.encode(question, dp) for dp in doc_partition]
    else:
        doc_part_seq_ids = [seq_ids]

    answers, confidences = [], []

    for part_seq_ids in doc_part_seq_ids:
        part_seq_tokens = QA_TOKENIZER.convert_ids_to_tokens(part_seq_ids)

        num_seg_a = part_seq_ids.index(QA_TOKENIZER.sep_token_id)+1
        num_seg_b = len(part_seq_ids)-num_seg_a

        segment_ids = [0]*num_seg_a+[1]*num_seg_b
        assert len(segment_ids) == len(part_seq_ids)
        n_ids = len(segment_ids)
        
        if n_ids > 512:
            start_scores, end_scores = QA_MODEL(
                torch.tensor([part_seq_ids[:512]]).to(torch_device), 
                token_type_ids=torch.tensor([segment_ids[:512]]).to(torch_device)
            )
        else:
            start_scores, end_scores = QA_MODEL(
                torch.tensor([part_seq_ids]).to(torch_device), 
                token_type_ids=torch.tensor([segment_ids]).to(torch_device)
            )

        start_scores, end_scores = start_scores[:,1:-1], end_scores[:,1:-1]

        answer_start, answer_end = torch.argmax(start_scores), torch.argmax(end_scores)
        answer = reconstructText(part_seq_tokens, answer_start, answer_end+2)
        if not answer: continue

        if answer.startswith('. ') or answer.startswith(', '): answer = answer[2:]

        answers.append(answer)
        confidences.append(start_scores[0,answer_start].item()+end_scores[0,answer_end].item())

    if not answers: return {'answer': ''}
    
    best_idx = confidences.index(max(confidences))
    confidence = confidences[best_idx]
    answer = answers[best_idx]
    
    seq_tokens = QA_TOKENIZER.convert_ids_to_tokens(seq_ids)
    return {
        'answer': answer,
        'confidence': -1000000 if answer.startswith('[CLS]') or answer.endswith('[SEP]') else confidence,
        'abstract_bert': reconstructText(seq_tokens[seq_tokens.index('[SEP]')+1:])
    }

In [None]:
question_list = [
    "Is the virus transmitted by aerisol, droplets, food, close contact, fecal matter, or water",
    "How long is the incubation period for the virus",
    "Can the virus be transmitted asymptomatically or during the incubation period",
    "What is the quantity of asymptomatic shedding",
    "How does temperature and humidity affect the tramsmission of 2019-nCoV",
    "How long can 2019-nCoV remain viable on inanimate, environmental, or common surfaces",
    "What types of inanimate or environmental surfaces affect transmission, survival, or  inactivation of 2019-nCov",
    "Can the virus be found in nasal discharge, sputum, urine, fecal matter, or blood",
    "What risk factors contribute to the severity of 2019-nCoV",
    "How does hypertension affect patients",
    "How does heart disease affect patients",
    "How does copd affect patients",
    "How does smoking affect 2019-nCoV patients",
    "How does pregnancy affect patients",
    "What are the case fatality rates for 2019-nCoV patients",
    "What is the case fatality rate in Italy",
    "What public health policies prevent or control the spread of 2019-nCoV",
    "Can animals transmit 2019-nCoV",
    "What animal did 2019-nCoV come from",
    "What real-time genomic tracking tools exist",
    "What regional genetic variations (mutations) exist",
    "What effors are being done in asia to prevent further outbreaks",
    "What drugs or therapies are being investigated",
    "What clinical trials for hydroxychloroquine have been completed",
    "What antiviral drug clinical trials have been completed",
    "Are anti-inflammatory drugs recommended",
    "Which non-pharmaceutical interventions limit tramsission",
    "What are most important barriers to compliance",
    "How does extracorporeal membrane oxygenation affect 2019-nCoV patients",
    "What telemedicine and cybercare methods are most effective",
    "How is artificial intelligence being used in real time health delivery",
    "What adjunctive or supportive methods can help patients",
    "What diagnostic tests (tools) exist or are being developed to detect 2019-nCoV",
    "What is being done to increase testing capacity or throughput",
    "What point of care tests are exist or are being developed",
    "What is the minimum viral load for detection",
    "What markers are used to detect or track COVID-19",
    "What collaborations are happening within the research community",
    "What are the major ethical issues related pandemic outbreaks",
    "How do pandemics affect the physical and/or psychological health of doctors and nurses",
    "What strategies can help doctors and nurses cope with stress in a pandemic",
    "What factors contribute to rumors and misinformation",
    "What is the immune system response to 2019-nCoV",
    "Can personal protective equipment prevent the transmission of 2019-nCoV",
    "Can 2019-nCoV infect patients a second time",
    "What is the weighted prevalence of sars-cov-2 or covid-19 in general population"
]

from src.covid_19_tf_idf import sk_tfidf_search
from src.helper import sort_dict

from IPython.display import display, HTML
import numpy as np
import logging; 
logging.basicConfig(); log = logging.getLogger('QA-Bert'); log.setLevel(logging.INFO)
import pandas as pd
for question in question_list:
    display(HTML('<div><b>Query</b>: '+question+'</div>'))

    log.info('Text Preprocess and TFIDF on question')
    question = text_prep(question)
    result_df = sk_tfidf_search(question, data, sk_tfidf)
    
    log.info('Conducting BERT on TFIDF result...')
    bert_results = {}
    for idx, doc in enumerate(result_df.abstract):
        if not doc: continue        
        result = Bert_SQuAD_predict(question, doc)
        if result['answer']:
            bert_results.update(
                {
                    result['confidence']: {
                        'answer': result['answer'],
                        'abstract_bert': result['abstract_bert'],
                        'doc_idx': idx,
                    }
                }
            )

    confidence_list = list(bert_results.keys())
    if confidence_list:
        exp_scores = [
            np.exp(score-max(confidence_list))
            for score in confidence_list
        ]
        total = sum(exp_scores)
        
        bert_results = {
            exp_scores[idx]/total : result
            for idx, (confidence_score, result) in enumerate(bert_results.items())
        }
        
    bert_results = sort_dict(bert_results, 'key', True)
    
    log.info('Conducting sentence forming on Bert result')
    for score, result in bert_results.items():
        if score <= 0 or score > 1 or len(result['answer'])==0 or not 'doc_idx' in result: continue

        full_abs, bert_ans = result['abstract_bert'], result['answer']

        split_abs = full_abs.split(bert_ans)
        start_sentence = split_abs[0][split_abs[0].rfind('.')+1:]

        if len(split_abs) == 1: sentence_end = ''
        else:
            sentance_end_pos = split_abs[1].find('. ')+1
            if sentance_end_pos == 0: sentance_end = split_abs[1]
            else: sentance_end = split_abs[1][:sentance_end_pos]
        
        result['sentence'] = {
            'start': start_sentence,
            'bert_ans': bert_ans,
            'sentance_end': sentance_end
        }
        result['full_sentence'] = ''.join(result['sentence'].values())
        
    bert_results = {
        score: result
        for score, result in bert_results.items()
        if result.get('full_sentence')
    }
    
    log.info('Conducting sementic matching using Tensorflow USE...')
    all_sent_embeddings = embed_fn(
        [question]+[
            result['full_sentence']
            for result in bert_results.values()
            if result.get('full_sentence')
        ]
    )
    
    similarity_matrix = np.inner(all_sent_embeddings, all_sent_embeddings)
    rankings = similarity_matrix[1:,0]
    
    bert_results = {
        rankings[idx]: bert_results[confidence_score]
        for idx, confidence_score in enumerate(bert_results.keys())
    }
    
    bert_results = sort_dict(bert_results, 'key', True)
    
    log.info('Forming final result...')
    final_answer = [
        {
            'paper_id': result_df.iloc[result['doc_idx']]['paper_id'],
            'sentence': '<div>'+''.join(
                [part.strip() if idx!=1 else ' <font color="red">'+part+'</font> ' for idx, part in enumerate(result['sentence'].values())]
            )+'</div>',
            'confidence': useT_score
        }
        for useT_score, result in bert_results.items()
    ]
    display(HTML(pd.DataFrame(final_answer).to_html(render_links=True, escape=False)))

INFO:QA-Bert:Text Preprocess and TFIDF on question
INFO:QA-Bert:Conducting BERT on TFIDF result...
INFO:QA-Bert:Conducting sentence forming on Bert result
INFO:QA-Bert:Conducting sementic matching using Tensorflow USE...
INFO:QA-Bert:Forming final result...


Unnamed: 0,paper_id,sentence,confidence
0,a0acb3cda6288d12ce50136a88c013a3098ee3fd,porcine epidemic diarrhea virus (pedv) is the first viral pathogen confirmed to be widely transmissible in animal food.,0.411964
1,b15e513ac2f5696b1e51324fb0a3118c44a6a9e9,repetitiveness and clustering of contacts are known to be relevant factors influencing the transmission of droplet or contact transmitted diseases .,0.402222
2,56bea8bc53d2703d7d33244508932aa26d1ad442,"a number of pepper-based foods tested positive for pmmv, suggesting dietary origins for this virus. intriguingly, the fecal pmmv was infectious to host plants , suggesting that humans might act as a vehicle for the dissemination of certain plant viruses.",0.390588
3,11ad2acc16067afbf2ce40d422647c3d899ecbd4,"abstract background : the influenza a h1n1 virus can be transmitted via direct, indirect, and airborne route to non-infected subjects when an infected patient coughs, which expels a number of different sized droplets to the surrounding environment as an aerosol .",0.379066


INFO:QA-Bert:Text Preprocess and TFIDF on question
INFO:QA-Bert:Conducting BERT on TFIDF result...
INFO:QA-Bert:Conducting sentence forming on Bert result
INFO:QA-Bert:Conducting sementic matching using Tensorflow USE...
INFO:QA-Bert:Forming final result...


Unnamed: 0,paper_id,sentence,confidence
0,37fe4be4d997e01fa069ab31bbe1a0090356500a,"abstract the incubation period of infectious diseases, the time from infection with a microorganism to onset of disease, is directly relevant to prevention and control. since explicit models of the incubation period enhance our understanding of the spread of disease, previous classic studies were revisited, focusing on the modeling methods employed and paying particular attention to relatively unknown historical efforts. the earliest study on the incubation period of pandemic influenza was published in 1919, providing estimates of the incubation period of spanish flu using the daily incidence on ships departing from several ports in australia.",0.552168
1,210a892deb1c61577f6fba58505fd65356ce6636,"based on the 95th percentile estimate of the incubation period, we recommend that the length of quarantine should be at least 14 days .",0.487088
2,4589d4013cf69c396e0fdb67131022fc11119654,we found a significant association between a longer incubation period and a greater risk of death among human h7n9 cases.,0.441553
3,9f63dea0f76ee477d2e8e5209d40179db431ab1d,"from the fitted distribution, the estimated incubation periods can be longer than 10 days for 8.",0.427425
4,4a077b9696d19b7d7fa3e71560b7fd5f414a4d19,"using the travel history and symptom onset of 88 confirmed cases that were detected outside wuhan in the early outbreak phase, we estimate the mean incubation period to be 6. 4 days (95 % credible interval : 5.",0.390342
5,599f44a88bfd9fcd7cc5b03f3b0bf01c9b3c5ba8,4 days) for rotavirus .,0.384971
6,cfffac30aa716974333312a44475097d94c8f475,we apply this method to data from a previously published literature review on the incubation period of nine respiratory viral infections .,0.317544
7,e26b6173b72a118e2b65869e3ba0cc176a3bb751,6) among cases in saudi arabia .,0.056545


INFO:QA-Bert:Text Preprocess and TFIDF on question
INFO:QA-Bert:Conducting BERT on TFIDF result...
INFO:QA-Bert:Conducting sentence forming on Bert result
INFO:QA-Bert:Conducting sementic matching using Tensorflow USE...
INFO:QA-Bert:Forming final result...


Unnamed: 0,paper_id,sentence,confidence
0,4589d4013cf69c396e0fdb67131022fc11119654,"the incubation period is the delay from infection until onset of symptoms, and varies from person to person .",0.548831
1,210a892deb1c61577f6fba58505fd65356ce6636,"using publicly available event-date data from the ongoing epidemic, the present study investigated the incubation period and other time intervals that govern the epidemiological dynamics of covid-19 infections .",0.480696
2,37fe4be4d997e01fa069ab31bbe1a0090356500a,"the earliest study on the incubation period of pande mic influenza was published in 1919, providing estimates of the incubation period of spanish flu using the daily incidence on ships departing from several ports in australia.",0.439365
3,9f63dea0f76ee477d2e8e5209d40179db431ab1d,the incubation period of hfmd was typically described as about 3-7 days but empirical evidence is lacking.,0.424437
4,4a077b9696d19b7d7fa3e71560b7fd5f414a4d19,"using the travel history and symptom onset of 88 confirmed cases that were detected outside wuhan in the early outbreak phase, we estimate the mean incubation period to be 6. 4 days (95 % credible interval : 5.",0.379779
5,cfffac30aa716974333312a44475097d94c8f475,we apply this method to data from a previously published literature review on the incubation period of nine respiratory viral infections .,0.375922
6,599f44a88bfd9fcd7cc5b03f3b0bf01c9b3c5ba8,0 days (95 % ci 1. 4-2. 4 days ) for rotavirus.,0.315835
7,e26b6173b72a118e2b65869e3ba0cc176a3bb751,6) among cases in saudi arabia .,-0.022052


INFO:QA-Bert:Text Preprocess and TFIDF on question
INFO:QA-Bert:Conducting BERT on TFIDF result...
INFO:QA-Bert:Conducting sentence forming on Bert result
INFO:QA-Bert:Conducting sementic matching using Tensorflow USE...
INFO:QA-Bert:Forming final result...


Unnamed: 0,paper_id,sentence,confidence
0,aa2ed61346b7c005b96d661476a671b555b2a93a,the asymptomatic infection was acquired via healthcare-associated transmission .,0.24253
1,f849d3e71f4d2eae8a1e39802195bc9c06fc30ae,asymptomatic shedding among pediatric surgery patients (psps ) could potentially lead to progression of symptomatic diseases and cause outbreaks of respiratory diseases .,0.235527
2,a204aafa38365dbcc0a26af3ca2c6d3313d7fab2,it is known that asymptomatic excretion of rsv occurs in 15 %-20 % of the | 327 moreira et al. infected healthcare workers (hcws).,0.229659
3,a20dad1dae885e38b8aeadb93c22d14a54c6388a,"abstract background : foodborne norovirus outbreak data in japan from [ 2005 ] [ 2006 ], involving virological surveillance of all symptomatic and asymptomatic individuals, were reanalyzed to estimate the asymptomatic ratio of norovirus infection along with the risk of infection and the probability of virus shedding .",0.223836
4,05257a2230897ea006b3f68dbf0d71e1e7216f55,long-term viral shedding for more than 30 days was significantly associated with prior allogeneic transplantation (p = 0.,0.175452
5,4ea6973e872fb9116a21c6539d2aba2ea5c1337c,"transfection of cells with sirna directed against pkc-δ reduced ace2 shedding by 20 % , while knockdown of pkc-ε was without effect.",0.134274


INFO:QA-Bert:Text Preprocess and TFIDF on question
INFO:QA-Bert:Conducting BERT on TFIDF result...
INFO:QA-Bert:Conducting sentence forming on Bert result
INFO:QA-Bert:Conducting sementic matching using Tensorflow USE...
INFO:QA-Bert:Forming final result...


Unnamed: 0,paper_id,sentence,confidence
0,123ea6f7fb7c9dbde72ca2a6a5f1c5d0986966ab,"m396, cr3014) that target the ace2 binding site of sars-cov failed to bind 2019-ncov spike protein, implying that the difference in the rbd of sars-cov and 2019-ncov has a critical impact for the cross-reactivity of neutralizing antibodies , and that it is still necessary to develop novel monoclonal antibodies that could bind specifically to 2019-ncov rbd.",-0.00912
1,4faf34d795e5ff74a886528e46268af783fe712b,"structural analysis suggests that ace2 from these animals can potentially bind rbd of 2019-ncov, making them all possible natural hosts for the virus .",-0.060918
2,489040d34aa5dc8e6eba3d4e9d3d48f0bcc6061f,"the proposal is a biologic that blocks 2019-ncov entry using a soluble version of the viral receptor, angiotensin-converting enzyme 2 (ace2), fused to an immunoglobulin fc domain (ace2-fc), providing a neutralizing antibody with maximal breath to avoid any viral escape, while also helping to recruit the immune system to build lasting immunity .",-0.113639


INFO:QA-Bert:Text Preprocess and TFIDF on question
INFO:QA-Bert:Conducting BERT on TFIDF result...
INFO:QA-Bert:Conducting sentence forming on Bert result
INFO:QA-Bert:Conducting sementic matching using Tensorflow USE...
INFO:QA-Bert:Forming final result...


Unnamed: 0,paper_id,sentence,confidence
0,4faf34d795e5ff74a886528e46268af783fe712b,2019-ncov is thought to be transmitted through respiratory droplets .,0.004231
1,123ea6f7fb7c9dbde72ca2a6a5f1c5d0986966ab,"m396, cr3014) that target the ace2 binding site of sars-cov failed to bind 2019-ncov spike protein , implying that the difference in the rbd of sars-cov and 2019-ncov has a critical impact for the cross-reactivity of neutralizing antibodies, and that it is still necessary to develop novel monoclonal antibodies that could bind specifically to 2019-ncov rbd.",-0.086897


INFO:QA-Bert:Text Preprocess and TFIDF on question
INFO:QA-Bert:Conducting BERT on TFIDF result...
INFO:QA-Bert:Conducting sentence forming on Bert result
INFO:QA-Bert:Conducting sementic matching using Tensorflow USE...
INFO:QA-Bert:Forming final result...


Unnamed: 0,paper_id,sentence,confidence
0,4faf34d795e5ff74a886528e46268af783fe712b,2019-ncov is thought to be transmitted through respiratory droplets .,0.175589
1,123ea6f7fb7c9dbde72ca2a6a5f1c5d0986966ab,"m396, cr3014) that target the ace2 binding site of sars-co v failed to bind 2019-ncov spike protein , implying that the difference in the rbd of sars-cov and 2019-ncov has a critical impact for the cross-reactivity of neutralizing antibodies, and that it is still necessary to develop novel monoclonal antibodies that could bind specifically to 2019-ncov rbd.",0.153915


INFO:QA-Bert:Text Preprocess and TFIDF on question
INFO:QA-Bert:Conducting BERT on TFIDF result...
INFO:QA-Bert:Conducting sentence forming on Bert result
INFO:QA-Bert:Conducting sementic matching using Tensorflow USE...


In [None]:
from src.covid_19_tf_idf import search_relevant_articles_tf_idf

tf_idf_search = lambda query: search_relevant_articles_tf_idf(
    query = query, 
    n_articles = 10, 
    data_df = data, 
    corpus_doc_tf_idf = corpus_doc_tf_idf, 
    term_doc_freq = term_doc_freq,
    query_preprocess_func = lambda text: text_preprocess(text, tokenizer = spacy_tokenizer, stopwords = STOP_WORDS)
)