## CORD-19 literature mining code for system implementation

In [1]:
## Import libraries

import pandas as pd
import numpy as np
import matplotlib as plt
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
import csv
import os
import json
from nltk import tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import random
import pickle
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics.pairwise import cosine_similarity

## Load models
# All models are placed on the CPU.
device = torch.device('cpu')

# Sentence-BERT bi-encoder used to embed the auto-generated queries.
SBERT_MODEL_NAME = 'msmarco-distilbert-base-v4'
embedder = SentenceTransformer(SBERT_MODEL_NAME).to(device)

# Extractive question-answering model and its matching tokenizer
# (both are loaded from the same checkpoint).
QA_MODEL_NAME = "gerardozq/biobert_v1.1_pubmed-finetuned-squad"
tokenizer = AutoTokenizer.from_pretrained(QA_MODEL_NAME)
model = AutoModelForQuestionAnswering.from_pretrained(QA_MODEL_NAME).to(device)


## TFIDF processing

# Load the pre-built TF-IDF document data from the working directory.
# NOTE(review): pickle.load executes arbitrary code embedded in the file;
# only load 'tfidf_doc' when it comes from a trusted source.
filename = 'tfidf_doc'
# The context manager closes the file even if unpickling raises.
with open(filename, 'rb') as infile:
    tfidf_doc = pickle.load(infile)

# Documents the TF-IDF vectorizer is fitted on.
doc_list = tfidf_doc['doc_list']
# Companion entry stored in the same pickle under 'doc_list_word'.
doc_list_word = tfidf_doc['doc_list_word']



def dummy_fun(doc):
    """Return *doc* unchanged.

    Identity function handed to ``TfidfVectorizer`` as both tokenizer and
    preprocessor so that the vectorizer leaves its input untouched.
    """
    return doc

# The tokenizer and preprocessor are identity functions, so every document is
# consumed as an already-split sequence of tokens; the default regex token
# pattern is disabled because it would never be used.
vectorizer_options = dict(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
)
tfidf = TfidfVectorizer(**vectorizer_options)

# Learn the vocabulary / IDF weights from the corpus and vectorize it.
tfidf_matrix = tfidf.fit_transform(doc_list)

## SBERT processing

# NOTE(review): pickle.load executes arbitrary code embedded in the file;
# only load these artifacts when they come from a trusted source.

# Paragraph corpus and the per-paragraph document metadata.
filename = 'sbert_doc'
# The context manager closes the file even if unpickling raises.
with open(filename, 'rb') as infile:
    sbert_doc = pickle.load(infile)

paracorp = sbert_doc['paracorp']
para_doc = sbert_doc['para_doc']

# Pre-computed corpus embeddings that the query embedding is compared against.
filename = 'emb_corpus300'
with open(filename, 'rb') as infile:
    corpus_embeddings1 = pickle.load(infile)

# The searchable corpus is the paragraph list.
corpus = paracorp



In [2]:
# Symptom terms that are rewritten before a query is generated.
_SYMPTOM_ALIASES = {
    'dyspnoea': 'dyspnea',
    'pyrexia': 'fever',
    'injection site erythema': 'erythema',
    'myalgia': 'myalgias',
    'lymphadenopathy': 'lymph nodes',
}

# Terms that are never queried directly; each one falls back to 'pain'.
# ('myalgia' used to be listed here as well, but the alias table above always
# took precedence, so that entry was unreachable.)
_REJECTED_SYMPTOMS = frozenset([
    'covid-19', 'blood test', 'computerised tomogram', 'sars-cov-2 test positive',
    'cerebrovascular accident', 'electrocardiogram', 'echocardiogram', 'troponin increased',
    'arthralgia', 'hyperhidrosis', 'paraesthesia', 'hypoaesthesia', 'feeling abnormal',
])


def _normalize_symptom(symptom):
    """Map a reported symptom term to the term used in the generated query."""
    if symptom in _SYMPTOM_ALIASES:
        return _SYMPTOM_ALIASES[symptom]
    if symptom in _REJECTED_SYMPTOMS:
        return 'pain'
    return symptom


def _build_query(symptom):
    """Return the auto-generated question asked about *symptom*."""
    return 'is ' + str(symptom) + ' caused by vaccine a severe adverse effect'


def cord19mining(symptomInput=None):
    """Retrieve the paragraphs most relevant to the given symptoms.

    For every distinct (normalized) symptom a question is generated, all
    corpus paragraphs are scored with a weighted sum of the SBERT cosine
    similarity (weight 0.7) and the TF-IDF cosine similarity (weight 0.3), and
    the best ``top_k = min(5, len(corpus))`` paragraphs are kept.  The
    candidates of all symptoms are then pooled and the overall best ``top_k``
    are run through the QA model; the extracted answer span is scored with
    the VADER sentiment analyzer.

    Args:
        symptomInput: Iterable of symptom strings.  ``None`` (the default) is
            treated as ``['pain']``; a single string is treated as a
            one-element list.

    Returns:
        A dict keyed by 1-based rank.  Each value is a dict with the keys
        ``'text'``, ``'title'``, ``'authors'``, ``'publish_time'`` (formatted
        as ``MM/DD/YYYY``), ``'journal'`` and ``'sentiment'`` (the VADER
        polarity scores of the extracted answer).  An empty dict is returned
        when no symptom is supplied.  The same paragraph may appear more than
        once if it was retrieved for several symptoms.
    """
    if symptomInput is None:
        symptomInput = ['pain']
    elif isinstance(symptomInput, str):
        # A bare string would otherwise be iterated character by character.
        symptomInput = [symptomInput]

    # Normalize and de-duplicate while preserving the input order.
    symptomlist = []
    for symptom in symptomInput:
        normalized = _normalize_symptom(symptom)
        if normalized not in symptomlist:
            symptomlist.append(normalized)

    output = {}
    if not symptomlist:
        # Nothing to search for; previously this path crashed on an unbound top_k.
        return output

    top_k = min(5, len(corpus))
    k_coeff = 0.7  # weight of the SBERT score; the TF-IDF score gets 1 - k_coeff

    score_chunks = []
    index_chunks = []
    query_symp = []

    for symp in symptomlist:
        # Auto-query generation
        query = _build_query(symp)

        # tfidf score: the query is represented by its token ids with the
        # first and last (special) tokens stripped.
        query_token = tokenizer(query)
        query_vec = tfidf.transform([query_token['input_ids'][1:-1]])
        cosine_sim = cosine_similarity(tfidf_matrix, query_vec)
        tfidf_score = torch.FloatTensor(np.transpose(cosine_sim)[0])

        # sbert score
        query_embedding = embedder.encode(query, convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings1)[0]

        # Linear combination of tfidf and sbert scores to find the highest scores
        combined_scores = k_coeff * cos_scores + (1 - k_coeff) * tfidf_score
        top_results = torch.topk(combined_scores, k=top_k)
        score_chunks.append(top_results[0])
        # Indices stay integer tensors; accumulating them in a float tensor
        # would lose precision for very large corpora.
        index_chunks.append(top_results[1])
        # One label per candidate so pooled positions map back to a symptom.
        query_symp.extend([symp] * top_k)

    resultlist_value = torch.cat(score_chunks)
    resultlist_indices = torch.cat(index_chunks)

    # Overall best candidates across all symptoms.
    top_results_list = torch.topk(resultlist_value, k=top_k)
    best_positions = top_results_list[1].tolist()

    # Sentiment analyzer (built once; it does not depend on the paragraph).
    analyzer = SentimentIntensityAnalyzer()

    for i, pos in enumerate(best_positions, start=1):
        idx = int(resultlist_indices[pos])

        # QA search
        question = _build_query(query_symp[pos])
        text = corpus[idx]
        # Truncate only the paragraph so an over-long text cannot exceed the
        # tokenizer's configured maximum input length.
        inputs = tokenizer(question, text, add_special_tokens=True,
                           truncation='only_second', return_tensors="pt")
        input_ids = inputs["input_ids"].tolist()[0]
        with torch.no_grad():  # inference only, no gradients needed
            outputs = model(**inputs)
        # Most likely beginning and (exclusive) end of the answer span.
        answer_start = int(torch.argmax(outputs.start_logits))
        answer_end = int(torch.argmax(outputs.end_logits)) + 1
        answer = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

        vs = analyzer.polarity_scores(answer)

        paper = para_doc[idx]
        output[i] = {
            'text': paper['text'],
            'title': paper['title'],
            'authors': paper['authors'],
            # NOTE(review): assumes publish_time is a datetime-like object -- confirm.
            'publish_time': paper['publish_time'].strftime('%m/%d/%Y'),
            'journal': paper['journal'],
            'sentiment': vs,
        }

    return output



In [3]:
# Example run; 'pyrexia' is queried as 'fever' inside cord19mining.
cord19mining(symptomInput=['pyrexia', 'injection site warmth'])

{1: {'text': 'Vaccines activate the immune system, which will commonly result in minor side effects, including mild fever and local inflammatory reactions at the site of the injection. This may include redness, swelling, pain, and warmth at the injection sites [1] . These reactions are not a contraindication to receiving the same vaccine in the future, as they do not pose a risk for future allergic reactions to the vaccine. Non-allergic reactions to vaccines also include anxiety-related adverse events that can mimic allergic reactions, and may include breath-holding, hyperventilation, and vasovagal syncope (fainting) (see Table 1 in the Canadian Immunization Guide: Anaphylaxis and other Acute Reactions following Vaccination) [2] .',
  'title': 'COVID-19 vaccine testing & administration guidance for allergists/immunologists from the Canadian Society of Allergy and Clinical Immunology (CSACI)',
  'authors': 'Vander Leek, Timothy K.; Chan, Edmond S.; Connors, Lori; Derfalvi, Beata; Ellis,