# CORD-19 Literature Mining


## Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sentence_transformers import SentenceTransformer
from nltk import word_tokenize
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
import csv
import os
import json
import random
from sentence_transformers import util
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Report the versions of the core libraries used in this notebook.
# Note: `plt` is bound to the top-level matplotlib package by the import cell.
print("Versions")
print("============================")
for label, module in (("pandas: ", pd), ("numpy: ", np), ("matplotlib: ", plt)):
    print(label, module.__version__)


Versions
pandas:  1.3.0
numpy:  1.19.2
matplotlib:  3.4.2


## Data preprocessing
### Filter & clean metadata.csv

In [3]:
# Load the CORD-19 metadata table, keeping only the columns used downstream.
# publish_time is handed to parse_dates so it can be filtered by date later.
metadata_columns = [
    'cord_uid', 'source_x', 'title', 'license', 'publish_time',
    'abstract', 'authors', 'journal', 'pdf_json_files',
]
date_cols = ['publish_time']
data = pd.read_csv(
    r'.\Cord19\archive/metadata.csv',
    usecols=metadata_columns,
    parse_dates=date_cols,
)
data.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,cord_uid,source_x,title,license,abstract,publish_time,authors,journal,pdf_json_files
0,ug7v899j,PMC,Clinical features of culture-proven Mycoplasma...,no-cc,OBJECTIVE: This retrospective chart review des...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,document_parses/pdf_json/d1aafb70c066a2068b027...
1,02tnwd4m,PMC,Nitric oxide: a pro-inflammatory mediator in l...,no-cc,Inflammatory diseases of the respiratory tract...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,document_parses/pdf_json/6b0567729c2143a66d737...
2,ejv2xln0,PMC,Surfactant protein-D and pulmonary host defense,no-cc,Surfactant protein-D (SP-D) participates in th...,2000-08-25,"Crouch, Erika C",Respir Res,document_parses/pdf_json/06ced00a5fc04215949aa...
3,2b73a28n,PMC,Role of endothelin-1 in lung disease,no-cc,Endothelin-1 (ET-1) is a 21 amino acid peptide...,2001-02-22,"Fagan, Karen A; McMurtry, Ivan F; Rodman, David M",Respir Res,document_parses/pdf_json/348055649b6b8cf2b9a37...
4,9785vg6d,PMC,Gene expression in epithelial cells in respons...,no-cc,Respiratory syncytial virus (RSV) and pneumoni...,2001-05-11,"Domachowske, Joseph B; Bonville, Cynthia A; Ro...",Respir Res,document_parses/pdf_json/5f48792a5fa08bed9f560...


In [4]:
# Filter by date (both bounds inclusive)
start_date = "2021-1-1"
end_date = "2022-1-31"

after_start_date = data["publish_time"] >= start_date
before_end_date = data["publish_time"] <= end_date
between_two_dates = after_start_date & before_end_date
# .copy() makes data1 an independent frame rather than a slice of `data`;
# mutating a slice is what raised SettingWithCopyWarning here and in later cells.
data1 = data.loc[between_two_dates].copy()
print(f'Number of articles: {(len(data1))}')

# Data cleaning: remove missing abstracts, missing pdf json, duplicate abstract and uid
# Each entry is (column that must be present, wording used in the progress message).
required_columns = [
    ('abstract', 'abstract'),
    ('pdf_json_files', 'pdf json'),
    ('journal', 'journal name'),
    ('authors', 'authors'),
    ('publish_time', 'publish time'),
    ('title', 'title'),
]
for column, label in required_columns:
    len1 = len(data1)
    # Rebind instead of inplace=True so the cell never mutates a shared frame.
    data1 = data1.dropna(axis=0, how='any', subset=[column])
    print(f'Dropped {len1-len(data1)} articles with missing {label}')

# De-duplicate on abstract first, then on cord_uid, keeping the first occurrence.
len1 = len(data1)
data1 = data1.drop_duplicates(subset=['abstract'], keep='first')
data1 = data1.drop_duplicates(subset=['cord_uid'], keep='first')
print(f'Dropped {len1-len(data1)} articles with duplicate abstract and uid')

print(f'Data rows: {len(data1)}')

Number of articles: 311274
Dropped 66738 articles with missing abstract
Dropped 151478 articles with missing pdf json
Dropped 9525 articles with missing journal name
Dropped 80 articles with missing authors
Dropped 0 articles with missing publish time
Dropped 0 articles with missing title

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)



Dropped 166 articles with duplicate abstract and uid
Data rows: 83287


In [5]:
# tokenize abstract for data search

# Normalise: trim surrounding whitespace and lowercase every abstract.
data1['abstract'] = data1['abstract'].apply(lambda x: x.strip().lower())

# Use a distinct name for the stopword set: rebinding `stopwords` would shadow
# the nltk corpus reader imported at the top and make this cell fail on re-run.
english_stopwords = set(stopwords.words('english'))

# Tokenize, then drop stopwords and single-character tokens in one pass.
data1['abstract_tokens'] = data1['abstract'].apply(
    lambda x: [w for w in word_tokenize(x)
               if w not in english_stopwords and len(w) > 1]
)
data1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Unnamed: 0,cord_uid,source_x,title,license,abstract,publish_time,authors,journal,pdf_json_files,abstract_tokens
14597,aoiambb8,PMC,Etiology of Severe Acute Respiratory Infection...,no-cc,"in april 2017, surveillance detected a surge i...",2021-01-27,"Rahaman, Md R.; Alroy, Karen A.; Van Beneden, ...",Emerg Infect Dis,document_parses/pdf_json/3798d3bcb09df78296832...,"[april, 2017, surveillance, detected, surge, s..."
14601,embv1el9,PMC,Accuracy of a battery-powered portable capnome...,no-cc,carbon dioxide measurement is useful for confi...,2021-01-01,"Hirakawa, Eiji; Ibara, Satoshi",J Clin Monit Comput,document_parses/pdf_json/2b4690b639d799f33bbf7...,"[carbon, dioxide, measurement, useful, confirm..."
14602,8dcgzyst,PMC,The influence of diabetes on postoperative com...,no-cc,background: diabetes mellitus has been commonl...,2021-01-01,"Tan, D. J. H.; Yaow, C. Y. L.; Mok, H. T.; Ng,...",Tech Coloproctol,document_parses/pdf_json/83a841d74d4f6bf8bffb8...,"[background, diabetes, mellitus, commonly, ass..."
14603,sgtvecj8,PMC,In vivo delivery of a multiepitope peptide and...,no-cc,objectives: a potent hiv vaccine should overco...,2021-01-01,"Davoodi, Saba; Bolhassani, Azam; Namazi, Fatemeh",Biotechnol Lett,document_parses/pdf_json/7dd8097ba18590caa4310...,"[objectives, potent, hiv, vaccine, overcome, l..."
14604,3vtq6d8f,PMC,Relationship Between Compassion Fatigue in Nur...,no-cc,this research was conducted in order to examin...,2021-01-02,"Aslan, Hakime; Erci, Behice; Pekince, Hatice",J Relig Health,document_parses/pdf_json/9bd542f156f108ef170a0...,"[research, conducted, order, examine, correlat..."


In [6]:
# Renumber rows 0..n-1 and discard the old index labels.
data1 = data1.reset_index(drop=True)
data1.head()

Unnamed: 0,cord_uid,source_x,title,license,abstract,publish_time,authors,journal,pdf_json_files,abstract_tokens
0,aoiambb8,PMC,Etiology of Severe Acute Respiratory Infection...,no-cc,"in april 2017, surveillance detected a surge i...",2021-01-27,"Rahaman, Md R.; Alroy, Karen A.; Van Beneden, ...",Emerg Infect Dis,document_parses/pdf_json/3798d3bcb09df78296832...,"[april, 2017, surveillance, detected, surge, s..."
1,embv1el9,PMC,Accuracy of a battery-powered portable capnome...,no-cc,carbon dioxide measurement is useful for confi...,2021-01-01,"Hirakawa, Eiji; Ibara, Satoshi",J Clin Monit Comput,document_parses/pdf_json/2b4690b639d799f33bbf7...,"[carbon, dioxide, measurement, useful, confirm..."
2,8dcgzyst,PMC,The influence of diabetes on postoperative com...,no-cc,background: diabetes mellitus has been commonl...,2021-01-01,"Tan, D. J. H.; Yaow, C. Y. L.; Mok, H. T.; Ng,...",Tech Coloproctol,document_parses/pdf_json/83a841d74d4f6bf8bffb8...,"[background, diabetes, mellitus, commonly, ass..."
3,sgtvecj8,PMC,In vivo delivery of a multiepitope peptide and...,no-cc,objectives: a potent hiv vaccine should overco...,2021-01-01,"Davoodi, Saba; Bolhassani, Azam; Namazi, Fatemeh",Biotechnol Lett,document_parses/pdf_json/7dd8097ba18590caa4310...,"[objectives, potent, hiv, vaccine, overcome, l..."
4,3vtq6d8f,PMC,Relationship Between Compassion Fatigue in Nur...,no-cc,this research was conducted in order to examin...,2021-01-02,"Aslan, Hakime; Erci, Behice; Pekince, Hatice",J Relig Health,document_parses/pdf_json/9bd542f156f108ef170a0...,"[research, conducted, order, examine, correlat..."


In [7]:
# Filter topic: keep articles whose abstract tokens mention COVID

# Inverted index: token -> list of row positions whose abstract contains it.
# set(tokens) ensures each row is recorded at most once per token.
inverted_idx = {}
for idx, tokens in enumerate(data1['abstract_tokens']):
    for token in set(tokens):
        inverted_idx.setdefault(token, []).append(idx)

print(f'There are {len(inverted_idx)} unique words in inverted index')

keywords_vaccine = ['covid','covid-19']
vaccine_idx = []
for word in keywords_vaccine:
    # A keyword that never occurs simply contributes no rows.
    vaccine_idx += inverted_idx.get(word, [])
# Union across keywords: a row matching several keywords is kept once.
vaccine_idx = list(set(vaccine_idx))
print(f'Found {len(vaccine_idx)} articles related to covid')
# The indices come from enumerate(), i.e. they are positions, so select with .iloc.
data2 = data1.iloc[vaccine_idx]
print(len(data2))

There are 312151 unique words in inverted index
Found 42340 articles related to covid
42340


In [8]:
# Narrow the COVID subset to abstracts that mention 'vaccine'.
data2 = data2.reset_index(drop=True)

# Inverted index: token -> list of row positions whose abstract contains it.
inverted_idx = {}
for idx, tokens in enumerate(data2['abstract_tokens']):
    for token in set(tokens):
        inverted_idx.setdefault(token, []).append(idx)

print(f'There are {len(inverted_idx)} unique words in inverted index')

keywords_vaccine = ['vaccine']
vaccine_idx = []
for word in keywords_vaccine:
    # A keyword that never occurs simply contributes no rows.
    vaccine_idx += inverted_idx.get(word, [])
vaccine_idx = list(set(vaccine_idx))
print(f'Found {len(vaccine_idx)} articles related to vaccine')
# The indices come from enumerate(), i.e. they are positions, so select with .iloc.
data3 = data2.iloc[vaccine_idx]
print(len(data3))

There are 177597 unique words in inverted index
Found 3364 articles related to vaccine
3364


In [9]:
# Narrow the vaccine subset to abstracts that mention 'adverse'.
data3 = data3.reset_index(drop=True)

# Inverted index: token -> list of row positions whose abstract contains it.
inverted_idx = {}
for idx, tokens in enumerate(data3['abstract_tokens']):
    for token in set(tokens):
        inverted_idx.setdefault(token, []).append(idx)

print(f'There are {len(inverted_idx)} unique words in inverted index')

# Alternative keyword lists kept for reference (not active):
#keywords_vaccine = ['safety','safe','surveillance','reactogenicity','clinical','trial','trials']
#keywords_vaccine = ['safety','safe','surveillance','reactogenicity']
keywords_vaccine = ['adverse']
vaccine_idx = []
for word in keywords_vaccine:
    # A keyword that never occurs simply contributes no rows.
    vaccine_idx += inverted_idx.get(word, [])
vaccine_idx = list(set(vaccine_idx))
print(f'Found {len(vaccine_idx)} articles related to adverse')
# The indices come from enumerate(), i.e. they are positions, so select with .iloc.
data4 = data3.iloc[vaccine_idx]
print(len(data4))

There are 32456 unique words in inverted index
Found 296 articles related to adverse
296


In [50]:
# data4.to_pickle(r'C:\Users\Gerard\Desktop\ISS\Pattern Recognition Systems\4 Practice module\Cord19\archive/metadata_filter.pkl')

### Pre-processing corpus for SBERT & TFIDF

In [51]:
# data4 = pd.read_pickle(r'C:\Users\Gerard\Desktop\ISS\Pattern Recognition Systems\4 Practice module\Cord19\archive/metadata_filter.pkl')

In [10]:
# Load the tokenizer and the question-answering model from the same
# fine-tuned BioBERT checkpoint.
qa_checkpoint = "gerardozq/biobert_v1.1_pubmed-finetuned-squad"
tokenizer = AutoTokenizer.from_pretrained(qa_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(qa_checkpoint)

In [11]:
# Process corpus for SBERT
#
# Reads the parsed full text of every article in data4 and collects:
#   paracorp  - every body paragraph longer than 300 characters
#   para_doc  - paragraph position -> paragraph text plus its article's metadata
#   train_*   - (paragraph, cited title, 1.0) triples, one per citation span
#   citecorp  - the distinct cited titles, in first-seen order

train_sent1 = []
train_sent2 = []
train_label = []
citecorp = []
paracorp = []
para_doc = {}
para_idx = 0

article_fields = zip(data4['pdf_json_files'], data4['title'], data4['authors'],
                     data4['publish_time'], data4['journal'])
for json_paths, title, authors, publish_time, journal in article_fields:
    # pdf_json_files can hold several "; "-separated paths; only the first is read.
    filename = ('archive/' + json_paths).split("; ")
    # Explicit encoding so parsing does not depend on the platform default.
    with open(filename[0], encoding='utf-8') as f_json:
        full_text_dict = json.load(f_json)

    for paragraph_dict in full_text_dict['body_text']:
        paragraph_text = paragraph_dict['text']
        if len(paragraph_text) <= 300:
            continue  # short paragraphs are excluded from the corpus

        for ref in paragraph_dict['cite_spans']:
            ref_id = ref['ref_id']
            try:
                ref_title = full_text_dict['bib_entries'][ref_id]['title']
            except (KeyError, TypeError):
                # Citation cannot be resolved to a bibliography title:
                # fall back to the citing article's own title.
                ref_title = full_text_dict['metadata']['title']
            train_sent1.append(paragraph_text)
            train_sent2.append(ref_title)
            train_label.append(1.0)
            if ref_title not in citecorp:
                citecorp.append(ref_title)

        paracorp.append(paragraph_text)
        para_doc[para_idx] = {
            'title': title,
            'text': paragraph_text,
            'authors': authors,
            'publish_time': publish_time,
            'journal': journal,
        }
        para_idx = para_idx + 1


print(len(para_doc))
print(len(paracorp))


6998
6998


In [12]:
# Process corpus for TFIDF
#
# Walks the same files with the same >300-character paragraph filter as the
# SBERT cell, so both corpora list paragraphs in the same order.
#   doc_list       - tokenizer input ids per paragraph
#   doc_list_word  - the raw paragraph text

doc_list = []
doc_list_word = []

for json_paths in data4['pdf_json_files']:
    # pdf_json_files can hold several "; "-separated paths; only the first is read.
    filename = ('archive/' + json_paths).split("; ")
    # Explicit encoding so parsing does not depend on the platform default.
    with open(filename[0], encoding='utf-8') as f_json:
        full_text_dict = json.load(f_json)

    for paragraph_dict in full_text_dict['body_text']:
        paragraph_text = paragraph_dict['text']
        if len(paragraph_text) > 300:
            # NOTE(review): [:512] keeps the first 512 *characters*, not tokens;
            # confirm whether a 512-token limit was intended.
            text_token = tokenizer(paragraph_text[:512])
            doc_list_word.append(paragraph_text)
            doc_list.append(text_token['input_ids'])
print(len(doc_list))
print(len(doc_list_word))


6998
6998


In [53]:
# Export processed corpus if needed (TF-IDF token ids and paragraph text)
import pickle

tfidf_doc = {
    'doc_list': doc_list,
    'doc_list_word': doc_list_word,
}

filename = 'tfidf_doc'
# The context manager closes the file even if pickling raises.
with open(filename, 'wb') as outfile:
    pickle.dump(tfidf_doc, outfile)

In [56]:
# Export processed corpus if needed (SBERT paragraphs and their metadata)
import pickle

sbert_doc = {
    'paracorp': paracorp,
    'para_doc': para_doc,
}

filename = 'sbert_doc'
# The context manager closes the file even if pickling raises.
with open(filename, 'wb') as outfile:
    pickle.dump(sbert_doc, outfile)

### Create SBERT embeddings for export to system

In [13]:
# Sentence-embedding model used to encode the corpus paragraphs and the queries.
embedder_name = 'msmarco-distilbert-base-v4'
embedder = SentenceTransformer(embedder_name)


In [None]:
# Encode every corpus paragraph; convert_to_tensor=True asks for a tensor result.
corpus = paracorp
corpus_embeddings = embedder.encode(
    corpus,
    convert_to_tensor=True,
)


In [59]:
# Export SBERT embeddings
import pickle

filename = 'emb_corpus300'
# The context manager closes the file even if pickling raises.
with open(filename, 'wb') as outfile:
    pickle.dump(corpus_embeddings, outfile)



In [18]:
# Reload the SBERT embeddings written by the export cell.
import pickle

filename = 'emb_corpus300'
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
# The context manager closes the file even if unpickling raises.
with open(filename, 'rb') as infile:
    corpus_embeddings1 = pickle.load(infile)

corpus = paracorp

### Optional: Export all models 

In [None]:
# Bundle the QA tokenizer, QA model and sentence embedder into one pickle file.
import pickle

models_litmining = {
    'tokenizer': tokenizer,
    'model': model,
    'embedder': embedder,
}

filename = 'models_litmining'
# The context manager closes the file even if pickling raises.
with open(filename, 'wb') as outfile:
    pickle.dump(models_litmining, outfile)

In [None]:
# Restore the QA tokenizer, QA model and sentence embedder from the pickle bundle.
import pickle

filename = 'models_litmining'
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
# The context manager closes the file even if unpickling raises.
with open(filename, 'rb') as infile:
    models_litmining1 = pickle.load(infile)

tokenizer = models_litmining1['tokenizer']
model = models_litmining1['model']
embedder = models_litmining1['embedder']

## Evaluation

In [14]:
def dummy_fun(doc):
    """Return ``doc`` unchanged.

    Passed to ``TfidfVectorizer`` as both ``tokenizer`` and ``preprocessor``
    so that documents supplied as ready-made token lists are used as-is.
    """
    return doc

# Documents in doc_list are already lists of token ids, so the vectorizer's own
# preprocessing and tokenization are replaced by the identity function.
tfidf_options = dict(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
)
tfidf = TfidfVectorizer(**tfidf_options)

# One row per paragraph of doc_list.
tfidf_matrix = tfidf.fit_transform(doc_list)

In [19]:
# Alternate spellings / substitute terms applied to incoming symptom names.
_SYMPTOM_ALIASES = {
    'dyspnoea': 'dyspnea',
    'pyrexia': 'fever',
    'injection site erythema': 'erythema',
    'myalgia': 'myalgias',
    'lymphadenopathy': 'lymph nodes',
}

# Input terms that are not queried as-is; each one is replaced by 'pain'.
_REJECTED_SYMPTOMS = frozenset([
    'covid-19', 'blood test', 'computerised tomogram', 'sars-cov-2 test positive',
    'cerebrovascular accident', 'electrocardiogram', 'echocardiogram', 'troponin increased',
    'arthralgia', 'hyperhidrosis', 'paraesthesia', 'hypoaesthesia', 'feeling abnormal',
])


def _normalize_symptoms(symptoms):
    """Map raw symptom names to de-duplicated query terms, keeping first-seen order."""
    normalized = []
    for symptom in symptoms:
        if symptom in _REJECTED_SYMPTOMS:
            term = 'pain'
        else:
            term = _SYMPTOM_ALIASES.get(symptom, symptom)
        if term not in normalized:
            normalized.append(term)
    return normalized


def _build_query(symptom):
    """Build the fixed-form question asked about one symptom."""
    return 'is ' + str(symptom) + ' caused by vaccine a severe adverse effect'


def cord19mining(symptomInput = ['pain']):
    """Return the best-matching corpus paragraphs for a list of symptoms.

    For every normalised symptom a fixed-form question is scored against all
    paragraphs with a weighted sum of SBERT cosine similarity (weight 0.7) and
    TF-IDF cosine similarity (weight 0.3). The top ``min(5, len(corpus))``
    paragraphs per symptom are pooled, the overall best ``top_k`` are kept, and
    for each of them the QA model extracts an answer span whose VADER polarity
    scores are reported.

    Uses the notebook globals ``corpus``, ``para_doc``, ``tokenizer``,
    ``model``, ``embedder``, ``tfidf``, ``tfidf_matrix`` and
    ``corpus_embeddings1``.

    Parameters
    ----------
    symptomInput : list of str
        Symptom names. The default list is only read, never mutated.

    Returns
    -------
    dict
        Keys 1..k in rank order. Each value has 'text', 'title', 'authors',
        'publish_time' (formatted ``%m/%d/%Y``), 'journal' and 'sentiment'
        (the VADER polarity-score dict of the extracted answer).
        Empty when no symptoms are given.
    """
    output = {}

    symptomlist = _normalize_symptoms(symptomInput)
    if not symptomlist:
        # Nothing to query; previously this path failed because top_k was
        # only bound inside the per-symptom loop.
        return output

    top_k = min(5, len(corpus))
    k_coeff = 0.7  # weight of the SBERT score; TF-IDF gets 1 - k_coeff

    candidate_scores = []
    candidate_indices = []
    candidate_symptoms = []

    for symp in symptomlist:
        # Auto-query generation
        query = _build_query(symp)

        # tfidf score: drop the first and last ids the tokenizer adds around the query
        query_token = tokenizer(query)
        query_vec = tfidf.transform([query_token['input_ids'][1:-1]])
        cosine_sim = cosine_similarity(tfidf_matrix, query_vec)
        tfidf_score = torch.FloatTensor(np.transpose(cosine_sim)[0])

        # sbert score; moved to CPU so it can be combined with the CPU tfidf tensor
        query_embedding = embedder.encode(query, convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings1)[0].cpu()

        # Linear combination of tfidf and sbert scores to find the highest scores
        combined_scores = k_coeff*cos_scores + (1-k_coeff)*tfidf_score
        top_results = torch.topk(combined_scores, k=top_k)
        candidate_scores.append(top_results.values)
        candidate_indices.append(top_results.indices)  # stays integer-typed
        # One symptom label per candidate (top_k of them, not a fixed 5),
        # so labels stay aligned with the pooled scores.
        candidate_symptoms.extend([symp] * top_k)

    all_scores = torch.cat(candidate_scores)
    all_indices = torch.cat(candidate_indices)
    best_positions = torch.topk(all_scores, k=top_k).indices.tolist()

    # Built once: the analyzer does not depend on the paragraph being scored.
    analyzer = SentimentIntensityAnalyzer()
    # Longest input the QA model's position embeddings accept.
    max_len = model.config.max_position_embeddings

    for rank, pos in enumerate(best_positions, start=1):
        para_id = int(all_indices[pos])

        # QA search
        question = _build_query(candidate_symptoms[pos])
        text = corpus[para_id]
        # Truncate only the paragraph so question + paragraph fit in the model.
        inputs = tokenizer(question, text, add_special_tokens=True,
                           truncation="only_second", max_length=max_len,
                           return_tensors="pt")
        input_ids = inputs["input_ids"].tolist()[0]
        with torch.no_grad():  # inference only; no gradients needed
            outputs = model(**inputs)
        # Most likely answer start / end (end is exclusive, hence + 1)
        answer_start = int(torch.argmax(outputs.start_logits))
        answer_end = int(torch.argmax(outputs.end_logits)) + 1
        answer = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

        # Sentiment analyzer
        vs = analyzer.polarity_scores(answer)

        doc = para_doc[para_id]
        output[rank] = {
            'text': doc['text'],
            'title': doc['title'],
            'authors': doc['authors'],
            'publish_time': doc['publish_time'].strftime('%m/%d/%Y'),
            'journal': doc['journal'],
            'sentiment': vs,
        }

    return output




In [28]:
# Example query with two symptom names.
example_symptoms = ['pyrexia', 'injection site warmth']
cord19mining(symptomInput=example_symptoms)

{1: {'text': 'Vaccines activate the immune system, which will commonly result in minor side effects, including mild fever and local inflammatory reactions at the site of the injection. This may include redness, swelling, pain, and warmth at the injection sites [1] . These reactions are not a contraindication to receiving the same vaccine in the future, as they do not pose a risk for future allergic reactions to the vaccine. Non-allergic reactions to vaccines also include anxiety-related adverse events that can mimic allergic reactions, and may include breath-holding, hyperventilation, and vasovagal syncope (fainting) (see Table 1 in the Canadian Immunization Guide: Anaphylaxis and other Acute Reactions following Vaccination) [2] .',
  'title': 'COVID-19 vaccine testing & administration guidance for allergists/immunologists from the Canadian Society of Allergy and Clinical Immunology (CSACI)',
  'authors': 'Vander Leek, Timothy K.; Chan, Edmond S.; Connors, Lori; Derfalvi, Beata; Ellis,

## Output results are provided to system UI