# COVID Data BERT Embedding Generation

Processes the Allen Institute COVID-19 (CORD-19) dataset and pre-generates BERT embeddings for the entire dataset.

In [0]:
!pip install sentence-transformers

In [0]:
from sentence_transformers import models, SentenceTransformer
import os
import torch
import pandas as pd
import glob
import json
import re
import pickle as pkl
import numpy as np
import scipy.spatial
import h5py
import re
import torch

## Load and Process Data

In [0]:
# Colab-only: mount Google Drive at /content/gdrive so that the dataset archive
# and the cached artifacts stored under DATA_DIR (defined below) are reachable.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Root of the Colab filesystem; the dataset is extracted below this directory.
ROOT = "/content"

# Project folder on the mounted Google Drive (note: the path contains spaces).
_PROJECT_SUBDIR = r'gdrive/My Drive/School/Applied DL/project/'
DATA_DIR = os.path.join(ROOT, _PROJECT_SUBDIR)

# path to Allen COVID dataset
_ARCHIVE_NAME = "CORD-19-research-challenge.zip"
DATA_PATH = os.path.join(DATA_DIR, _ARCHIVE_NAME)

Load the cached preprocessed data if it exists.


In [0]:
# Load cached preprocessed data
# df = pd.read_pickle(os.path.join(DATA_DIR, 'processed_data_2.pkl'))

In [0]:
!unzip -qq $DATA_PATH -d $ROOT/data

In [0]:
# NOTE: DATA_PATH is rebound here from the zip archive path to the extracted
# directory. Re-running the unzip cell after this one therefore requires
# re-running the path-configuration cell first.
DATA_PATH = os.path.join(ROOT, "data")

# "sha" is read as str because it is compared against the string paper ids.
metadata = pd.read_csv(os.path.join(DATA_PATH, "metadata.csv"), dtype={"sha": str})
# Recursively collect every JSON document under the extracted dataset.
# Path components are passed separately so os.path.join actually does the joining.
doc_list = glob.glob(os.path.join(DATA_PATH, "**", "*.json"), recursive=True)

In [0]:
def parse_document(file_path, min_paragraph_chars=50):
    """
    Given a string path to a json file, get the paper id and text of the paper.

    Args:
        file_path: Path to a JSON document containing a "paper_id" and,
            optionally, "abstract" and "body_text" lists whose items are
            dicts with a "text" field.
        min_paragraph_chars: Body paragraphs whose text is not strictly longer
            than this many characters are discarded. Defaults to 50.

    Returns:
        dict with keys "paper_id", "body_text" and "abstract". The two text
        fields are the retained sections joined with newlines, or an empty
        string when the document has no such sections.

    Raises:
        KeyError: if the document has no "paper_id".
    """
    # Read as UTF-8 explicitly rather than relying on the platform's default
    # encoding, so non-ASCII text decodes the same way everywhere.
    with open(file_path, mode="r", encoding="utf-8") as f:
        content = json.load(f)

    # Both sections are treated as optional; a missing or null entry yields
    # an empty string instead of an exception.
    abstract = [section["text"] for section in content.get("abstract") or []]
    body = [
        section["text"]
        for section in content.get("body_text") or []
        if len(section["text"]) > min_paragraph_chars
    ]

    return {
        "paper_id": content["paper_id"],
        "body_text": "\n".join(body),
        "abstract": "\n".join(abstract),
    }

### Create main dataframe

In [0]:
columns = ['paper_id', 'title', 'abstract', 'body_text', 'authors', 'journal', 'url']

# Build a sha -> metadata lookup once. Scanning the whole metadata frame with a
# boolean mask for every document is quadratic in practice; a dict lookup is O(1).
# keep="first" reproduces the previous behaviour of taking the first matching row.
# NOTE(review): matching is exact equality on "sha", as before; a metadata row
# that lists several hashes in one field will not match — verify against the data.
meta_by_sha = (
    metadata.dropna(subset=["sha"])
    .drop_duplicates(subset="sha", keep="first")
    .set_index("sha")[["title", "authors", "journal", "url"]]
    .to_dict("index")
)

data = []
for doc in doc_list:
    text = parse_document(doc)

    meta = meta_by_sha.get(text["paper_id"])
    if meta is None:
        continue  # no metadata available

    data.append((text["paper_id"], meta["title"],
                 text["abstract"], text["body_text"],
                 meta["authors"], meta["journal"],
                 meta["url"]))

df = pd.DataFrame(data, columns=columns)
del data  # to save memory
del meta_by_sha  # lookup is no longer needed once the frame is built
df.head()

Unnamed: 0,paper_id,title,abstract,body_text,authors,journal,url
0,35349bb1fc9290338907b7d7f104c9db3951163b,2018 ACVIM Forum Research Abstract Program: Se...,Angiotensin converting enzyme inhibitors (ACEi...,The objective was to assess the diagnostic uti...,,J Vet Intern Med,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...
1,67595d2304315ae4af47ec96fbd42c55bd6855e2,ORMA: a tool for identification of species-spe...,16S rRNA gene is one of the preferred targets ...,"During the last decades, different nucleic-aci...","Severgnini, Marco; Cremonesi, Paola; Consoland...",Nucleic Acids Res,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...
2,dac4e5ddd4e1b0a3ec755a573674371f724462a8,Susceptibility of Chikungunya Virus to Inactiv...,Despite increasing clinical relevance of Chiku...,"Over the past decades, Chikungunya virus (CHIK...","Franz, Sergej; Friesland, Martina; Passos, Vân...",J Infect Dis,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...
3,dca3423350c6278fe7b9479eb894a578ffeeeb68,Peptides as Therapeutic Agents for Dengue Virus,,Dengue is a mosquito-borne disease caused by t...,"Chew, Miaw-Fang; Poh, Keat-Seong; Poh, Chit-Laa",Int J Med Sci,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
4,a50b31b20869c7fe66f9d850c228211c04819956,Repeat Auditing of Primary Health-care Facilit...,Background: The elevated risk of occupational ...,Accreditation or certification of health-care ...,"Cloete, Brynt; Yassi, Annalee; Ehrlich, Rodney",Saf Health Work,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...


### Clean Data

In [0]:
# Keep only ASCII letters, digits and whitespace in the body text, then lowercase.
# Whitespace (including the "\n" paragraph separators) is deliberately preserved.
# - regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
#   string by default, which would silently turn this into a no-op.
# - The range is A-Z; the former "A-z" also matched the characters [ \ ] ^ _ `
#   and so left them in the text.
df['body_text'] = df['body_text'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True).str.lower()
# The abstract is left uncleaned: it is only displayed next to search results,
# while the embeddings are computed from body_text.

# According to Allen Institute dataset, duplicate papers exist with different ids.
# After de-duplicating on each column separately, every (abstract, body_text)
# pair is already unique, so no additional pair-wise pass is needed.
df = df.drop_duplicates("abstract").drop_duplicates("body_text")
df = df.dropna(subset=["url"])
df = df.loc[(df["body_text"].str.len() > 0) & (df["abstract"].str.len() > 0)]

# Later cells address rows by position 0..n-1, so the index must be contiguous.
df = df.reset_index(drop=True)

In [0]:
# Persist the cleaned paper-level dataframe to Drive so later sessions can
# load it instead of re-parsing the dataset.
processed_cache_path = os.path.join(DATA_DIR, 'processed_data_2.pkl')
df.to_pickle(processed_cache_path)

## Prepare Data for Embedding


Extract sentence/paragraph-level data from each paper.

In [0]:
# Flatten the paper-level frame into one row per body paragraph. Each paragraph
# row repeats the paper's id, title, abstract and url.
text = df.drop(["authors", "journal"], axis=1)
text_dict = text.to_dict()

body_text_list = []
paper_id_list = []
title_list = []
abstract_list = []
url_list = []

for row in range(len(df["paper_id"])):
    # Paragraphs were joined with "\n" when the document was parsed.
    paragraphs = text_dict["body_text"][row].split("\n")
    repeat = len(paragraphs)

    body_text_list.extend(paragraphs)
    paper_id_list.extend([text_dict["paper_id"][row]] * repeat)
    title_list.extend([text_dict["title"][row]] * repeat)
    abstract_list.extend([text_dict["abstract"][row]] * repeat)
    url_list.extend([text_dict["url"][row]] * repeat)

# body_text comes first, followed by the per-paper columns.
df_sentences = pd.DataFrame({
    "body_text": body_text_list,
    "paper_id": paper_id_list,
    "title": title_list,
    "abstract": abstract_list,
    "url": url_list,
})

In [0]:
# Preview the first paragraph-level rows (one row per body paragraph).
df_sentences.head()

Unnamed: 0,body_text,paper_id,title,abstract,url
0,the objective was to assess the diagnostic uti...,35349bb1fc9290338907b7d7f104c9db3951163b,2018 ACVIM Forum Research Abstract Program: Se...,Angiotensin converting enzyme inhibitors (ACEi...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...
1,ten apparently healthy horses 458619kg from th...,35349bb1fc9290338907b7d7f104c9db3951163b,2018 ACVIM Forum Research Abstract Program: Se...,Angiotensin converting enzyme inhibitors (ACEi...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...
2,we conclude that ice is feasible safe and allo...,35349bb1fc9290338907b7d7f104c9db3951163b,2018 ACVIM Forum Research Abstract Program: Se...,Angiotensin converting enzyme inhibitors (ACEi...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...
3,julia r treseder nicole l leblanc katherine f ...,35349bb1fc9290338907b7d7f104c9db3951163b,2018 ACVIM Forum Research Abstract Program: Se...,Angiotensin converting enzyme inhibitors (ACEi...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...
4,these effects have been demonstrated to be ind...,35349bb1fc9290338907b7d7f104c9db3951163b,2018 ACVIM Forum Research Abstract Program: Se...,Angiotensin converting enzyme inhibitors (ACEi...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...


In [0]:
# # Save paper lookup table for app
# with open(os.path.join(DATA_DIR, 'text_lookup_processed.pkl'), 'wb') as f:
#         pkl.dump(df_sentences.drop(["paper_id"],axis=1).values, f)

## Generate Embeddings

In [0]:
# Sentence Transformer Models
# MODEL_NAME = 'distilbert-base-nli-stsb-mean-tokens'
# MODEL_NAME = 'bert-base-nli-mean-tokens'
# MODEL_NAME = 'distilbert-base-nli-mean-tokens'
# MODEL_NAME = 'roberta-base-nli-stsb-mean-tokens'
# embedder = SentenceTransformer(MODEL_NAME)

In [0]:
# CUSTOM HUGGINGFACE MODEL
# MODEL_NAME is also used further down to name the embedding files on disk.
MODEL_NAME = 'gsarti/covidbert-nli'
# Token-level encoder; input is lower-cased and capped at 510 tokens.
# NOTE(review): 510 presumably leaves room for the two special tokens within
# BERT's 512-position limit — confirm.
word_embedding_model = models.BERT(MODEL_NAME,
                       max_seq_length=510,
                       do_lower_case=True)

# Pool token embeddings with the mean only (CLS and max pooling are disabled).
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

# Full sentence encoder: token embeddings followed by mean pooling.
embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Replace the Linear layers with dynamically quantized int8 versions.
# NOTE(review): presumably done to cut memory/latency on CPU — confirm this is
# compatible with the device the encoder runs on.
embedder = torch.quantization.quantize_dynamic(
    embedder, {torch.nn.Linear}, dtype=torch.qint8)

In [0]:
# Save model for app
# embedder.save("./model")
# !zip -r model.zip model

  adding: model/ (stored 0%)
  adding: model/1_Pooling/ (stored 0%)
  adding: model/1_Pooling/config.json (deflated 47%)
  adding: model/config.json (stored 0%)
  adding: model/0_BERT/ (stored 0%)
  adding: model/0_BERT/config.json (deflated 48%)
  adding: model/0_BERT/vocab.txt (deflated 53%)
  adding: model/0_BERT/tokenizer_config.json (deflated 34%)
  adding: model/0_BERT/special_tokens_map.json (deflated 40%)
  adding: model/0_BERT/sentence_bert_config.json (deflated 4%)
  adding: model/0_BERT/pytorch_model.bin (deflated 23%)
  adding: model/modules.json (deflated 51%)


In [0]:
# Break into batches to deal with low memory issues
# NUM_BATCHES is reused later when the per-batch embedding files are merged.
NUM_BATCHES = 8
# Only the paragraph text is embedded.
corpus = df_sentences["body_text"].values
# array_split tolerates a corpus length that is not a multiple of NUM_BATCHES.
batches = np.array_split(corpus, NUM_BATCHES)

In [0]:
# Free up memory
# df_sentences is intentionally NOT deleted: the results cell at the end of the
# notebook looks up the paragraph, paper id, title and abstract from it by row
# position, and would raise NameError on a fresh top-to-bottom run otherwise.
del df
del text_dict
del corpus

Paste this into the web browser's console to prevent Colab from timing out:
~~~
function ClickConnect(){

    console.log("Working"); 
    document.querySelector("colab-toolbar-button").click() 
}
setInterval(ClickConnect,60000)
~~~

In [0]:
# Generate embeddings
# Each batch is encoded and written to its own pickle on Drive, so a
# disconnected session loses at most the batch in progress.
model_tag = MODEL_NAME.replace("/", "-")
for i, batch in enumerate(batches):
    print("\nBatch:", i)
    # To resume after an interruption, skip the batches already on disk:
    # if i < 5:
    #     continue
    batch_path = os.path.join(DATA_DIR, 'embeddings-batch' + str(i) + '-' + model_tag + '.pkl')
    corpus_embeddings = embedder.encode(batch, show_progress_bar=True)
    with open(batch_path, 'wb') as f:
        pkl.dump(corpus_embeddings, f)
    # Release this batch before encoding the next one.
    del corpus_embeddings

In [0]:
# Combine batches
# Read the per-batch pickles back in batch order and concatenate them, so the
# position of an embedding matches the row order used when encoding.
embeddings = []
model_tag = MODEL_NAME.replace("/", "-")
for i in range(NUM_BATCHES):
    batch_path = os.path.join(DATA_DIR, 'embeddings-batch' + str(i) + '-' + model_tag + '.pkl')
    with open(batch_path, 'rb') as f:
        print("Loading batch:", i)
        # These pickles are written by this notebook; do not load untrusted files this way.
        embeddings.extend(pkl.load(f))

Loading batch: 0
Loading batch: 1
Loading batch: 2
Loading batch: 3
Loading batch: 4
Loading batch: 5
Loading batch: 6
Loading batch: 7


Save/Load cached embeddings data

In [0]:
# Save full embeddings file
# Using hdf5 instead of pickle because list is too long, causing memory issues
# The context manager guarantees the file is closed even if writing the dataset fails.
with h5py.File(os.path.join(DATA_DIR, 'embeddings-' + MODEL_NAME.replace("/", "-") + '.hdf5'), 'w') as hf:
    hf.create_dataset('embeddings', data=embeddings)

In [0]:
# Load embeddings from hdf5 file
# [:] copies the whole dataset into memory as an array, so the file can be
# closed as soon as the read finishes instead of leaking an open handle.
with h5py.File(os.path.join(DATA_DIR, 'embeddings-' + MODEL_NAME.replace("/", "-") + '.hdf5'), 'r') as f:
    embeddings = f["embeddings"][:]

## Ask Query

In [0]:
query = 'effects on pets on covid-19?'
# Normalise the query like the corpus text: keep only ASCII letters, digits and
# whitespace, then lowercase. The range is A-Z; the former "A-z" also matched
# the characters [ \ ] ^ _ ` and so let them through.
query = [re.sub(r'[^a-zA-Z0-9\s]', '', query).lower()]
# encode() takes a list of sentences; the result is wrapped in an array so it
# can be passed to cdist as a 2-D (1 x dim) input.
query_embedding = np.array(embedder.encode(query, show_progress_bar=True))

Batches: 100%|██████████| 1/1 [00:00<00:00, 13.70it/s]


In [0]:
# Split cosine similarity search into batches to save memory
NUM_CLOSEST = 5       # how many nearest paragraphs to keep
NUM_BATCH_DIST = 4    # how many chunks the embedding matrix is split into
embed_batches = np.array_split(embeddings, NUM_BATCH_DIST)

# results holds (global row index, cosine distance) pairs, closest first.
results = []
index = 0  # global row offset of the first embedding in the current chunk
for batch_no, chunk in enumerate(embed_batches):
    print("Calculating batch:", batch_no)
    chunk_len = chunk.shape[0]
    distances = scipy.spatial.distance.cdist(query_embedding, chunk, "cosine")[0]

    # Merge this chunk's candidates with the running best and
    # only keep top results every batch
    candidates = results + list(zip(range(index, index + chunk_len), distances))
    results = sorted(candidates, key=lambda pair: pair[1])[:NUM_CLOSEST]
    index += chunk_len

Calculating batch: 0
Calculating batch: 1
Calculating batch: 2
Calculating batch: 3


In [0]:
# (row index, cosine distance) pairs for the closest paragraphs, closest first.
results

[(475954, 0.28233874234257994),
 (667820, 0.338897792298545),
 (616647, 0.3404976554699345),
 (616558, 0.350160514049465),
 (616639, 0.35081145272056524)]

In [0]:
# Show the query followed by each retrieved paragraph and its paper details.
print("QUESTION:", query)

for row_idx, cos_dist in results:
    # Cosine similarity is 1 minus the cosine distance.
    print("Score:   ", "(Score: %.4f)" % (1-cos_dist) , "\n" )
    # Fetch the paragraph row once and reuse it for every field.
    hit = df_sentences.iloc[row_idx]
    print("Paragraph:   ", hit["body_text"].strip(), "\n" )
    print("paper_id:  " , hit["paper_id"] , "\n")
    print("Title:  " , hit["title"] , "\n")
    print("Abstract:  " , hit["abstract"] , "\n")
    print("-------------------------------------------")

QUESTION: ['effects on pets on covid19']
Score:    (Score: 0.7177) 

Paragraph:    aawsthe regulations of pets australian animal welfare strategy 

paper_id:   b9e7e99d058c1fffd687046fe2999ebdba9edd65 

Title:   Science delivering to regulators 

Abstract:   Regulations are a part of life but who writes them, what is the basis on which they are written and when the regulations get it wrong, whose fault is it? Is it those who wrote the regulations, those enforcing the regulations, those being regulated or the science underpinning the regulations? In seeking answers to these questions, this paper explores the regulatory process and the contribution of science. It takes as examples the role of the Australian Pesticides and Veterinary Medicines Authority (APVMA) in regulating veterinary products, of the Security Sensitive Biological Agents regulations in managing the risks from specific pathogens, the Quarantine Act regulations as applied to containment facilities and the development of we

## Notes

`bert-base-nli-mean-tokens` 
- takes about 1 hour per batch (4 hours total).

`distilbert-base-nli-stsb-mean-tokens` 
- takes about 30 min per batch (2 hours total). twice as fast.
- answers do not seem great: lots of repeated answers. This could be due to the model being fine-tuned specifically on "semantic textual similarity".


`distilbert-base-nli-mean-tokens`
- takes about 45 min per batch. 
- answers are better than the fine-tuned DistilBERT; maybe not as good as base BERT, or about the same.

`roberta-base-nli-stsb-mean-tokens`
- about 45-60 min per batch
- answers are better than both DistilBERT models.
- not sure how to compare with base bert

`covidbert`
- about 45 min per batch
- answers are a lot more accurate than with the other BERT models