<a href="https://colab.research.google.com/github/danebencedavid/NLP-A-Agent/blob/master/npl_agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🦠 COVID-19 Research Agent
A QA Agent answering COVID-19, smoking, and diabetes related questions using graph-powered semantic search.



## **Imports and Downloading metadata of CORD19**

In [1]:
import kagglehub
import os
import pandas as pd
from nltk.corpus import wordnet
from tqdm import tqdm
import re
from nltk.stem import WordNetLemmatizer
import nltk
import requests
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
from google.colab import drive

In [2]:
print("Downloading CORD-19 metadata...")

path = kagglehub.dataset_download("googleai/dataset-metadata-for-cord19")
print(f"Dataset downloaded to: {path}")

if not os.listdir(path):
        raise FileNotFoundError("Dataset directory is empty. Download might have failed.")

filename_with_path = os.path.join(path, os.listdir(path)[0])
print(f"Using metadata file: {filename_with_path}")

df_meta_cord19 = pd.read_csv(filename_with_path, low_memory=False)

Downloading CORD-19 metadata...
Dataset downloaded to: /kaggle/input/dataset-metadata-for-cord19
Using metadata file: /kaggle/input/dataset-metadata-for-cord19/CORD19 datasets - Sheet 1.csv


In [3]:
print(f"Initial dataframe shape: {df_meta_cord19.shape}")

df_meta_cord19 = df_meta_cord19[df_meta_cord19['description'].notnull()].copy()
print(f"Shape after removing rows with null description: {df_meta_cord19.shape}")

Initial dataframe shape: (16070, 14)
Shape after removing rows with null description: (14126, 14)


## **Defining core terms for COVID-19, smoking and diabetes**
Terms were gathered from [MeSH](https://www.ncbi.nlm.nih.gov/mesh/2052179/)

In [4]:
covid_terms = {
    "COVID 19",
    "SARS Coronavirus 2 Infection",
    "2019-nCoV\xa0Infection",
    "2019 nCoV\xa0Infection",
    "2019-nCoV\xa0Infections",
    "Infection,\xa02019-nCoV",
    "SARS-CoV-2\xa0Infection",
    "Infection,\xa0SARS-CoV-2",
    "SARS CoV 2\xa0Infection",
    "SARS-CoV-2\xa0Infections",
    "2019 Novel Coronavirus Disease",
    "2019 Novel Coronavirus Infection",
    "2019-nCoV\xa0Disease",
    "2019 nCoV\xa0Disease",
    "2019-nCoV\xa0Diseases",
    "Disease,\xa02019-nCoV",
    "COVID19",
    "Coronavirus Disease 2019",
    "Disease 2019, Coronavirus",
    "Severe Acute Respiratory Syndrome Coronavirus 2\xa0Infection",
    "COVID-19\xa0Virus Disease",
    "COVID 19\xa0Virus Disease",
    "COVID-19\xa0Virus Diseases",
    "Disease,\xa0COVID-19\xa0Virus",
    "Virus Disease,\xa0COVID-19",
    "Coronavirus Disease-19",
    "Coronavirus Disease 19",
    "COVID-19\xa0Virus Infection",
    "COVID 19\xa0Virus Infection",
    "COVID-19\xa0Virus Infections",
    "Infection,\xa0COVID-19\xa0Virus",
    "Virus Infection,\xa0COVID-19",
    "COVID-19\xa0Pandemic",
    "COVID 19\xa0Pandemic",
    "Pandemic,\xa0COVID-19",
    "COVID-19\xa0Pandemics"
}
smoking_terms = {
    "smoking",
    "tobacco",
    "cigarette",
    "nicotine",
    "tobacco use",
    "cigarette smoking",
    "tobacco smoking",
    "smokers",
    "environmental tobacco smoke",
    "passive smoking",
    "smokeless tobacco",
    "chewing tobacco",
    "snuff (tobacco)",
    "e-cigarettes",
    "electronic cigarettes",
    "vaping",
    "hookah smoking",
    "waterpipe smoking",
    "cigar smoking",
    "pipe smoking",
    "smoking cessation",
    "tobacco dependence",
    "nicotine dependence",
    "tobacco products"
}
diabetes_terms = {
    "diabetes",
    "diabetic",
    "hyperglycemia",
    "blood sugar",
    "insulin resistance",
    "t2dm",
    "type 2 diabetes mellitus",
    "non-insulin-dependent diabetes mellitus",
    "niddm",
    "t1dm",
    "type 1 diabetes mellitus",
    "insulin-dependent diabetes mellitus",
    "iddm",
    "diabetes mellitus",
    "glucose metabolism disorders",
    "blood glucose",
    "insulin deficiency",
    "diabetic complications",
    "diabetic retinopathy",
    "diabetic neuropathy",
    "diabetic nephropathy",
    "cardiovascular disease and diabetes",
    "prediabetes",
    "gestational diabetes"
}




## **Semantic filtering based on Descriptions (abstracts)**

In [5]:
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [6]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
model = AutoModel.from_pretrained("dmis-lab/biobert-v1.1")

def get_embeddings(texts, tokenizer, model, max_length=512, batch_size=32):
    all_embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding"):
        batch_texts = texts[i:i + batch_size]
        encoded_input = tokenizer(batch_texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
        with torch.no_grad():
            model_output = model(**encoded_input)
        # Perform mean pooling to get sentence embeddings
        token_embeddings = model_output.last_hidden_state
        input_mask_expanded = encoded_input['attention_mask'].unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        all_embeddings.append(mean_embeddings.cpu().numpy())
    return np.concatenate(all_embeddings, axis=0)

def get_embeddings_incremental_save(texts, tokenizer, model, output_dir="abstract_embeddings", batch_size=32, start_index=0):
    os.makedirs(output_dir, exist_ok=True)
    all_embeddings = []
    num_processed = 0
    for i in tqdm(range(start_index, len(texts), batch_size), desc="Encoding"):
        batch_texts = texts[i:i + batch_size]
        encoded_input = tokenizer(batch_texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
        with torch.no_grad():
            model_output = model(**encoded_input)
        # Perform mean pooling to get sentence embeddings
        token_embeddings = model_output.last_hidden_state
        input_mask_expanded = encoded_input['attention_mask'].unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        embeddings_batch = mean_embeddings.cpu().numpy()
        all_embeddings.append(embeddings_batch)
        num_processed += len(batch_texts)

        # Save the current batch of embeddings
        output_path = os.path.join(output_dir, f"embeddings_batch_{i//batch_size}.npy")
        np.save(output_path, embeddings_batch)

    final_embeddings = np.concatenate(all_embeddings, axis=0) if all_embeddings else np.array([])
    return final_embeddings



# Encode the keyword sets
smoking_embeddings = get_embeddings(list(smoking_terms),tokenizer, model)
covid_embeddings = get_embeddings(list(covid_terms), tokenizer, model)
diabetes_embeddings = get_embeddings(list(diabetes_terms), tokenizer, model)

# Get the list of abstracts
abstracts = df_meta_cord19['description'].tolist()

output_embedding_dir = "/content/drive/MyDrive/Colab Notebooks/embeddings"

existing_files = [f for f in os.listdir(output_embedding_dir) if f.endswith(".npy")]
start_index = 0
if existing_files:
    # Sort files to find the last processed batch
    existing_files.sort(key=lambda f: int(f.split("_")[-1].split(".")[0]))
    last_file = existing_files[-1]
    last_batch_index = int(last_file.split("_")[-1].split(".")[0])
    start_index = (last_batch_index + 1) * 32


print(f"Resuming encoding from index: {start_index}")

# Encode all abstracts in batches
abstract_embeddings = get_embeddings_incremental_save(abstracts, tokenizer, model, output_dir=output_embedding_dir, batch_size=32, start_index=start_index)

if not abstract_embeddings.size: # If the process completed or no new files were created
    all_loaded_embeddings = []
    for file in sorted([f for f in os.listdir(output_embedding_dir) if f.endswith(".npy")], key=lambda f: int(f.split("_")[-1].split(".")[0])):
        path = os.path.join(output_embedding_dir, file)
        all_loaded_embeddings.append(np.load(path))
    if all_loaded_embeddings:
        abstract_embeddings = np.concatenate(all_loaded_embeddings, axis=0)
        print("All saved abstract embeddings loaded and concatenated.")
    else:
        print("No abstract embeddings were saved.")

print("Shape of smoking embeddings:", smoking_embeddings.shape)
print("Shape of covid embeddings:", covid_embeddings.shape)
print("Shape of diabetes embeddings:", diabetes_embeddings.shape)
print("Shape of abstract embeddings:", abstract_embeddings.shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]


Encoding:   0%|          | 0/1 [00:00<?, ?it/s][A
Encoding: 100%|██████████| 1/1 [00:01<00:00,  1.70s/it]

Encoding:   0%|          | 0/2 [00:00<?, ?it/s][A
Encoding:  50%|█████     | 1/2 [00:01<00:01,  1.94s/it][A
Encoding: 100%|██████████| 2/2 [00:02<00:00,  1.06s/it]
Encoding: 100%|██████████| 1/1 [00:00<00:00,  2.35it/s]


Resuming encoding from index: 11040


Encoding: 100%|██████████| 97/97 [52:16<00:00, 32.34s/it]

Shape of smoking embeddings: (24, 768)
Shape of covid embeddings: (36, 768)
Shape of diabetes embeddings: (24, 768)
Shape of abstract embeddings: (3086, 768)





In [8]:
print("Calculating semantic similarities...")
smoking_similarities = cosine_similarity(abstract_embeddings, smoking_embeddings)
covid_similarities = cosine_similarity(abstract_embeddings, covid_embeddings)
diabetes_similarities = cosine_similarity(abstract_embeddings, diabetes_embeddings)

print("Shape of smoking similarities:", smoking_similarities.shape)
print("Shape of covid similarities:", covid_similarities.shape)
print("Shape of diabetes similarities:", diabetes_similarities.shape)

Calculating semantic similarities...
Shape of smoking similarities: (3086, 24)
Shape of covid similarities: (3086, 36)
Shape of diabetes similarities: (3086, 24)
