<a href="https://colab.research.google.com/github/danebencedavid/NLP-A-Agent/blob/master/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [52]:
#Imports
import kagglehub
import os
import pandas as pd
import spacy
from nltk.corpus import wordnet
from tqdm import tqdm
import re
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import requests
from sentence_transformers import SentenceTransformer
import numpy as np

**Downloading the metadata.csv for CORD-19**.

In [4]:

path = kagglehub.dataset_download("googleai/dataset-metadata-for-cord19")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/dataset-metadata-for-cord19


In [5]:
os.listdir(path)

['CORD19 datasets - Sheet 1.csv']

In [6]:
filename_with_path = path + "/" + os.listdir(path)[0]
filename_with_path

'/kaggle/input/dataset-metadata-for-cord19/CORD19 datasets - Sheet 1.csv'

In [7]:
df_meta_cord19 = pd.read_csv(filename_with_path)

In [8]:
df_meta_cord19 = df_meta_cord19[df_meta_cord19['description'].notnull()]

**Extracting keywords from  *description* column.**

In [10]:
nlp = spacy.load("en_core_web_sm")

In [19]:
def spacy_keywords(text):
    keywords = []
    for text in tqdm(text, desc="Extracting keywords", unit="abstracts"):
        if pd.isna(text):
            keywords.append([])
        else:
            doc = nlp(text)
            keywords.append([chunk.text.lower() for chunk in doc.noun_chunks][:10])
    return keywords

In [20]:
df_meta_cord19['keywords'] = spacy_keywords(df_meta_cord19['description'])

Extracting keywords: 100%|██████████| 14126/14126 [05:39<00:00, 41.64abstracts/s]


In [23]:
df_meta_cord19['keywords'][0]

['["rt-pcr data',
 'comparative viral loads/ tissue levels',
 'ferrets',
 'either niv-my',
 'niv-bd - data',
 'the form',
 'spreadsheets',
 'calculated niv copies',
 'sample',
 'animal']

**Keyword Preprocessing**

In [28]:
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')

def preprocess_keyword(keyword):
    keyword = re.sub(r'[^\w\s]', '', keyword.lower())
    keyword = ' '.join([lemmatizer.lemmatize(word) for word in keyword.split()])
    return keyword.strip()

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [29]:
df_meta_cord19['processed_keywords'] = df_meta_cord19['keywords'].apply(lambda lst: list(set([preprocess_keyword(k) for k in lst])))

In [32]:
df_meta_cord19['processed_keywords'][0]

['nivbd data',
 'the form',
 'calculated niv copy',
 'animal',
 'ferret',
 'sample',
 'spreadsheet',
 'rtpcr data',
 'either nivmy',
 'comparative viral load tissue level']

**Keyword expansion**

In [44]:
smoking_terms = {"smoking", "tobacco", "cigarette"}
covid_terms = {"covid", "sars-cov-2", "coronavirus"}
socio_terms = {"poverty", "income", "education", "social class", "disadventage","inequality"}

def get_synonyms(seed_term):
    synonyms = set()
    try:
        for syn in wordnet.synsets(seed_term):
            for lemma in syn.lemmas():
                term = lemma.name().lower().replace('_', ' ')
                if len(term.split()) <= 2:
                    synonyms.add(term)
    except Exception as e:
        print(f"Error fetching synonyms for {seed_term}: {e}")
    return synonyms

In [45]:
def query_wikidata(keyword):
    terms = set()
    try:
        url = f"https://www.wikidata.org/w/api.php?action=wbsearchentities&search={keyword}&language=en&format=json"
        response = requests.get(url, timeout=10).json()
        terms.update(item['label'].lower() for item in response.get('search', []))
    except Exception as e:
        print(f"Wikidata API error: {e}")
    return terms

In [46]:
smoking_terms.update(get_synonyms("smoking"))
smoking_terms.update(query_wikidata("tobacco"))
covid_terms.update(get_synonyms("covid"))
covid_terms.update(query_wikidata("SARS-CoV-2"))
socio_terms.update(query_wikidata("poverty"))
socio_terms.update(query_wikidata("income"))
socio_terms.update(query_wikidata("education"))
socio_terms.update(query_wikidata("social class"))
socio_terms.update(query_wikidata("disadvantage"))
socio_terms.update(query_wikidata("inequality"))

**Filtering papers**

In [53]:
model = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [54]:
smoking_embeddings = model.encode(list(smoking_terms))
covid_embeddings = model.encode(list(covid_terms))
third_topic_embeddings = model.encode(list(socio_terms))

In [55]:
def semantic_match(text, term_embeddings, threshold=0.75):
    if pd.isna(text):
        return False
    text_embed = model.encode(text)
    similarities = np.dot(text_embed, term_embeddings.T)
    return np.max(similarities) >= threshold

In [None]:
semantic_mask = df_meta_cord19['processed_keywords'].apply(
    lambda x: (
        semantic_match(x, smoking_embeddings) and
        semantic_match(x, covid_embeddings) and
        semantic_match(x, socio_terms)
    )
)
