<a href="https://colab.research.google.com/github/danebencedavid/NLP-A-Agent/blob/master/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Imports
import kagglehub
import os
import pandas as pd
import spacy
from nltk.corpus import wordnet
from tqdm import tqdm
import re
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import requests
from sentence_transformers import SentenceTransformer
import numpy as np

**Downloading the metadata.csv for CORD-19**.

In [None]:

# Fetch the CORD-19 dataset-metadata files through kagglehub. `path` holds the
# value returned by dataset_download and is used below as the directory to list.
path = kagglehub.dataset_download("googleai/dataset-metadata-for-cord19")

print("Path to dataset files:", path)

In [None]:
# Show which files are present in the downloaded dataset directory.
os.listdir(path)

In [None]:
# os.listdir returns entries in arbitrary order, so sort the listing and prefer
# a CSV file to make the selection reproducible; fall back to the first entry
# if no ".csv" file is present. os.path.join keeps the path portable.
dataset_files = sorted(os.listdir(path))
csv_files = [name for name in dataset_files if name.lower().endswith(".csv")]
filename_with_path = os.path.join(path, (csv_files or dataset_files)[0])
filename_with_path

In [None]:
# Load the selected metadata file into a DataFrame.
df_meta_cord19 = pd.read_csv(filename_with_path)

In [None]:
# Keep only rows that have a description. The explicit .copy() makes the result
# an independent frame, so adding columns to it later does not raise pandas'
# chained-assignment (SettingWithCopy) warning.
df_meta_cord19 = df_meta_cord19[df_meta_cord19['description'].notnull()].copy()

**Extracting keywords from the *description* column.**

In [None]:
# Load the spaCy pipeline used by spacy_keywords to obtain noun chunks.
nlp = spacy.load("en_core_web_sm")

In [None]:
def spacy_keywords(text, max_keywords=10):
    """Extract noun-chunk keywords from each abstract.

    Args:
        text: Iterable of abstract strings (for example a pandas Series).
            Missing values (NaN/None) are allowed.
        max_keywords: Maximum number of noun chunks kept per abstract.

    Returns:
        list[list[str]]: One list per input item, in input order, holding the
        lower-cased text of the first ``max_keywords`` noun chunks. Missing
        abstracts yield an empty list.
    """
    # Missing abstracts are replaced by "" so that input and output stay
    # aligned; an empty document has no noun chunks and therefore yields [].
    abstracts = ["" if pd.isna(abstract) else abstract for abstract in text]

    keywords = []
    # nlp.pipe processes the texts in batches, which is considerably faster
    # than calling nlp() once per abstract.
    docs = nlp.pipe(abstracts)
    for doc in tqdm(docs, total=len(abstracts), desc="Extracting keywords", unit="abstracts"):
        keywords.append([chunk.text.lower() for chunk in doc.noun_chunks][:max_keywords])
    return keywords

In [None]:
# Store one list of extracted noun-chunk keywords per description.
df_meta_cord19['keywords'] = spacy_keywords(df_meta_cord19['description'])

In [None]:
# Preview the first row by position: rows were filtered earlier, so the index
# label 0 is not guaranteed to exist and a label lookup could raise KeyError.
df_meta_cord19['keywords'].iloc[0]

**Keyword Preprocessing**

In [None]:
# Lemmatizer shared by preprocess_keyword.
lemmatizer = WordNetLemmatizer()
# Download the WordNet corpus through NLTK (used by the lemmatizer above and by
# the wordnet.synsets lookups in get_synonyms).
nltk.download('wordnet')

def preprocess_keyword(keyword):
    """Normalise a keyword phrase.

    The phrase is lower-cased, every character that is neither a word
    character nor whitespace is removed, and each remaining token is
    lemmatized with the shared ``lemmatizer``. Tokens are re-joined with
    single spaces.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', keyword.lower())
    lemmas = (lemmatizer.lemmatize(token) for token in without_punctuation.split())
    return ' '.join(lemmas).strip()

In [None]:
# Normalise every keyword and de-duplicate per row. dict.fromkeys keeps the
# first-seen order (a set's iteration order can differ between runs), and
# keywords that normalise to an empty string are dropped.
df_meta_cord19['processed_keywords'] = df_meta_cord19['keywords'].apply(
    lambda lst: list(dict.fromkeys(k for k in map(preprocess_keyword, lst) if k))
)

In [None]:
# Preview the first row by position: rows were filtered earlier, so the index
# label 0 is not guaranteed to exist and a label lookup could raise KeyError.
df_meta_cord19['processed_keywords'].iloc[0]

**Keyword expansion**

In [None]:
# Seed vocabularies for the three topics; they are expanded further below with
# WordNet synonyms and Wikidata labels.
smoking_terms = {"smoking", "tobacco", "cigarette"}
covid_terms = {"covid", "sars-cov-2", "coronavirus"}
# "disadvantage" was previously misspelled as "disadventage".
socio_terms = {"poverty", "income", "education", "social class", "disadvantage", "inequality"}

def get_synonyms(seed_term):
    """Collect WordNet lemma names for ``seed_term``.

    Every lemma of every synset found for the seed term is lower-cased and has
    its underscores replaced by spaces; only names of at most two words are
    kept. Any exception is reported with ``print`` and the names gathered up
    to that point are returned.

    Returns:
        set[str]: The collected synonym strings (possibly empty).
    """
    synonyms = set()
    try:
        lemma_names = (
            lemma.name().lower().replace('_', ' ')
            for synset in wordnet.synsets(seed_term)
            for lemma in synset.lemmas()
        )
        for candidate in lemma_names:
            word_count = len(candidate.split())
            if word_count <= 2:
                synonyms.add(candidate)
    except Exception as e:
        print(f"Error fetching synonyms for {seed_term}: {e}")
    return synonyms

In [None]:
def query_wikidata(keyword):
    """Search Wikidata entities for ``keyword`` and return their labels.

    Calls the ``wbsearchentities`` action of the Wikidata API (English,
    JSON format) with a 10 second timeout.

    Args:
        keyword: Search string; it is URL-encoded by ``requests``.

    Returns:
        set[str]: Lower-cased labels of the entities in the ``search`` part of
        the response. On any error a message is printed and the labels
        collected so far (possibly none) are returned. Errors are not raised.
    """
    terms = set()
    try:
        response = requests.get(
            "https://www.wikidata.org/w/api.php",
            # Passing the query as params lets requests escape spaces, '&',
            # '#', etc. instead of splicing the raw keyword into the URL.
            params={
                "action": "wbsearchentities",
                "search": keyword,
                "language": "en",
                "format": "json",
            },
            timeout=10,
        )
        # Surface HTTP errors (4xx/5xx) instead of trying to parse an error page.
        response.raise_for_status()
        for item in response.json().get('search', []):
            # Skip results without a label rather than discarding all of them.
            label = item.get('label')
            if label:
                terms.add(label.lower())
    except Exception as e:
        print(f"Wikidata API error: {e}")
    return terms

In [None]:
# Grow each seed set in place with WordNet synonyms and/or Wikidata labels.
smoking_terms |= get_synonyms("smoking") | query_wikidata("tobacco")
covid_terms |= get_synonyms("covid") | query_wikidata("SARS-CoV-2")
# One Wikidata lookup per socio-economic seed phrase, issued in this order.
socio_terms.update(*(
    query_wikidata(phrase)
    for phrase in ("poverty", "income", "education", "social class", "disadvantage", "inequality")
))

**Filtering papers**

In [None]:
# Sentence-embedding model used to encode both the topic terms and the text
# passed to semantic_match.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Encode each topic's term set once, up front, instead of once per paper.
# NOTE(review): sets are converted with list(), so the order of the encoded
# terms is not fixed between runs.
smoking_embeddings = model.encode(list(smoking_terms))
covid_embeddings = model.encode(list(covid_terms))
third_topic_embeddings = model.encode(list(socio_terms))

In [None]:
def semantic_match(text, term_embeddings, threshold=0.75):
    """Report whether ``text`` is semantically close to any topic term.

    Args:
        text: A single string or a list-like of keyword strings (the
            ``processed_keywords`` column holds lists). ``None``, NaN, empty
            collections and blank strings never match.
        term_embeddings: Array of term embeddings produced by ``model.encode``.
        threshold: Minimum similarity score required for a match.

    Returns:
        bool: True if the highest dot product between any text embedding and
        any term embedding is at least ``threshold``.
    """
    if text is None:
        return False
    if isinstance(text, str):
        candidates = [text]
    elif isinstance(text, (list, tuple, set, np.ndarray, pd.Series)):
        # pd.isna() on a collection returns an array whose truth value is
        # ambiguous, so collections are filtered element by element instead.
        candidates = [item for item in text if isinstance(item, str)]
    elif pd.isna(text):
        return False
    else:
        candidates = [str(text)]

    candidates = [item for item in candidates if item.strip()]
    # Nothing to compare: avoid calling np.max on an empty array.
    if not candidates or len(term_embeddings) == 0:
        return False

    text_embed = model.encode(candidates)
    # NOTE(review): a dot product equals cosine similarity only for unit-length
    # embeddings. Confirm that the configured model normalises its output.
    similarities = np.dot(text_embed, term_embeddings.T)
    # Convert numpy's bool_ to a plain Python bool.
    return bool(np.max(similarities) >= threshold)

In [None]:
# A paper is selected only if its keywords match all three topics.
# The third check must receive the encoded socio-economic terms
# (third_topic_embeddings); passing the raw socio_terms set of strings fails
# because semantic_match needs an embedding array.
semantic_mask = df_meta_cord19['processed_keywords'].apply(
    lambda x: (
        semantic_match(x, smoking_embeddings) and
        semantic_match(x, covid_embeddings) and
        semantic_match(x, third_topic_embeddings)
    )
)
