## NLP PubMed Miner

### 2.1 Importing Libraries

In [1]:
import re

import pandas as pd
import pycountry
import spacy

### 2.2 Loading Data

In [2]:
# Load the PubMed records collected earlier.
# pandas is already imported as `pd` in the imports cell, so it is not re-imported here.
df = pd.read_csv("../Pubmed-NLP/data/pubmed_fever_india.csv")
df.head()

Unnamed: 0,PMID,Title,Abstract,Date
0,40747162,Methicillin-Resistant Staphylococcus aureus Re...,Renal abscesses are rare in children. This rep...,2025 Jul
1,40746790,Diverse Manifestations of Central Nervous Syst...,OBJECTIVES: This study aims to evaluate and ch...,2025 Jun
2,40742524,Anti-chikungunya Activity of a Cinnamic Acid D...,Chikungunya virus (CHIKV) is an arthropod-born...,2025 Jul 31
3,40742003,Paradoxical Hemoglobin Drop Post-Transfusion: ...,"Hyperhaemolysis syndrome (HHS) is a rare, seve...",2025 Jul 31
4,40741554,Clinico-Epidemiological Profile of Dengue in C...,Background Dengue has emerged as the most comm...,2025 Jun


### 2.3 Load scispaCy and Extract Entities

In [3]:
# Load the scispaCy `en_core_sci_md` pipeline once; it is reused for every abstract.
nlp = spacy.load(name="en_core_sci_md")

def extract_entities(text):
    """Return the lower-cased entities that the `nlp` pipeline finds in `text`.

    Parameters
    ----------
    text : object
        Usually an abstract string. Anything that is not a `str`
        (None, NaN, `pd.NA`, numbers, lists, ...) is treated as "no text".

    Returns
    -------
    list of str
        Lower-cased entity texts, in document order, keeping only entities
        whose text is longer than two characters. Empty for non-string input.
    """
    # The type check alone covers missing values (None / NaN / pd.NA are not
    # str). Calling pd.isna() first would raise on list-like cells because the
    # resulting boolean array has no single truth value.
    if not isinstance(text, str):
        return []

    doc = nlp(text)
    # Drop very short spans (<= 2 characters).
    return [ent.text.lower() for ent in doc.ents if len(ent.text) > 2]

### 2.4 Applying Entity Extraction to the DataFrame

In [4]:
# Run entity extraction on each abstract and store the resulting list per row.
df["Entities"] = df.loc[:, "Abstract"].apply(extract_entities)

# Preview titles next to their extracted entities.
df.loc[:, ["Title", "Entities"]].head()

Unnamed: 0,Title,Entities
0,Methicillin-Resistant Staphylococcus aureus Re...,"[renal abscesses, children, report, case, rena..."
1,Diverse Manifestations of Central Nervous Syst...,"[objectives, study, evaluate, characterize, di..."
2,Anti-chikungunya Activity of a Cinnamic Acid D...,"[chikungunya virus, chikv, arthropod-borne vir..."
3,Paradoxical Hemoglobin Drop Post-Transfusion: ...,"[hyperhaemolysis syndrome, hhs, rare, severe, ..."
4,Clinico-Epidemiological Profile of Dengue in C...,"[background, dengue, arboviral infection, indi..."


### 2.5 Add Country Matching

In [5]:
# Names of every country known to pycountry, used for rule-based matching below.
country_list = [entry.name for entry in pycountry.countries]

def extract_countries(text, countries=None):
    """Return the country names that occur as whole words in `text`.

    Matching is case-insensitive and anchored on word boundaries, so a name
    is only reported when it is not embedded in a longer word. Plain substring
    matching would report "Oman" for "woman", "Mali" for "malignant", "Cuba"
    for "incubation" and "Niger" for "Nigeria". As a consequence, adjectival
    forms such as "Indian" are not counted as a mention of "India".

    Parameters
    ----------
    text : object
        Usually an abstract string. Anything that is not a `str`
        (None, NaN, numbers, ...) yields an empty list.
    countries : iterable of str, optional
        Names to look for. Defaults to the module-level `country_list`.

    Returns
    -------
    list of str
        Matching names with their original casing, in the order they appear
        in `countries`.
    """
    if not isinstance(text, str):
        return []
    if countries is None:
        countries = country_list

    # Lower-case the text once instead of once per country.
    haystack = text.lower()

    found = []
    for country in countries:
        needle = country.lower()
        # Cheap substring pre-filter so the regex only runs on candidates.
        if needle not in haystack:
            continue
        # Require that the name is not preceded or followed by a word character.
        pattern = r"(?<!\w)" + re.escape(needle) + r"(?!\w)"
        if re.search(pattern, haystack):
            found.append(country)
    return found

# Tag each abstract with the countries it mentions.
df["Countries"] = df["Abstract"].apply(extract_countries)

### 2.6 Save Output

In [6]:
# Persist the enriched table without the row index.
# Note: list-valued columns are written to CSV as their string representation.
df.to_csv(
    path_or_buf="../Pubmed-NLP/data/abstracts_with_entities.csv",
    index=False,
)