## NLP PubMed Miner

### 3.1 Importing Libraries

In [1]:
import pandas as pd
import ast
from collections import Counter
import itertools

### 3.2 Loading Data

In [2]:
df = pd.read_csv("../Pubmed-NLP/data/abstracts_with_entities.csv")

In [3]:
# Convert 'Entities' and 'Countries' columns from string to list
df["Entities"] = df["Entities"].apply(ast.literal_eval)
df["Countries"] = df["Countries"].apply(ast.literal_eval)

In [4]:
# Filter rows where both entities and countries exist
df_clean = df[(df["Entities"].str.len() > 0) & (df["Countries"].str.len() > 0)].copy()

### 3.3 Extract Possible Symptom–Region–Disease Triplets

In [5]:
# WHITELIST of valid clinical terms (symptoms/diseases)
CLINICAL_TERMS = {
    "fever", "rash", "vomiting", "diarrhea", "cough", "headache", "fatigue",
    "dengue", "zika", "malaria", "cholera", "typhoid", "jaundice", "influenza",
    "sepsis", "tuberculosis", "hepatitis", "asthma", "covid-19",
    "sore throat", "shortness of breath", "nausea", "pain", "infection"
}

In [6]:
# Clean each entity to keep only whitelist matches
def clean_entity(e):
    e = e.lower().strip()
    return e in CLINICAL_TERMS

# Count triplets: (symptom, disease, country)
triplet_counter = Counter()

for _, row in df_clean.iterrows():
    entities = row["Entities"]
    countries = row["Countries"]

    if not entities or not countries:
        continue

    # Filter to only valid clinical terms
    cleaned_entities = [e.lower().strip() for e in entities if clean_entity(e)]
    
    if len(cleaned_entities) < 2:
        continue  # Need at least a pair

    for country in countries:
        country = country.lower()
        for pair in itertools.combinations(set(cleaned_entities), 2):
            symptom, disease = tuple(sorted(pair))
            triplet = (symptom, disease, country)
            triplet_counter[triplet] += 1


In [7]:
# Create co-occurrence DataFrame
cooc_df = pd.DataFrame([
    {"Symptom": t[0], "Disease": t[1], "Country": t[2], "Count": c}
    for t, c in triplet_counter.items()
])
# Sort and view top 15 patterns
cooc_df = cooc_df.sort_values(by="Count", ascending=False)
cooc_df.head(15)

Unnamed: 0,Symptom,Disease,Country,Count
4,fever,vomiting,india,5
5,dengue,fever,india,5
19,fever,headache,india,3
6,dengue,vomiting,india,3
29,dengue,malaria,india,2
53,headache,vomiting,india,2
26,fever,infection,india,2
28,fever,malaria,india,2
56,dengue,zika,india,2
43,dengue,vomiting,burkina faso,1


### 3.4 Save Output

In [8]:
cooc_df.to_csv("../Pubmed-NLP/data/cooccurrence_matrix_filtered.csv", index=False)