## **Disease knowledge graph**

In [None]:
import os
import pandas as pd

csv_disease_symptoms = "data/disease_knowledge_graph/Final_Augmented_dataset_Diseases_and_Symptoms.csv"
df_symptoms = pd.read_csv(csv_disease_symptoms)

In [None]:
keys = df_symptoms["diseases"].unique()
keys = [key.lower() for key in keys]
print(f"Number of unique diseases: {len(keys)}")
print(f"First 10 unique diseases: {keys}")

Number of unique diseases: 773
First 10 unique diseases: ['panic disorder', 'vocal cord polyp', 'turner syndrome', 'cryptorchidism', 'poisoning due to ethylene glycol', 'atrophic vaginitis', 'fracture of the hand', 'cellulitis or abscess of mouth', 'eye alignment disorder', 'headache after lumbar puncture', 'pyloric stenosis', 'salivary gland disorder', 'osteochondrosis', 'injury to the knee', 'metabolic disorder', 'vaginitis', 'sick sinus syndrome', 'tinnitus of unknown cause', 'glaucoma', 'eating disorder', 'transient ischemic attack', 'pyelonephritis', 'rotator cuff injury', 'chronic pain disorder', 'problem during pregnancy', 'liver cancer', 'atelectasis', 'injury to the hand', 'choledocholithiasis', 'injury to the hip', 'cirrhosis', 'thoracic aortic aneurysm', 'subdural hemorrhage', 'diabetic retinopathy', 'fibromyalgia', 'ischemia of the bowel', 'fetal alcohol syndrome', 'peritonitis', 'injury to the abdomen', 'acute pancreatitis', 'thrombophlebitis', 'asthma', 'foreign body in t

In [9]:
keys = df_symptoms.columns[1:]
print(f"Number of unique symptoms: {len(keys)}")
print(f"First 10 unique symptoms: {keys[:10]}")
# Save to CSV
df_keys = pd.DataFrame(keys, columns=["symptoms"])
df_keys.to_csv("unique_symptoms.csv", index=False)

Number of unique symptoms: 377
First 10 unique symptoms: Index(['anxiety and nervousness', 'depression', 'shortness of breath',
       'depressive or psychotic symptoms', 'sharp chest pain', 'dizziness',
       'insomnia', 'abnormal involuntary movements', 'chest tightness',
       'palpitations'],
      dtype='object')


In [4]:
import pandas as pd

# Assuming you already have a DataFrame named df_symptoms with a 'diseases' column
keys = df_symptoms["diseases"].unique()
keys = [key.lower() for key in keys]

print(f"Number of unique diseases: {len(keys)}")
print(f"First 10 unique diseases: {keys[:10]}")

# Save to CSV
df_keys = pd.DataFrame(keys, columns=["disease"])
df_keys.to_csv("unique_diseases.csv", index=False)

print("✅ Disease list saved to 'unique_diseases.csv'")


Number of unique diseases: 773
First 10 unique diseases: ['panic disorder', 'vocal cord polyp', 'turner syndrome', 'cryptorchidism', 'poisoning due to ethylene glycol', 'atrophic vaginitis', 'fracture of the hand', 'cellulitis or abscess of mouth', 'eye alignment disorder', 'headache after lumbar puncture']
✅ Disease list saved to 'unique_diseases.csv'


In [20]:
import pandas as pd
import re

# 1. Extract and lowercase unique disease names
keys = df_symptoms["diseases"].unique()
keys = [key.lower() for key in keys]

# 2. Extract aliases
rows = []
all_aliases = set()
for disease in keys:
    aliases = set()
    base = disease.strip()
    # Check for parentheses (e.g., hocm)
    match = re.search(r"\(([^)]+)\)", base)
    if match:
        abbrev = match.group(1).strip()
        aliases.add(abbrev)
        base = re.sub(r"\s*\([^)]*\)", "", base).strip()  # Remove the (hocm) part
        all_aliases.add(abbrev)
        
    all_aliases.add(base)
    aliases.add(base)

    rows.append({
        "original": disease,
        "aliases": ", ".join(sorted(aliases))
    })

# 3. Save as CSV
df_aliases = pd.DataFrame(rows)
df_aliases.to_csv("unique_diseases_with_aliases.csv", index=False)

df = pd.DataFrame(all_aliases)
df.to_csv("unique_aliases.csv", index=False)

print("✅ Disease list with aliases saved to 'unique_diseases_with_aliases.csv'")

✅ Disease list with aliases saved to 'unique_diseases_with_aliases.csv'


In [None]:
import pandas as pd

# Assuming df_symptoms is your original DataFrame
# Normalize disease names to lowercase
df_symptoms["diseases"] = df_symptoms["diseases"].str.lower()

# Group by disease and apply logical OR (using max for binary)
df_merged = df_symptoms.groupby("diseases").max().reset_index()

# Optional: Save to CSV
df_merged.to_csv("diseases_symptoms_merged.csv", index=False)

print(f"✅ Merged dataset shape: {df_merged.shape}")
print(df_merged.head())


In [None]:
diseases_nodes = df_symptoms["diseases"].unique()
simptoms_nodes = df_symptoms.columns[1:].unique()

In [11]:
import networkx as nx
# Create a directed graph
# create graph where each disease is connected to its symptoms

G = nx.DiGraph()
diseases_nodes = df_symptoms["diseases"].unique()
simptoms_nodes = df_symptoms.columns[1:].unique()
for disease in diseases_nodes:
    G.add_node(disease.lower(), type="disease")
for symptom in simptoms_nodes:
    G.add_node(symptom.lower(), type="symptom")
# Add edges from diseases to symptoms
for _, row in df_symptoms.iterrows():
    disease = row["diseases"].lower()
    for symptom in simptoms_nodes:
        if pd.notna(row[symptom]) and row[symptom] == 1:
            G.add_edge(disease, symptom.lower())

In [16]:
import networkx as nx
import pickle
# save the graph to a file

output_dir = "data/disease_knowledge_graph"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "disease_symptom_graph.gpickle")

with open(output_file, "wb") as f:
    pickle.dump(G, f)

print(f"Graph saved to {output_file}")

Graph saved to data/disease_knowledge_graph\disease_symptom_graph.gpickle


## **PubMed Retrieval**

In [None]:
import requests
from xml.etree import ElementTree

def search_pubmed(term, retmax=5):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    
    # Step 1: Use esearch to get PMIDs
    esearch_url = base_url + "esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": term,
        "retmax": retmax,
        "retmode": "xml"
    }
    response = requests.get(esearch_url, params=params)
    root = ElementTree.fromstring(response.content)
    id_list = [id_elem.text for id_elem in root.findall(".//Id")]

    if not id_list:
        print("No articles found.")
        return []

    # Step 2: Use efetch to get article details
    efetch_url = base_url + "efetch.fcgi"
    params = {
        "db": "pubmed",
        "id": ",".join(id_list),
        "retmode": "xml"
    }
    response = requests.get(efetch_url, params=params)
    root = ElementTree.fromstring(response.content)

    # Step 3: Parse article data
    articles = []
    for article in root.findall(".//PubmedArticle"):
        # Concatenate all ArticleTitle elements into a single string
        title_elems = article.findall(".//ArticleTitle")
        title = " ".join([elem.text.strip() for elem in title_elems if elem.text])
        abstract_elems = article.findall(".//Abstract/AbstractText")
        abstract = " ".join([elem.text.strip() for elem in abstract_elems if elem.text])
        key_words_elems = article.findall(".//Keyword")
        key_words = [elem.text.strip() for elem in key_words_elems if elem.text]
        pmid = article.findtext(".//PMID")
        articles.append({
            "PMID": pmid,
            "Title": title,
            "Abstract": abstract,
            "article": article,
            "KeyWords": key_words
        })

    return articles

# Example usage
results = search_pubmed("panic disorder", retmax=3)
for article in results:
    print(f"\nPMID: {article['PMID']}\nTitle: {article['Title']}\nAbstract: {article['Abstract']}\nKeywords: {', '.join(article['KeyWords'])}\n")



PMID: 40530069
Title: The implications of the diving response in altering carbon dioxide sensitivity as measured by changes in heart rate, respiration rate and psychological measures in panic disorder patients.
Abstract: Breath-hold divers are known for their exceptional breathing control and reduced carbon dioxide (CO This study investigated the effects of the CFI task on individuals with PD and a comparison group. Changes in heart rate, respiration rate, and psychological measures were assessed before and after a CO The results did not support the efficacy of the CFI task in reducing physiological markers of CO As hypothesized, the CFI task demonstrated anxiolytic effects in individuals with PD by reducing self-reported anxiety and panic symptoms. These findings highlight the potential of the CFI task for clinical application in the treatment of panic disorder, warranting further research with larger samples.
Keywords: anxiety, carbon dioxide sensitivity, cold facial immersion, divi

In [None]:
#show the  tree structure of the first article
from xml.etree import ElementTree as ET

first_article = results[0]['article']

print("\nTree structure of the first article:")
def print_tree(element, level=0):
    indent = "  " * level
    print(f"{indent}<{element.tag}>")
    for child in element:
        print_tree(child, level + 1)
    if element.text and element.text.strip():
        print(f"{indent}  {element.text.strip()}")
    print(f"{indent}</{element.tag}>")
print_tree(first_article)


### **PubMed Disease RAG Database creation**

In [28]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re
import pandas as pd

# Load sentence-transformer model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
disease_list = pd.read_csv("data/disease_csv_files/unique_aliases.csv")["0"].tolist()
simptoms_list = pd.read_csv("data/disease_csv_files/unique_symptoms.csv")["symptoms"].tolist()

def bert_encode(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].numpy()  # Use CLS token

def encode_text_and_match_diseases(text, top_k=5):
    encoding_dict = {}
    text_lower = text.lower()
    lists = [disease_list, simptoms_list]
    text_vec = bert_encode(text)
    encoding_dict["text_vector"] = text_vec[0].tolist()

    for idx, lst in enumerate(lists):
        lst = [item.lower() for item in lst]
        # 1. Find exact/partial disease matches in text
        exact_matches = [disease for disease in lst if re.search(rf'\b{re.escape(disease.lower())}\b', text_lower)]
        
        disease_vecs = np.vstack([bert_encode(disease) for disease in lst])
        
        # 3. Compute cosine similarities
        similarities = cosine_similarity(text_vec, disease_vecs)[0]
        top_indices = similarities.argsort()[-top_k:][::-1]
        semantic_matches = [(disease_list[i], similarities[i]) for i in top_indices]

        encoding_dict[f"exact_matches_{idx}"] = exact_matches
        encoding_dict[f"semantic_matches_{idx}"] = semantic_matches

    return encoding_dict

In [29]:
text = " Hi, my name is XXXX I m a 19year old girl and I keep getting these weird movement feelings in the centre of my stomach (inside) it feel like there is something in there. Sometimes it give me a sharp pain doesn t really hurt just a quick weird pain. At first I thought I could be pregnant but then I tooktook a pregnancy test and it came up negative, and I am also using contraception ( the implant ) so I don t think I could be pregnant, and also the last time I had sex was 5 months ago I feel like I d no if I was pregnant 5 months gone. I just want to know what it is because it is a weird and slightly uncomfortable feeling because I don t know what it is or could be. Thankyou."
results = encode_text_and_match_diseases(text, top_k=5)

  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


In [None]:
import pandas as pd
import torch
from torch_geometric.data import Data

# === Load disease–symptom data ===
df = pd.read_csv("data/disease_csv_files/diseases_symptoms_merged.csv")

# === Define node categories ===
person_node = "person"
disease_nodes = df["diseases"].unique().tolist()
symptom_nodes = df.columns[1:].tolist()

# === All nodes and mapping ===
all_nodes = [person_node] + disease_nodes + symptom_nodes
node2id = {name: i for i, name in enumerate(all_nodes)}
num_nodes = len(all_nodes)

# === Create edge list and edge types ===
edges = []
edge_types = []

# person → disease (type 0)
for disease in disease_nodes:
    edges.append([node2id[person_node], node2id[disease]])
    edge_types.append(0)

# disease → symptom (type 1)
for _, row in df.iterrows():
    disease = row["diseases"]
    for symptom in symptom_nodes:
        if row[symptom] == 1:
            edges.append([node2id[disease], node2id[symptom]])
            edge_types.append(1)

edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()  # [2, E]
edge_attr = torch.tensor(edge_types, dtype=torch.long)              # [E]

# === Node feature: just the node type (as a numeric feature vector) ===
# 0 = person, 1 = disease, 2 = symptom
node_type_feature = []
for node in all_nodes:
    if node == person_node:
        node_type_feature.append([0,1])
    elif node in disease_nodes:
        node_type_feature.append([1,0])
    else:
        node_type_feature.append([2,0])

x = torch.tensor(node_type_feature, dtype=torch.float)  # shape: [N, 1]

# === Assemble PyG graph ===
data = Data(
    x=x,                    # [N, 1], node_type as node feature
    edge_index=edge_index,  # [2, E]
    edge_attr=edge_attr     # [E]
)

# === Print summary ===
print(data)
print("Sample node features (node_type):")
print(data.x.squeeze())


Data(x=[1151, 1], edge_index=[2, 6162], edge_attr=[6162])
Sample node features (node_type):
tensor([0., 1., 1.,  ..., 2., 2., 2.])
