In [1]:
import os
import sys
import configparser

project_root = os.path.abspath(os.path.join(os.getcwd(), "../"))
sys.path.append(project_root)

from database.neo4j_db import Neo4jGraphDB

# Database Configuration

In [2]:
config = configparser.ConfigParser()
config.read('../config.ini')
neo4j_graph = Neo4jGraphDB(database=config["neo4j"]["database"])
driver = neo4j_graph._driver
with driver.session(database=neo4j_graph._database) as session:
    result = session.run("MATCH (n) RETURN count(n) AS node_count")
    node_count = result.single()["node_count"]
print(f"Total number of nodes in the Neo4j database: {node_count}")


Total number of nodes in the Neo4j database: 93325


# Data Exploration

In [3]:
# Lyme disease (mapped and non-mapped) phenotypes
with driver.session(database=neo4j_graph._database) as session:
    query = """
    MATCH (d:HpoDisease)
    WHERE d.label CONTAINS "Lyme"

    // 1) ALL phenotypes for the disease
    OPTIONAL MATCH (d)-[:HAS_PHENOTYPIC_FEATURE]->(r_all)
    WITH d, collect(DISTINCT r_all.label) AS allPhenotypes

    // 2) ONLY phenotypes that satisfy UMLS→ICD
    OPTIONAL MATCH (d)-[:HAS_PHENOTYPIC_FEATURE]->(r)
    <-[:UMLS_TO_HPO_PHENOTYPE]-(x:UMLS)-[:UMLS_TO_ICD]-()
    WITH d, allPhenotypes, collect(DISTINCT r.label) AS phenotypes

    // 3) Difference: in allPhenotypes but NOT in phenotypes
    WITH d, phenotypes, allPhenotypes,
        [p IN allPhenotypes WHERE NOT p IN phenotypes] AS otherPhenotypes

    RETURN d.label AS disease,
        phenotypes,        // filtered by UMLS→ICD
        allPhenotypes,     // full set
        otherPhenotypes    // full minus filtered
    ORDER BY size(disease) ASC;
    """
    result = session.run(query)
    for record in result:
        disease = record["disease"]
        phenotypes = record["phenotypes"]
        all_phenotypes = record["allPhenotypes"]
        other_phenotypes = record["otherPhenotypes"]
        print(f"Disease: {disease}")
        print(f"Phenotypes (UMLS→ICD mapped): {phenotypes}")
        print(f"All Phenotypes: {all_phenotypes}")
        print(f"Other Phenotypes: {other_phenotypes}")
        print()

Disease: Lyme disease
Phenotypes (UMLS→ICD mapped): ['Dermal atrophy', 'Peripheral neuropathy', 'Meningitis', 'Atypical behavior', 'Uveitis', 'Nausea and vomiting', 'Arrhythmia', 'Memory impairment', 'Headache', 'Arthritis', 'Fever', 'Atrioventricular block', 'Myalgia', 'Arthralgia', 'Fatigue', 'Insomnia', 'Amaurosis fugax']
All Phenotypes: ['Dermal atrophy', 'Peripheral neuropathy', 'Meningitis', 'Muscle weakness', 'Photophobia', 'Atypical behavior', 'Uveitis', 'Nausea and vomiting', 'Arrhythmia', 'Infectious encephalitis', 'Memory impairment', 'Headache', 'Skin nodule', 'Joint swelling', 'Arthritis', 'Fever', 'Atrioventricular block', 'Myalgia', 'Paresthesia', 'Arthralgia', 'Cranial nerve paralysis', 'Fatigue', 'Insomnia', 'Amaurosis fugax']
Other Phenotypes: ['Muscle weakness', 'Photophobia', 'Infectious encephalitis', 'Skin nodule', 'Joint swelling', 'Paresthesia', 'Cranial nerve paralysis']



In [4]:
# Check if subclasses of the phenotypes have mapping wiht ICD
mapped_phenotypes = ['Dermal atrophy', 'Peripheral neuropathy', 'Meningitis', 'Atypical behavior', 'Uveitis', 'Nausea and vomiting', 'Arrhythmia', 'Memory impairment', 'Headache', 'Arthritis', 'Fever', 'Atrioventricular block', 'Myalgia', 'Arthralgia', 'Fatigue', 'Insomnia', 'Amaurosis fugax']
with driver.session(database=neo4j_graph._database) as session:
    for phenotype in mapped_phenotypes:
        query = """
        MATCH (p:HpoPhenotype {label: $phenotype})
        OPTIONAL MATCH (p)<-[:SUBCLASSOF*]-(subclass:HpoPhenotype)
        WHERE EXISTS {
            MATCH (subclass)<-[:UMLS_TO_HPO_PHENOTYPE]-(u:UMLS)-[:UMLS_TO_ICD]-()
        }
        RETURN subclass.label AS subclass_label
        """
        result = session.run(query, phenotype=phenotype)
        subclasses_with_icd = [record["subclass_label"] for record in result if record["subclass_label"] is not None]
        print(f"Phenotype: {phenotype}")
        print(f"Subclasses with UMLS→ICD mapping: {subclasses_with_icd}")
        print()

Phenotype: Dermal atrophy
Subclasses with UMLS→ICD mapping: ['Striae distensae']

Phenotype: Peripheral neuropathy
Subclasses with UMLS→ICD mapping: ['Polyneuropathy', 'Mononeuropathy', 'Constrictive median neuropathy', 'Multiple mononeuropathy']

Phenotype: Meningitis
Subclasses with UMLS→ICD mapping: ['Fungal meningitis', 'Cryptococcal meningitis', 'Coccidioidal meningitis']

Phenotype: Atypical behavior
Subclasses with UMLS→ICD mapping: ['Hair-pulling', 'Nail-biting', 'Bruxism', 'Disinhibition', 'Hyperactivity', 'Attention deficit hyperactivity disorder', 'Aggressive behavior', 'Violent behavior', 'Oppositional defiant disorder', 'Nail-biting', 'Compulsive stealing', 'Excessive fire setting', 'Anorexia', 'Polydipsia', 'Polyphagia', 'Bulimia', 'Addictive nicotine use', 'Addictive cocaine use', 'Addictive opioid use', 'Addictive cannabis use', 'Addictive alcohol use', 'Addictive gambling behavior', 'Personality disorder', 'Histrionic personality disorder', 'Borderline personality diso