In [2]:
import csv
import os
from dotenv import load_dotenv

from neo4j import GraphDatabase

In [3]:
load_dotenv()

NEO4J_URI = os.getenv('NEO4J_URI_LOCAL')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME_LOCAL')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD_LOCAL')

In [4]:
# Connect and run cypher query
def execute_query(driver, cypher_query, parameters=None):
    try:
        with driver.session() as session:
            result = session.run(cypher_query, parameters)
    except Exception as e:
        print(f"Error executing query: {e}")
        return None

### Define Entities and Relationships

**Entities:**
- Healthcare Provider
- Patient
- Specialization
- Location

**Relationships:**
- TREATS
- SPECIALIZES_IN
- LOCATED_AT

In [5]:
def create_healthcare_provider_node(driver, provider, bio):
    print(f'Creating healthcare provider node...')
    print(f'- Provider: {provider}')
    print(f'- Bio: {bio}')
    
    cypher_query = '''
    MERGE (hp:HealthcareProvider {name: $provider, bio: $bio})
    '''
    
    parameters = {
        'provider': provider,
        'bio': bio
    }
    
    execute_query(driver, cypher_query, parameters)

def create_patient_node(driver, patient, patient_age, patient_gender, patient_condition):
    print(f'Creating patient node...')
    print(f'- Patient: {patient}')
    print(f'- Age: {patient_age}')
    print(f'- Gender: {patient_gender}')
    print(f'- Condition: {patient_condition}')
    
    cypher_query = '''
    MERGE (p:Patient {name: $patient, age: $patient_age, gender: $patient_gender, condition: $patient_condition})
    '''
    
    parameters = {
        'patient': patient,
        'patient_age': patient_age,
        'patient_gender': patient_gender,
        'patient_condition': patient_condition
    }
    
    execute_query(driver, cypher_query, parameters)

def create_specialization_node(driver, specialization):
    print(f'Creating specialization node...')
    print(f'- Specialization: {specialization}')
    
    cypher_query = '''
    MERGE (s:Specialization {name: $specialization})
    '''
    
    parameters = {'specialization': specialization}
    
    execute_query(driver, cypher_query, parameters)

def create_location_node(driver, location):
    print(f'Creating location node...')
    print(f'- Location: {location}')
    
    cypher_query = '''
    MERGE (l:Location {name: $location})
    '''
    
    parameters = {'location': location}
    
    execute_query(driver, cypher_query, parameters)
    

In [33]:
def create_relationships(driver, provider, patient, specialization, location):
    print(f'---- Creating relationships ----')
    print(f'\tProvider: {provider}')
    print(f'\tPatient: {patient}')
    print(f'\tSpecialization: {specialization}')
    print(f'\tLocation: {location}')
    print('---------------------------------')
    
    cypher_query = '''
    MATCH (hp:HealthcareProvider {name: $provider}), (p:Patient {name: $patient})
    MERGE (hp)-[:TREATS]->(p)
    WITH hp
    MATCH (hp), (s:Specialization {name: $specialization})
    MERGE (hp)-[:SPECIALIZES_IN]->(s)
    WITH hp
    MATCH (hp), (l:Location {name: $location})
    MERGE (hp)-[:LOCATED_AT]->(l)
    '''
    
    parameters = {
        'patient': patient,
        'provider': provider,
        'specialization': specialization,
        'location': location
    }
    
    execute_query(driver, cypher_query, parameters)

In [6]:
# Read CSV file and populate the graph
def read_csv_and_populate_graph(driver, csv_file_path):
    with open(csv_file_path, 'r') as file:
        reader = csv.DictReader(file)
        print(f'Reading CSV file {csv_file_path}. Start populating graph...')
        
        for row in reader:
            provider = row['Provider']
            patient = row['Patient']
            specialization = row['Specialization']
            location = row['Location']
            bio = row['Bio']
            patient_age = row['Patient_Age']
            patient_gender = row['Patient_Gender']
            patient_condition = row['Patient_Condition']
            
            # Create nodes
            create_healthcare_provider_node(driver, provider, bio)
            create_patient_node(driver, patient, patient_age, patient_gender, patient_condition)
            create_specialization_node(driver, specialization)
            create_location_node(driver, location)
            
            # Create relationships
            create_relationships(driver, provider, patient, specialization, location)
        
    driver.close()
    print(f'Finished populating graph!')

In [35]:
# Initialize Neo4j driver
driver = GraphDatabase.driver(
    uri=NEO4J_URI, 
    auth=(NEO4J_USERNAME, NEO4J_PASSWORD)
)
csv_file_path = 'healthcare.csv'

read_csv_and_populate_graph(driver, csv_file_path)

Reading CSV file healthcare.csv. Start populating graph...
Creating healthcare provider node...
- Provider: Dr. Jessica Lee
- Bio: Dr. Jessica Lee is a dermatologist focused on skin cancer treatment and prevention.
Creating patient node...
- Patient: Eva Blue
- Age: 66
- Gender: Male
- Condition: Asthma
Creating specialization node...
- Specialization: Pediatrics
Creating location node...
- Location: Los Angeles
---- Creating relationships ----
	Provider: Dr. Jessica Lee
	Patient: Eva Blue
	Specialization: Pediatrics
	Location: Los Angeles
---------------------------------
Creating healthcare provider node...
- Provider: Dr. Michael Brown
- Bio: Dr. Michael Brown is an orthopedic surgeon with expertise in joint replacement.
Creating patient node...
- Patient: Alice Brown
- Age: 59
- Gender: Female
- Condition: Osteoarthritis
Creating specialization node...
- Specialization: Pediatrics
Creating location node...
- Location: Los Angeles
---- Creating relationships ----
	Provider: Dr. Mich

In [7]:
from langchain_openai import ChatOpenAI

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
OPENAI_EMBEDDINGS_ENDPOINT = os.getenv('OPENAI_EMBEDDINGS_ENDPOINT')

llm = ChatOpenAI(api_key=OPENAI_API_KEY)

In [8]:
from langchain_community.graphs import Neo4jGraph

# Create knowledge graph
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
)

# Note: May need to install APOC plugin for Neo4j Desktop (enterprise version)

  kg = Neo4jGraph(


In [9]:
# Create vector index
kg.query(
    '''
    CREATE VECTOR INDEX health_providers_embeddings IF NOT EXISTS
    FOR (hp:HealthcareProvider) ON (hp.comprehensiveEmbedding)
    OPTIONS {
      indexConfig: {
        `vector.dimensions`: 1536,
        `vector.similarity_function`: 'cosine'
      }
    }
    '''
)

[]

In [10]:
print(kg.query('SHOW VECTOR INDEXES'))

[{'id': 2, 'name': 'health_providers_embeddings', 'state': 'ONLINE', 'populationPercent': 100.0, 'type': 'VECTOR', 'entityType': 'NODE', 'labelsOrTypes': ['HealthcareProvider'], 'properties': ['comprehensiveEmbedding'], 'indexProvider': 'vector-2.0', 'owningConstraint': None, 'lastRead': None, 'readCount': 0}]


In [11]:
# Embedding bio field in every healthcare provider node
kg.query(
    '''
    MATCH (hp:HealthcareProvider)-[:TREATS]->(p:Patient)
    WHERE hp.bio IS NOT NULL
    WITH hp, genai.vector.encode(
        hp.bio,
        "OpenAI",
        {
          token: $openAiApiKey,
          endpoint: $openAiEmbeddingsEndpoint
        }) AS vector
    WITH hp, vector
    WHERE vector IS NOT NULL
    CALL db.create.setNodeVectorProperty(hp, "comprehensiveEmbedding", vector)
    ''',
    params={
        'openAiApiKey': OPENAI_API_KEY,
        'openAiEmbeddingsEndpoint': OPENAI_EMBEDDINGS_ENDPOINT,
    },
)

# Note: genai.vector.encode for Neo4j Desktop (enterprise version) is activate by:
# Move or copy the genai plugins (.jar files) from <NEO4J_HOME>/products 
# to the <NEO4J_HOME>/plugins directory

ClientError: {code: Neo.ClientError.Procedure.ProcedureCallFailed} {message: Failed to invoke function `genai.vector.encode`: Caused by: org.neo4j.genai.vector.MalformedGenAIResponseException: Unexpected HTTP response code: 504 Gateway Time-out - error code: 504}

In [13]:
# Test if embedding is created successfully
result = kg.query('''
    MATCH (hp:HealthcareProvider)
    WHERE hp.bio IS NOT NULL
    RETURN hp.bio, hp.name, hp.comprehensiveEmbedding
    LIMIT 5
''')

for record in result:
    print(f"Name: {record['hp.name']}")
    print(f"Bio: {record['hp.bio']}")
    print(f"Embedding: {record['hp.comprehensiveEmbedding']}")
    print("---")

Name: Dr. Jessica Lee
Bio: Dr. Jessica Lee is a dermatologist focused on skin cancer treatment and prevention.
Embedding: [-0.001317372894845903, 0.006131031550467014, 0.016967637464404106, -0.018396228551864624, -0.016503972932696342, 0.008903623558580875, -0.0060464441776275635, 0.008239454589784145, 0.005068987607955933, -0.0017606744077056646, 0.0007620712276548147, 0.004060202743858099, 0.0011709111277014017, -0.0006704347324557602, -0.013521477580070496, 0.011409921571612358, 0.028872553259134293, -0.021905044093728065, 0.002099024597555399, -0.011792132630944252, -0.026165751740336418, -0.010269556194543839, -0.012337252497673035, -0.004683644510805607, 0.002808620221912861, 0.009191848337650299, 0.0005854555638507009, -0.020000258460640907, -0.005526387132704258, 0.013183128088712692, 0.010576577857136726, 0.008214391767978668, -0.0038941605016589165, -0.005025127436965704, 0.008627930656075478, 0.026140687987208366, 0.008176797069609165, -0.0125252241268754, 0.0111028999090194

In [15]:
# Query
question = 'Give me a list of healthcare providers whose speciality is Cardiology'

# Embedding the question and retrieve from knowledge graph
result = kg.query(
    '''
    WITH genai.vector.encode(
        $question,
        "OpenAI",
        {
          token: $openAiApiKey,
          endpoint: $openAiEndpoint
        }) AS question_embedding
    CALL db.index.vector.queryNodes(
        'health_providers_embeddings',
        $top_k,
        question_embedding
        ) YIELD node AS healthcare_provider, score
    RETURN healthcare_provider.name, healthcare_provider.bio, score
    ''',
    params={
        'openAiApiKey': OPENAI_API_KEY,
        'openAiEndpoint': OPENAI_EMBEDDINGS_ENDPOINT,
        'question': question,
        'top_k': 3,
    },
)

for record in result:
    print('Name:', record['healthcare_provider.name'])
    print('Bio:', record['healthcare_provider.bio'])
    print('Score:', record['score'])
    print('---')

Name: Dr. John Smith
Bio: Dr. John Smith is a renowned cardiologist with over 20 years of experience.
Score: 0.9129180908203125
---
Name: Dr. Sarah Johnson
Bio: Dr. Sarah Johnson is a pediatrician known for her compassionate care.
Score: 0.8825836181640625
---
Name: Dr. Emily Davis
Bio: Dr. Emily Davis specializes in neurology and has published numerous research papers.
Score: 0.877777099609375
---
