# Module objectives
- Creating a graph from structured data input
- Basic graph algorithms
- Text embeddings for semantic analysis
- Feature engineering
- Node embeddings


In [1]:
!pip install graphdatascience neo4j dotenv langchain langchain_openai



Import our usual suspects (and some more...)

In [2]:
import json
import os

import pandas as pd
from dotenv import load_dotenv
from graphdatascience import GraphDataScience
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from neo4j import Query, GraphDatabase, RoutingControl, Result

# Setup

Load env variables

In [3]:
env_file = 'ws.env'

In [4]:
# Load settings from the env file when it exists. Otherwise fall back to whatever
# is already present in the process environment, so the names below are always
# defined for the following cells.
if os.path.exists(env_file):
    load_dotenv(env_file, override=True)
else:
    print(f"File {env_file} not found.")

# Neo4j
HOST = os.getenv('NEO4J_URI')
USERNAME = os.getenv('NEO4J_USERNAME')
PASSWORD = os.getenv('NEO4J_PASSWORD')
DATABASE = os.getenv('NEO4J_DATABASE')

# AI
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# os.environ only accepts strings, so skip the re-export when no key is configured.
if OPENAI_API_KEY is not None:
    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
LLM = os.getenv('LLM')

# Report missing connection settings here instead of failing later inside the driver.
missing_settings = [
    name
    for name, value in (
        ('NEO4J_URI', HOST),
        ('NEO4J_USERNAME', USERNAME),
        ('NEO4J_PASSWORD', PASSWORD),
    )
    if not value
]
if missing_settings:
    print(f"Missing Neo4j settings: {', '.join(missing_settings)}")

Connect to neo4j db

In [5]:
# Create the driver from the settings loaded above.
driver = GraphDatabase.driver(HOST, auth=(USERNAME, PASSWORD))
# Check the connection to the configured database right away.
driver.verify_connectivity(database=DATABASE)

  experimental_warn(


# Graph creation

Utility for splitting a DataFrame into chunks - not needed for this small dataset, but included as a best-practice example

In [6]:
def split_dataframe(df, chunk_size = 50_000):
    """Split a DataFrame into consecutive row chunks of at most ``chunk_size`` rows.

    Args:
        df: The DataFrame to split. Rows keep their original order.
        chunk_size: Maximum number of rows per chunk. Must be positive.

    Returns:
        A list of DataFrame slices. Every chunk is non-empty: no trailing empty
        chunk is produced when ``len(df)`` is an exact multiple of
        ``chunk_size``, and an empty DataFrame yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # Stepping by chunk_size stops exactly at the last row, so no empty slice is emitted.
    return [
        df.iloc[start:start + chunk_size]
        for start in range(0, len(df), chunk_size)
    ]

Load synthetic Skills dataset

In [7]:
# Location of the synthetic skills CSV (columns shown below: email, name, skills).
url = "https://raw.githubusercontent.com/Kristof-Neys/Neo4j_demos/main/expanded_skills.csv"
# Read the CSV straight from the URL into a DataFrame.
skills_csv = pd.read_csv(url)

Display the first few rows of the DataFrame

In [8]:
skills_csv.head(30)

Unnamed: 0,email,name,skills
0,thomas.nelson@test.org,Thomas Nelson,"Security, Pandas, Go"
1,lucy.clark@test.org,Lucy Clark,"WordPress, Scrum, Go, SQL, Linux"
2,richard.jackson@test.org,Richard Jackson,"System Design, PyTorch, Express.js, DevOps"
3,amelia.hall@test.org,Amelia Hall,"Agile, CSS3, R, Azure"
4,david.hill@test.org,David Hill,"Java, Scrum, Angular"
5,christopher.johnson@test.org,Christopher Johnson,"Tableau, Flask, API Design"
6,amelia.martin@test.org,Amelia Martin,"CI/CD, Kotlin, HTML5, TensorFlow"
7,daniel.hill@test.org,Daniel Hill,"System Design, Git, Cypher, Pandas, Spring Boot"
8,alice.white@test.org,Alice White,"Spark, Agile, JavaScript"
9,lucy.taylor@test.org,Lucy Taylor,"Flask, Tableau, CI/CD, Rust, System Design"


Convert skills column from comma separated string to List

In [9]:
# Turn the comma-separated skills string into a list of skill names.
# Only string cells are split, so re-running this cell after the column has
# already been converted leaves the lists intact (``.str.split`` on a column of
# lists would replace every value with NaN).
skills_csv['skills'] = skills_csv['skills'].apply(
    lambda cell: cell.split(', ') if isinstance(cell, str) else cell
)
skills_csv.head()

Unnamed: 0,email,name,skills
0,thomas.nelson@test.org,Thomas Nelson,"[Security, Pandas, Go]"
1,lucy.clark@test.org,Lucy Clark,"[WordPress, Scrum, Go, SQL, Linux]"
2,richard.jackson@test.org,Richard Jackson,"[System Design, PyTorch, Express.js, DevOps]"
3,amelia.hall@test.org,Amelia Hall,"[Agile, CSS3, R, Azure]"
4,david.hill@test.org,David Hill,"[Java, Scrum, Angular]"


### Schema

In [10]:
# DDL statements run by the next cell: one NODE KEY constraint per label.
# IF NOT EXISTS lets the statements be run again without error.
schema_statements = [
    'CREATE CONSTRAINT IF NOT EXISTS FOR (n:Person) REQUIRE (n.email) IS NODE KEY',
    'CREATE CONSTRAINT IF NOT EXISTS FOR (n:Skill) REQUIRE (n.name) IS NODE KEY',
]

In [11]:
# Run every schema statement against the configured database, one query per statement.
for ddl in schema_statements:
    driver.execute_query(ddl, routing_=RoutingControl.WRITE, database_=DATABASE)

Fetch all constraints

In [12]:
# List the constraints currently defined, returned as a DataFrame.
schema_result_df = driver.execute_query(
    'SHOW CONSTRAINTS',
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
)
schema_result_df.head(100)

Unnamed: 0,id,name,type,entityType,labelsOrTypes,properties,ownedIndex,propertyType
0,5,constraint_63bf11a1,NODE_KEY,NODE,[Skill],[name],constraint_63bf11a1,
1,3,constraint_d3bfd313,NODE_KEY,NODE,[Person],[email],constraint_d3bfd313,


Create a graph for (:Person)-[:KNOWS]->(:Skill)

In [13]:
# Send the rows to the database chunk by chunk. Each row MERGEs its Person (keyed
# on email), sets the name, and MERGEs one Skill plus a KNOWS relationship per
# entry in the row's skills list.
for batch in split_dataframe(skills_csv):
    records, summary, keys = driver.execute_query(
        """
        UNWIND $rows AS row
        MERGE (p:Person{email:row.email})
        SET p.name = row.name
        WITH p, row
        FOREACH(skill IN row.skills | MERGE (s:Skill{name:skill}) MERGE (p)-[:KNOWS]->(s) )
        RETURN COUNT(*) AS rows_processed
        """,
        rows=batch.to_dict(orient='records'),
        routing_=RoutingControl.WRITE,
        database_=DATABASE,
    )

# Basic navigation of graph with cypher

What persons are in the database?

In [14]:
# Names of all Person nodes, returned as a DataFrame (first rows shown).
driver.execute_query(
    """
    MATCH (p:Person)
    RETURN p.name AS person_name
    """,
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
).head()

Unnamed: 0,person_name
0,Thomas Nelson
1,Lucy Clark
2,Richard Jackson
3,Amelia Hall
4,David Hill


What skills does each person know?

In [15]:
# One row per person, with the names of all skills reached via KNOWS collected into a list.
driver.execute_query(
    """
    MATCH (p:Person)-[:KNOWS]->(s:Skill)
    RETURN p.email AS email, p.name AS person_name, collect(s.name) AS skills
    """,
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
).head()

Unnamed: 0,email,person_name,skills
0,christopher.johnson@test.org,Christopher Johnson,"[API Design, Flask, Tableau]"
1,victoria.thomas@test.org,Victoria Thomas,"[API Design, Cloud Architecture, Swift]"
2,hannah.campbell@test.org,Hannah Campbell,"[API Design, DevOps, JavaScript, Power BI, Vue..."
3,brian.jackson@test.org,Brian Jackson,"[API Design, Cloud Architecture, Jenkins, Proj..."
4,john.walker@test.org,John Walker,"[API Design, Django, Python]"


What are the most frequent skills?

In [16]:
# The ten skills known by the most distinct persons, most frequent first.
skill_count_df = driver.execute_query(
    """
    MATCH (p:Person)-[:KNOWS]->(s:Skill)
    RETURN s.name, COUNT(DISTINCT p) AS knownByCount ORDER BY knownByCount DESC LIMIT 10
    """,
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
)

In [17]:
skill_count_df

Unnamed: 0,s.name,knownByCount
0,System Design,14
1,Agile,13
2,Security,13
3,Angular,13
4,Blockchain,11
5,Cloud Architecture,11
6,Scrum,11
7,Docker,10
8,ReactJS,10
9,TensorFlow,10


# Node similarity

Let's investigate Persons that are similar in the graph (based on skills they share)

In [18]:
# Build the GDS client on top of the existing driver and point it at the same database.
gds = GraphDataScience.from_neo4j_driver(driver=driver)
gds.set_database(DATABASE)
# Last expression of the cell: displays the version string reported by the client.
gds.version()

'2.15.0'

In [19]:
# Name of the in-memory graph, plus the node labels and relationship type it covers.
graph_name, node_projection, rel_projection = (
    "person_skills_projection",
    ["Person", "Skill"],
    ["KNOWS"],
)

In [20]:
G, res = gds.graph.project(graph_name, node_projection, rel_projection)

In [21]:
res

nodeProjection            {'Skill': {'label': 'Skill', 'properties': {}}...
relationshipProjection    {'KNOWS': {'aggregation': 'DEFAULT', 'orientat...
graphName                                          person_skills_projection
nodeCount                                                               151
relationshipCount                                                       399
projectMillis                                                            11
Name: 0, dtype: object

Documentation https://neo4j.com/docs/graph-data-science/current/algorithms/node-similarity/

In [22]:
# Stream the similarity pairs as a DataFrame (nothing is written back), keeping
# the top 3 matches per node and scoring with the OVERLAP metric.
gds.nodeSimilarity.stream(G, topK=3, similarityMetric='OVERLAP')

Unnamed: 0,node1,node2,similarity
0,0,85,0.666667
1,0,14,0.666667
2,0,10,0.333333
3,1,4,0.333333
4,1,22,0.333333
...,...,...,...
286,95,78,0.500000
287,95,60,0.500000
288,96,15,0.500000
289,96,34,0.500000


In [23]:
# Persist the similarity pairs as SIMILAR_SKILLSET relationships, with the score
# stored in the `sim_score` property.
gds.nodeSimilarity.write(
    G,
    writeRelationshipType='SIMILAR_SKILLSET',
    writeProperty='sim_score',
    topK=3,
    similarityMetric='OVERLAP',
)

preProcessingMillis                                                       0
computeMillis                                                             6
writeMillis                                                              21
postProcessingMillis                                                      0
nodesCompared                                                            97
relationshipsWritten                                                    291
similarityDistribution    {'min': 0.3333320617675781, 'p5': 0.3333320617...
configuration             {'writeProperty': 'sim_score', 'writeRelations...
Name: 0, dtype: object

Remove symmetric relationships

In [24]:
# For every pair of persons linked by SIMILAR_SKILLSET in both directions, delete
# the relationship that starts at the node with the smaller id(), so that a
# single relationship per pair remains. Relationships that exist in one direction
# only do not match the EXISTS clause and are left untouched.
gds.run_cypher(
    """
    MATCH (a:Person)-[r:SIMILAR_SKILLSET]->(b:Person)
    WHERE EXISTS { (b)-[:SIMILAR_SKILLSET]->(a) }
    AND   id(a)<id(b)
    DELETE r
    """
)

Drop the projection from the graph catalogue to free up resources

In [25]:
G.drop()

graphName                                         person_skills_projection
database                                                             neo4j
databaseLocation                                                     local
memoryUsage                                                               
sizeInBytes                                                             -1
nodeCount                                                              151
relationshipCount                                                      399
configuration            {'relationshipProjection': {'KNOWS': {'aggrega...
density                                                           0.017616
creationTime                           2025-03-08T08:38:11.800490335+00:00
modificationTime                       2025-03-08T08:38:11.800490335+00:00
schema                   {'graphProperties': {}, 'nodes': {'Skill': {},...
schemaWithOrientation    {'graphProperties': {}, 'nodes': {'Skill': {},...
Name: 0, dtype: object

Take a minute to explore the SIMILAR_SKILLSET network

# Semantically similar skills

In [26]:
# Embedding client for the 'text-embedding-ada-002' model.
# NOTE(review): the only visible use is in the commented-out instructor cell below;
# presumably this needs OPENAI_API_KEY to be set - confirm.
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')

In [27]:
# Fetch the name of every Skill node into a DataFrame with a single `skill` column.
skills_df = gds.run_cypher(
    """
    MATCH (s:Skill)
    RETURN s.name AS skill
    """
)

In [28]:
skills_df.head(35)

Unnamed: 0,skill
0,API Design
1,AWS
2,Agile
3,Angular
4,Azure
5,Big Data
6,Blockchain
7,C++
8,CI/CD
9,CSS3


## STOP STOP STOP - DO NOT PROCEED (YET)

-- Only to be run by instructor (or if you have your own api key). Skip the following two cells -- 

In [29]:
# skills_df['embedding'] = skills_df['skill'].apply( lambda skill: embeddings.embed_documents([skill])[0])
# skills_df.head()

In [30]:
# gds.run_cypher('''
#     unwind $data as row
#     match (s:Skill{name: row.skill})
#     set s.embedding = row.embedding
#     ''',
#     params = { 'data': skills_df.to_dict(orient='records') }
# )

In [31]:
# Todo: Kristof, can you update this so it also has embeddings from text-embedding-ada-002
# Load the pre-computed skill embeddings (columns: Name, Embedding).
skills_df = pd.read_csv('https://raw.githubusercontent.com/Kristof-Neys/Neo4j_demos/refs/heads/main/skills_embeddings.csv')
# Each Embedding cell holds the string form of a list ("[0.1, -0.2, ...]").
# json.loads parses it whatever the spacing after the commas and copes with an
# empty list; float() keeps every component a float.
skills_df['Embedding'] = skills_df['Embedding'].apply(
    lambda raw: [float(component) for component in json.loads(raw)]
)
skills_df.head()

Unnamed: 0,Name,Embedding
0,API Design,"[0.002335607074201107, -0.021444285288453102, ..."
1,AWS,"[-0.004132895264774561, -0.017077714204788208,..."
2,Agile,"[-0.03577807545661926, -0.01052175834774971, -..."
3,Angular,"[-0.006358983926475048, 0.015042469836771488, ..."
4,Azure,"[0.00941519346088171, -0.026340041309595108, 0..."


Add embeddings to Skill nodes in database

In [32]:
# Copy each row's embedding onto the Skill node with the same name. Rows whose
# Name matches no Skill node are skipped, because the query uses MATCH rather
# than MERGE.
gds.run_cypher(
    """
    UNWIND $data AS row
    MATCH (s:Skill{name: row.Name})
    SET s.embedding = row.Embedding
    """,
    params=dict(data=skills_df.to_dict('records')),
)

# Let's use the Semantic Meaning to find similarities...

In [33]:
graph_name = "skill_embedding_projection"
# Project Skill nodes together with their `embedding` property.
# This must be a plain dict: a trailing comma after the closing brace would turn
# it into a one-element tuple.
node_projection = {
    'Skill': {"properties": 'embedding'},
}
rel_projection = ["KNOWS"]  # No rels will be projected, but we need to specify something here :)

In [34]:
G, res = gds.graph.project(graph_name, node_projection, rel_projection)

In [35]:
res

nodeProjection            {'Skill': {'label': 'Skill', 'properties': {'e...
relationshipProjection    {'KNOWS': {'aggregation': 'DEFAULT', 'orientat...
graphName                                        skill_embedding_projection
nodeCount                                                                54
relationshipCount                                                         0
projectMillis                                                            15
Name: 0, dtype: object

Running [K Nearest Neighbours](https://neo4j.com/docs/graph-data-science/current/algorithms/knn/) to find semantic similarities... 

In [36]:
# Compare Skill nodes on their `embedding` property and persist the top 3
# neighbours of each as SIMILAR_SEMANTIC relationships, scored in `sim_score`.
gds.knn.write(
    G,
    writeRelationshipType='SIMILAR_SEMANTIC',
    writeProperty='sim_score',
    topK=3,
    nodeProperties=['embedding'],
    nodeLabels=['Skill'],
)

ranIterations                                                             6
didConverge                                                            True
nodePairsConsidered                                                    5257
preProcessingMillis                                                       0
computeMillis                                                            23
writeMillis                                                              16
postProcessingMillis                                                      0
nodesCompared                                                            54
relationshipsWritten                                                    162
similarityDistribution    {'min': 0.9043693542480469, 'p5': 0.9083518981...
configuration             {'writeProperty': 'sim_score', 'writeRelations...
Name: 0, dtype: object

Remove symmetric relationships

In [37]:
# Where two skills are linked by SIMILAR_SEMANTIC in both directions, delete one
# of the two relationships (the one matched with `a<b`). Relationships that exist
# in one direction only are kept.
# NOTE(review): `a<b` compares the node values directly - confirm the ordering
# Cypher applies to nodes here.
gds.run_cypher(
    """
    MATCH (a:Skill)-[r:SIMILAR_SEMANTIC]->(b:Skill)
    WHERE EXISTS {(b)-[:SIMILAR_SEMANTIC]->(a)} AND a<b
    DELETE r
    """  
)

Let's review

In [38]:
# List each skill with its semantic neighbours and scores, ordered by skill name
# and then by descending score. The pattern is undirected, so every relationship
# is reported from both of its endpoints.
similar_skills_df = gds.run_cypher(
    """
    MATCH (s:Skill)-[r:SIMILAR_SEMANTIC]-(s2)
    RETURN s.name as skill, r.sim_score as score, s2.name as to_skill
    ORDER by skill asc, score desc
    """
)

In [39]:
similar_skills_df.head(15)

Unnamed: 0,skill,score,to_skill
0,API Design,0.922208,System Design
1,API Design,0.908355,Project Management
2,API Design,0.907156,Express.js
3,AWS,0.950841,Azure
4,AWS,0.911699,WordPress
5,AWS,0.911006,SQL
6,Agile,0.951504,Scrum
7,Agile,0.92155,Project Management
8,Agile,0.920712,Angular
9,Angular,0.925686,Azure


Drop the projection from the graph catalogue to free up resources

In [40]:
G.drop()

graphName                                       skill_embedding_projection
database                                                             neo4j
databaseLocation                                                     local
memoryUsage                                                               
sizeInBytes                                                             -1
nodeCount                                                               54
relationshipCount                                                        0
configuration            {'relationshipProjection': {'KNOWS': {'aggrega...
density                                                                0.0
creationTime                           2025-03-08T08:38:13.948292580+00:00
modificationTime                       2025-03-08T08:38:13.948292580+00:00
schema                   {'graphProperties': {}, 'nodes': {'Skill': {'e...
schemaWithOrientation    {'graphProperties': {}, 'nodes': {'Skill': {'e...
Name: 0, dtype: object

### Graph Feature Engineering

Let's do some "Graph Feature Engineering" - learn from our connected data...

In [41]:
# Skill nodes connected by SIMILAR_SEMANTIC relationships in their stored direction.
graph_name = 'skill_BetW_projection'
node_projection = ['Skill']
rel_projection = dict(SIMILAR_SEMANTIC=dict(orientation='NATURAL'))

In [42]:
G, res = gds.graph.project(graph_name, node_projection, rel_projection)

In [43]:
# Compute betweenness on the projection and store it on the nodes as `betweenness`.
BetWresult = gds.betweenness.write(G, writeProperty='betweenness')

In [44]:
BetWresult

nodePropertiesWritten                                                    54
writeMillis                                                               2
centralityDistribution    {'min': 0.0, 'max': 108.50048828124999, 'p90':...
postProcessingMillis                                                     10
preProcessingMillis                                                       0
computeMillis                                                             3
configuration             {'writeProperty': 'betweenness', 'jobId': '21e...
Name: 0, dtype: object

In [45]:
G.drop()

graphName                                            skill_BetW_projection
database                                                             neo4j
databaseLocation                                                     local
memoryUsage                                                               
sizeInBytes                                                             -1
nodeCount                                                               54
relationshipCount                                                      122
configuration            {'relationshipProjection': {'SIMILAR_SEMANTIC'...
density                                                           0.042628
creationTime                           2025-03-08T08:38:14.480173251+00:00
modificationTime                       2025-03-08T08:38:14.480173251+00:00
schema                   {'graphProperties': {}, 'nodes': {'Skill': {}}...
schemaWithOrientation    {'graphProperties': {}, 'nodes': {'Skill': {}}...
Name: 0, dtype: object

In [46]:
# Read back the `betweenness` property written above, highest score first.
bridge_skill = gds.run_cypher(
    """
    MATCH (s:Skill)
    RETURN s.name AS skill, s.betweenness AS betweenness
    ORDER BY betweenness DESC
    """
)

In [47]:
bridge_skill.head(10)

Unnamed: 0,skill,betweenness
0,Java,108.5
1,SQL,90.833333
2,Tableau,79.0
3,Data Visualization,76.0
4,Linux,73.5
5,Power BI,73.0
6,Swift,73.0
7,Machine Learning,63.0
8,Data Analysis,48.0
9,Scala,34.0


Wait! - and I can vectorise my graph as well...?

In [48]:
# Person nodes connected by SIMILAR_SKILLSET relationships, projected as undirected.
graph_name = 'Person_projection'
node_projection = ['Person']
rel_projection = dict(SIMILAR_SKILLSET=dict(orientation='UNDIRECTED'))

In [49]:
G, res = gds.graph.project(graph_name, node_projection, rel_projection)

Running a node embedding in a few lines...

In [50]:
# Compute 128-dimensional FastRP embeddings with a fixed seed and store them on
# the nodes as `fastRP_Embedding`.
fastrp_res = gds.fastRP.write(
    G,
    writeProperty="fastRP_Embedding",
    embeddingDimension=128,
    iterationWeights=[0, 0, 1.0, 1.0],
    normalizationStrength=0.05,
    randomSeed=42,
)

Now add the same embedding to the in-memory graph projection as well, so that the algorithms below can use it

In [51]:
# Same FastRP configuration and seed as the write call, but the result is added
# to the projection G as `fastRP_Embedding`, where k-means reads it next.
fastrp_res = gds.fastRP.mutate(
    G,
    mutateProperty="fastRP_Embedding",
    embeddingDimension=128,
    iterationWeights=[0, 0, 1.0, 1.0],
    normalizationStrength=0.05,
    randomSeed=42,
)

Finding clusters based on their structural and specific attributes...

In [52]:
# Cluster the nodes into 5 groups on their `fastRP_Embedding` vectors and store
# each node's cluster id as `kmeans5_cluster`.
kmeans_result = gds.kmeans.write(
    G,
    k=5,
    nodeProperty='fastRP_Embedding',
    writeProperty='kmeans5_cluster',
    maxIterations=100,
    randomSeed=42,
)

In [53]:
G.drop()

graphName                                                Person_projection
database                                                             neo4j
databaseLocation                                                     local
memoryUsage                                                               
sizeInBytes                                                             -1
nodeCount                                                               97
relationshipCount                                                      404
configuration            {'relationshipProjection': {'SIMILAR_SKILLSET'...
density                                                           0.043385
creationTime                           2025-03-08T08:38:14.948947705+00:00
modificationTime                       2025-03-08T08:38:15.127744650+00:00
schema                   {'graphProperties': {}, 'nodes': {'Person': {'...
schemaWithOrientation    {'graphProperties': {}, 'nodes': {'Person': {'...
Name: 0, dtype: object

In [54]:
# Group node names by their k-means cluster id, one row per cluster.
# The MATCH has no label, so any node carrying `kmeans5_cluster` is included.
skill_teams = gds.run_cypher(
    """
    MATCH (n) WHERE (n.kmeans5_cluster) IS NOT NULL
    RETURN n.kmeans5_cluster AS Team, collect(n.name) AS Team_members
    """
)

In [55]:
skill_teams.head(6)

Unnamed: 0,Team,Team_members
0,3,"[Thomas Nelson, Lucy Clark, Elena Young, Sophi..."
1,1,"[Richard Jackson, Amelia Hall, David Hill, Dan..."
2,0,"[Christopher Johnson, Joseph Lopez, Victoria T..."
3,4,"[Amelia Martin, Lucy Turner, Joshua Lopez, Bri..."
4,2,"[Isabella Jones, Ryan Nelson, Matthew Miller, ..."
