# Module 1: Structured Data 

This module has the following objectives:
- Creating a graph from structured data input
- Basic graph algorithms
- Text embeddings for semantic analysis
- Feature engineering
- Node embeddings

In [1]:
!pip install graphdatascience neo4j dotenv langchain langchain_openai



Import our usual suspects (and some more...)

In [2]:
import os
import pandas as pd
from dotenv import load_dotenv
from graphdatascience import GraphDataScience
from neo4j import Query, GraphDatabase, RoutingControl, Result
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

## Setup

Load env variables

In [3]:
env_file = 'ws.env'

In [4]:
if os.path.exists(env_file):
    load_dotenv(env_file, override=True)

    # Neo4j
    HOST = os.getenv('NEO4J_URI')
    USERNAME = os.getenv('NEO4J_USERNAME')
    PASSWORD = os.getenv('NEO4J_PASSWORD')
    DATABASE = os.getenv('NEO4J_DATABASE')

    # AI
    OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
    os.environ['OPENAI_API_KEY']=OPENAI_API_KEY
    LLM = os.getenv('LLM')
else:
    print(f"File {env_file} not found.")

## Read Data

Load synthetic Skills dataset

In [5]:
url = "https://raw.githubusercontent.com/erikbijl/genai-workshop-amsterdam/refs/heads/main/talent/expanded_skills.csv"

In [6]:
skills_df = pd.read_csv(url)

Describe the dataset

In [7]:
skills_df.describe()

Unnamed: 0,email,name,skills
count,100,100,100
unique,100,100,100
top,thomas.nelson@test.org,Thomas Nelson,"Security, Pandas, Go"
freq,1,1,1


Display the first few rows of the DataFrame

In [8]:
skills_df.head(30)

Unnamed: 0,email,name,skills
0,thomas.nelson@test.org,Thomas Nelson,"Security, Pandas, Go"
1,lucy.clark@test.org,Lucy Clark,"WordPress, Scrum, Go, SQL, Linux"
2,richard.jackson@test.org,Richard Jackson,"System Design, PyTorch, Express.js, DevOps"
3,amelia.hall@test.org,Amelia Hall,"Agile, CSS3, R, Azure"
4,david.hill@test.org,David Hill,"Java, Scrum, Angular"
5,christopher.johnson@test.org,Christopher Johnson,"Tableau, Flask, API Design"
6,amelia.martin@test.org,Amelia Martin,"CI/CD, Kotlin, HTML5, TensorFlow"
7,daniel.hill@test.org,Daniel Hill,"System Design, Git, Cypher, Pandas, Spring Boot"
8,alice.white@test.org,Alice White,"Spark, Agile, JavaScript"
9,lucy.taylor@test.org,Lucy Taylor,"Flask, Tableau, CI/CD, Rust, System Design"


Convert skills column from comma separated string to List

In [9]:
skills_df['skills'] = skills_df['skills'].str.split(', ')
skills_df.head()

Unnamed: 0,email,name,skills
0,thomas.nelson@test.org,Thomas Nelson,"[Security, Pandas, Go]"
1,lucy.clark@test.org,Lucy Clark,"[WordPress, Scrum, Go, SQL, Linux]"
2,richard.jackson@test.org,Richard Jackson,"[System Design, PyTorch, Express.js, DevOps]"
3,amelia.hall@test.org,Amelia Hall,"[Agile, CSS3, R, Azure]"
4,david.hill@test.org,David Hill,"[Java, Scrum, Angular]"


## Create the Graph

### Connect to the Database

To connect to the database we use the [Neo4j Python Driver](https://neo4j.com/docs/python-manual/5/). The credentials are stored in our environment so can be specified to the driver.

In [10]:
driver = GraphDatabase.driver(
    HOST,
    auth=(USERNAME, PASSWORD)
)

In case we want to split large files. 

In [19]:
def split_dataframe(df, chunk_size = 50_000):
    chunks = list()
    num_chunks = len(df) // chunk_size + 1
    for i in range(num_chunks):
        chunks.append(df[i*chunk_size:(i+1)*chunk_size])
    return chunks

Test the connection

In [20]:
driver.execute_query(
    """
    MATCH (n) RETURN COUNT(n) as Count
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)

Unnamed: 0,Count
0,0


### Set constraints

We know what we will be loading. Set some constrainst first. Documentation: [Constraints](https://neo4j.com/docs/cypher-manual/current/constraints/managing-constraints/)

Set the constraint on Person Nodes

In [12]:
driver.execute_query(
    'CREATE CONSTRAINT IF NOT EXISTS FOR (n:Person) REQUIRE (n.email) IS NODE KEY',
    database_=DATABASE,
    routing_=RoutingControl.WRITE
)

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x30bff7750>, keys=[])

Set the constraint on Skill Nodes

In [13]:
driver.execute_query(
    'CREATE CONSTRAINT IF NOT EXISTS FOR (n:Skill) REQUIRE (n.name) IS NODE KEY',
    database_=DATABASE,
    routing_=RoutingControl.WRITE
)

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x310f02610>, keys=[])

Fetch all constraints

In [14]:
schema_result_df  = driver.execute_query(
    'SHOW CONSTRAINTS',
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)
schema_result_df.head()

Unnamed: 0,id,name,type,entityType,labelsOrTypes,properties,ownedIndex,propertyType
0,3,constraint_63bf11a1,NODE_KEY,NODE,[Skill],[name],constraint_63bf11a1,
1,5,constraint_d3bfd313,NODE_KEY,NODE,[Person],[email],constraint_d3bfd313,


### Load (:Person)-[:KNOWS]->(:Skill)

Create a Person and Skills nodes and create a relationship in between. Documentation: [MERGE](https://neo4j.com/docs/cypher-manual/current/clauses/merge/?utm_source=GSearch&utm_medium=PaidSearch&utm_campaign=Evergreen&utm_content=EMEA-Search-SEMCE-DSA-None-SEM-SEM-NonABM&utm_term=&utm_adgroup=DSA&gad_source=1&gclid=Cj0KCQjwm7q-BhDRARIsACD6-fXns_MSgSZ3_jQdYreKu3iOBQQU6bwddlNa4wD12oLr3rxKUlF4MMMaAnj1EALw_wcB)

In [21]:
for chunk in split_dataframe(skills_df):
    records, summary, keys = driver.execute_query(
        """
        UNWIND $rows AS row
        MERGE (p:Person{email:row.email})
        SET p.name = row.name
        WITH p, row
        FOREACH(skill IN row.skills | MERGE (s:Skill{name:skill}) MERGE (p)-[:KNOWS]->(s) )
        RETURN COUNT(*) AS rows_processed
        """,
        database_=DATABASE,
        routing_=RoutingControl.WRITE,
        rows = chunk.to_dict('records')
    )

## Explore the Graph

Now to the database and observe what is there. 
Example queries: 
- MATCH (n:Person) RETURN n LIMIT 25;
- MATCH (n:Skill) RETURN n LIMIT 25;
- MATCH p=()-[:KNOWS]->() RETURN p LIMIT 25;

We can also run this via the [Neo4j Python Driver](https://neo4j.com/docs/python-manual/5/). Let's do so below

#### What persons are in the database?

In [22]:
persons_df = driver.execute_query(
    """
    MATCH (p:Person)
    RETURN p.name AS person_name
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)

In [23]:
persons_df

Unnamed: 0,person_name
0,Richard Jackson
1,Amelia Hall
2,David Hill
3,Christopher Johnson
4,Amelia Martin
...,...
95,Amelia Phillips
96,Amelia Brown
97,Peter Perez
98,Thomas Nelson


#### What skills does each person know?

In [24]:
person_skills_df = driver.execute_query(
    """
    MATCH (p:Person)-[:KNOWS]->(s:Skill)
    RETURN p.email AS email, p.name AS person_name, collect(s.name) AS skills
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)

In [25]:
person_skills_df

Unnamed: 0,email,person_name,skills
0,christopher.johnson@test.org,Christopher Johnson,"[API Design, Flask, Tableau]"
1,victoria.thomas@test.org,Victoria Thomas,"[API Design, Cloud Architecture, Swift]"
2,hannah.campbell@test.org,Hannah Campbell,"[API Design, DevOps, JavaScript, Power BI, Vue..."
3,brian.jackson@test.org,Brian Jackson,"[API Design, Cloud Architecture, Rust, Vue.js]"
4,john.walker@test.org,John Walker,"[API Design, Django, Python]"
...,...,...,...
95,joseph.lopez@test.org,Joseph Lopez,"[Linux, ReactJS, System Design]"
96,richard.mitchell@test.org,Richard Mitchell,"[Node.js, Scala, Scrum]"
97,david.lopez@test.org,David Lopez,"[PHP, Security, WordPress]"
98,joseph.mitchell@test.org,Joseph Mitchell,"[Ruby, Spark, System Design, Vue.js]"


#### What are the most frequent skills?

In [26]:
skill_count_df = driver.execute_query(
    """
    MATCH (p:Person)-[:KNOWS]->(s:Skill)
    RETURN s.name, COUNT(DISTINCT p) AS knownByCount ORDER BY knownByCount DESC LIMIT 10
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)

In [27]:
skill_count_df

Unnamed: 0,s.name,knownByCount
0,System Design,14
1,Agile,13
2,Security,13
3,Angular,13
4,Blockchain,11
5,Cloud Architecture,11
6,Scrum,11
7,Docker,10
8,ReactJS,10
9,TensorFlow,10


#### Multihop question

Run the following query in the database: 
- MATCH p=(p1:Person {name: "Lucy Clark"})-[:KNOWS]->(s1:Skill)<-[:KNOWS]-(p2:Person) RETURN DISTINCT p;
- MATCH p=(p1:Person {name: "Lucy Clark"})-[:KNOWS]->(s1:Skill)<-[:KNOWS]-(p2:Person)-[:KNOWS]-(s2:Skill) RETURN DISTINCT p;

In [28]:
persons_with_shared_skills_df = driver.execute_query(
    """
    MATCH p=(p1:Person {name: "Lucy Clark"})-[:KNOWS]->(s1:Skill)<-[:KNOWS]-(p2:Person)
    RETURN DISTINCT p2.name as person;
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)

In [29]:
persons_with_shared_skills_df

Unnamed: 0,person
0,Natalie Miller
1,Peter Martinez
2,Ryan Nelson
3,Robert Davis
4,Natalie Brown
5,Thomas Nelson
6,Mia Nelson
7,David Lopez
8,John Johnson
9,Ryan Young


In [30]:
skills_two_steps_df = driver.execute_query(
    """
    MATCH p=(p1:Person {name: "Lucy Clark"})-[:KNOWS]->(s1:Skill)<-[:KNOWS]-(p2:Person)-[:KNOWS]-(s2:Skill)
    RETURN DISTINCT s2.name as skill;
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)

In [31]:
skills_two_steps_df

Unnamed: 0,skill
0,Express.js
1,Azure
2,Testing
3,Machine Learning
4,Docker
5,Project Management
6,Agile
7,Spring Boot
8,Power BI
9,System Design


## Person Similarity

We can define the similarity of persons based on the number of skills that are overlapping. 

In [41]:
similar_skills_df = driver.execute_query(
    """
    MATCH path_1=(p1:Person{name:"Thomas Brown"})-[:KNOWS]->(s1:Skill)
    MATCH path_2=(s1)<-[:KNOWS]-(p2:Person)
    WITH p1.name as person_1, p2.name as person_2, COLLECT(DISTINCT s1.name) as skill_list, COUNT(DISTINCT(s1)) as skill_count
    WHERE skill_count > 1 AND person_1 <> person_2
    RETURN * ORDER BY skill_count DESC
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)

In [42]:
similar_skills_df

Unnamed: 0,person_1,person_2,skill_count,skill_list
0,Thomas Brown,Amelia Davis,3,"[Security, Java, Docker]"
1,Thomas Brown,Lucy Turner,2,"[Security, Docker]"
2,Thomas Brown,James Anderson,2,"[Security, R]"
3,Thomas Brown,Andrew Martin,2,"[R, Java]"
4,Thomas Brown,Thomas Garcia,2,"[Java, Docker]"


In [49]:
similar_skills_all_df = driver.execute_query(
    """
    MATCH path_1=(p1:Person)-[:KNOWS]->(s1:Skill)<-[:KNOWS]-(p2:Person)
    WHERE p1.name < p2.name
    WITH p1.name as person_1, p2.name as person_2, COLLECT(DISTINCT s1.name) as skill_list, COUNT(DISTINCT(s1)) as skill_count
    WHERE skill_count >= 1
    RETURN * ORDER BY skill_count DESC
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)

In [50]:
similar_skills_all_df

Unnamed: 0,person_1,person_2,skill_count,skill_list
0,Charles Jones,John Taylor,3,"[AWS, CSS3, Pandas]"
1,Joseph Martin,Kevin Young,3,"[Agile, Linux, ReactJS]"
2,Brian Thompson,John Baker,3,"[Agile, Data Analysis, PHP]"
3,Natalie Thompson,Ryan Jones,3,"[Angular, Jenkins, Spark]"
4,David Rodriguez,Matthew Scott,3,"[Azure, Cypher, Scrum]"
...,...,...,...,...
1314,Lucy Clark,Ryan Hall,1,[Scrum]
1315,David Hill,Lucy Clark,1,[Scrum]
1316,Emily Thompson,Lucy Clark,1,[Scrum]
1317,John Taylor,Lucy Clark,1,[Scrum]


Load the skill count to the database in a new relationship

In [52]:
for chunk in split_dataframe(similar_skills_all_df):
    records, summary, keys = driver.execute_query(
        """
        UNWIND $rows AS row
        MERGE (p1:Person{name:row.person_1})
        MERGE (p2:Person{name:row.person_2})
        MERGE (p1)-[s:SIMILAR_SKILLSET]->(p2)
        SET s.overlap = row.skill_count
        RETURN COUNT(*) AS rows_processed
        """,
        database_=DATABASE,
        routing_=RoutingControl.WRITE,
        rows = chunk.to_dict('records')
    )

Take a minute to explore the SIMILAR_SKILLSET network

## Communities

Let's run some Graph Data Science based on Persons and Skills. Let's first setup the [Graph Data Science Client](https://neo4j.com/docs/graph-data-science-client/current/). 

In [68]:
gds = GraphDataScience.from_neo4j_driver(driver=driver)
gds.set_database(DATABASE)
gds.version()

'2.15.0'

Let's investigate Persons that are similar in the graph (based on skills they share). For that we first need to create a [Graph object](https://neo4j.com/docs/graph-data-science-client/current/graph-object/). 

In [80]:
graph_name = "person_similarity_projection"
node_projection = ["Person"]
rel_projection = {"SIMILAR_SKILLSET": {"orientation": 'UNDIRECTED', "properties": "overlap"}, }

In [81]:
G, res = gds.graph.project(graph_name, node_projection, rel_projection)

Run the [Leiden Algorithm](https://neo4j.com/docs/graph-data-science/current/algorithms/leiden/) for Community Detection

In [97]:
gds.leiden.write(
    G,
    writeProperty='leiden_community',
    relationshipWeightProperty='overlap',
    maxLevels=100,
    gamma=5,
    theta=0.001
)

writeMillis                                                             13
nodePropertiesWritten                                                  100
ranLevels                                                                2
didConverge                                                           True
nodeCount                                                              100
communityCount                                                          48
communityDistribution    {'min': 1, 'p5': 1, 'max': 5, 'p999': 5, 'p99'...
modularity                                                       -0.026968
modularities                [-0.027811793200144475, -0.026968247063940864]
postProcessingMillis                                                     1
preProcessingMillis                                                      0
computeMillis                                                           22
configuration            {'writeProperty': 'leiden_community', 'theta':...
Name: 0, dtype: object

In [108]:
communities_df = driver.execute_query(
    """
    MATCH (p:Person)
    RETURN p.leiden_community AS Community, COUNT(*) as MemberCount
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)

In [109]:
communities_df

Unnamed: 0,Community,MemberCount
0,0,1
1,1,1
2,31,2
3,5,5
4,41,2
5,30,2
6,22,5
7,11,2
8,43,2
9,2,1


Check communities based on people with high overlap

In [98]:
community_check_df = driver.execute_query(
    """
    MATCH (p1:Person)-[s:SIMILAR_SKILLSET]->(p2:Person)
    WHERE s.overlap > 2
    RETURN s.overlap, p1.name, p1.leiden_community, p2.name, p2.leiden_community
    """,
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)

In [99]:
community_check_df

Unnamed: 0,s.overlap,p1.name,p1.leiden_community,p2.name,p2.leiden_community
0,3,Natalie Thompson,14,Ryan Jones,14
1,3,Hannah Campbell,22,Ryan Rodriguez,22
2,3,Charles Jones,13,John Taylor,13
3,3,Amelia Davis,18,Thomas Brown,18
4,3,Amelia Davis,18,John Garcia,21
5,3,Brian Thompson,33,John Baker,33
6,3,Joseph Martin,32,Kevin Young,32
7,3,Andrew Anderson,15,Andrew Martin,15
8,3,Daniel Hall,30,Daniel Hill,30
9,3,David Rodriguez,9,Matthew Scott,9


Drop the projection from the graph catalogue to free up resources

In [None]:
G.drop()

# Semantic Similar skill

In [105]:
skills_df = gds.run_cypher(
    """
    MATCH (s:Skill)
    RETURN s.name AS skill
    """
)

In [60]:
skills_df.head(5)

Unnamed: 0,Name,Embedding
0,API Design,"[0.002335607074201107, -0.021444285288453102, ..."
1,AWS,"[-0.004132895264774561, -0.017077714204788208,..."
2,Agile,"[-0.03577807545661926, -0.01052175834774971, -..."
3,Angular,"[-0.006358983926475048, 0.015042469836771488, ..."
4,Azure,"[0.00941519346088171, -0.026340041309595108, 0..."


## STOP STOP STOP - DO NOT PROCEED (YET)

-- Only to be run by instructor (or if you have your own api key). Skip the following two cells -- 

In [None]:
# embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')

In [29]:
# skills_df['embedding'] = skills_df['skill'].apply( lambda skill: embeddings.embed_documents([skill])[0])
# skills_df.head()

In [30]:
# gds.run_cypher('''
#     unwind $data as row
#     match (s:Skill{name: row.skill})
#     set s.embedding = row.embedding
#     ''',
#     params = { 'data': skills_df.to_dict(orient='records') }
# )

In [62]:
url = 'https://raw.githubusercontent.com/erikbijl/genai-workshop-amsterdam/refs/heads/main/talent/skills_embeddings.csv'

In [63]:
skills_embeddings_df = pd.read_csv(url)

In [64]:
skills_embeddings_df['Embedding'] = skills_embeddings_df['Embedding'].apply( lambda x: [ float(i) for i in x.strip("[]").split(", ")] )

In [65]:
skills_embeddings_df.head()

Unnamed: 0,Skill,Embedding
0,API Design,"[0.002335607074201107, -0.021444285288453102, ..."
1,AWS,"[-0.004132895264774561, -0.017077714204788208,..."
2,Agile,"[-0.03577807545661926, -0.01052175834774971, -..."
3,Angular,"[-0.006358983926475048, 0.015042469836771488, ..."
4,Azure,"[0.00941519346088171, -0.026340041309595108, 0..."


Length of an embedding

In [118]:
len(skills_embeddings_df['Embedding'].iloc[0])

1536

Add embeddings to Skill nodes in database

In [113]:
for chunk in split_dataframe(skills_embeddings_df):
    records, summary, keys = driver.execute_query(
        """
        UNWIND $rows AS row
        MATCH (s:Skill{name: row.Skill})
        SET s.embedding = row.Embedding
        WITH s
        CALL db.create.setNodeVectorProperty(s, "embedding", s.embedding)
        RETURN COUNT(*) AS rows_processed
        """,
        database_=DATABASE,
        routing_=RoutingControl.WRITE,
        rows = chunk.to_dict('records')
    )

In [121]:
indexes_result_df  = driver.execute_query(
    'SHOW INDEXES',
    database_=DATABASE,
    routing_=RoutingControl.READ,
    result_transformer_= lambda r: r.to_df()
)
indexes_result_df.head()

Unnamed: 0,id,name,state,populationPercent,type,entityType,labelsOrTypes,properties,indexProvider,owningConstraint,lastRead,readCount
0,2,constraint_63bf11a1,ONLINE,100.0,RANGE,NODE,[Skill],[name],range-1.0,constraint_63bf11a1,2025-03-10T21:12:33.741000000+00:00,638
1,4,constraint_d3bfd313,ONLINE,100.0,RANGE,NODE,[Person],[email],range-1.0,constraint_d3bfd313,2025-03-10T20:26:47.994000000+00:00,306
2,0,index_343aff4e,ONLINE,100.0,LOOKUP,NODE,,,token-lookup-1.0,,2025-03-10T20:51:08.954000000+00:00,2855
3,1,index_f7700477,ONLINE,100.0,LOOKUP,RELATIONSHIP,,,token-lookup-1.0,,2025-03-10T19:59:16.009000000+00:00,9
4,6,skill-embeddings,ONLINE,100.0,VECTOR,NODE,[Skill],[embedding],vector-2.0,,,0


# TO DO FROM HERE

- Update the semantic similarity with vector index
- Update betweenness. 

# Vectors for Semantic Meaning

In [119]:
driver.execute_query(
    """
    CREATE VECTOR INDEX `skill-embeddings` IF NOT EXISTS
    FOR (s:Skill) ON (s.embedding)
    OPTIONS {
        indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'
        } 
    }
    """,
    database_=DATABASE,
    routing_=RoutingControl.WRITE
)      

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x318c99210>, keys=[])

# Let's use the Semantic Meaning to find similarities...

In [33]:
graph_name = "skill_embedding_projection"
node_projection = {
        'Skill': {"properties": 'embedding'},
    },
rel_projection = ["KNOWS"]  # No rels will be projected, but we need to specify something here :)

In [34]:
G, res = gds.graph.project(graph_name, node_projection, rel_projection)

In [35]:
res

nodeProjection            {'Skill': {'label': 'Skill', 'properties': {'e...
relationshipProjection    {'KNOWS': {'aggregation': 'DEFAULT', 'orientat...
graphName                                        skill_embedding_projection
nodeCount                                                                54
relationshipCount                                                         0
projectMillis                                                            15
Name: 0, dtype: object

Running [K Nearest Neighbours](https://neo4j.com/docs/graph-data-science/current/algorithms/knn/) to find semantic similarities... 

In [36]:
gds.knn.write(
    G,
    nodeLabels=['Skill'],
    nodeProperties=['embedding'],
    topK=3,
    writeRelationshipType='SIMILAR_SEMANTIC',
    writeProperty='sim_score'
)

ranIterations                                                             6
didConverge                                                            True
nodePairsConsidered                                                    5257
preProcessingMillis                                                       0
computeMillis                                                            23
writeMillis                                                              16
postProcessingMillis                                                      0
nodesCompared                                                            54
relationshipsWritten                                                    162
similarityDistribution    {'min': 0.9043693542480469, 'p5': 0.9083518981...
configuration             {'writeProperty': 'sim_score', 'writeRelations...
Name: 0, dtype: object

Remove symmetric relationships

In [37]:
gds.run_cypher(
    """
    MATCH (a:Skill)-[r:SIMILAR_SEMANTIC]->(b:Skill)
    WHERE EXISTS {(b)-[:SIMILAR_SEMANTIC]->(a)} AND a<b
    DELETE r
    """  
)

Let's review

In [38]:
similar_skills_df = gds.run_cypher(
    """
    MATCH (s:Skill)-[r:SIMILAR_SEMANTIC]-(s2)
    RETURN s.name as skill, r.sim_score as score, s2.name as to_skill
    ORDER by skill asc, score desc
    """
)

In [39]:
similar_skills_df.head(15)

Unnamed: 0,skill,score,to_skill
0,API Design,0.922208,System Design
1,API Design,0.908355,Project Management
2,API Design,0.907156,Express.js
3,AWS,0.950841,Azure
4,AWS,0.911699,WordPress
5,AWS,0.911006,SQL
6,Agile,0.951504,Scrum
7,Agile,0.92155,Project Management
8,Agile,0.920712,Angular
9,Angular,0.925686,Azure


Drop the projection from the graph catalogue to free up resources

In [40]:
G.drop()

graphName                                       skill_embedding_projection
database                                                             neo4j
databaseLocation                                                     local
memoryUsage                                                               
sizeInBytes                                                             -1
nodeCount                                                               54
relationshipCount                                                        0
configuration            {'relationshipProjection': {'KNOWS': {'aggrega...
density                                                                0.0
creationTime                           2025-03-08T08:38:13.948292580+00:00
modificationTime                       2025-03-08T08:38:13.948292580+00:00
schema                   {'graphProperties': {}, 'nodes': {'Skill': {'e...
schemaWithOrientation    {'graphProperties': {}, 'nodes': {'Skill': {'e...
Name: 0, dtype: object

### Graph Feature Engineering

Let's do some "Graph Feature Engineering" - learn from our connected data...

In [41]:
graph_name = 'skill_BetW_projection'
node_projection = ['Skill']
rel_projection =  {'SIMILAR_SEMANTIC':{'orientation': 'NATURAL'}}

In [42]:
G, res = gds.graph.project(graph_name, node_projection, rel_projection)

In [43]:
BetWresult = gds.betweenness.write(
    G,
    writeProperty='betweenness'
)

In [44]:
BetWresult

nodePropertiesWritten                                                    54
writeMillis                                                               2
centralityDistribution    {'min': 0.0, 'max': 108.50048828124999, 'p90':...
postProcessingMillis                                                     10
preProcessingMillis                                                       0
computeMillis                                                             3
configuration             {'writeProperty': 'betweenness', 'jobId': '21e...
Name: 0, dtype: object

In [45]:
G.drop()

graphName                                            skill_BetW_projection
database                                                             neo4j
databaseLocation                                                     local
memoryUsage                                                               
sizeInBytes                                                             -1
nodeCount                                                               54
relationshipCount                                                      122
configuration            {'relationshipProjection': {'SIMILAR_SEMANTIC'...
density                                                           0.042628
creationTime                           2025-03-08T08:38:14.480173251+00:00
modificationTime                       2025-03-08T08:38:14.480173251+00:00
schema                   {'graphProperties': {}, 'nodes': {'Skill': {}}...
schemaWithOrientation    {'graphProperties': {}, 'nodes': {'Skill': {}}...
Name: 0, dtype: object

In [46]:
bridge_skill = gds.run_cypher(
    """
    MATCH (s:Skill)
    RETURN s.name AS skill, s.betweenness AS betweenness
    ORDER BY betweenness DESC
    """
)

In [47]:
bridge_skill.head(10)

Unnamed: 0,skill,betweenness
0,Java,108.5
1,SQL,90.833333
2,Tableau,79.0
3,Data Visualization,76.0
4,Linux,73.5
5,Power BI,73.0
6,Swift,73.0
7,Machine Learning,63.0
8,Data Analysis,48.0
9,Scala,34.0


Wait! - and I can vectorise my Graph as well....?*

In [48]:
graph_name = 'Person_projection'
node_projection = ['Person']
rel_projection =  {'SIMILAR_SKILLSET': {'orientation': 'UNDIRECTED'}}

In [49]:
G, res = gds.graph.project(graph_name, node_projection, rel_projection)

Running a node embedding in a few lines...

In [50]:
fastrp_res =  gds.fastRP.write(
    G,
    embeddingDimension = 128,
    iterationWeights = [0, 0, 1.0, 1.0],
    normalizationStrength = 0.05,
    writeProperty = "fastRP_Embedding",
    randomSeed = 42
)

Adding it now to memory for some more computations

In [51]:
fastrp_res =  gds.fastRP.mutate(
    G,
    embeddingDimension = 128,
    iterationWeights = [0, 0, 1.0, 1.0],
    normalizationStrength = 0.05,
    mutateProperty = "fastRP_Embedding",
    randomSeed = 42
)

Finding clusters based on their structural and specific attributes...

In [52]:
kmeans_result = gds.kmeans.write(
    G,
    nodeProperty = 'fastRP_Embedding',
    k = 5,
    writeProperty = 'kmeans5_cluster',
    randomSeed = 42,
    maxIterations = 100
)

In [53]:
G.drop()

graphName                                                Person_projection
database                                                             neo4j
databaseLocation                                                     local
memoryUsage                                                               
sizeInBytes                                                             -1
nodeCount                                                               97
relationshipCount                                                      404
configuration            {'relationshipProjection': {'SIMILAR_SKILLSET'...
density                                                           0.043385
creationTime                           2025-03-08T08:38:14.948947705+00:00
modificationTime                       2025-03-08T08:38:15.127744650+00:00
schema                   {'graphProperties': {}, 'nodes': {'Person': {'...
schemaWithOrientation    {'graphProperties': {}, 'nodes': {'Person': {'...
Name: 0, dtype: object

In [54]:
skill_teams = gds.run_cypher(
    """
    MATCH (n) WHERE (n.kmeans5_cluster) IS NOT NULL
    RETURN n.kmeans5_cluster AS Team, collect(n.name) AS Team_members
    """
)

In [55]:
skill_teams.head(6)

Unnamed: 0,Team,Team_members
0,3,"[Thomas Nelson, Lucy Clark, Elena Young, Sophi..."
1,1,"[Richard Jackson, Amelia Hall, David Hill, Dan..."
2,0,"[Christopher Johnson, Joseph Lopez, Victoria T..."
3,4,"[Amelia Martin, Lucy Turner, Joshua Lopez, Bri..."
4,2,"[Isabella Jones, Ryan Nelson, Matthew Miller, ..."
