# Module 1: Structured Data 

This module has the following objectives:
- Creating a graph from structured data input
- Basic graph algorithms
- Text embeddings for semantic analysis
- Feature engineering
- Node embeddings

In [1]:
!pip install graphdatascience neo4j dotenv langchain langchain_openai



Import our usual suspects (and some more...)

In [2]:
import os
import pandas as pd
from dotenv import load_dotenv
from graphdatascience import GraphDataScience
from neo4j import Query, GraphDatabase, RoutingControl, Result
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

## Setup

Load env variables

In [3]:
# Name of the dotenv file the settings are loaded from.
env_file = "ws.env"

In [4]:
# Load the settings from the dotenv file when it exists. Without the file the
# values already present in the process environment are used, so the names
# below are always defined (None when a variable is not set).
if os.path.exists(env_file):
    # override=True lets values from the file win over pre-existing variables.
    load_dotenv(env_file, override=True)
else:
    print(f"File {env_file} not found.")

# Neo4j
HOST = os.getenv('NEO4J_URI')
USERNAME = os.getenv('NEO4J_USERNAME')
PASSWORD = os.getenv('NEO4J_PASSWORD')
DATABASE = os.getenv('NEO4J_DATABASE')

# AI
# The key is read from os.environ, so a set key is already exported; writing it
# back was a no-op and raised TypeError when the key was missing.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
LLM = os.getenv('LLM')

## Read Data

Load synthetic Skills dataset

In [5]:
# Location of the skills CSV that is read in the next cell.
url = (
    "https://raw.githubusercontent.com/erikbijl/genai-workshop-amsterdam"
    "/refs/heads/main/talent/expanded_skills.csv"
)

In [6]:
# Fetch the CSV behind `url` and parse it into a DataFrame.
skills_df = pd.read_csv(filepath_or_buffer=url)

Describe the dataset

In [7]:
# Per-column summary of the loaded frame (rendered as the cell output).
skills_df.describe()

Unnamed: 0,email,name,skills
count,100,100,100
unique,100,100,100
top,thomas.nelson@test.org,Thomas Nelson,"Security, Pandas, Go"
freq,1,1,1


Display the first few rows of the DataFrame

In [8]:
# Show at most the first 30 rows.
skills_df.head(30)

Unnamed: 0,email,name,skills
0,thomas.nelson@test.org,Thomas Nelson,"Security, Pandas, Go"
1,lucy.clark@test.org,Lucy Clark,"WordPress, Scrum, Go, SQL, Linux"
2,richard.jackson@test.org,Richard Jackson,"System Design, PyTorch, Express.js, DevOps"
3,amelia.hall@test.org,Amelia Hall,"Agile, CSS3, R, Azure"
4,david.hill@test.org,David Hill,"Java, Scrum, Angular"
5,christopher.johnson@test.org,Christopher Johnson,"Tableau, Flask, API Design"
6,amelia.martin@test.org,Amelia Martin,"CI/CD, Kotlin, HTML5, TensorFlow"
7,daniel.hill@test.org,Daniel Hill,"System Design, Git, Cypher, Pandas, Spring Boot"
8,alice.white@test.org,Alice White,"Spark, Agile, JavaScript"
9,lucy.taylor@test.org,Lucy Taylor,"Flask, Tableau, CI/CD, Rust, System Design"


Convert skills column from comma separated string to List

In [9]:
def _split_skills(value):
    """Turn a comma separated skills string into a list of skill names.

    Values that are not strings (for example lists produced by an earlier run
    of this cell) are returned unchanged, so re-running the cell does not turn
    the column into NaN the way a second `.str.split` would.
    """
    if isinstance(value, str):
        return value.split(', ')
    return value

skills_df['skills'] = skills_df['skills'].apply(_split_skills)
skills_df.head()

Unnamed: 0,email,name,skills
0,thomas.nelson@test.org,Thomas Nelson,"[Security, Pandas, Go]"
1,lucy.clark@test.org,Lucy Clark,"[WordPress, Scrum, Go, SQL, Linux]"
2,richard.jackson@test.org,Richard Jackson,"[System Design, PyTorch, Express.js, DevOps]"
3,amelia.hall@test.org,Amelia Hall,"[Agile, CSS3, R, Azure]"
4,david.hill@test.org,David Hill,"[Java, Scrum, Angular]"


## Create the Graph

### Connect to the Database

To connect to the database we use the [Neo4j Python Driver](https://neo4j.com/docs/python-manual/5/). The credentials are stored in our environment, so they can be passed directly to the driver.

In [10]:
# Create the driver for the configured URI, authenticating with the
# username/password pair read from the environment.
driver = GraphDatabase.driver(HOST, auth=(USERNAME, PASSWORD))

In case we want to split large files. 

In [11]:
def split_dataframe(df, chunk_size = 50_000):
    """Split a DataFrame into consecutive row chunks of at most `chunk_size` rows.

    Args:
        df: DataFrame to split; rows are taken by position, in order.
        chunk_size: Maximum number of rows per chunk (must be positive).

    Returns:
        A list of DataFrame slices that together cover every row exactly once.
        No empty chunk is produced, so an empty frame yields an empty list.

    Raises:
        ValueError: If `chunk_size` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # Stepping by chunk_size avoids the empty trailing chunk that
    # `len(df) // chunk_size + 1` produced for exact multiples.
    return [df.iloc[start:start + chunk_size] for start in range(0, len(df), chunk_size)]

Test the connection

In [12]:
# Connection check: count every node in the database and show the result as a DataFrame.
driver.execute_query(
    """
    MATCH (n) RETURN COUNT(n) as Count
    """,
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
)

Unnamed: 0,Count
0,0


### Set constraints

We know what we will be loading, so let's set some constraints first. Documentation: [Constraints](https://neo4j.com/docs/cypher-manual/current/constraints/managing-constraints/)

Set the constraint on Person Nodes

In [13]:
# Make Person.email a node key; IF NOT EXISTS keeps the statement safe to re-run.
driver.execute_query(
    "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Person) REQUIRE (n.email) IS NODE KEY",
    routing_=RoutingControl.WRITE,
    database_=DATABASE,
)

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x176e011d0>, keys=[])

Set the constraint on Skill Nodes

In [14]:
# Make Skill.name a node key; IF NOT EXISTS keeps the statement safe to re-run.
driver.execute_query(
    "CREATE CONSTRAINT IF NOT EXISTS FOR (n:Skill) REQUIRE (n.name) IS NODE KEY",
    routing_=RoutingControl.WRITE,
    database_=DATABASE,
)

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x176e04110>, keys=[])

Fetch all constraints

In [15]:
# Read back the constraints defined in the database as a DataFrame.
schema_result_df = driver.execute_query(
    "SHOW CONSTRAINTS",
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
)
schema_result_df.head()

Unnamed: 0,id,name,type,entityType,labelsOrTypes,properties,ownedIndex,propertyType
0,6,constraint_63bf11a1,NODE_KEY,NODE,[Skill],[name],constraint_63bf11a1,
1,5,constraint_d3bfd313,NODE_KEY,NODE,[Person],[email],constraint_d3bfd313,


### Load (:Person)-[:KNOWS]->(:Skill)

Create the Person and Skill nodes and a KNOWS relationship between each person and the skills they know. Documentation: [MERGE](https://neo4j.com/docs/cypher-manual/current/clauses/merge/)

In [16]:
# Upload the people in batches. Per row: upsert the Person by email, set the
# name, then upsert each listed Skill and the KNOWS relationship to it.
for chunk in split_dataframe(skills_df):
    records, summary, keys = driver.execute_query(
        """
        UNWIND $rows AS row
        MERGE (p:Person{email:row.email})
        SET p.name = row.name
        WITH p, row
        FOREACH(skill IN row.skills | MERGE (s:Skill{name:skill}) MERGE (p)-[:KNOWS]->(s) )
        RETURN COUNT(*) AS rows_processed
        """,
        routing_=RoutingControl.WRITE,
        database_=DATABASE,
        # One dict per DataFrame row, exposed to the query as $rows.
        rows=chunk.to_dict(orient='records'),
    )

## Explore the Graph

Now go to the database and observe what is there. 
Example queries: 
- MATCH (n:Person) RETURN n LIMIT 25;
- MATCH (n:Skill) RETURN n LIMIT 25;
- MATCH p=()-[:KNOWS]->() RETURN p LIMIT 25;

We can also run this via the [Neo4j Python Driver](https://neo4j.com/docs/python-manual/5/). Let's do so below

#### What persons are in the database?

In [17]:
# Names of all Person nodes, as a one-column DataFrame.
persons_df = driver.execute_query(
    """
    MATCH (p:Person)
    RETURN p.name AS person_name
    """,
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
)

In [18]:
# Display the person names fetched by the previous query.
persons_df

Unnamed: 0,person_name
0,Thomas Nelson
1,Lucy Clark
2,Richard Jackson
3,Amelia Hall
4,David Hill
...,...
95,Grace Miller
96,Ryan Hall
97,Amelia Phillips
98,Amelia Brown


#### What skills does each person know?

In [19]:
# One row per person (grouped by email and name) with the list of skills they know.
person_skills_df = driver.execute_query(
    """
    MATCH (p:Person)-[:KNOWS]->(s:Skill)
    RETURN p.email AS email, p.name AS person_name, collect(s.name) AS skills
    """,
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
)

In [20]:
# Display the skills collected per person.
person_skills_df

Unnamed: 0,email,person_name,skills
0,christopher.johnson@test.org,Christopher Johnson,"[API Design, Flask, Tableau]"
1,victoria.thomas@test.org,Victoria Thomas,"[API Design, Cloud Architecture, Swift]"
2,hannah.campbell@test.org,Hannah Campbell,"[API Design, DevOps, JavaScript, Power BI, Vue..."
3,brian.jackson@test.org,Brian Jackson,"[API Design, Cloud Architecture, Rust, Vue.js]"
4,john.walker@test.org,John Walker,"[API Design, Django, Python]"
...,...,...,...
95,joseph.lopez@test.org,Joseph Lopez,"[Linux, ReactJS, System Design]"
96,richard.mitchell@test.org,Richard Mitchell,"[Node.js, Scala, Scrum]"
97,david.lopez@test.org,David Lopez,"[PHP, Security, WordPress]"
98,joseph.mitchell@test.org,Joseph Mitchell,"[Ruby, Spark, System Design, Vue.js]"


#### What are the most frequent skills?

In [21]:
# The ten skills known by the most distinct people, most frequent first.
skill_count_df = driver.execute_query(
    """
    MATCH (p:Person)-[:KNOWS]->(s:Skill)
    RETURN s.name, COUNT(DISTINCT p) AS knownByCount ORDER BY knownByCount DESC LIMIT 10
    """,
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
)

In [22]:
# Display the top-10 skill frequencies.
skill_count_df

Unnamed: 0,s.name,knownByCount
0,System Design,14
1,Agile,13
2,Security,13
3,Angular,13
4,Blockchain,11
5,Cloud Architecture,11
6,Scrum,11
7,Docker,10
8,ReactJS,10
9,TensorFlow,10


#### Multihop question

Run the following query in the database: 
- MATCH p=(p1:Person {name: "Lucy Clark"})-[:KNOWS]->(s1:Skill)<-[:KNOWS]-(p2:Person) RETURN DISTINCT p;
- MATCH p=(p1:Person {name: "Lucy Clark"})-[:KNOWS]->(s1:Skill)<-[:KNOWS]-(p2:Person)-[:KNOWS]-(s2:Skill) RETURN DISTINCT p;

In [23]:
# People who share at least one skill with Lucy Clark
# (Person -> Skill <- Person), de-duplicated by name.
persons_with_shared_skills_df = driver.execute_query(
    """
    MATCH p=(p1:Person {name: "Lucy Clark"})-[:KNOWS]->(s1:Skill)<-[:KNOWS]-(p2:Person)
    RETURN DISTINCT p2.name as person;
    """,
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
)

In [24]:
# Display the people sharing a skill with Lucy Clark.
persons_with_shared_skills_df

Unnamed: 0,person
0,David Hill
1,Emily Thompson
2,Matthew Scott
3,Richard Mitchell
4,John Taylor
5,Christopher Thompson
6,Sophie Perez
7,Sophia Walker
8,David Rodriguez
9,Ryan Hall


In [25]:
# Skills reachable from Lucy Clark through a person who shares one of her
# skills (Person -> Skill <- Person - Skill), de-duplicated by skill name.
skills_two_steps_df = driver.execute_query(
    """
    MATCH p=(p1:Person {name: "Lucy Clark"})-[:KNOWS]->(s1:Skill)<-[:KNOWS]-(p2:Person)-[:KNOWS]-(s2:Skill)
    RETURN DISTINCT s2.name as skill;
    """,
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
)

In [26]:
# Display the skills found via the multi-hop query.
skills_two_steps_df

Unnamed: 0,skill
0,Java
1,Angular
2,TensorFlow
3,Cloud Architecture
4,ReactJS
5,Azure
6,Cypher
7,Scala
8,Node.js
9,CSS3


## Person Similarity

We can define the similarity of persons based on the number of skills that are overlapping. 

In [27]:
# For Thomas Brown: every other person sharing more than one skill with him,
# with the shared skills and their count, largest overlap first.
similar_skills_df = driver.execute_query(
    """
    MATCH path_1=(p1:Person{name:"Thomas Brown"})-[:KNOWS]->(s1:Skill)
    MATCH path_2=(s1)<-[:KNOWS]-(p2:Person)
    WITH p1.name as person_1, p2.name as person_2, COLLECT(DISTINCT s1.name) as skill_list, COUNT(DISTINCT(s1)) as skill_count
    WHERE skill_count > 1 AND person_1 <> person_2
    RETURN * ORDER BY skill_count DESC
    """,
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
)

In [28]:
# Display Thomas Brown's overlaps.
similar_skills_df

Unnamed: 0,person_1,person_2,skill_count,skill_list
0,Thomas Brown,Amelia Davis,3,"[Java, Docker, Security]"
1,Thomas Brown,Andrew Martin,2,"[R, Java]"
2,Thomas Brown,James Anderson,2,"[R, Security]"
3,Thomas Brown,Thomas Garcia,2,"[Java, Docker]"
4,Thomas Brown,Lucy Turner,2,"[Docker, Security]"


In [29]:
# Skill overlap for every pair of people. `p1.name < p2.name` keeps each pair
# once (one ordering only) and drops self-pairs.
similar_skills_all_df = driver.execute_query(
    """
    MATCH path_1=(p1:Person)-[:KNOWS]->(s1:Skill)<-[:KNOWS]-(p2:Person)
    WHERE p1.name < p2.name
    WITH p1.name as person_1, p2.name as person_2, COLLECT(DISTINCT s1.name) as skill_list, COUNT(DISTINCT(s1)) as skill_count
    WHERE skill_count >= 1
    RETURN * ORDER BY skill_count DESC
    """,
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
)

In [30]:
# Display the pairwise overlaps.
similar_skills_all_df

Unnamed: 0,person_1,person_2,skill_count,skill_list
0,Charles Jones,John Taylor,3,"[AWS, CSS3, Pandas]"
1,Joseph Martin,Kevin Young,3,"[Agile, Linux, ReactJS]"
2,Brian Thompson,John Baker,3,"[Agile, Data Analysis, PHP]"
3,Natalie Thompson,Ryan Jones,3,"[Angular, Jenkins, Spark]"
4,David Rodriguez,Matthew Scott,3,"[Azure, Cypher, Scrum]"
...,...,...,...,...
1314,David Hill,Ryan Hall,1,[Scrum]
1315,Emily Thompson,Ryan Hall,1,[Scrum]
1316,Matthew Scott,Ryan Hall,1,[Scrum]
1317,John Taylor,Ryan Hall,1,[Scrum]


Load the skill count to the database in a new relationship

In [31]:
# Write the pairwise overlap as SIMILAR_SKILLSET relationships, in batches.
# Both endpoints are looked up with MATCH: the people already exist, and a
# MERGE on `name` would try to create a Person without the `email` node key
# whenever a name is not found.
for chunk in split_dataframe(similar_skills_all_df):
    records, summary, keys = driver.execute_query(
        """
        UNWIND $rows AS row
        MATCH (p1:Person{name:row.person_1})
        MATCH (p2:Person{name:row.person_2})
        MERGE (p1)-[s:SIMILAR_SKILLSET]->(p2)
        SET s.overlap = row.skill_count
        RETURN COUNT(*) AS rows_processed
        """,
        database_=DATABASE,
        routing_=RoutingControl.WRITE,
        # Only the columns the query reads are sent; skill_list is not used.
        rows = chunk[['person_1', 'person_2', 'skill_count']].to_dict('records')
    )

Take a minute to explore the SIMILAR_SKILLSET network

## Communities

Let's run some Graph Data Science based on Persons and Skills. Let's first setup the [Graph Data Science Client](https://neo4j.com/docs/graph-data-science-client/current/). 

In [32]:
# Build the GDS client on top of the existing driver and select the target database.
gds = GraphDataScience.from_neo4j_driver(driver)
gds.set_database(DATABASE)
# Last expression of the cell: the version string reported through the client.
gds.version()

'2.15.0'

Let's investigate Persons that are similar in the graph (based on skills they share). For that we first need to create a [Graph object](https://neo4j.com/docs/graph-data-science-client/current/graph-object/). 

In [33]:
# Name used for the projection created in the next cell.
graph_name = "person_similarity_projection"
# Only Person nodes are projected.
node_projection = ["Person"]
# SIMILAR_SKILLSET is projected as undirected and carries its `overlap` property along.
rel_projection = {
    "SIMILAR_SKILLSET": {
        "orientation": "UNDIRECTED",
        "properties": "overlap",
    },
}

In [34]:
# Projecting under a name that is already in the catalogue fails, e.g. after
# an earlier run that stopped before the drop cell. Remove a leftover
# projection first so this cell can be re-run.
if gds.graph.exists(graph_name)["exists"]:
    gds.graph.get(graph_name).drop()
G, res = gds.graph.project(graph_name, node_projection, rel_projection)

Run the [Leiden Algorithm](https://neo4j.com/docs/graph-data-science/current/algorithms/leiden/) for Community Detection

In [35]:
# Run Leiden on the projection and write each node's community id to the
# `leiden_community` property, weighting relationships by `overlap`.
# NOTE(review): gamma / theta / maxLevels are tuning values; see the Leiden docs for their effect.
gds.leiden.write(
    G,
    relationshipWeightProperty='overlap',
    writeProperty='leiden_community',
    gamma=5,
    theta=0.001,
    maxLevels=100,
)

writeMillis                                                              4
nodePropertiesWritten                                                  100
ranLevels                                                                2
didConverge                                                           True
nodeCount                                                              100
communityCount                                                          46
communityDistribution    {'min': 1, 'p5': 1, 'max': 6, 'p999': 6, 'p99'...
modularity                                                       -0.025358
modularities                  [-0.0262057617167881, -0.025358010698155336]
postProcessingMillis                                                     1
preProcessingMillis                                                      0
computeMillis                                                           28
configuration            {'writeProperty': 'leiden_community', 'theta':...
Name: 0, dtype: object

In [36]:
# Number of Person nodes per leiden_community value.
communities_df = driver.execute_query(
    """
    MATCH (p:Person)
    RETURN p.leiden_community AS Community, COUNT(*) as MemberCount
    """,
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
)

In [37]:
# Display the community sizes.
communities_df

Unnamed: 0,Community,MemberCount
0,40,3
1,0,1
2,1,1
3,2,1
4,30,2
5,33,6
6,39,2
7,29,2
8,21,5
9,50,2


Check communities based on people with high overlap

In [38]:
# Pairs with an overlap above 2, next to the community of each person, to
# compare whether strongly overlapping people ended up in the same community.
community_check_df = driver.execute_query(
    """
    MATCH (p1:Person)-[s:SIMILAR_SKILLSET]->(p2:Person)
    WHERE s.overlap > 2
    RETURN s.overlap AS Overlap, p1.name AS Person1, p1.leiden_community AS Community1, p2.name AS Person2, p2.leiden_community AS Community2
    """,
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
)

In [39]:
# Display the high-overlap pairs with their communities.
community_check_df

Unnamed: 0,Overlap,Person1,Community1,Person2,Community2
0,3,Natalie Thompson,13,Ryan Jones,13
1,3,Hannah Campbell,21,Ryan Rodriguez,21
2,3,Charles Jones,12,John Taylor,12
3,3,Amelia Davis,17,Thomas Brown,17
4,3,Amelia Davis,17,John Garcia,20
5,3,Brian Thompson,32,John Baker,32
6,3,Joseph Martin,31,Kevin Young,31
7,3,Andrew Anderson,14,Andrew Martin,14
8,3,Daniel Hall,29,Daniel Hill,29
9,3,David Rodriguez,47,Matthew Scott,47


Drop the projection from the graph catalogue to free up resources

In [40]:
# Remove the projection created earlier; the returned details are shown as the cell output.
G.drop()

graphName                                     person_similarity_projection
database                                                             neo4j
databaseLocation                                                     local
memoryUsage                                                               
sizeInBytes                                                             -1
nodeCount                                                              100
relationshipCount                                                     2638
configuration            {'relationshipProjection': {'SIMILAR_SKILLSET'...
density                                                           0.266465
creationTime                           2025-03-11T15:32:08.764636439+00:00
modificationTime                       2025-03-11T15:32:08.764636439+00:00
schema                   {'graphProperties': {}, 'nodes': {'Person': {}...
schemaWithOrientation    {'graphProperties': {}, 'nodes': {'Person': {}...
Name: 0, dtype: object

## Semantically Similar Skills

In [41]:
# Fetch the names of all Skill nodes through the GDS client.
# NOTE: this rebinds `skills_df`, which until here held the person table read
# from the CSV; earlier cells that use `skills_df` must not be re-run after
# this one without reloading the CSV first.
skills_df = gds.run_cypher(
    """
    MATCH (s:Skill)
    RETURN s.name AS skill
    """
)

In [42]:
# Show the first five skill names.
skills_df.head(5)

Unnamed: 0,skill
0,API Design
1,AWS
2,Agile
3,Angular
4,Azure


## STOP STOP STOP - DO NOT PROCEED (YET)

-- Only to be run by instructor (or if you have your own api key). Skip the following two cells -- 

In [43]:
# embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')

In [44]:
# skills_df['embedding'] = skills_df['skill'].apply( lambda skill: embeddings.embed_documents([skill])[0])
# skills_df.head()

In [45]:
# gds.run_cypher('''
#     unwind $data as row
#     match (s:Skill{name: row.skill})
#     set s.embedding = row.embedding
#     ''',
#     params = { 'data': skills_df.to_dict(orient='records') }
# )

In [46]:
#url = 'https://raw.githubusercontent.com/erikbijl/genai-workshop-amsterdam/refs/heads/main/talent/skills_embeddings.csv'

In [47]:
# Read the skill embeddings from the local CSV
# (path is relative to the notebook's working directory).
skills_embeddings_df = pd.read_csv(
    filepath_or_buffer='data/skills_embeddings_updated.csv'
)
#skills_embeddings_df = pd.read_csv(url)

In [48]:
# Preview the raw frame; the parsing cell that follows treats the embedding columns as text.
skills_embeddings_df.head()

Unnamed: 0,Skill,Description,EmbeddingSkill,EmbeddingDescription
0,API Design,API Design is the process of creating applicat...,"[0.002335607074201107, -0.021444285288453102, ...","[0.0038909369613975286, -0.01114029809832573, ..."
1,AWS,"AWS, or Amazon Web Services, is a comprehensiv...","[-0.004132895264774561, -0.017077714204788208,...","[0.0031335512176156044, -0.02374901808798313, ..."
2,Agile,Agile is a dynamic and flexible project manage...,"[-0.03577807545661926, -0.01052175834774971, -...","[-0.020632127299904823, -0.015866903588175774,..."
3,Angular,"Angular is a powerful, open-source web applica...","[-0.006358983926475048, 0.015042469836771488, ...","[0.005805748514831066, 0.0334085151553154, -0...."
4,Azure,Azure is Microsoft's cloud computing platform ...,"[0.00941519346088171, -0.026340041309595108, 0...","[-0.00862701702862978, -0.010243783704936504, ..."


In [49]:
def _parse_embedding(value):
    """Convert one embedding cell into a list of floats.

    Strings of the form "[0.1, -0.2, ...]" are split on commas (float() ignores
    the surrounding blanks). Values that are already sequences, e.g. after this
    cell ran once, are passed through as floats, so the cell can be re-run
    without failing on `.strip`.
    """
    if isinstance(value, str):
        return [float(component) for component in value.strip("[]").split(",")]
    return [float(component) for component in value]

skills_embeddings_df['EmbeddingSkill'] = skills_embeddings_df['EmbeddingSkill'].apply(_parse_embedding)
skills_embeddings_df['EmbeddingDescription'] = skills_embeddings_df['EmbeddingDescription'].apply(_parse_embedding)

In [50]:
# Preview the frame again after the embedding columns were converted.
skills_embeddings_df.head()

Unnamed: 0,Skill,Description,EmbeddingSkill,EmbeddingDescription
0,API Design,API Design is the process of creating applicat...,"[0.002335607074201107, -0.021444285288453102, ...","[0.0038909369613975286, -0.01114029809832573, ..."
1,AWS,"AWS, or Amazon Web Services, is a comprehensiv...","[-0.004132895264774561, -0.017077714204788208,...","[0.0031335512176156044, -0.02374901808798313, ..."
2,Agile,Agile is a dynamic and flexible project manage...,"[-0.03577807545661926, -0.01052175834774971, -...","[-0.020632127299904823, -0.015866903588175774,..."
3,Angular,"Angular is a powerful, open-source web applica...","[-0.006358983926475048, 0.015042469836771488, ...","[0.005805748514831066, 0.0334085151553154, -0...."
4,Azure,Azure is Microsoft's cloud computing platform ...,"[0.00941519346088171, -0.026340041309595108, 0...","[-0.00862701702862978, -0.010243783704936504, ..."


Length of an embedding

In [51]:
# Number of components in the first skill-name embedding.
len(skills_embeddings_df['EmbeddingSkill'].iloc[0])

1536

In [52]:
# Number of components in the first description embedding.
len(skills_embeddings_df['EmbeddingDescription'].iloc[0])

1536

Add embeddings to Skill nodes in database

In [53]:
# Store the skill-name embedding on the matching Skill nodes, in batches.
# Only the two columns the query reads are sent; the description text and the
# second embedding would otherwise travel along in every row for nothing.
for chunk in split_dataframe(skills_embeddings_df[['Skill', 'EmbeddingSkill']]):
    records, summary, keys = driver.execute_query(
        """
        UNWIND $rows AS row
        MATCH (s:Skill{name: row.Skill})
        SET s.embedding_skill = row.EmbeddingSkill
        WITH s
        CALL db.create.setNodeVectorProperty(s, "embedding_skill", s.embedding_skill)
        RETURN COUNT(*) AS rows_processed
        """,
        database_=DATABASE,
        routing_=RoutingControl.WRITE,
        rows = chunk.to_dict('records')
    )

In [54]:
# Store the description embedding on the matching Skill nodes, in batches.
# Only the two columns the query reads are sent; the description text and the
# other embedding would otherwise travel along in every row for nothing.
for chunk in split_dataframe(skills_embeddings_df[['Skill', 'EmbeddingDescription']]):
    records, summary, keys = driver.execute_query(
        """
        UNWIND $rows AS row
        MATCH (s:Skill{name: row.Skill})
        SET s.embedding_description = row.EmbeddingDescription
        WITH s
        CALL db.create.setNodeVectorProperty(s, "embedding_description", s.embedding_description)
        RETURN COUNT(*) AS rows_processed
        """,
        database_=DATABASE,
        routing_=RoutingControl.WRITE,
        rows = chunk.to_dict('records')
    )

## Vectors for Semantic Meaning

In [56]:
# Vector index over Skill.embedding_skill: 1536 dimensions, cosine similarity.
# IF NOT EXISTS keeps the statement safe to re-run.
driver.execute_query(
    """
    CREATE VECTOR INDEX `skill-embeddings` IF NOT EXISTS
    FOR (s:Skill) ON (s.embedding_skill)
    OPTIONS {
        indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'
        } 
    }
    """,
    routing_=RoutingControl.WRITE,
    database_=DATABASE,
)

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x176e72d10>, keys=[])

In [57]:
# Vector index over Skill.embedding_description: 1536 dimensions, cosine similarity.
# IF NOT EXISTS keeps the statement safe to re-run.
driver.execute_query(
    """
    CREATE VECTOR INDEX `description-embeddings` IF NOT EXISTS
    FOR (s:Skill) ON (s.embedding_description)
    OPTIONS {
        indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'
        } 
    }
    """,
    routing_=RoutingControl.WRITE,
    database_=DATABASE,
)

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x16def2f10>, keys=[])

In [58]:
# List all indexes, including the two vector indexes, as a DataFrame.
indexes_result_df = driver.execute_query(
    "SHOW INDEXES",
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
)
indexes_result_df

Unnamed: 0,id,name,state,populationPercent,type,entityType,labelsOrTypes,properties,indexProvider,owningConstraint,lastRead,readCount
0,7,constraint_63bf11a1,ONLINE,100.0,RANGE,NODE,[Skill],[name],range-1.0,constraint_63bf11a1,,
1,4,constraint_d3bfd313,ONLINE,100.0,RANGE,NODE,[Person],[email],range-1.0,constraint_d3bfd313,,
2,3,description-embeddings,ONLINE,100.0,VECTOR,NODE,[Skill],[embedding_description],vector-2.0,,,
3,0,index_343aff4e,ONLINE,100.0,LOOKUP,NODE,,,token-lookup-1.0,,2025-03-11T15:22:19.978000000+00:00,5517.0
4,1,index_f7700477,ONLINE,100.0,LOOKUP,RELATIONSHIP,,,token-lookup-1.0,,2025-03-11T13:49:24.452000000+00:00,33.0
5,2,skill-embeddings,ONLINE,100.0,VECTOR,NODE,[Skill],[embedding_skill],vector-2.0,,,


## Semantic Search

Take a Skill and find other relevant Skills, for example: "Python", "Java", "Git", "CI/CD", "AWS", "Data Visualization", "Power BI", "R".

In [59]:
# Nearest neighbours of the skill "R" in the skill-name embedding index:
# take the 10 closest nodes, drop "R" itself and keep scores above 0.9.
similar_skills_df = driver.execute_query(
    """
    MATCH (s:Skill{name: "R"})
    CALL db.index.vector.queryNodes("skill-embeddings", 10, s.embedding_skill) YIELD node, score
    WITH node as skill, score ORDER BY score DESC
    WHERE node.name <> s.name AND score > 0.9
    RETURN skill.name, score
    """,
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
)
similar_skills_df

Unnamed: 0,skill.name,score
0,Ruby,0.915771
1,Spark,0.900055


In [60]:
# Same lookup for "R", this time against the description embedding index:
# take the 10 closest nodes, drop "R" itself and keep scores above 0.9.
similar_skills_df = driver.execute_query(
    """
    MATCH (s:Skill{name: "R"})
    CALL db.index.vector.queryNodes("description-embeddings", 10, s.embedding_description) YIELD node, score
    WITH node as skill, score ORDER BY score DESC
    WHERE node.name <> s.name AND score > 0.9
    RETURN skill.name, score
    """,
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
)
similar_skills_df

Unnamed: 0,skill.name,score
0,Data Analysis,0.920853
1,Ruby,0.919937
2,Pandas,0.909424
3,Spark,0.905365
4,Python,0.904694
5,C++,0.902298
6,Tableau,0.901474


In [64]:
# For every Skill, look up its 10 nearest neighbours in the description
# embedding index and link pairs scoring above 0.92 with SIMILAR_SEMANTIC.
# `skill1.name < skill2.name` skips the node itself and keeps one direction per pair.
driver.execute_query(
    """
    CALL apoc.periodic.iterate(
        "MATCH (skill1:Skill) RETURN skill1",
        "WITH skill1 
        CALL db.index.vector.queryNodes('description-embeddings', 10, skill1.embedding_description) YIELD node, score
        WITH skill1, node as skill2, score ORDER BY score DESC
        WHERE skill1.name < skill2.name AND score > 0.92
        MERGE (skill1)-[s:SIMILAR_SEMANTIC]->(skill2)
        SET s.score = score   
        ",
        {batchSize: 1000}
    )
    """,
    routing_=RoutingControl.WRITE,
    database_=DATABASE,
    result_transformer_=Result.to_df,
)

Unnamed: 0,batches,total,timeTaken,committedOperations,failedOperations,failedBatches,retries,errorMessages,batch,operations,wasTerminated,failedParams,updateStatistics
0,1,54,0,54,0,0,0,{},"{'total': 1, 'errors': {}, 'committed': 1, 'fa...","{'total': 54, 'errors': {}, 'committed': 54, '...",False,{},"{'relationshipsDeleted': 0, 'relationshipsCrea..."


Let's look in the browser at what these relationships look like.

In [62]:
# Read back the SIMILAR_SEMANTIC pairs, highest score first. The pattern is
# undirected, so `s1.name < s2.name` keeps each pair once.
similar_skills_df = driver.execute_query(
    """
    MATCH (s1:Skill)-[r:SIMILAR_SEMANTIC]-(s2:Skill)
    WHERE s1.name < s2.name
    RETURN s1.name AS skill1, r.score AS score, s2.name AS skill2
    ORDER BY score DESC
    """,
    routing_=RoutingControl.READ,
    database_=DATABASE,
    result_transformer_=Result.to_df,
)
similar_skills_df

Unnamed: 0,skill1,score,skill2
0,Data Visualization,0.95079,Tableau
1,Agile,0.946243,Scrum
2,Data Analysis,0.943985,Data Visualization
3,ReactJS,0.94342,Vue.js
4,AWS,0.941818,Azure
5,CI/CD,0.941498,DevOps
6,Power BI,0.940277,Tableau
7,JavaScript,0.939743,Node.js
8,Express.js,0.937454,Node.js
9,Docker,0.936218,Kubernetes
