<a href="https://colab.research.google.com/github/guerinjeanmarc/FraudWorkshop/blob/main/Quick_Neo4j_GDS_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

Install the Neo4j `graphdatascience` Python client ([Documentation](https://neo4j.com/docs/graph-data-science/current/))

In [None]:
%%capture
!pip install graphdatascience 

Import our usual suspects

In [None]:
from getpass import getpass

import pandas as pd
from graphdatascience import GraphDataScience

Register for a Neo4j Sandbox account at https://sandbox.neo4j.com and create an empty (blank) sandbox.

In [None]:
# Capture connection string and auth info.
# Surrounding whitespace from copy/paste is stripped from the URL and user name.
connectionUrl = input("Neo4j Database Url: ").strip()
username = input("User name: ").strip()
# Read the password with getpass so it is not echoed into the cell output
# (and therefore not saved inside the notebook file).
password = getpass("Password: ")


Neo4j Database Url: bolt://3.219.33.30:7687
User name: neo4j
Password: forces-address-front


In [None]:
# Open the client connection with the credentials captured above, then show
# the row of the debug system info that holds the server's GDS version.
gds = GraphDataScience(connectionUrl, auth=(username, password))
sysinfo = gds.debug.sysInfo()
sysinfo.loc[sysinfo['key'] == 'gdsVersion']


Unnamed: 0,key,value
0,gdsVersion,2.3.1


# Graph creation

In [None]:
# Toy data set: one row per (person, merchant) pair with the transacted amount.
transaction_df = pd.DataFrame(
    data=[
        ('Tom', 'Amazon', 100),
        ('Tom', 'Dustin', 50499),
        ('Tom', 'eBay', 220),
        ('Stefan', 'Amazon', 220),
        ('Stefan', 'Dustin', 399),
        ('Stefan', 'eBay', 1499),
        ('Stefan', 'Bikes.de', 22000),
        ('Kristof', 'Amazon', 423),
        ('Kristof', 'Dustin', 530),
        ('Kristof', 'Hello Fresh', 1050),
        ('Kristof', 'Steam', 230),
        ('Kristof', 'Activision', 783),
        ('Håkan', 'Hello Fresh', 2100),
        ('Håkan', 'Steam', 230),
        ('Håkan', 'Activision', 783),
    ],
    columns=['name', 'merchant', 'amount'],
)
# Display all 15 rows.
transaction_df.head(15)

Unnamed: 0,name,merchant,amount
0,Tom,Amazon,100
1,Tom,Dustin,50499
2,Tom,eBay,220
3,Stefan,Amazon,220
4,Stefan,Dustin,399
5,Stefan,eBay,1499
6,Stefan,Bikes.de,22000
7,Kristof,Amazon,423
8,Kristof,Dustin,530
9,Kristof,Hello Fresh,1050


In [None]:
# Declare `name` as node key for Person and Merchant nodes.
# Not strictly needed for a sample this small.
node_key_constraint = "create constraint if not exists for (p:{label}) require (p.name) is node key"
gds.run_cypher(node_key_constraint.format(label="Person"))
gds.run_cypher(node_key_constraint.format(label="Merchant"))


In [None]:
# Create a graph for (:Person)-[:TRANSACTED_WITH]->(:Merchant).
# The rows are sent in fixed-size batches, one query per batch, so a larger
# dataframe is not pushed through a single oversized query.
# MERGE keeps one node per name and one relationship per (person, merchant)
# pair; if a pair occurs more than once, the last amount sent overwrites tx.amount.
TRANSACTION_BATCH_SIZE = 10_000
load_transactions_query = """
    unwind $transactions as transaction
    merge (p:Person{name: transaction['name']})
    merge (m:Merchant{name: transaction['merchant']})
    merge (p)-[tx:TRANSACTED_WITH]->(m)
       set tx.amount = transaction['amount']
    """
transaction_records = transaction_df.to_dict(orient='records')
for batch_start in range(0, len(transaction_records), TRANSACTION_BATCH_SIZE):
    batch = transaction_records[batch_start:batch_start + TRANSACTION_BATCH_SIZE]
    gds.run_cypher(load_transactions_query, params={'transactions': batch})

# Basic navigation of graph with cypher

## Exercise 1

In [None]:
# Which persons are in the database? Return the name of every Person node.
persons_query = """ 
  match (p:Person)
  return p.name as person_name
"""
gds.run_cypher(persons_query).head()

Unnamed: 0,person_name
0,Håkan
1,Kristof
2,Stefan
3,Tom


In [None]:
# Which merchants do persons transact with?
# One row per merchant, with the list of persons that transacted with it.
merchants_query = """ 
  match (p:Person)-[tx:TRANSACTED_WITH]->(m:Merchant)
  return m.name as name,collect(p.name) as persons
"""
gds.run_cypher(merchants_query).head()

Unnamed: 0,name,persons
0,Activision,"[Håkan, Kristof]"
1,Amazon,"[Kristof, Stefan, Tom]"
2,Bikes.de,[Stefan]
3,Dustin,"[Kristof, Stefan, Tom]"
4,Hello Fresh,"[Håkan, Kristof]"


In [None]:
# Exercise 1: What is the sum of the transacted amount per person and merchant?
# Replace with your solution
exercise_1_query = """ 
  match (p:Person)-[tx:TRANSACTED_WITH]->(m:Merchant)
  return p.name as person, m.name as merchant, sum(tx.amount) as totalAmount
"""
gds.run_cypher(exercise_1_query).head()

Unnamed: 0,person,merchant,totalAmount
0,Håkan,Activision,783
1,Kristof,Activision,783
2,Kristof,Amazon,423
3,Stefan,Amazon,220
4,Tom,Amazon,100


## Exercise 2

In [None]:
# Who transacted with Amazon but not Steam?
# Start from Amazon's customers and keep those without any relationship to Steam.
amazon_not_steam_query = """ 
  match (p:Person)-[:TRANSACTED_WITH]->(:Merchant{name:"Amazon"})
  where not exists { (p)-[:TRANSACTED_WITH]->(:Merchant{name:"Steam"}) }
  return p.name as transacted_with_amazon_but_not_steam
"""
gds.run_cypher(amazon_not_steam_query).head()

Unnamed: 0,transacted_with_amazon_but_not_steam
0,Stefan
1,Tom


In [None]:
# Exercise 2: Are there Persons in the graph that did not transact with a merchant at all?
# Replace with your solution
exercise_2_query = """ 
  match (p:Person)
  where not exists { (p)-[:TRANSACTED_WITH]->(:Merchant) }
  return count(p)
"""
gds.run_cypher(exercise_2_query).head()

Unnamed: 0,count(p)
0,0


## Exercise 3

In [None]:
# What nodes do I have in my graph? Count the nodes per label combination.
node_labels_query = """ 
  match (n)
  return labels(n) as node_labels, count(*) as number_of_nodes
"""
gds.run_cypher(node_labels_query).head()

In [None]:
# Exercise 3: What relationships do I have in my graph?
# Replace with your solution (the query below is only a placeholder)
exercise_3_query = """ 
  return "not_solved" as answer
"""
gds.run_cypher(exercise_3_query).head()

# Node similarity

Let's get this party started

In [None]:
# Project a graph named "shopping" into the graph catalogue:
#   - nodes: every Person and Merchant
#   - relationships: TRANSACTED_WITH, carrying the `amount` property
G, res = gds.graph.project(
    "shopping",
    ["Person", "Merchant"],
    {"TRANSACTED_WITH": {"properties": "amount"}},
)


Loading:   0%|          | 0/100 [00:00<?, ?%/s]

In [None]:
# Quick check of what ended up in the projection: node count and node labels.
print(
    f"Graph '{G.name()}' node count: {G.node_count()}",
    f"Graph '{G.name()}' node labels: {G.node_labels()}",
    sep="\n",
)


In [None]:
# Run node similarity on the projection, weighted by `amount`, and write each
# result back as an IS_SIMILAR_TO relationship with a `sim_score` property.
# NOTE(review): re-running this cell presumably writes the relationships a
# second time — confirm before re-executing against the same database.
gds.nodeSimilarity.write(
    G,
    writeRelationshipType='IS_SIMILAR_TO',
    writeProperty='sim_score',
    relationshipWeightProperty='amount',
)

preProcessingMillis                                                       3
computeMillis                                                           104
writeMillis                                                             178
postProcessingMillis                                                     -1
nodesCompared                                                             4
relationshipsWritten                                                      8
similarityDistribution    {'p1': 0.009687662124633789, 'max': 0.50738137...
configuration             {'topK': 10, 'writeConcurrency': 4, 'similarit...
Name: 0, dtype: object

In [None]:
# Drop the "shopping" projection from the graph catalogue to free up resources.
# The name becomes available again, so the same projection can be re-created later.
G.drop()

graphName                                                         shopping
database                                                             neo4j
memoryUsage                                                               
sizeInBytes                                                             -1
nodeCount                                                               11
relationshipCount                                                       15
configuration            {'relationshipProjection': {'TRANSACTED_WITH':...
density                                                           0.136364
creationTime                           2023-03-21T14:51:24.898195357+00:00
modificationTime                       2023-03-21T14:51:25.399990305+00:00
schema                   {'graphProperties': {}, 'relationships': {'TRA...
schemaWithOrientation    {'graphProperties': {}, 'relationships': {'TRA...
Name: 0, dtype: object

In [None]:
# Remove symmetric relationships: when two persons are linked by IS_SIMILAR_TO
# in both directions, delete the one that starts at the node with the lower id.
dedupe_similar_to_query = """
  MATCH (a:Person)-[r:IS_SIMILAR_TO]->(b:Person) 
    WHERE (b)-[:IS_SIMILAR_TO]->(a) 
    AND   id(a)<id(b)
  DELETE r
"""
gds.run_cypher(dedupe_similar_to_query)


# Graph embedding + knn 

In [None]:
# Let's make the same projection again (the earlier one was dropped):
# Person and Merchant nodes, TRANSACTED_WITH relationships with `amount`.
G, res = gds.graph.project(
    "shopping",
    ["Person", "Merchant"],
    {"TRANSACTED_WITH": {"properties": "amount"}},
)

In [None]:
# Mutate our projection: compute a 3-dimensional node2vec embedding, weighted
# by `amount`, and store it on the projected nodes as property `embedding`.
gds.beta.node2vec.mutate(
    G,
    mutateProperty='embedding',
    embeddingDimension=3,
    relationshipWeightProperty='amount',
)

In [None]:
# What do our embeddings look like? Stream the `embedding` property of the
# Person nodes back from the projection so we can inspect it.
df_embeddings = gds.graph.nodeProperty.stream(
    G,
    node_labels='Person',
    node_properties='embedding',
)

In [None]:
# Lift the column-width limit so the embedding vectors are shown in full.
pd.set_option('display.max_colwidth', None)
df_embeddings.head(n=10)

In [None]:
# Run kNN over the Person nodes' `embedding` property and write up to 2
# neighbours per person as SIMILAR_EMBEDDING relationships with a `sim_score`.
gds.knn.write(
    G,
    topK=2,
    nodeLabels=['Person'],
    nodeProperties=['embedding'],
    writeProperty='sim_score',
    writeRelationshipType='SIMILAR_EMBEDDING',
)

In [None]:
# Drop the "shopping" projection from the graph catalogue to free up resources;
# the remaining cells only query the database through Cypher.
G.drop()

In [None]:
# Again, let's remove symmetric relationships: when two persons are linked by
# SIMILAR_EMBEDDING in both directions, delete the one that starts at the node
# with the lower id.
dedupe_similar_embedding_query = """
  MATCH (a:Person)-[r:SIMILAR_EMBEDDING]->(b:Person) 
    WHERE (b)-[:SIMILAR_EMBEDDING]->(a) 
    AND   id(a)<id(b)
  DELETE r
"""
gds.run_cypher(dedupe_similar_embedding_query)

In [None]:
# Let's review: list both kinds of similarity relationship with their scores.
# The pattern is undirected, so each relationship is reported from both of its ends.
review_query = """
  MATCH (p:Person)-[r:SIMILAR_EMBEDDING|IS_SIMILAR_TO]-(p2)
  RETURN p.name as person, 
         type(r) as type, 
         r.sim_score as score,
         p2.name as to_person
  ORDER by p.name, p2.name, type(r)
"""
gds.run_cypher(review_query).head(30)