In [None]:
#!pip install py2neo
#!pip install sentence-transformers
#!pip install neo4j

In [2]:
# Import the necessary libraries
import os
from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer, util

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Get connection parameters from environment variables

uri = os.getenv('NEO4J_URI', 'neo4j://b92ae674.databases.neo4j.io')
user = os.getenv('NEO4J_USER', 'neo4j')
password = os.getenv('NEO4J_PASSWORD', 'password')

In [3]:
# Clear the Neo4j database beforehand if needed
def clear_db(uri, user, password):
    driver = GraphDatabase.driver(uri, auth=(user, password))
    with driver.session() as session:
        session.run("MATCH (n) DETACH DELETE n")

In [None]:
# Function to create nodes and relationships in Neo4j
def create_knowledge_graph(uri, user, password, nodes, edges):
    driver = GraphDatabase.driver(uri, auth=(user, password))
    
    def add_node(tx, label, name):
        tx.run("MERGE (a:" + label + " {name: $name})", name=name)
    
    def add_relationship(tx, label1, name1, label2, name2, relationship):
        tx.run("MATCH (a:" + label1 + " {name: $name1}), (b:" + label2 + " {name: $name2}) "
               "MERGE (a)-[r:" + relationship + "]->(b)",
               name1=name1, name2=name2)
    
    with driver.session() as session:
        for node in nodes:
            session.execute_write(add_node, "Node", node)
        for edge in edges:
            session.execute_write(add_relationship, "Node", edge[0], "Node", edge[1], "RELATED_TO")
    
    driver.close()

# Function to extract nodes and edges from text using sentence-transformers
def extract_nodes_edges(text):
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    #model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    #model = SentenceTransformer('sentence-transformers/paraphrase-mpnet-base-v2')
    sentences = text.split('.')
    embeddings = model.encode(sentences, convert_to_tensor=True)
    
    # Use cosine similarity to find related sentences
    cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)
    
    nodes = set()
    edges = set()
    
    similarity_threshold = 0.9  # Higher = fewer but precise relationships, lower = more but less precise relationships

    for i in range(len(sentences)):
        nodes.add(sentences[i].strip())
        for j in range(i+1, len(sentences)):
            if cosine_scores[i][j] > similarity_threshold:  # Threshold for similarity
                edges.add((sentences[i].strip(), sentences[j].strip()))
    
    return list(nodes), list(edges)


In [5]:
# Process the text document
filename = 'US-Constitution-With-Amendments.txt'

# Read the full text data
with open(filename, "r") as file:
    text = file.read()

# Extract nodes and edges from the text
nodes, edges = extract_nodes_edges(text)

# NOTE: This ran for 72 mins on Windows 10, around 48 mins on Linux Mint 22
# Create the knowledge graph
create_knowledge_graph(uri, user, password, nodes, edges)



In [5]:
# Helper function to run a query and return the results
def run_query(uri, user, password, query):
    driver = GraphDatabase.driver(uri, auth=(user, password))
    results = []
    
    with driver.session() as session:
        result = session.run(query)
        for record in result:
            results.append(record)
    
    driver.close()
    return results

# Get connection parameters from environment variables
uri = os.getenv('NEO4J_URI', 'neo4j://b92ae674.databases.neo4j.io')
user = os.getenv('NEO4J_USER', 'neo4j')
password = os.getenv('NEO4J_PASSWORD', 'password')


Example Neo4j Queries against this data...

In [7]:
# Query 1: Get all nodes
query = "MATCH (n) RETURN n LIMIT 10"
nodes = run_query(uri, user, password, query)
nodes

[<Record n=<Node element_id='4:5c070880-17d8-45a9-a1fa-df478e8a6ff0:0' labels=frozenset({'Node'}) properties={'name': ''}>>,
 <Record n=<Node element_id='4:5c070880-17d8-45a9-a1fa-df478e8a6ff0:1' labels=frozenset({'Node'}) properties={'name': 'A well-regulated militia being                  2'}>>,
 <Record n=<Node element_id='4:5c070880-17d8-45a9-a1fa-df478e8a6ff0:2' labels=frozenset({'Node'}) properties={'name': 'from the persons having the highest\n     numbers, not exceeding three, on the\n     list of those voted for as President,\n     the House of Representatives shall\n     choose immediately, by ballot, the\n     President'}>>,
 <Record n=<Node element_id='4:5c070880-17d8-45a9-a1fa-df478e8a6ff0:3' labels=frozenset({'Node'}) properties={'name': 'No State shall lay\n     any'}>>,
 <Record n=<Node element_id='4:5c070880-17d8-45a9-a1fa-df478e8a6ff0:4' labels=frozenset({'Node'}) properties={'name': 'or Representative in Congress, or\n presidential elector, or hold any office,\n civi

In [8]:
# Query 2: Get all relationships
query = "MATCH ()-[r]->() RETURN r LIMIT 10"
relationships = run_query(uri, user, password, query)
relationships

[<Record r=<Relationship element_id='5:5c070880-17d8-45a9-a1fa-df478e8a6ff0:1152921504606846976' nodes=(<Node element_id='4:5c070880-17d8-45a9-a1fa-df478e8a6ff0:0' labels=frozenset() properties={}>, <Node element_id='4:5c070880-17d8-45a9-a1fa-df478e8a6ff0:0' labels=frozenset() properties={}>) type='RELATED_TO' properties={}>>,
 <Record r=<Relationship element_id='5:5c070880-17d8-45a9-a1fa-df478e8a6ff0:1152921504606846979' nodes=(<Node element_id='4:5c070880-17d8-45a9-a1fa-df478e8a6ff0:3' labels=frozenset() properties={}>, <Node element_id='4:5c070880-17d8-45a9-a1fa-df478e8a6ff0:52' labels=frozenset() properties={}>) type='RELATED_TO' properties={}>>,
 <Record r=<Relationship element_id='5:5c070880-17d8-45a9-a1fa-df478e8a6ff0:1152921504606846985' nodes=(<Node element_id='4:5c070880-17d8-45a9-a1fa-df478e8a6ff0:9' labels=frozenset() properties={}>, <Node element_id='4:5c070880-17d8-45a9-a1fa-df478e8a6ff0:422' labels=frozenset() properties={}>) type='RELATED_TO' properties={}>>,
 <Record r

In [10]:
# Query 3: Find nodes related to a specific node
specific_node = "Article 1"
query = f"MATCH (n)-[r:RELATED_TO]->(m) WHERE n.name = '{specific_node}' RETURN n, r, m"
related_nodes = run_query(uri, user, password, query)
print(f"Nodes related to '{specific_node}':", related_nodes)

Nodes related to 'Article 1': []


In [11]:
# Query 4: Count the number of nodes
query = "MATCH (n) RETURN count(n) as node_count"
node_count = run_query(uri, user, password, query)
print("Number of nodes:", node_count)

Number of nodes: [<Record node_count=2368>]


In [12]:
# Query 5: Count the number of relationships
query = "MATCH ()-[r]->() RETURN count(r) as relationship_count"
relationship_count = run_query(uri, user, password, query)
print("Number of relationships:", relationship_count)

Number of relationships: [<Record relationship_count=743>]


In [6]:
# Query to return the top 10 most frequent nodes with a common relationship
query = """
MATCH (n)-[r]->()
RETURN n.name AS node, count(r) AS relationships
ORDER BY relationships DESC
LIMIT 10
"""
top_nodes = run_query(uri, user, password, query)
print("Top 10 most frequent nodes with a common relationship:", top_nodes)

Top 10 most frequent nodes with a common relationship: [<Record node='Article [XX' relationships=7>, <Record node='The \ndates of ratification were: Massachusetts, May 22, 1912; Arizona, June \n3, 1912; Minnesota, June 10, 1912; New York, January 15, 1913; Kansas, \nJanuary 17, 1913; Oregon, January 23, 1913; North Carolina, January 25, \n1913; California, January 28, 1913; Michigan, January 28, 1913; Iowa, \nJanuary 30, 1913; Montana, January 30, 1913; Idaho, January 31, 1913; \nWest Virginia, February 4, 1913; Colorado, February 5, 1913; Nevada, \nFebruary 6, 1913; Texas, February 7, 1913; Washington, February 7, \n1913; Wyoming, February 8, 1913; Arkansas, February 11, 1913; Maine, \nFebruary 11, 1913; Illinois, February 13, 1913; North Dakota, February \n14, 1913; Wisconsin, February 18, 1913; Indiana, February 19, 1913; New \nHampshire, February 19, 1913; Vermont, February 19, 1913; South Dakota, \nFebruary 19, 1913; Oklahoma, February 24, 1913; Ohio, February 25, \n1913; Missouri

In [None]:
# Query to return the top 10 most frequent relationships

In [7]:
# Query to return the top 3 most frequent relationships
query = """
MATCH ()-[r]->()
RETURN type(r) AS relationship, count(r) AS frequency
ORDER BY frequency DESC
LIMIT 3
"""
top_relationships = run_query(uri, user, password, query)
print("Top 3 most frequent relationships:", top_relationships)

Top 3 most frequent relationships: [<Record relationship='RELATED_TO' frequency=743>]


In [None]:
# Query to return the top 3 most frequent relationships plus the nodes counts for each relationship

In [None]:
# Query to return the top 3 most frequent relationships plus the nodes counts for each relationship
query = """
MATCH ()-[r]->()
WITH type(r) AS relationship, count(r) AS frequency
ORDER BY frequency DESC
LIMIT 3
MATCH (n)-[r]->()
WHERE type(r) = relationship
RETURN relationship, frequency, count(DISTINCT n) AS node_count
ORDER BY frequency DESC
"""
top_relationships_with_node_count = run_query(uri, user, password, query)
print("Top 3 most frequent relationships plus the node count for each of those relationships:", top_relationships_with_node_count)

Top 3 most frequent relationships plus the node count for each of those relationships: [<Record relationship='RELATED_TO' frequency=743 node_count=526>]


In [10]:
# Query to show number of amendments in the US Constitution
query = "MATCH (a:Amendment) RETURN count(a) as amendment_count"
amendment_count = run_query(uri, user, password, query)
print("Number of amendments in the US Constitution:", amendment_count)



Number of amendments in the US Constitution: [<Record amendment_count=0>]


In [11]:
# Query to show the top 3 most common properties from all nodes in the graph
query = """
MATCH (n)
UNWIND keys(n) AS property
RETURN property, count(n[property]) AS frequency
ORDER BY frequency DESC
LIMIT 3
"""
top_properties = run_query(uri, user, password, query)
print("Top 3 most common properties from all nodes in the graph:", top_properties)

Top 3 most common properties from all nodes in the graph: [<Record property='name' frequency=2368>]


In [12]:
# Query to show how many nodes have a relationship and how many nodes have no relationship
query = """
MATCH (n)
OPTIONAL MATCH (n)-[r]->()
WITH n, COUNT(r) AS relationships
RETURN 
    COUNT(CASE WHEN relationships > 0 THEN 1 END) AS nodes_with_relationships,
    COUNT(CASE WHEN relationships = 0 THEN 1 END) AS nodes_without_relationships
"""
nodes_relationships_status = run_query(uri, user, password, query)
print("Nodes with and without relationships:", nodes_relationships_status)



Nodes with and without relationships: [<Record nodes_with_relationships=526 nodes_without_relationships=1842>]


While this exercise did indeed build a knowledge graph using a text file of the US Constitution and all of it's amendments, it's not a particularly useful KG. The only node property is "name" and the only relationship type is "related_to", which is better than nothing, but not all that helpful for gleaning knowledge from a body of text that we might not already know very much about. 

We might be able to improve this by using a better model for extracting the nodes and relationships (this used a basic sentence-transformer embedding model all-MiniLM-L6-v2, which might not have been the best choice). The other two models (commented out above) are likely worth trying.