In [None]:
#!pip install py2neo
#!pip install sentence-transformers
#!pip install neo4j
#!pip install sentence-transformers pandas torch

In [1]:
# Import the necessary libraries
import os
from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer, util

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Get connection parameters from environment variables

uri = os.getenv('NEO4J_URI', 'neo4j://b92ae674.databases.neo4j.io')
user = os.getenv('NEO4J_USER', 'neo4j')
password = os.getenv('NEO4J_PASSWORD', 'password')

In [4]:
# Clear the Neo4j database beforehand if needed
def clear_db(uri, user, password):
    driver = GraphDatabase.driver(uri, auth=(user, password))
    with driver.session() as session:
        session.run("MATCH (n) DETACH DELETE n")

In [5]:
# Function to create nodes and relationships in Neo4j
def create_knowledge_graph(uri, user, password, nodes, edges):
    driver = GraphDatabase.driver(uri, auth=(user, password))
    
    def add_node(tx, label, name):
        tx.run("MERGE (a:" + label + " {name: $name})", name=name)
    
    def add_relationship(tx, label1, name1, label2, name2, relationship):
        tx.run("MATCH (a:" + label1 + " {name: $name1}), (b:" + label2 + " {name: $name2}) "
               "MERGE (a)-[r:" + relationship + "]->(b)",
               name1=name1, name2=name2)
    
    with driver.session() as session:
        for node in nodes:
            session.execute_write(add_node, "Node", node)
        for edge in edges:
            session.execute_write(add_relationship, "Node", edge[0], "Node", edge[1], "RELATED_TO")
    
    driver.close()


In [6]:
# New function that uses some guidance on what nodes are desired
# We'll use sample sentences to identify semantically similar sentences in the body of the text
# Import the necessary libraries
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util

# Function to extract nodes and edges from text using sentence-transformers with semantic similarity
def extract_nodes_edges_with_semantic_similarity(text, example_sentences, threshold=0.7, batch_size=32):
    model = SentenceTransformer('sentence-transformers/paraphrase-mpnet-base-v2')
    if torch.cuda.is_available():
        model = model.to('cuda')
    
    sentences = text.split('.')
    df = pd.DataFrame(sentences, columns=['sentence'])
    
    embeddings = []
    for i in range(0, len(df), batch_size):
        batch = df['sentence'][i:i+batch_size].tolist()
        batch_embeddings = model.encode(batch, convert_to_tensor=True)
        embeddings.append(batch_embeddings)
    
    embeddings = torch.cat(embeddings)
    
    # Encode example sentences
    example_embeddings = model.encode(example_sentences, convert_to_tensor=True)
    
    # Use cosine similarity to find sentences similar to example sentences
    cosine_scores = util.pytorch_cos_sim(embeddings, example_embeddings)
    
    nodes = set()
    edges = set()
    
    for i in range(len(sentences)):
        for j in range(len(example_sentences)):
            if cosine_scores[i][j] > threshold:  # Adjustable threshold for similarity
                nodes.add(sentences[i].strip())
    
    # Add all sentences as nodes and find edges based on similarity
    for i in range(len(sentences)):
        nodes.add(sentences[i].strip())
        for j in range(i+1, len(sentences)):
            if util.pytorch_cos_sim(embeddings[i], embeddings[j]) > threshold:
                edges.add((sentences[i].strip(), sentences[j].strip()))
    
    return list(nodes), list(edges)

In [None]:
# Process the document...NOTE: This runs for ~5 hours
filename = 'US-Constitution-With-Amendments.txt'

# Read the full text data
with open(filename, "r") as file:
    text = file.read()

# Define example sentences that are similar to the nodes of interest
example_sentences = [
    "The right to free speech",
    "The right to bear arms",
    "The right to a fair trial",
    "The right to privacy",
    "The right to vote"
]

# Extract nodes and edges from the text with an adjustable threshold
nodes, edges = extract_nodes_edges_with_semantic_similarity(text, example_sentences, threshold=0.9, batch_size=32)

# Print the extracted nodes and edges
print("Nodes:", nodes)
print("Edges:", edges)

# Create the knowledge graph
create_knowledge_graph(uri, user, password, nodes, edges)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Nodes: ['', 'deny to any person within its\n jurisdiction the equal', 'Section 5', 'District of Columbia, electors for             23          1', 'Qualifications of electors of                  17          1', 'The seats of the Senators of the third          1          3       2\n     class at the expiration of the sixth\n     year', 'On imports and exports, without the             1         10       2\n     consent of Congress, except where\n     necessary for executing its\n     inspection laws', 'Article [III', 'Congress may determine the time of              2          1       4\n     choosing the electors and the day on\n     which they shall give their votes,\n     which day shall be the same\n     throughout the United States', 'Day on which they shall vote for President          2          1       4\n and Vice President, which shall be the\n same throughout the United States', 'Revenue', 'The Vice President shall have no vote          1          3       4\n unless the Senate b

Knowledge Graph is built in Neoj4...now let's run some queries to understand the data and what we have collected. 

In [11]:
# Helper function to run a query and return the results
def run_query(uri, user, password, query):
    driver = GraphDatabase.driver(uri, auth=(user, password))
    results = []
    
    with driver.session() as session:
        result = session.run(query)
        for record in result:
            results.append(record)
    
    driver.close()
    return results

# Get connection parameters from environment variables
uri = os.getenv('NEO4J_URI', 'neo4j://b92ae674.databases.neo4j.io')
user = os.getenv('NEO4J_USER', 'neo4j')
password = os.getenv('NEO4J_PASSWORD', 'password')


Example Neo4j Queries against this data...

In [12]:
# Query 1: Get all nodes
query = "MATCH (n) RETURN n LIMIT 10"
nodes = run_query(uri, user, password, query)
nodes

[<Record n=<Node element_id='4:11a93b28-aff9-4f10-82aa-12582f6eaff7:0' labels=frozenset({'Node'}) properties={'name': ''}>>,
 <Record n=<Node element_id='4:11a93b28-aff9-4f10-82aa-12582f6eaff7:1' labels=frozenset({'Node'}) properties={'name': 'deny to any person within its\n jurisdiction the equal'}>>,
 <Record n=<Node element_id='4:11a93b28-aff9-4f10-82aa-12582f6eaff7:2' labels=frozenset({'Node'}) properties={'name': 'Section 5'}>>,
 <Record n=<Node element_id='4:11a93b28-aff9-4f10-82aa-12582f6eaff7:3' labels=frozenset({'Node'}) properties={'name': 'District of Columbia, electors for             23          1'}>>,
 <Record n=<Node element_id='4:11a93b28-aff9-4f10-82aa-12582f6eaff7:4' labels=frozenset({'Node'}) properties={'name': 'Qualifications of electors of                  17          1'}>>,
 <Record n=<Node element_id='4:11a93b28-aff9-4f10-82aa-12582f6eaff7:5' labels=frozenset({'Node'}) properties={'name': 'The seats of the Senators of the third          1          3       2\n   

In [13]:
# Query 2: Get all relationships
query = "MATCH ()-[r]->() RETURN r LIMIT 10"
relationships = run_query(uri, user, password, query)
relationships

[<Record r=<Relationship element_id='5:11a93b28-aff9-4f10-82aa-12582f6eaff7:1152921504606846976' nodes=(<Node element_id='4:11a93b28-aff9-4f10-82aa-12582f6eaff7:0' labels=frozenset() properties={}>, <Node element_id='4:11a93b28-aff9-4f10-82aa-12582f6eaff7:0' labels=frozenset() properties={}>) type='RELATED_TO' properties={}>>,
 <Record r=<Relationship element_id='5:11a93b28-aff9-4f10-82aa-12582f6eaff7:1152921504606846978' nodes=(<Node element_id='4:11a93b28-aff9-4f10-82aa-12582f6eaff7:2' labels=frozenset() properties={}>, <Node element_id='4:11a93b28-aff9-4f10-82aa-12582f6eaff7:2' labels=frozenset() properties={}>) type='RELATED_TO' properties={}>>,
 <Record r=<Relationship element_id='5:11a93b28-aff9-4f10-82aa-12582f6eaff7:1152921504606846987' nodes=(<Node element_id='4:11a93b28-aff9-4f10-82aa-12582f6eaff7:11' labels=frozenset() properties={}>, <Node element_id='4:11a93b28-aff9-4f10-82aa-12582f6eaff7:2210' labels=frozenset() properties={}>) type='RELATED_TO' properties={}>>,
 <Record 

In [14]:
# Query 3: Find nodes related to a specific node
specific_node = "Article 1"
query = f"MATCH (n)-[r:RELATED_TO]->(m) WHERE n.name = '{specific_node}' RETURN n, r, m"
related_nodes = run_query(uri, user, password, query)
print(f"Nodes related to '{specific_node}':", related_nodes)

Nodes related to 'Article 1': []


In [15]:
# Query 4: Count the number of nodes
query = "MATCH (n) RETURN count(n) as node_count"
node_count = run_query(uri, user, password, query)
print("Number of nodes:", node_count)

Number of nodes: [<Record node_count=2368>]


In [16]:
# Query 5: Count the number of relationships
query = "MATCH ()-[r]->() RETURN count(r) as relationship_count"
relationship_count = run_query(uri, user, password, query)
print("Number of relationships:", relationship_count)

Number of relationships: [<Record relationship_count=1042>]


In [17]:
# Query to return the top 10 most frequent nodes with a common relationship
query = """
MATCH (n)-[r]->()
RETURN n.name AS node, count(r) AS relationships
ORDER BY relationships DESC
LIMIT 10
"""
top_nodes = run_query(uri, user, password, query)
print("Top 10 most frequent nodes with a common relationship:", top_nodes)

Top 10 most frequent nodes with a common relationship: [<Record node='---------------------------------------------------------------------------\n                                   * * * * *                              \n\\3\\This clause has been affected by clause 1 of amendment XVII' relationships=8>, <Record node='Article [XX' relationships=7>, <Record node='\\4\\\n---------------------------------------------------------------------------\n                                   * * * * *                              \n\\4\\This clause has been affected by clause 2 of amendment XVIII' relationships=7>, <Record node='---------------------------------------------------------------------------\n                                   * * * * *                              \n\\5\\This clause has been affected by amendment XX' relationships=6>, <Record node='The judges of the Supreme and             3          1' relationships=6>, <Record node='Article [XXI' relationships=6>, <Record node=']\n\

In [19]:
# Query to return the top 3 most frequent relationships
query = """
MATCH ()-[r]->()
RETURN type(r) AS relationship, count(r) AS frequency
ORDER BY frequency DESC
LIMIT 3
"""
top_relationships = run_query(uri, user, password, query)
print("Top 3 most frequent relationships:", top_relationships)

Top 3 most frequent relationships: [<Record relationship='RELATED_TO' frequency=1042>]


In [20]:
# Query to return the top 3 most frequent relationships plus the nodes counts for each relationship
query = """
MATCH ()-[r]->()
WITH type(r) AS relationship, count(r) AS frequency
ORDER BY frequency DESC
LIMIT 3
MATCH (n)-[r]->()
WHERE type(r) = relationship
RETURN relationship, frequency, count(DISTINCT n) AS node_count
ORDER BY frequency DESC
"""
top_relationships_with_node_count = run_query(uri, user, password, query)
print("Top 3 most frequent relationships plus the node count for each of those relationships:", top_relationships_with_node_count)

Top 3 most frequent relationships plus the node count for each of those relationships: [<Record relationship='RELATED_TO' frequency=1042 node_count=652>]


In [21]:
# Query to show number of amendments in the US Constitution
query = "MATCH (a:Amendment) RETURN count(a) as amendment_count"
amendment_count = run_query(uri, user, password, query)
print("Number of amendments in the US Constitution:", amendment_count)



Number of amendments in the US Constitution: [<Record amendment_count=0>]


In [22]:
# Query to show the top 3 most common properties from all nodes in the graph
query = """
MATCH (n)
UNWIND keys(n) AS property
RETURN property, count(n[property]) AS frequency
ORDER BY frequency DESC
LIMIT 3
"""
top_properties = run_query(uri, user, password, query)
print("Top 3 most common properties from all nodes in the graph:", top_properties)

Top 3 most common properties from all nodes in the graph: [<Record property='name' frequency=2368>]


In [23]:
# Query to show how many nodes have a relationship and how many nodes have no relationship
query = """
MATCH (n)
OPTIONAL MATCH (n)-[r]->()
WITH n, COUNT(r) AS relationships
RETURN 
    COUNT(CASE WHEN relationships > 0 THEN 1 END) AS nodes_with_relationships,
    COUNT(CASE WHEN relationships = 0 THEN 1 END) AS nodes_without_relationships
"""
nodes_relationships_status = run_query(uri, user, password, query)
print("Nodes with and without relationships:", nodes_relationships_status)



Nodes with and without relationships: [<Record nodes_with_relationships=652 nodes_without_relationships=1716>]


While this exercise did indeed build a knowledge graph using a text file of the US Constitution and all of it's amendments, it's not a particularly useful KG. The only node property is "name" and the only relationship type is "related_to", which is better than nothing, but not all that helpful for gleaning knowledge from a body of text that we might not already know very much about. 

We might be able to improve this by using a better model for extracting the nodes and relationships (this used a basic sentence-transformer embedding model all-MiniLM-L6-v2, which might not have been the best choice). The other two models (commented out above) are likely worth trying.