In [None]:
#!pip install py2neo
#!pip install sentence-transformers
#!pip install neo4j

In [1]:
# Import the necessary libraries
import os
from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer, util

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Get connection parameters from environment variables

uri = os.getenv('NEO4J_URI', 'neo4j://b92ae674.databases.neo4j.io')
user = os.getenv('NEO4J_USER', 'neo4j')
password = os.getenv('NEO4J_PASSWORD', 'password')

In [3]:
# Clear the Neo4j database beforehand if needed
def clear_db(uri, user, password):
    driver = GraphDatabase.driver(uri, auth=(user, password))
    with driver.session() as session:
        session.run("MATCH (n) DETACH DELETE n")

In [4]:
# Function to create nodes and relationships in Neo4j
def create_knowledge_graph(uri, user, password, nodes, edges):
    driver = GraphDatabase.driver(uri, auth=(user, password))
    
    def add_node(tx, label, name):
        tx.run("MERGE (a:" + label + " {name: $name})", name=name)
    
    def add_relationship(tx, label1, name1, label2, name2, relationship):
        tx.run("MATCH (a:" + label1 + " {name: $name1}), (b:" + label2 + " {name: $name2}) "
               "MERGE (a)-[r:" + relationship + "]->(b)",
               name1=name1, name2=name2)
    
    with driver.session() as session:
        for node in nodes:
            session.execute_write(add_node, "Node", node)
        for edge in edges:
            session.execute_write(add_relationship, "Node", edge[0], "Node", edge[1], "RELATED_TO")
    
    driver.close()

# Function to extract nodes and edges from text using sentence-transformers
def extract_nodes_edges(text):
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    sentences = text.split('.')
    embeddings = model.encode(sentences, convert_to_tensor=True)
    
    # Use cosine similarity to find related sentences
    cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)
    
    nodes = set()
    edges = set()
    
    similarity_threshold = 0.9

    for i in range(len(sentences)):
        nodes.add(sentences[i].strip())
        for j in range(i+1, len(sentences)):
            if cosine_scores[i][j] > similarity_threshold:  # Threshold for similarity
                edges.add((sentences[i].strip(), sentences[j].strip()))
    
    return list(nodes), list(edges)


In [5]:
# Process the text document
filename = 'US-Constitution-With-Amendments.txt'

# Read the full text data
with open(filename, "r") as file:
    text = file.read()

# Extract nodes and edges from the text
nodes, edges = extract_nodes_edges(text)

# NOTE: This ran for 72 mins on Windows 10, around 48 mins on Linux Mint 22
# Create the knowledge graph
create_knowledge_graph(uri, user, password, nodes, edges)



In [6]:
# Helper function to run a query and return the results
def run_query(uri, user, password, query):
    driver = GraphDatabase.driver(uri, auth=(user, password))
    results = []
    
    with driver.session() as session:
        result = session.run(query)
        for record in result:
            results.append(record)
    
    driver.close()
    return results

# Get connection parameters from environment variables
uri = os.getenv('NEO4J_URI', 'neo4j://b92ae674.databases.neo4j.io')
user = os.getenv('NEO4J_USER', 'neo4j')
password = os.getenv('NEO4J_PASSWORD', 'password')


Example Neo4j Queries against this data...

In [7]:
# Query 1: Get all nodes
query = "MATCH (n) RETURN n LIMIT 10"
nodes = run_query(uri, user, password, query)
nodes

[<Record n=<Node element_id='4:5c070880-17d8-45a9-a1fa-df478e8a6ff0:0' labels=frozenset({'Node'}) properties={'name': ''}>>,
 <Record n=<Node element_id='4:5c070880-17d8-45a9-a1fa-df478e8a6ff0:1' labels=frozenset({'Node'}) properties={'name': 'A well-regulated militia being                  2'}>>,
 <Record n=<Node element_id='4:5c070880-17d8-45a9-a1fa-df478e8a6ff0:2' labels=frozenset({'Node'}) properties={'name': 'from the persons having the highest\n     numbers, not exceeding three, on the\n     list of those voted for as President,\n     the House of Representatives shall\n     choose immediately, by ballot, the\n     President'}>>,
 <Record n=<Node element_id='4:5c070880-17d8-45a9-a1fa-df478e8a6ff0:3' labels=frozenset({'Node'}) properties={'name': 'No State shall lay\n     any'}>>,
 <Record n=<Node element_id='4:5c070880-17d8-45a9-a1fa-df478e8a6ff0:4' labels=frozenset({'Node'}) properties={'name': 'or Representative in Congress, or\n presidential elector, or hold any office,\n civi

In [8]:
# Query 2: Get all relationships
query = "MATCH ()-[r]->() RETURN r LIMIT 10"
relationships = run_query(uri, user, password, query)
relationships

[<Record r=<Relationship element_id='5:5c070880-17d8-45a9-a1fa-df478e8a6ff0:1152921504606846976' nodes=(<Node element_id='4:5c070880-17d8-45a9-a1fa-df478e8a6ff0:0' labels=frozenset() properties={}>, <Node element_id='4:5c070880-17d8-45a9-a1fa-df478e8a6ff0:0' labels=frozenset() properties={}>) type='RELATED_TO' properties={}>>,
 <Record r=<Relationship element_id='5:5c070880-17d8-45a9-a1fa-df478e8a6ff0:1152921504606846979' nodes=(<Node element_id='4:5c070880-17d8-45a9-a1fa-df478e8a6ff0:3' labels=frozenset() properties={}>, <Node element_id='4:5c070880-17d8-45a9-a1fa-df478e8a6ff0:52' labels=frozenset() properties={}>) type='RELATED_TO' properties={}>>,
 <Record r=<Relationship element_id='5:5c070880-17d8-45a9-a1fa-df478e8a6ff0:1152921504606846985' nodes=(<Node element_id='4:5c070880-17d8-45a9-a1fa-df478e8a6ff0:9' labels=frozenset() properties={}>, <Node element_id='4:5c070880-17d8-45a9-a1fa-df478e8a6ff0:422' labels=frozenset() properties={}>) type='RELATED_TO' properties={}>>,
 <Record r

In [10]:
# Query 3: Find nodes related to a specific node
specific_node = "Article 1"
query = f"MATCH (n)-[r:RELATED_TO]->(m) WHERE n.name = '{specific_node}' RETURN n, r, m"
related_nodes = run_query(uri, user, password, query)
print(f"Nodes related to '{specific_node}':", related_nodes)

Nodes related to 'Article 1': []


In [11]:
# Query 4: Count the number of nodes
query = "MATCH (n) RETURN count(n) as node_count"
node_count = run_query(uri, user, password, query)
print("Number of nodes:", node_count)

Number of nodes: [<Record node_count=2368>]


In [12]:
# Query 5: Count the number of relationships
query = "MATCH ()-[r]->() RETURN count(r) as relationship_count"
relationship_count = run_query(uri, user, password, query)
print("Number of relationships:", relationship_count)

Number of relationships: [<Record relationship_count=743>]
