In [10]:
import subprocess
import docker
import os
import neo4j
from neo4j import GraphDatabase
import pandas as pd
import time
import numpy as np
from tqdm.notebook import tqdm
from torch_geometric.utils import sort_edge_index
import torch
from torch_geometric.datasets import PPI
from torch_geometric.loader import DataLoader

In [None]:
from ogb.nodeproppred import NodePropPredDataset
from torch_geometric.data import DataLoader

# Download and process data at './dataset/ogbg_molhiv/'
dataset = NodePropPredDataset(name = "ogbn-papers100M", root = 'ogbn_dataset/')

Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 15947.92it/s]


Saving...


In [9]:
import os
os.listdir("./ogbn_dataset/ogbn_papers100M/processed/")

['data_processed']

In [None]:
dataset.shape

In [None]:
split_idx = dataset.get_idx_split() 
train_loader = DataLoader(dataset[split_idx["train"]], batch_size=32, shuffle=True)
valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=32, shuffle=False)
test_loader = DataLoader(dataset[split_idx["test"]], batch_size=32, shuffle=False)

In [2]:
ppi = PPI(root = "data")
data_loader = iter(DataLoader(ppi, batch_size=len(ppi)))
ppi_data = next(data_loader)
pd.DataFrame(ppi_data.x).to_csv("data/ppi_x.csv", index = False, sep = ",")
pd.DataFrame(ppi_data.y).to_csv("data/ppi_y.csv", index = False, sep = ",")
pd.DataFrame(ppi_data.edge_index.transpose(-1, 0)).to_csv("data/ppi_edge_index.csv", index = False, sep = ",")

In [5]:
def create_neo4j_container():
    # Create a Docker client
    client = docker.from_env()
    
    # Define the image name
    image_name = "neo4j:latest"
    
    # Pull the image if it is not already available
    client.images.pull(image_name)
    
    # Define the container name and environment variables
    container_name = "neo4j-apoc"
    neo4j_env = {
        "NEO4J_apoc_export_file_enabled": "true",
        "NEO4J_apoc_import_file_enabled": "true",
        "NEO4J_apoc_import_file_use__neo4j__config": "true",
        "NEO4J_PLUGINS": '["apoc"]',
        "NEO4J_AUTH": "neo4j/password",
    }
    
    # Define the volume mapping
    volume_mapping = {
        f"{os.getcwd()}/neo4j_data": {"bind": "/data", "mode": "rw"},
        f"{os.getcwd()}/data": {"bind": "/import", "mode": "rw"}
    }
    ports = {"7474": 7475, "7687": 7688}
    # Run the container
    container = client.containers.run(
        image=image_name,
        name=container_name,
        ports=ports,
        environment=neo4j_env,
        volumes=volume_mapping,
        detach=True,
        tty=True,
        stdin_open=True,
        remove=True,  # Automatically remove the container when it stops
    )
    
    print(f"Container {container_name} started with ID: {container.id}")
    return container, ports

In [6]:
container, ports = create_neo4j_container()

Container neo4j-apoc started with ID: 59ce67e509c03e3c38a022f18fdcf970ce0723cf2b6935ba300ad959d20409a1


In [7]:
#container.stop()

In [8]:
def connect_to_neo4j(uri, user, password):
    """
    Establishes a connection to the Neo4j database.

    Args:
        uri (str): The URI of the Neo4j database.
        user (str): The username for authentication.
        password (str): The password for authentication.

    Returns:
        GraphDatabase.driver: The Neo4j driver instance for the connection.
    """

    return GraphDatabase.driver(uri, auth=(user, password))

In [9]:
uri = f"bolt://localhost:{ports['7687']}"
username = "neo4j"
password = "password"

# Parameters
num_hops = 3
num_iterations = 10
driver = connect_to_neo4j(uri, username, password)
driver.execute_query("MATCH (n) DETACH DELETE n")

Transaction failed and will be retried in 0.8631931064165062s (Couldn't connect to localhost:7688 (resolved to ()):
Connection to 127.0.0.1:7688 closed without handshake response)


EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x7eff9c026830>, keys=[])

In [10]:
data_root_dir = "/home/dwalke/git/shweta/Master-Thesis-Code/data/syn_data"
num_nodes = 10000
num_edges = 5
# edge_file_name = f"edge_index_{num_nodes}_nodes_{num_edges}_edges.csv"
# X = pd.read_csv(f"{data_root_dir}/X_{num_nodes}_nodes_{num_edges}_edges.csv")
# y = pd.read_csv(f"{data_root_dir}/y_{num_nodes}_nodes_{num_edges}_edges.csv")
# edges = pd.read_csv(f"{data_root_dir}/{edge_file_name}")
edge_file_name = "ppi_edge_index.csv"
X = pd.read_csv(f"data/ppi_x.csv")
y = pd.read_csv(f"data/ppi_y.csv")
edges = pd.read_csv("data/" + edge_file_name)

In [11]:
## TODO in-memory graph

In [27]:
def recurse_edge_index(source_nodes, edge_index, max_depth, depth = 0):
    assert max_depth >= 1, "Max depth should be above or equal to one"
    target_mask = edge_index[1][:, None] == source_nodes
    target_mask = target_mask.sum(-1).astype(np.bool_)
    subgraph_edge_index = edge_index[:, target_mask]
    depth = depth + 1
    
    if depth == max_depth:
        return subgraph_edge_index

    source_nodes = np.concatenate([subgraph_edge_index[0, :], source_nodes])    
    return recurse_edge_index(source_nodes,edge_index, max_depth, depth)

def get_subgraph_from_in_mem_graph(X,y, i, edge_index, hops):
    
    subgraph_edge_index = recurse_edge_index([i], edge_index, hops)
    unique_node_ids = np.unique(subgraph_edge_index)
    
    features = X.iloc[unique_node_ids, :].values
    labels = y.iloc[unique_node_ids, :].values.squeeze()

    node_ids = unique_node_ids
    _, cols_source = np.nonzero((subgraph_edge_index[0] == node_ids[:, None]).transpose())
    _, cols_target = np.nonzero((subgraph_edge_index[1] == node_ids[:, None]).transpose())
    remapped_edge_index = np.concatenate([np.expand_dims(cols_source, axis = 0), np.expand_dims(cols_target, axis = 0)], axis = 0)
    return remapped_edge_index, features, labels
    
raise Exception("Comment out")
hops = 2
overall_run_time = 0
edge_index = edges.values.astype(np.int64).transpose()
for i in tqdm(range(X.shape[0])):
    start = time.time()
    remapped_edge_index, features, labels = get_subgraph_from_in_mem_graph(X, y, i, edge_index, hops)
    overall_run_time += time.time() - start
    print(f"Fetched {remapped_edge_index.shape} edges, {labels.shape} labels, {features.shape} features in {overall_run_time} s")

Exception: Comment out

In [14]:
y_comb = y.copy()
y_comb['Combined'] = y_comb.apply(lambda row: [row[column] for column in y.columns], axis=1)

In [15]:
X_and_y = X.copy()
# X_and_y["label"] = y["0"]
X_and_y["label"] = y_comb["Combined"].apply(lambda x: ','.join(map(str, x)))
# node_file_name = f"X_and_y_{num_nodes}_{num_edges}.csv"
node_file_name = "X_y_ppi.csv"
X_and_y.to_csv(f"data/{node_file_name}", sep = ",", index = False)

In [21]:
X_and_y

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,label
0,-0.085514,-0.088374,-0.11278,-0.171909,-0.076607,-0.100271,-0.075123,-0.114888,-0.121191,-0.09936,...,-0.1539,-0.159347,-0.154581,-0.146616,-0.144943,-0.156757,-0.139898,-0.149366,-0.148115,"1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0...."
1,-0.085514,-0.088374,-0.11278,-0.171909,-0.076607,-0.100271,-0.075123,-0.114888,-0.121191,-0.09936,...,-0.1539,-0.159347,-0.154581,-0.146616,-0.144943,-0.156757,-0.139898,-0.149366,-0.148115,"1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1...."
2,-0.085514,-0.088374,-0.11278,-0.171909,-0.076607,-0.100271,-0.075123,-0.114888,-0.121191,-0.09936,...,-0.1539,-0.159347,-0.154581,-0.146616,-0.144943,-0.156757,-0.139898,-0.149366,-0.148115,"1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...."
3,-0.085514,-0.088374,-0.11278,-0.171909,-0.076607,-0.100271,-0.075123,-0.114888,-0.121191,-0.09936,...,-0.1539,-0.159347,-0.154581,-0.146616,-0.144943,-0.156757,-0.139898,-0.149366,-0.148115,"1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0...."
4,-0.085514,-0.088374,-0.11278,-0.171909,-0.076607,-0.100271,-0.075123,-0.114888,-0.121191,-0.09936,...,-0.1539,-0.159347,-0.154581,-0.146616,-0.144943,-0.156757,7.148086,-0.149366,-0.148115,"1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44901,-0.085514,-0.088374,-0.11278,-0.171909,-0.076607,-0.100271,-0.075123,-0.114888,-0.121191,-0.09936,...,-0.1539,-0.159347,-0.154581,-0.146616,-0.144943,-0.156757,-0.139898,-0.149366,-0.148115,"1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0...."
44902,-0.085514,-0.088374,-0.11278,-0.171909,-0.076607,-0.100271,-0.075123,-0.114888,-0.121191,-0.09936,...,-0.1539,-0.159347,-0.154581,-0.146616,-0.144943,-0.156757,-0.139898,-0.149366,-0.148115,"1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1...."
44903,-0.085514,-0.088374,-0.11278,-0.171909,-0.076607,-0.100271,-0.075123,-0.114888,-0.121191,-0.09936,...,-0.1539,-0.159347,-0.154581,-0.146616,-0.144943,-0.156757,-0.139898,-0.149366,-0.148115,"1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0...."
44904,-0.085514,-0.088374,-0.11278,-0.171909,-0.076607,-0.100271,-0.075123,-0.114888,-0.121191,-0.09936,...,-0.1539,-0.159347,-0.154581,-0.146616,-0.144943,-0.156757,-0.139898,-0.149366,-0.148115,"1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1...."


In [140]:
pd.read_csv(f"data/{node_file_name}")["label"]

0        1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0....
1        1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1....
2        1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....
3        1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0....
4        1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1....
                               ...                        
44901    1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....
44902    1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1....
44903    1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0....
44904    1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1....
44905    1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0....
Name: label, Length: 44906, dtype: object

In [23]:
with driver.session() as session:
    node_query = """
            LOAD CSV WITH HEADERS FROM $file AS line
            WITH line, linenumber() AS index
            CALL (line, index) {
              MERGE (p:Node {id: index - 2})
              SET p.label = toIntegerList(split(line["label"], ','))
              WITH p, line, [key IN keys(line) WHERE NOT key IN ["label", "id"]] as keys
              UNWIND keys AS key
              SET p[key] = toFloat(line[key])
            } IN TRANSACTIONS OF 10000 ROWS
            """
    node_summary = session.run(node_query, file=f"file:///{node_file_name}").consume()
    node_count = node_summary.counters.nodes_created
    print(node_summary.result_available_after)
    session.run("CREATE INDEX IF NOT EXISTS FOR (n:Node) ON (n.id)")

38779


In [24]:
driver.execute_query("MATCH (n) RETURN n LIMIT 1")

EagerResult(records=[<Record n=<Node element_id='4:a7bcbde8-8be7-41a1-b47f-878022e7a268:0' labels=frozenset({'Node'}) properties={'44': -0.14661625, '45': -0.14494342, '46': -0.15675704, '47': -0.1398976, '48': -0.14936616, '49': -0.14811485, 'id': 0, '10': 0.0, '11': -0.16990769, '12': -0.04277122, '13': -0.1122726, '14': -0.07601667, '15': -0.11518571, '16': -0.1030655, '17': -0.11196714, '18': -0.14349532, '19': -0.09751539, '0': -0.08551431, '1': -0.08837446, '2': -0.11277995, '3': -0.17190897, '4': -0.076607116, '5': -0.10027067, '6': -0.07512251, '7': -0.1148876, '8': -0.12119106, '9': -0.09936041, '20': -0.08747425, '21': -0.1456625, '22': -0.12344164, '23': -0.1241833, '24': -0.097631685, '25': -0.11966856, '26': -0.116075665, '27': -0.07345559, '28': -0.06671735, '29': -0.087344915, '30': -0.17965071, '31': -0.144703, '32': -0.16059202, '33': -0.158168, '34': -0.14772183, '35': -0.4349539, '36': -0.16168302, '37': -0.1555599, '38': -0.15260626, '39': -0.13964918, 'label': [1, 

In [18]:
with driver.session(database = "neo4j") as session:
    edge_query = """
            LOAD CSV FROM $file AS line
            WITH line, linenumber() AS index
            WHERE index <> 1
            CALL (line) {
                MATCH (s:Node {id: toInteger(line[0])}), (t:Node {id: toInteger(line[1])})
                CREATE (s)-[r:connects]->(t)
            } IN TRANSACTIONS OF 10000 ROWS
            """
    edge_summary = session.run(edge_query, file=f"file:///{edge_file_name}").consume()
    edge_count = edge_summary.counters.nodes_created

In [105]:
edge_res = driver.execute_query("MATCH (s)-[r]->(t) UNWIND r as edge RETURN collect(distinct [startNode(edge).id, endNode(edge).id]) as edges")
(sort_edge_index(torch.from_numpy(edge_index)) == sort_edge_index(torch.tensor(edge_res.records[0]["edges"]).transpose(1,0))).sum()

tensor(2452736)

In [None]:
import numpy as np
import time
from tqdm.notebook import tqdm

fetch_run_time = 0
overall_run_time = 0
hops = 3

edge_index = edges.values.astype(np.int64).transpose()

##TODO other strat might be to use arrays for features?
with driver.session(database = "neo4j") as session:
    i = 0
    for seed_node_id in tqdm(list(map(lambda node: node[0], driver.execute_query("MATCH (n) RETURN n.id").records))):
        # i+=1
        # if i < 281: continue
        start = time.time()
        subgraph_query = f"""
            MATCH (s)-[r*0..{hops}]->(t {{id: $seed_node_id}})
            UNWIND r as edge
            WITH t, edge
            WITH collect(DISTINCT [startNode(edge).id, endNode(edge).id]) as edges, collect(distinct startNode(edge)) as startNodes, t as endNode
            UNWIND startNodes + [endNode] as node 
            WITH edges, COLLECT(DISTINCT node) AS uniqueNodes
            UNWIND uniqueNodes as node
            WITH edges, node, [key IN keys(node) WHERE NOT key IN ["label", "id"] | [toInteger(key), node[key]]] as nodeFeatures
            ORDER BY node.id // Only required for my tests
            RETURN edges, collect(node.id) as idCollection, collect(node.label) as labels, collect(nodeFeatures) as features
            """
        results = session.run(subgraph_query, seed_node_id = seed_node_id)
        res_df = results.to_df()
        if res_df.empty: continue

        summary = results.consume()      
        fetch_run_time += summary.result_consumed_after
        
        subgraph_edge_index = np.array(res_df["edges"][0]).transpose()
        property_keys = np.array(res_df["features"][0])[:, :, 0]
        property_values = np.array(res_df["features"][0])[:, :, 1]
        sort_idxs = np.argsort(property_keys)
        
        node_ids = np.array(res_df["idCollection"][0])
        _, cols_source = np.nonzero((subgraph_edge_index[0] == node_ids[:, None]).transpose())
        _, cols_target = np.nonzero((subgraph_edge_index[1] == node_ids[:, None]).transpose())
        
        remapped_edge_index = np.concatenate([np.expand_dims(cols_source, axis = 0), np.expand_dims(cols_target, axis = 0)], axis = 0)
        features = property_values[np.arange(property_keys.shape[0])[:, None], sort_idxs]
        labels = np.array(res_df["labels"][0])       
        overall_run_time += time.time() - start        

        ## Testing
        remapped_edge_index_test, features_test, labels_test = get_subgraph_from_in_mem_graph(X, y, seed_node_id, edge_index, hops)        
        assert (sort_edge_index(torch.from_numpy(remapped_edge_index_test)) == sort_edge_index(torch.from_numpy(remapped_edge_index))).sum() / (remapped_edge_index_test.shape[-1] * remapped_edge_index_test.shape[0]), "Edges doesnt match"
        assert np.allclose(labels_test, labels), "Labels does not match"
        assert np.allclose(features, features_test), "features doe not match"
        print(f"Fetched {remapped_edge_index.shape} edges, {labels.shape} labels, {features.shape} features in {fetch_run_time} ms ({overall_run_time} s)")

  0%|          | 0/44906 [00:00<?, ?it/s]

Fetched (2, 4153) edges, (1105, 121) labels, (1105, 50) features in 288 ms (1.0520639419555664 s)
Fetched (2, 23457) edges, (1470, 121) labels, (1470, 50) features in 2139 ms (3.5857880115509033 s)
Fetched (2, 17438) edges, (1454, 121) labels, (1454, 50) features in 2859 ms (5.079564571380615 s)
Fetched (2, 10570) edges, (1342, 121) labels, (1342, 50) features in 3358 ms (6.036400318145752 s)
Fetched (2, 25953) edges, (1501, 121) labels, (1501, 50) features in 4744 ms (8.509945392608643 s)
Fetched (2, 19237) edges, (1437, 121) labels, (1437, 50) features in 5439 ms (9.822910785675049 s)
Fetched (2, 25737) edges, (1488, 121) labels, (1488, 50) features in 6728 ms (12.065760850906372 s)
Fetched (2, 23484) edges, (1483, 121) labels, (1483, 50) features in 8515 ms (14.546214580535889 s)
Fetched (2, 18388) edges, (1440, 121) labels, (1440, 50) features in 9047 ms (15.88187289237976 s)
Fetched (2, 19320) edges, (1479, 121) labels, (1479, 50) features in 9594 ms (17.053831577301025 s)
Fetched

In [160]:
get_subgraph_from_in_mem_graph(X, y, seed_node_id, edge_index, hops) 

(array([], shape=(2, 0), dtype=int64),
 array([], shape=(0, 50), dtype=float64),
 array([], shape=(0, 121), dtype=float64))

In [95]:
driver.execute_query("MATCH (n) DETACH DELETE(n)")

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x7efd5020f220>, keys=[])

In [60]:
container.stop()