In [1]:
import pandas as pd
import os
from stellargraph import datasets
from IPython.display import display, HTML
from neo4j import GraphDatabase
import torch

import time
import py2neo

functions

In [2]:
def empty_database(graph, dataset_label):
    # Delete nodes and relationships associated with the specified dataset label
    delete_query = f"""
    MATCH (n:{dataset_label}_Node)
    DETACH DELETE n
    """
    tx = graph.auto()
    tx.evaluate(delete_query)
    
    # Drop constraints associated with the specified dataset label
    constraints = graph.run("CALL db.constraints").data()
    for constraint in constraints:
        # Check if the constraint description contains the dataset label
        if f":{dataset_label}_Node" in constraint['description']:
            graph.run(f"DROP CONSTRAINT {constraint['name']}")

    # Drop indexes associated with the specified dataset label
    indexes = graph.run("CALL db.indexes").data()
    for index in indexes:
        # Check if the index description contains the dataset label
        if f":{dataset_label}_Node" in index['name']:
            graph.run(f"DROP INDEX {index['name']}")

In [3]:
def load_nodes(graph, node_list, dataset_label):
    query = f"""
    UNWIND $node_list as node
    CREATE (n:{dataset_label}_Node {{
        subject: node.subject,
        features: node.features
    }})
    SET n.ID = toInteger(node.id)
    """
    records = node_list.to_dict("records")
    tx = graph.auto()
    tx.evaluate(query, parameters={"node_list": records})


In [4]:
def load_edges(graph, edge_list, dataset_label):
    query = f"""
    UNWIND $edge_list as edge
    MATCH (source:{dataset_label}_Node {{ID: toInteger(edge.source)}})
    MATCH (target:{dataset_label}_Node {{ID: toInteger(edge.target)}})
    MERGE (source)-[:{dataset_label}_CITES]->(target)
    """
    records = edge_list.to_dict("records")
    tx = graph.auto()
    tx.evaluate(query, parameters={"edge_list": records})


In [5]:
def manage_constraints(graph, dataset_label, action="create"):
    if action == "create":
        constraint_query = f"CREATE CONSTRAINT ON (n:{dataset_label}_Node) ASSERT n.ID IS UNIQUE"
    elif action == "drop":
        constraints = graph.run("CALL db.constraints").data()
        for constraint in constraints:
            if dataset_label in constraint['description']:
                graph.run(f"DROP CONSTRAINT {constraint['description']}")
    else:
        raise ValueError("Action must be either 'create' or 'drop'")
    
    tx = graph.auto()
    tx.evaluate(constraint_query)


In [13]:
default_host = os.environ.get("EIGEN_THESIS")

# Create the Neo4j Graph database object; the arguments can be edited to specify location and authentication
graph = py2neo.Graph('neo4j://localhost:7687', user="neo4j", password="eigen1234")

CORA

In [None]:
dataset_cora = datasets.Cora()
display(HTML(dataset_cora.description))
dataset_cora.download()

In [None]:
edge_list_cora = pd.read_csv(
    os.path.join(dataset_cora.data_directory, "cora.cites"),
    sep="\t",
    header=None,
    names=["target", "source"],
)
edge_list_cora["label"] = "cites"


In [None]:
display(edge_list_cora)

In [None]:
feature_names_cora = ["w_{}".format(ii) for ii in range(1433)]
column_names_cora = feature_names_cora + ["subject"]
node_list_cora = pd.read_csv(
    os.path.join(dataset_cora.data_directory, "cora.content"),
    sep="\t",
    header=None,
    names=column_names_cora,
)

In [None]:
# gather all features into lists under 'features' column.
node_list_cora["features"] = node_list_cora[feature_names_cora].values.tolist()

node_list_cora = node_list_cora.drop(columns=feature_names_cora)
node_list_cora["id"] = node_list_cora.index
node_list_cora

In [None]:
empty_database(graph,'cora')

In [None]:
load_nodes(graph,node_list_cora,'cora')

In [None]:
load_edges(graph,edge_list_cora,'cora')

In [None]:
manage_constraints(graph,'cora')

CITESEER

In [2]:
dataset_citeseer = datasets.CiteSeer()
display(HTML(dataset_citeseer.description))
dataset_citeseer.download()

In [3]:
edge_list_citeseer = pd.read_csv(
    os.path.join(dataset_citeseer.data_directory, "citeseer.cites"),
    sep="\t",
    header=None,
    names=["target", "source"],
)
edge_list_citeseer["label"] = "cites"


In [4]:
display(edge_list_citeseer)

Unnamed: 0,target,source,label
0,100157,100157,cites
1,100157,364207,cites
2,100157,38848,cites
3,100157,bradshaw97introduction,cites
4,100157,bylund99coordinating,cites
...,...,...,...
4727,zhao98empirical,zhao99discriminant,cites
4728,zheng98stochastic,90601,cites
4729,zheng98stochastic,zheng98integrating,cites
4730,zhu00incorporating,clarke01exploiting,cites


In [5]:
feature_names_citeseer = ["w_{}".format(ii) for ii in range(3703)]
column_names_citeseer = feature_names_citeseer + ["subject"]
node_list_citeseer = pd.read_csv(
    os.path.join(dataset_citeseer.data_directory, "citeseer.content"),
    sep="\t",
    header=None,
    names=column_names_citeseer,
    low_memory=False,
)

In [6]:
# gather all features into lists under 'features' column.
node_list_citeseer["features"] = node_list_citeseer[feature_names_citeseer].values.tolist()

node_list_citeseer = node_list_citeseer.drop(columns=feature_names_citeseer)
node_list_citeseer["id"] = node_list_citeseer.index
node_list_citeseer

Unnamed: 0,subject,features,id
100157,Agents,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",100157
100598,IR,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",100598
105684,Agents,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",105684
11099,DB,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11099
114091,AI,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",114091
...,...,...,...
zhang99query,DB,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",zhang99query
zhang99situated,ML,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",zhang99situated
zhang99towards,IR,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",zhang99towards
zhou00implementation,DB,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",zhou00implementation


In [None]:
empty_database(graph,'citeseer')

In [None]:
load_nodes(graph,node_list_citeseer,'citeseer')

In [None]:
load_edges(graph,edge_list_citeseer,'citeseer')

In [None]:
manage_constraints(graph,'citeseer')

PUBMED

In [None]:
dataset_pubmed = datasets.PubMedDiabetes()
display(HTML(dataset_pubmed.description))
dataset_pubmed.download()

In [10]:
edge_list_pubmed = pd.read_csv(
    os.path.join(dataset_pubmed.data_directory, "Pubmed-Diabetes.DIRECTED.cites.tab"),
    sep="\t",
    header=None,
    names=["id", "source", "pipe", "target"],
    usecols=["source", "target","id"],
)
# Strip the "paper:" prefix from 'source' and 'target'
edge_list_pubmed['source'] = edge_list_pubmed['source'].str.lstrip("paper:")
edge_list_pubmed['target'] = edge_list_pubmed['target'].str.lstrip("paper:")

# Filter rows with non-integer values
edge_list_pubmed = edge_list_pubmed[edge_list_pubmed['source'].str.isdigit() & edge_list_pubmed['target'].str.isdigit()]

# Convert the filtered columns to integers
edge_list_pubmed['source'] = edge_list_pubmed['source'].astype(int)
edge_list_pubmed['target'] = edge_list_pubmed['target'].astype(int)

edge_list_pubmed["label"] = "cites"


In [11]:
display(edge_list_pubmed.head(10))

Unnamed: 0,id,source,target,label
2,33824,19127292,17363749,cites
3,37511,19668377,17293876,cites
4,6735,1313726,3002783,cites
5,33493,19110882,14578298,cites
6,27380,18606979,10333910,cites
7,37858,19735543,8420806,cites
8,34760,19228402,17017752,cites
9,34707,19220880,12734781,cites
10,13481,16595006,11790819,cites
11,33995,19143817,11206408,cites


In [None]:
feature_names_pubmed = ["w_{}".format(ii) for ii in range(500)]
column_names_pubmed = feature_names_pubmed + ["subject"]
node_list_pubmed = pd.read_csv(
    os.path.join(dataset_pubmed.data_directory, "Pubmed-Diabetes.NODE.paper.tab"),
    sep="\t",
    skiprows=2,
    header=None,
    names=column_names_pubmed,
    low_memory=False,
)

In [16]:
# Load the dataset
dataset_pubmed = datasets.PubMedDiabetes()
dataset_pubmed, labels = dataset_pubmed.load()

# Convert node features to a DataFrame
node_list_pubmed = pd.DataFrame(dataset_pubmed.node_features(), index=dataset_pubmed.nodes())

# Combine all feature columns into a single list for each row
node_list_pubmed["features"] = node_list_pubmed.apply(lambda row: row.tolist(), axis=1)

# Add labels as a separate column
node_list_pubmed["label"] = labels.values

# Add the node IDs as a separate column
node_list_pubmed["id"] = dataset_pubmed.nodes()

# Drop the individual feature columns to retain only 'features' and 'label'
node_list_pubmed = node_list_pubmed[["id","features", "label"]]

# Display the updated DataFrame
node_list_pubmed.head(10)


Unnamed: 0_level_0,id,features,label
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12187484,12187484,"[0.09393489360809326, 0.028698459267616272, 0....",1
2344352,2344352,"[0.02361791580915451, 0.0, 0.01478415913879871...",1
14654069,14654069,"[0.10226314514875412, 0.0, 0.01066898088902235...",1
16443886,16443886,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2
2684155,2684155,"[0.03061581775546074, 0.0, 0.0, 0.0, 0.0, 0.0,...",1
15032912,15032912,"[0.11689675599336624, 0.0, 0.0, 0.0, 0.0, 0.0,...",1
17988185,17988185,"[0.0, 0.0, 0.00744525995105505, 0.0, 0.0, 0.0,...",3
9834350,9834350,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3
16230722,16230722,"[0.0, 0.010479104705154896, 0.0042941542342305...",3
3542527,3542527,"[0.0, 0.0, 0.027970030903816223, 0.0, 0.0, 0.0...",2


In [17]:
load_nodes(graph,node_list_pubmed,'pubmed')

In [18]:
load_edges(graph,edge_list_pubmed,'pubmed')