In [1]:
# Hack to make the module importable: the notebook's parent directory is put
# on the import path (presumably that is where the rel2graph sources live).
# The entry is added only once, so re-running this cell does not keep
# appending duplicates to sys.path.
import sys

PARENT_DIR = r'./../'
if PARENT_DIR not in sys.path:
    sys.path.append(PARENT_DIR)

In [5]:
# SECURITY NOTE(review): the assignment below swaps the default HTTPS context
# factory for the unverified one, so certificate checks are skipped for every
# HTTPS request that relies on the default context in this kernel.
# Presumably a workaround for the CSV download further down failing
# certificate validation -- confirm, and prefer fixing the local CA store.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
%load_ext autoreload
%autoreload 2
# Third-party dependencies: graph database client and dataframes.
from py2neo import Graph, NodeMatcher
import pandas as pd

# rel2graph pieces used by the cells below.
from rel2graph.relational_modules.pandas import PandasDataframeIterator
from rel2graph import IteratorIterator
from rel2graph import Converter
from rel2graph.utils import load_file
from rel2graph import register_attribute_postprocessor, Attribute
# Nothing from this module is referenced by name in the notebook, so it is
# imported for its side effects only -- presumably it registers rel2graph's
# built-in modules; TODO confirm.
import rel2graph.common_modules

# Path of the schema file that is passed to load_file() when the converter is built.
filename = "schema.yaml"

In [3]:
# Configure Logging
import logging

# Attribute used to tag handlers installed by this cell so that they can be
# recognised (and replaced) when the cell is executed again.
CONSOLE_HANDLER_MARK = "_rel2graph_notebook_console"


def attach_console_handler(target, formatter):
    """Attach exactly one console handler created by this cell to ``target``.

    Handlers installed by an earlier run of this cell are removed first;
    without that, every re-run would stack another ``StreamHandler`` on the
    logger and each message would be printed several times. Handlers that
    were added by other code are left untouched.

    Args:
        target: the ``logging.Logger`` to write to the console.
        formatter: the ``logging.Formatter`` applied to the new handler.

    Returns:
        The newly attached ``logging.StreamHandler``.
    """
    for stale in [h for h in target.handlers if getattr(h, CONSOLE_HANDLER_MARK, False)]:
        target.removeHandler(stale)
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    setattr(handler, CONSOLE_HANDLER_MARK, True)
    target.addHandler(handler)
    return handler


#logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("rel2graph")
logger.setLevel(logging.DEBUG)
log_formatter = logging.Formatter("%(asctime)s [%(threadName)s]::[%(levelname)s]::%(filename)s: %(message)s")
console_handler = attach_console_handler(logger, log_formatter)


In [6]:
# Download the iris table as CSV from the seaborn-data repository and show it.
IRIS_CSV_URL = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv'
iris = pd.read_csv(IRIS_CSV_URL)
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [7]:
# Hand-written people table, one tuple per row. IDs 2 and 4 appear twice on
# purpose-looking duplicates (ID 2 even with two different last names).
people_columns = ("ID", "FirstName", "LastName", "FavoriteFlower")
people_rows = [
    (1, "Julian", "Minder", "virginica"),
    (2, "Fritz", "Generic", "setosa"),
    (2, "Fritz", "SomeGuy", "setosa"),
    (3, "Hans", "Müller", "versicolor"),
    (4, "Rudolfo", "Muster", "setosa"),
    (4, "Rudolfo", "Muster", "setosa"),
]

# Transpose the rows into the column-oriented mapping the DataFrame is built from.
data = {
    column: list(values)
    for column, values in zip(people_columns, zip(*people_rows))
}
people = pd.DataFrame(data)
people

Unnamed: 0,ID,FirstName,LastName,FavoriteFlower
0,1,Julian,Minder,virginica
1,2,Fritz,Generic,setosa
2,2,Fritz,SomeGuy,setosa
3,3,Hans,Müller,versicolor
4,4,Rudolfo,Muster,setosa
5,4,Rudolfo,Muster,setosa


In [8]:
# Connection settings are read from the environment so that real credentials
# never have to be written into (and committed with) the notebook. The
# fallbacks are the original placeholder values; set NEO4J_PASSWORD (and, if
# needed, NEO4J_HOST / NEO4J_PORT / NEO4J_USER) for your own database.
import os

graph = Graph(
    scheme="bolt",
    host=os.environ.get("NEO4J_HOST", "localhost"),
    port=int(os.environ.get("NEO4J_PORT", "7687")),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "bos"),
    ),
)

graph.delete_all()  # reset graph (only when first creating the database, here for debugging purposes)

In [9]:
# Neo4j does not support the numpy dtype int64, so such values have to be converted to Python's native int.
# We create wrappers (attribute postprocessors) that do this conversion.
@register_attribute_postprocessor
def INT(attribute):
    """Return a new Attribute with the same key and the value cast to a built-in int."""
    key = attribute.key
    native_value = int(attribute.value)
    return Attribute(key, native_value)

@register_attribute_postprocessor
def FLOAT(attribute):
    """Return a new Attribute with the same key and the value cast to a built-in float."""
    key = attribute.key
    native_value = float(attribute.value)
    return Attribute(key, native_value)

# In the schema file wrap the Person.ID attribute in the INT wrapper
#        + ID = INT(Person.ID)

2022-12-19 22:25:58,522 [MainThread]::[DEBUG]::registrar.py: Registered attribute postprocessor 'INT''.
2022-12-19 22:25:58,523 [MainThread]::[DEBUG]::registrar.py: Registered attribute postprocessor 'FLOAT''.


In [10]:
# Wrap each dataframe in its own iterator ("Person" for people, "Flower" for
# iris) and combine both, people first, into the single iterator the converter consumes.
person_iterator = PandasDataframeIterator(people, "Person")
flower_iterator = PandasDataframeIterator(iris, "Flower")
iterator = IteratorIterator([person_iterator, flower_iterator])

In [11]:
# Clear the graph again right before converting, so the run below starts from
# an empty database even when only the later cells are re-executed.
graph.delete_all()

In [12]:
# Load the schema file, then build the converter from the schema, the combined
# data iterator and the target graph; a single worker is requested.
schema = load_file(filename)
converter = Converter(schema, iterator, graph, num_workers=1)

2022-12-19 22:26:10,291 [MainThread]::[DEBUG]::schema_compiler.py: Compiling module 'AttributeFactory' with static args ['sepal_length', 'sepal_length']
2022-12-19 22:26:10,291 [MainThread]::[DEBUG]::registrar.py: Requested factory 'AttributeFactory' from registry.
2022-12-19 22:26:10,292 [MainThread]::[DEBUG]::schema_compiler.py: Compiling module 'FLOAT' with static args []
2022-12-19 22:26:10,292 [MainThread]::[DEBUG]::registrar.py: Requested factory 'FLOAT' from registry.
2022-12-19 22:26:10,293 [MainThread]::[DEBUG]::registrar.py: Requested factory 'AttributeFactoryWrapper' from registry.
2022-12-19 22:26:10,293 [MainThread]::[DEBUG]::schema_compiler.py: Compiling module 'AttributeFactory' with static args ['sepal_width', 'sepal_width']
2022-12-19 22:26:10,294 [MainThread]::[DEBUG]::registrar.py: Requested factory 'AttributeFactory' from registry.
2022-12-19 22:26:10,294 [MainThread]::[DEBUG]::schema_compiler.py: Compiling module 'FLOAT' with static args []
2022-12-19 22:26:10,295 

In [13]:

# Run the conversion: calling the converter object starts it (see the log
# output below, which begins with the creation of nodes).
converter()

2022-12-19 22:26:12,885 [MainThread]::[INFO]::converter.py: Running convertion with 1 parallel workers.
2022-12-19 22:26:12,886 [MainThread]::[INFO]::converter.py: Starting creation of nodes.
2022-12-19 22:26:12,886 [Worker-0]::[DEBUG]::converter.py: Starting Worker 0
2022-12-19 22:26:12,888 [Worker-0]::[DEBUG]::converter.py: Processing PandasSeriesResource 'Person' (row 0)
2022-12-19 22:26:12,888 [Worker-0]::[DEBUG]::converter.py: Processing PandasSeriesResource 'Person' (row 1)
2022-12-19 22:26:12,889 [Worker-0]::[DEBUG]::converter.py: Processing PandasSeriesResource 'Person' (row 2)
2022-12-19 22:26:12,890 [Worker-0]::[DEBUG]::converter.py: Processing PandasSeriesResource 'Person' (row 3)
2022-12-19 22:26:12,891 [Worker-0]::[DEBUG]::converter.py: Processing PandasSeriesResource 'Person' (row 4)
2022-12-19 22:26:12,892 [Worker-0]::[DEBUG]::converter.py: Processing PandasSeriesResource 'Person' (row 5)
2022-12-19 22:26:12,893 [Worker-0]::[DEBUG]::converter.py: Processing PandasSeriesR