In [1]:
# Hack to make the module importable: put the parent directory on the import path.
import sys
# Guard the append so re-running this cell does not add the same entry again.
if r'./../' not in sys.path:
    sys.path.append(r'./../')

In [2]:
import ssl
# Swap the ssl module's default HTTPS context factory for the unverified one.
# NOTE(review): this turns off certificate verification for code that relies on
# the default HTTPS context, for the whole kernel process, and it uses private
# (underscore-prefixed) ssl names. Which request needs it is not visible in this
# notebook — confirm it is still required before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context

In [3]:
%load_ext autoreload
%autoreload 2
from py2neo import Graph, NodeMatcher
import pandas as pd

from rel2graph.relational_modules.pandas import PandasDataframeIterator
from rel2graph import IteratorIterator
from rel2graph import Converter
from rel2graph.utils import load_file
from rel2graph import register_attribute_postprocessor, Attribute, register_attribute_preprocessor, Resource, register_subgraph_preprocessor
import rel2graph.common_modules
from rel2graph.common_modules import DATE
from datetime import datetime
import numpy as np
import math

filename = "frus_schema.yaml"

In [4]:
# Configure Logging
import logging


def replace_console_handler(target_logger, formatter):
    """Attach exactly one notebook console handler to ``target_logger``.

    Handlers attached by an earlier run of this cell (recognised by a marker
    attribute set below) are removed first, so re-running the cell does not
    stack handlers and print every log record several times. Handlers that
    were attached by other code are left alone.

    Args:
        target_logger: The ``logging.Logger`` that receives the handler.
        formatter: The ``logging.Formatter`` applied to the new handler.

    Returns:
        The newly attached ``logging.StreamHandler``.
    """
    marker = "_notebook_console_handler"
    # Iterate over a copy: removeHandler mutates target_logger.handlers.
    for stale in [h for h in target_logger.handlers if getattr(h, marker, False)]:
        target_logger.removeHandler(stale)
    handler = logging.StreamHandler()
    setattr(handler, marker, True)
    handler.setFormatter(formatter)
    target_logger.addHandler(handler)
    return handler


#logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("rel2graph")
logger.setLevel(logging.DEBUG)
log_formatter = logging.Formatter("%(asctime)s [%(threadName)s]::[%(levelname)s]::%(filename)s: %(message)s")
console_handler = replace_console_handler(logger, log_formatter)

In [5]:
def _year_to_label(raw_year):
    """Leave a NaN year untouched; otherwise render it as an integer string."""
    if math.isnan(raw_year):
        return raw_year
    return str(int(raw_year))


# Documents: convert 'year' from 'float' to 'str(int)', the form suitable for rel2graph.
doc_df = pd.read_csv('tables/doc_single_volume.csv')
doc_df['year'] = doc_df['year'].apply(_year_to_label)

# Era, year and person tables.
era_df = pd.read_csv('tables/era.csv')
year_df = pd.read_csv('tables/year.csv')
person_df = pd.read_csv('tables/person_single_volume.csv')

# Person/document link tables (sent by, sent to, mentioned).
person_sentby_df = pd.read_csv('tables/person_sentby_single_volume.csv')
person_sentto_df = pd.read_csv('tables/person_sentto_single_volume.csv')
person_mentioned_df = pd.read_csv('tables/person_mentioned_single_volume.csv')

In [6]:
import os

# Connection settings are taken from the environment when present. The fallbacks
# are the values this notebook previously hard-coded, so an unchanged local setup
# keeps working.
# NOTE(review): set NEO4J_PASSWORD instead of relying on the fallback, so the
# credential does not have to live in the notebook.
graph = Graph(
    scheme=os.environ.get("NEO4J_SCHEME", "bolt"),
    host=os.environ.get("NEO4J_HOST", "localhost"),
    port=int(os.environ.get("NEO4J_PORT", "7687")),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "bos"),
    ),
)

# WARNING: destructive. Only meant for the first creation of the database; it is
# kept here for debugging purposes.
graph.delete_all()  # reset graph

In [7]:
# Now neo4j does not support the numpy dtype int64, so we need to convert it to python native int
# We create a wrapper for this.
@register_attribute_postprocessor
def INT(attribute):
    """Cast the attribute's value to a native Python ``int``.

    A float NaN value (an empty field) is passed through without casting.
    In both cases a new ``Attribute`` carrying the original key is returned.
    """
    raw = attribute.value
    value_is_nan = isinstance(raw, float) and math.isnan(raw)
    return Attribute(attribute.key, raw if value_is_nan else int(raw))

@register_attribute_postprocessor
def FLOAT(attribute):
    """Return a new ``Attribute`` with the same key and the value cast to ``float``."""
    as_float = float(attribute.value)
    return Attribute(attribute.key, as_float)

@register_attribute_postprocessor
def AUX(attribute):
    """Parse a ``'%Y-%m-%d'`` value into a ``datetime``.

    A float NaN value (an empty field) is passed through unparsed. In both
    cases a new ``Attribute`` carrying the original key is returned.
    """
    raw = attribute.value
    if not (isinstance(raw, float) and math.isnan(raw)):
        raw = datetime.strptime(raw, '%Y-%m-%d')
    return Attribute(attribute.key, raw)


@register_subgraph_preprocessor
def ONLY_CREATE_IF_EXISTS(resource: Resource, key) -> Resource:
    """Return ``resource`` unless its ``key`` field is a float NaN; then return ``None``.

    NOTE(review): presumably rel2graph skips the subgraph when a preprocessor
    returns ``None`` — confirm against the rel2graph documentation.
    """
    field = resource[key]
    is_missing = isinstance(field, float) and math.isnan(field)
    return None if is_missing else resource


# In the schema file wrap the Person.ID attribute in the INT wrapper
#        + ID = INT(Person.ID)

2023-01-03 20:35:49,925 [MainThread]::[DEBUG]::registrar.py: Registered attribute postprocessor 'INT''.
2023-01-03 20:35:49,926 [MainThread]::[DEBUG]::registrar.py: Registered attribute postprocessor 'FLOAT''.
2023-01-03 20:35:49,927 [MainThread]::[DEBUG]::registrar.py: Registered attribute postprocessor 'AUX''.
2023-01-03 20:35:49,928 [MainThread]::[DEBUG]::registrar.py: Registered subgraph preprocessor 'ONLY_CREATE_IF_EXISTS'.


In [8]:
# Each dataframe is paired with the resource name passed to its iterator;
# the list order is the order in which the iterators are chained.
resource_tables = [
    (doc_df, "Document"),
    (era_df, "Era"),
    (person_df, "Person"),
    (year_df, "Year"),
    (person_sentby_df, "PersonSentBy"),
    (person_sentto_df, "PersonSentTo"),
    (person_mentioned_df, "PersonMentionedIn"),
]
iterator = IteratorIterator(
    [PandasDataframeIterator(table, label) for table, label in resource_tables]
)

In [9]:
# Load the schema file, then bind it to the resource iterator and the target graph.
schema_definition = load_file(filename)
converter = Converter(schema_definition, iterator, graph, num_workers=1)

2023-01-03 20:35:50,810 [MainThread]::[DEBUG]::schema_compiler.py: Compiling module 'AttributeFactory' with static args ['docID', 'id']
2023-01-03 20:35:50,811 [MainThread]::[DEBUG]::registrar.py: Requested factory 'AttributeFactory' from registry.
2023-01-03 20:35:50,812 [MainThread]::[DEBUG]::schema_compiler.py: Compiling module 'AttributeFactory' with static args ['subtype', 'subtype']
2023-01-03 20:35:50,812 [MainThread]::[DEBUG]::registrar.py: Requested factory 'AttributeFactory' from registry.
2023-01-03 20:35:50,813 [MainThread]::[DEBUG]::schema_compiler.py: Compiling module 'AttributeFactory' with static args ['volume', 'volume']
2023-01-03 20:35:50,813 [MainThread]::[DEBUG]::registrar.py: Requested factory 'AttributeFactory' from registry.
2023-01-03 20:35:50,814 [MainThread]::[DEBUG]::schema_compiler.py: Compiling module 'AttributeFactory' with static args ['date', 'date']
2023-01-03 20:35:50,814 [MainThread]::[DEBUG]::registrar.py: Requested factory 'AttributeFactory' from r

In [10]:
# Run the conversion: the converter processes the resources supplied by the
# iterator (progress is reported through the "rel2graph" logger).
converter()

2023-01-03 20:35:51,122 [MainThread]::[INFO]::converter.py: Running convertion with 1 parallel workers.
2023-01-03 20:35:51,123 [MainThread]::[INFO]::converter.py: Starting creation of nodes.
2023-01-03 20:35:51,124 [Worker-0]::[DEBUG]::converter.py: Starting Worker 0
2023-01-03 20:35:51,125 [Worker-0]::[DEBUG]::converter.py: Processing PandasSeriesResource 'Document' (row 0)
2023-01-03 20:35:51,126 [Worker-0]::[DEBUG]::converter.py: Processing PandasSeriesResource 'Document' (row 1)
2023-01-03 20:35:51,128 [Worker-0]::[DEBUG]::converter.py: Processing PandasSeriesResource 'Document' (row 2)
2023-01-03 20:35:51,129 [Worker-0]::[DEBUG]::converter.py: Processing PandasSeriesResource 'Document' (row 3)
2023-01-03 20:35:51,130 [Worker-0]::[DEBUG]::converter.py: Processing PandasSeriesResource 'Document' (row 4)
2023-01-03 20:35:51,131 [Worker-0]::[DEBUG]::converter.py: Processing PandasSeriesResource 'Document' (row 5)
2023-01-03 20:35:51,134 [Worker-0]::[DEBUG]::converter.py: Processing P