In [1]:
from rdflib import Graph
from gremlin_python.driver import client


rdf_graph = Graph()
rdf_graph.parse("data/Testdataset.rdf", format="turtle")

gremlin_client = client.Client('ws://localhost:8182/gremlin', 'g')

for subj, pred, obj in rdf_graph:
    print(f"Subj: {subj}, Pred: {pred}, Obj: {obj}")
    # Create or get vertex for subj
    # If obj is literal: add as property
    # If obj is URI: create/get vertex for obj, add edge pred from subj to obj
    # Use gremlin_client.submit() to send Gremlin queries


Subj: ex_comp:54, Pred: http://localhost/hasName, Obj: C. Gathmann Computer-System-Beratung GmbH
Subj: ex_comp:26, Pred: http://localhost/hasAddress, Obj: ex_addr:24
Subj: ex_addr:17, Pred: http://localhost/hasStreet, Obj: Gaußstraße 158
Subj: ex_addr:32, Pred: rdf:type, Obj: ex:Address
Subj: ex_addr:69, Pred: http://localhost/hasStreet, Obj: Harderweg 1
Subj: ex_comp:12, Pred: http://localhost/hasName, Obj: Neunte Grundstücksverwaltung AHG-Beteiligungs- und Handelsgesellschaft mbH & Co. KG
Subj: ex_addr:46, Pred: http://localhost/hasStreet, Obj: Rosenstraße 2
Subj: ex_comp:22, Pred: http://localhost/hasAddress, Obj: ex_addr:20
Subj: ex_comp:63, Pred: rdf:type, Obj: ex:Company
Subj: ex_addr:30, Pred: http://localhost/hasStreet, Obj: Goldbekplatz 1
Subj: ex_comp:29, Pred: rdf:type, Obj: ex:Company
Subj: ex_comp:34, Pred: rdf:type, Obj: ex:Company
Subj: ex_comp:56, Pred: http://localhost/hasFNR, Obj: R1101_HRB77984
Subj: ex_comp:94, Pred: http://localhost/hasName, Obj: Hartungstraße 12 V

In [28]:
from rdflib import Graph, URIRef, Literal
from rdflib.namespace import RDF
from gremlin_python.driver import client
from gremlin_python.driver import serializer

import logging
import nest_asyncio
nest_asyncio.apply() #Needed to fix 'Running Event loop Issue' in Jupyter notebooks - This is not needed when executed as python script

logging.basicConfig(
    level=logging.DEBUG, 
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='RDF_DataIngestion.log',
    filemode='w'
)
logger = logging.getLogger(__name__)

def sanitize_key(key: str) -> str:
    return key.replace(' ', '_').split('/')[-1]

def process_rdf(rdf_path: str, gremlin_client: client.Client):
    rdf_graph = Graph()
    rdf_graph.parse(rdf_path, format="turtle")
    
    stats = {
        'vertices_processed': 0,
        'edges_created': 0,
        'properties_set': 0,
        'errors': 0
    }

    for subj, pred, obj in rdf_graph:
        try:
            subj_uri = str(subj)
            pred_name = sanitize_key(str(pred))
            obj_value = obj

            # Vertex creation/check
            vertex_result = gremlin_client.submit(
                f"g.V().has('uri', '{subj_uri}').fold()"
                f".coalesce(__.unfold(), __.addV('Entity').property('uri', '{subj_uri}'))"
            ).all().result()

            
            if 'created' in str(vertex_result):
                stats['vertices_processed'] += 1
                logger.debug(f"Created new vertex: {subj_uri}")
            else:
                logger.debug(f"Existing vertex found: {subj_uri}")

            if pred == RDF.type:

                # Handle type
                gremlin_client.submit(
                    f"g.V().has('uri', '{subj_uri}').property('rdf_type', '{str(obj)}')"
                ).all().result()
                logger.debug(f"Set rdf_type for {subj_uri}: {str(obj)}")
                stats['properties_set'] += 1

            elif isinstance(obj, URIRef):

                # Edge creation
                obj_uri = str(obj)

                # Checking if the Target Vertex already exists and adds it, in case it does not exist
                gremlin_client.submit(
                    f"g.V().has('uri', '{obj_uri}').fold()"
                    f".coalesce(__.unfold(), __.addV('Entity').property('uri', '{obj_uri}'))"
                ).all().result()

            
                edge_result = gremlin_client.submit(
                    f"""
                    g.V().has('uri', '{subj_uri}')
                    .addE('{pred_name}')
                    .to(__.V().has('uri', '{obj_uri}'))
                    """
                ).all().result()

                stats['edges_created'] += 1
                logger.info(f"Created edge: {subj_uri} -[{pred_name}]-> {obj_uri}")

            else:
                # Property handling
                if isinstance(obj, Literal):
                    value = obj.toPython()
                    if isinstance(value, str):
                        value = f"'{value.replace("'", "''")}'"
                    
                    prop_result = gremlin_client.submit(
                        f"g.V().has('uri', '{subj_uri}')"
                        f".property('{pred_name}', {value})"
                    ).all().result()
                    
                    stats['properties_set'] += 1
                    logger.debug(f"Set property {pred_name}={value} on {subj_uri}")

        except Exception as e:
            stats['errors'] += 1
            logger.error(f"Failed to process {subj} {pred} {obj}: {str(e)}", exc_info=True)

    # Summary logging
    logger.info("\nImport Statistics:")
    logger.info(f"Total vertices processed: {stats['vertices_processed']}")
    logger.info(f"Total edges created: {stats['edges_created']}")
    logger.info(f"Total properties set: {stats['properties_set']}")
    logger.info(f"Total errors encountered: {stats['errors']}")

# Initialize and run
gremlin_client = client.Client(
    url='ws://localhost:8182/gremlin',
    traversal_source='g',
    message_serializer=serializer.GraphSONSerializersV3d0())
process_rdf("data/Augemented_Testdataset.ttl", gremlin_client)
gremlin_client.close()


In [24]:
import rdflib
import csv

# Load the graph
g = rdflib.Graph()
g.parse("data/Testdataset.rdf", format="turtle")

def get_local_name(uri):
    # Get the last part after '/' or '#'
    uri = str(uri)
    if '#' in uri:
        return uri.split('#')[-1]
    else:
        return uri.split('/')[-1]

with open("triples.csv", "w", newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["subject", "predicate", "object"])
    for subj, pred, obj in g:
        # Use the full URI for subject and object, but only the local name for predicate
        writer.writerow([subj, get_local_name(pred), obj])


In [25]:
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import traversal
from janusgraph_python.driver.serializer import JanusGraphSONSerializersV3d0
import pandas as pd

# Connect to JanusGraph server
connection = DriverRemoteConnection(
    'ws://localhost:8182/gremlin', 
    'g', 
    message_serializer=JanusGraphSONSerializersV3d0()
)
g = traversal().with_remote(connection)

df = pd.read_csv("triples.csv")


for idx, row in df.iterrows():
    subj = str(row['subject'])
    pred = str(row['predicate'])
    obj = str(row['object'])

    # Add or get subject vertex
    subj_v = g.V(subj).fold().coalesce(
        g.V(subj),
        g.addV(str(subj)).property('id', subj)
    ).next()

    # Add or get object vertex
    obj_v = g.V(obj).fold().coalesce(
        g.V(obj),
        g.addV('Entity').property('id', obj)
    ).next()

    # Add edge (predicate as label)
    g.V(subj_v).addE(pred).to(obj_v).iterate()




# Add first person vertex
v1 = g.add_v('company').property('name', 'marko').next()

# Add second person vertex
v2 = g.add_v('person').property('name', 'stephen').next()

# Create an edge between them
g.V(v1).add_e('knows').to(v2).property('weight', 0.75).iterate()

connection.close()



TypeError: The child traversal of [['V', 'ex_comp:54']] was not spawned anonymously - use the __ class rather than a TraversalSource to construct the child traversal