In [1]:
from rdflib import Graph
from gremlin_python.driver import client


# Parse the Turtle-serialised test dataset into an in-memory rdflib graph.
rdf_graph = Graph()
rdf_graph.parse("data/Testdataset.rdf", format="turtle")

# Client for the local Gremlin server; 'g' names the traversal source.
gremlin_client = client.Client('ws://localhost:8182/gremlin', 'g')

# For visual inspection: print every (subject, predicate, object) triple.
for triple in rdf_graph:
    subj, pred, obj = triple
    print(f"Subj: {subj}, Pred: {pred}, Obj: {obj}")

Subj: ex_addr:21, Pred: http://localhost/hasStreet, Obj: Statthalterplatz c/o Kiosk am Bahnhof
Subj: ex_addr:32, Pred: rdf:type, Obj: ex:Address
Subj: ex_addr:52, Pred: http://localhost/hasCity, Obj: Hamburg.
Subj: ex_addr:38, Pred: http://localhost/hasStreet, Obj: Fuhlsbüttler Straße 387
Subj: ex_comp:63, Pred: http://localhost/hasAddress, Obj: ex_addr:58
Subj: ex_addr:35, Pred: http://localhost/hasStreet, Obj: Caffamacherreihe 7
Subj: ex_comp:31, Pred: http://localhost/hasName, Obj: FB trust GmbH
Subj: ex_addr:6, Pred: http://localhost/hasStreet, Obj: Lierenfelder Straße 42
Subj: ex_addr:55, Pred: rdf:type, Obj: ex:Address
Subj: ex_addr:34, Pred: http://localhost/hasStreet, Obj: Luftfrachthof-Geb.
Subj: ex_comp:70, Pred: http://localhost/hasAddress, Obj: ex_addr:63
Subj: ex_addr:32, Pred: http://localhost/hasCity, Obj: Hamburg.
Subj: ex_comp:98, Pred: rdf:type, Obj: ex:Company
Subj: ex_addr:46, Pred: http://localhost/hasAddress, Obj: Rosenstraße 2, 20095 Hamburg.
Subj: ex_addr:33, Pr

## Insert RDF Data


In [2]:
from rdflib import Graph, URIRef, Literal
from rdflib.namespace import RDF
from gremlin_python.driver import client
from gremlin_python.driver import serializer

import logging
import nest_asyncio
nest_asyncio.apply() #Needed to fix 'Running Event loop Issue' in Jupyter notebooks - This is not needed when executed as python script

# Send every record (DEBUG and above) from the root logger to a log file that
# is truncated on each run (filemode='w').
# force=True matters in a notebook: without it basicConfig() is a silent no-op
# whenever the root logger already has handlers (for instance when this cell
# is executed a second time), so the file would never be reopened/truncated.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='RDF_DataIngestion.log',
    filemode='w',
    force=True,
)
logger = logging.getLogger(__name__)

def sanitize_key(key: str) -> str:
    """Reduce a predicate URI to a short key usable as an edge label / property name.

    Spaces are replaced by underscores and only the local name is kept, i.e.
    the part after the last ``/`` or ``#``. Trailing separators are ignored so
    that a URI ending in ``/`` or ``#`` does not produce an empty key.

    Args:
        key: Predicate URI (or any string) to shorten.

    Returns:
        The local name, e.g. ``"hasStreet"`` for ``"http://localhost/hasStreet"``
        and ``"label"`` for ``"http://www.w3.org/2000/01/rdf-schema#label"``.
        Strings without a separator are returned with only spaces replaced.
    """
    cleaned = key.replace(' ', '_').rstrip('/#')
    # rfind() yields -1 when a separator is absent, so the slice below starts
    # at 0 and keeps the whole string in that case.
    last_separator = max(cleaned.rfind('/'), cleaned.rfind('#'))
    return cleaned[last_separator + 1:]

def _run_query(gremlin_client, query, bindings):
    """Submit a parameterised Gremlin script and wait for the full result list."""
    return gremlin_client.submit(query, bindings).all().result()


def _ensure_vertex(gremlin_client, uri, label):
    """Get-or-create the vertex whose 'uri' property equals `uri`; True if it was created."""
    # Each coalesce() branch ends in constant(false)/constant(true), so the
    # server itself reports whether the vertex already existed.
    # Binding names deliberately avoid `label`, `id`, `key` and `value`, which
    # Gremlin scripts commonly resolve to the T tokens.
    outcome = _run_query(
        gremlin_client,
        "g.V().has('uri', vertex_uri).fold()"
        ".coalesce(__.unfold().constant(false),"
        " __.addV(vertex_label).property('uri', vertex_uri).constant(true))",
        {'vertex_uri': uri, 'vertex_label': label},
    )
    return bool(outcome) and bool(outcome[0])


def _literal_to_binding(literal):
    """Turn an rdflib Literal into a value that is safe to send as a Gremlin binding."""
    value = literal.toPython()
    if isinstance(value, (bool, int, float)):
        return value
    # Everything else (plain or language-tagged strings, dates, decimals, ...)
    # is stored using its lexical form.
    return str(literal)


def process_rdf(rdf_path: str, gremlin_client: client.Client):
    """Load a Turtle file and mirror its triples into the Gremlin graph.

    For every triple the subject becomes a vertex (label and 'uri' property
    are both the subject URI). Depending on the triple:

    * type predicate          -> 'rdf_type' property on the subject vertex
    * object is a URIRef      -> edge labelled with the sanitised predicate to
                                 the object vertex (created with label
                                 'Entity' if it does not exist yet)
    * object is a Literal     -> property named after the sanitised predicate
    * anything else           -> skipped with a warning

    All values are passed as script bindings instead of being interpolated
    into the query text, so quotes or other special characters in URIs and
    literals cannot break (or inject into) the Gremlin script.

    Args:
        rdf_path: Path of the Turtle file to import.
        gremlin_client: Open gremlin_python client used to submit the scripts.

    Returns:
        dict with the counters 'vertices_processed' (vertices created),
        'edges_created', 'properties_set' and 'errors'.

    Raises:
        Whatever ``Graph.parse`` raises for an unreadable or invalid file.
        Failures on individual triples are logged and counted, not raised.
    """
    rdf_graph = Graph()
    rdf_graph.parse(rdf_path, format="turtle")

    stats = {
        'vertices_processed': 0,
        'edges_created': 0,
        'properties_set': 0,
        'errors': 0
    }

    # Iteratively walking through all triples
    for subj, pred, obj in rdf_graph:
        try:
            subj_uri = str(subj)
            pred_name = sanitize_key(str(pred))  # short key for labels / property names

            # The subject URI doubles as the vertex label; a later notebook
            # cell selects vertices by a substring of that label.
            if _ensure_vertex(gremlin_client, subj_uri, subj_uri):
                stats['vertices_processed'] += 1
                logger.debug(f"Created new vertex: {subj_uri}")
            else:
                logger.debug(f"Existing vertex found: {subj_uri}")

            # The notebook's printed triples show the type predicate as the
            # unexpanded IRI 'rdf:type', which never equals RDF.type; accept
            # both spellings so class URIs end up as a property instead of
            # stray 'Entity' vertices.
            if pred == RDF.type or str(pred) == 'rdf:type':
                _run_query(
                    gremlin_client,
                    "g.V().has('uri', subj_uri).property('rdf_type', type_uri)",
                    {'subj_uri': subj_uri, 'type_uri': str(obj)},
                )
                logger.debug(f"Set rdf_type for {subj_uri}: {str(obj)}")
                stats['properties_set'] += 1

            elif isinstance(obj, URIRef):
                # Edge creation: make sure the target vertex exists first.
                # NOTE(review): a target first seen here keeps the label
                # 'Entity' even if it later shows up as a subject.
                obj_uri = str(obj)
                if _ensure_vertex(gremlin_client, obj_uri, 'Entity'):
                    stats['vertices_processed'] += 1
                    logger.debug(f"Created new vertex: {obj_uri}")

                _run_query(
                    gremlin_client,
                    "g.V().has('uri', subj_uri)"
                    ".addE(edge_label)"
                    ".to(__.V().has('uri', obj_uri))",
                    {'subj_uri': subj_uri, 'edge_label': pred_name, 'obj_uri': obj_uri},
                )
                stats['edges_created'] += 1
                logger.info(f"Created edge: {subj_uri} -[{pred_name}]-> {obj_uri}")

            elif isinstance(obj, Literal):
                # Property handling
                value = _literal_to_binding(obj)
                _run_query(
                    gremlin_client,
                    "g.V().has('uri', subj_uri).property(prop_key, prop_value)",
                    {'subj_uri': subj_uri, 'prop_key': pred_name, 'prop_value': value},
                )
                stats['properties_set'] += 1
                logger.debug(f"Set property {pred_name}={value!r} on {subj_uri}")

            else:
                # e.g. blank nodes: neither a URI nor a literal.
                logger.warning(f"Skipped unsupported object {obj!r} for {subj_uri} {pred_name}")

        except Exception as e:
            # Best effort: one bad triple must not abort the whole import.
            stats['errors'] += 1
            logger.error(f"Failed to process {subj} {pred} {obj}: {str(e)}", exc_info=True)

    # Summary logging
    logger.info("\nImport Statistics:")
    logger.info(f"Total vertices processed: {stats['vertices_processed']}")
    logger.info(f"Total edges created: {stats['edges_created']}")
    logger.info(f"Total properties set: {stats['properties_set']}")
    logger.info(f"Total errors encountered: {stats['errors']}")

    return stats

# Initialize and run
# NOTE(review): the first cell loads "data/Testdataset.rdf" (capital T) while
# this cell uses a lower-case name; on a case-sensitive file system only one
# spelling can exist - confirm which one is correct.
gremlin_client = client.Client(
    url='ws://localhost:8182/gremlin',
    traversal_source='g',
    message_serializer=serializer.GraphSONSerializersV3d0())
try:
    process_rdf("data/testdataset.rdf", gremlin_client)
finally:
    # Release the connection even if the import raises (e.g. file not found).
    gremlin_client.close()


## Remove Entity Nodes
These nodes are merely a by-product of the suboptimal ingestion above. Due to time constraints, they are simply removed afterwards, as shown in the following cell.

In [6]:
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from janusgraph_python.driver.serializer import JanusGraphSONSerializersV3d0  # Correct import for JanusGraph serializer
from gremlin_python.process.anonymous_traversal import traversal


def delete_entities_by_uri(target_uri):
    """Drop all vertices whose 'uri' property equals `target_uri` and print the outcome.

    A fresh remote connection to ws://localhost:8182/gremlin is opened for the
    call and closed again afterwards. Errors raised by the traversals are
    printed, not propagated.
    """
    # Establish connection
    remote = DriverRemoteConnection(
        'ws://localhost:8182/gremlin',
        'g',
        message_serializer=JanusGraphSONSerializersV3d0()
    )
    graph = traversal().withRemote(remote)

    try:
        # Remove every matching vertex.
        graph.V().has('uri', target_uri).drop().iterate()

        # Count what is still there afterwards to verify the deletion.
        leftover = graph.V().has('uri', target_uri).count().next()

        print(f"Deleted all entities with URI '{target_uri}'")
        print(f"Remaining entities with this URI: {leftover}")

    except Exception as exc:
        print(f"Deletion failed: {str(exc)}")
    finally:
        remote.close()

# Usage: remove the vertices stored under each of these two URIs.
for leftover_uri in ('ex:Company', 'ex:Address'):
    delete_entities_by_uri(leftover_uri)

Deleted all entities with URI 'ex:Company'
Remaining entities with this URI: 0
Deleted all entities with URI 'ex:Address'
Remaining entities with this URI: 0


## Create artificial ownership structures

In [8]:
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.process.anonymous_traversal import traversal
from gremlin_python.process.graph_traversal import __  # Correct import for anonymous traversals
from janusgraph_python.driver.serializer import JanusGraphSONSerializersV3d0
import random
import traceback

# Open the remote connection to the local Gremlin endpoint, using the
# JanusGraph GraphSON v3 serializer, and bind a traversal source `g` to it.
connection = DriverRemoteConnection(
    'ws://localhost:8182/gremlin', 'g',
    message_serializer=JanusGraphSONSerializersV3d0(),
)
g = traversal().withRemote(connection)

# Helper functions to safely access label and id
def get_label(v):
    """Return the label of `v`, read from its `label` attribute or, failing that, via `v.get("label")`."""
    missing = object()
    found = getattr(v, "label", missing)
    if found is missing:
        # No such attribute: treat `v` as a mapping-like object.
        return v.get("label")
    return found

def get_id(v):
    """Return the id of `v`, read from its `id` attribute or, failing that, via `v.get("id")`."""
    missing = object()
    found = getattr(v, "id", missing)
    if found is missing:
        # No such attribute: treat `v` as a mapping-like object.
        return v.get("id")
    return found

# Smallest weight a single edge may carry, unless less than that is left on
# the target (then the remainder is used).
MIN_WEIGHT = 0.1

try:
    # 1. Get all vertices and filter for labels containing "ex_comp"
    all_vertices = g.V().toList()
    # `or ""` guards against vertices whose label lookup yields None.
    filtered_vertices = [v for v in all_vertices if "ex_comp" in (get_label(v) or "")]

    print(f"Found {len(filtered_vertices)} vertices with labels containing 'ex_comp'")

    if len(filtered_vertices) < 2:
        # Nothing to do; fall through to the finally block instead of calling
        # exit(), which would tear down the notebook kernel.
        print("Not enough vertices to create edges.")
    else:
        # 2. Select 50% randomly
        selected_vertices = random.sample(filtered_vertices, len(filtered_vertices) // 2)
        print(f"Selected {len(selected_vertices)} vertices for edge creation")

        # 3. Initialize incoming capacity to 1.0 per vertex
        capacity = {get_id(v): 1.0 for v in selected_vertices}

        # 4. Create edges between selected vertices with random weights
        for source in selected_vertices:
            source_id = get_id(source)

            possible_targets = [v for v in selected_vertices if get_id(v) != source_id]
            random.shuffle(possible_targets)

            for target in possible_targets:
                target_id = get_id(target)
                remaining = capacity[target_id]

                if remaining <= 0:
                    continue  # Skip if no capacity left

                if remaining <= MIN_WEIGHT:
                    # random.uniform(a, b) with b < a still returns a value
                    # between the two, which would exceed the capacity left;
                    # hand over exactly the remainder instead.
                    weight = round(remaining, 2)
                else:
                    weight = round(random.uniform(MIN_WEIGHT, remaining), 2)

                if weight <= 0:
                    continue  # remainder rounds to zero: nothing worth an edge

                try:
                    g.V(source_id).addE('weighted_connection').to(
                        __.V(target_id)
                    ).property('weight', weight).next()

                    # Round to keep float drift out of the bookkeeping.
                    capacity[target_id] = max(0.0, round(remaining - weight, 2))

                    print(f"Created edge {source_id} -> {target_id} with weight {weight} "
                          f"(Remaining capacity: {capacity[target_id]:.2f})")

                except Exception as e:
                    print(f"Error creating edge {source_id} -> {target_id}: {e}")
                    traceback.print_exc()
finally:
    # Close connection, also when a traversal above raised.
    connection.close()


Found 100 vertices with labels containing 'ex_comp'
Selected 50 vertices for edge creation
Created edge 553064 -> 1327304 with weight 0.15 (Remaining capacity: 0.85)
Created edge 553064 -> 520304 with weight 0.59 (Remaining capacity: 0.41)
Created edge 553064 -> 1138888 with weight 0.4 (Remaining capacity: 0.60)
Created edge 553064 -> 614432 with weight 0.13 (Remaining capacity: 0.87)
Created edge 553064 -> 1298568 with weight 0.43 (Remaining capacity: 0.57)
Created edge 553064 -> 647312 with weight 0.13 (Remaining capacity: 0.87)
Created edge 553064 -> 659600 with weight 0.39 (Remaining capacity: 0.61)
Created edge 553064 -> 1183944 with weight 0.69 (Remaining capacity: 0.31)
Created edge 553064 -> 622624 with weight 0.14 (Remaining capacity: 0.86)
Created edge 553064 -> 1347720 with weight 0.61 (Remaining capacity: 0.39)
Created edge 553064 -> 663672 with weight 0.74 (Remaining capacity: 0.26)
Created edge 553064 -> 557088 with weight 0.93 (Remaining capacity: 0.07)
Created edge 5530