In [None]:
%pip install openpyxl

In [1]:

import pandas as pd
from rdflib import Graph, Namespace, RDF, RDFS, OWL, Literal

# Load CSV files
emp_file = r"data/Fin_Emp.xlsx"
org_file = r"data/Fin_Org.xlsx"
emp_data = pd.read_excel(emp_file, sheet_name=0)
org_data = pd.read_excel(org_file, sheet_name=0)

# Load the knowledge graph
kg_file = r"kgCreation/NewFinKG.ttl"
graph = Graph()
graph.parse(kg_file, format="turtle")


<Graph identifier=Ndef0c38bf3bc4d898a09e266ea6c9595 (<class 'rdflib.graph.Graph'>)>

In [2]:
# Define namespaces
FIN = Namespace("http://financial.com/")
SCHEMA = Namespace("https://schema.org/")
ORG = Namespace("http://organization.com/")
EMP = Namespace("http://employee.com/")

# Bind namespaces
graph.bind("fin", FIN)
graph.bind("schema", SCHEMA)
graph.bind("org", ORG)
graph.bind("emp", EMP)

# Clear the graph to rebuild with corrected structure
graph.remove((None, None, None))

# Define Classes
graph.add((SCHEMA.Person, RDF.type, RDFS.Class))
graph.add((SCHEMA.Person, RDFS.label, Literal("Person")))
graph.add((FIN.Manager, RDF.type, RDFS.Class))
graph.add((FIN.Manager, RDFS.subClassOf, SCHEMA.Person))
graph.add((FIN.Manager, RDFS.label, Literal("Manager")))
graph.add((SCHEMA.Organization, RDF.type, RDFS.Class))
graph.add((SCHEMA.Organization, RDFS.label, Literal("Organization")))
graph.add((FIN.ParentOrganization, RDF.type, RDFS.Class))
graph.add((FIN.ParentOrganization, RDFS.subClassOf, SCHEMA.Organization))
graph.add((FIN.ParentOrganization, RDFS.label, Literal("Parent Organization")))

# Define Properties
define_properties = [
    (SCHEMA.givenName, SCHEMA.Person, RDFS.Literal, "Given Name"),
    (SCHEMA.familyName, SCHEMA.Person, RDFS.Literal, "Family Name"),
    (SCHEMA.email, SCHEMA.Person, RDFS.Literal, "Email"),
    (SCHEMA.telephone, SCHEMA.Person, RDFS.Literal, "Telephone"),
    (SCHEMA.jobTitle, SCHEMA.Person, RDFS.Literal, "Job Title"),
    (SCHEMA.worksFor, SCHEMA.Person, SCHEMA.Organization, "Works For"),
    (SCHEMA.manages, SCHEMA.Manager, SCHEMA.Person, "Manages"),
    (SCHEMA.parentOrganization, SCHEMA.Organization, FIN.ParentOrganization, "Parent Organization"),
    (SCHEMA.hasChildOrganization, FIN.ParentOrganization, SCHEMA.Organization, "Has Child Organization"),
    (SCHEMA.hasHead, SCHEMA.Organization, SCHEMA.Person, "Has Head"),
    (SCHEMA.sources, None, RDFS.Literal, "Sources"),
    (SCHEMA.gid, SCHEMA.Person, RDFS.Literal, "GID"),
    (SCHEMA.id, SCHEMA.Person, RDFS.Literal, "ID"),
    (SCHEMA.hasManager, SCHEMA.Person, SCHEMA.Person, "Line Manager"),
    (SCHEMA.location, SCHEMA.Person, RDFS.Literal, "Location"),
    (SCHEMA.status, SCHEMA.Person, RDFS.Literal, "Status"),
    (SCHEMA.userType, SCHEMA.Person, RDFS.Literal, "User Type"),
    (SCHEMA.orgID, SCHEMA.Organization, RDFS.Literal, "Organisation ID"),
    (SCHEMA.description, SCHEMA.Organization, RDFS.Literal, "Description"),
    (SCHEMA.name, SCHEMA.Organization, RDFS.Literal, "Organisation Name"),
]

for prop, domain, range_, label in define_properties:
    graph.add((prop, RDF.type, RDF.Property))
    if domain:
        graph.add((prop, RDFS.domain, domain))
    if range_:
        graph.add((prop, RDFS.range, range_))
    graph.add((prop, RDFS.label, Literal(label)))

# Create GID-to-ID lookup table
gid_to_id = dict(zip(emp_data['gid'], emp_data['id']))

# Add Individuals from CSV
for _, row in emp_data.iterrows():
    # Skip rows where the givenName(f_name) is "!deleted"
    if row['f_name'] == "!deleted!":
        continue
    if pd.notna(row['id']):
        person_uri = EMP[str(int(row['id']))]
        graph.add((person_uri, RDF.type, SCHEMA.Person))
        graph.add((person_uri, SCHEMA.givenName, Literal(row['f_name'])))
        graph.add((person_uri, SCHEMA.familyName, Literal(row['l_name'])))
        graph.add((person_uri, SCHEMA.email, Literal(row['email_id'])))
        graph.add((person_uri, SCHEMA.telephone, Literal(row['contact_info'])))
        graph.add((person_uri, SCHEMA.jobTitle, Literal(row['job_title'])))
        graph.add((person_uri, SCHEMA.gid, Literal(row['gid'])))
        graph.add((person_uri, SCHEMA.id, Literal(int(row['id']))))
        graph.add((person_uri, SCHEMA.location, Literal(row['location'])))
        graph.add((person_uri, SCHEMA.status, Literal(row['status'])))
        graph.add((person_uri, SCHEMA.userType, Literal(row['user_type'])))
        graph.add((person_uri, SCHEMA.sources, Literal(f"https://cosmos.siemens-energy.cloud/org-management?employeeGid={row['gid']}")))
        graph.add((person_uri, SCHEMA.sources, Literal(f"https://finance-center.mosaic.siemens-energy.cloud/employee_relation?id={int(row['id'])}")))
        
        if pd.notna(row['org_id']):
            graph.add((person_uri, SCHEMA.worksFor, ORG[str(int(row['org_id']))]))
            
        # Handle line manager relationships using GID-to-ID mapping
        if pd.notna(row['line_manager_id']):
            manager_gid = row['line_manager_id']  # Get GID of the manager
            manager_id = gid_to_id.get(manager_gid)  # Lookup corresponding ID
            if manager_id:  # If the manager is in the dataset
                manager_uri = EMP[str(int(manager_id))]
                graph.add((person_uri, SCHEMA.hasManager, manager_uri))
                graph.add((manager_uri, SCHEMA.manages, person_uri))  # Bidirectional relationship
                
for _, row in org_data.iterrows():
    org_uri = ORG[str(int(row['id']))]
    graph.add((org_uri, RDF.type, SCHEMA.Organization))
    graph.add((org_uri, SCHEMA.orgId, Literal(row['id'])))
    graph.add((org_uri, SCHEMA.name, Literal(row['org_title'])))
    graph.add((org_uri, SCHEMA.description, Literal(row['description'])))
    graph.add((org_uri, SCHEMA.sources, Literal(f"https://finance-center.mosaic.siemens-energy.cloud/landscape/organization?id={int(row['id'])}")))
    if pd.notna(row['parent_org_id']):
        parent_uri = ORG[str(int(row['parent_org_id']))]
        graph.add((org_uri, SCHEMA.parentOrganization, parent_uri))
        graph.add((parent_uri, SCHEMA.hasChildOrganization, org_uri))
    if pd.notna(row['org_head']):
        graph.add((org_uri, SCHEMA.hasHead, EMP[str(int(row['org_head']))]))

# Serialize the corrected knowledge graph
output_file = "NewFinKG.ttl"
graph.serialize(destination=output_file, format="turtle")
print(f"Corrected knowledge graph saved to {output_file}")


Corrected knowledge graph saved to NewFinKG.ttl


In [3]:
import pandas as pd
import json
from rdflib import Graph, Namespace, RDF, RDFS, Literal

# Load the existing knowledge graph
kg_file = r"kgCreation/NewFinKG.ttl"  
graph = Graph()

graph.parse(kg_file, format="turtle")

# Define namespaces
FIN = Namespace("http://financial.com/")
SCHEMA = Namespace("https://schema.org/")
ORG = Namespace("http://organization.com/")
EMP = Namespace("http://employee.com/")
APP = Namespace("http://application.com/")

# Bind namespaces
graph.bind("fin", FIN)
graph.bind("schema", SCHEMA)
graph.bind("org", ORG)
graph.bind("emp", EMP)
graph.bind("app", APP)

# Load the new datasets
app_master_file = r"data/application_master.csv"
app_owners_file = r"data/apps_owners.csv"
app_master_data = pd.read_csv(app_master_file)
app_owners_data = pd.read_csv(app_owners_file)

# Convert the 'id' column to integers
app_master_data['id'] = app_master_data['id'].fillna(0).astype(int)
app_owners_data['app_id'] = app_owners_data['app_id'].astype(int)
app_owners_data['employee_id'] = app_owners_data['employee_id'].astype(int)

# Define new Application class and properties
graph.add((APP.Application, RDF.type, RDFS.Class))
graph.add((APP.Application, RDFS.label, Literal("Application")))

# Define properties for Application, Employee, and relationships
new_properties = [
    (APP.appId, APP.Application, RDFS.Literal, "Application ID"),
    (APP.appName, APP.Application, RDFS.Literal, "Application Name"),
    (APP.appDescription, APP.Application, RDFS.Literal, "Application Description"),
    (APP.accessLink, APP.Application, RDFS.Literal, "Access Link"),
    (APP.appLink, APP.Application, RDFS.Literal, "Application Link"),
    (APP.appImage, APP.Application, RDFS.Literal, "Application Image"),
    (APP.partOfOrg, APP.Application, SCHEMA.Organization, "Part of Organization"),
    (APP.managedBy, APP.Application, SCHEMA.Person, "Managed By"),
    (APP.hasOwner, APP.Application, SCHEMA.Person, "Has Owner"),
    (APP.manages, SCHEMA.Person, APP.Application, "Manages"),
    (SCHEMA.id, SCHEMA.Person, RDFS.Literal, "Employee ID"),
]

# Add properties to the graph
for prop, domain, rng, label in new_properties:
    graph.add((prop, RDF.type, RDF.Property))
    graph.add((prop, RDFS.domain, domain))
    graph.add((prop, RDFS.range, rng))
    graph.add((prop, RDFS.label, Literal(label)))

# Extract organization names from the existing KG
org_name_map = {}
for org_uri, _, org_name in graph.triples((None, SCHEMA.name, None)):
    org_name_map[org_name.toPython()] = org_uri

# Add application data to the graph
for _, row in app_master_data.iterrows():
    if row["is_deleted"] == 0:  # Skip deleted applications
        app_id = int(row['id'])
        app_uri = APP[str(app_id)]
        graph.add((app_uri, RDF.type, APP.Application))
        graph.add((app_uri, APP.appId, Literal(app_id)))
        graph.add((app_uri, APP.appName, Literal(row['app_title'])))
        graph.add((app_uri, APP.appDescription, Literal(row['app_description'])))
        graph.add((app_uri, APP.accessLink, Literal(row['access_link'])))
        graph.add((app_uri, APP.appLink, Literal(row['app_link'])))
        graph.add((app_uri, APP.appImage, Literal(row['app_image'])))
        graph.add((app_uri, SCHEMA.sources, Literal(f"https://finance-center.mosaic.siemens-energy.cloud/landscape/applications?orgId=1&appId={int(row['id'])}")))

        # Parse the JSON-like data in app_org and map organization names
        if pd.notna(row['app_org']):
            try:
                app_org_data = json.loads(row['app_org'])
                for org in app_org_data.get("appOrg", []):
                    org_name = org.get("name")
                    if org_name in org_name_map:
                        graph.add((app_uri, APP.partOfOrg, org_name_map[org_name]))
            except json.JSONDecodeError:
                print(f"Error decoding JSON in app_org: {row['app_org']}")

# Add employees and relationships to the graph
for _, row in app_owners_data.iterrows():
    if row["is_deleted"] == 0:  # Skip deleted applications
        emp_id = int(row['employee_id'])
        app_id = int(row['app_id'])
        app_uri = APP[str(app_id)]
        emp_uri = EMP[str(emp_id)]
        # Check if the employee already exists in the graph
        employee_exists = any(graph.triples((emp_uri, RDF.type, SCHEMA.Person)))
        if not employee_exists:
            # Add a new employee node
            graph.add((emp_uri, RDF.type, SCHEMA.Person))
            graph.add((emp_uri, SCHEMA.id, Literal(emp_id)))

        # Add managedBy relationship
        graph.add((app_uri, APP.managedBy, emp_uri))
        # Add inverse manages relationship
        graph.add((emp_uri, APP.manages, app_uri))
        # Add hasOwner if is_owners is true
        if row['is_owners']:
            # Add hasOwner relationship
            graph.add((app_uri, APP.hasOwner, emp_uri))
    

# Serialize the updated knowledge graph
output_file = "ExtendedFinKG.ttl"
graph.serialize(destination=output_file, format="turtle")
print(f"Extended knowledge graph saved to {output_file}")



Extended knowledge graph saved to ExtendedFinKG.ttl


In [4]:
import pandas as pd
import json
from rdflib import Graph, Namespace, RDF, RDFS, Literal

# 1) Load the existing KG (already extended with Applications) from ExtendedFinKG.ttl
kg_file = r"kgCreation/ExtendedFinKG.ttl"
graph = Graph()

graph.parse(kg_file, format="turtle")

# 2) Define namespaces
FIN   = Namespace("http://financial.com/")
SCHEMA= Namespace("https://schema.org/")
ORG   = Namespace("http://organization.com/")
EMP   = Namespace("http://employee.com/")
APP   = Namespace("http://application.com/")
PRO   = Namespace("http://process.com/")  

# 3) Bind them
graph.bind("fin", FIN)
graph.bind("schema", SCHEMA)
graph.bind("org", ORG)
graph.bind("emp", EMP)
graph.bind("app", APP)
graph.bind("pro", PRO)

# 4) Load the new datasets
process_master_file       = r"data/process_master.csv"
orgs_processes_file       = r"data/orgs_processes.csv"
process_applications_file = r"data/process_applications.csv"
process_owners_file       = r"data/process_owners.csv"

process_master_data       = pd.read_csv(process_master_file)
orgs_processes_data       = pd.read_csv(orgs_processes_file)
process_applications_data = pd.read_csv(process_applications_file)
process_owners_data       = pd.read_csv(process_owners_file)

# 5) Define a new Process class
graph.add((PRO.Process, RDF.type, RDFS.Class))
graph.add((PRO.Process, RDFS.label, Literal("Process")))

# 6) Define the Process properties
process_properties = [
    (PRO.processId,         PRO.Process,         RDFS.Literal,         "Process ID"),
    (PRO.title,             PRO.Process,         RDFS.Literal,         "Process Title"),
    (PRO.description,       PRO.Process,         RDFS.Literal,         "Process Description"),
    (PRO.referenceUrls,     PRO.Process,         RDFS.Literal,         "Reference URLs"),
    (PRO.templateUrls,      PRO.Process,         RDFS.Literal,         "Template URLs"),
    (PRO.parentProcess,     PRO.Process,         PRO.Process,          "Parent Process"),

    (PRO.hasChildProcess,   PRO.Process,         PRO.Process,          "Has Child Process"),

    # Link to Orgs
    (PRO.partOfOrg,         PRO.Process,         SCHEMA.Organization,   "Part of Organization"),
    (SCHEMA.hasProcess,     SCHEMA.Organization, PRO.Process,          "Organization Has Process"),

    # Link to Apps
    (PRO.hasApplication,    PRO.Process,         APP.Application,      "Process Has Application"),
    (APP.partOfProcess,     APP.Application,     PRO.Process,          "Application Part of Process"),

    # Link to Employees
    (PRO.managedBy,         PRO.Process,         SCHEMA.Person,        "Managed By"),
    (PRO.hasOwner,          PRO.Process,         SCHEMA.Person,        "Has Owner"),
    (PRO.manages,        SCHEMA.Person,       PRO.Process,          "Person Manages Process"),
]

for prop, domain, rng, label in process_properties:
    graph.add((prop, RDF.type, RDF.Property))
    if domain:
        graph.add((prop, RDFS.domain, domain))
    graph.add((prop, RDFS.range, rng))
    graph.add((prop, RDFS.label, Literal(label)))

# 7) Create/Enrich Process individuals from `process_master`

for _, row in process_master_data.iterrows():
    # Skip deleted processes
    if row["is_deleted"] == 1:
        continue
        
    process_id = int(row["id"])
    proc_uri   = PRO[str(process_id)]

    # Create the Process node
    graph.add((proc_uri, RDF.type, PRO.Process))
    graph.add((proc_uri, PRO.processId, Literal(process_id)))
    graph.add((proc_uri, PRO.title, Literal(row["title"])))
    graph.add((proc_uri, PRO.description, Literal(row["desc"])))

    # Reference URLs
    if pd.notna(row.get("template_urls", None)):
        try:
            template_data = json.loads(row["template_urls"])
            for item in template_data.get("data", []):
                file_path = item.get("file_path", "")
                # Clean the file_path
                if file_path.startswith("/"):
                    file_path = file_path[1:]
            # Add the cleaned file_path to the graph
                graph.add((proc_uri, PRO.templateUrls, Literal(file_path)))
        except (json.JSONDecodeError, TypeError):
            pass
    
    if pd.notna(row.get("reference_urls", None)):
        try:
            ref_data = json.loads(row["reference_urls"])
            for item in ref_data.get("data", []):
                link_url = item.get("link", "")
                graph.add((proc_uri, PRO.referenceUrls, Literal(link_url)))
        except (json.JSONDecodeError, TypeError):
            pass


    # Parent Process
    if pd.notna(row["parent_process_id"]):
        parent_id  = int(row["parent_process_id"])
        parent_uri = PRO[str(parent_id)]
        graph.add((proc_uri, PRO.parentProcess, parent_uri))
        graph.add((parent_uri, PRO.hasChildProcess, proc_uri))

    # Add a source link for the process
    graph.add(
        (proc_uri, SCHEMA.sources,
        Literal(f"https://finance-center.mosaic.siemens-energy.cloud/landscape/process?process_id={process_id}"))
    )


# 8) Link Orgs <-> Processes from `orgs_processes`
for _, row in orgs_processes_data.iterrows():
    if row["is_deleted"] == 1:
        continue

    org_id     = int(row["org_id"])
    process_id = int(row["process_id"])
    org_uri    = ORG[str(org_id)]
    proc_uri   = PRO[str(process_id)]

    # org -> process
    graph.add((org_uri, SCHEMA.hasProcess, proc_uri))
    # process -> org
    graph.add((proc_uri, PRO.partOfOrg, org_uri))

# 9) Link Processes <-> Applications from `process_applications`
for _, row in process_applications_data.iterrows():
    if row["is_deleted"] == 1:
        continue

    proc_id = int(row["process_id"])
    app_id  = int(row["application_id"])
    proc_uri = PRO[str(proc_id)]
    app_uri  = APP[str(app_id)]

    # process -> application
    graph.add((proc_uri, PRO.hasApplication, app_uri))
    # application -> process
    graph.add((app_uri, APP.partOfProcess, proc_uri))

# 10) Link Processes <-> Employees (Owners) from `process_owners`
for _, row in process_owners_data.iterrows():
    if row["is_deleted"] == 1:
        continue

    proc_id = int(row["process_id"])
    emp_id  = int(row["employee_id"])

    proc_uri = PRO[str(proc_id)]
    emp_uri  = EMP[str(emp_id)]

    # Check if the employee already exists in the graph.
    emp_exists = any(graph.triples((emp_uri, RDF.type, SCHEMA.Person)))
    if not emp_exists:
        graph.add((emp_uri, RDF.type, SCHEMA.Person))
        graph.add((emp_uri, SCHEMA.id, Literal(emp_id)))

    # Link: process -> managedBy -> employee
    graph.add((proc_uri, PRO.managedBy, emp_uri))
    # Inverse: employee -> manages -> process
    graph.add((emp_uri, PRO.manages, proc_uri))

    # Link: process -> hasOwner -> employee 
    if row["is_owners"] == 1:
        graph.add((proc_uri, PRO.hasOwner, emp_uri))

# 11) Serialize as the newly extended KG

output_file = r"kgCreation/ExtendedFinKG_Pro.ttl"
graph.serialize(destination=output_file, format="turtle")
print(f"Extended knowledge graph saved to {output_file}")


Extended knowledge graph saved to C:\Users\z0050t3j\OneDrive - Siemens Energy\Dokumente\Thesis\anubhuti_master_thesis\kgCreation\ExtendedFinKG_Pro.ttl
