The goal here is to augment `Testdataset.rdf` with artificially generated (synthetic) ownership patterns between companies.

In [12]:
import logging
import random
from pathlib import Path

from rdflib import BNode, Graph, Literal, Namespace, RDF, URIRef
from rdflib.namespace import RDFS

# --- Configuration ---
input_file = "data/Testdataset.rdf"
output_file = "data/Augemented_Testdataset.ttl"
log_file = "logs/Data_Augmentation.log"
SEED = 42                    # fixed seed so a fresh-kernel re-run yields the same synthetic edges
MIN_OWNED, MAX_OWNED = 1, 3  # each company owns between 1 and 3 other companies
MIN_SHARE, MAX_SHARE = 1, 100

# --- Logging setup ---
# FileHandler raises FileNotFoundError when the target directory is missing,
# so create it before configuring logging.
Path(log_file).parent.mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# --- RDF loading ---
g = Graph()
logger.info(f"Loading RDF data from {input_file}")
# The input carries an .rdf extension but is parsed as Turtle.
g.parse(input_file, format="turtle")

# --- Namespace setup ---
EX_COMP = Namespace("http://example.org/company/")
EX_EMPL = Namespace("http://example.org/employment/")
EX_ADDR = Namespace("http://example.org/address/")
EX = Namespace("http://example.org/")
# EX_OWN was used below without ever being defined in this cell; the resulting
# NameError was swallowed by a broad `except`, so no edge would be added on a
# fresh kernel.
# NOTE(review): the IRI follows the pattern of the sibling namespaces - confirm
# it matches the vocabulary expected downstream.
EX_OWN = Namespace("http://example.org/ownership/")

# --- Binding all prefixes ---
namespaces = {
    "rdf": RDF,
    "rdfs": RDFS,
    "ex": EX,
    "ex_comp": EX_COMP,
    "ex_empl": EX_EMPL,
    "ex_addr": EX_ADDR,
    "ex_own": EX_OWN,
}

for prefix, ns in namespaces.items():
    g.bind(prefix, ns)

# --- Extract companies ---
logger.info("Extracting company URIs from the graph")

# STR(?entity) yields the full IRI text, so comparing only against the prefix
# label "ex_comp" matches nothing when the data uses expanded IRIs. Both forms
# are accepted: the original literal match and the real company namespace IRI.
query = """
SELECT DISTINCT ?entity
WHERE {
  ?entity ?p ?o .
  FILTER(
    STRSTARTS(STR(?entity), "ex_comp") ||
    STRSTARTS(STR(?entity), "http://example.org/company/")
  )
}
"""

# Sorted so that, combined with the fixed seed, the generated edges are reproducible.
companies = sorted(str(row.entity) for row in g.query(query))
logger.info(f"Found {len(companies)} companies")
if not companies:
    logger.warning("No companies matched the query; no ownership edges will be added")

# --- Generate synthetic ownership edges ---
logger.info("Generating synthetic ownership edges")
random.seed(SEED)
for idx, company in enumerate(companies):
    # A company must not own itself, and random.sample raises ValueError when
    # asked for more items than are available - clamp to the candidate count.
    candidates = [other for other in companies if other != company]
    num_owned = min(random.randint(MIN_OWNED, MAX_OWNED), len(candidates))
    subj = URIRef(company)
    for target in random.sample(candidates, num_owned):
        share = Literal(random.randint(MIN_SHARE, MAX_SHARE))
        g.add((subj, EX_OWN.owns, URIRef(target)))
        # NOTE(review): the percentage is attached to the owning company, not
        # to the individual ownership edge, so with several targets the shares
        # cannot be traced back to a specific target. Kept as in the original
        # data model; consider an intermediate ownership node if that matters.
        g.add((subj, EX_OWN.percentage, share))
        logger.debug(f"Added ownership: {company} owns {share}% of {target}")
    if (idx + 1) % 10 == 0:
        logger.info(f"Processed {idx + 1} companies")

# --- Save to Turtle ---
logger.info(f"Saving updated graph to {output_file}")
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
g.serialize(destination=output_file, format="turtle")
logger.info("Done. Ownership edges added and file saved.")


2025-04-25 14:33:29,384 - INFO - Loading RDF data from data/Testdataset.rdf
2025-04-25 14:33:29,411 - INFO - Extracting company URIs from the graph
2025-04-25 14:33:29,437 - INFO - Found 100 companies
2025-04-25 14:33:29,438 - INFO - Generating synthetic ownership edges
2025-04-25 14:33:29,439 - INFO - Processed 10 companies
2025-04-25 14:33:29,440 - INFO - Processed 20 companies
2025-04-25 14:33:29,440 - INFO - Processed 30 companies
2025-04-25 14:33:29,441 - INFO - Processed 40 companies
2025-04-25 14:33:29,442 - INFO - Processed 50 companies
2025-04-25 14:33:29,443 - INFO - Processed 60 companies
2025-04-25 14:33:29,443 - INFO - Processed 70 companies
2025-04-25 14:33:29,444 - INFO - Processed 80 companies
2025-04-25 14:33:29,444 - INFO - Processed 90 companies
2025-04-25 14:33:29,445 - INFO - Processed 100 companies
2025-04-25 14:33:29,446 - INFO - Saving updated graph to data/Augemented_Testdataset.ttl
2025-04-25 14:33:29,468 - INFO - Done. Ownership edges added and file saved.


In [25]:
from rdflib import Graph, Namespace

# Load the graph
# NOTE(review): the augmentation cell reads "data/Testdataset.rdf"; confirm
# that this bare relative path points at the intended file.
g = Graph()
g.parse("Testdataset.rdf", format="turtle")  # Use actual file path

# Namespaces used by the query and for serialization prefixes
EX = Namespace("http://example.org/")
EX_COMP = Namespace("http://example.org/company/")
g.bind("ex_comp", EX_COMP)  # Bind to prefix used in the file

# Select every subject typed as ex:Company.
# The previous extra `?p ?o` pattern was redundant: any subject with an
# rdf:type triple already has at least one triple, so the result set is the same.
query = """
SELECT DISTINCT ?entity
WHERE {
    ?entity a ex:Company .  # Uses ex:Company type from your data
}
"""
# Pass the `ex:` prefix explicitly so the query does not depend on whichever
# prefixes the data file happens to declare.
# NOTE(review): an empty result here presumably means the data contains no
# `rdf:type ex:Company` triples under this namespace - verify against the file.
results = g.query(query, initNs={"ex": EX})
ex_comp_entities = [str(row.entity) for row in results]
print(ex_comp_entities)

[]
