# Knowledge Graph Creation

## Requirements

The **`data`** folder must be ready, containing:
- `rdf/subgraphs/data.ttl`

The schema folder must contain the ontology:
- `rdf/schema/shapes.ttl`
- `rdf/schema/skos.ttl`
- `rdf/schema/owl.ttl`

In [62]:
import os

# SECURITY: the OpenAI key used to be hard-coded on this line. Secrets must not
# live in source control; the previously committed key should be revoked.
# The key is now read from the OPENAI_API_KEY environment variable and falls
# back to an empty string when the variable is not set.
API_KEY = os.environ.get("OPENAI_API_KEY", "")

### IO Utils

In [63]:
from pathlib import Path
from rdflib import Graph


def load_graph(files_to_load: list[Path]) -> Graph:
    """Parse every Turtle file in ``files_to_load`` into one new graph.

    The files are parsed in the order given and merged into a single
    ``Graph`` instance, which is returned.
    """
    merged = Graph()
    for ttl_file in files_to_load:
        merged.parse(ttl_file, format="turtle")
    return merged


def export_graph(graph: Graph, output_path: Path) -> None:
    """Serialize ``graph`` to ``output_path`` in Turtle format.

    Missing parent directories of ``output_path`` are created first.
    """
    target_dir = output_path.parent
    # An already existing directory is fine; only create what is missing.
    target_dir.mkdir(parents=True, exist_ok=True)

    graph.serialize(destination=output_path, format="turtle")

## A-Box Linking

In [None]:
from rdflib import Graph, Namespace, URIRef
from rdflib.namespace import RDF, OWL
import requests
import time

# Namespace used to address local terms such as SP.Brand and SP.BasePhone.
SP = Namespace("http://example.org/smartphone#")
# HTTP endpoints queried by query_spotlight() and query_wikidata().
SPOTLIGHT_URL = "https://api.dbpedia-spotlight.org/en/annotate"
WIKIDATA_API = "https://www.wikidata.org/w/api.php"

# Required for Wikidata API: sent as request headers by query_wikidata().
HEADERS = {"User-Agent": "SmartphoneKG/1.0 (university project)"}

# Keywords to identify phone-related Wikidata entries.
# Each entry is a lower-case substring that query_wikidata() looks for in the
# lower-cased label and description of a search hit.
# NOTE: every string literal must stay on a single physical line; a literal
# broken across lines is a SyntaxError.
PHONE_KEYWORDS = [
    # Generic
    "smartphone", "phone model", "mobile phone", "android",
    "generation of the", "series of smartphones",
    # Brands
    "iphone", "apple", "samsung", "galaxy", "google", "pixel",
    "xiaomi", "redmi", "poco", "oneplus", "oppo", "realme", "vivo", "iqoo",
    "huawei", "honor", "motorola", "moto", "sony", "asus", "nothing", "tecno",
    # Series
    "rog", "xperia", "zenfone", "razr", "nord", "narzo", "camon", "spark",
    "pova", "nova", "mate", "magic", "fold", "flip", "edge", "note", "find", "reno",
]


def query_spotlight(label: str) -> str | None:
    """Get a DBpedia URI for a brand via the Spotlight annotate endpoint.

    The text ``"<label> company"`` is posted to ``SPOTLIGHT_URL`` and the
    first returned resource whose surface form contains ``label``
    (case-insensitively) is selected.

    Args:
        label: Brand name to look up.

    Returns:
        The ``@URI`` of the first matching resource, or ``None`` when no
        resource matches or the request fails.
    """
    try:
        r = requests.post(
            SPOTLIGHT_URL,
            data={"text": f"{label} company", "confidence": 0.8},
            headers={"Accept": "application/json"},
            timeout=10,
        )
        # Turn HTTP error statuses into an exception so the reported error
        # names the real cause instead of a JSON decoding failure.
        r.raise_for_status()
        wanted = label.lower()
        for res in r.json().get("Resources", []):
            if wanted in res.get("@surfaceForm", "").lower():
                return res["@URI"]
    except Exception as e:
        # Linking stays best-effort, but failures are reported (same style as
        # query_wikidata) instead of being silently discarded.
        print(f"  Spotlight error: {e}")
    return None


def clean_phone_name(uri: str) -> str:
    """Turn the last path segment of a phone URI into a title-cased search name.

    Underscores become spaces, and when the first two words are the same
    (ignoring case) the first one is dropped, e.g.
    ``.../apple_apple_iphone_16`` -> ``"Apple Iphone 16"``.
    """
    # Everything after the final "/" (the whole string when there is none).
    _, _, tail = uri.rpartition("/")
    tokens = tail.replace("_", " ").split()

    # Collapse a doubled leading brand ("apple apple ..." -> "apple ...").
    if len(tokens) > 1 and tokens[0].lower() == tokens[1].lower():
        del tokens[0]

    return " ".join(tokens).title()


def query_wikidata(phone_name: str) -> str | None:
    """Look up ``phone_name`` with the Wikidata entity search.

    Returns the entity URI of the first hit whose lower-cased label or
    description contains one of ``PHONE_KEYWORDS``; ``None`` if there is no
    such hit or the lookup raised (the error is printed).
    """
    search_params = {
        "action": "wbsearchentities",
        "search": phone_name,
        "language": "en",
        "type": "item",
        "limit": 5,
        "format": "json",
    }
    try:
        response = requests.get(
            WIKIDATA_API,
            params=search_params,
            headers=HEADERS,
            timeout=10,
        )
        for hit in response.json().get("search", []):
            description = hit.get("description", "").lower()
            hit_label = hit.get("label", "").lower()
            # Keep only hits that look smartphone-related.
            for keyword in PHONE_KEYWORDS:
                if keyword in description or keyword in hit_label:
                    return f"http://www.wikidata.org/entity/{hit['id']}"
    except Exception as e:
        print(f"  Wikidata error: {e}")
    return None


def perform_linkage(graph: Graph) -> Graph:
    """Build a graph of owl:sameAs links for the entities found in ``graph``.

    Every ``sp:Brand`` is looked up with ``query_spotlight`` (DBpedia) and
    every ``sp:BasePhone`` with ``query_wikidata``. Only the generated links
    are returned; ``graph`` itself is left untouched.
    """
    links = Graph()
    links.bind("owl", OWL)
    links.bind("sp", SP)

    # --- Brands -> DBpedia ---
    brand_uris = [node for node in graph.subjects(RDF.type, SP.Brand) if isinstance(node, URIRef)]
    for brand_uri in brand_uris:
        brand_name = str(brand_uri).split("/")[-1].replace("_", " ")
        dbpedia_uri = query_spotlight(brand_name)
        if not dbpedia_uri:
            continue
        links.add((brand_uri, OWL.sameAs, URIRef(dbpedia_uri)))
        print(f"+ Brand: {brand_name} -> {dbpedia_uri}")

    # --- Phones -> Wikidata ---
    phone_uris = [node for node in graph.subjects(RDF.type, SP.BasePhone) if isinstance(node, URIRef)]
    phones_linked = 0
    for phone_uri in phone_uris:
        search_name = clean_phone_name(str(phone_uri))

        # Pause before each lookup to avoid rate limiting.
        time.sleep(1)

        wikidata_uri = query_wikidata(search_name)
        if not wikidata_uri:
            continue
        links.add((phone_uri, OWL.sameAs, URIRef(wikidata_uri)))
        print(f"+ Phone: {search_name} -> {wikidata_uri}")
        phones_linked += 1

    print(f"\nGenerated {len(links)} owl:sameAs links ({phones_linked} phones)")
    return links

## T-Box Alignment

In [65]:
import json
import re
from pathlib import Path
from typing import TypedDict

import requests
from rdflib import Graph, Namespace, URIRef
from rdflib.namespace import OWL, RDFS, SKOS

# Namespace of the local ontology terms that are aligned in this section.
SP = Namespace("http://example.org/smartphone#")

# Term-search endpoint queried by query_lov().
LOV_API = "https://lov.linkeddata.es/dataset/lov/api/v2/term/search"
# Chat-completions endpoint and model name used by ask_llm().
OPENAI_URL = "https://api.openai.com/v1/chat/completions"
OPENAI_MODEL = "gpt-4o-mini"

# query_lov() keeps a candidate only if its URI contains one of these substrings.
TRUSTED_VOCABS = ["schema.org", "dbpedia.org", "wikidata.org", "purl.org/goodrelations",
                  "xmlns.com/foaf", "purl.org/dc", "w3.org/2004/02/skos"]

# Maps the "relation" keyword returned by the LLM to the predicate that is
# written into the alignment graph.
RELATION_PREDICATES: dict[str, URIRef] = {
    "equivalent": OWL.equivalentClass,
    "subclass": RDFS.subClassOf,
    "subproperty": RDFS.subPropertyOf,
    "exact": SKOS.exactMatch,
    "close": SKOS.closeMatch,
}

# Hand-written (subject, predicate, object) alignments. perform_alignment()
# adds these first, and later skips any generated alignment for the same
# (subject, object) pair.
MANUAL_ALIGNMENTS: list[tuple[URIRef, URIRef, URIRef]] = [
    (SP.BasePhone, RDFS.subClassOf, URIRef("http://schema.org/Product")),
    (SP.BasePhone, SKOS.exactMatch, URIRef("http://www.wikidata.org/entity/Q22645")),
    (SP.User, RDFS.subClassOf, URIRef("http://schema.org/Person")),
    (SP.User, RDFS.subClassOf, URIRef("http://xmlns.com/foaf/0.1/Person")),
    (SP.TagSentiment, SKOS.closeMatch, URIRef("http://schema.org/Rating")),
    (SP.manufactures, SKOS.closeMatch, URIRef("http://schema.org/manufacturer")),
    (SP.manufactures, SKOS.closeMatch, URIRef("http://dbpedia.org/ontology/manufacturer")),
    (SP.likes, SKOS.closeMatch, URIRef("http://xmlns.com/foaf/0.1/interest")),
    (SP.supportsNFC, SKOS.closeMatch, URIRef("http://dbpedia.org/ontology/feature")),
]


class LOVCandidate(TypedDict):
    """One alignment candidate as assembled by ``query_lov``."""

    # URI of the external term (first element of the result's "uri" list).
    uri: str
    # First element of the result's "prefixedName" list, or "" if absent.
    label: str
    # The result's "score" value, 0 if absent.
    score: float


class LLMChoice(TypedDict):
    """One match selected by ``ask_llm`` for a local term."""

    # URI of the external term the local term is aligned to.
    uri: str
    # Relation keyword; looked up in RELATION_PREDICATES by perform_alignment.
    relation: str


def query_lov(term: str, term_type: str) -> list[LOVCandidate]:
    """Query the LOV term-search API for alignment candidates.

    Two spellings of ``term`` are searched: the name as given and a
    lower-case version with a space inserted before every capital letter
    (``"BasePhone"`` -> ``"base phone"``). Results are kept only when their
    URI contains one of ``TRUSTED_VOCABS``, and each URI is kept once.

    Args:
        term: Local name of the ontology term.
        term_type: Value passed as the API's ``type`` parameter.

    Returns:
        At most 20 candidates, in the order they were first seen. Request
        errors are printed and the affected query contributes nothing more.
    """
    candidates: dict[str, LOVCandidate] = {}
    spaced = re.sub(r'([A-Z])', r' \1', term).strip().lower()
    # dict.fromkeys removes the second spelling when it equals the first
    # (a term without capitals), so the same request is not sent twice.
    queries = list(dict.fromkeys([term, spaced]))

    for q in queries:
        try:
            r = requests.get(LOV_API, params={"q": q, "type": term_type, "page_size": 20}, timeout=10)
            # Report HTTP failures as such rather than as a JSON decoding error.
            r.raise_for_status()
            for res in r.json().get("results", []):
                # "uri" and "prefixedName" are lists; fall back to a
                # placeholder so an empty or missing list cannot raise
                # IndexError and cut the remaining results short.
                uris = res.get("uri") or [None]
                uri: str | None = uris[0]
                if uri and any(v in uri for v in TRUSTED_VOCABS) and uri not in candidates:
                    labels = res.get("prefixedName") or [""]
                    candidates[uri] = {
                        "uri": uri,
                        "label": labels[0],
                        "score": res.get("score", 0),
                    }
        except Exception as e:
            print(f"  LOV error: {e}")

    return list(candidates.values())[:20]


def ask_llm(name: str, term_type: str, comment: str, candidates: list[LOVCandidate]) -> list[LLMChoice]:
    """Ask the LLM to choose the best alignments among ``candidates``.

    Args:
        name: Local name of the ontology term.
        term_type: Term kind shown to the model in the prompt.
        comment: Description of the term; a placeholder is used when empty.
        candidates: Candidate external terms, typically from ``query_lov``.

    Returns:
        The list of matches returned by the model (either a bare JSON list or
        the ``"matches"`` entry of a JSON object). An empty list is returned
        when there are no candidates or the reply has no usable list. If the
        request or the parsing fails, the error is printed and the first
        candidate is returned with relation ``"close"``.
    """
    if not candidates:
        return []

    cand_text = "\n".join(f"  {i+1}. {c['uri']} ({c['label']}, score={c['score']:.2f})"
                          for i, c in enumerate(candidates))

    prompt = f"""You are an expert in ontology alignment for Linked Open Data. Your task is to align terms from a smartphone domain ontology (namespace: http://example.org/smartphone#) to well-known external vocabularies.

## ONTOLOGY TERM TO ALIGN
- **Local name:** {name}
- **Type:** {term_type}
- **Description:** {comment or 'No description available'}
- **Domain context:** This term belongs to a smartphone/mobile device ontology covering phones, brands, specifications (RAM, storage, battery, display), features (5G, NFC, wireless charging), pricing, user reviews, and sentiment analysis.

## CANDIDATE MATCHES FROM LINKED OPEN VOCABULARIES
{cand_text}

## YOUR TASK
Select ONLY semantically appropriate matches (up to 10). For each match, specify the relationship type:

- **"equivalent"** → owl:equivalentClass/Property - Identical meaning, can be used interchangeably
- **"subclass"** / **"subproperty"** → rdfs:subClassOf/subPropertyOf - Our term is more specific than the external one
- **"exact"** → skos:exactMatch - Same concept, suitable for cross-vocabulary linking
- **"close"** → skos:closeMatch - Similar but not identical meaning, useful for discovery

## CRITICAL SEMANTIC CONSTRAINTS - READ CAREFULLY

**Understand the actual meaning of each candidate before matching:**
- Read the URI path carefully: "BusinessEntity" means organization/company, NOT a product
- "fileSize" means size of a file in bytes, NOT storage capacity of a device
- "foaf:phone" is for telephone NUMBERS (strings like "+1-555-1234"), NOT phone devices
- "Camera" as a class means a camera device, NOT a phone with a good camera

**REJECT matches that are:**
- Lexically similar but semantically different (e.g., "phone" in phoneName vs foaf:phone)
- Wrong domain (e.g., a Product class matched to an Organization/BusinessEntity class)
- Wrong measurement type (e.g., storage capacity matched to file size)
- Overly generic when specific alternatives exist

**Quality over quantity:** Return FEWER but CORRECT matches. An empty result is better than wrong alignments.

## GUIDELINES
- Prefer Schema.org and DBpedia for broad interoperability
- Phone subclasses (like HighResolutionCameraPhone) should align to schema:Product or similar, NOT to Camera
- Property names about device specs should match device/product properties, not unrelated concepts
- Each URI should appear only ONCE with its best-fitting relation

## RESPONSE FORMAT
Return ONLY a JSON object with a "matches" array:
{{"matches": [{{"uri": "http://schema.org/Product", "relation": "subclass"}}, ...]}}

If no good matches exist, return: {{"matches": []}}"""

    try:
        r = requests.post(
            OPENAI_URL,
            headers={
                "Authorization": f"Bearer {API_KEY}",
                "Content-Type": "application/json",
            },
            json={
                "model": OPENAI_MODEL,
                "messages": [{"role": "user", "content": prompt}],
                "response_format": {"type": "json_object"},
            },
            timeout=60,
        )
        # Without this, an API error (bad key, rate limit, ...) only shows up
        # as an opaque KeyError on "choices" below.
        r.raise_for_status()
        content = r.json()["choices"][0]["message"]["content"]
        parsed = json.loads(content)
    except Exception as e:
        print(f"  LLM error: {e}, using top LOV match")
        # candidates is non-empty here (checked at the top of the function).
        return [{"uri": candidates[0]["uri"], "relation": "close"}]

    # Handle both {"matches": [...]} and [...] formats
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, dict):
        matches = parsed.get("matches")
        # Only hand a real list back; the caller iterates over the result.
        if isinstance(matches, list):
            return matches
    return []

def _relation_predicate(relation: object, term_type: str) -> URIRef:
    """Map an LLM relation keyword to the predicate that fits ``term_type``."""
    if not isinstance(relation, str):
        # Unusable value from the model: fall back to the weakest relation.
        return SKOS.closeMatch
    if term_type == "property":
        # Properties must never receive the class-level predicates.
        if relation == "equivalent":
            return OWL.equivalentProperty
        if relation in ("subclass", "subproperty"):
            return RDFS.subPropertyOf
    elif relation in ("subclass", "subproperty"):
        # Likewise a class must not receive rdfs:subPropertyOf.
        return RDFS.subClassOf
    return RELATION_PREDICATES.get(relation, SKOS.closeMatch)


def perform_alignment(graph: Graph) -> Graph:
    """Return a graph containing only the generated ontology alignments.

    The manual alignments are added first. Then every term of the ``SP``
    namespace that ``graph`` declares as ``owl:Class``,
    ``owl:ObjectProperty`` or ``owl:DatatypeProperty`` is sent through
    ``query_lov`` and ``ask_llm``, and each selected match is added with a
    predicate suited to the term's kind. A (term, external URI) pair is
    aligned at most once, so manual alignments take precedence.

    Args:
        graph: Graph holding the ontology declarations and ``rdfs:comment``s.

    Returns:
        A new graph with the alignment triples; ``graph`` is not modified.
    """

    result = Graph()
    result.bind("owl", OWL)
    result.bind("rdfs", RDFS)
    result.bind("skos", SKOS)
    result.bind("sp", SP)

    # (subject, object) pairs already aligned -> predicate that was used.
    seen: dict[tuple[str, str], URIRef] = {}

    # Apply manual alignments into result graph
    for s, p, o in MANUAL_ALIGNMENTS:
        key = (str(s), str(o))
        if key not in seen:
            result.add((s, p, o))
            seen[key] = p
            print(f"+ manual: {s.split('#')[-1]} -> {o}")

    # Collect ontology terms from the original graph
    terms: dict[URIRef, str] = {}
    sp_prefix = str(SP)

    for uri in graph.subjects(None, OWL.Class):
        if isinstance(uri, URIRef) and str(uri).startswith(sp_prefix):
            terms[uri] = "class"

    # Ask the graph directly for subjects declared as properties instead of
    # scanning every triple; a term found here overrides a "class" entry.
    for property_kind in (OWL.ObjectProperty, OWL.DatatypeProperty):
        for uri in graph.subjects(None, property_kind):
            if isinstance(uri, URIRef) and str(uri).startswith(sp_prefix):
                terms[uri] = "property"

    print(f"Aligning {len(terms)} terms…")

    for uri, term_type in terms.items():
        name = str(uri).split("#")[-1]
        comment = str(next(graph.objects(uri, RDFS.comment), ""))

        candidates = query_lov(name, term_type)
        choices = ask_llm(name, term_type, comment, candidates)

        seen_uris: set[str] = set()
        for choice in choices:
            if not isinstance(choice, dict):
                continue
            ext = choice.get("uri")
            # Skip missing, non-string or repeated URIs from the model.
            if not isinstance(ext, str) or not ext or ext in seen_uris:
                continue

            seen_uris.add(ext)

            key = (str(uri), ext)
            if key in seen:
                continue

            relation = choice.get("relation", "close")
            pred = _relation_predicate(relation, term_type)

            result.add((uri, pred, URIRef(ext)))
            seen[key] = pred
            print(f"+ {name} -> {relation}: {ext}")

    return result

## Unstructured Data to RDF Graph

In [66]:
from rdflib import Graph

def get_unstructured_data_as_rdf() -> Graph:
    """Placeholder for the unstructured-data step; always raises."""
    message = "Function get_unstructured_data_as_rdf is not yet implemented."
    raise NotImplementedError(message)

## Materialize Constructs

In [77]:
from rdflib import Graph

# (rule name, SPARQL CONSTRUCT query) pairs run by materialize_construct_rules().
# The name is a label only; the triples produced come from the query text.
SPARQL_CONSTRUCTS=[
    # Types a base phone as HighResolutionCameraPhone when mainCameraMP >= 100.
    ("HighResolutionCameraPhone", """
        PREFIX sp: <http://example.org/smartphone#>
        CONSTRUCT { ?phone a sp:HighResolutionCameraPhone }
        WHERE { ?phone a sp:BasePhone ; sp:mainCameraMP ?mp . FILTER(?mp >= 100) }
    """),
    # Types a base phone as LargeBatteryPhone when batteryCapacityMah >= 5000.
    ("LargeBatteryPhone", """
        PREFIX sp: <http://example.org/smartphone#>
        CONSTRUCT { ?phone a sp:LargeBatteryPhone }
        WHERE { ?phone a sp:BasePhone ; sp:batteryCapacityMah ?mah . FILTER(?mah >= 5000) }
    """),
    # The three rules below assign a price segment to a phone configuration.
    # The NOT EXISTS clause discards an offering when another offering for the
    # same configuration has a lower price, so the lowest price decides.
    # Budget: price < 400.
    ("BudgetPhone", """
        PREFIX sp: <http://example.org/smartphone#>
        PREFIX spv: <http://example.org/smartphone/vocab#>
        CONSTRUCT { ?config sp:hasPriceSegment spv:Budget }
        WHERE {
            ?config a sp:PhoneConfiguration .
            ?offering sp:forConfiguration ?config ; sp:priceValue ?price .
            FILTER(?price < 400)
            FILTER NOT EXISTS {
                ?other sp:forConfiguration ?config ; sp:priceValue ?lower .
                FILTER(?lower < ?price)
            }
        }
    """),
    # MidRange: 400 <= price <= 900.
    ("MidRangePhone", """
        PREFIX sp: <http://example.org/smartphone#>
        PREFIX spv: <http://example.org/smartphone/vocab#>
        CONSTRUCT { ?config sp:hasPriceSegment spv:MidRange }
        WHERE {
            ?config a sp:PhoneConfiguration .
            ?offering sp:forConfiguration ?config ; sp:priceValue ?price .
            FILTER(?price >= 400 && ?price <= 900)
            FILTER NOT EXISTS {
                ?other sp:forConfiguration ?config ; sp:priceValue ?lower .
                FILTER(?lower < ?price)
            }
        }
    """),
    # Flagship: price > 900.
    ("FlagshipPhone", """
        PREFIX sp: <http://example.org/smartphone#>
        PREFIX spv: <http://example.org/smartphone/vocab#>
        CONSTRUCT { ?config sp:hasPriceSegment spv:Flagship }
        WHERE {
            ?config a sp:PhoneConfiguration .
            ?offering sp:forConfiguration ?config ; sp:priceValue ?price .
            FILTER(?price > 900)
            FILTER NOT EXISTS {
                ?other sp:forConfiguration ?config ; sp:priceValue ?lower .
                FILTER(?lower < ?price)
            }
        }
    """),
]

def materialize_construct_rules(
    graph: Graph,
) -> Graph:
    """
    Run every query in SPARQL_CONSTRUCTS against ``graph`` and collect
    the constructed triples in a new graph, which is returned.
    """

    inferred = Graph()

    for _rule_name, sparql in SPARQL_CONSTRUCTS:
        rule_output = graph.query(sparql).graph
        # Nothing to merge when the rule produced no (or an empty) graph.
        if not rule_output:
            continue
        inferred += rule_output

    return inferred

## Apply OWL Reasoning

In [78]:
from rdflib import Graph
import owlrl

def apply_owl_rl_reasoning(graph: Graph) -> Graph:
    """
    Run OWL RL reasoning and return only the inferred triples.

    The closure is computed on a copy, so ``graph`` is not modified.

    Args:
        graph: Graph to reason over.

    Returns:
        A new graph holding the triples present after the expansion that
        were not already in ``graph``.
    """

    # Work on a copy: expand() adds the inferred triples in place.
    temp = Graph()
    temp += graph

    owlrl.DeductiveClosure(owlrl.OWLRL_Semantics).expand(temp)

    # Keep only what the reasoner added on top of the input.
    inferred = Graph()
    for triple in temp:
        if triple not in graph:
            inferred.add(triple)

    return inferred

# Execute Pipeline

In [79]:
# Load base facts and ontology layers into the knowledge graph.
# All listed Turtle files are merged into the single graph `kg`, which the
# following cells extend in place.
kg = load_graph([
    # Facts
    Path("rdf/subgraphs/data.ttl"),
    # Schema
    Path("rdf/schema/owl.ttl"),
    Path("rdf/schema/shapes.ttl"),
    Path("rdf/schema/skos.ttl")
])

# Persist the initial graph snapshot (before links, alignments, constructed
# and inferred triples are added)
export_graph(kg, Path("rdf/subgraphs/knowledge_graph_initial.ttl"))

In [80]:
# A-Box links: reuse the saved file when present, otherwise generate and save it.
links_path = Path("rdf/subgraphs/links.ttl")

if not links_path.exists():
    links = perform_linkage(kg)
    export_graph(links, links_path)
    kg += links
else:
    kg += load_graph([links_path])

+ Brand: Apple -> http://dbpedia.org/resource/Apple_Inc.
+ Brand: Asus -> http://dbpedia.org/resource/Asus
+ Brand: Google -> http://dbpedia.org/resource/Google
+ Brand: Huawei -> http://dbpedia.org/resource/Huawei
+ Brand: Motorola -> http://dbpedia.org/resource/Motorola
+ Brand: OnePlus -> http://dbpedia.org/resource/OnePlus
+ Brand: Oppo -> http://dbpedia.org/resource/Oppo
+ Brand: Realme -> http://dbpedia.org/resource/Realme
+ Brand: Samsung -> http://dbpedia.org/resource/Samsung
+ Brand: Sony -> http://dbpedia.org/resource/Sony
+ Brand: Xiaomi -> http://dbpedia.org/resource/Xiaomi
+ Phone: Apple Iphone 16 -> http://www.wikidata.org/entity/Q130267745
+ Phone: Apple Iphone 16 Plus -> http://www.wikidata.org/entity/Q130312645
+ Phone: Apple Iphone 16 Pro -> http://www.wikidata.org/entity/Q130267746
+ Phone: Apple Iphone 16 Pro Max -> http://www.wikidata.org/entity/Q130315105
+ Phone: Apple Iphone 16E -> http://www.wikidata.org/entity/Q132559447
+ Phone: Apple Iphone 17 -> http://www.

In [81]:
# T-Box alignments: reuse the saved file when present, otherwise generate and save it.
alignments_path = Path("rdf/subgraphs/alignments.ttl")

if not alignments_path.exists():
    alignments = perform_alignment(kg)
    export_graph(alignments, alignments_path)
    kg += alignments
else:
    kg += load_graph([alignments_path])

In [82]:
# unstructured_path = Path("rdf/subgraphs/unstructured.ttl")

# if unstructured_path.exists():
#     kg += load_graph([unstructured_path])
# else:
#     unstructured = get_unstructured_data_as_rdf()
#     export_graph(unstructured, unstructured_path)
#     kg += unstructured

In [83]:
# CONSTRUCT rules: reuse the saved file when present, otherwise materialize and save it.
constructed_path = Path("rdf/subgraphs/constructed.ttl")

if not constructed_path.exists():
    constructed = materialize_construct_rules(kg)
    export_graph(constructed, constructed_path)
    kg += constructed
else:
    kg += load_graph([constructed_path])

In [84]:
# OWL RL inferences: reuse the saved file when present, otherwise run the reasoner and save it.
inferred_path = Path("rdf/subgraphs/inferred.ttl")

if not inferred_path.exists():
    inferred = apply_owl_rl_reasoning(kg)
    export_graph(inferred, inferred_path)
    kg += inferred
else:
    kg += load_graph([inferred_path])

In [85]:
# Final export: write `kg`, including everything merged into it by the
# preceding cells, to a single Turtle file.
export_graph(kg, Path("rdf/knowledge_graph_full.ttl"))