In [1]:
import datetime as dt
from libs.utils.params import Params

# Experiment configuration: a nested `Params` tree that is written to
# "test.json" by the `params.save(...)` call at the end of this cell.
# NOTE(review): the meaning of each key is defined by the code that consumes
# `params`, which is not visible here; notes marked "presumably" or "confirm"
# are assumptions, not facts.
params = Params(
    size=Params(
        # Sampling sizes
        size=1000,
        initial=4000,
        decrease=0.9,
        plateau=600
    ),
    # Random state
    # NOTE(review): no seed is fixed here; presumably runs are then not
    # reproducible -- confirm how the consumer treats `seed=None`.
    seed=None,
    # Thresholds
    threshold=Params(
        adaptative=True,
        opt=0.9,
        initial=0.9,
        min=0.6,
        step=0.05,
        expressive=0.5,
        current=0.9,
    ),
    max_depth=4,
    max_depth_step=0,
    # Kinds of patterns enabled (both switched on here).
    patterns=Params(
        individuals=True,
        existential=True,
    ),
    embeddings="toy",
    # Clustering settings, passed as string identifiers.
    clustering=Params(
        affinity="euclidean",
        linkage="ward",
    ),
    metric="harmonic",
    max_axioms=2,
    min_gain=0.08,
    allow_child=False,
    sort_axioms=False,
    others=Params(
        keep=True,
        n=8, # Max number of candidates to keep
        threshold=0.9, # Fraction of the optimal score (0.9 = 90%)
    ),
    # Halting limits.
    # NOTE(review): 110*1024**2 evaluates to 115,343,360. The original comment
    # labelled it "gigabytes": that holds only if the limit is expressed in KiB
    # (110 GiB); if it is expressed in bytes it is 110 MiB. Confirm the unit
    # against the code that enforces `memory_limit`.
    halting=Params(
        min_size=30,
        max_rec_steps=40,
        max_clustering_steps=100,
        max_extracted_depth=15,
        memory_limit=110*1024**2 # = 115,343,360; unit unverified, see note above
    ),
    extra=Params(
        active=True,
        n=100,
        reset_classes=True,
        depth=20,
        threshold=0.15
    ),
    # Output settings.
    # `name_pattern` is a `str.format` template: it reads
    # `halting.max_clustering_steps` and formats a `timestamp` field with a
    # datetime format spec.
    # NOTE(review): no `timestamp` entry is defined in this cell; presumably the
    # consumer supplies it -- confirm.
    record=Params(
        save_taxonomy=True,
        checkpoints=True,
        checkpoint_every=100,
        dirname="results/taxonomy/auto",
        name_pattern="taxonomy_{halting.max_clustering_steps}s_{timestamp:%m%d_%Hh%M}"
    ),
    display=True
)
# Disabled: would expand `name_pattern` from the params themselves.
# NOTE(review): as written it would need a `timestamp` entry that is not defined above.
#params.record.taxname = params.record.name_pattern.format(**params)

# Persist the configuration to "test.json" (path relative to the working directory).
params.save("test.json")

In [2]:
from libs.graph import KnowledgeGraph

# Build the knowledge graph `kg` that the later cells query and sample from.
# The recorded output of this cell shows a progress bar over 316,114 triples.
kg = KnowledgeGraph.from_dir("toy")

Triples: 100%|█████████████████████████████████████████████████████████████| 316114/316114 [00:02<00:00, 125380.10it/s]


In [17]:
import libs.expressive.extractor as exp
from importlib import reload
import libs.sampling as lsamp
from libs.axiom import Existential, Concept

# Re-import both modules so that edits made on disk since the first import
# are picked up without restarting the kernel.
reload(lsamp)
reload(exp)


# Keep the keys of `kg._h` whose associated collection holds at least 4 items,
# and sample only among those.
relevant_ids = {
    head
    for head, relations in kg._h.items()
    if len(relations) >= 4
}
sampler = lsamp.NaiveGraphSampler(kg, relevant_ids)

# Build the extractor, initialise it, then run it with an argument of 10
# (10 clustering steps).
extr = exp.ExpressiveExtractor(
    kg,
    params,
    sampler=sampler,
    verbose="INFO",
)
extr.init()
extr.run(10)

# Fetch the taxonomy produced so far and print it.
T = extr.get_taxonomy()
T.print()

17:30:25 - INFO : Initialisation done.
17:30:25 - INFO : STEP 0: starting with axiom ⊤
17:30:26 - INFO : Subclasses found: ∃dbo:birthDate.{xsd:date}, ∃dbo:location.dbo:Location
17:30:26 - INFO : STEP 1: starting with axiom ∃dbo:birthDate.{xsd:date}
17:30:28 - INFO : Subclasses found: ∃dbo:deathPlace.dbo:Place, ∃dbo:birthPlace.dbo:Country
17:30:28 - INFO : STEP 2: starting with axiom ∃dbo:location.dbo:Location
17:30:29 - INFO : Subclasses found: ∃dbo:location.dbo:Country, ∃dbo:followingEvent.dbo:SocietalEvent
17:30:29 - INFO : STEP 3: starting with axiom ∃dbo:deathPlace.dbo:Place
17:30:30 - INFO : Subclasses found: ∃dbo:birthPlace.dbo:AdministrativeRegion, ∃foaf:surname.{<LABEL:en>}
17:30:30 - INFO : STEP 4: starting with axiom ∃dbo:birthPlace.dbo:Country
17:30:31 - INFO : Subclasses found: ∃dbo:deathPlace.dbo:Location, ∃dbo:number.{<STRING>}
17:30:31 - INFO : STEP 5: starting with axiom ∃dbo:location.dbo:Country
17:30:32 - INFO : Subclasses found: ∃dbo:fastestDriver.dbo:Person, ∃dbo:nu

**Sampling Examples**

For now, `GraphSampler` is not working properly so we use `NaiveGraphSampler`. In production, `NaiveGraphSampler` should not be used since it performs a full scan of the knowledge graph and thus is inefficient for real-world, large-scale graphs.

In [8]:
import libs.sampling as lsamp
from importlib import reload
reload(lsamp)
from libs.axiom import Existential, Concept

# Example of a complex axiom (arity 3): an existential restriction on
# dbo:nationality with the singleton {dbr:Japan}, combined through `&`
# with the concept dbo:Agent.
axiom = (
    Existential("dbo:nationality", Concept(singleton="dbr:Japan"))
    & Concept("dbo:Agent")
)
print(axiom)

# Restrict sampling to entities involved in 4 or more relations.
relevant_ids = {
    head
    for head, relations in kg._h.items()
    if len(relations) >= 4
}
sampler = lsamp.NaiveGraphSampler(kg, relevant_ids)

# Ask the sampler for 10 instances of the axiom, then show them as URIs.
# The recorded output lists 7 URIs, so fewer than requested can come back.
instances, size = sampler.sample(axiom, 10)
kg.ent.to_uris(*instances)

∃dbo:nationality.{dbr:Japan}∧dbo:Agent


['dbr:Yasunori_Nomura',
 'dbr:Kagami_Yoshimizu',
 'dbr:Tateo_Ozaki',
 'dbr:Hikari_Okubo',
 'dbr:Nabi_Tajima',
 'dbr:Saeko_Kimura',
 'dbr:Masahiko_Amakasu']

**Axiom Inducer Module**

This is a demo of the class `libs.axiom_extraction.Inducer`. To induce axioms, we need a set of positive examples $E^+$, a set of negative examples $E^-$, and a threshold $\delta \in [0, 1]$. In this demo, $E^+$ is a random subset of class `dbo:Athlete`, $E^-$ is a random subset of class `dbo:City`, and $\delta = 0.85$.

The algorithm outputs a list of axioms that have a partition score higher than $\delta$.

In [7]:
import libs.axiom_extraction as lae
from importlib import reload
import random
lae = reload(lae)


size = 5        # requested number of examples per class
delta = 0.85    # minimum partition score an axiom must reach to be returned
verbose = True  # warn when a class has fewer than `size` eligible instances

# Draw one random sample per class: positives first, then negatives.
samples = []
for cls in ("dbo:Athlete", "dbo:City"):
    # Resolve the class through `kg`, the same graph object used for the
    # membership test below (the name `graph` is not defined in this notebook).
    clsid = kg.ent.to_id(cls)
    instances = [x for x in relevant_ids if (x, kg.isaid, clsid) in kg]
    if verbose and size > len(instances):
        print(f"WARNING: Can't sample {size} items out of {len(instances)} instances of class {cls}")
    # Clamp per class without overwriting `size`, so a short class neither
    # shrinks the next class's sample nor changes the result of re-running the cell.
    n_drawn = min(size, len(instances))

    samples.append(random.sample(instances, n_drawn))

E_pos, E_neg = samples
# The inducer is built with threshold 0.0; `delta` is applied at search time.
ind = lae.Inducer(E_pos, E_neg, kg, threshold=0.0, verbose=True)

print(ind)
res = ind.find(allow_neg=False, threshold=delta)
res

Inducer(entities=10, axioms=125)
Finding axioms

Step 0/3: 1 axioms to improve
Improving __empty__...
Coverage too low (0.00<0.85). Adding OR clauses...
Specificity too low (0.00<0.85). Adding AND clauses...
...250 results found

Step 1/3: 5 axioms to improve
Improving ∃dbo:isPartOf.dbo:AdministrativeRegion...
Coverage too low (0.40<0.85). Adding OR clauses...
...124 results found
Improving ∃dbo:isPartOf.{dbr:Una_district}...
Coverage too low (0.20<0.85). Adding OR clauses...
...124 results found
Improving ∃dbo:part.{dbr:Human_settlement}...
Coverage too low (0.20<0.85). Adding OR clauses...
...124 results found
Improving ∃dbo:position.{dbr:Attacking_Midfielder}...
Coverage too low (0.00<0.85). Adding OR clauses...
Specificity too low (0.80<0.85). Adding AND clauses...
...248 results found
Improving ∃dbo:nationality.dbo:Place...
Coverage too low (0.00<0.85). Adding OR clauses...
Specificity too low (0.60<0.85). Adding AND clauses...
...248 results found

Step 2/3: 5 axioms to improve
I

0,1,2,3,4
axiom,cov,spe,sco,
0,,,,
∃dbo:isPartOf.dbo:AdministrativeRegion,0.40,1.00,0.70,0.0
∃dbo:isPartOf.{dbr:Una_district},0.20,1.00,0.60,0.0
∃dbo:part.{dbr:Human_settlement},0.20,1.00,0.60,0.0
∃dbo:position.{dbr:Attacking_Midfielder},0.00,0.80,0.40,0.0
∃dbo:nationality.dbo:Place,0.00,0.60,0.30,0.0
1,,,,
∃dbo:isPartOf.dbo:AdministrativeRegion∨∃dbo:isPartOf.{dbr:Una_district},0.60,1.00,0.80,1.0
∃dbo:isPartOf.dbo:AdministrativeRegion∨∃dbo:part.{dbr:Human_settlement},0.60,1.00,0.80,1.0
