In [1]:
import datetime as dt
from libs.utils.params import Params

# Experiment configuration: a nested Params tree. It is passed to
# exp.ExpressiveExtractor in a later cell and written to "test.json" at the
# end of this cell.
params = Params(
    size=Params(
        # Sampling sizes
        size=1000,
        initial=4000,
        decrease=0.9,
        plateau=600
    ),
    # Random state
    # None leaves the seed unset; presumably runs are then not reproducible -- TODO confirm
    seed=None,
    # Thresholds
    threshold=Params(
        adaptative=True,
        opt=0.9,
        initial=0.9,
        min=0.6,
        step=0.05,
        expressive=0.5,
        current=0.9,
    ),
    max_depth=4,
    max_depth_step=0,
    # Which kinds of axiom patterns are enabled
    patterns=Params(
        individuals=True,
        existential=True,
    ),
    # Same name ("toy") as the graph directory and the embeddings loaded in later cells
    embeddings="toy",
    clustering=Params(
        affinity="euclidean",
        linkage="ward",
    ),
    metric="harmonic",
    max_axioms=2,
    min_gain=0.08,
    allow_child=False,
    sort_axioms=False,
    others=Params(
        keep=True,
        n=8, # Max number of candidates to keep
        threshold=0.9, # % of the optimal score
    ),
    # Stopping criteria
    halting=Params(
        min_size=30,
        max_rec_steps=40,
        max_clustering_steps=100,
        max_extracted_depth=15,
        # 110*1024**2 == 115,343,360: that is 110 MiB if the limit is counted in bytes,
        # or about 110 GiB if it is counted in KiB. The original note said "gigabytes".
        # TODO confirm which unit the consumer of this value expects.
        memory_limit=110*1024**2
    ),
    extra=Params(
        active=True,
        n=100,
        reset_classes=True,
        depth=20,
        threshold=0.15
    ),
    # Output / checkpoint settings
    record=Params(
        save_taxonomy=True,
        checkpoints=True,
        checkpoint_every=100,
        dirname="results/taxonomy/auto",
        # Format-string template; it references `halting.max_clustering_steps` and a
        # `timestamp` field. No `timestamp` entry is defined in this configuration,
        # so it has to be supplied by whoever formats the pattern.
        name_pattern="taxonomy_{halting.max_clustering_steps}s_{timestamp:%m%d_%Hh%M}"
    ),
    display=True
)
# Disabled: would resolve the taxonomy name from the pattern right away.
# NOTE(review): as written it passes no `timestamp`, which the pattern requires.
#params.record.taxname = params.record.name_pattern.format(**params)

# Persist the configuration next to the notebook.
params.save("test.json")

In [2]:
from libs.graph import KnowledgeGraph

# Build the knowledge graph from the "toy" directory; the recorded progress bar
# below reports 316114 triples read.
kg = KnowledgeGraph.from_dir("toy")

Triples: 100%|█████████████████████████████████████████████████████████████| 316114/316114 [00:02<00:00, 125380.10it/s]


In [3]:
from libs.embeddings import load

# Load the embeddings registered under the name "toy".
E = load("toy")

# Sanity check; the recorded output is (54795, 50).
E.shape

(54795, 50)

In [7]:
import libs.expressive.extractor as exp
from importlib import reload
import libs.sampling as lsamp
from libs.axiom import Existential, Concept

# Re-import the project modules so edits to their source are picked up
# without restarting the kernel.
reload(lsamp)
reload(exp)


# Keys of kg._h whose value holds more than 3 items.
# NOTE(review): kg._h is a private attribute of KnowledgeGraph; presumably it maps
# a head entity id to its outgoing relations -- confirm, and prefer a public accessor.
relevant_ids = {h for h, rs in kg._h.items() if len(rs) > 3}
sampler = lsamp.NaiveGraphSampler(kg, relevant_ids)


# Build the extractor on the graph with the configuration from the first cell,
# initialise it, run it with argument 10 (presumably a step count -- TODO confirm),
# then print its `T` attribute (presumably the extracted taxonomy).
extr = exp.ExpressiveExtractor(kg, params, sampler=sampler, verbose="INFO")
extr.init()
extr.run(10)
extr.T.print()

10:14:27 - INFO : Initialisation done.
10:14:27 - INFO : STEP 0: starting with axiom ⊤
10:14:28 - INFO : Subclasses found: ∃dbo:birthDate.{xsd:date}, ∃dbo:location.dbo:Location
10:14:28 - INFO : STEP 1: starting with axiom ∃dbo:birthDate.{xsd:date}
10:14:29 - INFO : Subclasses found: ∃dbo:deathPlace.dbo:Place, ∃dbo:birthPlace.dbo:Country
10:14:29 - INFO : STEP 2: starting with axiom ∃dbo:location.dbo:Location
10:14:30 - INFO : Subclasses found: ∃dbo:location.dbo:Country, ∃dbo:followingEvent.dbo:SocietalEvent
10:14:30 - INFO : STEP 3: starting with axiom ∃dbo:deathPlace.dbo:Place
10:14:32 - INFO : Subclasses found: ∃dbo:birthPlace.dbo:AdministrativeRegion, ∃foaf:surname.{<LABEL:en>}
10:14:32 - INFO : STEP 4: starting with axiom ∃dbo:birthPlace.dbo:Country
10:14:33 - INFO : Subclasses found: ∃dbo:deathPlace.dbo:Location, ∃dbo:number.{<STRING>}
10:14:33 - INFO : STEP 5: starting with axiom ∃dbo:location.dbo:Country
10:14:34 - INFO : Subclasses found: ∃dbo:fastestDriver.dbo:Person, ∃dbo:nu

In [77]:
# Number of (key, value) pairs in `clu`; the recorded output is 1000.
# NOTE(review): `clu` is not defined in any visible cell, so this cell fails on a
# fresh kernel (Restart & Run All).
len(list(clu.items()))

1000

In [7]:
import logging

# Smoke test of the logging setup: fetch the root logger (empty name) and emit a
# warning; the recorded output shows the bare message "test".
root = logging.getLogger("")
root.warning("test")

test


In [9]:
import libs.sampling as lsamp
# `reload` is not imported in this cell; it comes from `from importlib import reload`
# in the extractor cell, so that cell must have been run first.
reload(lsamp)
from libs.axiom import Existential, Concept

# Keys of kg._h whose value holds more than 3 items (kg._h is a private attribute).
relevant_ids = {h for h, rs in kg._h.items() if len(rs) > 3}
sampler = lsamp.NaiveGraphSampler(kg, relevant_ids)
# Conjunction of an existential restriction (dbo:nationality with the singleton
# dbr:Japan) and the named concept dbo:Agent.
axiom = Existential("dbo:nationality", Concept(singleton="dbr:Japan")) & Concept("dbo:Agent")

# Ask for 10 instances of the axiom. The call returns a pair that is unpacked into
# `instances` and `size`; the recorded run below lists only 7 URIs.
instances, size = sampler.sample(axiom, 10)

# Translate the sampled ids to URIs for display.
kg.ent.to_uris(*instances)

['dbr:Yasunori_Nomura',
 'dbr:Kagami_Yoshimizu',
 'dbr:Tateo_Ozaki',
 'dbr:Hikari_Okubo',
 'dbr:Nabi_Tajima',
 'dbr:Saeko_Kimura',
 'dbr:Masahiko_Amakasu']

In [16]:
# Peek at the first 10 keys of the entity index; the recorded output is the
# integers 0..9.
list(kg.ent.idx.keys())[:10]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [12]:
# Axiom: has a dbo:birthName whose value is the singleton <LABEL:en>, and is a dbo:Agent.
ax = Existential("dbo:birthName", Concept(singleton="<LABEL:en>")) & Concept("dbo:Agent")

# Direct check on one known entity; the recorded output is True.
print(ax.holds_for(kg.ent.to_id("dbr:Kagami_Yoshimizu"), kg))

# NOTE(review): the recorded output is (set(), 0) although the axiom holds for the
# entity checked above -- the sampler finds no instance. Looks like a sampler
# discrepancy worth investigating.
sampler.sample(ax, 10)

True


(set(), 0)

In [None]:
# Id of the rdf:type relation.
isa = kg.rel.to_id("rdf:type")
# FIXME: unfinished -- to_id() is called without the relation name that the line
# above passes, and this cell was never executed (In [None]).
rid = kg.rel.to_id()

In [5]:
# Show the URIs of 10 entries of relevant_ids. Other cells bind relevant_ids to a
# set, in which case which 10 come first is not a stable order.
kg.ent.to_uris(*list(relevant_ids)[:10])

['dbr:Valeriy_Vorobyov',
 'dbr:Gornji_Mamići',
 'dbr:Raphael_Samuel',
 'dbr:Wyoming_Historic_District',
 'dbr:Lakeridge,_Saskatoon__St._Luke_School__1',
 'dbr:Portland_International_Airport_MAX_Station',
 'dbr:Lough_Neagh',
 'dbr:Kagami_Yoshimizu',
 'dbr:Air_Nigeria',
 'dbr:Akron,_Pennsylvania']

In [6]:
# Dump the relations recorded for one entity that appears in the sampler tests.
kg.print_relations("dbr:Kagami_Yoshimizu")

dbr:Kagami_Yoshimizu
	 rdf:type
		 dbo:Agent
		 dbo:Artist
		 dbo:Person
	 foaf:gender <LABEL:en>
	 dbo:birthName <LABEL:en>
	 dbo:nationality dbr:Japan
	 dbo:birthPlace
		 dbr:Saitama_Prefecture
		 dbr:Japan
		 dbr:Satte,_Saitama
	 dbo:birthDate xsd:date


In [6]:
# NOTE(review): the recorded run of this cell raised "IndexError: pop from empty list".
# The init() call is commented out; presumably next() needs a freshly initialised
# (or non-exhausted) extractor -- TODO confirm.
# extr.init()
extr.next()

IndexError: pop from empty list

In [4]:
import random

# The two classes to sample from and the number of instances per class.
ca, cb = "dbo:Athlete", "dbo:City"
size = 5

# Keys of kg._h whose value holds more than 5 items, as a list.
# NOTE(review): other cells bind this same name to a *set* built with a threshold
# of 3; sample_from_class reads whichever binding is current when it is called.
relevant_ids = [h for h, rs in kg._h.items() if len(rs) > 5]

def sample_from_class(cls, size, graph=kg, verbose=True):
    """Randomly pick up to ``size`` members of ``relevant_ids`` that are typed ``cls``.

    An id ``x`` is a candidate when the triple ``(x, graph.isaid, <id of cls>)``
    is contained in ``graph``. Candidates are taken from the notebook-level
    ``relevant_ids``, read at call time.

    Args:
        cls: Class name, resolved to an id with ``graph.ent.to_id``.
        size: Requested number of instances; lowered to the number of
            candidates when fewer are available.
        graph: Graph to query. The default is the ``kg`` object that existed
            when this function was defined.
        verbose: When true, print a warning if fewer than ``size`` candidates exist.

    Returns:
        A list of ids drawn without replacement by ``random.sample``.
    """
    class_id = graph.ent.to_id(cls)

    # Collect every relevant id carrying an is-a triple to the class.
    instances = []
    for entity in relevant_ids:
        if (entity, graph.isaid, class_id) in graph:
            instances.append(entity)

    available = len(instances)
    if verbose and size > available:
        print(f"WARNING: Can't sample {size} items out of {len(instances)} instances of class {cls}")

    # Never request more than what is available.
    how_many = available if available < size else size
    return random.sample(instances, how_many)

# Draw `size` instances of each class: A from `ca` (dbo:Athlete), B from `cb` (dbo:City).
A = sample_from_class(ca, size)
B = sample_from_class(cb, size)

In [7]:
import libs.axiom_extraction as lae
from importlib import reload

# Re-import the module so edits to its source are picked up.
lae = reload(lae)

# B (the dbo:City sample) is passed first and A (the dbo:Athlete sample) second;
# presumably these are the positive and negative examples -- TODO confirm against
# Inducer's signature.
ind = lae.Inducer(B, A, kg, threshold=0.0)

# The recorded output is "Inducer(entities=10, axioms=125)".
print(ind)

# Search for axioms with allow_neg disabled, then display the result.
res = ind.find(allow_neg=False)
res

Inducer(entities=10, axioms=125)
Finding axioms

Step 0/3: 1 axioms to improve
Improving __empty__...
Coverage too low (0.00<0.85). Adding OR clauses...
Specificity too low (0.00<0.85). Adding AND clauses...
...250 results found

Step 1/3: 5 axioms to improve
Improving ∃dbo:isPartOf.dbo:AdministrativeRegion...
Coverage too low (0.40<0.85). Adding OR clauses...
...124 results found
Improving ∃dbo:isPartOf.{dbr:Una_district}...
Coverage too low (0.20<0.85). Adding OR clauses...
...124 results found
Improving ∃dbo:part.{dbr:Human_settlement}...
Coverage too low (0.20<0.85). Adding OR clauses...
...124 results found
Improving ∃dbo:position.{dbr:Attacking_Midfielder}...
Coverage too low (0.00<0.85). Adding OR clauses...
Specificity too low (0.80<0.85). Adding AND clauses...
...248 results found
Improving ∃dbo:nationality.dbo:Place...
Coverage too low (0.00<0.85). Adding OR clauses...
Specificity too low (0.60<0.85). Adding AND clauses...
...248 results found

Step 2/3: 5 axioms to improve
I

0,1,2,3,4
axiom,cov,spe,sco,
0,,,,
∃dbo:isPartOf.dbo:AdministrativeRegion,0.40,1.00,0.70,0.0
∃dbo:isPartOf.{dbr:Una_district},0.20,1.00,0.60,0.0
∃dbo:part.{dbr:Human_settlement},0.20,1.00,0.60,0.0
∃dbo:position.{dbr:Attacking_Midfielder},0.00,0.80,0.40,0.0
∃dbo:nationality.dbo:Place,0.00,0.60,0.30,0.0
1,,,,
∃dbo:isPartOf.dbo:AdministrativeRegion∨∃dbo:isPartOf.{dbr:Una_district},0.60,1.00,0.80,1.0
∃dbo:isPartOf.dbo:AdministrativeRegion∨∃dbo:part.{dbr:Human_settlement},0.60,1.00,0.80,1.0


In [7]:
# Dump the relations of one entity; the recorded output lists only rdf:type entries.
kg.print_relations("dbr:Henri_Cochet")

dbr:Henri_Cochet
	 rdf:type
		 dbo:TennisPlayer
		 dbo:Athlete
		 dbo:Agent
		 dbo:Person


In [8]:
# List the first (up to) 10 sampled ids in B with their names.
for i in B[:10]:
    print(i, kg.ent.to_name(i))

33830 dbr:Lake_of_Two_Mountains
1401 dbr:La_Sauzière-Saint-Jean
21573 dbr:Mount_Helen,_Victoria
33700 dbr:Incheon
27706 dbr:Farino
39428 dbr:Kissidougou_Prefecture
13376 dbr:Las_Piedras,_Artigas
1993 dbr:Quilmes
34 dbr:Munich
46924 dbr:Yazd


In [11]:
# Keys of kg._h whose value holds more than 5 items, as a list (same expression as
# in the earlier sampling cell; other cells use a set with a threshold of 3).
relevant_ids = [h for h, rs in kg._h.items() if len(rs) > 5]

def sample_from_class(cls, size, graph=kg, verbose=True):
    """Randomly pick up to ``size`` members of ``relevant_ids`` that are typed ``cls``.

    An id ``x`` is a candidate when the triple ``(x, graph.isaid, <id of cls>)``
    is contained in ``graph``. Candidates are taken from the notebook-level
    ``relevant_ids``, read at call time.

    Args:
        cls: Class name, resolved to an id with ``graph.ent.to_id``.
        size: Requested number of instances; lowered to the number of
            candidates when fewer are available.
        graph: Graph to query. The default is the ``kg`` object that existed
            when this function was defined.
        verbose: When true, print a warning if fewer than ``size`` candidates exist.

    Returns:
        A list of ids drawn without replacement by ``random.sample``.
    """
    class_id = graph.ent.to_id(cls)

    # Collect every relevant id carrying an is-a triple to the class.
    instances = []
    for entity in relevant_ids:
        if (entity, graph.isaid, class_id) in graph:
            instances.append(entity)

    available = len(instances)
    if verbose and size > available:
        print(f"WARNING: Can't sample {size} items out of {len(instances)} instances of class {cls}")

    # Never request more than what is available.
    how_many = available if available < size else size
    return random.sample(instances, how_many)

# Request 100 tennis players; the recorded output lists 22 ids, i.e. the request
# was capped at the number of matching entries in relevant_ids.
sample_from_class("dbo:TennisPlayer", 100)



[23735,
 10003,
 19116,
 33,
 3242,
 5270,
 9621,
 11046,
 8582,
 24551,
 22815,
 25026,
 31238,
 28302,
 45888,
 2906,
 8416,
 7846,
 4370,
 24435,
 30291,
 16864]

In [None]:

def sample_from_class(cls, size, graph=kg, verbose=True):
    """Randomly pick up to ``size`` instances of ``cls`` straight from the graph.

    Unlike the list-filtering variants defined in earlier cells, this one does
    not look at ``relevant_ids``: candidates are the heads of every triple
    returned by ``graph.find_triples(r=graph.isaid, t=<id of cls>)``.

    Args:
        cls: Class name, resolved to an id with ``graph.ent.to_id``.
        size: Requested number of instances. When the class has fewer
            instances the request is capped instead of letting
            ``random.sample`` raise ``ValueError``.
        graph: Graph to query. The default is the ``kg`` object that existed
            when this function was defined.
        verbose: When true, print a warning if the request had to be capped
            (same message as the variants defined in earlier cells).

    Returns:
        A list of head ids drawn without replacement.
    """
    clsid = graph.ent.to_id(cls)
    instances = [h for h, _, _ in graph.find_triples(r=graph.isaid, t=clsid)]

    # random.sample raises ValueError when asked for more items than exist.
    if size > len(instances):
        if verbose:
            print(f"WARNING: Can't sample {size} items out of {len(instances)} instances of class {cls}")
        size = len(instances)

    return random.sample(instances, size)
