In [1]:
# Create/Load the config xml template
# extract all classes and their properties from dbpedia ontology
# For each class find coverage of each property
# write config file for each class
    # write properties with coverage higher than set threshold 

In [87]:
# Imports
import xml.etree.ElementTree as ET
from SPARQLWrapper import SPARQLWrapper, JSON
import time, os, re, copy
from xml.dom import minidom

In [88]:
# Read template xml file to a string
sample_xml = './limes_config_template.xml'
sample_file = ET.parse(sample_xml)

In [89]:
# Sparql endpoints
sparql_en = SPARQLWrapper(endpoint="http://porque.cs.upb.de:8890/sparql", defaultGraph="http://www.upb.de/en-dbp2016-10")
sparql_de = SPARQLWrapper(endpoint="http://porque.cs.upb.de:8890/sparql", defaultGraph="http://www.upb.de/de-dbp2016-10")
sparql_fr = SPARQLWrapper(endpoint="http://porque.cs.upb.de:8890/sparql", defaultGraph="http://www.upb.de/fr-dbp2016-10")
sparql_es = SPARQLWrapper(endpoint="http://porque.cs.upb.de:8890/sparql", defaultGraph="http://www.upb.de/es-dbp2016-10")

sparql_map = {
    'en': sparql_en,
    'de': sparql_de,
    'es': sparql_es,
    'fr': sparql_fr
}


In [90]:
# SPARQL - for each class find all its properties ond the number of times it appeared in its instances
propcount_sparql = '''
SELECT ?prop (count(?prop) as ?propcount) WHERE
{
?res a <%s> .
?res ?prop ?obj .
?prop <http://www.w3.org/2000/01/rdf-schema#domain> <%s> .
FILTER STRSTARTS( str(?prop), "http://dbpedia.org/ontology/" )
}
GROUP BY ?prop
'''
# SPARQL - for each class count the number of instances it has
rescount_sparql = '''
select ?class (count(?res) as ?instancecount) where {
?res a ?class .
FILTER STRSTARTS( str(?class), "http://dbpedia.org/ontology/" )
} 
GROUP BY ?class
'''

restriction_tmplt_x = '?x a dbo:%s'
restriction_tmplt_y = '?y a dbo:%s'

In [91]:
# Extract instance counts for all classes
crm_lang_map = dict()
for lang in sparql_map.keys():
    sparql_ln = sparql_map[lang]
    class_rescount_map = dict()
    
    sparql_ln.setQuery(rescount_sparql)
    sparql_ln.setReturnFormat(JSON)
    results = sparql_ln.query().convert()

    for result in results["results"]["bindings"]:
        classUri = result["class"]["value"]
        instanceCount = result["instancecount"]["value"]
        class_rescount_map[classUri] = instanceCount
    crm_lang_map[lang] = class_rescount_map
    print(lang,'class->instance-count map size:',len(class_rescount_map))


en class->instance-count map size: 421
de class->instance-count map size: 138
es class->instance-count map size: 158
fr class->instance-count map size: 199


In [92]:
# Extract property counts for all classes and divide them by instance counts to get coverage
class_rescount_map = crm_lang_map['en']
class_propcov_map = dict()
xml_map = dict()
for classUri in class_rescount_map.keys():
    # ignore classes with zero instances
    if class_rescount_map[classUri] == 0:
        continue
    print('Querying for class', classUri)
    class_propcov_map[classUri] = dict()
    # Run query 
    sparql_en.setQuery(propcount_sparql%(classUri, classUri))
    sparql_en.setReturnFormat(JSON)
    results = sparql_en.query().convert()
    
    print('Results found:', len(results["results"]["bindings"]))
    propTags = []
    # Adding rdfs label tag by default
    prop_tag = ET.Element('PROPERTY')
    prop_tag.text = 'rdfs:label AS nolang->lowercase'
    propTags.append(prop_tag)
    # this array is just for logging
    propLbls = []
    for result in results["results"]["bindings"]:
        propUri = result["prop"]["value"]
        propCount = result["propcount"]["value"]
        propCov = int(propCount)/int(class_rescount_map[classUri])
        class_propcov_map[classUri][propUri] = propCov
        if(propCov > 0.4):
            # Create Property tags
            prop_tag = ET.Element('PROPERTY')
            # prop_tag.text = '<'+propUri+'>'
            prop_tag.text = 'dbo:' + propUri.split('/')[-1]
            propLbls.append(prop_tag.text)
            propTags.append(prop_tag)
    print('accepted properties:', propLbls)
    if(len(propTags) > 1):
        # Create XML Config
        xml_obj = copy.deepcopy(sample_file)
        # Create restrictions with class URI
        restr_tagx = ET.Element('RESTRICTION')
        restr_tagy = ET.Element('RESTRICTION')
        class_label = classUri.split('/')[-1]
        restr_tagx.text = restriction_tmplt_x%(class_label)
        restr_tagy.text = restriction_tmplt_y%(class_label)
        
        # Find source and target
        source_ele = xml_obj.getroot().find('SOURCE')
        target_ele = xml_obj.getroot().find('TARGET')
        # add source elements
        source_ele.append(restr_tagx)
        source_ele.extend(propTags)
        # add target elements
        target_ele.append(restr_tagy)
        target_ele.extend(propTags)
        # append to xml list
        # print('Position in array:', len(xml_arr))
        xml_map[classUri] = xml_obj

Querying for class http://dbpedia.org/ontology/FootballLeagueSeason
Results found: 0
accepted properties: []
Querying for class http://dbpedia.org/ontology/MusicalWork
Results found: 0
accepted properties: []
Querying for class http://dbpedia.org/ontology/PokerPlayer
Results found: 0
accepted properties: []
Querying for class http://dbpedia.org/ontology/RoadJunction
Results found: 1
accepted properties: ['dbo:meetingRoad']
Querying for class http://dbpedia.org/ontology/SkiArea
Results found: 0
accepted properties: []
Querying for class http://dbpedia.org/ontology/MouseGeneLocation
Results found: 0
accepted properties: []
Querying for class http://dbpedia.org/ontology/EducationalInstitution
Results found: 0
accepted properties: []
Querying for class http://dbpedia.org/ontology/VolleyballPlayer
Results found: 2
accepted properties: []
Querying for class http://dbpedia.org/ontology/SolarEclipse
Results found: 0
accepted properties: []
Querying for class http://dbpedia.org/ontology/Railway

In [93]:
len(class_propcov_map)

421

In [94]:
len(xml_map)

95

In [95]:
outfile_tmplt = './limes-cfg/porque_enrich_limes_config_%s_%s.xml'
predfile_tmplt = 'predictions_%s_%s.nt'
revfile_tmplt = 'predictions_rev_%s_%s.nt'
count = 1
for class_name in xml_map.keys():
    xml_obj = xml_map[class_name]
    count_str = str(count)
    # For each language (de/es/fr)
    for lang in ['de','fr','es']:
        # check their map for the class. If class exists, then create the config file for that language
        if (not class_name in crm_lang_map[lang]) or crm_lang_map[lang][class_name] == 0:
            continue
        file_name = outfile_tmplt%(lang, count)
        predfile = predfile_tmplt%(lang, count)
        revfile = revfile_tmplt%(lang, count)
        # change prediction and review file name
        xml_obj.getroot().find('ACCEPTANCE').find('FILE').text = predfile
        xml_obj.getroot().find('REVIEW').find('FILE').text = revfile
        # change the map for query
        target_graph = xml_obj.getroot().find('TARGET').find('GRAPH')
        target_graph.text = sparql_map[lang]._defaultGraph
        # Convert obj to string and write 
        # xmlstr = minidom.parseString(ET.tostring(xml_obj.getroot())).toprettyxml(indent="    ",encoding="utf-8")
        with open(file_name, "wb") as f:
            # f.write(xmlstr)
            f.write('<?xml version="1.0" encoding="utf-8"?>\n<!DOCTYPE LIMES SYSTEM "limes.dtd">\n'.encode('utf8'))
            ET.ElementTree(xml_obj.getroot()).write(f, 'utf-8')
    count+=1
print('Done')

Done


In [33]:
#
# Obsolete code
#

outfile_tmplt = '/data-disk/kg-fusion/limes-cfg/porque_enrich_limes_config_%s.xml'
predfile_tmplt = 'predictions_%s.nt'
revfile_tmplt = 'predictions_rev_%s.nt'
count = 1
for class_name in xml_map.keys():
    xml_obj = xml_map[class_name]
    count_str = str(count)
    file_name = outfile_tmplt%count
    predfile = predfile_tmplt%count
    revfile = revfile_tmplt%count
    # change prediction and review file name
    xml_obj.getroot().find('ACCEPTANCE').find('FILE').text = predfile
    xml_obj.getroot().find('REVIEW').find('FILE').text = revfile
    
    # Convert obj to string and write 
    xmlstr = minidom.parseString(ET.tostring(xml_obj.getroot())).toprettyxml(indent="    ",encoding="utf-8")
    with open(file_name, "wb") as f:
        f.write(xmlstr)
    count+=1
print('Done')

Done


In [65]:
# Scrap code snippets below

In [25]:
print(sample_file.getroot().find('SOURCE').find('ID').text)

en-dbpedia2016-10


In [28]:
source_ele = sample_file.getroot().find('SOURCE')
restriction = ET.Element('RESTRICTION')
restriction.text = '?x a <%s>'
source_ele.append(restriction)

In [48]:
for item in sample_file.getroot().find('SOURCE')
print(item.tag, item.text)

SyntaxError: invalid syntax (<ipython-input-48-aa963a4be64b>, line 1)