In [1]:
from tools.datasets import *
from tools.sparql_wrapper import DBPediaQuery

In [2]:
# Load the "Semantic_Web" entity from the "dbpedia" provider via the project helper.
semweb_df = fetch_dataset("Semantic_Web", "dbpedia")

Dataset Semantic_Web.json already available. Downloading from https://dbpedia.org/data/Semantic_Web.json
Done


In [18]:
# Pick the entry keyed by the Semantic_Web resource URI out of the loaded data.
root = semweb_df["http://dbpedia.org/resource/Semantic_Web"]

In [8]:
# Until I manage to set up a SPARQL endpoint, I'll just work with the dumps I have.
import logging

def strip_prefix(prefix: str, payload: str):
    """Return `payload` without its leading `prefix`.

    When `payload` does not start with `prefix` it is returned unchanged.
    """
    has_prefix = payload.startswith(prefix)
    return payload[len(prefix):] if has_prefix else payload
    
def extract_wikidata_reference(name, df):
    """Return the Wikidata entity id that `df` links to through owl:sameAs.

    Args:
        name: Entity name; used only in the warning message.
        df: Frame of triples with a ``"p.value"`` column (predicate URI) and
            an ``"o.value"`` column (object value).

    Returns:
        The object value of the first row whose predicate is owl:sameAs and
        whose object starts with the Wikidata entity prefix followed by "Q",
        with that prefix removed (e.g. ``"Q42"``). ``None`` is returned, after
        logging a warning, when no row matches or the expected columns are
        missing.
    """
    prefix = "http://www.wikidata.org/entity/"
    owl_sameAs = "http://www.w3.org/2002/07/owl#sameAs"

    columns = getattr(df, "columns", ())
    if "p.value" in columns and "o.value" in columns:
        # Vectorised filter. astype(str) / na=False keep NaN or non-string
        # objects from raising; with the former bare `except`, such a row hid
        # every valid match behind the "not found" warning.
        is_same_as = df["p.value"] == owl_sameAs
        is_entity = df["o.value"].astype(str).str.startswith(prefix + "Q", na=False)
        matches = df.loc[is_same_as & is_entity, "o.value"]
        if not matches.empty:
            # Every match starts with `prefix`, so slicing strips it safely.
            return str(matches.iloc[0])[len(prefix):]

    # logging.warn is a deprecated alias of logging.warning (removed in 3.13).
    logging.warning(f"Could not find wikidata entity for entity {name}")
    return None

"""
def extract_wikipedia_entity(name):
    # keep it very simple - for now
    name = strip_prefix("http://dbpedia.org/page/", name)
    name = strip_prefix("http://dbpedia.org/resource/", name)
    return "http://wikipedia.org/wiki/" + name

    # TODO: Actually query the dataset to find wikipedia-en:
    
"""

def merge_with_wikidata(name: str):
    """Work in progress: align the DBpedia triples of `name` with Wikidata.

    Planned merging algorithm (not implemented yet):
      1. For all DBpedia triples, replace the original resource name with
         the Wikipedia link.
      2. Query Wikidata (the original note for this step is unfinished).
      3. Look for a relationship in Wikidata that links the very same
         entities and associate the link with the Wikidata property.

    Args:
        name: Entity name passed to ``fetch_dataset`` with the 'dbpedia'
            provider.

    Returns:
        None. So far only the two datasets are fetched.
    """
    entity_df = fetch_dataset(name, provider='dbpedia')
    wd_id = extract_wikidata_reference(name, entity_df)
    if wd_id is None:
        # No Wikidata id could be extracted; do not ask fetch_dataset for a
        # dataset named None.
        return None
    wd_df = fetch_dataset(wd_id, provider='wikidata')

    # TODO: only consider the rows that match the resource name.
    for row in entity_df.iterrows():
        pass



In [13]:
# Fetch the lowercase "Semantic_web" entity and print every row as
# "<subject> <predicate> <object>".
res = fetch_dataset("Semantic_web", provider="dbpedia")

for row in res.iterrows():
    # row is an (index, Series) pair; only the Series part is used.
    print(*(row[1][column] for column in ("s.value", "p.value", "o.value")))

Dataset not available, downloading from http://dbpedia.org/page/Semantic_web




http://dbpedia.org/resource/Semantic_spectrum http://dbpedia.org/ontology/wikiPageWikiLink http://dbpedia.org/resource/Semantic_web
http://dbpedia.org/resource/List_of_University_of_Maryland,_Baltimore_County_people http://dbpedia.org/ontology/wikiPageWikiLink http://dbpedia.org/resource/Semantic_web
http://dbpedia.org/resource/A.nnotate http://dbpedia.org/ontology/wikiPageWikiLink http://dbpedia.org/resource/Semantic_web
http://dbpedia.org/resource/Data_Reference_Model http://dbpedia.org/ontology/wikiPageWikiLink http://dbpedia.org/resource/Semantic_web
http://dbpedia.org/resource/Formal_concept_analysis http://dbpedia.org/ontology/wikiPageWikiLink http://dbpedia.org/resource/Semantic_web
http://dbpedia.org/resource/Terminology_extraction http://dbpedia.org/ontology/wikiPageWikiLink http://dbpedia.org/resource/Semantic_web
http://dbpedia.org/resource/Peter_Fox_(professor) http://dbpedia.org/ontology/field http://dbpedia.org/resource/Semantic_web
http://dbpedia.org/resource/Qitera http

In [15]:
# NOTE(review): `dbpedia_sparql` is not defined in any visible cell — presumably
# a DBPediaQuery instance; confirm it is created before this cell runs.
# Set a DESCRIBE query for :Semantic_web, run it and convert the response.
dbpedia_sparql.sparql.setQuery(dbpedia_sparql.gen_query("DESCRIBE :Semantic_web"))
res = dbpedia_sparql.sparql.query().convert()



In [17]:
# Display the "results" entry of the converted query response.
res["results"]

{'distinct': False,
 'ordered': True,
 'bindings': [{'s': {'type': 'uri',
    'value': 'http://dbpedia.org/resource/Semantic_spectrum'},
   'p': {'type': 'uri',
    'value': 'http://dbpedia.org/ontology/wikiPageWikiLink'},
   'o': {'type': 'uri', 'value': 'http://dbpedia.org/resource/Semantic_web'}},
  {'s': {'type': 'uri',
    'value': 'http://dbpedia.org/resource/List_of_University_of_Maryland,_Baltimore_County_people'},
   'p': {'type': 'uri',
    'value': 'http://dbpedia.org/ontology/wikiPageWikiLink'},
   'o': {'type': 'uri', 'value': 'http://dbpedia.org/resource/Semantic_web'}},
  {'s': {'type': 'uri', 'value': 'http://dbpedia.org/resource/A.nnotate'},
   'p': {'type': 'uri',
    'value': 'http://dbpedia.org/ontology/wikiPageWikiLink'},
   'o': {'type': 'uri', 'value': 'http://dbpedia.org/resource/Semantic_web'}},
  {'s': {'type': 'uri',
    'value': 'http://dbpedia.org/resource/Data_Reference_Model'},
   'p': {'type': 'uri',
    'value': 'http://dbpedia.org/ontology/wikiPageWiki

In [16]:
def find_redirects(entity_df: dict):
    """Collect the keys of `entity_df` that carry a wikiPageRedirects predicate.

    `entity_df` maps a key to a mapping of predicate -> value. Keys are
    returned in iteration order.
    """
    redirect_predicate = "http://dbpedia.org/ontology/wikiPageRedirects"
    return [
        subject
        for subject, predicates in entity_df.items()
        if any(predicate == redirect_predicate for predicate, _ in predicates.items())
    ]

def strip_prefix(prefix: str, payload) -> str:
    """Drop `prefix` from the start of `payload`; return `payload` as-is otherwise."""
    if not payload.startswith(prefix):
        return payload
    return payload[len(prefix):]

def add_wikipedia_prefix(prefix, name=None):
    """Return the English Wikipedia URL for the page `name`.

    Args:
        prefix: Not used to build the URL. Kept so existing two-argument
            calls keep working; when `name` is omitted, this single
            argument is taken as the page name.
        name: Page name appended to "http://en.wikipedia.org/wiki/".

    Returns:
        The concatenated URL string.
    """
    if name is None:
        # One-argument call form: add_wikipedia_prefix(page_name).
        name = prefix
    return "http://en.wikipedia.org/wiki/" + name

def add_dbpedia_prefix(prefix, name=None):
    """Return the DBpedia resource URI for `name`.

    Args:
        prefix: Not used to build the URI. Kept so existing two-argument
            calls keep working; when `name` is omitted, this single
            argument is taken as the resource name.
        name: Resource name appended to "http://dbpedia.org/resource/".

    Returns:
        The concatenated URI string.
    """
    if name is None:
        # One-argument call form: add_dbpedia_prefix(resource_name).
        name = prefix
    return "http://dbpedia.org/resource/" + name

"""
def fetch_redirects(entity_name: str, entity_df : dict):
    full_entity_name = get_dbpedia_link(entity_name)
    to_crawl = [strip_prefix("http://dbpedia.org/resource/", resource) for resource in find_redirects(entity_df)]
    
    entity_df_cloned = entity_df.copy()
    
    owl_sameas = "http://www.w3.org/2002/07/owl#sameAs"
    
    for redirected in to_crawl:
        redirected_df = fetch_dataset(redirected, provider="dbpedia")
        if redirected in redirected_df:
            to_copy = [key for key in list(redirected.keys()) \ 
                         if key not in [add_dbpedia_prefix(redirected), \
                                        add_wikipedia_prefix(redirected)] ]
            
            for k in to_copy:
                entity_df_cloned[full_entity_name][to_copy]
    
    
    return entity_df_cloned

"""

# res = fetch_redirects("Semantic_Web", semweb_df)

['Criticism_of_the_Semantic_Web', 'Semantic_Internet', 'Web_3.0', 'Semantic_web', 'Syntactic_Web', 'SemWeb', 'Semantic_internet', 'Semantic_integrity', 'The_semantic_web', 'Web_3', 'Semantic-web', 'Semweb']
Dataset Criticism_of_the_Semantic_Web.json already available. Downloading from https://dbpedia.org/data/Criticism_of_the_Semantic_Web.json
Done
Dataset Semantic_Internet.json already available. Downloading from https://dbpedia.org/data/Semantic_Internet.json
Done
Dataset Web_3.0.json already available. Downloading from https://dbpedia.org/data/Web_3.0.json
Done
Dataset Semantic_web.json already available. Downloading from https://dbpedia.org/data/Semantic_web.json
Done
Dataset Syntactic_Web.json already available. Downloading from https://dbpedia.org/data/Syntactic_Web.json
Done
Dataset SemWeb.json already available. Downloading from https://dbpedia.org/data/SemWeb.json
Done
Dataset Semantic_internet.json already available. Downloading from https://dbpedia.org/data/Semantic_internet

{'http://dbpedia.org/resource/Criticism_of_the_Semantic_Web': {'http://dbpedia.org/ontology/wikiPageRedirects': [{'type': 'uri',
    'value': 'http://dbpedia.org/resource/Semantic_Web'}]},
 'http://dbpedia.org/resource/Semantic_Internet': {'http://dbpedia.org/ontology/wikiPageRedirects': [{'type': 'uri',
    'value': 'http://dbpedia.org/resource/Semantic_Web'}]},
 'http://dbpedia.org/resource/Web_3.0': {'http://dbpedia.org/ontology/wikiPageRedirects': [{'type': 'uri',
    'value': 'http://dbpedia.org/resource/Semantic_Web'}]},
 'http://dbpedia.org/resource/Carole_Goble': {'http://dbpedia.org/ontology/field': [{'type': 'uri',
    'value': 'http://dbpedia.org/resource/Semantic_Web'}]},
 'http://dbpedia.org/resource/Wendy_Hall': {'http://dbpedia.org/ontology/field': [{'type': 'uri',
    'value': 'http://dbpedia.org/resource/Semantic_Web'}]},
 'http://dbpedia.org/resource/Semantic_web': {'http://dbpedia.org/ontology/wikiPageRedirects': [{'type': 'uri',
    'value': 'http://dbpedia.org/reso