## Variables

### Import libraries

In [58]:
import os
import urllib.parse as up
from rdflib import Graph, Namespace, Literal, URIRef
from rdflib.namespace import RDF, RDFS, SKOS, OWL, NamespaceManager
from SPARQLWrapper import SPARQLWrapper, TURTLE
import uuid
import sys
import json

### Define global variables

In [59]:
ont_file = "address_ont.ttl"
mapping_file = "mapping.json"
out_wikidata_file = "wikidata-temp.ttl"
out_addr_file = "addr-temp.ttl"
addr_file = "addresses.json"

temp_folder = "tmp_files"


if not os.path.exists(temp_folder):
    os.makedirs(temp_folder)

export_format = "TURTLE"

project_name = "paris_directories" # This repository must exist in graphdb
addr_graph_name = "addresses"

graphdb_url = "http://localhost:7200"
endpoint_url = "https://query.wikidata.org/sparql"
namespace_url = "http://rdf.geohistoricaldata.org/address#"

# ontorefine_cmd = "ontorefine-cli"
ontorefine_cmd = "/opt/ontotext-refine/lib/app/bin/ontorefine-cli"
# ontorefine_cmd = "/Applications/Ontotext\ Refine.app/Contents/app/bin/ontorefine-cli"


## Functions to create / import graphs

### Import created ttl file in GraphDB

In [60]:
def import_ttl_file_in_graphdb(graphdb_url, repository_id, ttl_file, graph_name):
    # cmd = f"curl -X POST -H \"Content-Type:application/x-turtle\" -T \"{ttl_file}\" {graphdb_url}/repositories/{repository_id}/statements"
    cmd = f"curl -X POST -H \"Content-Type:application/x-turtle\" -T \"{ttl_file}\" {graphdb_url}/repositories/{repository_id}/rdf-graphs/{graph_name}"

    os.system(cmd)

### Export query result

In [61]:
def get_query_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(TURTLE)
    return sparql.query().convert()

### Functions to create addresses from JSON file

In [107]:
def create_address(g:Graph, namespace_url:str, label:str, lang:str, landmarks:list, spatial_relations:list, target:int=None, datetime:str=None):
    namespace = Namespace(namespace_url)

    landmark_uris = []
    for landmark in landmarks:
        landmark_uri = create_landmark(g, namespace, landmark, lang)
        landmark_uris.append(landmark_uri)

    if target is None:
        target_uri = create_landmark(g, namespace, {}, lang)
    else:
        target_uri = landmark_uri[target]

    next_addr_seg_uri = None
    for spat_rel in reversed(spatial_relations):
        spatial_rel_uri = create_spatial_relation(g, namespace, spat_rel[0], lang)
        addr_seg_uri = generate_uri(namespace, "AS_")
        try:
            locatum = landmark_uris[spat_rel[1]]
        except:
            locatum = target_uri
        relatums = []
        for x in spat_rel[2]:
            try:
                relatums.append(landmark_uris[x])
            except IndexError:
                relatums.append(target_uri)

        create_addr_segment(g, namespace, spatial_rel_uri, addr_seg_uri, locatum, relatums, next_addr_seg_uri)
        next_addr_seg_uri = addr_seg_uri

    create_address_main_elem(g, namespace, label, target_uri, next_addr_seg_uri, lang, datetime)

def create_address_main_elem(g:Graph, namespace:Namespace, label, target_uri:URIRef, first_step_uri:URIRef, lang:str=None, datetime:str=None):
    addr_uri = generate_uri(namespace, "ADDR")
    g.add((addr_uri, RDF.type, namespace["Address"]))
    g.add((addr_uri, RDFS.label, Literal(label, lang=lang)))
    g.add((addr_uri, namespace["targets"], target_uri))
    g.add((addr_uri, namespace["firstStep"], first_step_uri))
    if datetime is not None:
        g.add((addr_uri, namespace["dateTime"], Literal(datetime, datatype="http://www.w3.org/2001/XMLSchema#dateTime")))

def create_landmark(g:Graph, namespace:Namespace, landmark_dict:dict, lang:str=None):
    landmark_types = {"thoroughfare":"Thoroughfare", "undefined":"Undefined", None: "Undefined", "city":"City", "housenumber":"HouseNumber", "district":"District"}
    l_uri = generate_uri(namespace, "LM")
    l_label = landmark_dict.get("label")
    l_type = landmark_types.get(landmark_dict.get("type"))

    g.add((l_uri, RDF.type, namespace["Landmark"]))
    g.add((l_uri, namespace["isLandmarkType"], namespace[l_type]))

    if l_label is not None:
        g.add((l_uri, RDFS.label, Literal(l_label, lang=lang)))

    return l_uri

def create_addr_segment(g:Graph, namespace:Namespace, spatial_rel_uri:URIRef, addr_seg_uri:URIRef, locatum:URIRef, relatums:list[URIRef], next_addr_segment:URIRef=None):
    if next_addr_segment is not None:
        g.add((addr_seg_uri, RDF.type, namespace["addressSegment"]))
        g.add((addr_seg_uri, namespace["nextStep"], next_addr_segment))
    else:
        g.add((addr_seg_uri, RDF.type, namespace["finalAddressSegment"]))

    g.add((addr_seg_uri, namespace["locatum"], locatum))

    for relatum in relatums:
        g.add((addr_seg_uri, namespace["relatum"], relatum))

    g.add((addr_seg_uri, namespace["isSpatialRelationType"], spatial_rel_uri))
    
def create_spatial_relation(g:Graph, namespace:Namespace, label:str, lang:str=None):
    spatial_rel = generate_uri(namespace, "SR")
    label_lang = Literal(label, lang=lang)
    g.add((spatial_rel, RDF.type, OWL.NamedIndividual))
    g.add((spatial_rel, RDF.type, namespace["AddressSegmentType"]))
    g.add((spatial_rel, RDF.type, SKOS.Concept))
    g.add((spatial_rel, RDFS.label, label_lang))
    return spatial_rel

def generate_uri(namespace:Namespace, prefix:str=None):
    if prefix is not None:
        return namespace[f"{prefix}_{uuid.uuid4().hex}"]
    else:
        return namespace[uuid.uuid4().hex]

## Process to create / import data

### Queries to build graph from Wikidata

:warning: Impossible to chain queries for Wikidata, it has to be split

In [63]:
# Get streets of Paris, with label, altLabel and their locations
query1 = """
PREFIX addr: <http://rdf.geohistoricaldata.org/address#>

CONSTRUCT {
 ?street a addr:Landmark;
           addr:isLandmarkType addr:Thoroughfare;
           rdfs:label ?streetLabel;
           skos:altLabel ?streetAltLabel;
           addr:within ?loc.
}
WHERE {
  { ?street p:P361 [ps:P361 wd:Q16024163]. }
  UNION
  { ?street p:P361 [ps:P361 wd:Q107311481]. }
  ?street wdt:P131 ?loc.
  ?street rdfs:label ?streetLabel.
  FILTER (LANG(?streetLabel) = "fr")
  OPTIONAL {?street skos:altLabel ?streetAltLabel. FILTER (LANG(?streetAltLabel) = "fr")}
}
"""

# Get streets of Paris with the history of their official name
query2 = """
PREFIX addr: <http://rdf.geohistoricaldata.org/address#>

CONSTRUCT {
  ?street addr:hasOfficialName _:bn.
  _:bn rdfs:label ?officialName; addr:startDate ?startDate; addr:endDate ?endDate.
}
WHERE {
  { ?street p:P361 [ps:P361 wd:Q16024163]. }
  UNION
  { ?street p:P361 [ps:P361 wd:Q107311481]. }
    ?street p:P1448 ?officialNameSt. 
    ?officialNameSt ps:P1448 ?officialName.
    OPTIONAL{?officialNameSt pq:P580 ?startDate}
    OPTIONAL{?officialNameSt pq:P582 ?endDate}
    #FILTER (LANG(?officialName) = "fr")
}
"""

# Get related data of municipal arrondissements of Paris
query3 = """
PREFIX addr: <http://rdf.geohistoricaldata.org/address#>

CONSTRUCT {
  ?arrdt a addr:Landmark;
           addr:isLandmarkType addr:District;
           rdfs:label ?arrdtLabel;
           skos:altLabel ?arrdtAltLabel;
           addr:within wd:Q90;
           addr:startDate ?startDate.
}
WHERE {
  ?arrdt wdt:P31 wd:Q702842; p:P131 [ps:P131 wd:Q90]; rdfs:label ?arrdtLabel; skos:altLabel ?arrdtAltLabel.
  FILTER(LANG(?arrdtLabel) = "fr" && LANG(?arrdtAltLabel) = "fr")
  OPTIONAL {?arrdt wdt:P571 ?startDate}
}
"""

# Get related data of quartiers of Paris
query4 = """
PREFIX addr: <http://rdf.geohistoricaldata.org/address#>

CONSTRUCT {
  ?quartier a addr:Landmark;
           addr:isLandmarkType addr:District;
           rdfs:label ?quartierLabel;
           skos:altLabel ?quartierAltLabel;
           addr:within ?loc.
}
WHERE {
  ?quartier wdt:P31 wd:Q252916; rdfs:label ?quartierLabel; skos:altLabel ?quartierAltLabel; p:P131 [ps:P131 ?loc].
  FILTER(LANG(?quartierLabel) = "fr" && LANG(?quartierAltLabel) = "fr")
}
"""

# Get related data of Paris
query5 = """
PREFIX addr: <http://rdf.geohistoricaldata.org/address#>

CONSTRUCT {
  ?paris a addr:Landmark;
           addr:isLandmarkType addr:City;
           rdfs:label ?parisLabel;
           skos:altLabel ?parisAltLabel.
}
WHERE {
  BIND (wd:Q90 AS ?paris)
  ?paris rdfs:label ?parisLabel; skos:altLabel ?parisAltLabel.
  FILTER(LANG(?parisLabel) = "fr" && LANG(?parisAltLabel) = "fr")
}
"""

### Extract street data of Paris from Wikidata

In [64]:
abs_out_wikidata_file = os.path.abspath(os.path.join(temp_folder, out_wikidata_file))
abs_out_addr_file = os.path.abspath(os.path.join(temp_folder, out_addr_file))


In [65]:
g = get_query_results(endpoint_url, query1)
g += get_query_results(endpoint_url, query2)
g += get_query_results(endpoint_url, query3)
g += get_query_results(endpoint_url, query4)
g += get_query_results(endpoint_url, query5)

g.serialize(destination=abs_out_wikidata_file)



<Graph identifier=N44d13419450c4189994d2a7f64f5ca68 (<class 'rdflib.graph.ConjunctiveGraph'>)>

### Remove all data of repository whose id is `project_name`

In [108]:
cmd = f"curl -X DELETE -H \"Content-Type:application/x-turtle\" {graphdb_url}/repositories/{project_name}/statements"
os.system(cmd)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0


0

### Import ontology

In [109]:
abs_ont_file = os.path.abspath(ont_file)
cmd = f"curl -X POST -H \"Content-Type:application/x-turtle\" -T \"{abs_ont_file}\" {graphdb_url}/repositories/{project_name}/statements"

os.system(cmd)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

100 19366    0     0  100 19366      0   196k --:--:-- --:--:-- --:--:--  197k


0

### Import created graph from Wikidata

In [110]:
cmd = f"curl -X POST -H \"Content-Type:application/x-turtle\" -T \"{abs_out_wikidata_file}\" {graphdb_url}/repositories/{project_name}/rdf-graphs/wikidata"
os.system(cmd)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

100 1987k    0     0  100 1987k      0  1613k  0:00:01  0:00:01 --:--:-- 1613k


0

### Structure data from `addr_file` to export it in `out_file`

In [111]:
f = open(os.path.abspath(addr_file))
data = json.load(f)
f.close()

g = Graph()

for addr in data["addresses"]:
    try:
        create_address(g, namespace_url, addr.get("label"), addr.get("lang"), addr.get("landmarks"), addr.get("spatial_relations"), addr.get("target"), addr.get("date"))
    except TypeError:
        pass

g.serialize(destination=os.path.join(abs_out_addr_file))

<Graph identifier=N6a48c41758224731a5e311d75d3a2da0 (<class 'rdflib.graph.Graph'>)>

### Import created graph from directories

In [112]:
cmd = f"curl -X POST -H \"Content-Type:application/x-turtle\" -T \"{abs_out_addr_file}\" {graphdb_url}/repositories/{project_name}/rdf-graphs/{addr_graph_name}"
os.system(cmd)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  4525    0     0  100  4525      0  43656 --:--:-- --:--:-- --:--:-- 43932


0

### Get links between data extracted from wikidata and data from sources

In [113]:
query_template = """
PREFIX addr: <http://rdf.geohistoricaldata.org/address#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>

INSERT {{
    GRAPH <{graphdb_url}/repositories/{project_name}/rdf-graphs/{graph_name}> {{?streetG1 owl:sameAs ?streetG2.}}
}}
WHERE {{
    GRAPH <{graphdb_url}/repositories/{project_name}/rdf-graphs/{graph_name}> {{
        ?streetG1 addr:isLandmarkType addr:Thoroughfare ; rdfs:label ?streetNameG1; addr:dateTime ?date.
        BIND (LCASE(?streetNameG1) AS ?lcStreetNameG1)
    }}
    GRAPH <{graphdb_url}/repositories/{project_name}/rdf-graphs/wikidata> {{
        ?streetG2 addr:isLandmarkType addr:Thoroughfare ; rdfs:label ?streetG2Name; addr:hasOfficialName ?offNameSt.
        ?offNameSt rdfs:label ?offName.
        OPTIONAL{{?offNameSt addr:startDate ?sd}}
        OPTIONAL{{?offNameSt addr:endDate ?ed}}
        BIND(IF(BOUND(?sd), ?sd, "{neg_inf_date}"^^xsd:dateTime) AS ?startDate)
        BIND(IF(BOUND(?ed), ?ed, "{pos_inf_date}"^^xsd:dateTime) AS ?endDate)
        FILTER (?date >= ?startDate && ?date <= ?endDate)
        BIND (LCASE(?offName) AS ?lcStreetNameG2)
    }}
    FILTER (?lcStreetNameG1 = ?lcStreetNameG2)
}}
"""

### Get `owl:sameAs` links for each street of each source graph with street of Wikidata graph

In [114]:
pos_inf_date = "6000-01-01"
neg_inf_date = "-6000-01-01"

query = query_template.format(graph_name=addr_graph_name,
                                neg_inf_date=neg_inf_date, pos_inf_date=pos_inf_date,
                                project_name=project_name, graphdb_url=graphdb_url)
query_encoded = up.quote(query)
cmd = f"curl -X POST -H \"Content-Type:application/x-www-form-urlencoded\" -d \"update={query_encoded}\" {graphdb_url}/repositories/{project_name}/statements"

# os.system(cmd)



PREFIX addr: <http://rdf.geohistoricaldata.org/address#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>

INSERT {
    GRAPH <http://localhost:7200/repositories/paris_directories/rdf-graphs/addresses> {?streetG1 owl:sameAs ?streetG2.}
}
WHERE {
    GRAPH <http://localhost:7200/repositories/paris_directories/rdf-graphs/addresses> {
        ?streetG1 addr:isLandmarkType addr:Thoroughfare ; rdfs:label ?streetNameG1; addr:dateTime ?date.
        BIND (LCASE(?streetNameG1) AS ?lcStreetNameG1)
    }
    GRAPH <http://localhost:7200/repositories/paris_directories/rdf-graphs/wikidata> {
        ?streetG2 addr:isLandmarkType addr:Thoroughfare ; rdfs:label ?streetG2Name; addr:hasOfficialName ?offNameSt.
        ?offNameSt rdfs:label ?offName.
        OPTIONAL{?offNameSt addr:startDate ?sd}
        OPTIONAL{?offNameSt addr:endDate ?ed}
        BIND(IF(BOUND(?sd), ?sd, "-6000-01-01"^^xsd:dateTime) AS