# SPARQL Playground

<a href="https://githubtocolab.com/gleanerio/archetype/blob/master/networks/commons/sparql.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.png" alt="Open in Colab"/></a>


## requirements.txt

In [None]:
!pip install -q minio
!pip install -q kglab
!pip install -q sparqlwrapper
!pip install -q pygraphml


## imports

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)  ## remove pandas future warning
import kglab
from minio import Minio
from rdflib import Graph, plugin
import plotly.express as px
import pandas as pd
from urllib.request import urlopen
import os,json

from pygraphml import GraphMLParser
from pygraphml import Graph as GraphML 

In [2]:
def ensure_directory_exists(path):
    if not os.path.exists(path):
        os.makedirs(path)

def popper(input):
    lines = input.decode().split('\n') # Split input into separate lines
    modified_lines = []

    for line in lines:
        newline = line.replace("http://schema.org", "https://schema.org")
        segments = newline.split(' ')

        if len(segments) > 3:
            segments.pop()   # Remove the last two segment
            segments.pop()
            new_line = ' '.join(segments) + ' .'
            modified_lines.append(new_line)

    result_string = '\n'.join(modified_lines)

    return(result_string)

def publicurls(client, bucket, prefix):
    urls = []
    objects = client.list_objects(bucket, prefix=prefix, recursive=True)
    for obj in objects:
        result = client.stat_object(bucket, obj.object_name)

        if result.size > 0:  #  how to tell if an objet   obj.is_public  ?????
            url = client.presigned_get_object(bucket, obj.object_name)
            # print(f"Public URL for object: {url}")
            urls.append(url)

    return urls


## Local file


In [3]:
# Check for using GPU, in case you want to ensure your GPU is used
# gc = kglab.get_gpu_count()
# print(gc)

In [4]:
# if you need to list the current URLs as a public S3, use something like this to get it

client = Minio("ossapi.oceaninfohub.org:80",  secure=False) # Create client with anonymous access.
urls = publicurls(client, "public", "graph")
for u in urls:
    print(u)

http://ossapi.oceaninfohub.org/public/graphs/summonedafricaioc_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonedaquadocs_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonedcioos_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonededmerp_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonededmo_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonedemodnet_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonedinanodc_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonedinvemardocuments_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonedinvemarexperts_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonedinvemarinstitutions_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonedinvemartraining_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonedinvemarvessels_v1_release.nq
http://ossapi.oceaninfohub.org/public/graphs/summonedmarinet

## URLs

At this point we have the URLs, and we could either loop load all of them or pull one out manually and use.  This code could
be used as a basis for any of these approaches.


In [5]:
dgurl = "http://ossapi.oceaninfohub.org/public/graphs/summonedcioos_v1_release.nq"
# df = urlopen(dgurl)
dg = urlopen(dgurl).read()
rp = popper(dg)

In [6]:
namespaces = {
    "sh":   "http://www.w3.org/ns/shacl#" ,
    "schema": "https://schema.org/"
}

kg = kglab.KnowledgeGraph(
    name = "Schema.org based datagraph",
    base_uri = "https://example.org/id/",
    namespaces = namespaces,
)

try:
    g = Graph().parse(data=rp, format='nt')
    r = g.serialize(format='nt')
    kg.load_rdf_text(r)
except Exception as e:
    print("Exception: {}\n --".format(str(e)))
    raise e

print("Graph loaded with {} triples".format(len(g)))

Graph loaded with 145779 triples


In [7]:
sparql = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>


SELECT ?p (COUNT(?p) as ?count)
WHERE
{
  ?s ?p ?o .
}
GROUP BY ?p ORDER BY DESC(?count)
"""

pdf = kg.query_as_df(sparql)
# df = pdf   # .to_pandas()  #  breaks with papermill for reasons unknown at this time if to_pandas() is used, needed in my kglab conda env


In [8]:
pdf.head()

Unnamed: 0,p,count
0,rdf:type,25425
1,schema:keywords,19507
2,schema:name,14406
3,schema:url,8505
4,schema:description,6168


In [11]:
sparql = """
PREFIX schema: <https://schema.org/>


SELECT DISTINCT ?s ?desc ?name
WHERE
{
 ?s rdf:type ?type
   FILTER ( ?type IN (schema:ResearchProject, schema:Project, schema:Organization, 
   schema:Dataset, schema:CreativeWork, schema:Person, schema:Map, schema:Course,
   schema:CourseInstance, schema:Event, schema:Vehicle) )
   ?s schema:description ?desc .
   ?s schema:name ?name

}
"""

pdf = kg.query_as_df(sparql)
# df = pdf   # .to_pandas()  #  breaks with papermill for reasons unknown at this time if to_pandas() is used, needed in my kglab conda env


In [12]:
pdf.head(20)

Unnamed: 0,s,desc,name
0,<https://catalogue.cioos.ca/dataset/ab3684f1-7...,The Sea-Bird SeaCAT SBE16plus V2 5270 was depl...,Strait of Georgia East Conductivity Temperatur...
1,<https://catalogue.cioos.ca/dataset/ab3684f1-7...,The Sea-Bird SeaCAT SBE16plus V2 5270 was depl...,Strait of Georgia East Conductivité/Températur...
2,<https://catalogue.cioos.ca/dataset/ab3684f1-7...,Ce Sea-Bird SeaCAT SBE16plus V2 5270 a été dép...,Strait of Georgia East Conductivity Temperatur...
3,<https://catalogue.cioos.ca/dataset/ab3684f1-7...,Ce Sea-Bird SeaCAT SBE16plus V2 5270 a été dép...,Strait of Georgia East Conductivité/Températur...
4,<https://catalogue.cioos.ca/dataset/2580e0b0-d...,Ce Sea-Bird SeaCAT SBE19plus V2 6813 a été dép...,Barkley Canyon Upper Slope Conductivité/Tempér...
5,<https://catalogue.cioos.ca/dataset/2580e0b0-d...,Ce Sea-Bird SeaCAT SBE19plus V2 6813 a été dép...,Barkley Upper Slope Conductivity Temperature D...
6,<https://catalogue.cioos.ca/dataset/2580e0b0-d...,The Sea-Bird SeaCAT SBE19plus V2 6813 was depl...,Barkley Canyon Upper Slope Conductivité/Tempér...
7,<https://catalogue.cioos.ca/dataset/2580e0b0-d...,The Sea-Bird SeaCAT SBE19plus V2 6813 was depl...,Barkley Upper Slope Conductivity Temperature D...
8,<https://catalogue.cioos.ca/dataset/ca-cioos_e...,La température et la position de la surface de...,Déploiement de dispositifs dérivants suivis pa...
9,<https://catalogue.cioos.ca/dataset/ca-cioos_e...,La température et la position de la surface de...,Argos Satellite Tracked Drifters deployed from...


In [13]:
rq_pcount = """SELECT ?p (COUNT(?p) as ?pCount)
WHERE
{
  ?s ?p ?o .
}
GROUP BY ?p 
ORDER BY DESC(?count)
"""

pdf = kg.query_as_df(rq_pcount)
pdf.head()

Unnamed: 0,p,pCount
0,rdf:type,25425
1,schema:variableMeasured,2933
2,schema:url,8505
3,schema:name,14406
4,schema:keywords,19507


In [14]:
rq_desc = """PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?s ?name ?desc
WHERE
{
  ?s <https://schema.org/name> ?name .
  ?s rdf:type <https://schema.org/Dataset> .
  ?s <https://schema.org/description> ?desc .
}
LIMIT 200
"""

pdf = kg.query_as_df(rq_desc)
pdf.head(10)

Unnamed: 0,s,name,desc
0,<https://catalogue.cioos.ca/dataset/ab3684f1-7...,Strait of Georgia East Conductivity Temperatur...,The Sea-Bird SeaCAT SBE16plus V2 5270 was depl...
1,<https://catalogue.cioos.ca/dataset/ab3684f1-7...,Strait of Georgia East Conductivité/Températur...,The Sea-Bird SeaCAT SBE16plus V2 5270 was depl...
2,<https://catalogue.cioos.ca/dataset/ab3684f1-7...,Strait of Georgia East Conductivity Temperatur...,Ce Sea-Bird SeaCAT SBE16plus V2 5270 a été dép...
3,<https://catalogue.cioos.ca/dataset/ab3684f1-7...,Strait of Georgia East Conductivité/Températur...,Ce Sea-Bird SeaCAT SBE16plus V2 5270 a été dép...
4,<https://catalogue.cioos.ca/dataset/2580e0b0-d...,Barkley Canyon Upper Slope Conductivité/Tempér...,Ce Sea-Bird SeaCAT SBE19plus V2 6813 a été dép...
5,<https://catalogue.cioos.ca/dataset/2580e0b0-d...,Barkley Upper Slope Conductivity Temperature D...,Ce Sea-Bird SeaCAT SBE19plus V2 6813 a été dép...
6,<https://catalogue.cioos.ca/dataset/2580e0b0-d...,Barkley Canyon Upper Slope Conductivité/Tempér...,The Sea-Bird SeaCAT SBE19plus V2 6813 was depl...
7,<https://catalogue.cioos.ca/dataset/2580e0b0-d...,Barkley Upper Slope Conductivity Temperature D...,The Sea-Bird SeaCAT SBE19plus V2 6813 was depl...
8,<https://catalogue.cioos.ca/dataset/ca-cioos_e...,Déploiement de dispositifs dérivants suivis pa...,La température et la position de la surface de...
9,<https://catalogue.cioos.ca/dataset/ca-cioos_e...,Argos Satellite Tracked Drifters deployed from...,La température et la position de la surface de...


## Remote SPARQL server

In [15]:
#@title
def get_sparql_dataframe(service, query):
    """
    Helper function to convert SPARQL results into a Pandas data frame.
    """
    sparql = SPARQLWrapper(service)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    result = sparql.query()

    processed_results = json.load(result.response)
    cols = processed_results['head']['vars']

    out = []
    for row in processed_results['results']['bindings']:
        item = []
        for c in cols:
            item.append(row.get(c, {}).get('value'))
        out.append(item)

    return pd.DataFrame(out, columns=cols)

In [16]:
rp1 = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX schema: <https://schema.org/>

SELECT DISTINCT ?source ?type ?target ?sType ?tType
WHERE {
  graph ?g {
    ?source a ?sType .
    ?target a ?tType .
    ?source ?type ?target .
    FILTER((?sType) IN (schema:Person, schema:Organization, schema:Dataset, schema:Course, schema:Document))
    FILTER((?tType) IN (schema:Person, schema:Organization, schema:Dataset, schema:Course, schema:Document))
  }
}

"""

ep = "http://0.0.0.0:7878/query"

df = get_sparql_dataframe(ep, rp1)
df.head(10)

NameError: name 'SPARQLWrapper' is not defined

In [37]:
g = GraphML()

#  yeah, I get it...   don't iterate rows...  PR's welcome for this, being bad is too easy!  
# for index, row in nodes.iterrows():
#    g.add_node(row['Id'])

for index, row in df.iterrows():
    n1 = g.add_node(row['source'])
    n1['type'] = row['sType']
    n2 = g.add_node(row['target'])
    n2['type'] = row['tType']
    e = g.add_edge(n1, n2)
    e['predicate'] = row['type']


In [38]:
fname = "./output/testGraphML.xml"
parser = GraphMLParser()
parser.write(g, fname)