In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, HiveContext
from pyspark.sql.functions import *

In [2]:
from IPython.display import display
import pandas as pd

In [3]:
from PySPARQL.Wrapper import PySPARQLWrapper

In [4]:
import timeit
import warnings
# Suppress every Python warning for the rest of the kernel session
# (presumably to keep the benchmark cell outputs uncluttered).
# NOTE(review): this is global, so deprecation/runtime warnings raised by any
# library used below are hidden as well.
warnings.filterwarnings('ignore')

### Connection to Apache Spark

In [5]:
# Build the SparkSession that is passed to every PySPARQLWrapper in this notebook.
spark = (SparkSession
          .builder
          .appName("interfacing spark sql to hive metastore without configuration file")
          # Metastore location is set inline here (the app name notes that no
          # configuration file is used).
          .config("hive.metastore.uris", "thrift://hive-metastore:9083")
          # Turn on Hive support for the session.
          .enableHiveSupport()
          # Reuse the session already running in this kernel, or create one.
          .getOrCreate())

### SPARQL query endpoint

In [6]:
# SPARQL endpoint URL handed to every PySPARQLWrapper created in the cells below.
# NOTE(review): the host name suggests an Ontop service reachable on the same
# network as the notebook — confirm against the deployment.
sparql_endpoint = "http://ontop:8080/sparql"

### Timeit configuration

In [7]:
# Executions of the timed snippet per sample (passed as timeit's `number`).
loop = 1
# Timing samples collected per query (passed as timeit's `repeat`).
repeat = 10

def Average(lst):
    """Return the arithmetic mean of the numbers in ``lst``.

    Used by the benchmark cells to average the list of timings returned by
    ``timeit.repeat``.

    Args:
        lst: Sized collection of numbers (anything accepted by ``sum`` and ``len``).

    Returns:
        ``sum(lst) / len(lst)``.

    Raises:
        ZeroDivisionError: If ``lst`` is empty.
    """
    # Built-in sum replaces the manual index loop; true division keeps a float result.
    return sum(lst) / len(lst)

### Q1

In [111]:
# Benchmark Q1: collect `repeat` timing samples, each running the snippet `loop` time(s).
# Q1 returns distinct (label, productPropertyNumeric1) pairs for bsbm:Product
# resources with numeric1 > 102 that also match a bsbm:Producer with a homepage,
# a schema:Review whose reviewer has a name and country, and a schema:Offer;
# rows are ordered by label and capped at 10.
# The timed statements are: wrapper construction, `wrapper.query`, and reading
# `result.dataFrame`.
# NOTE(review): no action such as `toPandas()` is timed — confirm that reading
# `dataFrame` already materialises the rows, otherwise the timings under-report.

code ='''
query = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>
        PREFIX schema: <http://schema.org/>
        PREFIX rev: <http://purl.org/stuff/rev#>
        PREFIX edm: <http://www.europeana.eu/schemas/edm/>
        PREFIX dc: <http://purl.org/dc/elements/1.1/>
        PREFIX gr: <http://purl.org/goodrelations/v1#>
        PREFIX dcterms: <http://purl.org/dc/terms/>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX bsbm: <http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/>

        SELECT DISTINCT ?label ?value
        WHERE {
            ?product rdfs:label ?label .
            ?product bsbm:productPropertyNumeric1 ?value .
            ?product rdf:type bsbm:Product .
            ?product bsbm:producer ?producer .
            ?producer rdf:type bsbm:Producer .
            ?producer foaf:homepage ?hp .
            ?review bsbm:reviewFor ?product .
            ?review rdf:type schema:Review .
            ?review rev:reviewer ?pers .
            ?pers foaf:name ?fn .
            ?pers edm:country ?cn .
            ?offer bsbm:product ?product .
            ?offer rdf:type schema:Offer .
            FILTER (?value > 102)
        }
        ORDER BY ?label
        LIMIT 10
"""

wrapper = PySPARQLWrapper(spark, sparql_endpoint)
result = wrapper.query(query)
resultDF = result.dataFrame
'''

# `globals=globals()` lets the snippet resolve notebook-level names
# (PySPARQLWrapper, spark, sparql_endpoint); one elapsed time is returned per sample.
result = timeit.repeat(code, repeat=repeat, globals=globals() , number=loop)

# Raw samples first, then their mean via the notebook's `Average` helper.
print("TIME")
print(result)

print("\nAVG_TIME")
print(Average(result))

TIME
[1.9698936240001785, 1.776905861999694, 1.6356732259991986, 1.2230020500001046, 1.3031294909997087, 1.8124921300004644, 1.3802593139998862, 1.3958868429999711, 1.5133209960004024, 1.4760353740002756]

AVG_TIME
1.5486598909999885


In [112]:
# Run Q1 once, outside the timer, and show the rows it returns.
# The query text duplicates the one timed in the previous cell; keep both in sync.

query = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>
        PREFIX schema: <http://schema.org/>
        PREFIX rev: <http://purl.org/stuff/rev#>
        PREFIX edm: <http://www.europeana.eu/schemas/edm/>
        PREFIX dc: <http://purl.org/dc/elements/1.1/>
        PREFIX gr: <http://purl.org/goodrelations/v1#>
        PREFIX dcterms: <http://purl.org/dc/terms/>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX bsbm: <http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/>

        SELECT DISTINCT ?label ?value
        WHERE {
            ?product rdfs:label ?label .
            ?product bsbm:productPropertyNumeric1 ?value .
            ?product rdf:type bsbm:Product .
            ?product bsbm:producer ?producer .
            ?producer rdf:type bsbm:Producer .
            ?producer foaf:homepage ?hp .
            ?review bsbm:reviewFor ?product .
            ?review rdf:type schema:Review .
            ?review rev:reviewer ?pers .
            ?pers foaf:name ?fn .
            ?pers edm:country ?cn .
            ?offer bsbm:product ?product .
            ?offer rdf:type schema:Offer .
            FILTER (?value > 102)
        }
        ORDER BY ?label
        LIMIT 10
"""

# Send the query through PySPARQL and take the result's `dataFrame` attribute.
wrapper = PySPARQLWrapper(spark, sparql_endpoint)
result = wrapper.query(query)
resultDF = result.dataFrame

# Convert to pandas via `toPandas()` and render it as a table (not a plot).
pandasDF = resultDF.toPandas()
display(pandasDF)

Unnamed: 0,label,value
0,'ahchoo',594
1,'coterie',1891
2,'desolates waging reveilles',374
3,'lignites rallying specters',133
4,'manner gatemen',831
5,'procreators taiwanese antigene',1504
6,'reexhibit wrang tarts',940
7,'resettling uncoagulated lowish',1743
8,'tither pettiness',1900
9,'vacillator mortifying',834


### Q2

In [113]:
# Benchmark Q2: collect `repeat` timing samples, each running the snippet `loop` time(s).
# Q2 returns product details (label, comment, producer, offer price, textual
# properties 1-5 and numeric properties 1, 2 and 4) for products that also have
# a dc:publisher and at least one offer with a price. There is no FILTER,
# ORDER BY or LIMIT, so every matching product/offer combination is returned.
# The timed statements are: wrapper construction, `wrapper.query`, and reading
# `result.dataFrame`.
# NOTE(review): no action such as `toPandas()` is timed — confirm that reading
# `dataFrame` already materialises the rows, otherwise the timings under-report.

code ='''
query = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>
        PREFIX schema: <http://schema.org/>
        PREFIX rev: <http://purl.org/stuff/rev#>
        PREFIX edm: <http://www.europeana.eu/schemas/edm/>
        PREFIX dc: <http://purl.org/dc/elements/1.1/>
        PREFIX gr: <http://purl.org/goodrelations/v1#>
        PREFIX dcterms: <http://purl.org/dc/terms/>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX bsbm: <http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/>

        SELECT ?label ?comment ?producer ?price
            ?propertyTextual1 ?propertyTextual2 ?propertyTextual3
            ?propertyNumeric1 ?propertyNumeric2 ?propertyTextual4
            ?propertyTextual5 ?propertyNumeric4
        WHERE {
            ?p rdfs:label ?label .
            ?p rdfs:comment ?comment .
            ?p bsbm:producer ?producer .
            ?p dc:publisher ?ps .
            ?offer bsbm:product ?p .
            ?offer bsbm:price ?price .
            ?p bsbm:productPropertyTextual1 ?propertyTextual1 .
            ?p bsbm:productPropertyTextual2 ?propertyTextual2 .
            ?p bsbm:productPropertyTextual3 ?propertyTextual3 .
            ?p bsbm:productPropertyNumeric1 ?propertyNumeric1 .
            ?p bsbm:productPropertyNumeric2 ?propertyNumeric2 .
            ?p bsbm:productPropertyTextual4 ?propertyTextual4 .
            ?p bsbm:productPropertyTextual5 ?propertyTextual5 .
            ?p bsbm:productPropertyNumeric4 ?propertyNumeric4 .
        }
"""

wrapper = PySPARQLWrapper(spark, sparql_endpoint)
result = wrapper.query(query)
resultDF = result.dataFrame
'''

# `globals=globals()` lets the snippet resolve notebook-level names
# (PySPARQLWrapper, spark, sparql_endpoint); one elapsed time is returned per sample.
result = timeit.repeat(code, repeat=repeat, globals=globals() , number=loop)

# Raw samples first, then their mean via the notebook's `Average` helper.
print("TIME")
print(result)

print("\nAVG_TIME")
print(Average(result))

TIME
[2.6481286369998998, 2.013073076999717, 1.3909098190006262, 1.7910241549998318, 1.4756068000006053, 1.5118295810007112, 2.581986535999931, 1.5044501989996206, 1.2923217909992673, 1.2762878249995993]

AVG_TIME
1.7485618419999809


In [114]:
# Run Q2 once, outside the timer, and show the rows it returns.
# The query text duplicates the one timed in the previous cell; keep both in sync.

query = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>
        PREFIX schema: <http://schema.org/>
        PREFIX rev: <http://purl.org/stuff/rev#>
        PREFIX edm: <http://www.europeana.eu/schemas/edm/>
        PREFIX dc: <http://purl.org/dc/elements/1.1/>
        PREFIX gr: <http://purl.org/goodrelations/v1#>
        PREFIX dcterms: <http://purl.org/dc/terms/>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX bsbm: <http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/>

        SELECT ?label ?comment ?producer ?price
            ?propertyTextual1 ?propertyTextual2 ?propertyTextual3
            ?propertyNumeric1 ?propertyNumeric2 ?propertyTextual4
            ?propertyTextual5 ?propertyNumeric4
        WHERE {
            ?p rdfs:label ?label .
            ?p rdfs:comment ?comment .
            ?p bsbm:producer ?producer .
            ?p dc:publisher ?ps .
            ?offer bsbm:product ?p .
            ?offer bsbm:price ?price .
            ?p bsbm:productPropertyTextual1 ?propertyTextual1 .
            ?p bsbm:productPropertyTextual2 ?propertyTextual2 .
            ?p bsbm:productPropertyTextual3 ?propertyTextual3 .
            ?p bsbm:productPropertyNumeric1 ?propertyNumeric1 .
            ?p bsbm:productPropertyNumeric2 ?propertyNumeric2 .
            ?p bsbm:productPropertyTextual4 ?propertyTextual4 .
            ?p bsbm:productPropertyTextual5 ?propertyTextual5 .
            ?p bsbm:productPropertyNumeric4 ?propertyNumeric4 .
        }
"""

# Send the query through PySPARQL and take the result's `dataFrame` attribute.
wrapper = PySPARQLWrapper(spark, sparql_endpoint)
result = wrapper.query(query)
resultDF = result.dataFrame

# Convert to pandas via `toPandas()` and render it as a table (not a plot).
# The query has no LIMIT, so the whole result set is converted.
pandasDF = resultDF.toPandas()
display(pandasDF)

Unnamed: 0,label,comment,producer,price,propertyTextual1,propertyTextual2,propertyTextual3,propertyNumeric1,propertyNumeric2,propertyTextual4,propertyTextual5,propertyNumeric4
0,'ahchoo','chanceman ventrals phlegmy vower matureness f...,http://example.com/Producer/1,8.56176E3,'whirs radiation overman violative adulators b...,'vichies resituates breads visard unfought adj...,'unreels voicer acidifiers shredding fistula u...,594,434,,'hyperbolas knouted eulogists',-1
1,'ahchoo','chanceman ventrals phlegmy vower matureness f...,http://example.com/Producer/1,3.03146E3,'whirs radiation overman violative adulators b...,'vichies resituates breads visard unfought adj...,'unreels voicer acidifiers shredding fistula u...,594,434,,'hyperbolas knouted eulogists',-1
2,'vacillator mortifying','workaholics situating repartees mobilizers an...,http://example.com/Producer/1,8.35118E3,'egoisms welterweight friendship topsoiling se...,'unacceptance antibody reinflamed rehardens','steeper aerifies unum overbalanced discipline...,834,450,,,202
3,'desolates waging reveilles','poleward sagest impellers enjoyed tailpipes r...,http://example.com/Producer/1,9.54292E3,'catarrhs triadisms foreseen wennier scurviest...,'chilblains intertribal balsamic exotism reint...,'digged glockenspiels hearted sapid weeny coll...,374,536,'limeades iterances expressionist sculked supp...,,813
4,'tither pettiness','buskin recompensable capacitances bootee lock...,http://example.com/Producer/1,7.09958E3,'solidest incarnation arrayers gruelingly hono...,'builders preadjusting cpl fosterage trulls fa...,'dermatologies flopover noctambulation frizzle...,1900,774,,,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
195,'reexhibit wrang tarts','bihourly prosiest matrixes jaggedest violinis...,http://example.com/Producer/1,7.63992E3,'skinfuls uncertainty craving incas maenades f...,'quartered mishmosh booms globoid syndicating ...,'visas deprecatingly conceptualist reactivatio...,940,290,,,-1
196,'tither pettiness','buskin recompensable capacitances bootee lock...,http://example.com/Producer/1,9.20969E3,'solidest incarnation arrayers gruelingly hono...,'builders preadjusting cpl fosterage trulls fa...,'dermatologies flopover noctambulation frizzle...,1900,774,,,-1
197,'resettling uncoagulated lowish','reprice renovating chevrolets refolds fantast...,http://example.com/Producer/1,2.06776E3,'unrestored kneeler chaplet newts deckle vegas...,'replicates corks cinematheques charmers licit...,'cowiest crimpers unmuzzles repacify poilu',1743,136,,,-1
198,'manner gatemen','lordlings dialyzed hoardings palmitate resist...,http://example.com/Producer/1,1.49017E3,'guzzling jillion psychotherapists substantiat...,'recommendation embezzler reconviction misprop...,'decentralizations impacting promulgations bib...,831,312,'distracts universally trashily enervator',,-1


### Q3

In [115]:
# Benchmark Q3: collect `repeat` timing samples, each running the snippet `loop` time(s).
# Q3 returns (product, label, numeric1, numeric3) for products that have a
# producer and a schema:Review, with numeric1 > 1800 and numeric3 < 5;
# rows are ordered by label and capped at 10.
# The timed statements are: wrapper construction, `wrapper.query`, and reading
# `result.dataFrame`.
# NOTE(review): no action such as `toPandas()` is timed — confirm that reading
# `dataFrame` already materialises the rows, otherwise the timings under-report.

code ='''
query = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>
        PREFIX schema: <http://schema.org/>
        PREFIX rev: <http://purl.org/stuff/rev#>
        PREFIX edm: <http://www.europeana.eu/schemas/edm/>
        PREFIX dc: <http://purl.org/dc/elements/1.1/>
        PREFIX gr: <http://purl.org/goodrelations/v1#>
        PREFIX dcterms: <http://purl.org/dc/terms/>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX bsbm: <http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/>

        SELECT ?product ?label ?p1 ?p3
        WHERE {
            ?product rdfs:label ?label .
            ?product bsbm:producer ?producer .
            ?review bsbm:reviewFor ?product .
            ?review rdf:type schema:Review .
            ?product bsbm:productPropertyNumeric1 ?p1 .
            ?product bsbm:productPropertyNumeric3 ?p3 .
            FILTER (?p1 > 1800)
            FILTER (?p3 < 5 )
        }
        ORDER BY ?label
        LIMIT 10
"""

wrapper = PySPARQLWrapper(spark, sparql_endpoint)
result = wrapper.query(query)
resultDF = result.dataFrame
'''

# `globals=globals()` lets the snippet resolve notebook-level names
# (PySPARQLWrapper, spark, sparql_endpoint); one elapsed time is returned per sample.
result = timeit.repeat(code, repeat=repeat, globals=globals() , number=loop)

# Raw samples first, then their mean via the notebook's `Average` helper.
print("TIME")
print(result)

print("\nAVG_TIME")
print(Average(result))

TIME
[1.27481309399991, 0.6865023899999869, 0.8077077090001694, 1.0947284320000108, 1.0224303380000492, 0.9940615169998637, 0.8249554069998339, 0.8511201930004972, 0.8374584960001812, 0.7681802589995641]

AVG_TIME
0.9161957835000066


In [116]:
# Run Q3 once, outside the timer, and show the rows it returns.
# The query text duplicates the one timed in the previous cell; keep both in sync.
# NOTE(review): the saved output of this cell is an empty table — check whether
# the two FILTER thresholds are intended for this dataset.

query = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>
        PREFIX schema: <http://schema.org/>
        PREFIX rev: <http://purl.org/stuff/rev#>
        PREFIX edm: <http://www.europeana.eu/schemas/edm/>
        PREFIX dc: <http://purl.org/dc/elements/1.1/>
        PREFIX gr: <http://purl.org/goodrelations/v1#>
        PREFIX dcterms: <http://purl.org/dc/terms/>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX bsbm: <http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/>

        SELECT ?product ?label ?p1 ?p3
        WHERE {
            ?product rdfs:label ?label .
            ?product bsbm:producer ?producer .
            ?review bsbm:reviewFor ?product .
            ?review rdf:type schema:Review .
            ?product bsbm:productPropertyNumeric1 ?p1 .
            ?product bsbm:productPropertyNumeric3 ?p3 .
            FILTER (?p1 > 1800)
            FILTER (?p3 < 5 )
        }
        ORDER BY ?label
        LIMIT 10
"""

# Send the query through PySPARQL and take the result's `dataFrame` attribute.
wrapper = PySPARQLWrapper(spark, sparql_endpoint)
result = wrapper.query(query)
resultDF = result.dataFrame

# Convert to pandas via `toPandas()` and render it as a table (not a plot).
pandasDF = resultDF.toPandas()
display(pandasDF)

Unnamed: 0,product,label,p1,p3


### Q4

In [117]:
# Benchmark Q4: collect `repeat` timing samples, each running the snippet `loop` time(s).
# Q4 returns distinct (label, producer country, textual property 1, numeric1)
# for products with numeric1 > 630 whose producer has a country and homepage,
# that have an offer with a gr:validFrom value and a review whose reviewer has
# a name and country; rows are ordered by label and capped at 10.
# The timed statements are: wrapper construction, `wrapper.query`, and reading
# `result.dataFrame`.
# NOTE(review): no action such as `toPandas()` is timed — confirm that reading
# `dataFrame` already materialises the rows, otherwise the timings under-report.

code ='''
query = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>
        PREFIX schema: <http://schema.org/>
        PREFIX rev: <http://purl.org/stuff/rev#>
        PREFIX edm: <http://www.europeana.eu/schemas/edm/>
        PREFIX dc: <http://purl.org/dc/elements/1.1/>
        PREFIX gr: <http://purl.org/goodrelations/v1#>
        PREFIX dcterms: <http://purl.org/dc/terms/>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX bsbm: <http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/>

        SELECT DISTINCT ?label ?c ?propertyTextual ?p1
        WHERE {
            ?product rdfs:label ?label .
            ?product bsbm:producer ?pcr .
            ?pcr edm:country ?c .
            ?pcr foaf:homepage ?h .
            ?offer gr:validFrom ?vf .
            ?offer bsbm:product ?product .
            ?review bsbm:reviewFor ?product .
            ?review rev:reviewer ?pers .
            ?pers foaf:name ?fn .
            ?pers edm:country ?cn .
            ?product bsbm:productPropertyTextual1 ?propertyTextual .
            ?product bsbm:productPropertyNumeric1 ?p1 .
            FILTER (?p1 > 630)
        }
        ORDER BY ?label
        LIMIT 10
"""

wrapper = PySPARQLWrapper(spark, sparql_endpoint)
result = wrapper.query(query)
resultDF = result.dataFrame
'''

# `globals=globals()` lets the snippet resolve notebook-level names
# (PySPARQLWrapper, spark, sparql_endpoint); one elapsed time is returned per sample.
result = timeit.repeat(code, repeat=repeat, globals=globals() , number=loop)

# Raw samples first, then their mean via the notebook's `Average` helper.
print("TIME")
print(result)

print("\nAVG_TIME")
print(Average(result))

TIME
[1.3783949899998333, 1.5600478980004482, 1.270196228999339, 1.571367683999597, 1.4738308010000765, 0.9898938100004671, 1.1092333090000466, 1.1712226170002396, 1.0478586429999268, 1.1309514750000744]

AVG_TIME
1.270299745600005


In [118]:
# Run Q4 once, outside the timer, and show the rows it returns.
# The query text duplicates the one timed in the previous cell; keep both in sync.

query = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>
        PREFIX schema: <http://schema.org/>
        PREFIX rev: <http://purl.org/stuff/rev#>
        PREFIX edm: <http://www.europeana.eu/schemas/edm/>
        PREFIX dc: <http://purl.org/dc/elements/1.1/>
        PREFIX gr: <http://purl.org/goodrelations/v1#>
        PREFIX dcterms: <http://purl.org/dc/terms/>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX bsbm: <http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/>

        SELECT DISTINCT ?label ?c ?propertyTextual ?p1
        WHERE {
            ?product rdfs:label ?label .
            ?product bsbm:producer ?pcr .
            ?pcr edm:country ?c .
            ?pcr foaf:homepage ?h .
            ?offer gr:validFrom ?vf .
            ?offer bsbm:product ?product .
            ?review bsbm:reviewFor ?product .
            ?review rev:reviewer ?pers .
            ?pers foaf:name ?fn .
            ?pers edm:country ?cn .
            ?product bsbm:productPropertyTextual1 ?propertyTextual .
            ?product bsbm:productPropertyNumeric1 ?p1 .
            FILTER (?p1 > 630)
        }
        ORDER BY ?label
        LIMIT 10
"""

# Send the query through PySPARQL and take the result's `dataFrame` attribute.
wrapper = PySPARQLWrapper(spark, sparql_endpoint)
result = wrapper.query(query)
resultDF = result.dataFrame

# Convert to pandas via `toPandas()` and render it as a table (not a plot).
pandasDF = resultDF.toPandas()
display(pandasDF)

Unnamed: 0,label,c,propertyTextual,p1
0,'coterie',DE,'scalded decoct practitioners infolds levered ...,1891
1,'manner gatemen',DE,'guzzling jillion psychotherapists substantiat...,831
2,'procreators taiwanese antigene',DE,'exceptionally replan aiming bedstraws tragica...,1504
3,'reexhibit wrang tarts',DE,'skinfuls uncertainty craving incas maenades f...,940
4,'resettling uncoagulated lowish',DE,'unrestored kneeler chaplet newts deckle vegas...,1743
5,'tither pettiness',DE,'solidest incarnation arrayers gruelingly hono...,1900
6,'vacillator mortifying',DE,'egoisms welterweight friendship topsoiling se...,834


### Q5

In [101]:
# Benchmark Q5: collect `repeat` timing samples, each running the snippet `loop` time(s).
# Q5 returns distinct (label, numeric1, numeric2) for products with
# numeric1 < 120 and numeric2 < 170 whose label is not "wineskins banded crc"
# and that have a producer, a schema:Review and a schema:Offer;
# rows are ordered by label and capped at 5.
# The timed statements are: wrapper construction, `wrapper.query`, and reading
# `result.dataFrame`.
# NOTE(review): no action such as `toPandas()` is timed — confirm that reading
# `dataFrame` already materialises the rows, otherwise the timings under-report.

code ='''
query = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>
        PREFIX schema: <http://schema.org/>
        PREFIX rev: <http://purl.org/stuff/rev#>
        PREFIX edm: <http://www.europeana.eu/schemas/edm/>
        PREFIX dc: <http://purl.org/dc/elements/1.1/>
        PREFIX gr: <http://purl.org/goodrelations/v1#>
        PREFIX dcterms: <http://purl.org/dc/terms/>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX bsbm: <http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/>

        SELECT DISTINCT ?productLabel ?simProperty1 ?simProperty2
        WHERE {
            ?product rdfs:label ?productLabel .
            ?product bsbm:productPropertyNumeric1 ?simProperty1 .
            ?product bsbm:productPropertyNumeric2 ?simProperty2 .
            ?product bsbm:producer ?producer .
            ?review bsbm:reviewFor ?product .
            ?review rdf:type schema:Review .
            ?offer bsbm:product ?product .
            ?offer rdf:type schema:Offer .
            FILTER (?simProperty1 < 120)
            FILTER (?productLabel != "wineskins banded crc")
            FILTER (?simProperty2 < 170)
        }
        ORDER BY ?productLabel
        LIMIT 5
"""

wrapper = PySPARQLWrapper(spark, sparql_endpoint)
result = wrapper.query(query)
resultDF = result.dataFrame
'''

# `globals=globals()` lets the snippet resolve notebook-level names
# (PySPARQLWrapper, spark, sparql_endpoint); one elapsed time is returned per sample.
result = timeit.repeat(code, repeat=repeat, globals=globals() , number=loop)

# Raw samples first, then their mean via the notebook's `Average` helper.
print("TIME")
print(result)

print("\nAVG_TIME")
print(Average(result))

TIME
[1.2590977930003646, 1.173136956000235, 1.1335630889998356, 1.1543671050003468, 1.1225111929998093, 1.156528384000012, 1.1621529079993707, 1.2123576220001269, 1.293971555999633, 1.3598593779997827]

AVG_TIME
1.2027545983999517


In [102]:
# Run Q5 once, outside the timer, and show the rows it returns.
# The query text duplicates the one timed in the previous cell; keep both in sync.
# NOTE(review): the saved output of this cell is an empty table — check whether
# the FILTER thresholds are intended for this dataset.

query = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>
        PREFIX schema: <http://schema.org/>
        PREFIX rev: <http://purl.org/stuff/rev#>
        PREFIX edm: <http://www.europeana.eu/schemas/edm/>
        PREFIX dc: <http://purl.org/dc/elements/1.1/>
        PREFIX gr: <http://purl.org/goodrelations/v1#>
        PREFIX dcterms: <http://purl.org/dc/terms/>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX bsbm: <http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/>

        SELECT DISTINCT ?productLabel ?simProperty1 ?simProperty2
        WHERE {
            ?product rdfs:label ?productLabel .
            ?product bsbm:productPropertyNumeric1 ?simProperty1 .
            ?product bsbm:productPropertyNumeric2 ?simProperty2 .
            ?product bsbm:producer ?producer .
            ?review bsbm:reviewFor ?product .
            ?review rdf:type schema:Review .
            ?offer bsbm:product ?product .
            ?offer rdf:type schema:Offer .
            FILTER (?simProperty1 < 120)
            FILTER (?productLabel != "wineskins banded crc")
            FILTER (?simProperty2 < 170)
        }
        ORDER BY ?productLabel
        LIMIT 5
"""

# Send the query through PySPARQL and take the result's `dataFrame` attribute.
wrapper = PySPARQLWrapper(spark, sparql_endpoint)
result = wrapper.query(query)
resultDF = result.dataFrame

# Convert to pandas via `toPandas()` and render it as a table (not a plot).
pandasDF = resultDF.toPandas()
display(pandasDF)

Unnamed: 0,productLabel,simProperty1,simProperty2


### Q7

In [103]:
# Benchmark Q7: collect `repeat` timing samples, each running the snippet `loop` time(s).
# Q7 targets the single product whose IRI string is
# "http://example.com/Product/9": it joins the product's offers (price, vendor,
# validTo; price > 5000) with its reviews (title, rating1, rating2) and each
# reviewer's name, where the reviewer is a foaf:Person. There is no ORDER BY
# or LIMIT, so every offer/review combination is returned.
# The timed statements are: wrapper construction, `wrapper.query`, and reading
# `result.dataFrame`.
# NOTE(review): no action such as `toPandas()` is timed — confirm that reading
# `dataFrame` already materialises the rows, otherwise the timings under-report.

code ='''
query = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>
        PREFIX schema: <http://schema.org/>
        PREFIX rev: <http://purl.org/stuff/rev#>
        PREFIX edm: <http://www.europeana.eu/schemas/edm/>
        PREFIX dc: <http://purl.org/dc/elements/1.1/>
        PREFIX gr: <http://purl.org/goodrelations/v1#>
        PREFIX dcterms: <http://purl.org/dc/terms/>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX bsbm: <http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/>

        SELECT ?productLabel ?price ?vendor ?revTitle
        ?reviewer ?rating1 ?rating2 ?product ?revName
        WHERE {
            ?product rdfs:label ?productLabel .
            ?product rdf:type bsbm:Product .
            ?offer bsbm:product ?product .
            ?offer bsbm:price ?price .
            ?offer bsbm:vendor ?vendor .
            ?offer bsbm:validTo ?date .
            ?review bsbm:reviewFor ?product .
            ?review rev:reviewer ?reviewer .
            ?review dc:title ?revTitle .
            ?review bsbm:rating1 ?rating1 .
            ?review bsbm:rating2 ?rating2 .
            ?reviewer foaf:name ?revName .
            ?reviewer a foaf:Person .
            FILTER (?price > 5000)
            FILTER (str(?product) = "http://example.com/Product/9")
        }
"""

wrapper = PySPARQLWrapper(spark, sparql_endpoint)
result = wrapper.query(query)
resultDF = result.dataFrame
'''

# `globals=globals()` lets the snippet resolve notebook-level names
# (PySPARQLWrapper, spark, sparql_endpoint); one elapsed time is returned per sample.
result = timeit.repeat(code, repeat=repeat, globals=globals() , number=loop)

# Raw samples first, then their mean via the notebook's `Average` helper.
print("TIME")
print(result)

print("\nAVG_TIME")
print(Average(result))

TIME
[1.5266206239994062, 1.4740476220003984, 1.4697140189991842, 1.4501250019993677, 1.4997575239995058, 1.5712831569999253, 1.492969947000347, 1.4701914369998121, 1.4815737540002374, 1.913996013000542]

AVG_TIME
1.5350279098998727


In [104]:
# Run Q7 once, outside the timer, and show the rows it returns.
# The query text duplicates the one timed in the previous cell; keep both in sync.

query = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>
        PREFIX schema: <http://schema.org/>
        PREFIX rev: <http://purl.org/stuff/rev#>
        PREFIX edm: <http://www.europeana.eu/schemas/edm/>
        PREFIX dc: <http://purl.org/dc/elements/1.1/>
        PREFIX gr: <http://purl.org/goodrelations/v1#>
        PREFIX dcterms: <http://purl.org/dc/terms/>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX bsbm: <http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/>

        SELECT ?productLabel ?price ?vendor ?revTitle
        ?reviewer ?rating1 ?rating2 ?product ?revName
        WHERE {
            ?product rdfs:label ?productLabel .
            ?product rdf:type bsbm:Product .
            ?offer bsbm:product ?product .
            ?offer bsbm:price ?price .
            ?offer bsbm:vendor ?vendor .
            ?offer bsbm:validTo ?date .
            ?review bsbm:reviewFor ?product .
            ?review rev:reviewer ?reviewer .
            ?review dc:title ?revTitle .
            ?review bsbm:rating1 ?rating1 .
            ?review bsbm:rating2 ?rating2 .
            ?reviewer foaf:name ?revName .
            ?reviewer a foaf:Person .
            FILTER (?price > 5000)
            FILTER (str(?product) = "http://example.com/Product/9")
        }
"""

# Send the query through PySPARQL and take the result's `dataFrame` attribute.
wrapper = PySPARQLWrapper(spark, sparql_endpoint)
result = wrapper.query(query)
resultDF = result.dataFrame

# Convert to pandas via `toPandas()` and render it as a table (not a plot).
pandasDF = resultDF.toPandas()
display(pandasDF)

Unnamed: 0,productLabel,price,vendor,revTitle,reviewer,rating1,rating2,product,revName
0,'procreators taiwanese antigene',8092.71,1,'improves pervertedly kevils canneries',http://example.com/Person/3,7,10,http://example.com/Product/9,'Danijela-Adalbrand'
1,'procreators taiwanese antigene',9236.12,1,'improves pervertedly kevils canneries',http://example.com/Person/3,7,10,http://example.com/Product/9,'Danijela-Adalbrand'
2,'procreators taiwanese antigene',7269.04,1,'slavishness muonic unencumbered horsepox vine...,http://example.com/Person/2,5,10,http://example.com/Product/9,'Eyana-Aurelianus'
3,'procreators taiwanese antigene',9236.12,1,'slavishness muonic unencumbered horsepox vine...,http://example.com/Person/2,5,10,http://example.com/Product/9,'Eyana-Aurelianus'
4,'procreators taiwanese antigene',9236.12,1,'recharting wardrobes divvies rhetorics phanta...,http://example.com/Person/3,-1,4,http://example.com/Product/9,'Danijela-Adalbrand'
5,'procreators taiwanese antigene',8725.34,1,'heehawed underway knickers scroungier hypnoti...,http://example.com/Person/3,-1,4,http://example.com/Product/9,'Danijela-Adalbrand'
6,'procreators taiwanese antigene',6957.79,1,'footworn forms nonflammable abator editoriali...,http://example.com/Person/5,8,5,http://example.com/Product/9,'Przemek-Berte'
7,'procreators taiwanese antigene',8725.34,1,'slavishness muonic unencumbered horsepox vine...,http://example.com/Person/2,5,10,http://example.com/Product/9,'Eyana-Aurelianus'
8,'procreators taiwanese antigene',9236.12,1,'footworn forms nonflammable abator editoriali...,http://example.com/Person/5,8,5,http://example.com/Product/9,'Przemek-Berte'
9,'procreators taiwanese antigene',8725.34,1,'improves pervertedly kevils canneries',http://example.com/Person/3,7,10,http://example.com/Product/9,'Danijela-Adalbrand'


### Q8

In [105]:
# Benchmark Q8: collect `repeat` timing samples, each running the snippet `loop` time(s).
# Q8 returns the distinct reviews of the product whose IRI string is
# "http://example.com/Product/9": title, text, review date, reviewer and
# reviewer name, and ratings 1-4. The product must also have a label, textual
# property 1 and a producer with a country and homepage. Rows are ordered by
# review date, newest first, and capped at 9.
# The timed statements are: wrapper construction, `wrapper.query`, and reading
# `result.dataFrame`.
# NOTE(review): no action such as `toPandas()` is timed — confirm that reading
# `dataFrame` already materialises the rows, otherwise the timings under-report.

code ='''
query = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>
        PREFIX schema: <http://schema.org/>
        PREFIX rev: <http://purl.org/stuff/rev#>
        PREFIX edm: <http://www.europeana.eu/schemas/edm/>
        PREFIX dc: <http://purl.org/dc/elements/1.1/>
        PREFIX gr: <http://purl.org/goodrelations/v1#>
        PREFIX dcterms: <http://purl.org/dc/terms/>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX bsbm: <http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/>

        SELECT DISTINCT ?title ?text ?reviewDate ?reviewer
                        ?reviewerName ?rating1 ?rating2
                        ?rating3 ?rating4 ?product
        WHERE {
            ?product rdfs:label  ?label .
            ?product bsbm:productPropertyTextual1 ?pt .
            ?product bsbm:producer ?producer .
            ?producer edm:country ?c .
            ?producer foaf:homepage ?h .
            ?review bsbm:reviewFor ?product .
            ?review dc:title ?title .
            ?review rev:text ?text .
            ?review bsbm:reviewDate ?reviewDate .
            ?review rev:reviewer ?reviewer .
            ?review bsbm:rating1 ?rating1 .
            ?review bsbm:rating2 ?rating2 .
            ?review bsbm:rating3 ?rating3 .
            ?review bsbm:rating4 ?rating4 .
            ?reviewer foaf:name ?reviewerName .
            ?reviewer a foaf:Person .
            FILTER (str(?product) = "http://example.com/Product/9")
        }
        ORDER BY DESC(?reviewDate)
        LIMIT 9
"""

wrapper = PySPARQLWrapper(spark, sparql_endpoint)
result = wrapper.query(query)
resultDF = result.dataFrame
'''

# `globals=globals()` lets the snippet resolve notebook-level names
# (PySPARQLWrapper, spark, sparql_endpoint); one elapsed time is returned per sample.
result = timeit.repeat(code, repeat=repeat, globals=globals() , number=loop)

# Raw samples first, then their mean via the notebook's `Average` helper.
print("TIME")
print(result)

print("\nAVG_TIME")
print(Average(result))

TIME
[1.7959132030000546, 1.7305863510000563, 1.8966450699999768, 1.9551388760000918, 1.8935270570000284, 1.8750050440003179, 1.926615794999634, 2.159048728999551, 1.9835431880001124, 1.8991601660000015]

AVG_TIME
1.9115183478999824


In [106]:
# Run Q8 once, outside the timer, and show the rows it returns.
# The query text duplicates the one timed in the previous cell; keep both in sync.

query = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>
        PREFIX schema: <http://schema.org/>
        PREFIX rev: <http://purl.org/stuff/rev#>
        PREFIX edm: <http://www.europeana.eu/schemas/edm/>
        PREFIX dc: <http://purl.org/dc/elements/1.1/>
        PREFIX gr: <http://purl.org/goodrelations/v1#>
        PREFIX dcterms: <http://purl.org/dc/terms/>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX bsbm: <http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/>

        SELECT DISTINCT ?title ?text ?reviewDate ?reviewer
                        ?reviewerName ?rating1 ?rating2
                        ?rating3 ?rating4 ?product
        WHERE {
            ?product rdfs:label  ?label .
            ?product bsbm:productPropertyTextual1 ?pt .
            ?product bsbm:producer ?producer .
            ?producer edm:country ?c .
            ?producer foaf:homepage ?h .
            ?review bsbm:reviewFor ?product .
            ?review dc:title ?title .
            ?review rev:text ?text .
            ?review bsbm:reviewDate ?reviewDate .
            ?review rev:reviewer ?reviewer .
            ?review bsbm:rating1 ?rating1 .
            ?review bsbm:rating2 ?rating2 .
            ?review bsbm:rating3 ?rating3 .
            ?review bsbm:rating4 ?rating4 .
            ?reviewer foaf:name ?reviewerName .
            ?reviewer a foaf:Person .
            FILTER (str(?product) = "http://example.com/Product/9")
        }
        ORDER BY DESC(?reviewDate)
        LIMIT 9
"""

# Send the query through PySPARQL and take the result's `dataFrame` attribute.
wrapper = PySPARQLWrapper(spark, sparql_endpoint)
result = wrapper.query(query)
resultDF = result.dataFrame

# Convert to pandas via `toPandas()` and render it as a table (not a plot).
pandasDF = resultDF.toPandas()
display(pandasDF)

Unnamed: 0,title,text,reviewDate,reviewer,reviewerName,rating1,rating2,rating3,rating4,product
0,'heehawed underway knickers scroungier hypnoti...,'brisker astrophysicists elations chlorines va...,2008-03-23,http://example.com/Person/3,'Danijela-Adalbrand',-1,4,-1,9,http://example.com/Product/9
1,'affirmance subagency diptych donne tipoffs ro...,'southwesterners denier bicycles veneers super...,2008-01-16,http://example.com/Person/4,'Allegra-Walburga',2,3,10,6,http://example.com/Product/9
2,'slavishness muonic unencumbered horsepox vine...,'philatelist lugubrious unknots hummocky gonge...,2008-01-14,http://example.com/Person/2,'Eyana-Aurelianus',5,10,-1,-1,http://example.com/Product/9
3,'footworn forms nonflammable abator editoriali...,'breedings silences respirating litterateurs e...,2007-12-04,http://example.com/Person/5,'Przemek-Berte',8,5,2,1,http://example.com/Product/9
4,'recharting wardrobes divvies rhetorics phanta...,'moxas clangored haughtier rascals southerly s...,2007-11-16,http://example.com/Person/3,'Danijela-Adalbrand',-1,4,7,10,http://example.com/Product/9
5,'improves pervertedly kevils canneries','mapping constructions motiveless stalkier val...,2007-11-16,http://example.com/Person/3,'Danijela-Adalbrand',7,10,2,8,http://example.com/Product/9


# Free ?subj and ?pred

In [8]:
# Benchmark the "free ?subj / ?pred" query.
# The snippet below is handed to timeit as source text and executed against the
# notebook's globals, so `spark`, `sparql_endpoint` and `PySPARQLWrapper` must
# already be defined by earlier cells.

code = '''
query = """
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

        SELECT ?subj ?pred
        WHERE {
            ?subj ?pred "1"^^xsd:int.
        }
"""

wrapper = PySPARQLWrapper(spark, sparql_endpoint)
result = wrapper.query(query)
resultDF = result.dataFrame
'''

# Collect `repeat` timing samples, each covering `loop` executions of the snippet.
result = timeit.Timer(code, globals=globals()).repeat(repeat=repeat, number=loop)

# Report the raw samples followed by their mean.
print("TIME", result, sep="\n")
print("\nAVG_TIME", Average(result), sep="\n")

TIME
[15.576336842000273, 12.693113297000309, 12.37595686899931, 12.529332037999666, 12.527171355999599, 13.605895274000432, 12.628288398000223, 12.030420683000557, 12.188646896000137, 12.152942999000516]

AVG_TIME
12.830810465200102


In [9]:
# Run the query once (untimed) and render the result in the notebook.
# Every (?subj, ?pred) pair whose object is the literal "1"^^xsd:int is selected.

query = """
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

        SELECT ?subj ?pred
        WHERE {
            ?subj ?pred "1"^^xsd:int.
        }
"""

# Bind the wrapper to the existing Spark session and the SPARQL endpoint,
# send the query, and keep the DataFrame exposed by the result object.
wrapper = PySPARQLWrapper(spark, sparql_endpoint)
result = wrapper.query(query)
resultDF = result.dataFrame

# Visual plot
# Convert to pandas so the notebook shows a rich table.
# NOTE(review): toPandas() presumably pulls the whole result into local memory — confirm.
pandasDF = resultDF.toPandas()
display(pandasDF)

Unnamed: 0,subj,pred
0,http://example.com/Review/31,http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01...
1,http://example.com/Review/65,http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01...
2,http://example.com/Review/94,http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01...
3,http://example.com/Review/59,http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01...
4,http://example.com/Review/10,http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01...
...,...,...
558,http://example.com/Product/4,http://purl.org/dc/elements/1.1/publisher
559,http://example.com/Product/8,http://purl.org/dc/elements/1.1/publisher
560,http://example.com/Product/7,http://purl.org/dc/elements/1.1/publisher
561,http://example.com/Product/10,http://purl.org/dc/elements/1.1/publisher


# Entailment regime query 
**Attention:** the query below selects every triple (`?s ?p ?o`) and usually takes several minutes to run.

In [110]:
# Time the all-triples query exactly once (one sample, one execution).
# The snippet below is handed to timeit as source text and executed against the
# notebook's globals, so `spark`, `sparql_endpoint` and `PySPARQLWrapper` must
# already be defined by earlier cells.

code = '''
query = """
    SELECT ?s ?p ?o 
    WHERE { 
        ?s ?p ?o 
    }
"""

wrapper = PySPARQLWrapper(spark, sparql_endpoint)
result = wrapper.query(query)
resultDF = result.dataFrame
'''

# A single measurement: the result is a one-element list of seconds.
result = timeit.Timer(code, globals=globals()).repeat(repeat=1, number=1)

print("TIME", result, sep="\n")

TIME
[201.38486229599948]


In [10]:
# Run the all-triples query once (untimed) and render the result in the notebook.
# The pattern `?s ?p ?o` has no constants, so every triple the endpoint exposes matches.

query = """
    SELECT ?s ?p ?o 
    WHERE { 
        ?s ?p ?o 
    }
"""

# Bind the wrapper to the existing Spark session and the SPARQL endpoint,
# send the query, and keep the DataFrame exposed by the result object.
wrapper = PySPARQLWrapper(spark, sparql_endpoint)
result = wrapper.query(query)
resultDF = result.dataFrame

# Visual plot
# Convert to pandas so the notebook shows a rich table.
# NOTE(review): toPandas() presumably pulls the whole result into local memory —
# confirm this is acceptable for an unrestricted query.
pandasDF = resultDF.toPandas()
display(pandasDF)

Unnamed: 0,s,p,o
0,http://example.com/Review/43,http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01...,http://example.com/Product/7
1,http://example.com/Review/92,http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01...,http://example.com/Product/2
2,http://example.com/Review/59,http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01...,http://example.com/Product/7
3,http://example.com/Review/16,http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01...,http://example.com/Product/3
4,http://example.com/Review/34,http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01...,http://example.com/Product/5
...,...,...,...
5086,http://example.com/Product/1,http://www.w3.org/2000/01/rdf-schema#comment,'lordlings dialyzed hoardings palmitate resist...
5087,http://example.com/Product/10,http://www.w3.org/2000/01/rdf-schema#comment,'filaree cirque vibrations leukemoid enquirer ...
5088,http://example.com/Product/6,http://www.w3.org/2000/01/rdf-schema#comment,'poleward sagest impellers enjoyed tailpipes r...
5089,http://example.com/Product/2,http://www.w3.org/2000/01/rdf-schema#comment,'naughtiness illuminating careerers computeres...
