# General statistics about the Ontoforce dataset


* Documentation of the SPARQLWrapper library: http://rdflib.github.io/sparqlwrapper/doc/latest/
* The statistical queries are taken from: https://code.google.com/p/void-impl/wiki/SPARQLQueriesForStatistics


In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

In [2]:
def performSparqlQuery(sparqlURI, queryStr):
    """Run a SPARQL query and return its result bindings.

    Args:
        sparqlURI: URI of the SPARQL endpoint to query.
        queryStr: Text of the SPARQL query to execute.

    Returns:
        The ``['results']['bindings']`` entry of the converted JSON response.
    """
    client = SPARQLWrapper(sparqlURI)
    client.setQuery(queryStr)
    # Ask for JSON so that convert() yields a structure indexable by key.
    client.setReturnFormat(JSON)
    payload = client.query().convert()
    bindings = payload['results']['bindings']
    return bindings



In [3]:
# Host (including scheme) and port of the server that is queried below.
sparql_endpoint = "http://ec2-54-172-160-219.compute-1.amazonaws.com"
port = 80

def generate_endpoint_uri(sparql_endpoint, port):
    """Build the full SPARQL endpoint URI.

    Args:
        sparql_endpoint: Base address of the server, as a string.
        port: Port number; converted with ``str()`` before being appended.

    Returns:
        The string ``<sparql_endpoint>:<port>/sparql``.
    """
    pieces = (sparql_endpoint, ":", str(port), "/sparql")
    return "".join(pieces)


# Full endpoint URI used by every query cell below; printed as a sanity check.
virtuoso_endpoint = generate_endpoint_uri(sparql_endpoint,port)
print(virtuoso_endpoint)

http://ec2-54-172-160-219.compute-1.amazonaws.com:80/sparql


## 1. Number of triples => OK

In [18]:
# Count every triple that matches the pattern ?s ?p ?o.
queryString ="SELECT (COUNT(*) AS ?numtriples) { ?s ?p ?o  }"
results = performSparqlQuery(virtuoso_endpoint, queryString)

In [19]:
# Show the raw bindings, the exact count, and a rounded human-readable summary.
print(results)
# The binding carries the count as a string value.
num_triples = results[0]['numtriples']['value']
print("Number of triples is: " + str(num_triples))
# Derive the summary from the result rather than hard-coding it, so it cannot
# drift away from the printed count when the dataset changes.
print("{:.2f} billion triples".format(int(num_triples) / 1e9))

[{'numtriples': {'value': '2374837593', 'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'type': 'typed-literal'}}]
Number of triples is: 2374837593
2.37 billion triples


## 2. Total number of entities => OK

In [22]:
# Count the distinct subjects that have at least one rdf:type (`a`) statement.
queryString ="SELECT (COUNT(distinct ?s) AS ?numsubjects) { ?s a []  }"
results = performSparqlQuery(virtuoso_endpoint, queryString)

In [24]:
# Show the raw bindings, the exact count, and a rounded human-readable summary.
print(results)
# The binding carries the count as a string value.
num_subjects = results[0]['numsubjects']['value']
print(num_subjects)
# Derive the summary from the result rather than hard-coding it, so it cannot
# drift away from the printed count when the dataset changes.
print("{:.1f} million entities ".format(int(num_subjects) / 1e6))

[{'numsubjects': {'value': '136313277', 'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'type': 'typed-literal'}}]
136313277
136.3 million entities 


## 3. Total number of distinct classes => OK

In [25]:
# Count the distinct objects of rdf:type triples, i.e. the classes in use.
queryString = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
SELECT (COUNT(distinct ?o) AS ?distinctclasses) { ?s rdf:type ?o }
"""
results = performSparqlQuery(virtuoso_endpoint, queryString)

In [27]:
# Show the raw bindings, the exact count, and a human-readable summary.
print(results)
# The binding carries the count as a string value.
num_classes = results[0]['distinctclasses']['value']
print(num_classes)
# Derive the summary from the result rather than hard-coding it, so it cannot
# drift away from the printed count when the dataset changes.
print("{} distinct classes".format(num_classes))

[{'distinctclasses': {'value': '2434', 'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'type': 'typed-literal'}}]
2434
2434 distinct classes


## 4. Total number of distinct predicates => OK

In [28]:
# Count the distinct predicates over all triples.
queryString = """
SELECT (COUNT(distinct ?p) as ?distpredicates) { ?s ?p ?o }
"""
results = performSparqlQuery(virtuoso_endpoint, queryString)

In [31]:
# Show the raw bindings, the exact count, and a human-readable summary.
print(results)
# The binding carries the count as a string value.
num_predicates = results[0]['distpredicates']['value']
print(num_predicates)
# Derive the summary from the result rather than hard-coding it, so it cannot
# drift away from the printed count when the dataset changes.
print("{} distinct predicates".format(num_predicates))

[{'distpredicates': {'value': '1782', 'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'type': 'typed-literal'}}]
1782
1782 distinct predicates


## Idea: show 

## 5. Total number of distinct subject nodes => Timeout

In [32]:
# Count the distinct subjects over all triples.
# NOTE: this run was interrupted manually (KeyboardInterrupt) before finishing.
queryString = """
SELECT (COUNT(DISTINCT ?s ) AS ?distsubjects) {  ?s ?p ?o   } 
"""
results = performSparqlQuery(virtuoso_endpoint, queryString)

KeyboardInterrupt: 

In [None]:
# NOTE(review): the query cell above was interrupted, so `results` may still
# hold the bindings of an earlier query and lack the 'distsubjects' key.
print(results)
print (results[0]['distsubjects']['value'])

## 6. Total number of distinct object nodes => OK

In [16]:
# Count the distinct non-literal objects over all triples.
queryString = """
SELECT (COUNT(DISTINCT ?o ) AS ?distobjects) {  ?s ?p ?o  filter(!isLiteral(?o)) } 
"""
results = performSparqlQuery(virtuoso_endpoint, queryString)

In [18]:
# Show the raw bindings, the exact count, and a rounded human-readable summary.
print(results)
# The binding carries the count as a string value.
num_objects = results[0]['distobjects']['value']
print(num_objects)
# Derive the summary from the result rather than hard-coding it, so it cannot
# drift away from the printed count when the dataset changes.
print("{:.1f} million object nodes".format(int(num_objects) / 1e6))

[{'distobjects': {'type': 'typed-literal', 'value': '286749072', 'datatype': 'http://www.w3.org/2001/XMLSchema#integer'}}]
286749072
286.7 million object nodes


## 7. Exhaustive list of classes used in the dataset (NDA) => Timeout

In [None]:
# List every distinct class that appears as the object of an rdf:type (`a`) triple.
queryString = """
SELECT DISTINCT ?type { ?s a ?type }
"""
results = performSparqlQuery(virtuoso_endpoint, queryString)




In [None]:
# Write one class IRI per line.
# The file must be opened for writing: open() defaults to read-only mode, in
# which f.write() fails. A newline terminates each record so that the entries
# do not run together, matching the properties export cell.
with open("OntoforceClasses.txt", 'w+') as f:
    for c in results:
        f.write(c['type']['value'] + "\n")
        

## 8. Exhaustive list of properties used in the dataset (NDA) => OK

In [4]:
# List every distinct predicate used in any triple.
queryString = """
SELECT DISTINCT ?p { ?s ?p ?o }
"""
results = performSparqlQuery(virtuoso_endpoint, queryString)

In [8]:
# Dump the predicate IRIs to a text file, one per line.
with open("OntoforceProperties.txt", 'w+') as out:
    out.writelines(row['p']['value'] + "\n" for row in results)

In [9]:
#IDEA: number of predicates per namespace (ns.ontoforce.com, purl.rdf.ebi, ...)? => probably a nice plot as well;
#it would show how many predicates are Ontoforce's own.

## 9. Table: class vs. total number of instances of the class (NDA) => OK

In [14]:
# For each class, count the rdf:type (`a`) triples pointing to it; rows are ordered by that count.
queryString = """
SELECT  ?class (COUNT(?s) AS ?count ) { ?s a ?class } GROUP BY ?class ORDER BY ?count
"""
results = performSparqlQuery(virtuoso_endpoint, queryString)

In [15]:
# Export the table as tab-separated text: class IRI, then its instance count.
with open("OntoforceInstancesPerClass.txt", 'w+') as out:
    for row in results:
        # Each field is followed by its separator: tab after the class,
        # newline after the count.
        for key, sep in (('class', "\t"), ('count', "\n")):
            out.write(row[key]['value'] + sep)

In [None]:
## Distribution

# Split the result rows into two parallel columns. The counts are kept exactly
# as the string values found in the bindings.
classes = [row['class']['value'] for row in results]
counts = [row['count']['value'] for row in results]

cc_dict = {"classes": classes, "counts": counts}

In [None]:
# TODO: load cc_dict into pandas, e.g. pd.DataFrame.from_dict(cc_dict); signature: DataFrame.from_dict(data, orient='columns', dtype=None)

## 10. Table: property vs. total number of triples using the property (NDA) => OK


In [10]:
# For each predicate, count the triples that use it; rows are ordered by that count.
queryString = """
SELECT  ?p (COUNT(?s) AS ?count ) { ?s ?p ?o } GROUP BY ?p ORDER BY ?count
"""
results = performSparqlQuery(virtuoso_endpoint, queryString)

In [12]:
# Export the table as tab-separated text: predicate IRI, then its triple count.
with open("OntoforceTriplesPerProperty.txt",'w+') as out:
    for row in results:
        # Each field is followed by its separator: tab after the predicate,
        # newline after the count.
        for key, sep in (('p', "\t"), ('count', "\n")):
            out.write(row[key]['value'] + sep)

In [None]:
## Distribution

# Split the result rows into two parallel columns. The counts are kept exactly
# as the string values found in the bindings.
predicates = [row['p']['value'] for row in results]
counts = [row['count']['value'] for row in results]

cc_dict = {"predicates": predicates, "counts": counts}

In [None]:
# TODO: load cc_dict into pandas, e.g. pd.DataFrame.from_dict(cc_dict); signature: DataFrame.from_dict(data, orient='columns', dtype=None)

## 11. Table: property vs. total number of distinct subjects in triples using the property  => Timeout

In [13]:
# For each predicate, count the distinct subjects of its triples; rows are ordered by that count.
# NOTE: this run was interrupted manually (KeyboardInterrupt) before finishing.
queryString = """
SELECT  ?p (COUNT(DISTINCT ?s ) AS ?count ) { ?s ?p ?o } GROUP BY ?p ORDER BY ?count
"""
results = performSparqlQuery(virtuoso_endpoint, queryString)

KeyboardInterrupt: 

In [None]:
# Export the table as tab-separated text: predicate IRI, then its count.
# The file must be opened for writing: open() defaults to read-only mode, in
# which f.write() fails. The tab and newline separators keep fields and
# records apart, matching the triples-per-property export cell.
with open("OntoforceDistinctSubjectsPerProperty.txt", 'w+') as f:
    for c in results:
        f.write(c['p']['value'] + "\t")
        f.write(c['count']['value'] + "\n")

In [None]:
## Distribution

# Split the result rows into two parallel columns. The counts are kept exactly
# as the string values found in the bindings.
predicates = [row['p']['value'] for row in results]
counts = [row['count']['value'] for row in results]

cc_dict = {"predicates": predicates, "counts": counts}

In [None]:
# TODO: load cc_dict into pandas, e.g. pd.DataFrame.from_dict(cc_dict); signature: DataFrame.from_dict(data, orient='columns', dtype=None)

## 12. Table: property vs. total number of distinct objects in triples using the property => Timeout


In [19]:
# For each predicate, count the distinct objects of its triples; rows are ordered by that count.
# NOTE: this run was interrupted manually (KeyboardInterrupt) before finishing.
queryString = """
SELECT  ?p (COUNT(DISTINCT ?o ) AS ?count ) { ?s ?p ?o } GROUP BY ?p ORDER BY ?count
"""
results = performSparqlQuery(virtuoso_endpoint, queryString)

KeyboardInterrupt: 

In [None]:
# Export the table as tab-separated text: predicate IRI, then its count.
# The file must be opened for writing: open() defaults to read-only mode, in
# which f.write() fails. The tab and newline separators keep fields and
# records apart, matching the triples-per-property export cell.
with open("OntoforceDistinctObjectsPerProperty.txt", 'w+') as f:
    for c in results:
        f.write(c['p']['value'] + "\t")
        f.write(c['count']['value'] + "\n")

In [None]:
## Distribution

# Split the result rows into two parallel columns. The counts are kept exactly
# as the string values found in the bindings.
predicates = [row['p']['value'] for row in results]
counts = [row['count']['value'] for row in results]

cc_dict = {"predicates": predicates, "counts": counts}

In [None]:
# TODO: load cc_dict into pandas, e.g. pd.DataFrame.from_dict(cc_dict); signature: DataFrame.from_dict(data, orient='columns', dtype=None)