# SoftwareKG2: Confidence Computation for Reification Statements

In [None]:
import os
import datetime
from lxml import etree
import time

In [None]:
from rdflib import Graph, plugin, URIRef, Literal
from rdflib.serializer import Serializer
from rdflib.namespace import XSD, RDF, RDFS

## Setup and check

In [None]:
# Issue a query

from SPARQLWrapper import SPARQLWrapper, JSON

endpoint = 'http://your.endpoint.here:8890/sparql'


def exec_query(query):
    """Send ``query`` to the SPARQL service at ``endpoint``.

    The response is requested in JSON format and returned as converted by
    SPARQLWrapper.
    """
    client = SPARQLWrapper(endpoint)
    client.setReturnFormat(JSON)
    client.setQuery(query)
    return client.query().convert()




In [None]:
# Prefix map handed to the JSON-LD serializer as its context.
# Every entry maps a short name to an IRI; "comment" maps to the single
# term rdfs:comment rather than to a namespace.
context = {
    "schema": "http://schema.org/",
    "sms": "http://data.gesis.org/somesci/",
    "nif": "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#",
    "wd": "http://www.wikidata.org/entity/",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "comment": "http://www.w3.org/2000/01/rdf-schema#comment",
    "datacite": "http://purl.org/spar/datacite/",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "doi": "https://doi.org/",
    "dct": "http://purl.org/dc/terms/",
    "dbpedia-owl": "http://dbpedia.org/ontology/",
    "skgv": "http://data.gesis.org/softwarekg/vocab/",
    "skg": "http://data.gesis.org/softwarekg/PMC/",
    "irao": "http://ontology.ethereal.cz/irao/",
}

In [None]:
# Test query...
# Smoke test of the endpoint: count all triples in the store.
qres = exec_query('SELECT (COUNT (*) as ?num) WHERE {?a ?p ?b}')


In [None]:
# ... and results
# Print the ?num binding of every returned row (the triple count).
for result in qres["results"]["bindings"]:
    print(result["num"]["value"])


## Utilities

In [None]:
# Helper function for confidence computation
def build_table(query, leapsize, value_names):
    """Fetch a paged SPARQL result and return it as a sorted list of rows.

    ``query`` must contain the placeholders ``{offset}`` and ``{limit}``;
    they are substituted for every page. Pages are requested until one
    comes back empty.

    Args:
        query: SPARQL query text with ``{offset}`` / ``{limit}`` placeholders.
        leapsize: Number of rows requested per page (the ``LIMIT`` value).
        value_names: Names of the result variables to extract, in the
            column order wanted for the rows.

    Returns:
        A list of rows (one list of binding values per result row, in
        ``value_names`` order), sorted ascending.

    NOTE(review): the queries in this notebook page with OFFSET/LIMIT but
    without ORDER BY, so complete, non-overlapping pages rely on the
    endpoint returning rows in a stable order - confirm for the endpoint
    in use.
    """
    table = []
    offset = 0
    limit = str(leapsize)

    while True:
        page_query = query.replace('{offset}', str(offset)).replace('{limit}', limit)
        bindings = exec_query(page_query)["results"]["bindings"]
        if not bindings:
            break

        table.extend([binding[name]['value'] for name in value_names]
                     for binding in bindings)

        # Advance by the rows actually received, not by the requested page
        # size: an endpoint may return fewer rows than the LIMIT asked for
        # (e.g. a server-side row cap), and jumping a full page would then
        # silently skip rows.
        offset += len(bindings)

    table.sort()
    return table


In [None]:
# Helper function for confidence computation
def remove_column_in_table(table, indexes):
    """Return a copy of ``table`` without the columns listed in ``indexes``.

    Args:
        table: Iterable of rows (sequences). Neither the table nor its
            rows are modified.
        indexes: Column positions to drop. Order does not matter,
            duplicates are ignored and negative positions count from the
            end of each row.

    Returns:
        A new list with one new list per input row.

    Raises:
        IndexError: If a position lies outside a row.
    """
    new_table = []
    for row in table:
        width = len(row)
        # Resolve every index against the *original* row before removing
        # anything, so duplicates or negative indexes cannot hit a column
        # that has already shifted.
        drop = set()
        for idx in indexes:
            position = idx + width if idx < 0 else idx
            if not 0 <= position < width:
                raise IndexError(
                    "column index {} out of range for row of width {}".format(idx, width))
            drop.add(position)
        new_table.append([value for position, value in enumerate(row)
                          if position not in drop])
    return new_table

In [None]:
# Helper function for confidence computation
def compare(row1, row2, indexes1, indexes2):
    """Tell whether two rows agree on their key columns.

    The n-th position in ``indexes1`` (a column of ``row1``) is compared
    with the n-th position in ``indexes2`` (a column of ``row2``).
    Returns True when every pair is equal, otherwise False.
    """
    return all(
        row1[indexes1[position]] == row2[indexes2[position]]
        for position in range(len(indexes1))
    )

In [None]:
# Helper function for confidence computation
def outer_join(t1, t2, indexes1, indexes2):
    """Left-join ``t2`` onto ``t1`` using the given key columns.

    Every row of ``t1`` produces exactly one output row. If a row of
    ``t2`` has the same key, its non-key columns are appended; if several
    rows of ``t2`` share the key, the first one is used. Rows of ``t1``
    without a partner are copied unchanged (nothing is appended).

    The lookup is hash-based, so neither table has to be sorted and an
    unmatched row cannot affect the matching of later rows.

    Args:
        t1: Left table (list of rows).
        t2: Right table (list of rows).
        indexes1: Key column positions in ``t1`` rows.
        indexes2: Key column positions in ``t2`` rows, pairwise matching
            ``indexes1``. Values in key columns must be hashable.

    Returns:
        A new list of new row lists; the inputs are not modified.

    Raises:
        IndexError: If ``indexes2`` has fewer entries than ``indexes1``.
    """
    if len(indexes2) < len(indexes1):
        raise IndexError("indexes2 must provide a column for every entry of indexes1")

    # Only as many right-hand columns as there are left-hand ones take
    # part in the comparison; all of indexes2 is excluded from the output.
    right_key_columns = list(indexes2)[:len(indexes1)]
    excluded_columns = set(indexes2)

    first_match = {}
    for right_row in t2:
        key = tuple(right_row[column] for column in right_key_columns)
        first_match.setdefault(key, right_row)

    joined = []
    for left_row in t1:
        row = list(left_row)
        partner = first_match.get(tuple(left_row[column] for column in indexes1))
        if partner is not None:
            row.extend(value for column, value in enumerate(partner)
                       if column not in excluded_columns)
        joined.append(row)
    return joined

In [None]:
# Main function for computing confidence values
# Creates a table with [software, object, confidence score] for rows

def workflow(table1, table2, table3):
    """Compute a confidence score for every (software, object) pair.

    For each document the share ``count(document, software, object) /
    count(document, software)`` is computed; the shares of one
    (software, object) pair are summed over all documents and the sum is
    divided by the number of documents recorded for the software.

    Args:
        table1: Rows ``[document, software, object, count]``.
        table2: Rows ``[document, software, count]`` (per-document totals).
            If a (document, software) key occurs more than once, the first
            row is used.
        table3: Rows ``[software, number_of_documents]``. If a software
            occurs more than once, the first row is used.
        Counts may be numbers or numeric strings.

    Returns:
        A list of ``[software, object, confidence]`` rows sorted by
        software and object; an empty list if ``table1`` is empty.

    Raises:
        KeyError: If ``table2`` or ``table3`` has no entry for a key that
            occurs in ``table1``.
    """
    # Per-document totals, keyed by (document, software).
    totals_per_document = {}
    for document, software, total in table2:
        totals_per_document.setdefault((document, software), total)

    # Number of documents per software.
    documents_per_software = {}
    for software, document_count in table3:
        documents_per_software.setdefault(software, document_count)

    # Collect the per-document shares of every (software, object) pair.
    shares_by_pair = {}
    for document, software, obj, count in table1:
        share = float(count) / float(totals_per_document[(document, software)])
        shares_by_pair.setdefault((software, obj), []).append(share)

    result = []
    for pair in sorted(shares_by_pair):
        software, obj = pair
        # Add the shares one by one in ascending order so the floating
        # point result does not depend on the order of the input rows.
        shares = sorted(shares_by_pair[pair])
        share_sum = shares[0]
        for share in shares[1:]:
            share_sum += share
        confidence = float(share_sum) / float(documents_per_software[software])
        result.append([software, obj, confidence])
    return result
    

In [None]:
# Initialize a graph object
# `g` collects the statements of one section; each section serializes it and rebinds it to a fresh Graph.
g=Graph()

# SoftwareType confidence

In [None]:
# table1: rows [document, software, type, count] - per document, how often a
# software occurs together with each softwareType. Fetched page by page.
q1='''
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT ?document ?software ?type (COUNT(?type) as ?num_mention_per_type_per_software_per_document)
WHERE
{
    ?mention skgv:software ?software .
    ?mention skgv:softwareType ?type .
    ?document schema:mentions ?mention .
}
GROUP BY ?document ?software ?type
OFFSET {offset}
LIMIT {limit}
'''
table1 = build_table(q1, 1_000_000, ['document','software','type',
                                     'num_mention_per_type_per_software_per_document'])

In [None]:
# table2: rows [document, software, count] - per document, the total for a
# software over all softwareTypes (the denominator of the per-document share).
q2="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT ?document ?software (COUNT(?software) AS ?num_mention_per_software_per_document)
WHERE
{
    ?mention skgv:software ?software .
    ?mention skgv:softwareType ?type .
    ?document schema:mentions ?mention .
}
GROUP BY ?document ?software
OFFSET {offset}
LIMIT {limit}
"""
table2 = build_table(q2, 1_000_000, ['document','software', 'num_mention_per_software_per_document'])

In [None]:
# table3: rows [software, count] - number of distinct documents in which a
# software occurs with a softwareType.
# The aggregate is projected in the "(expression AS ?variable)" form that
# SPARQL 1.1 requires, the same form the q1/q2 queries use.
q3="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT (COUNT(DISTINCT ?document) AS ?num_article_with_software) ?software
WHERE
{
    ?mention skgv:software ?software .
    ?mention skgv:softwareType ?type .
    ?document schema:mentions ?mention .
}
GROUP BY  ?software
OFFSET {offset}
LIMIT {limit}
"""
table3 = build_table(q3, 1_000_000, ['software','num_article_with_software'])

In [None]:
# Rows of [software, type, confidence] computed from the three count tables.
swtype_table = workflow(table1,table2,table3)

In [None]:
# Add statements with confidence value
# Each row r = [software, type, confidence] becomes one rdf:Statement node with
# subject = software, predicate = skgv:softwareType, object = type, plus skgv:confidence.
for idx, r in enumerate(swtype_table):
    node = URIRef("skg:inference/softwareType/{}".format(idx))
    g.add((node, RDF.type, RDF.Statement))
    g.add((node, RDF.subject, URIRef(r[0])))
    g.add((node, RDF.predicate ,URIRef("skgv:softwareType")))
    g.add((node, RDF.object, URIRef(r[1])))
    g.add((node, URIRef("skgv:confidence"), Literal(r[2], datatype=XSD.float)))
    # Echo every row whose confidence is below 1
    if float(r[2]) < 1:
        print(r)

In [None]:
# Serialize
# Write the graph as JSON-LD with the prefix map in `context`, then rebind `g` to an empty graph.
g.serialize(format="json-ld", context=context, destination="softwarekg-sw-type-confidence.jsonld")
g = Graph()

# PlugIns of host software

In [None]:
# table1: rows [document, software, plugIn, count] - per document, how often a
# software is referred to by a plug-in mention that resolves to ?plugIn.
q1='''
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT ?document ?software ?plugIn (COUNT(?plugIn) as ?num_mention_per_plugin_per_software_per_document)
WHERE
{
    ?mention skgv:software ?software .
    ?mention skgv:referredToByPlugIn ?plugInm .
    ?plugInm skgv:software ?plugIn .
    ?document schema:mentions ?mention .
}
GROUP BY ?document ?software ?plugIn
OFFSET {offset}
LIMIT {limit}
'''
table1 = build_table(q1, 1_000_000, ['document','software','plugIn',
                                     'num_mention_per_plugin_per_software_per_document'])

In [None]:
# table2: rows [document, software, count] - per document, the total for a
# software over all plug-in references (the denominator of the per-document share).
q2="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT ?document ?software (COUNT(?software) AS ?num_mention_per_software_per_document)
WHERE
{
    ?mention skgv:software ?software .
    ?mention skgv:referredToByPlugIn ?anyPlugInm .
    ?document schema:mentions ?mention .
}
GROUP BY ?document ?software
OFFSET {offset}
LIMIT {limit}
"""
table2 = build_table(q2, 1_000_000, ['document','software', 'num_mention_per_software_per_document'])

In [None]:
# table3: rows [software, count] - number of distinct documents in which a
# software is referred to by a plug-in mention.
# The aggregate is projected in the "(expression AS ?variable)" form that
# SPARQL 1.1 requires, the same form the q1/q2 queries use.
q3="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT (COUNT(DISTINCT ?document) AS ?num_article_with_software) ?software
WHERE
{
    ?mention skgv:software ?software .
    ?mention skgv:referredToByPlugIn ?anyPlugInm .
    ?document schema:mentions ?mention .
}
GROUP BY  ?software
OFFSET {offset}
LIMIT {limit}
"""
table3 = build_table(q3, 1_000_000, ['software','num_article_with_software'])

In [None]:
# Rows of [software, plugIn, confidence] computed from the three count tables.
plugin_table = workflow(table1,table2,table3)

In [None]:
# Add statements with confidence value
# Each row r = [software, plugIn, confidence] becomes one rdf:Statement node with
# subject = software, predicate = schema:softwareAddOn, object = plugIn, plus skgv:confidence.
for idx, r in enumerate(plugin_table):
    node = URIRef("skg:inference/hasPlugIn/{}".format(idx))
    g.add((node, RDF.type, RDF.Statement))
    g.add((node, RDF.subject, URIRef(r[0])))
    g.add((node, RDF.predicate ,URIRef("schema:softwareAddOn")))
    g.add((node, RDF.object, URIRef(r[1])))
    g.add((node, URIRef("skgv:confidence"), Literal(r[2], datatype=XSD.float)))
    # Echo every row whose confidence is below 1
    if float(r[2]) < 1:
        print(r)

In [None]:
# Serialize
# Write the graph as JSON-LD with the prefix map in `context`, then rebind `g` to an empty graph.
g.serialize(format="json-ld", context=context, destination="softwarekg-plugin-confidence.jsonld")
g = Graph()

# Host software of plugIns

In [None]:
# table1: rows [document, plugIn, hostSoftware, count] - per document (the one
# mentioning the plug-in), how often a plug-in is attached to each host software.
q1='''
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT ?document ?plugIn ?hostSoftware (COUNT(?hostSoftware) as ?num_mention_per_software_per_plugin_per_document)
WHERE
{
    ?hostMention skgv:software ?hostSoftware .
    ?hostMention skgv:referredToByPlugIn ?plugInMention .
    ?plugInMention skgv:software ?plugIn .
    ?document schema:mentions ?plugInMention .
}
GROUP BY ?document ?plugIn ?hostSoftware
OFFSET {offset}
LIMIT {limit}
'''
table1 = build_table(q1, 1_000_000, ['document','plugIn','hostSoftware',
                                     'num_mention_per_software_per_plugin_per_document'])

In [None]:
# table2: rows [document, plugIn, count] - per document, the total for a
# plug-in over all host mentions (the denominator of the per-document share).
q2="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT ?document ?plugIn (COUNT(?plugIn) AS ?num_mention_per_plugin_per_document)
WHERE
{
    ?plugInMention skgv:software ?plugIn .
    ?anyHostMention skgv:referredToByPlugIn ?plugInMention .
    ?document schema:mentions ?plugInMention .
}
GROUP BY ?document ?plugIn
OFFSET {offset}
LIMIT {limit}
"""
table2 = build_table(q2, 1_000_000, ['document','plugIn', 'num_mention_per_plugin_per_document'])

In [None]:
# table3: rows [plugIn, count] - number of distinct documents in which a
# plug-in mention is attached to some host mention.
# The aggregate is projected in the "(expression AS ?variable)" form that
# SPARQL 1.1 requires, the same form the q1/q2 queries use.
q3="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT (COUNT(DISTINCT ?document) AS ?num_article_with_plugin) ?plugIn
WHERE
{
    ?plugInMention skgv:software ?plugIn .
    ?someHostMention skgv:referredToByPlugIn ?plugInMention .
    ?document schema:mentions ?plugInMention .
}
GROUP BY  ?plugIn
OFFSET {offset}
LIMIT {limit}
"""
table3 = build_table(q3, 1_000_000, ['plugIn','num_article_with_plugin'])

In [None]:
# Rows of [plugIn, hostSoftware, confidence] computed from the three count tables.
host_table = workflow(table1,table2,table3)

In [None]:
# Add statements with confidence value
# Each row r = [plugIn, hostSoftware, confidence] becomes one rdf:Statement node with
# subject = plugIn, predicate = irao:isPartOfSystem, object = hostSoftware, plus skgv:confidence.
g = Graph()
for idx, r in enumerate(host_table):
    node = URIRef("skg:inference/hasHostSoftware/{}".format(idx))
    g.add((node, RDF.type, RDF.Statement))
    g.add((node, RDF.subject, URIRef(r[0])))
    g.add((node, RDF.predicate ,URIRef("irao:isPartOfSystem")))
    g.add((node, RDF.object, URIRef(r[1])))
    g.add((node, URIRef("skgv:confidence"), Literal(r[2], datatype=XSD.float)))
    # Echo every row whose confidence is below 1
    if float(r[2]) < 1:
        print(r)

In [None]:
# Serialize
# Write the graph as JSON-LD with the prefix map in `context`, then rebind `g` to an empty graph.
g.serialize(format="json-ld", context=context, destination="softwarekg-hostsoftware-confidence.jsonld")
g = Graph()

# Software names

In [None]:
# table1: rows [document, software, spelling, count] - per document, how often
# a software is mentioned with each surface string (nif:isString).
q1='''
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT ?document ?software ?spelling (COUNT(?spelling) as ?num_mention_per_spelling_per_software_per_document)
WHERE
{
    ?mention skgv:software ?software .
    ?mention nif:isString ?spelling .
    ?document schema:mentions ?mention .
}
GROUP BY ?document ?software ?spelling
OFFSET {offset}
LIMIT {limit}
'''
table1 = build_table(q1, 1_000_000, ['document','software','spelling',
                                     'num_mention_per_spelling_per_software_per_document'])

In [None]:
# table2: rows [document, software, count] - per document, the number of
# mentions of a software (the denominator of the per-document share).
q2="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT ?document ?software (COUNT(?software) AS ?num_mention_per_software_per_document)
WHERE
{
    ?mention skgv:software ?software .
    ?document schema:mentions ?mention .
}
GROUP BY ?document ?software
OFFSET {offset}
LIMIT {limit}
"""
table2 = build_table(q2, 1_000_000, ['document','software', 'num_mention_per_software_per_document'])

In [None]:
# table3: rows [software, count] - number of distinct documents mentioning a software.
# The aggregate is projected in the "(expression AS ?variable)" form that
# SPARQL 1.1 requires, the same form the q1/q2 queries use.
q3="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT (COUNT(DISTINCT ?document) AS ?num_article_with_software) ?software
WHERE
{
    ?mention skgv:software ?software .
    ?document schema:mentions ?mention .
}
GROUP BY  ?software
OFFSET {offset}
LIMIT {limit}
"""
table3 = build_table(q3, 1_000_000, ['software','num_article_with_software'])

In [None]:
# Rows of [software, spelling, confidence] computed from the three count tables.
swname_table = workflow(table1,table2,table3)

In [None]:
# Add statements with confidence value
# Each row r = [software, spelling, confidence] becomes one rdf:Statement node with
# subject = software, predicate = schema:name, object = the spelling as a literal, plus skgv:confidence.
g=Graph()
for idx, r in enumerate(swname_table):
    node = URIRef("skg:inference/softwareName/{}".format(idx))
    g.add((node, RDF.type, RDF.Statement))
    g.add((node, RDF.subject, URIRef(r[0])))
    g.add((node, RDF.predicate ,URIRef("schema:name")))
    g.add((node, RDF.object, Literal(r[1])))
    g.add((node, URIRef("skgv:confidence"), Literal(r[2], datatype=XSD.float)))
    # Echo every row whose confidence is below 1
    if float(r[2]) < 1.0:
        print(r)
    

In [None]:
# Serialize
# Write the graph as JSON-LD with the prefix map in `context`, then rebind `g` to an empty graph.
g.serialize(format="json-ld", context=context, destination="softwarekg-sw-names-confidence.jsonld")
g = Graph()

# Developers

In [None]:
# table1: rows [document, software, developer, count] - per document, how often
# a software is referred to by each developer string.
q1="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT ?document ?software ?developer (COUNT(?developer) as ?num_mention_per_developer_per_software_per_document)
WHERE
{
    ?mention skgv:referredToByDeveloper ?dev_mention .
    ?dev_mention nif:isString ?developer .
    ?mention skgv:software ?software .
    ?document schema:mentions ?mention .
}
GROUP BY ?document ?software ?developer
OFFSET {offset}
LIMIT {limit}
"""

table1 = build_table(q1, 1_000_000, ['document','software','developer',
                                     'num_mention_per_developer_per_software_per_document'])

In [None]:
# table2: rows [document, software, count] - per document, the total for a
# software over all developer references (the denominator of the per-document share).
q2="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT ?document ?software (COUNT(?software) AS ?num_mention_per_software_per_document)
WHERE
{
    ?mention skgv:referredToByDeveloper ?dev_mention .
    ?mention skgv:software ?software .
    ?document schema:mentions ?mention .
}
GROUP BY ?document ?software
OFFSET {offset}
LIMIT {limit}
"""
table2 = build_table(q2, 1_000_000, ['document','software', 'num_mention_per_software_per_document'])

In [None]:
# table3: rows [software, count] - number of distinct documents in which a
# software is referred to by a developer mention.
# The aggregate is projected in the "(expression AS ?variable)" form that
# SPARQL 1.1 requires, the same form the q1/q2 queries use.
q3="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT (COUNT(DISTINCT ?document) AS ?num_article_with_software) ?software
WHERE
{
    ?mention skgv:referredToByDeveloper ?dev_mention .
    ?mention skgv:software ?software .
    ?document schema:mentions ?mention .
}
GROUP BY  ?software
OFFSET {offset}
LIMIT {limit}
"""
table3 = build_table(q3, 1_000_000, ['software','num_article_with_software'])

In [None]:
# Rows of [software, developer, confidence] computed from the three count tables.
dev_table = workflow(table1,table2,table3)

In [None]:
# Add statements with confidence value
# For each row r = [software, developer, confidence] a schema:Organization node named after
# the developer string is created (one node per row), and an rdf:Statement node links
# software -> schema:developer -> that organization, plus skgv:confidence.
g=Graph()
for idx, r in enumerate(dev_table):
    dev_id = "skg:developer/{}".format(idx)
    g.add((URIRef(dev_id), RDF.type, URIRef("schema:Organization")))
    g.add((URIRef(dev_id), URIRef("schema:name"), Literal(r[1])))
    node = URIRef("skg:inference/developer/{}".format(idx))
    g.add((node, RDF.type, RDF.Statement))
    g.add((node, RDF.subject, URIRef(r[0])))
    g.add((node, RDF.predicate ,URIRef("schema:developer")))
    g.add((node, RDF.object, URIRef(dev_id)))
    g.add((node, URIRef("skgv:confidence"), Literal(r[2], datatype=XSD.float)))
    # Echo every row whose confidence is below 1
    if float(r[2]) < 1:
        print(r)  

In [None]:
# Serialize
# Write the graph as JSON-LD with the prefix map in `context`, then rebind `g` to an empty graph.
g.serialize(format="json-ld", context=context, destination="softwarekg-developers-confidence.jsonld")
g = Graph()

# Versions

In [None]:
# Lift versions (no confidence scores here)
# Distinct (software, version string) pairs, fetched with a single unpaged request.
version_query = """
        PREFIX schema: <http://schema.org/>
        PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
        PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>
        
        SELECT 
            ?software ?version
        WHERE
        {
            ?mention skgv:referredToByVersion ?version_mention .
            ?version_mention nif:isString ?version .
            ?mention skgv:software ?software
        }
        GROUP BY ?software ?version
"""
# Print start/end timestamps and the elapsed seconds of the request.
print(time.ctime())
start = time.time()
versions = exec_query(version_query)
end = time.time()
print(time.ctime())
print('{:5.3f}s'.format(end-start), end='  ')


# Debug output, disabled: change the condition to dump every (software, version) pair.
if False:
    for result in versions["results"]["bindings"]:
        print(result["software"]["value"])
        print(result["version"]["value"])
    

In [None]:
# Add statements to graph
# For each (software, version) binding: create a skgv:SoftwareVersion node named after the
# version string (one node per binding) and link the software to it via skgv:hasVersion.
for idx, r in enumerate(versions["results"]["bindings"]):
    ver_id = "skg:version/{}".format(idx)
    g.add((URIRef(ver_id), URIRef("schema:name"), Literal(r['version']['value'])))
    g.add((URIRef(ver_id), RDF.type, URIRef("skgv:SoftwareVersion")))
    g.add((URIRef(r['software']['value']), URIRef("skgv:hasVersion"), URIRef(ver_id)))

In [None]:
# Serialize
# Write the graph as JSON-LD with the prefix map in `context`, then rebind `g` to an empty graph.
g.serialize(format="json-ld", context=context, destination="softwarekg-versions.jsonld")
g = Graph()

# Licenses

In [None]:
# table1: rows [document, software, license, count] - per document, how often
# a software is referred to by each license string.
q1="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT ?document ?software ?license (COUNT(?license) as ?num_mention_per_license_per_software_per_document)
WHERE
{
    ?mention skgv:referredToByLicense ?license_mention .
    ?license_mention nif:isString ?license .
    ?mention skgv:software ?software .
    ?document schema:mentions ?mention .
}
GROUP BY ?document ?software ?license
OFFSET {offset}
LIMIT {limit}
"""

table1 = build_table(q1, 1_000_000, ['document','software','license',
                                     'num_mention_per_license_per_software_per_document'])

In [None]:
# table2: rows [document, software, count] - per document, the total for a
# software over all license references (the denominator of the per-document share).
q2="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT ?document ?software (COUNT(?software) AS ?num_mention_per_software_per_document)
WHERE
{
    ?mention skgv:referredToByLicense ?license_mention .
    ?mention skgv:software ?software .
    ?document schema:mentions ?mention .
}
GROUP BY ?document ?software
OFFSET {offset}
LIMIT {limit}
"""
table2 = build_table(q2, 1_000_000, ['document','software', 'num_mention_per_software_per_document'])

In [None]:
# table3: rows [software, count] - number of distinct documents in which a
# software is referred to by a license mention.
# The aggregate is projected in the "(expression AS ?variable)" form that
# SPARQL 1.1 requires, the same form the q1/q2 queries use.
q3="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT (COUNT(DISTINCT ?document) AS ?num_article_with_software) ?software
WHERE
{
    ?mention skgv:referredToByLicense ?license_mention .
    ?mention skgv:software ?software .
    ?document schema:mentions ?mention .
}
GROUP BY  ?software
OFFSET {offset}
LIMIT {limit}
"""
table3 = build_table(q3, 1_000_000, ['software','num_article_with_software'])

In [None]:
# Rows of [software, license, confidence] computed from the three count tables.
license_table = workflow(table1,table2,table3)

In [None]:
# Add document resources and statements with confidence value
# For each row r = [software, license, confidence] a dct:LicenseDocument node named after
# the license string is created (one node per row), and an rdf:Statement node links
# software -> dct:license -> that document, plus skgv:confidence.
for idx, r in enumerate(license_table):
    license_id = "skg:license/{}".format(idx)  
    g.add((URIRef(license_id), RDF.type, URIRef("dct:LicenseDocument")))
    g.add((URIRef(license_id), URIRef("schema:name"), Literal(r[1])))
    node = URIRef("skg:inference/license/{}".format(idx))
    g.add((node, RDF.type, RDF.Statement))
    g.add((node, RDF.subject, URIRef(r[0])))
    g.add((node, RDF.predicate ,URIRef("dct:license")))
    g.add((node, RDF.object, URIRef(license_id)))
    g.add((node, URIRef("skgv:confidence"), Literal(r[2], datatype=XSD.float)))
    # Echo every row whose confidence is below 1
    if float(r[2]) < 1:
        print(r)    

In [None]:
# Serialize
# Write the graph as JSON-LD with the prefix map in `context`, then rebind `g` to an empty graph.
g.serialize(format="json-ld", context=context, destination="softwarekg-licenses-confidence.jsonld")
g = Graph()

# Alternative name

In [None]:
# table1: rows [document, software, spelling, count] - per document, how often
# a software is referred to by each alternative-name string.
q1="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT ?document ?software ?spelling (COUNT(?spelling) as ?num_mention_per_spelling_per_software_per_document)
WHERE
{
    ?mention skgv:software ?software .
    ?mention skgv:referredToByAlternativeName ?altm .
    ?altm nif:isString ?spelling .
    ?document schema:mentions ?mention .
}
GROUP BY ?document ?software ?spelling
OFFSET {offset}
LIMIT {limit}
"""

table1 = build_table(q1, 1_000_000, ['document','software','spelling',
                                     'num_mention_per_spelling_per_software_per_document'])

In [None]:
# table2: rows [document, software, count] - per document, the total for a
# software over all alternative-name references (the denominator of the per-document share).
q2="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT ?document ?software (COUNT(?software) AS ?num_mention_per_software_per_document)
WHERE
{
    ?mention skgv:software ?software .
    ?mention skgv:referredToByAlternativeName ?altm .
    ?document schema:mentions ?mention .
}
GROUP BY ?document ?software
OFFSET {offset}
LIMIT {limit}
"""
table2 = build_table(q2, 1_000_000, ['document','software', 'num_mention_per_software_per_document'])

In [None]:
# table3: rows [software, count] - number of distinct documents in which a
# software is referred to by an alternative-name mention.
# The aggregate is projected in the "(expression AS ?variable)" form that
# SPARQL 1.1 requires, the same form the q1/q2 queries use.
q3="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT (COUNT(DISTINCT ?document) AS ?num_article_with_software) ?software
WHERE
{
    ?mention skgv:software ?software .
    ?mention skgv:referredToByAlternativeName ?altm .
    ?document schema:mentions ?mention .
}
GROUP BY  ?software
OFFSET {offset}
LIMIT {limit}
"""
table3 = build_table(q3, 1_000_000, ['software','num_article_with_software'])

In [None]:
# Rows of [software, alternative name, confidence] computed from the three count tables.
altname_table = workflow(table1,table2,table3)

In [None]:
# Add statements with confidence value
# Each row r = [software, alternative name, confidence] becomes one rdf:Statement node with
# subject = software, predicate = schema:alternateName, object = the name as a literal, plus skgv:confidence.
for idx, r in enumerate(altname_table):
    #print(r)
    node = URIRef("skg:inference/alternativeName/{}".format(idx))
    g.add((node, RDF.type, RDF.Statement))
    g.add((node, RDF.subject, URIRef(r[0])))
    g.add((node, RDF.predicate ,URIRef("schema:alternateName")))
    g.add((node, RDF.object, Literal(r[1])))
    g.add((node, URIRef("skgv:confidence"), Literal(r[2], datatype=XSD.float)))
    # Echo every row whose confidence is below 1
    if float(r[2]) < 1:
        print(r)   

In [None]:
# Serialize
# Write the graph as JSON-LD with the prefix map in `context`, then rebind `g` to an empty graph.
g.serialize(format="json-ld", context=context, destination="softwarekg-alt-names-confidence.jsonld")
g = Graph()

# Abbreviation

In [None]:
# table1: rows [document, software, spelling, count] - per document, how often
# a software is referred to by each abbreviation string.
q1="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT ?document ?software ?spelling (COUNT(?spelling) as ?num_mention_per_spelling_per_software_per_document)
WHERE
{
    ?mention skgv:software ?software .
    ?mention skgv:referredToByAbbreviation ?abbrevm .
    ?abbrevm nif:isString ?spelling .
    ?document schema:mentions ?mention .
}
GROUP BY ?document ?software ?spelling
OFFSET {offset}
LIMIT {limit}
"""

table1 = build_table(q1, 1_000_000, ['document','software','spelling',
                                     'num_mention_per_spelling_per_software_per_document'])

In [None]:
# table2: rows [document, software, count] - per document, the total for a
# software over all abbreviation references (the denominator of the per-document share).
q2="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT ?document ?software (COUNT(?software) AS ?num_mention_per_software_per_document)
WHERE
{
    ?mention skgv:software ?software .
    ?mention skgv:referredToByAbbreviation ?abbrevm .
    ?document schema:mentions ?mention .
}
GROUP BY ?document ?software
OFFSET {offset}
LIMIT {limit}
"""
table2 = build_table(q2, 1_000_000, ['document','software', 'num_mention_per_software_per_document'])

In [None]:
# table3: rows [software, count] - number of distinct documents in which a
# software is referred to by an abbreviation mention.
# The aggregate is projected in the "(expression AS ?variable)" form that
# SPARQL 1.1 requires, the same form the q1/q2 queries use.
q3="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT (COUNT(DISTINCT ?document) AS ?num_article_with_software) ?software
WHERE
{
    ?mention skgv:software ?software .
    ?mention skgv:referredToByAbbreviation ?abbrevm .
    ?document schema:mentions ?mention .
}
GROUP BY  ?software
OFFSET {offset}
LIMIT {limit}
"""
table3 = build_table(q3, 1_000_000, ['software','num_article_with_software'])

In [None]:
# Rows of [software, abbreviation, confidence] computed from the three count tables.
abbr_table = workflow(table1,table2,table3)

In [None]:
# Add statements with confidence value
# Each row r = [software, abbreviation, confidence] becomes one rdf:Statement node with
# subject = software, predicate = skgv:abbreviation, object = the abbreviation as a literal, plus skgv:confidence.
for idx, r in enumerate(abbr_table):
    node = URIRef("skg:inference/abbreviation/{}".format(idx))
    g.add((node, RDF.type, RDF.Statement))
    g.add((node, RDF.subject, URIRef(r[0])))
    g.add((node, RDF.predicate ,URIRef("skgv:abbreviation")))
    g.add((node, RDF.object, Literal(r[1])))
    g.add((node, URIRef("skgv:confidence"), Literal(r[2], datatype=XSD.float)))
    # Echo every row whose confidence is below 1
    if float(r[2]) < 1:
        print(r)   

In [None]:
# Serialize
# Write the graph as JSON-LD with the prefix map in `context`, then rebind `g` to an empty graph.
g.serialize(format="json-ld", context=context, destination="softwarekg-abbreviations-confidence.jsonld")
g = Graph()

# Citation

In [None]:
# Not applicable

# Extension

In [None]:
# table1: rows [document, software, spelling, count] - per document, how often
# a software is referred to by each extension string.
q1="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT ?document ?software ?spelling (COUNT(?spelling) as ?num_mention_per_spelling_per_software_per_document)
WHERE
{
    ?mention skgv:software ?software .
    ?mention skgv:referredToByExtension ?extm .
    ?extm nif:isString ?spelling .
    ?document schema:mentions ?mention .
}
GROUP BY ?document ?software ?spelling
OFFSET {offset}
LIMIT {limit}
"""

table1 = build_table(q1, 1_000_000, ['document','software','spelling',
                                     'num_mention_per_spelling_per_software_per_document'])

In [None]:
# table2: rows [document, software, count] - per document, the total for a
# software over all extension references (the denominator of the per-document share).
q2="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT ?document ?software (COUNT(?software) AS ?num_mention_per_software_per_document)
WHERE
{
    ?mention skgv:software ?software .
    ?mention skgv:referredToByExtension ?extm .
    ?document schema:mentions ?mention .
}
GROUP BY ?document ?software
OFFSET {offset}
LIMIT {limit}
"""
table2 = build_table(q2, 1_000_000, ['document','software', 'num_mention_per_software_per_document'])

In [None]:
# table3: rows [software, count] - number of distinct documents in which a
# software is referred to by an extension mention.
# The aggregate is projected in the "(expression AS ?variable)" form that
# SPARQL 1.1 requires, the same form the q1/q2 queries use.
q3="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT (COUNT(DISTINCT ?document) AS ?num_article_with_software) ?software
WHERE
{
    ?mention skgv:software ?software .
    ?mention skgv:referredToByExtension ?extm .
    ?document schema:mentions ?mention .
}
GROUP BY  ?software
OFFSET {offset}
LIMIT {limit}
"""
table3 = build_table(q3, 1_000_000, ['software','num_article_with_software'])

In [None]:
# Rows of [software, extension, confidence] computed from the three count tables.
ext_table = workflow(table1,table2,table3)

In [None]:
# Add statements with confidence value
# Each row r = [software, extension, confidence] becomes one rdf:Statement node with
# subject = software, predicate = skgv:hasExtension, object = the extension as a literal, plus skgv:confidence.
for idx, r in enumerate(ext_table):
    node = URIRef("skg:inference/extension/{}".format(idx))
    g.add((node, RDF.type, RDF.Statement))
    g.add((node, RDF.subject, URIRef(r[0])))
    g.add((node, RDF.predicate ,URIRef("skgv:hasExtension")))
    g.add((node, RDF.object, Literal(r[1])))
    g.add((node, URIRef("skgv:confidence"), Literal(r[2], datatype=XSD.float)))
    # Echo every row whose confidence is below 1
    if float(r[2]) < 1:
        print(r)   

In [None]:
# Serialize
# Write the graph as JSON-LD with the prefix map in `context`, then rebind `g` to an empty graph.
g.serialize(format="json-ld", context=context, destination="softwarekg-extensions-confidence.jsonld")
g = Graph()

# URL

In [None]:
# table1: rows [document, software, spelling, count] - per document, how often
# a software is referred to by each URL string.
q1="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT ?document ?software ?spelling (COUNT(?spelling) as ?num_mention_per_spelling_per_software_per_document)
WHERE
{
    ?mention skgv:software ?software .
    ?mention skgv:referredToByURL ?urlm .
    ?urlm nif:isString ?spelling .
    ?document schema:mentions ?mention .
}
GROUP BY ?document ?software ?spelling
OFFSET {offset}
LIMIT {limit}
"""

table1 = build_table(q1, 1_000_000, ['document','software','spelling',
                                     'num_mention_per_spelling_per_software_per_document'])

In [None]:
# table2: rows [document, software, count] - per document, the total for a
# software over all URL references (the denominator of the per-document share).
q2="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT ?document ?software (COUNT(?software) AS ?num_mention_per_software_per_document)
WHERE
{
    ?mention skgv:software ?software .
    ?mention skgv:referredToByURL ?urlm .
    ?document schema:mentions ?mention .
}
GROUP BY ?document ?software
OFFSET {offset}
LIMIT {limit}
"""
table2 = build_table(q2, 1_000_000, ['document','software', 'num_mention_per_software_per_document'])

In [None]:
# table3: rows [software, count] - number of distinct documents in which a
# software is referred to by a URL mention.
# The aggregate is projected in the "(expression AS ?variable)" form that
# SPARQL 1.1 requires, the same form the q1/q2 queries use.
q3="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT (COUNT(DISTINCT ?document) AS ?num_article_with_software) ?software
WHERE
{
    ?mention skgv:software ?software .
    ?mention skgv:referredToByURL ?urlm .
    ?document schema:mentions ?mention .
}
GROUP BY  ?software
OFFSET {offset}
LIMIT {limit}
"""
table3 = build_table(q3, 1_000_000, ['software','num_article_with_software'])

In [None]:
# Rows of [software, URL, confidence] computed from the three count tables.
url_table = workflow(table1,table2,table3)

In [None]:
# Add statements with confidence value
# Each row r = [software, URL, confidence] becomes one rdf:Statement node with
# subject = software, predicate = skgv:hasURL, object = the URL string as a literal, plus skgv:confidence.
for idx, r in enumerate(url_table):
    node = URIRef("skg:inference/url/{}".format(idx))
    g.add((node, RDF.type, RDF.Statement))
    g.add((node, RDF.subject, URIRef(r[0])))
    g.add((node, RDF.predicate ,URIRef("skgv:hasURL")))
    g.add((node, RDF.object, Literal(r[1])))
    g.add((node, URIRef("skgv:confidence"), Literal(r[2], datatype=XSD.float)))
    # Echo every row whose confidence is below 1
    if float(r[2]) < 1:
        print(r)  

In [None]:
# Serialize
# Write the graph as JSON-LD with the prefix map in `context`, then rebind `g` to an empty graph.
g.serialize(format="json-ld", context=context, destination="softwarekg-urls-confidence.jsonld")
g = Graph()

# Release

In [None]:
# table1: rows [document, software, spelling, count] - per document, how often
# a software is referred to by each release string.
q1="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT ?document ?software ?spelling (COUNT(?spelling) as ?num_mention_per_spelling_per_software_per_document)
WHERE
{
    ?mention skgv:software ?software .
    ?mention skgv:referredToByRelease ?releasem .
    ?releasem nif:isString ?spelling .
    ?document schema:mentions ?mention .
}
GROUP BY ?document ?software ?spelling
OFFSET {offset}
LIMIT {limit}
"""

table1 = build_table(q1, 1_000_000, ['document','software','spelling',
                                     'num_mention_per_spelling_per_software_per_document'])

In [None]:
# table2: rows [document, software, count] - per document, the total for a
# software over all release references (the denominator of the per-document share).
q2="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT ?document ?software (COUNT(?software) AS ?num_mention_per_software_per_document)
WHERE
{
    ?mention skgv:software ?software .
    ?mention skgv:referredToByRelease ?releasem .
    ?document schema:mentions ?mention .
}
GROUP BY ?document ?software
OFFSET {offset}
LIMIT {limit}
"""
table2 = build_table(q2, 1_000_000, ['document','software', 'num_mention_per_software_per_document'])

In [None]:
# table3: rows [software, count] - number of distinct documents in which a
# software is referred to by a release mention.
# The aggregate is projected in the "(expression AS ?variable)" form that
# SPARQL 1.1 requires, the same form the q1/q2 queries use.
q3="""
PREFIX schema: <http://schema.org/>
PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
PREFIX skgv: <http://data.gesis.org/softwarekg/vocab/>

SELECT (COUNT(DISTINCT ?document) AS ?num_article_with_software) ?software
WHERE
{
    ?mention skgv:software ?software .
    ?mention skgv:referredToByRelease ?releasem .
    ?document schema:mentions ?mention .
}
GROUP BY  ?software
OFFSET {offset}
LIMIT {limit}
"""
table3 = build_table(q3, 1_000_000, ['software','num_article_with_software'])

In [None]:
# Rows of [software, release, confidence] computed from the three count tables.
release_table = workflow(table1,table2,table3)

In [None]:
# Add statements with confidence value
# Each row r = [software, release, confidence] becomes one rdf:Statement node with
# subject = software, predicate = skgv:released, object = the release string as a literal, plus skgv:confidence.
for idx, r in enumerate(release_table):
    node = URIRef("skg:inference/release/{}".format(idx))
    g.add((node, RDF.type, RDF.Statement))
    g.add((node, RDF.subject, URIRef(r[0])))
    g.add((node, RDF.predicate ,URIRef("skgv:released")))
    g.add((node, RDF.object, Literal(r[1])))
    g.add((node, URIRef("skgv:confidence"), Literal(r[2], datatype=XSD.float)))
    # Echo every row whose confidence is below 1
    if float(r[2]) < 1:
        print(r)  

In [None]:
# Serialize
# Write the graph as JSON-LD with the prefix map in `context`, then rebind `g` to an empty graph.
g.serialize(format="json-ld", context=context, destination="softwarekg-release-confidence.jsonld")
g = Graph()