<p>Wikidata Constraint Violation Finder</p>

This notebook generates the files needed to analyze Wikidata constraint violations (type and value-type constraints) on a per-property basis.

In [None]:
# Split items file into per-property files. Only needs to be run once. Takes some time.
#
# Reads claims.wikibase-item.tsv.gz line by line and appends every row to
# data/propertiesSplit_Final/claims.<property>.tsv, where <property> is the value of
# the second tab-separated column. Each per-property file starts with a copy of the
# input header line. Rows whose last column contains "external-id" are dropped.

import gzip
import os

# Directory that receives the per-property files; created up front so the first
# open() does not fail on a fresh checkout.
splitDir = "data/propertiesSplit_Final"
os.makedirs(splitDir, exist_ok=True)

# Maps property id -> open output handle for that property.
# NOTE(review): one handle stays open per distinct property for the whole run; with
# many properties this may exceed the OS open-file limit — confirm `ulimit -n`.
propFileDict = {}
try:
    with gzip.open('claims.wikibase-item.tsv.gz', 'r') as fin:
        headerLine = next(fin).decode("utf-8")
        for rawLine in fin:
            line = rawLine.decode("utf-8")
            # A blank line has no property column; skip it instead of crashing.
            if not line.strip():
                continue
            lineP = line.rstrip().split("\t")
            if "external-id" in lineP[-1]:
                continue
            prop = lineP[1]
            if prop not in propFileDict:
                # Write UTF-8 explicitly so the output does not depend on the locale.
                propFileDict[prop] = open(splitDir + "/claims." + str(prop) + ".tsv", "w", encoding="utf-8")
                propFileDict[prop].write(headerLine)
            propFileDict[prop].write(line)
finally:
    # Close every output file even when reading or writing fails part-way through.
    for file1 in propFileDict.values():
        file1.close()

In [1]:
import os
import csv
import numpy as np
import pandas as pd

from kgtk.configure_kgtk_notebooks import ConfigureKGTK
from kgtk.functions import kgtk, kypher

In [2]:
# Locations and project name handed to ConfigureKGTK below.
input_path, output_path = "data", "output"
project_name = "p161_notebook"

# Property whose claims are analysed; change this to analyse another property.
property_id = "P39"

# Constraint classes are hard-coded for now (to be retrieved with SPARQL or other
# means in the future). They are kept as strings because they are interpolated
# verbatim into the Kypher --where clauses of the queries further down.
subject_classes = """["Q5", "Q21070598", "Q95074", "Q4271324", "Q64520857", "Q64643615", "Q75855169", "Q146"]"""
object_classes = """["Q4164871", "Q21451536", "Q355567", "Q3687335", "Q7810129", "Q81752537", "Q294414", "Q11452125"]"""
# Class lists for P161:
#subject_classes = """["Q1185607", "Q15267437", "Q35140", "Q386724", "Q43099500"]"""
#object_classes = """["Q120544", "Q215627", "Q26401003", "Q95074"]"""

files = [
    "derived.isa.tsv.gz",
    "derived.P31.tsv.gz",
    "derived.P279star.tsv.gz",
    "derived.P279.tsv.gz",
]
# Export every graph file name as an environment variable whose value is the name itself.
os.environ.update({graph_file: graph_file for graph_file in files})

ck = ConfigureKGTK(files)
ck.configure_kgtk(
    input_graph_path=input_path,
    output_path=output_path,
    project_name=project_name,
)
ck.print_env_variables()
#ck.load_files_into_cache()

User home: /home/jovyan
Current dir: /out/kgtk_notebooks
KGTK dir: /out
Use-cases dir: /out/use-cases
kypher: kgtk query --graph-cache output/p161_notebook/temp.p161_notebook/wikidata.sqlite3.db
GRAPH: data
OUT: output/p161_notebook
KGTK_GRAPH_CACHE: output/p161_notebook/temp.p161_notebook/wikidata.sqlite3.db
STORE: output/p161_notebook/temp.p161_notebook/wikidata.sqlite3.db
KGTK_LABEL_FILE: data/labels.en.tsv.gz
TEMP: output/p161_notebook/temp.p161_notebook
kgtk: kgtk
EXAMPLES_DIR: /out/examples
KGTK_OPTION_DEBUG: false
USE_CASES_DIR: /out/use-cases
derived.isa.tsv.gz: derived.isa.tsv.gz
derived.P31.tsv.gz: derived.P31.tsv.gz
derived.P279star.tsv.gz: derived.P279star.tsv.gz
derived.P279.tsv.gz: derived.P279.tsv.gz


In [3]:
# Value-type check, pass 1.
# Selects the claims of `property_id` whose object (node2) has an edge in derived.isa
# to some node `nodex` that in turn has an edge in derived.P279star to one of the
# classes listed in `object_classes`. The distinct matching claims (id, node1, label,
# node2) are written to the valueTypeConstraint "correct_temp" file.
kgtk(f"""--debug query -i 
/out/data/propertiesSplit_Final/claims.{property_id}.tsv  
/out/derived.isa.tsv.gz         
/out/derived.P279star.tsv.gz         
--match 'm: (node1)-[nodeProp]->(node2), isa: (node2)-[]->(nodex), P279star: (nodex)-[]->(par)'
--where 'par in {object_classes} '      
--return 'distinct nodeProp.id, node1 as `node1`, nodeProp.label as `label`, node2 as `node2`'    
-o /out/output/allConstraintsAnalysis_Final/valueTypeConstraint/normal/claims.type-constraints.instanceOfOrSubclass.{property_id}.correct_temp.tsv 
""")

kgtk: received KeyboardInterrupt


[2022-02-17 19:16:20 sqlstore]: IMPORT graph directly into table graph_1 from /out/data/propertiesSplit_Final/claims.P39.tsv ...
[2022-02-17 19:16:30 sqlstore]: IMPORT graph directly into table graph_2 from /out/derived.isa.tsv.gz ...
[2022-02-17 19:18:58 sqlstore]: IMPORT graph directly into table graph_3 from /out/derived.P279star.tsv.gz ...



In [None]:
# Value-type check, complement of pass 1.
# Runs KGTK `ifnotexists` with the property's claims as input and the pass-1
# "correct_temp" file as the filter, writing the result to "incorrect_temp".
# NOTE(review): presumably this keeps exactly the claims not matched in pass 1 —
# confirm the default key columns used by ifnotexists against the KGTK docs.
kgtk(f"""--debug ifnotexists -i /out/data/propertiesSplit_Final/claims.{property_id}.tsv   
--filter-on /out/output/allConstraintsAnalysis_Final/valueTypeConstraint/normal/claims.type-constraints.instanceOfOrSubclass.{property_id}.correct_temp.tsv    
-o /out/output/allConstraintsAnalysis_Final/valueTypeConstraint/normal/claims.type-constraints.instanceOfOrSubclass.{property_id}.incorrect_temp.tsv ;""")

In [None]:
# Value-type check, pass 2.
# Re-examines the "incorrect_temp" claims: a claim is selected when its object (node2)
# has a direct edge in derived.isa to one of the classes in `object_classes` (no
# P279star hop this time). Distinct matches are written to "correct_temp2".
kgtk(f"""--debug query -i /out/output/allConstraintsAnalysis_Final/valueTypeConstraint/normal/claims.type-constraints.instanceOfOrSubclass.{property_id}.incorrect_temp.tsv  
/out/derived.isa.tsv.gz        
--match 'm: (node1)-[nodeProp]->(node2), isa: (node2)-[]->(par)'    
--where 'par in {object_classes} '     
--return 'distinct nodeProp.id, node1 as `node1`, nodeProp.label as `label`, node2 as `node2`'   
-o /out/output/allConstraintsAnalysis_Final/valueTypeConstraint/normal/claims.type-constraints.instanceOfOrSubclass.{property_id}.correct_temp2.tsv""")   

In [None]:
# Value-type check, final violations file.
# Runs KGTK `ifnotexists` with "incorrect_temp" as input and the pass-2
# "correct_temp2" file as the filter, writing the result to the final "incorrect" file.
kgtk(f"""--debug ifnotexists -i /out/output/allConstraintsAnalysis_Final/valueTypeConstraint/normal/claims.type-constraints.instanceOfOrSubclass.{property_id}.incorrect_temp.tsv   
--filter-on /out/output/allConstraintsAnalysis_Final/valueTypeConstraint/normal/claims.type-constraints.instanceOfOrSubclass.{property_id}.correct_temp2.tsv    
-o /out/output/allConstraintsAnalysis_Final/valueTypeConstraint/normal/claims.type-constraints.instanceOfOrSubclass.{property_id}.incorrect.tsv ;""") 

In [None]:
# Value-type check, final conforming file.
# Concatenates the pass-1 ("correct_temp") and pass-2 ("correct_temp2") results with
# KGTK `cat` into the final "correct" file.
kgtk(f"""--debug cat -i /out/output/allConstraintsAnalysis_Final/valueTypeConstraint/normal/claims.type-constraints.instanceOfOrSubclass.{property_id}.correct_temp.tsv     
/out/output/allConstraintsAnalysis_Final/valueTypeConstraint/normal/claims.type-constraints.instanceOfOrSubclass.{property_id}.correct_temp2.tsv     
-o /out/output/allConstraintsAnalysis_Final/valueTypeConstraint/normal/claims.type-constraints.instanceOfOrSubclass.{property_id}.correct.tsv ;    
""")

In [None]:
# Type (subject) check, pass 1.
# Selects the claims of `property_id` whose subject (node1) has an edge in derived.P31
# to some node `nodex` that in turn has an edge in derived.P279star to one of the
# classes listed in `subject_classes`. Distinct matches are written to the
# typeConstraint "correct_temp" file.
kgtk(f"""--debug query -i /out/data/propertiesSplit_Final/claims.{property_id}.tsv 
/out/derived.P31.tsv.gz      
/out/derived.P279star.tsv.gz      
--match 'm: (node1)-[nodeProp]->(node2), P31: (node1)-[]->(nodex), P279star: (nodex)-[]->(par)'       
--where 'par in {subject_classes} '       
--return 'distinct nodeProp.id, node1 as `node1`, nodeProp.label as `label`, node2 as `node2`'  
-o /out/output/allConstraintsAnalysis_Final/typeConstraint/normal/claims.type-constraints.instanceOf.{property_id}.correct_temp.tsv""")

In [None]:
# Type (subject) check, complement of pass 1.
# Runs KGTK `ifnotexists` with the property's claims as input and the pass-1
# typeConstraint "correct_temp" file as the filter, writing to "incorrect_temp".
kgtk(f"""--debug ifnotexists -i /out/data/propertiesSplit_Final/claims.{property_id}.tsv  
--filter-on /out/output/allConstraintsAnalysis_Final/typeConstraint/normal/claims.type-constraints.instanceOf.{property_id}.correct_temp.tsv       
-o /out/output/allConstraintsAnalysis_Final/typeConstraint/normal/claims.type-constraints.instanceOf.{property_id}.incorrect_temp.tsv ;""")

In [None]:
# Type (subject) check, pass 2.
# Re-examines the "incorrect_temp" claims: a claim is selected when its subject (node1)
# has a direct edge in derived.P31 to one of the classes in `subject_classes` (no
# P279star hop this time). Distinct matches are written to "correct_temp2".
kgtk(f"""--debug query -i /out/output/allConstraintsAnalysis_Final/typeConstraint/normal/claims.type-constraints.instanceOf.{property_id}.incorrect_temp.tsv   
/out/derived.P31.tsv.gz  
--match 'm: (node1)-[nodeProp]->(node2), P31: (node1)-[]->(par)'    
--where 'par in {subject_classes} '     
--return 'distinct nodeProp.id, node1 as `node1`, nodeProp.label as `label`, node2 as `node2`'      
-o /out/output/allConstraintsAnalysis_Final/typeConstraint/normal/claims.type-constraints.instanceOf.{property_id}.correct_temp2.tsv""")

In [None]:
# Type (subject) check, final violations file.
# Runs KGTK `ifnotexists` with "incorrect_temp" as input and the pass-2
# "correct_temp2" file as the filter, writing the result to the final "incorrect" file.
kgtk(f"""--debug ifnotexists -i /out/output/allConstraintsAnalysis_Final/typeConstraint/normal/claims.type-constraints.instanceOf.{property_id}.incorrect_temp.tsv      
--filter-on /out/output/allConstraintsAnalysis_Final/typeConstraint/normal/claims.type-constraints.instanceOf.{property_id}.correct_temp2.tsv     
-o /out/output/allConstraintsAnalysis_Final/typeConstraint/normal/claims.type-constraints.instanceOf.{property_id}.incorrect.tsv ;""")

In [None]:
# Type (subject) check, final conforming file.
# Concatenates the pass-1 ("correct_temp") and pass-2 ("correct_temp2") results with
# KGTK `cat` into the final "correct" file.
kgtk(f"""--debug cat -i /out/output/allConstraintsAnalysis_Final/typeConstraint/normal/claims.type-constraints.instanceOf.{property_id}.correct_temp.tsv     
/out/output/allConstraintsAnalysis_Final/typeConstraint/normal/claims.type-constraints.instanceOf.{property_id}.correct_temp2.tsv       
-o /out/output/allConstraintsAnalysis_Final/typeConstraint/normal/claims.type-constraints.instanceOf.{property_id}.correct.tsv ;""")

In [None]:
# For every subject (node1) of a `property_id` claim, list the nodes it points to in
# derived.isa. Output columns: `instance` (the subject) and `class` (the isa target).
# The property is whatever `property_id` is set to, not specifically P161.
kgtk(f"""--debug query -i 
/out/data/propertiesSplit_Final/claims.{property_id}.tsv  
/out/derived.isa.tsv.gz         
--match 'm: (node1)-[nodeProp]->(node2), isa: (node1)-[]->(nodex)'
--return 'distinct node1 as `instance`, nodex as `class`'    
-o /out/output/allConstraintsAnalysis_Final/instances/instances.subjects.{property_id}.tsv 
""")

# Same as above for every object (node2) of a `property_id` claim.
# Output columns: `instance` (the object) and `class` (the isa target).
kgtk(f"""--debug query -i 
/out/data/propertiesSplit_Final/claims.{property_id}.tsv  
/out/derived.isa.tsv.gz         
--match 'm: (node1)-[nodeProp]->(node2), isa: (node2)-[]->(nodex)'
--return 'distinct node2 as `instance`, nodex as `class`'    
-o /out/output/allConstraintsAnalysis_Final/instances/instances.objects.{property_id}.tsv 
""")

In [None]:
# For every subject (node1) of a `property_id` claim, list the nodes it points to in
# derived.P279 (direct edges only; P279star is not used here).
# Note that the output columns are still named `instance` (the subject) and `class`
# (the P279 target), mirroring the instances files.
# The property is whatever `property_id` is set to, not specifically P161.
kgtk(f"""--debug query -i 
/out/data/propertiesSplit_Final/claims.{property_id}.tsv  
/out/derived.P279.tsv.gz         
--match 'm: (node1)-[nodeProp]->(node2), P279: (node1)-[]->(nodex)'
--return 'distinct node1 as `instance`, nodex as `class`'    
-o /out/output/allConstraintsAnalysis_Final/subclasses/subclasses.subjects.{property_id}.tsv 
""")

# Same as above for every object (node2) of a `property_id` claim.
# Output columns: `instance` (the object) and `class` (the P279 target).
kgtk(f"""--debug query -i 
/out/data/propertiesSplit_Final/claims.{property_id}.tsv  
/out/derived.P279.tsv.gz         
--match 'm: (node1)-[nodeProp]->(node2), P279: (node2)-[]->(nodex)'
--return 'distinct node2 as `instance`, nodex as `class`'    
-o /out/output/allConstraintsAnalysis_Final/subclasses/subclasses.objects.{property_id}.tsv 
""")