In [1]:
import numpy as np
import os
import bz2
import re
import random
import json
import qwikidata
from qwikidata.json_dump import WikidataJsonDump
from qwikidata.linked_data_interface import get_entity_dict_from_api
from qwikidata.sparql import (get_subclasses_of_item,
                              return_sparql_query_results)
from collections import Counter
from qwikidata.entity import WikidataItem, WikidataProperty, WikidataLexeme
import sqlite3
from itertools import islice
import time
from pprint import pprint
import traceback
import pdb
from importlib import reload

import wikidata_utils as wdutils
import requests
import hashlib

In [2]:
# Shared Wikidata client: get_subclasses() and get_subclass_tree() below send
# their SPARQL and label lookups through this single instance.
# NOTE(review): save_every_x_queries=10 presumably makes the client persist its
# cache after every 10 queries — confirm in wikidata_utils.
wdAPI = wdutils.CachedWikidataAPI(save_every_x_queries=10)

In [3]:
def get_subclasses(class_entity_id):
    """Query Wikidata for the direct subclasses (P279) of a class.

    Args:
        class_entity_id: Wikidata entity ID of the parent class, e.g. 'Q515'.

    Returns:
        Whatever ``wdAPI.query_sparql_endpoint`` returns for the query.
        Callers in this notebook read the rows from ``['results']['bindings']``,
        each row carrying an ``item`` and an ``itemLabel`` binding.
    """
    # '$1' is the placeholder for the parent class ID.
    query_template = '''
        SELECT ?item ?itemLabel 
        WHERE 
        {
          ?item wdt:P279 wd:$1.
          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
        }
    '''
    query = query_template.replace('$1',class_entity_id)
    return wdAPI.query_sparql_endpoint(query)

In [4]:
# Classes whose subclass trees are deliberately not expanded.
# Maps a Wikidata class entity ID to a free-text reason. When get_subclass_tree()
# reaches one of these IDs it stops descending and stores 'IGNORE:' + reason as
# that node's 'subclasses' value, so the reason text ends up in the saved tree.
# Commented-out entries are earlier exclusions that are currently disabled.
IGNORE_LIST = {
    'Q174834': ''' This is AUTHORITY, and the reason we are excluding it is because it overlaps with politician,
    on top of escaping the idea of written work. Yes, it is written that authority exists, but this begins to enter
    classes such as Mayor of a place in France, chair of local government, etc.''',
    #'Q382617': ''' This is MAYOR OF A PLACE IN FRANCE, and the reason we are excluding this is because it has over 40
    #thousand subclasses that have little to no entities in them, so we are removing them.''',
    #'Q5663900': ''' MAYOR OF A PLACE IN SPAIN, same as above.''',
    #'Q15113603': ''' member of a municipal council in France (Q15113603), same reason as above, over 40K subclasses. ''',
    #'Q20748648': ''' Same as above, member of a municipal council in the Netherlands, over 1.6k subclasses.''',
    #'Q8054': ''' TEMPORARY, to speed up Politician'''
    'Q8054': ''' PROTEIN, which is part of the subclass tree of chemical compound. This has so many subclasses that
    the Sparql endpoint times out. Also, it would be a very specific (and massively populated) area in Wikidata which
    we intend on not covering with ChemicalCompound'''
}

def get_subclass_tree(root_class_id,
                      root_class_label=None,
                      level=0,
                      verbose=False,
                      all_visited_so_far = None,
                      path_so_far = None,
                      subclasses_n_threshold=1000):
    """Recursively build the subclass tree rooted at a Wikidata class.

    Args:
        root_class_id: Wikidata entity ID of the class to expand.
        root_class_label: Label of the class; looked up through ``wdAPI``
            when omitted.
        level: Depth of this node in the traversal (0 for the root); only
            used to indent log messages.
        verbose: When true, print every class as it is visited. Loop,
            repetition and truncation notices are printed regardless.
        all_visited_so_far: List of class IDs already expanded anywhere in
            the traversal. It is mutated in place and shared across the whole
            recursion. Pass your own list (even an empty one) to share it
            between several calls.
        path_so_far: Class IDs on the path from the root down to the parent
            of this node; used to detect cycles. Never mutated.
        subclasses_n_threshold: Maximum number of direct subclasses expanded
            per class, applied at every level of the tree.

    Returns:
        A dict with keys ``'label'`` (title-cased label with whitespace
        removed), ``'entity_id'`` and ``'subclasses'``. ``'subclasses'`` is:
          * ``None`` if the class has no subclasses,
          * a dict mapping subclass entity ID to that subclass's own tree,
          * ``'IGNORE:' + reason`` if the class is listed in ``IGNORE_LIST``,
          * ``'LOOP'`` if the class is already on the current path,
          * ``'REPEAT'`` if the class was already expanded elsewhere.
    """
    indent = ' - '*level
    if verbose:
        print('>',indent,root_class_id,'on level',level)
    if root_class_label is None:
        root_class_label = wdAPI.get_label(root_class_id)
    root_tree = {
        'label': re.sub(r'\s+', '', root_class_label.title()),
        'entity_id': root_class_id
    }

    if root_class_id in IGNORE_LIST:
        root_tree['subclasses'] = 'IGNORE:' + IGNORE_LIST[root_class_id]
        return root_tree

    # Cycle check: the class is one of its own ancestors on this path.
    if path_so_far is None:
        path_so_far = []
    if root_class_id in path_so_far:
        print('>',indent,'Loop detected in class',root_class_id,'on level',level)
        root_tree['subclasses'] = 'LOOP'
        return root_tree
    # Build a fresh list so sibling branches do not see each other's path.
    path_so_far = [*path_so_far, root_class_id]

    # Repetition check: the class was already expanded in another branch.
    # Test against None (not truthiness) so a caller-supplied empty list is
    # kept and shared instead of being silently replaced.
    if all_visited_so_far is None:
        all_visited_so_far = []
    if root_class_id in all_visited_so_far:
        print('>',indent,'Simple repetition detected in class',root_class_id,'on level',level)
        root_tree['subclasses'] = 'REPEAT'
        return root_tree
    all_visited_so_far.append(root_class_id)

    # Query only after the checks above, so LOOP/REPEAT nodes cost no request.
    results = get_subclasses(root_class_id)['results']['bindings']

    if len(results) == 0:
        root_tree['subclasses'] = None
        return root_tree

    if len(results) > subclasses_n_threshold:
        print('>',indent,'Too many subclasses in',root_class_id,'on level',level,'so limiting to',subclasses_n_threshold,'first')

    subclasses = {}
    for result in results[:subclasses_n_threshold]:
        # The binding value is an entity URI; its last path segment is the ID.
        subclass_id = result['item']['value'].split('/')[-1]
        subclasses[subclass_id] = get_subclass_tree(
            root_class_id = subclass_id,
            root_class_label = result['itemLabel']['value'],
            level = level + 1,
            verbose = verbose,
            all_visited_so_far = all_visited_so_far,
            path_so_far = path_so_far,
            # Forward the limit so it applies below the root as well.
            subclasses_n_threshold = subclasses_n_threshold
        )
    root_tree['subclasses'] = subclasses
    return root_tree

In [5]:
# Root classes whose subclass trees are collected, grouped into three partitions.
# Each partition maps a class label to its Wikidata entity ID.
# NOTE(review): the partition names presumably mirror the WebNLG seen/unseen
# category split plus newly added categories — confirm against the dataset.
classes = {
    'Seen': {
        'Airport' : 'Q1248784',
        'Astronaut' : 'Q11631',
        'Building' : 'Q41176',
        'City' : 'Q515',
        'ComicsCharacter' : 'Q1114461',
        'Food' : 'Q2095',
        'Monument' : 'Q4989906',
        'SportsTeam' : 'Q12973014',
        'University' : 'Q3918',
        'WrittenWork' : 'Q47461344'
   },
    'Unseen_WebNLG': {
        'Athlete' : 'Q2066131',
        'Artist' : 'Q483501',
        'CelestialBody' : 'Q6999',
        'MeanOfTransportation' : 'Q334166',
        'Politician' : 'Q82955'
    },
    'Unseen_New': {
        #'ScholarlyArticle': 'Q13442814', This overlaps 100% with WrittenWork
        'Taxon' : 'Q16521', # This overlaps 35% with Food, which is acceptable (?)
        'Street' : 'Q79007',
        'Painting': 'Q3305213',
        'ChemicalCompound': 'Q11173',
        'Mountain': 'Q8502' # Replacing ScholarlyArticle. 
    }
}

In [6]:
#class Wikidata_Class():
#    def __init__(self, label, entity_id):
#        self.label = label
#        self.entity_id = entity_id
#        self.subclasses = []
#        
#    def add_subclass(self, subclass):
#        self.subclasses.append(subclass)
#        
#    def add_subclasses(self, subclasses):
#        self.subclasses = self.subclasses + subclasses

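Class trees are organised as follows. `subclass_trees` maps each root class entity ID to its tree, and every node (root or subclass) has the same three keys:
```
{
    'root_class_entity_id': {
        'label': ...,
        'entity_id': ...,
        'subclasses': {
            'subclass_entity_id': {
                'label': ...,
                'entity_id': ...,
                'subclasses': {...}
            }, ...
        }
    }
}
```
`subclasses` is `None` when a class has no subclasses, and one of the strings `'LOOP'`, `'REPEAT'` or `'IGNORE:<reason>'` when the traversal stopped at that class.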

In [7]:
# Flatten the partitioned `classes` mapping into (label, entity_id) pairs,
# keeping partition order and the order of classes within each partition.
root_classes = [
    (class_label, class_id)
    for part_classes in classes.values()
    for class_label, class_id in part_classes.items()
]
# Filled by the retrieval loop: root class entity ID -> subclass tree.
subclass_trees = {}

In [8]:
from IPython.display import clear_output

# Build one subclass tree per root class and store it under its entity ID.
for root_class_label, root_class_id in root_classes:
    # Replace the previous class's log so only the current traversal is shown.
    clear_output(wait=True)
    print('Retrieving subclass tree for %s' % root_class_label)
    subclass_trees[root_class_id] = get_subclass_tree(
        root_class_id=root_class_id,
        root_class_label=root_class_label,
        verbose=True,
    )
    

Retrieving subclass tree for Mountain
>  Q8502 on level 0
>  -  Q8072 on level 1
>  -  -  Q169358 on level 2
>  -  -  Q190869 on level 2
>  -  -  Q193457 on level 2
>  -  -  -  Q2791919 on level 3
>  -  -  Q212057 on level 2
>  -  -  -  Q723802 on level 3
>  -  -  -  Q995054 on level 3
>  -  -  Q332614 on level 2
>  -  -  Q367004 on level 2
>  -  -  Q478788 on level 2
>  -  -  Q526644 on level 2
>  -  -  Q674775 on level 2
>  -  -  Q771409 on level 2
>  -  -  -  Q943137 on level 3
>  -  -  Q842928 on level 2
>  -  -  Q1197120 on level 2
>  -  -  Q1200524 on level 2
>  -  -  Q1325302 on level 2
>  -  -  Q1330974 on level 2
>  -  -  -  Q20743938 on level 3
>  -  -  Q1491559 on level 2
>  -  -  Q1806785 on level 2
>  -  -  Q2143039 on level 2
>  -  -  Q2398326 on level 2
>  -  -  Q3562720 on level 2
>  -  -  Q3562724 on level 2
>  -  -  Q6151660 on level 2
>  -  -  Q7630766 on level 2
>  -  -  Q21001649 on level 2
>  -  -  Q21009867 on level 2
>  -  -  Q37499745 on level 2
>  -  -  Q10036

In [11]:
# Write the client's cache one last time now that all trees are retrieved.
# NOTE(review): presumably save_entity_cache() only writes once x_queries_passed
# has reached save_every_x_queries, which is why the counter is set to the
# threshold first — confirm in wikidata_utils.
wdAPI.x_queries_passed = wdAPI.save_every_x_queries
wdAPI.save_entity_cache()

In [12]:
# Persist all collected subclass trees as indented JSON.
with open('WebNLG_to_Wikidata_Subclass_Tree_Thr=1000.json','w+') as tree_file:
    json.dump(subclass_trees, tree_file, indent=4)