In [1]:
import numpy as np 
import pandas as pd
import networkx as nx

import logging, sys
# Configure the root logger to write to stderr at DEBUG level, so that the
# logging.debug() search trace emitted by Taxonomy.get_label is displayed.
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

In [2]:
# Inputs
# Directory holding the input parquet files and the dump identifier shared by
# their file names; edit these two values to switch machine or dump.
DATA_DIR = '/scratch/WikipediaImagesTaxonomy'
DUMP_NAME = 'commonswiki-20220220'

# Category network table (read below with pd.read_parquet).
CATEGORIES_PATH = DATA_DIR + '/' + DUMP_NAME + '-category-network.parquet'
# Files table from the same dump.
FILES_PATH = DATA_DIR + '/' + DUMP_NAME + '-files.parquet'

In [12]:
class Taxonomy:
    '''
    Assign ORES-like labels to categories by propagating them through a category graph.

    ``self.G`` is a directed graph whose edges point from a category to each of its
    parents. After ``reset_labels`` / ``set_taxonomy`` every node carries two
    working attributes:

    * ``visited`` -- True once the labels of the node have been (or are being) computed;
    * ``labels``  -- list of the labels collected for the node so far.
    '''

    def __init__(self, G=None):
        '''
        Create a taxonomy, optionally around an already built category graph.

        G: directed graph to use. When omitted, ``self.G`` is None until
           ``build_category_graph`` is called.
        '''
        # Store the graph unconditionally instead of testing its truthiness:
        # a graph without nodes has length 0 and would otherwise be dropped.
        self.G = G

    def build_category_graph(self, categories):
        '''
        Build the category graph, starting from the DataFrame extracted by processing dumps.

        categories: DataFrame with (at least) the columns ``title``, ``parents``,
                    ``id`` and ``hiddencat``; ``parents`` holds, for each row, the
                    titles of the parent categories.

        The result is stored in ``self.G``; nothing is returned.
        '''
        categories = categories.set_index('title')
        # Build DiGraph from the adjacency mapping title -> parents, i.e. every
        # edge goes from a category to one of its parents.
        G = nx.DiGraph(categories.parents.to_dict())
        # Attach id / hiddencat to the nodes that have a row in the table. Nodes that
        # only appear inside some `parents` list get no such attributes.
        nx.set_node_attributes(G, dict(zip(categories.index, categories[['id', 'hiddencat']].to_dict(orient='records'))))
        self.G = G

    def reset_labels(self):
        '''
        Reset labels and discovery status for each node.
        '''
        # Update the attribute dicts in place rather than materialising an
        # intermediate {node: {...}} mapping for the whole graph.
        for node in self.G.nodes:
            attributes = self.G.nodes[node]
            attributes['visited'] = False
            attributes['labels'] = []  # a fresh list per node, never shared

    def set_taxonomy(self, taxonomy):
        '''
        Set an ORES-like taxonomy, mapping labels to high-level categories.

        taxonomy: dict mapping each label to the list of categories that carry it.

        Raises KeyError if a listed category is not a node of the graph. The check
        runs before anything is modified, so a failure leaves the previous taxonomy
        and the cached labels untouched.
        '''
        missing = [category
                   for categories in taxonomy.values()
                   for category in categories
                   if category not in self.G.nodes]
        if missing:
            raise KeyError('Taxonomy categories missing from the graph: %r' % (missing,))

        self.taxonomy = taxonomy
        self.reset_labels()
        for label, categories in taxonomy.items():
            for category in categories:
                attributes = self.G.nodes[category]
                # Seed nodes count as already resolved: searches stop on them.
                attributes['visited'] = True
                if label not in attributes['labels']:
                    attributes['labels'].append(label)

    @staticmethod
    def _merge_labels(target, source):
        '''Append to `target`, in order, every label of `source` it does not contain yet.'''
        for label in source:
            if label not in target:
                target.append(label)

    def get_label(self, category):
        '''
        Get the label corresponding to a specific category, passed as string.

        The labels of a category are the union of the labels of its parents,
        resolved depth-first and cached on the nodes; the search stops at nodes
        that are already visited (in particular the seeds of ``set_taxonomy``).

        Returns a new list of labels, without duplicates, in discovery order.
        Raises KeyError if `category` is not in the graph or if the labels were
        never initialised (``set_taxonomy`` / ``reset_labels`` not called).

        Known limitation: a node is marked visited before its parents are explored,
        so when the graph contains a cycle, a node reached again through that cycle
        contributes only the labels gathered so far (possibly none), and the cached
        result of the nodes involved depends on the order of the queries.
        '''
        assert isinstance(category, str), 'category must be a string'

        nodes = self.G.nodes
        root = nodes[category]
        if root['visited']:
            logging.debug('Found %s with label %s', category, root['labels'])
            # Hand out a copy so callers cannot alter the cached labels.
            return list(root['labels'])

        # Depth-first search with an explicit stack of (node, iterator over its
        # parents) instead of recursion, so long ancestor chains cannot exhaust
        # the interpreter recursion limit.
        root['visited'] = True
        logging.debug('Searching for %s...', category)
        stack = [(category, iter(self.G.neighbors(category)))]
        while stack:
            current, parents = stack[-1]
            descended = False
            for parent in parents:
                parent_attributes = nodes[parent]
                if parent_attributes['visited']:
                    logging.debug('Found %s with label %s', parent, parent_attributes['labels'])
                    self._merge_labels(nodes[current]['labels'], parent_attributes['labels'])
                else:
                    parent_attributes['visited'] = True
                    logging.debug('Searching for %s...', parent)
                    stack.append((parent, iter(self.G.neighbors(parent))))
                    descended = True
                    break  # resolve this parent first, then resume the iterator
            if not descended:
                # Every parent of `current` is resolved: pass its labels down to
                # the node that was waiting for it.
                stack.pop()
                if stack:
                    self._merge_labels(nodes[stack[-1][0]]['labels'], nodes[current]['labels'])

        return list(root['labels'])


In [5]:
# Load the category table from the parquet file at CATEGORIES_PATH
# and preview its first rows.
categories = pd.read_parquet(CATEGORIES_PATH)
categories.head()

Unnamed: 0,id,title,parents,hiddencat,childs
0,89434922,"""Azərişıq"" ASC",[Energy in Azerbaijan],False,"[Vugar Ahmadov, Ilham Aliyev attended opening ..."
1,59340547,"""Flores de María""","[Folk festivals in the Philippines, Flores de ...",False,"[Dapit (May, 2017)]"
2,29072664,"'s-Gravenhof, Zutphen",[Streets in Zutphen],False,"[St Walburgis Church (Zutphen), Kuiperstraat 1..."
3,2145396,'s-Heerenberg,"[Montferland, Populated places in Gelderland, ...",False,"[Patrick Beverloo, Rijksmonumenten in 's-Heere..."
4,114888133,'s-Hertogenbosch in art,"['s-Hertogenbosch, Cities in the Netherlands i...",False,"[Old maps of 's-Hertogenbosch, Prints and draw..."


In [6]:
# (number of rows, number of columns) of the category table.
categories.shape

(4048538, 5)

In [13]:
# Create an empty Taxonomy and build its category graph from the loaded table.
taxonomy = Taxonomy()
taxonomy.build_category_graph(categories)

In [14]:
# One-line summary of the graph (type, number of nodes and edges).
# str(G) replaces nx.info(G), which is deprecated and removed in NetworkX 3.0.
str(taxonomy.G)


  nx.info(taxonomy.G)


'DiGraph with 4097584 nodes and 9923900 edges'

In [15]:
# Top-level labels mapped to the categories that carry them
# (the mapping passed to Taxonomy.set_taxonomy).
content_dict = {
    'Nature': [
        'Animalia', 'Fossils', 'Landscapes',
        'Marine organisms', 'Plantae', 'Weather',
    ],
    'Society/Culture': [
        'Art', 'Belief', 'Entertainment', 'Events', 'Flags',
        'Food', 'History', 'Language', 'Literature', 'Music',
        'Objects', 'People', 'Places', 'Politics', 'Sports',
    ],
    'Science': [
        'Astronomy', 'Biology', 'Chemistry', 'Earth sciences',
        'Mathematics', 'Medicine', 'Physics', 'Technology',
    ],
    'Engineering': [
        'Architecture',
        'Chemical engineering',
        'Civil engineering',
        'Electrical engineering',
        'Environmental engineering',
        'Geophysical engineering',
        'Mechanical engineering',
        'Process engineering',
    ],
}

In [16]:
# Reset every node, then mark the categories listed in content_dict with their labels.
taxonomy.set_taxonomy(content_dict)

In [17]:
# Look up the labels of a sample category; the search is traced through logging.debug.
taxonomy.get_label('Animalia fossils')

DEBUG:root:Searching for Animalia fossils...
DEBUG:root:Searching for Fossils by classification...
DEBUG:root:Found Fossils with label ['Nature']
DEBUG:root:Searching for Biogeology...
DEBUG:root:Searching for Subfields of geology...
DEBUG:root:Searching for Geology...
DEBUG:root:Searching for Subfields of earth sciences...
DEBUG:root:Found Earth sciences with label ['Science']
DEBUG:root:Searching for Subfields by academic discipline...
DEBUG:root:Searching for Academic disciplines...
DEBUG:root:Searching for Academia...
DEBUG:root:Searching for Knowledge...
DEBUG:root:Searching for Concepts...
DEBUG:root:Searching for Topics...
DEBUG:root:Searching for CommonsRoot...
DEBUG:root:Searching for Education...
DEBUG:root:Searching for Culture...
DEBUG:root:Found Topics with label []
DEBUG:root:Searching for Service industries...
DEBUG:root:Searching for Services...
DEBUG:root:Searching for Economy...
DEBUG:root:Searching for Society...
DEBUG:root:Found Topics with label []
DEBUG:root:Searc

In [None]:
# Look up the labels of a second sample category.
taxonomy.get_label('Automobiles with open trunks')