In [4]:
import numpy as np 
import pandas as pd
import networkx as nx
import logging, sys

from headParsing import find_head
from iteration_utilities import duplicates, unique_everseen

In [2]:
# Send every record (DEBUG and above) to categories.log, appending across runs.
# To watch the messages live instead, point basicConfig at a stream such as
# sys.stderr rather than a file.
logging.basicConfig(
    level=logging.DEBUG,
    filename='categories.log',
    filemode='a',
    datefmt='%H:%M:%S',
    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
)

# Marker line so that each session is easy to spot in the log file.
logging.info("Label querying")

In [3]:
# Inputs
# The directory and the dump date are shared by every input file, so they are
# defined once; switching to another dump or location is a one-line change.
DATA_DIR = '/scratch/WikipediaImagesTaxonomy'
DUMP_DATE = '20220220'
# Parquet file with the category network (loaded by Taxonomy.load_categories).
CATEGORIES_PATH = f'{DATA_DIR}/commonswiki-{DUMP_DATE}-category-network.parquet'
# Parquet file with the files table of the same dump.
FILES_PATH = f'{DATA_DIR}/commonswiki-{DUMP_DATE}-files.parquet'

In [40]:
class Taxonomy:
    '''
    Category graph with label propagation from a set of seed categories.

    The graph is a networkx DiGraph whose nodes are category titles. It is
    built from a {title: parents} mapping, so the neighbors of a node are
    treated as its parent categories. Seed categories receive their labels in
    set_taxonomy(); get_label() then derives the labels of any other category
    by querying the graph.
    '''

    def __init__(self, G=None):
        '''
        Create a taxonomy, optionally wrapping an already built category graph.

        Params:
            G (networkx.DiGraph or None): category graph to reuse. When None,
                call load_categories() or build_category_graph() before
                using any other method.
        '''
        # Store G as given instead of testing its truthiness: a networkx graph
        # without nodes evaluates to False and would be silently discarded.
        self.G = G
        self.taxonomy = None
        self.visited_nodes = 0

    def load_categories(self, path):
        '''
        Load categories from path and build the category graph.

        Params:
            path: location of a parquet file readable by pandas.read_parquet.
        '''
        self.build_category_graph(pd.read_parquet(path))

    def build_category_graph(self, categories):
        '''
        Build the category graph, starting from the DataFrame extracted by processing dumps.

        Params:
            categories (pandas.DataFrame): must provide the columns `title`,
                `parents`, `id` and `hiddencat`.

        Each node listed in the DataFrame gets the attributes `id` and
        `hiddencat`; each node with a path to 'CommonsRoot' also gets `depth`.
        '''
        categories = categories.set_index('title')
        # Build DiGraph from the adjacency mapping {title: parents}
        G = nx.DiGraph(categories.parents.to_dict())
        nx.set_node_attributes(G, dict(zip(categories.index, categories[['id', 'hiddencat']].to_dict(orient='records'))))
        # depth counts the nodes on the shortest path to the root, so the root
        # itself has depth 1.
        depth = {node: len(sps) for node, sps in nx.shortest_path(G, target='CommonsRoot').items()}
        nx.set_node_attributes(G, depth, name='depth')
        self.G = G

    def reset_labels(self):
        '''
        Reset labels and discovery status for each node.
        '''
        nx.set_node_attributes(self.G, {node: {'visited': False, 'labels': set()} for node in self.G.nodes})
        self.visited_nodes = 0

    def set_taxonomy(self, taxonomy):
        '''
        Set an ORES-like taxonomy, mapping labels to high-level categories.

        Params:
            taxonomy (dict): {label: iterable of category titles}. Every title
                must be a node of the graph, otherwise a KeyError is raised.

        All labels and visited flags are reset before the seeds are applied.
        '''
        self.taxonomy = taxonomy
        self.reset_labels()
        for label, categories in taxonomy.items():
            for category in categories:
                node = self.G.nodes[category]
                # A category listed under several labels is still one node:
                # count it only the first time it is seeded.
                if not node['visited']:
                    node['visited'] = True
                    self.visited_nodes += 1
                node['labels'].add(label)

    def get_head(self, category):
        '''
        Get or compute the lexical head of a given category.

        The head is computed by find_head() on first use and cached in the
        node attribute `head`.
        '''
        node = self.G.nodes[category]
        if 'head' not in node:
            node['head'] = find_head(category)
        return node['head']

    @staticmethod
    def _common_lexical_heads(heads):
        '''
        Return the heads that occur more than once, shortening them if needed.

        While no head is shared, the first word is removed from the longest
        heads and the comparison is repeated. The search stops when a shared
        head appears or when no head has more than one word left.
        '''
        heads = list(heads)
        while True:
            common_heads = list(unique_everseen(duplicates(heads)))
            if common_heads:
                return common_heads

            # `<= 1` (rather than `== 1`) also stops on empty heads, which
            # cannot be shortened any further.
            longest = max((len(head.split()) for head in heads), default=0)
            if longest <= 1:
                return common_heads

            # Remove 1 word from the longest composite heads
            for i, head in enumerate(heads):
                head_words = head.split()
                if len(head_words) == longest:
                    # NOTE(review): str.capitalize() also lower-cases every
                    # character after the first ('of the United States' ->
                    # 'Of the united states'), while heads are later matched
                    # against node names case-sensitively - confirm intended.
                    heads[i] = ' '.join(head_words[1:]).capitalize()

    def get_label(self, category, how='heuristics'):
        '''
        Get the label corresponding to a specific category, passed as string.

        Requires set_taxonomy() (or reset_labels()) to have been called, since
        it reads the `visited` and `labels` node attributes.

        Params:
            category (string): title of the category; must be a graph node.
            how (string): decision scheme to recursively query parents.
                all: all parents are queried
                naive: only the first parent is queried
                heuristics: decision based on the set of heuristics described in ??

        Returns:
            The set of labels of the category. Except for hidden categories
            (which yield a fresh empty set), this is the set stored on the
            node itself, not a copy.

        Raises:
            ValueError: if `how` is not one of the schemes above and the
                category has not been visited yet.
        '''
        assert isinstance(category, str)
        node = self.G.nodes[category]

        if node['visited']:
            logging.debug('Found ' + category + ' with label ' + str(node['labels']))
            return node['labels']

        # Validate before touching any state, so that a bad option does not
        # leave the node marked as visited with no labels.
        if how not in ('all', 'naive', 'heuristics'):
            raise ValueError('Invalid "how" option')

        # Mark the node before recursing: a cycle then ends on the early
        # return above instead of recursing forever.
        node['visited'] = True
        self.visited_nodes += 1
        logging.debug(str(self.visited_nodes) + ' - Searching for ' + category + '...')

        if how == 'all':
            for parent in self.G.neighbors(category):
                # Propagate the scheme, otherwise parents fall back to the default one.
                node['labels'].update(self.get_label(parent, how))
            return node['labels']

        if how == 'naive':
            first_parent = next(iter(self.G.neighbors(category)), None)
            if first_parent is not None:
                node['labels'].update(self.get_label(first_parent, how))
            return node['labels']

        # how == 'heuristics'
        # 1. Hidden category (nodes without the attribute are treated as not hidden)
        if node.get('hiddencat'):
            return set()

        # 2. Lexical head
        heads = [self.get_head(category)]
        for parent in self.G.neighbors(category):
            heads.append(self.get_head(parent))

        # Try to match over complete lexical heads or subsets
        common_heads = self._common_lexical_heads(heads)

        logging.debug('\tFound lexical heads: ' + str(common_heads))
        for common_head in common_heads:
            if common_head in self.G:
                node['labels'].update(self.get_label(common_head, how))
            else:
                logging.debug('Lexical head ' + str(common_head) + ' not found')

        # Will be empty if no common_head is found, if the common_heads are
        # all not valid category names, hidden categories or already visited
        # (including the current category)
        # TODO: 3. is_a or subcategory_of (temp: depth check) - not implemented
        # yet, so the labels gathered so far are returned either way.
        return node['labels']


            

In [75]:
# Create an empty Taxonomy, then read the parquet file at CATEGORIES_PATH and
# build the category graph from it.
taxonomy = Taxonomy()
taxonomy.load_categories(CATEGORIES_PATH)

In [76]:
# Seed taxonomy: each label maps to the high-level categories that carry it.
content_dict = {
    'Nature': [
        'Animalia', 'Fossils', 'Landscapes', 'Marine organisms', 'Plantae', 'Weather',
    ],
    'Society/Culture': [
        'Art', 'Belief', 'Entertainment', 'Events', 'Flags',
        'Food', 'History', 'Language', 'Literature', 'Music',
        'Objects', 'People', 'Places', 'Politics', 'Sports',
    ],
    'Science': [
        'Astronomy', 'Biology', 'Chemistry', 'Earth sciences',
        'Mathematics', 'Medicine', 'Physics', 'Technology',
    ],
    'Engineering': [
        'Architecture', 'Chemical engineering', 'Civil engineering',
        'Electrical engineering', 'Environmental engineering',
        'Geophysical engineering', 'Mechanical engineering', 'Process engineering',
    ],
}

In [77]:
# Reset the labels and visited flags of every node, then mark the categories
# listed in content_dict as visited and attach their labels.
taxonomy.set_taxonomy(content_dict)

In [None]:
# Example query; `how` is left at its default, 'heuristics'.
taxonomy.get_label('Stellar astronomy')

In [None]:
# Second example query, again with the default 'heuristics' scheme.
taxonomy.get_label('Astronomy by city')

In [3]:
# Sample category title; nothing in the cells shown here reads it yet.
cat = 'Comedy films of the United States'