In [2]:
import pandas as pd
import rltk
import sys
sys.path.append('/lfs1/jupyterhub_data_dir/share/xij/')
from labeler import GAIALabeler

In [3]:
# Load the entity table from the shared HDF5 store and mirror the identifier
# column `e` into an `id` column (records are fetched by that identifier via
# `ds_entity.get_record` later on, so `id` presumably acts as the rltk record key).
df_entity = pd.read_hdf('/lfs1/jupyterhub_data_dir/share/xij/store_data/df_all.h5')
df_entity = df_entity.assign(id=df_entity['e'])


class Entity(rltk.AutoGeneratedRecord):
    """Record type for one row of `df_entity`; adds nothing to the rltk base class."""


ds_entity = rltk.Dataset(
    reader=rltk.DataFrameReader(df_entity),
    record_class=Entity,
)

In [4]:
#### Modify your choice
# Identifier (URI) of the entity to label; edit this value to work on another entity.
chosen_entity = 'http://www.isi.edu/gaia/entities/7af30a95-7a65-4e59-ab12-898ccd82e544'
# Fetch the record for the chosen identifier from the dataset.
entity = ds_entity.get_record(chosen_entity)

In [5]:
def get_target_block(target):
    """Return the set of `e` values of every row whose `target` column equals `target`.

    A falsy `target` (None, empty string, ...) yields an empty set without
    looking at `df_entity`.
    """
    if target:
        same_target = df_entity['target'] == target
        return set(df_entity.loc[same_target, 'e'])
    return set()

In [6]:
def token_a_tuple(tpl):
    """Collect the lower-cased, whitespace-separated tokens of every string in `tpl`.

    Returns an empty set when `tpl` is falsy (None, empty tuple, ...).
    """
    if tpl:
        return {token for text in tpl for token in text.lower().split()}
    return set()

def is_intersection_with(origin):
    """Build a predicate telling whether a tuple of strings shares a token with `origin`.

    `origin` is a set of tokens. The returned callable tokenizes its argument
    with `token_a_tuple` and reports whether the two token sets overlap.
    """
    def shares_token(tpl):
        common = token_a_tuple(tpl) & origin
        return bool(common)

    return shares_token

def get_token_block(tpl):
    """Return the `e` values of every row whose `gt_name` shares a token with `tpl`.

    Args:
        tpl: iterable of name strings (e.g. an entity's `gt_name`); may be
            None or empty.

    Returns:
        set: the matching `e` values; an empty set when `tpl` is falsy.
    """
    if not tpl:
        # Must be a set, not `{}` (a dict): `get_blocks` merges this result
        # with `|=`, which raises TypeError for a dict operand.
        return set()
    wanted_tokens = token_a_tuple(tpl)
    shares_token = df_entity.gt_name.apply(is_intersection_with(wanted_tokens))
    return set(df_entity[shares_token].e)

In [7]:
#### Can add more blocks
def get_blocks(entity):
    """Gather the candidate `e` values for `entity` from all blocking functions.

    The result is the union of the entity's target block and token block,
    with the entity's own `e` removed.
    """
    by_target = get_target_block(entity.target)
    by_name_token = get_token_block(entity.gt_name)
    return (by_target | by_name_token) - {entity.e}

len(get_blocks(entity))

13

In [8]:
# Show the column names available on `df_entity`.
df_entity.columns

Index(['e', 'type', 'name', 'gt_name', 'source', 'target', 'target_type',
       'wikidata', 'wiki_label_en', 'wiki_label_uk', 'gt_wiki_label_uk',
       'wiki_label_ru', 'gt_wiki_label_ru', 'wiki_alias_en', 'wiki_alias_uk',
       'gt_wiki_alias_uk', 'wiki_alias_ru', 'gt_wiki_alias_ru', 'origin',
       'lang', 'all_labels', 'gt_all_labels', 'id'],
      dtype='object')

In [9]:
from googletrans import Translator
translator = Translator()

# Materialise the candidate records in sorted identifier order. `get_blocks`
# returns a set of strings, whose iteration order changes between kernel
# sessions (string hashing is randomised), so sorting keeps the labeling
# order reproducible across Restart & Run All.
candidates = [ds_entity.get_record(uri) for uri in sorted(get_blocks(entity))]

def sep_join_attr(attr, sep=', '):
    """Build a formatter that joins the values of a record attribute with `sep`.

    The returned callable takes a record, reads the attribute named `attr`
    and returns its items joined by `sep`, or '' when the attribute is falsy.
    """
    def render(record):
        values = getattr(record, attr)
        if not values:
            return ''
        return sep.join(values)

    return render

def comma_join_attr(a):
    """Formatter joining the values of attribute `a` with ', '."""
    return sep_join_attr(a)


def line_join_attr(a):
    """Formatter joining the values of attribute `a` with newlines."""
    return sep_join_attr(a, '\n')

def parse_origin(e):
    """Render the `origin` texts of record `e` for display.

    For records whose `lang` is 'ru' or 'uk', every origin text is passed to
    the module-level `translator`; if that fails for any reason, each text is
    rendered as a Google Translate link instead. Records in any other
    language get their origin texts joined verbatim.

    Args:
        e: record exposing `lang` and `origin` (an iterable of strings, or
            None/empty).

    Returns:
        str: newline-joined lines; '' when there is no origin text.
    """
    from html import escape
    from urllib.parse import quote

    if not e.origin:
        # Same convention as the other formatters: a missing value renders as ''.
        return ''
    if e.lang in {'ru', 'uk'}:
        try:
            trans = translator.translate(list(e.origin), src=e.lang)
            return '\n'.join('[Translated]: ' + t.text for t in trans)
        except Exception:
            # Best-effort fallback when translation is unavailable. Catching
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
            # propagate. The text is percent-encoded for the URL and
            # HTML-escaped for the link body so characters such as '&', '"'
            # or '<' cannot break the link.
            link_template = '<a href="https://translate.google.com/#view=home&op=translate&sl={}&tl=en&text={}">{}</a>'
            return '\n'.join(
                link_template.format(e.lang, quote(o, safe=''), escape(o))
                for o in e.origin
            )
    return '\n'.join(e.origin)

def wiki_labels(e):
    """Join the Wikidata labels of record `e` into one comma-separated string.

    Reads `wiki_label_en`, `gt_wiki_label_uk` and `gt_wiki_label_ru` (in that
    order), skips fields that are missing/empty and drops duplicate labels.

    Returns:
        str: the unique labels joined by ', ' in first-seen order; '' if none.
    """
    labels = []
    for group in (e.wiki_label_en, e.gt_wiki_label_uk, e.gt_wiki_label_ru):
        if group:
            labels.extend(group)
    # dict.fromkeys de-duplicates while keeping insertion order; going through
    # a set would give a different label order from one kernel session to the next.
    return ', '.join(dict.fromkeys(labels))

def wiki_alias(e):
    """Join the Wikidata aliases of record `e` into one comma-separated string.

    Reads `wiki_alias_en`, `gt_wiki_alias_ru` and `gt_wiki_alias_uk` (in that
    order), skips fields that are missing/empty and drops duplicate aliases.

    Returns:
        str: the unique aliases joined by ', ' in first-seen order; '' if none.
    """
    aliases = []
    for group in (e.wiki_alias_en, e.gt_wiki_alias_ru, e.gt_wiki_alias_uk):
        if group:
            aliases.extend(group)
    # dict.fromkeys de-duplicates while keeping insertion order; going through
    # a set would give a different alias order from one kernel session to the next.
    return ', '.join(dict.fromkeys(aliases))

#### Can be modified to display more attributes

# Each entry pairs a display label with either the name of a record attribute
# (a string) or a callable that takes the record and returns the text to show.
# NOTE(review): how GAIALabeler consumes these entries is not visible in this
# notebook — confirm against the `labeler` module before adding new kinds of entries.
template = [
    ('URI', 'e'), 
    ('Type', 'type'), 
    ('hasName', comma_join_attr('name')),
    ('TranslatedName', comma_join_attr('gt_name')),
    ('prefLabels', comma_join_attr('all_labels')),
    ('TranslatedPrefLabels', comma_join_attr('gt_all_labels')),
    ('Target', 'target'),
    ('Source', 'source'),
    ('Wikidata', 'wikidata'),
    ('WikiLabels', wiki_labels),
    ('WikiAlias', wiki_alias),
    ('origin', parse_origin),
]

# Create the labeler for the chosen entity and its blocked candidates, then
# render its widget view.
labeler = GAIALabeler(entity, candidates, template)
labeler.build_view()

VBox(children=(HBox(children=(Button(description='Prev', disabled=True, style=ButtonStyle()), Label(value='1/1…

In [10]:
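#### This can be used to load saved ground truth
#### NOTE: `Labeler` is not imported in this notebook (only `GAIALabeler` is) — adjust the class name before uncommenting.
# Labeler.load_ground_truth(ds_entity, ds_entity, 'test.tsv').build_view()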

In [11]:
# Display the clusters currently held by the labeler.
labeler.dump_clusters()

[]

In [12]:
# Parse the saved label file and dump its clusters to 'test.jsonl'
# (presumably one JSON document per line, going by the method name).
GAIALabeler.parse_label_files(['test.tsv']).dump_clusters_json_lines('test.jsonl')
# Parse the same file again and display its clusters inline.
# NOTE(review): 'test.tsv' is parsed twice; if `parse_label_files` has no side
# effects, the parsed result could be kept in a variable and reused — confirm.
GAIALabeler.parse_label_files(['test.tsv']).dump_clusters()

[['http://www.isi.edu/gaia/entities/7af30a95-7a65-4e59-ab12-898ccd82e544',
  'http://www.isi.edu/gaia/entities/9785ff9d-309b-49eb-a39a-27444c9f2e04',
  'http://www.isi.edu/gaia/entities/b1c18fbb-3620-4d1e-894c-62e899551502',
  'http://www.isi.edu/gaia/entities/c5eb010e-4e7c-4489-a1c0-2fd5a76dca6f']]