In [74]:
import json

from tool.annotations_utils import read_annotations, has_intersection

# Set title
TITLE = 'Nad_Niemnem'
DIR_PATH = f'..\\experiments\\polish'

In [75]:
# useful for label studio setup
def create_choices_for_label_studio(filepath):
    choices = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file.readlines():
            choices.append(f'<Choice value="{line.strip()}"/>')
    return ' '.join(choices)


create_choices_for_label_studio(f'..\\data\\lists_of_characters\\{TITLE}')

'<Choice value="Justyna Orzelska"/> <Choice value="Jan Bohatyrowicz"/> <Choice value="Benedykt Korczyński"/> <Choice value="Anzelm Bohatyrowicz"/> <Choice value="Witold Korczyński"/> <Choice value="Marta Korczyńska"/> <Choice value="Emilia Korczyńska"/> <Choice value="Zygmunt Korczyński"/> <Choice value="Teofil Różyc"/> <Choice value="Andrzejowa Korczyńska"/> <Choice value="Klotylda Korczyńska"/> <Choice value="Bolesław Kirło"/> <Choice value="Maria Kirłowa"/> <Choice value="Jadwiga Domuntówna"/> <Choice value="Teresa Plińska"/> <Choice value="Bohatyrowiczowie"/>'

In [76]:
def prepare_single_annotation(annotation_id, annotation_content):
    return {'id': annotation_id + 1,
            'data': {'text': annotation_content},
            'annotations': [
                {'model_version': 'v1', 'result': []}]}


def convert_to_label_studio_entity(entity, annotation, version=0):
    if version == 0:
        return {
                    'from_name': 'label',
                    'to_name': 'text',
                    'type': 'labels',
                    'id': str(hash(str(annotation)+str(entity))),
                    'value': {
                        'start': entity[0],
                        'end': entity[1],
                        'text': annotation['content'][entity[0]:entity[1]],
                        'labels': ['PERSON']
                    }
                }
    if version == 1:
        return {
                    'value': {
                        'start': entity[0],
                        'end': entity[1],
                        'text': annotation['content'][entity[0]:entity[1]],
                        'choices': [entity[2]]
                    },
                    'id': str(hash(str(annotation)+str(entity))),
                    'from_name': 'relevance',
                    'to_name': 'text',
                    'type': 'choices'
            }


def convert_to_label_studio_format(raw_annotations, output_path=None):
    new_annotations = []

    for anno_id, anno in enumerate(raw_annotations):
        single_annotation = prepare_single_annotation(anno_id, anno['content'])

        for entity in anno['entities']:
            new_entity_v0 = convert_to_label_studio_entity(entity, anno, 0)
            new_entity_v1 = convert_to_label_studio_entity(entity, anno, 1)

            single_annotation['annotations'][0]['result'].append(new_entity_v0)
            single_annotation['annotations'][0]['result'].append(new_entity_v1)

        new_annotations.append(single_annotation)

    if output_path:
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(new_annotations).encode('utf8').decode())
    return new_annotations

In [77]:
annotations = read_annotations(f'{DIR_PATH}\\{TITLE}.json')

# import this file to label studio
convert_to_label_studio_format(
    annotations,
    f'..\\data\\labelstudio_annotations\\labelstudio_input\\{TITLE}.json')

[{'id': 1,
  'data': {'text': 'Pani Emilia z niezwykłą u niej żywością poruszyła się na szezlongu i wołać zaczęła:'},
  'annotations': [{'model_version': 'v1',
    'result': [{'from_name': 'label',
      'to_name': 'text',
      'type': 'labels',
      'id': '1670179494280263763',
      'value': {'start': 5,
       'end': 11,
       'text': 'Emilia',
       'labels': ['PERSON']}},
     {'value': {'start': 5,
       'end': 11,
       'text': 'Emilia',
       'choices': ['Emilia Korczyńska']},
      'id': '1670179494280263763',
      'from_name': 'relevance',
      'to_name': 'text',
      'type': 'choices'}]}]},
 {'id': 2,
  'data': {'text': '— Panie Bolesławie! Proszę Tereni nie dokuczać! Niech pan jej nie dręczy! Ją dziś zęby bolą.'},
  'annotations': [{'model_version': 'v1',
    'result': [{'from_name': 'label',
      'to_name': 'text',
      'type': 'labels',
      'id': '-5943501247716871648',
      'value': {'start': 8,
       'end': 18,
       'text': 'Bolesławie',
       'labels

In [78]:
def merge_annotations(anno_person, anno_hero):
    merged_annotations = []
    for p in anno_person:
        found = False

        for h in anno_hero:
            if p['start'] == h['start'] and p['end'] == h['end']:
                merged_annotations.append(
                    [p['start'], p['end'], h['choices'][0]])
                found = True
                break

        if not found:
            merged_annotations.append([p['start'], p['end'], 'PERSON'])
    return merged_annotations


def convert_from_label_studio(filepath, save_path=None):
    with open(filepath, encoding='utf-8') as f:
        data = json.loads(f.read())

    corrected_annotations = []

    min_index = min([x['id'] for x in data])
    max_index = max([x['id'] for x in data])
    for i in range(min_index, max_index + 1):
        row = [x for x in data if x['id'] == i][0]
        text = row['data']['text']
        anno_person = [a['value'] for a in row['annotations'][0]['result']
                       if a['type'] == 'labels']
        anno_hero = [a['value'] for a in row['annotations'][0]['result']
                     if a['type'] == 'choices']
        annotation = merge_annotations(anno_person, anno_hero)
        corrected_annotations.append({'content': text, 'entities': annotation})

    if save_path:
        with open(save_path, 'w') as f:
            f.write(json.dumps(corrected_annotations).encode('utf-8').decode())
    return corrected_annotations

In [79]:
converted_annotations = convert_from_label_studio(
    # this file is the label studio output (export)
    filepath=f'..\\data\\labelstudio_annotations\\labelstudio_output\\{TITLE}.json',
    save_path=f'..\\data\\testing_sets\\test_polish_gold_standard\\{TITLE}.json')

# original tagger annotations
pre_annotations = read_annotations(f'{DIR_PATH}\\{TITLE}.json')

In [80]:
gold_count = 0
pred_count = 0
exact_count = 0
wrong_hero_count = 0
intersections_count = 0
incorrect_count = 0
missing_count = 0

for gold, pred in zip(converted_annotations, pre_annotations):
    gold_entities = gold['entities']
    pred_entities = pred['entities']

    gold_count += len(gold_entities)
    pred_count += len(pred_entities)

    matched_count = 0
    for entity1 in gold_entities:
        exact = False
        intersection = False

        for entity2 in pred_entities:
            if entity1 == entity2:
                exact_count += 1
                exact = True
                matched_count += 1
                break

        if not exact:
            for entity2 in pred_entities:
                gold_content1 = gold['content'][entity1[0]:entity1[1]]
                gold_content2 = gold['content'][entity2[0]:entity2[1]]
                if entity1[:2] == entity2[:2]:
                    print('Wrong hero', entity1, entity2,
                          gold_content1, gold_content2)
                    wrong_hero_count += 1
                    matched_count += 1
                    intersection = True

                elif has_intersection(entity1, entity2):
                    print('Intersection', entity1, entity2,
                          gold_content1, gold_content2)
                    intersections_count += 1
                    intersection = True
                    matched_count += 1
                    break

        if not exact and not intersection:
            missing_count += 1

    incorrect_count += len(pred_entities) - matched_count

    assert gold_count + incorrect_count == pred_count + missing_count
    assert gold_count - exact_count - intersections_count - wrong_hero_count \
           == missing_count

Wrong hero [27, 33, 'Teresa Plińska'] [27, 33, 'PERSON'] Tereni Tereni
Wrong hero [16, 24, 'PERSON'] [16, 24, 'Justyna Orzelska'] Orzelski Orzelski
Wrong hero [20, 25, 'Bolesław Kirło'] [20, 25, 'PERSON'] Kirle Kirle
Wrong hero [2, 9, 'Teresa Plińska'] [2, 9, 'PERSON'] Tereniu Tereniu
Wrong hero [128, 135, 'PERSON'] [128, 135, 'Andrzejowa Korczyńska'] Andrzej Andrzej
Wrong hero [50, 58, 'PERSON'] [50, 58, 'Andrzejowa Korczyńska'] Andrzeja Andrzeja


In [81]:
print('pre-annotations =', pred_count)
print('final =', gold_count)
print('corrected =', exact_count)
print('missing =', missing_count)
print('wrong hero assigned =', wrong_hero_count)
print('wrong boundaries =', intersections_count)
print('completely wrong =', incorrect_count)

pre-annotations = 85
final = 80
corrected = 69
missing = 5
wrong hero assigned = 6
wrong boundaries = 0
completely wrong = 10


In [82]:
counts = [TITLE.replace('_', ' '), pred_count, gold_count, exact_count,
          missing_count, wrong_hero_count, intersections_count, incorrect_count]
print(' & '.join([str(x) for x in counts]) + '\\')

Nad Niemnem & 85 & 80 & 69 & 5 & 6 & 0 & 10\
