# Workflow for updating the benchmark

This workflow adds texts to the benchmark.

## I. Initial split into subpopulations

The entire recall set consists of all geographic named entities for which the last word is among [the predefined suffix_words](geo_terms.txt). 


|Subpopulation     | Description | Examples |
|:--- |:---|:---|
|Levinumad         | Geographic locations with most frequent suffixes | Niiluse jõgi, Aasovi meri, Peipsi järv   |  
|Mitmetähenduslikud| Geographic locations with ambiguous suffixes     | Panama kanal, Panga pank, Kura kurk      |
|Ülejäänud         | Other geographic locations                       | Vaikne ookean, Liivi laht, Tehvandi mägi |   

## II. Collect and merge annotations. Check consistency with the benchmark setup

* Collect manually labelled geo_terms (judgements about whether a term belongs to NE or not);
* Collect manually re-annotated geo_term phrases (combinations of automatic NE predictions and manual fixes to get full phrases);
* Merge manual annotations into recall_sets CSV files;
* Validate that outcome CSV files are readable;
* Check that there are no invalid or duplicate annotations.

In [1]:
import json
from helper_functions import load_term_subpopulations
# Load the subpopulation definitions once; the keys of this mapping are the
# subpopulation names that the following cells iterate over.
term_subpopulations = load_term_subpopulations()
# Show the available subpopulation names
term_subpopulations.keys()

dict_keys(['levinumad', 'mitmetahenduslikud', 'ulejaanud'])

In [2]:
# 1st annotation phase: validate the manually labelled geo_terms of every
# subpopulation and keep the ones that were judged positively.
annotated_terms = {}
for subpopulation in term_subpopulations.keys():
    print(f'Validating {subpopulation!r} ...')
    filename = f'labelled_data/substringtagger_labels/koond_1000_{subpopulation}_labelled.json'
    # the file must be readable and must parse as JSON
    with open(filename,'r',encoding='UTF-8') as in_f:
        data = json.load(in_f)

    for task in data:
        annotations = task['annotations']
        # this setup has exactly one annotation per text
        assert len(annotations) == 1
        annotation = annotations[0]
        # somebody must have completed the annotation
        assert annotation['completed_by'] > 0
        # the last result item carries the yes/no judgement
        assert annotation['result'][-1]['value']['choices'][0] in ['yes','no']
    print('OK')

    # Collect annotations (only positive cases): each recall set entry
    # consists of the raw sentence and the tagged 1-word span.
    annotated_terms.setdefault(subpopulation, []).extend(
        [task['data']['text'], task['annotations'][0]['result'][0]['value']]
        for task in data
        if task['annotations'][0]['result'][-1]['value']['choices'][0] == 'yes'
    )

Validating 'levinumad' ...
OK
Validating 'mitmetahenduslikud' ...
OK
Validating 'ulejaanud' ...
OK


In [3]:
# We only have terms, which need to be extended to NE phrases.
# Each entry is a [sentence text, tagged term span] pair.
annotated_terms['levinumad'][:3]

[['Kutsume kõiki huvilisi 17. mail kell 11 Hurmi järve äärde külavisiooni talgutele !',
  {'start': 46, 'end': 51, 'text': 'järve', 'labels': ['v172_geo_terms']}],
 ['14. dets. 1936. a. Peipsi järvelt 7 kalurit...milline relvastus meie piirivalvuritel , kas on kordonites kuulipildujaid , kuidas on ülemuste nimed ja aukraadid .',
  {'start': 26, 'end': 33, 'text': 'järvelt', 'labels': ['v172_geo_terms']}],
 ['Ta teenis Aegna saarel ning mängis sõjaväe orkestris .',
  {'start': 16, 'end': 22, 'text': 'saarel', 'labels': ['v172_geo_terms']}]]

In [4]:
# 2nd annotation phase: load the manually relabelled geo_term phrases and
# check them against the positive labellings of the 1st phase.
reannotated_phrases = {}
for subpopulation in term_subpopulations.keys():
    print(f'Collecting reannotations of {subpopulation!r} ...')
    relabelled_path = f'labelled_data/ner_labels/koond_1000_{subpopulation}_truelabelled.json'
    with open(relabelled_path, 'r', encoding='UTF-8') as in_f:
        relabelled = json.load(in_f)
    # Every positively labelled sample of the 1st phase must have been
    # relabelled, so both collections need to have the same size.
    expected_count = len(annotated_terms[subpopulation])
    assert len(relabelled) == expected_count
    reannotated_phrases[subpopulation] = relabelled

Collecting reannotations of 'levinumad' ...
Collecting reannotations of 'mitmetahenduslikud' ...
Collecting reannotations of 'ulejaanud' ...


In [5]:
# Merge manual annotations
import csv
import os, os.path

# Directory that receives the merged recall set CSV files (created if missing)
output_dir = 'labelled_data/recall_sets'
os.makedirs(output_dir, exist_ok=True)

# Manually ruled out location phrases.
# Maps a sentence text to the phrase texts that must not be accepted for that
# sentence when they come from an automatic prediction.
exceptions = {'Ka Pärnu jõkke Reiu jõe suudmealal lasti 120 000 maimu .' : ['Pärnu jõkke Reiu jõe'], 
              "Mootorpaat nelja mehega pardal väljus ööl vastu teisipäeva kella kahe ajal Ruhnu saarelt merele , et söita Pärnusse ." : ['Ruhnu saarelt merele'], 
              'Põhja-Tallinnas Stroomi randa ning Haaberstis Harku järve ja Kakumäe randadesse .' : ['Haaberstis Harku järve'], 
              'Kümneid turiste viidi haiglasse ja vähemalt üks hukkus Kreekas Chalkidiki poolsaare kuurortide ümbruses möllavas hiigelmetsapõlengus .' : ['Kreekas Chalkidiki poolsaare'], 
              'Kaks aastat pärast seda alustas Hawaiil tööd USA Vaikse ookeani hoiatuskeskus .': ['USA Vaikse ookeani'] }

# Build the recall set rows of every subpopulation by joining the 1st phase
# term annotations with the 2nd phase phrase annotations on the sentence text.
subpopulation_csv_data = {}
for subpopulation in term_subpopulations.keys():
    print(f'Merging {subpopulation!r} labelling ...')
    csv_data = []
    item_id = 0
    for first_annotations in annotated_terms[subpopulation]:
        sentence_str = first_annotations[0]
        for reannotation in reannotated_phrases[subpopulation]:
            if sentence_str != reannotation['data']['text']:
                continue
            # Matching sentence found; only this first match is processed.
            term_start = first_annotations[1]['start']
            term_end   = first_annotations[1]['end']
            for span in reannotation['annotations'][0]['result']:
                span_info = span['value']
                # Keep only NE phrases/spans that contain the manually
                # checked geo_term.
                if not ('start' in span_info and
                        span_info['start'] <= term_start and
                        span_info['end'] >= term_end):
                    continue
                origin = span['origin']
                if origin == 'manual':
                    # The automatic NE tool did not detect the phrase, so an
                    # annotator marked it by hand.
                    csv_data.append({
                        '': item_id,
                        'text': sentence_str,
                        'span': span_info,
                        'correct': 'no',
                    })
                    item_id += 1
                elif origin == 'prediction':
                    # An automatically predicted NE phrase covers the
                    # manually checked geo_term. Such phrases are accepted
                    # unless:
                    # 1) the automatic label is 'ORG';
                    # 2) the phrase is one of the hardcoded exceptions
                    #    (phrases longer than 2 words that contain
                    #    mistakenly joined NE-s).
                    #
                    # Note: the exceptions were originally removed by hand
                    # from the output csv file. That manual pass has to be
                    # redone from scratch whenever the dataset is updated
                    # with new samples; the hardcoded table only makes the
                    # current result repeatable.
                    is_exception = (sentence_str in exceptions and
                                    span_info['text'] in exceptions[sentence_str])
                    if span_info['labels'] != ['ORG'] and not is_exception:
                        csv_data.append({
                            '': item_id,
                            'text': sentence_str,
                            'span': span_info,
                            'correct': 'yes',
                        })
                    # The id advances for rejected predictions as well, so
                    # the ids of the written rows can have gaps.
                    item_id += 1
            break
        else:
            raise Exception(f'Sentence {sentence_str!r} was not found in re-annotated phrases of {subpopulation!r}.')
    subpopulation_csv_data[subpopulation] = csv_data

Merging 'levinumad' labelling ...
Merging 'mitmetahenduslikud' labelling ...
Merging 'ulejaanud' labelling ...


In [6]:
# Save the merged rows of every subpopulation as a separate CSV file
for subpopulation in term_subpopulations.keys():
    print(f'Saving recall_sets of {subpopulation!r} ...')
    csv_data = subpopulation_csv_data[subpopulation]
    output_path = os.path.join(output_dir, f'koond_1000_{subpopulation}.csv')
    with open(output_path, 'w', encoding='utf-8', newline='') as out_f:
        # the first column (row id) has an empty header
        writer = csv.DictWriter(out_f,
                                fieldnames=['', 'text', 'span', 'correct'],
                                delimiter=',')
        writer.writeheader()
        writer.writerows(csv_data)

Saving recall_sets of 'levinumad' ...
Saving recall_sets of 'mitmetahenduslikud' ...
Saving recall_sets of 'ulejaanud' ...


Finally, validate outcome CSV files and check for annotation duplicates.

In [7]:
import os.path
from pandas import read_csv
from helper_functions import DuplicatesChecker
# A single checker instance receives the annotations of all subpopulations.
# NOTE(review): presumably this lets it detect duplicates across files as
# well — confirm against DuplicatesChecker.
checker = DuplicatesChecker()

for subpopulation in term_subpopulations.keys():
    print(f'Validating {subpopulation!r} ...')
    filename = f'labelled_data/recall_sets/koond_1000_{subpopulation}.csv'
    # Validate file: it must exist and be parseable as CSV
    assert os.path.exists(filename), f'(!) Missing file: {filename}'
    try:
        data = read_csv(filename)
    except Exception as csv_parsing_err:
        # Chain the original parsing error so that its details stay visible
        raise ValueError(f'(!) Bad input file format: unable to open {filename!r} as a CSV file: ') from csv_parsing_err
    # Validate file's contents: feed every (text, span) pair to the checker
    for txt, span in zip(data.text, data.span):
        checker.check_for_duplicates(txt, span)
    print('OK')

Validating 'levinumad' ...
OK
Validating 'mitmetahenduslikud' ...
OK
Validating 'ulejaanud' ...
OK


## III. Update the benchmark data

### Gather necessary counts

First, we need to obtain total term counts in each subpopulation. 
This information is available in the local database file created by the SpanSampler.
Reload the database and get the information:

In [8]:
from helper_functions import load_configuration, connect_to_database
from helper_functions import load_term_subpopulations, count_terms_by_subpopulations
from span_sampler_sqlite3 import SpanSampler

# Use a forward slash in the path: it works on every platform, whereas the
# backslash form contained the invalid escape sequence '\e' and was not a
# path separator outside Windows.
config = load_configuration('config/example_configuration.ini')
storage = connect_to_database(config)
# Look up the source collection by the name given in the configuration
collection_name = config['source_database']['collection']
collection = storage[collection_name]

# Re-open the sampler on the local database file named in the configuration
sampler = SpanSampler(collection=collection, 
                      layer=config['source_database']['terms_layer'], 
                      attribute='lemma', 
                      termsfile='geo_terms.txt', 
                      db_file_name=config['local_database']['sqlite_file'], 
                      verbose=True)

INFO:storage.py:57: connecting to host: 'postgres.keeleressursid.ee', port: 5432, dbname: 'estonian-text-corpora', user: 'soras'
INFO:storage.py:108: schema: 'estonian_text_corpora', temporary: False, role: 'estonian_text_corpora_read'
Loaded 63 terms from geo_terms.txt.


In [9]:
# Total counts per subpopulation, as reported by count_terms_by_subpopulations
subpopulation_totals = count_terms_by_subpopulations(
    sampler,
    subpopulations_dir='config/subpopulations',
)
print(subpopulation_totals)

{'levinumad': 84822, 'mitmetahenduslikud': 81893, 'ulejaanud': 185897}


Second, get numbers of positive cases (detected entities) for each subpopulation.

In [10]:
# Count the rows of every recall set file; each row counts as one positive
# case, whatever its 'correct' value is.
from pandas import read_csv

positives = {}
for subpopulation in term_subpopulations.keys():
    filename = f'labelled_data/recall_sets/koond_1000_{subpopulation}.csv'
    try:
        recall_set = read_csv(filename)
    except Exception as csv_parsing_err:
        raise ValueError(f'(!) Bad input file format: unable to open {filename!r} as a CSV file: ') from csv_parsing_err
    positives[subpopulation] = len(recall_set.text)

### Create dataset description CSV file

In [11]:
import numpy as np 
from pandas import DataFrame

In [12]:
# Basic per-subpopulation statistics, one row per subpopulation (sorted by name)
sorted_populations = sorted(term_subpopulations.keys())
df = DataFrame({
    'population': sorted_populations,
    'occurences': [subpopulation_totals[name] for name in sorted_populations],
    # 1000 samples were labelled in every subpopulation
    'labelled': [1000] * len(sorted_populations),
    'positive': [positives[name] for name in sorted_populations],
})
# Derived statistics
# share of the subpopulation among all term occurrences
df['occurence_ratio'] = df['occurences'] / sum(df['occurences'])
# share of positive cases among the labelled samples
df['detection_ratio'] = df['positive'] / df['labelled']
# positives weighted by the occurrence share, normalised to sum to 1
weighted_positives = df['occurence_ratio'] * df['positive']
df['relative_frequency'] = weighted_positives / sum(weighted_positives)
df

Unnamed: 0,population,occurences,labelled,positive,occurence_ratio,detection_ratio,relative_frequency
0,levinumad,84822,1000,350,0.240553,0.35,0.473287
1,mitmetahenduslikud,81893,1000,13,0.232247,0.013,0.016972
2,ulejaanud,185897,1000,172,0.5272,0.172,0.50974


In [13]:
# add file names (full paths)
# The assignment adds a 'file' column to the existing `df` in place.
# NOTE(review): these paths start with 'amundsen_01/data/recall_sets/', not
# with the 'labelled_data/recall_sets/' directory the CSV files were written
# to — presumably the location inside the benchmark repository; confirm.
df['file'] = [f'amundsen_01/data/recall_sets/koond_1000_{s_pop}.csv' for s_pop in sorted_populations]

In [14]:
# export as csv
# The file is written relative to the current working directory. No `index`
# argument is passed, so the row index is written as well (pandas default)
# and appears as an unnamed first column.
df.to_csv('data_description.csv')