# Post-process annotation results and update benchmark

## I. Post-process manual annotation results

Postprocessing:
* Remove duplicates
* Extract the positive cases (annotations marked as 'full match')
* Merge phase 1 & phase 2 annotations
* Save recall_sets into CSV files

### The large dataset

In [1]:
import json
import os
import os.path
import pandas as pd

In [2]:
# Directory that must hold the downloaded results of the second annotation phase.
input_dir = "labelled/extended_1000"
# Stop right away, with instructions, when the annotated data is not in place.
assert os.path.exists(input_dir), (
    f'(!) Missing input dir {input_dir!r}. '
    'Complete the second annotation phase and download annotated data.'
)

In [3]:
def read_json_file(filename):
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The deserialized JSON value, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # The context manager closes the file even when json.load() raises;
    # the previous open()/close() pair leaked the handle in that case.
    with open(filename, encoding="utf-8") as f:
        return json.load(f)

In [4]:
# Collect one (file, category, phrase, choice, text, start, end) tuple for every
# completely annotated item found in the JSON files of input_dir.
annotated = []
# os.listdir() returns entries in arbitrary order; sort them so that the row order
# (and every index derived from it) is the same on each run.
for file in sorted(os.listdir(input_dir)):
    # Skip stray directory entries (e.g. .ipynb_checkpoints) that are not annotation files.
    if not file.endswith(".json"):
        continue
    pos = read_json_file(os.path.join(input_dir, file))
    # Category = second '_'-separated token of the name before the first dot
    # ('pos_A.json' -> 'A').
    cat = file.split(".")[0].split("_")[1].strip()
    for elem in pos:
        try:
            result = elem["annotations"][0]["result"]
            # result[0] is read as the marked phrase and result[1] as the match choice.
            # Check the length before indexing, so that an incomplete annotation is
            # reported here instead of raising.
            if len(result) < 2:
                print('erroneous annotation:', elem["id"])
                continue
            start = result[0]["value"]["start"]
            end = result[0]["value"]["end"]
            phrase = result[0]["value"]["text"]
            text = elem["data"]["text"]
            choice = result[1]["value"]["choices"][0]
            annotated.append((file, cat, phrase, choice, text, start, end))
        except (KeyError, IndexError, TypeError) as e:
            # Report the malformed item and keep going: aborting the loop here would
            # silently drop every remaining item and file from the dataset.
            print(f'malformed annotation in {file}: {e!r}')
            print(elem)

annotated_df = pd.DataFrame(annotated, columns=["file","category", "phrase", "choice","text", "start", "end"])

erroneous annotation: 8091


In [5]:
# Remove rows that are exact duplicates of an earlier row (all columns compared,
# first occurrence kept); the original index labels of the kept rows are preserved.
annotated_df = annotated_df.drop_duplicates()

In [6]:
# Display the annotation table (one row per annotated phrase)
annotated_df

Unnamed: 0,file,category,phrase,choice,text,start,end
0,pos_A.json,A,sooja merd,no,“ Me armastame sooja merd ning ilusaid ja vana...,15,25
1,pos_A.json,A,pruunide säärte,no,"Kepka viltu peas , sööstab Ülle pruunide säärt...",32,47
2,pos_A.json,A,kaheksatuhandelist mäge,no,"Teisegi tipptulemuse tegi ta läinud aastal , v...",77,100
3,pos_A.json,A,kohalikke panku,no,Valitsus mõjutas kohalikke panku ja kindlustus...,17,32
4,pos_A.json,A,ainsa pangana,no,Näiteks ERA Pank maksab ( ainsa pangana ) oma ...,26,39
...,...,...,...,...,...,...,...
7737,pos_Z.json,Z,lupo: kurgi,no,lupo: kurgi,0,11
7738,pos_Z.json,Z,emili: kurgi,no,emili: kurgi ka,0,12
7739,pos_Z.json,Z,PIPI-vs-banaanikalajutt: oja,no,PIPI-vs-banaanikalajutt: oja käskis lugeda ...,0,28
7740,pos_Z.json,Z,kiisumiisu: raba,no,kiisumiisu: raba on sinilill,0,16


In [7]:
# Manual annotations about NE match:
# count how many annotated phrases received each value of 'choice'
annotated_df["choice"].value_counts()

choice
no               6210
full match       1440
partial match      21
Name: count, dtype: int64

In [8]:
# Show the rows whose manual annotation is 'full match'
annotated_df.loc[annotated_df['choice'].eq('full match')]

Unnamed: 0,file,category,phrase,choice,text,start,end
22,pos_A.json,A,Vaikset ookeani,full match,Ainult inimjõul liikuvate sõiduvahenditega ümb...,121,136
24,pos_A.json,A,Vaikse ookeani,full match,"Saades teada , et tegemist on USA Vaikse ookea...",34,48
25,pos_A.json,A,Mustast merest,full match,See-eest saab iga põhjakaukaasia keelkonda kuu...,149,163
32,pos_A.json,A,Vaiksesse ookeani,full match,"13,3 meetri pikkuse laevaga sõidetakse üle Atl...",80,97
53,pos_A.json,A,Vaikses ookeanis,full match,Juuni keskel läks USA ranniku-vetes Vaikses oo...,36,52
...,...,...,...,...,...,...,...
6794,pos_Y.json,Y,Lai'a jõest,full match,"No kui eesti on põlis veneala , mis kuradi pär...",123,134
6822,pos_Y.json,Y,RZB panga,full match,RZB prognoos Ühispanga 1997 . aasta kasumi suh...,139,148
7187,pos_Z.json,Z,Calais' väinas,full match,Mullu detsembris Calais' väinas koos 3000 luks...,17,31
7306,pos_Z.json,Z,Croix' saarelt,full match,Croix' saarelt .,0,14


In [9]:
# Show the rows whose manual annotation is 'partial match'
annotated_df.loc[annotated_df['choice'].eq('partial match')]

Unnamed: 0,file,category,phrase,choice,text,start,end
3276,pos_G.json,G,sõrve sääre,partial match,Või sõrve sääre tipus olevadi tuulikuid ?,4,15
3591,pos_H.json,H,Rocki panga,partial match,Paari nädala eest Inglismaal toimunud Northern...,47,58
3640,pos_H.json,H,Changi saarele,partial match,"Reisi lõpp viib meid Koh Changi saarele , kus ...",25,39
3666,pos_H.json,H,Francisco lahes,partial match,Meelelahutuse alal tegutseb ka kuulus Alcatraz...,51,66
3737,pos_H.json,H,Migueli saart,partial match,"Ta istub väikebussi roolis , sest teisel päeva...",97,110
3816,pos_H.json,H,Mare ojas,partial match,"Samas on linnas ka jäävabasid veesilmu , näite...",58,67
4098,pos_H.json,H,Weighti saarel,partial match,Buldooserid olid juba varem saadetud kindlusta...,85,99
4110,pos_H.json,H,Zedongi mägedes,partial match,Hiina keskpank andis armee aastapäeva puhul vä...,110,125
4177,pos_H.json,H,innipegi järve,partial match,28 meetrit on Winnipegi järve suurim sügavus .,16,31
5364,pos_S.json,S,Lootuse neeme,partial match,"Nii lai , et see ei pääse läbi Panama kanali ,...",80,93


Discard the 'partial match' annotations (too few items) and keep only the 'full match' annotations.

In [10]:
# Keep the 'full match' rows only and show how many of them each category has.
is_full_match = annotated_df["choice"].eq("full match")
full_match_df = annotated_df.loc[is_full_match]
full_match_df["category"].value_counts()

category
H    669
G    500
S     85
A     81
Y     78
I     14
D      5
N      5
Z      3
Name: count, dtype: int64

In [11]:
# Extract sentences and annotations.
# Group by postags: annotations_by_pos maps a category (postag) to a list of
# (sentence, span) pairs, in the row order of full_match_df.
annotations_by_pos = {}
for postag, sentence, phrase, start, end in zip(full_match_df['category'],
                                                full_match_df['text'],
                                                full_match_df['phrase'],
                                                full_match_df['start'],
                                                full_match_df['end']):
    # Store the offsets as plain Python ints. The span is later serialised with str();
    # NumPy integers would show up there as 'np.int64(..)' under NumPy >= 2.
    span = {'start': int(start), 'end': int(end), 'text': phrase, 'labels': ['LOC']}
    annotations_by_pos.setdefault(postag, []).append( (sentence, span) )

### The small dataset

Repeat the same procedure for the small dataset.

In [12]:
# Directory that must hold the downloaded results of the first annotation phase.
input_dir = "labelled/extended_100"
# Stop right away, with instructions, when the annotated data is not in place.
assert os.path.exists(input_dir), (
    f'(!) Missing input dir {input_dir!r}. '
    'Complete the first annotation phase and download annotated data.'
)

In [13]:
# Collect one (file, category, phrase, choice, text, start, end) tuple for every
# completely annotated item found in the JSON files of input_dir.
annotated = []
# os.listdir() returns entries in arbitrary order; sort them so that the row order
# (and every index derived from it) is the same on each run.
for file in sorted(os.listdir(input_dir)):
    # Skip stray directory entries (e.g. .ipynb_checkpoints) that are not annotation files.
    if not file.endswith(".json"):
        continue
    pos = read_json_file(os.path.join(input_dir, file))
    # Category = second '_'-separated token of the name before the first dot
    # ('pos_A.json' -> 'A').
    cat = file.split(".")[0].split("_")[1].strip()
    for elem in pos:
        try:
            result = elem["annotations"][0]["result"]
            # result[0] is read as the marked phrase and result[1] as the match choice.
            # Check the length before indexing, so that an incomplete annotation is
            # reported (by its id) instead of raising.
            if len(result) < 2:
                print(elem["id"])
                continue
            start = result[0]["value"]["start"]
            end = result[0]["value"]["end"]
            phrase = result[0]["value"]["text"]
            text = elem["data"]["text"]
            choice = result[1]["value"]["choices"][0]
            annotated.append((file, cat, phrase, choice, text, start, end))
        except (KeyError, IndexError, TypeError) as e:
            # Report the malformed item and keep going: aborting the loop here would
            # silently drop every remaining item and file from the dataset.
            print(f'malformed annotation in {file}: {e!r}')
            print(elem)

annotated_df = pd.DataFrame(annotated, columns=["file","category", "phrase", "choice","text", "start", "end"])
# Remove exact duplicate rows, as is done for the large dataset: this notebook lists
# duplicate removal as a post-processing step and repeats the same procedure here.
annotated_df = annotated_df.drop_duplicates()

In [14]:
# Manual annotations about NE match:
# count how many annotated phrases received each value of 'choice'
annotated_df["choice"].value_counts()

choice
no               1799
full match        245
partial match      18
Name: count, dtype: int64

In [15]:
# Show the rows whose manual annotation is 'full match'
annotated_df.loc[annotated_df['choice'].eq('full match')]

Unnamed: 0,file,category,phrase,choice,text,start,end
14,pos_A.json,A,Väikese väina,full match,"Kokkukasvamine on toimunud pidevalt , sest saa...",181,194
16,pos_A.json,A,Mustas meres,full match,Kuuekümnendail aastail asustati Läänemerre ka ...,89,101
60,pos_A.json,A,ülemiste järve,full match,"AS Tallinna Vesi peadirektor Enno Pere ütles ,...",143,157
79,pos_A.json,A,Vaikse ookeani,full match,Atmosfääri liikumishulga moment on eriti suur ...,46,60
343,pos_D.json,D,Lahti jõe,full match,Dali oli Vovaga sageli Lahti jõe ääres mängima...,23,32
...,...,...,...,...,...,...,...
1940,pos_Y.json,Y,SEBi panga,full match,"TALLINN , 9. september ( EPLO ) - Ajakirja Aff...",98,108
1941,pos_Y.json,Y,MAPO panga,full match,"Tallinna Äripanga peaomaniku , Moskvas asuva M...",45,55
1942,pos_Y.json,Y,ETV kanalil,full match,Peale tähtede mängu saavad korvpallihuvilised ...,54,65
1943,pos_Y.json,Y,EVEA panka,full match,"Sellele vaatamata , et on unine ja turistideva...",78,88


In [16]:
# Show the rows whose manual annotation is 'partial match'
annotated_df.loc[annotated_df['choice'].eq('partial match')]

Unnamed: 0,file,category,phrase,choice,text,start,end
70,pos_A.json,A,elektrooniliste kanalite,partial match,Hansapanga elektrooniliste kanalite töös võib ...,11,35
81,pos_A.json,A,püha jõgi,partial match,"Selgub , et tegemist on restorani ühe omaniku ...",123,132
90,pos_A.json,A,järskude kallaste,partial match,Jõesängi järskude kallaste sisse on uuristatud...,9,26
109,pos_A.json,A,Ameerika-poolsele kaldale,partial match,Venemaa tuntud polaaruurija Dmitri Šparo jõudi...,133,158
125,pos_A.json,A,paiksete allikate,partial match,Eesti paiksete allikate saaste on mõne aastaga...,6,23
147,pos_C.json,C,paremal kaldal,partial match,Pirita jõe paremal kaldal olevalt kailt võetak...,11,25
167,pos_C.json,C,suuremalt pangalt,partial match,Vene gaasikontsernile Gazprom kuuluv AS Nitrof...,77,94
184,pos_C.json,C,ilusama fjordi,partial match,"Kohe , kui me otsa ühe Norra ilusama fjordi Ly...",29,43
234,pos_C.json,C,paremal kaldal,partial match,Ka elavad Tereki paremal kaldal tšetšeeni taib...,17,31
457,pos_G.json,G,Prantsuse saarel,partial match,"AJACCIO , 25. august ( Reuters-EPLO ) - Prants...",40,56


In [17]:
# Keep the 'full match' rows only and show how many of them each category has.
is_full_match = annotated_df["choice"].eq("full match")
full_match_df = annotated_df.loc[is_full_match]
full_match_df["category"].value_counts()

category
H    104
G     76
Y     38
S     12
I      9
A      4
D      1
Z      1
Name: count, dtype: int64

Merge the two datasets by appending the full matches of the small dataset to `annotations_by_pos`:

In [18]:
# Append the (sentence, span) pairs of the current full_match_df to the existing
# annotations_by_pos dictionary, creating a list for categories that are not in it yet.
# NOTE(review): no de-duplication is done across the two datasets; if the same
# sentence/span can occur in both, it is counted twice. Confirm the samples are disjoint.
for postag, sentence, phrase, start, end in zip(full_match_df['category'],
                                                full_match_df['text'],
                                                full_match_df['phrase'],
                                                full_match_df['start'],
                                                full_match_df['end']):
    # Store the offsets as plain Python ints. The span is later serialised with str();
    # NumPy integers would show up there as 'np.int64(..)' under NumPy >= 2.
    span = {'start': int(start), 'end': int(end), 'text': phrase, 'labels': ['LOC']}
    annotations_by_pos.setdefault(postag, []).append( (sentence, span) )

Save/export annotations into files:

In [19]:
import csv

output_dir = 'labelled/recall_sets'
os.makedirs(output_dir, exist_ok=True)

# Write into recall_sets: one CSV file per category (postag) with the columns
# id (running number within the file), text (sentence) and span (dict literal).
for postag, examples in annotations_by_pos.items():
    output_path = os.path.join(output_dir, f'pos_terms_1000_{postag}.csv')
    # newline='' lets the csv module control the line endings itself
    with open(output_path, 'w', encoding='utf-8', newline='') as out_f:
        fieldnames = ['id', 'text', 'span']
        writer = csv.DictWriter(out_f, fieldnames=fieldnames, delimiter=',')
        writer.writeheader()
        for i, (sentence, span) in enumerate(examples):
            # The span is stored as the str() of a dict. Force plain Python ints first:
            # under NumPy >= 2 a NumPy integer is rendered as 'np.int64(5)', which would
            # make the written literal depend on the installed NumPy version.
            plain_span = {**span, 'start': int(span['start']), 'end': int(span['end'])}
            writer.writerow({'id': i, 'text': sentence, 'span': str(plain_span)})

## II. Update the benchmark data

### Gather necessary counts

Next, we need to obtain total term counts in each subpopulation. This information is available in the local database file created by the SpanSampler. Reload the database and get the information:

In [20]:
from helper_functions import load_configuration, connect_to_database
from helper_functions import count_terms_by_postags
from span_sampler_sqlite3 import SpanSampler

# Build the path with os.path.join: the literal 'config\example_configuration.ini'
# contains the invalid escape sequence '\e' (reported as a warning by recent Python
# versions) and names a file inside the 'config' directory on Windows only.
config = load_configuration(os.path.join('config', 'example_configuration.ini'))
storage = connect_to_database(config)
# Look the collection up by the name given in the configuration
collection_name = config['source_database']['collection']
collection = storage[collection_name]

# Re-create the sampler on top of the local database file named in the configuration
# (per the notebook text, that file was created by the SpanSampler and holds the counts).
sampler = SpanSampler(collection=collection, 
                      layer='v172_geo_terms', 
                      attribute='lemma', 
                      termsfile='geo_terms.txt', 
                      db_file_name=config['local_database']['sqlite_file'], 
                      verbose=True)

INFO:storage.py:57: connecting to host: 'postgres.keeleressursid.ee', port: 5432, dbname: 'estonian-text-corpora', user: 'soras'
INFO:storage.py:108: schema: 'estonian_text_corpora', temporary: False, role: 'estonian_text_corpora_read'
Loaded 63 terms from geo_terms.txt.


In [21]:
# Load total counts of each partofspeech subpopulation
# (count_terms_by_postags is a project helper; judging by the printed result,
# it returns a dict that maps a postag to its total term count)
subpopulation_totals = count_terms_by_postags(sampler)
print(subpopulation_totals)

{'A': 30190, 'C': 2663, 'D': 17890, 'G': 3611, 'H': 72084, 'I': 314, 'J': 20878, 'K': 7786, 'N': 3949, 'O': 911, 'P': 20508, 'S': 47721, 'U': 1329, 'V': 42315, 'X': 36, 'Y': 2163, 'Z': 10901}


Next, get numbers of positive cases (correct entities) for each subpopulation.

In [22]:
# Count the positive cases of every subpopulation.
# The counts are taken from the exported CSV files (one row per positive case),
# which at the same time checks that those files can be parsed as CSV.
from pandas import read_csv

positives = {}
for subpopulation in annotations_by_pos:
    positives[subpopulation] = 0
    filename = f'labelled/recall_sets/pos_terms_1000_{subpopulation}.csv'
    try:
        data = read_csv(filename)
    except Exception as csv_parsing_err:
        raise ValueError(f'(!) Bad input file format: unable to open {filename!r} as a CSV file: ') from csv_parsing_err
    else:
        # number of rows in the 'text' column == number of positive cases
        positives[subpopulation] += len(data.text)

### Create dataset description CSV file

In [23]:
import numpy as np 
from pandas import DataFrame

In [24]:
# Basic statistics of every subpopulation that has at least one positive case.
# NOTE(review): 'labelled' is fixed to 1000, but 'positive' counts the merged results
# of both annotation phases, and subpopulation 'I' has only 314 occurrences in total.
# Confirm that 1000 is the right denominator for detection_ratio.
sorted_populations = sorted(annotations_by_pos)
df = DataFrame({
    'population': sorted_populations,
    'occurences': [subpopulation_totals[name] for name in sorted_populations],
    'labelled': [1000] * len(sorted_populations),
    'positive': [positives[name] for name in sorted_populations],
})

# Derived statistics:
# share of the subpopulation among all listed occurrences
total_occurences = sum(df['occurences'])
df['occurence_ratio'] = df['occurences'] / total_occurences
# fraction of labelled items that turned out to be positive
df['detection_ratio'] = df['positive'] / df['labelled']
# occurrence-weighted positives, normalised so that the column sums to 1
weighted_positive = df['occurence_ratio'] * df['positive']
df['relative_frequency'] = weighted_positive / sum(weighted_positive)
df

Unnamed: 0,population,occurences,labelled,positive,occurence_ratio,detection_ratio,relative_frequency
0,A,30190,1000,85,0.159885,0.085,0.039223
1,D,17890,1000,6,0.094745,0.006,0.001641
2,G,3611,1000,576,0.019124,0.576,0.031791
3,H,72084,1000,773,0.381754,0.773,0.851679
4,I,314,1000,23,0.001663,0.023,0.00011
5,N,3949,1000,5,0.020914,0.005,0.000302
6,S,47721,1000,97,0.252729,0.097,0.070752
7,Y,2163,1000,116,0.011455,0.116,0.003835
8,Z,10901,1000,4,0.057731,0.004,0.000666


In [25]:
# Add the (relative) path of the recall-set CSV file of each subpopulation.
# NOTE(review): the CSV files are written to 'labelled/recall_sets' earlier in this
# notebook, while these paths point to 'amundsen_02/data/recall_sets' - presumably
# their location in the benchmark repository; confirm.
df['file'] = [f'amundsen_02/data/recall_sets/pos_terms_1000_{s_pop}.csv' for s_pop in sorted_populations]

In [26]:
# export as csv
# (to_csv is called with its defaults, so the DataFrame index is written as the
# first, unnamed column of the file)
df.to_csv('data_description.csv')