In [1]:
import hfst
import json
import re
from itertools import chain

## Tags

I take the words that the analyser is able to analyse and that have tags in the reference glosses (ref).   
For each gloss I look at the mapping and if in the mapping it corresponds to:
* a single tag with <>, I add this tag to a set with ref tags
* a string without <>, I look for this string in test tags and   
    if I find a tag with this string -> I remove it from test tags, do tp += 1 and stop  
    else -> add this to a set with ref tags
* a list of alternative tags, I look for each of the items in test tags  
    if I find one -> I remove it from test tags, do tp += 1 and stop  
    else (none of the alternatives is found) -> do fn += 1
    
In the end I count:  
* ```tp``` as ```len(test tags & ref tags)```
* ```fp``` as ```len(test tags - ref tags)```
* ```fn``` as ```len(ref tags - test tags)```

In [30]:
# Reference data for the tag evaluation: each key is a word form and
# `wordforms[word]` holds the glosses that are compared with the analyser output.
with open('minlang_wordforms.json', encoding='utf-8') as f:
    wordforms = json.loads(f.read())

In [87]:
# Gloss -> analyser tag correspondence. Keys are looked up with upper-cased
# glosses; a value is either one string or a list of alternative tags.
with open('mapping', encoding='utf-8') as f:
    mapping = json.loads(f.read())

In [88]:
# Glosses matching this pattern are treated as grammatical tags; any other
# free-standing gloss is taken to be a translation of the stem.
_TAG_GLOSS_RE = re.compile(r'NEG|\d(SG|PL)|FOC')


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def evaluate_word(word, glosses, tp, fp, fn, not_analysed, skipped, analyser=None):
    """Compare the analyser's tags for one word with its reference glosses.

    All counters are passed in and returned updated, so the caller can thread
    them through a loop over the whole word list.

    Parameters:
        word: the surface form handed to ``analyser.lookup``.
        glosses: iterable of reference gloss strings for ``word``; morphemes
            inside a gloss are separated by ``-`` or ``=``.
        tp, fp, fn: running true positive / false positive / false negative
            tag counts.
        not_analysed: running count of words with an empty lookup result.
        skipped: running count of words that have a gloss without any tag.
        analyser: object with a ``lookup(word)`` method whose results are
            sequences with the analysis string at index 0. When omitted, the
            module-level ``analyser`` is used (legacy behaviour).

    Returns:
        The tuple ``(tp, fp, fn, not_analysed, skipped)`` with updated counts.

    Raises:
        NameError: if no analyser is passed and no module-level ``analyser``
            exists.
        KeyError is NOT raised for unknown glosses: they are printed and
        ignored.
    """
    if analyser is None:
        # Backward compatibility with callers that rely on a global analyser.
        try:
            analyser = globals()['analyser']
        except KeyError:
            raise NameError("name 'analyser' is not defined") from None

    analysis = analyser.lookup(word)
    if not analysis:
        return tp, fp, fn, not_analysed + 1, skipped

    # Union of the tags of every analysis. split('<')[2:] drops the text in
    # front of the first '<' and the first tag itself (presumably the lemma
    # and the part-of-speech tag - TODO confirm); '<' is re-attached because
    # split() consumed it.
    test_tags = {'<' + tag
                 for ana in analysis
                 for tag in ana[0].split('<')[2:]}

    # Collect the reference glosses that denote tags.
    ref_tags = set()
    for gloss in glosses:
        if '-' in gloss or '=' in gloss:
            # Split on any run of separators so that '--', '-=' or '---'
            # never produce empty glosses.
            parts = re.split(r'[-=]+', gloss.strip('-='))

            # The first part is normally the stem translation; keep it only
            # when it looks like a tag.
            if _TAG_GLOSS_RE.search(parts[0].upper()):
                ref_tags.add(parts[0])
            ref_tags.update(parts[1:])
        elif _TAG_GLOSS_RE.search(gloss.upper()):
            ref_tags.add(gloss)
        else:
            # A gloss without tags: the whole word is left out of the scores.
            return tp, fp, fn, not_analysed, skipped + 1

    ref_tags_mapped = set()

    # Sorted iteration keeps the result independent of set/hash ordering,
    # which matters because matched tags are consumed from test_tags.
    for gloss in sorted(ref_tags):
        key = gloss.upper()
        try:
            ref_tag = mapping[key]
        except KeyError:
            # Unknown gloss: report it and leave it out of the counts.
            print(key)
            continue

        if isinstance(ref_tag, str):
            n_tags = ref_tag.count('>')

            if n_tags == 0:
                # A bare string (e.g. for glosses like CONV that can stand
                # for any of the converbs): accept the first test tag that
                # contains it.
                match = next((t for t in sorted(test_tags) if ref_tag in t), None)
                if match is None:
                    # Left for the final set difference, where it becomes fn.
                    ref_tags_mapped.add(ref_tag)
                else:
                    test_tags.remove(match)
                    tp += 1

            elif n_tags == 1:
                # Exactly one tag: compared directly in the final set ops.
                ref_tags_mapped.add(ref_tag)

            else:
                # Several tags in one mapping (for example, 1PL(EXCL).ACC
                # corresponds to <p1><pe><acc><def>): check each separately.
                for piece in ref_tag.split('>'):
                    if not piece:
                        continue
                    tag = piece + '>'
                    if tag in test_tags:
                        test_tags.remove(tag)
                        tp += 1
                    else:
                        ref_tags_mapped.add(tag)

        elif isinstance(ref_tag, list):
            # Alternatives: the gloss is satisfied by the first one found.
            # NOTE(review): alternatives are compared as whole strings, so a
            # multi-tag alternative such as '<p1><sg>' can never equal a
            # single test tag - confirm the mapping lists single tags only.
            match = next((item for item in ref_tag if item in test_tags), None)
            if match is None:
                fn += 1
            else:
                test_tags.remove(match)
                tp += 1

    tp += len(test_tags & ref_tags_mapped)
    fp += len(test_tags - ref_tags_mapped)
    fn += len(ref_tags_mapped - test_tags)

    return tp, fp, fn, not_analysed, skipped


def evaluate(analyser):
    """Score ``analyser`` on every entry of the module-level ``wordforms``.

    Prints precision, recall and F-score over tags, followed by the share of
    words that were analysed, not analysed and skipped. Returns None.

    Parameters:
        analyser: object with a ``lookup(word)`` method; it is forwarded to
            ``evaluate_word`` so the scores belong to this analyser and not
            to whatever a global variable happens to hold.
    """
    tp = fp = fn = 0
    skipped = not_analysed = 0

    for word, glosses in wordforms.items():
        tp, fp, fn, not_analysed, skipped = evaluate_word(
            word, glosses, tp, fp, fn, not_analysed, skipped,
            analyser=analyser)

    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    fscore = _safe_ratio(2 * (precision * recall), precision + recall)

    print('precision: {:.3%} tp: {} fp: {}'.format(precision, tp, fp))
    print('recall: {:.3%} tp: {} fn: {}'.format(recall, tp, fn))
    print('fscore: {:.3%}\n'.format(fscore))

    total = len(wordforms)
    print('analysed: {:.3%}'.format(_safe_ratio(total - not_analysed - skipped, total)))
    print('not_analysed: {:.3%}'.format(_safe_ratio(not_analysed, total)))
    print('skipped: {:.3%}'.format(_safe_ratio(skipped, total)))

In [91]:
# Tag evaluation of ../evn.automorf.hfst.
# Only words that received an analysis and whose reference glosses contain
# tags contribute to the scores (words without affixes are skipped).

analyser = (
    hfst.HfstInputStream('../evn.automorf.hfst')
    .read()
)
evaluate(analyser)

precision: 29.281% tp: 5595 fp: 13513
recall: 76.195% tp: 5595 fp: 1748
fscore: 42.305%

analysed: 44.440%
not_analysed: 44.900%
skipped: 10.660%


In [90]:
# Tag evaluation of the relaxed analyser, ../evn_relaxed.automorf.hfst.
# Only words that received an analysis and whose reference glosses contain
# tags contribute to the scores (words without affixes are skipped).

analyser_relaxed = (
    hfst.HfstInputStream('../evn_relaxed.automorf.hfst')
    .read()
)
evaluate(analyser_relaxed)

precision: 28.450% tp: 10037 fp: 25243
recall: 79.957% tp: 10037 fp: 2516
fscore: 41.967%

analysed: 71.191%
not_analysed: 17.932%
skipped: 10.877%


## Segmentation

I take words which are segmented by the segmenter and count:  

```wer``` as ```sum over the segmented words of the best (lowest) wer among the word's segmentations / number of words segmented```  
```tp``` as ```len(test & ref)```  
```fp``` as ```len(test - ref)```  
```fn``` as ```len(ref - test)```  

In [62]:
# Gold-standard segmentations: `gold_std_dict[word]` holds the accepted
# segmentations of `word`. The encoding is given explicitly (as for the other
# JSON files) so that non-ASCII text is decoded the same way on every platform.
with open('gold_std_dict', 'r', encoding='utf-8') as f:
    gold_std_dict = json.load(f)

In [63]:
from wer import editDistance

def get_wer_result(r, h):
    """Return the word error rate of hypothesis ``h`` against reference ``r``.

    ``r`` and ``h`` are sequences of tokens. The value is the entry
    ``[len(r)][len(h)]`` of the table returned by ``editDistance`` (presumably
    the total edit distance - TODO confirm against the ``wer`` module) divided
    by the length of the reference.
    """
    distance_table = editDistance(r, h)
    reference_length = len(r)
    total_edits = distance_table[reference_length][len(h)]
    return float(total_edits) / reference_length

In [98]:
def evaluate_segmentation(segmenter):
    """Score ``segmenter`` against the module-level ``gold_std_dict``.

    For every gold word the segmenter output is normalised (runs of '·'
    become one space, leading/trailing '·' are dropped) and compared with the
    gold segmentations of that word: exact matches feed tp/fp/fn, and the
    lowest word error rate over all (gold, candidate) pairs feeds the WER
    average. Prints precision, recall, F-score, WER and the share of words
    that were / were not segmented. Returns None.

    Parameters:
        segmenter: object with a ``lookup(word)`` method whose results are
            sequences with the segmentation string at index 0.
    """
    def ratio(numerator, denominator):
        # Report 0 instead of raising when nothing was counted.
        return numerator / denominator if denominator else 0.0

    tp = fp = fn = 0
    analysed = not_analysed = 0
    wer_score_sum = 0

    for word, answers in gold_std_dict.items():
        segmentation = segmenter.lookup(word)

        if not segmentation:
            not_analysed += 1
            continue

        analysed += 1

        ref = set(answers)
        test = set()
        wer_best = float('+inf')

        for seg in segmentation:
            if not seg:
                continue
            seg_replaced = re.sub('·+', ' ', seg[0].strip('·'))
            test.add(seg_replaced)

            # Keep the best score over every gold answer for this word.
            for ans in answers:
                wer_score = get_wer_result(ans.split(), seg_replaced.split())
                wer_best = min(wer_best, wer_score)

        if wer_best < float('+inf'):
            wer_score_sum += wer_best

            tp += len(test & ref)
            fp += len(test - ref)
            fn += len(ref - test)
        else:
            # No candidate could be scored: show the word for inspection.
            print(word, segmentation)

    precision = ratio(tp, tp + fp)
    recall = ratio(tp, tp + fn)
    fscore = ratio(2 * (precision * recall), precision + recall)

    print('precision: {:.3%} tp: {} fp: {}'.format(precision, tp, fp))
    print('recall: {:.3%} tp: {} fn: {}'.format(recall, tp, fn))
    print('fscore: {:.3%}'.format(fscore))
    print('wer: {:.3%}\n'.format(ratio(wer_score_sum, analysed)))

    total = analysed + not_analysed
    print('analysed: {:.3%}'.format(ratio(analysed, total)))
    print('not_analysed: {:.3%}'.format(ratio(not_analysed, total)))

In [99]:
# Segmentation evaluation of ../dev/segmenter/evn.segmenter.hfst.
# Only the words for which the segmenter returned a result are scored.
segmenter = (
    hfst.HfstInputStream('../dev/segmenter/evn.segmenter.hfst')
    .read()
)
evaluate_segmentation(segmenter)

precision: 18.618% tp: 2223 fp: 9717
recall: 51.854% tp: 2223 fn: 2064
fscore: 27.399%
wer: 32.570%

analysed: 54.441%
not_analysed: 45.559%
