In [None]:
import re
import hfst
import json
import sys
import morfessor
from tags import *

In [None]:
# Put the parent directory on the module search path so that the project-local
# `segment` module can be imported (assumes segment.py lives one level up — TODO confirm).
sys.path.append('../')
from segment import segment

## Coverage and mean ambiguity

Calculate ```coverage``` and ```mean ambiguity``` for different corpora:

In [None]:
# Run test_res.sh from the parent directory (the script uses paths relative to it,
# presumably; it is expected to print coverage and mean ambiguity — verify against the script).
!cd ../ && ./test_res.sh

## Tags

Open the file of wordforms with their glosses and gloss frequencies from Siberian Lang:  
e.g. 
```"одяндэ": {"сделать-IPFV-NFUT-2SG": 4,
               "стать-FUTCNT-2SG": 1}```

In [None]:
# Gold-standard table: every wordform maps to its glosses and their frequencies.
with open('siblang_tags.json', mode='r', encoding='utf-8') as tags_file:
    wordforms = json.loads(tags_file.read())

Open the mapping between glosses and tags from the transducer:   
e.g. ```"1SG": [["<p1>", "<sg>"], "<px1sg>"]```

In [None]:
# Correspondence between gloss labels and the tags produced by the transducer.
with open('mapping', mode='r', encoding='utf-8') as mapping_file:
    mapping = json.loads(mapping_file.read())

In [None]:
def evaluate_tags(analyser):
    """
    Calculates and prints precision, recall and f-score
    in the task of morphological tag assignment.

    Every wordform of the global ``wordforms`` table is passed to
    ``evaluate_wordform`` together with the global ``mapping``; the
    running counters it returns are accumulated over all wordforms.
    After the scores, the shares of analysed, not analysed and skipped
    wordforms are printed.

    A ratio whose denominator is zero (empty ``wordforms`` table, no
    positives at all, ...) is reported as 0 instead of raising
    ``ZeroDivisionError``.

    :param analyser: a HFST transducer (libhfst.HfstTransducer)
    """

    def _ratio(numerator, denominator):
        # Safe division: an undefined ratio is reported as 0.
        return numerator / denominator if denominator else 0.0

    tp = fp = fn = not_analysed = skipped = 0

    for word in wordforms:
        tp, fp, fn, not_analysed, skipped = evaluate_wordform(word,
                                                     analyser, mapping,
                                                     wordforms[word],
                                                     tp, fp, fn,
                                                     not_analysed, skipped)

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    fscore = _ratio(2 * (precision * recall), precision + recall)

    print('precision: {:.3%} tp: {} fp: {}'.format(precision, tp, fp))
    # Recall is computed from false negatives, so the count is labelled "fn".
    print('recall: {:.3%} tp: {} fn: {}'.format(recall, tp, fn))
    print('fscore: {:.3%}\n'.format(fscore))

    total = len(wordforms)
    print('analysed: {:.3%}'.format(_ratio(total - not_analysed - skipped, total)))
    print('not_analysed: {:.3%}'.format(_ratio(not_analysed, total)))
    print('skipped: {:.3%}\n'.format(_ratio(skipped, total)))

Calculate precision, recall and f-score of the assignment of morphological tags:

In [None]:
# Read the analyser transducer and release the input stream as soon as it has
# been read, even if reading fails.
analyser_stream = hfst.HfstInputStream('../evn.automorf.hfst')
try:
    analyser = analyser_stream.read()
finally:
    analyser_stream.close()

evaluate_tags(analyser)

## Segmentation

Open the file with wordforms and corresponding segmentation from Siberian Lang:  
e.g. ```"одяра": ["о дя ра"]```

In [None]:
# Gold-standard segmentations: every wordform maps to a list of analyses.
# The encoding is given explicitly (as for the other JSON inputs) because the
# file holds Cyrillic text and the platform default is not UTF-8 everywhere.
with open('siblang_segmentation.json', 'r', encoding='utf-8') as f:
    siblang_segmentation = json.load(f)

Create a file with one wordform per line as input for Morfessor segmentation:

In [None]:
# One wordform per line, in the iteration order of the gold-standard table.
with open('words_for_segmentation.txt', 'w') as words_file:
    words_file.writelines(wordform + '\n' for wordform in siblang_segmentation)

Create a gold standard file for ```boundary precision and recall``` evaluation ([bpr.py](../blob/master/eval/bpr.py)).  
The word and its analyses are separated by a tab character, alternative analyses by a comma and a space, and the morphs of an analysis by a single space: e.g. ```evening	even ing, evening```.

In [None]:
with open('gold_std_bpr.tsv', 'w') as gold_file:

    for wordform, analyses in siblang_segmentation.items():

        # Wordforms without any gold analysis are left out of the file.
        if not analyses:
            continue

        # Joining a single analysis yields that analysis unchanged, so one
        # write covers both the one-analysis and the many-analyses case.
        gold_file.write(wordform + '\t' + ', '.join(analyses) + '\n')

Evaluate segmentation of the Morfessor recursive model:

In [None]:
# Segment the word list with the saved model `model_recursive` and redirect the
# tool's output into morfessor_segmentation.txt.
!morfessor-segment -l model_recursive words_for_segmentation.txt > morfessor_segmentation.txt

In [None]:
# Prefix every Morfessor output line with the wordform it belongs to, giving
# "word<TAB>segmentation" lines. Assumes morfessor-segment emits exactly one
# output line per input line, in the same order — zip() pairs them by position.
with open('morfessor_segmentation_with_input.tsv', 'w') as fw, \
     open('morfessor_segmentation.txt') as f_morfessor, \
     open('words_for_segmentation.txt') as f_input:

    for wordform, morphs in zip(f_input, f_morfessor):
        fw.write('{}\t{}\n'.format(wordform.strip('\n'), morphs.strip('\n')))

In [None]:
# Score the Morfessor segmentation (-p) against the gold standard (-g) with bpr.py
# (flag meanings inferred from the file names — confirm in bpr.py).
!python3 bpr.py -g gold_std_bpr.tsv -p morfessor_segmentation_with_input.tsv

Evaluate segmentation of the transducer:

In [None]:
def write_hfst_segmentation(segmenter):
    """
    Creates a file with segmentation by HFST transducer.

    Each wordform of the global ``siblang_segmentation`` is passed to
    ``segment``. Wordforms with a non-empty result are written to
    ``hfst_segmentation.tsv`` as ``word<TAB>seg1, seg2, ...``; the others
    are only counted. Finally the shares of analysed and not analysed
    wordforms are printed (both as 0 when there are no wordforms at all,
    instead of raising ``ZeroDivisionError``).

    :param segmenter: a path to the HFST transducer for segmentation (str)
                      or HFST transducer (libhfst.HfstTransducer)
    """
    analysed = 0
    not_analysed = 0

    with open('hfst_segmentation.tsv', 'w') as fw:

        for word in siblang_segmentation:
            segmentation = segment(word, segmenter)

            # No segmentation returned: count the word but write nothing.
            if not segmentation:
                not_analysed += 1
                continue

            analysed += 1

            fw.write(word + '\t' + ', '.join(segmentation) + '\n')

    total = analysed + not_analysed
    # Guard against an empty word list, where both shares are undefined.
    analysed_share = analysed / total if total else 0.0
    not_analysed_share = not_analysed / total if total else 0.0

    print('analysed: {:.3%}'.format(analysed_share))
    print('not_analysed: {:.3%}'.format(not_analysed_share))

In [None]:
# Read the segmenter transducer and release the input stream as soon as it has
# been read, even if reading fails.
segmenter_stream = hfst.HfstInputStream('../dev/segmenter/evn.segmenter.hfst')
try:
    segmenter = segmenter_stream.read()
finally:
    segmenter_stream.close()

write_hfst_segmentation(segmenter)

In [None]:
# Score the transducer segmentation (-p) against the gold standard (-g) with bpr.py
# (flag meanings inferred from the file names — confirm in bpr.py).
!python3 bpr.py -g gold_std_bpr.tsv -p hfst_segmentation.tsv

Evaluate the segmentation of the Morfessor recursive model using only the words that receive an analysis from the transducer:

In [None]:
# Keep only those Morfessor lines whose wordform (first tab-separated field)
# also occurs as a wordform in the transducer's segmentation file.
with open('morfessor_segmentation_part.tsv', 'w') as fw, \
     open('morfessor_segmentation_with_input.tsv') as f_morfessor, \
     open('hfst_segmentation.tsv') as f_hfst:

    # A set makes each membership test constant-time; with a list every
    # Morfessor line would trigger a scan over all transducer words.
    hfst_segmented_words = {line.split('\t')[0] for line in f_hfst}

    for line in f_morfessor:
        if line.split('\t')[0] in hfst_segmented_words:
            fw.write(line)

In [None]:
# Score the restricted Morfessor segmentation (-p) against the gold standard (-g) with bpr.py
# (flag meanings inferred from the file names — confirm in bpr.py).
!python3 bpr.py -g gold_std_bpr.tsv -p morfessor_segmentation_part.tsv