In [1]:
import morfessor
import json
import numpy as np

### Train

In [2]:
# Shared Morfessor I/O helper; it is reused further down for reading the
# annotations file and for writing the trained model.
io = morfessor.MorfessorIO()

# Materialise the corpus reader's output into a list so the same training data
# can be handed to load_data() for more than one model.
corpus_entries = io.read_corpus_file('non-annotated.txt')
train_data = list(corpus_entries)

In [3]:
# Baseline model trained on the shared corpus with the 'recursive' batch
# algorithm.
model_recursive = morfessor.BaselineModel()
model_recursive.load_data(train_data)
# Kept as the cell's last expression so the value returned by train_batch is
# displayed (presumably (epochs, final cost) -- confirm in the Morfessor docs).
model_recursive.train_batch(algorithm='recursive')

...........................................................
...........................................................
...........................................................
...........................................................
...........................................................
...........................................................


(6, 3631759.648944667)

In [4]:
# Second baseline model on the same training data; the only difference from
# model_recursive is the 'viterbi' batch training algorithm.
model_viterbi = morfessor.BaselineModel()
model_viterbi.load_data(train_data)
# Kept as the cell's last expression so the value returned by train_batch is
# displayed (presumably (epochs, final cost) -- confirm in the Morfessor docs).
model_viterbi.train_batch(algorithm='viterbi')

...........................................................
...........................................................


(2, 4301783.421123428)

### Evaluate

In [5]:
# Read the annotated gold standard and build one evaluator that is shared by
# both models.
goldstd_data = io.read_annotations_file('gold_std_bpr.tsv')
ev = morfessor.MorfessorEvaluation(goldstd_data)

# NOTE(review): this cell logs "The test set is too small for this sample
# size" when run with the evaluator's default settings; consider passing an
# explicit evaluation configuration sized to the gold standard -- confirm the
# right values against the Morfessor evaluation docs.
evaluate = ev.evaluate_model
result_recursive = evaluate(model_recursive)
result_viterbi = evaluate(model_viterbi)

The test set is too small for this sample size


In [6]:
# recursive: (mean precision, mean recall, mean F-score)
tuple(
    np.mean(getattr(result_recursive, metric))
    for metric in ('precision', 'recall', 'fscore')
)

(0.5750759523809527, 0.5662833333333332, 0.5705701764065475)

In [7]:
# viterbi: (mean precision, mean recall, mean F-score)
tuple(
    np.mean(getattr(result_viterbi, metric))
    for metric in ('precision', 'recall', 'fscore')
)

(0.782516666666667, 0.2659466666666667, 0.3967475431381568)

In [8]:
# Save the trained recursive model to the file 'model_recursive' using
# MorfessorIO's binary model writer.
io.write_binary_model_file('model_recursive', model_recursive)

### WER

In [9]:
# Read the segmentation file as a list with one entry per line.
# The encoding is given explicitly so decoding does not depend on the platform
# locale (NOTE(review): assumes the file is UTF-8 -- confirm). Lines are
# collected by iterating the file, so a final trailing newline does not add a
# spurious empty entry at the end of the list.
with open('morfessor_segmentation.txt', encoding='utf-8') as f:
    morfessor_segmentation = [line.rstrip('\n') for line in f]

In [10]:
# Load the gold standard used by the WER evaluation below, where it is treated
# as a mapping from each word to an iterable of reference segmentation strings.
# JSON text is UTF-8 by specification, so the encoding is passed explicitly
# instead of relying on the platform's locale default.
with open('siblang_segmentation.json', 'r', encoding='utf-8') as f:
    gold_std_dict = json.load(f)

In [11]:
from wer import editDistance

def get_wer_result(r, h):
    """Return the error rate of hypothesis ``h`` relative to reference ``r``.

    ``editDistance(r, h)`` is expected to return a table that can be indexed
    as ``table[len(r)][len(h)]``; that cell is taken as the total edit count
    (presumably the Levenshtein distance between the two token sequences --
    confirm against the ``wer`` module).

    Args:
        r: Reference token sequence.
        h: Hypothesis token sequence.

    Returns:
        The edit count divided by ``len(r)``, as a float. The value can exceed
        1.0 when the edit count is larger than the reference length.

    Raises:
        ZeroDivisionError: If ``r`` is empty, because the result is normalised
            by the reference length.
    """
    distance_table = editDistance(r, h)
    ref_len, hyp_len = len(r), len(h)
    total_edits = distance_table[ref_len][hyp_len]
    return float(total_edits) / ref_len

In [12]:
def evaluate_segmentation_morfessor(segmenter, segmentation, gold_std=None):
    """Print and return the mean best-reference WER of ``segmentation``.

    Gold-standard words and segmentation entries are paired by position: the
    i-th word of the gold standard (in mapping iteration order) is scored
    against ``segmentation[i]``. For each word the WER is computed against
    every reference segmentation (both sides split on whitespace) and the
    lowest value is kept; the result is the mean of these per-word minima.

    Args:
        segmenter: Unused; retained so existing calls keep working.
        segmentation: Sequence of whitespace-separated segmentation strings,
            aligned by position with the gold-standard words. Entries beyond
            the number of gold-standard words are ignored.
        gold_std: Mapping from word to an iterable of reference segmentation
            strings. Defaults to the notebook-level ``gold_std_dict``.

    Returns:
        The mean WER as a float (1.0 corresponds to 100%). A word with no
        reference segmentations contributes ``inf``.

    Raises:
        ValueError: If the gold standard is empty, or if ``segmentation`` has
            fewer entries than the gold standard has words.
    """
    if gold_std is None:
        # Fall back to the gold standard loaded at notebook level.
        gold_std = gold_std_dict

    if not gold_std:
        raise ValueError('gold standard is empty; mean WER is undefined')
    if len(segmentation) < len(gold_std):
        raise ValueError(
            'segmentation has {} entries but the gold standard has {} words'
            .format(len(segmentation), len(gold_std)))

    wer_score_sum = 0
    # zip stops after the last gold-standard word, so surplus segmentation
    # entries are never scored.
    for seg, references in zip(segmentation, gold_std.values()):
        hypothesis = seg.split()  # split once per word, not once per reference
        wer_score_sum += min(
            (get_wer_result(reference.split(), hypothesis)
             for reference in references),
            default=float('+inf'))

    mean_wer = wer_score_sum / len(gold_std)
    print('wer: {:.3%}\n'.format(mean_wer))
    return mean_wer

In [13]:
# Score the segmentation lines read from 'morfessor_segmentation.txt' against
# the gold standard; the function prints the mean WER. The model passed as the
# first argument is not used by the function.
evaluate_segmentation_morfessor(model_recursive, morfessor_segmentation)

wer: 100.911%

