In [3]:
import pandas as pd
from eval import strip_gloss_punctuation, eval_word_glosses, eval_morpheme_glosses
from typing import List
import re

def clean_preds(preds):
    """Strip sentence punctuation and collapse spacing in gloss strings.

    The following regex substitutions are applied, in this order:

    1. remove a single period at the end of the string,
    2. remove every comma, guillemet (``»``/``«``) and double quote,
    3. turn a period followed by a space into a single space,
    4. remove runs of two or more periods,
    5. collapse runs of spaces into one space.

    Periods that are not caught by one of these rules (for example the one
    in ``go.PST``) are left in place.

    Args:
        preds: An object exposing a pandas-style
            ``.replace(pattern, value, regex=True)`` method; the caller in
            this file passes DataFrame columns.

    Returns:
        The result of chaining the substitutions on ``preds``; ``preds``
        itself is not modified in place.
    """
    # Order matters: e.g. "a... b" first loses ". " (-> "a.. b") and only then
    # has the remaining dot run removed, so the rules must not be reordered.
    # Patterns are raw strings so that regex escapes such as ``\.`` are not
    # treated as (invalid) Python string escape sequences.
    substitutions = (
        (r'\.$', ''),
        (',', ''),
        ('»', ''),
        ('«', ''),
        ('"', ''),
        (r'\. ', ' '),
        (r'\.\.+', ''),
        (' +', ' '),
    )
    corrected_preds = preds
    for pattern, replacement in substitutions:
        corrected_preds = corrected_preds.replace(pattern, replacement, regex=True)
    return corrected_preds

def _eval(preds: List[str], gold: List[str]):
    """Score predicted glosses against gold glosses at word and morpheme level.

    Every prediction and gold string is first passed through
    ``strip_gloss_punctuation``. Words are obtained by whitespace splitting;
    morphemes by splitting on whitespace or ``-``. Predictions are coerced
    with ``str()`` before splitting; gold entries are split as-is.

    Args:
        preds: Iterable of predicted gloss lines.
        gold: Iterable of reference gloss lines, aligned with ``preds``.

    Returns:
        A dict merging the result of ``eval_word_glosses`` with the result of
        ``eval_morpheme_glosses``; on duplicate keys the morpheme-level value
        wins.
    """
    morpheme_boundary = r"\s|-"

    stripped_preds = [strip_gloss_punctuation(p) for p in preds]
    stripped_gold = [strip_gloss_punctuation(g) for g in gold]

    # Word-level scores: tokens are whitespace-delimited.
    word_scores = eval_word_glosses(
        pred_words=[str(p).split() for p in stripped_preds],
        gold_words=[g.split() for g in stripped_gold],
    )

    # Morpheme-level scores: tokens are delimited by whitespace or hyphens.
    morpheme_scores = eval_morpheme_glosses(
        pred_morphemes=[re.split(morpheme_boundary, str(p)) for p in stripped_preds],
        gold_morphemes=[re.split(morpheme_boundary, g) for g in stripped_gold],
    )

    return {**word_scores, **morpheme_scores}

def postprocess(path, segmented: bool):
    """Clean a predictions CSV, save the cleaned rows, and evaluate them.

    The CSV at ``path`` is read with missing values replaced by ``''``, and
    only rows whose ``is_segmented`` column equals ``"yes"`` (when
    ``segmented`` is true) or ``"no"`` (otherwise) are kept. The ``pred`` and
    ``gold`` columns are run through ``clean_preds`` and the filtered frame
    is written next to the input, with the input's extension replaced by
    ``.postprocessed.csv``.

    Args:
        path: Location of the predictions CSV. It must provide the columns
            ``is_segmented``, ``pred``, ``gold`` and ``glottocode``.
        segmented: Selects the ``is_segmented == "yes"`` rows when true,
            the ``"no"`` rows when false.

    Returns:
        A dict mapping ``'all'`` to the ``_eval`` result over every kept row,
        and each distinct ``glottocode`` value to the ``_eval`` result over
        that language's rows.
    """
    # Function-scope import keeps this block self-contained.
    import os

    wanted_flag = "yes" if segmented else "no"
    pred_df = pd.read_csv(path).fillna('')
    # Take an explicit copy: assigning columns on a boolean-filtered slice
    # triggers pandas' chained-assignment warning.
    pred_df = pred_df[pred_df['is_segmented'] == wanted_flag].copy()
    pred_df['pred'] = clean_preds(pred_df['pred'])
    pred_df['gold'] = clean_preds(pred_df['gold'])

    # splitext drops whatever extension is present instead of blindly cutting
    # the last four characters; for ``*.csv`` inputs the result is identical.
    stem, _ = os.path.splitext(path)
    pred_df.to_csv(stem + '.postprocessed.csv')

    all_eval = {'all': _eval(pred_df['pred'], pred_df['gold'])}
    for lang in pred_df["glottocode"].unique():
        lang_df = pred_df[pred_df["glottocode"] == lang]
        all_eval[lang] = _eval(lang_df['pred'], lang_df['gold'])
    return all_eval

# Evaluate the segmented rows of the mt5_odin_punct_fix OOD test predictions.
# Left as a bare expression so the notebook displays the returned dict.
postprocess(
    '../preds/2023glossingSTsubmissions/mt5_odin_punct_fix/test_OOD-preds.csv',
    segmented=True,
)

{'all': {'word_level': {'average_accuracy': 0.725442089982268,
   'accuracy': 0.6690837178642056},
  'bleu': 0.5207739472389221,
  'WER': 0.30762286273663314,
  'morpheme_level': {'average_accuracy': 0.7112711277923897,
   'accuracy': 0.604480494399382},
  'classes': {'stem': {'prec': 0.6197295147175815,
    'rec': 0.6024748646558391,
    'f1': 0.6109803921568627},
   'gram': {'prec': 0.6275449101796408,
    'rec': 0.6064814814814815,
    'f1': 0.6168334314302532}},
  'MER': 0.26671672160918153},
 'gitx1241': {'word_level': {'average_accuracy': 0.24144700909406785,
   'accuracy': 0.20572916666666666},
  'bleu': 0.058711063116788864,
  'WER': 1.108072928661164,
  'morpheme_level': {'average_accuracy': 0.16549584764712444,
   'accuracy': 0.12735166425470332},
  'classes': {'stem': {'prec': 0.07722007722007722,
    'rec': 0.0749063670411985,
    'f1': 0.07604562737642585},
   'gram': {'prec': 0.18037135278514588,
    'rec': 0.16037735849056603,
    'f1': 0.16978776529338324}},
  'MER': 1.

{'word_level': {'average_accuracy': 0.7141752618726303,
  'accuracy': 0.7022388059701492},
 'bleu': 0.6296881437301636,
 'WER': 0.28055459991712256,
 'morpheme_level': {'average_accuracy': 0.7284273998003787,
  'accuracy': 0.6980968858131488},
 'classes': {'stem': {'prec': 0.68212927756654,
   'rec': 0.6764705882352942,
   'f1': 0.6792881484286255},
  'gram': {'prec': 0.7353846153846154,
   'rec': 0.7271805273833671,
   'f1': 0.7312595614482408}},
 'MER': 0.2163910526701597}