In [67]:
import pandas as pd
from eval import strip_gloss_punctuation, eval_word_glosses, eval_morpheme_glosses
from typing import List
import re

def clean_preds(preds):
    """Remove punctuation noise from gloss strings.

    Applies a fixed sequence of regex substitutions through the ``replace``
    method of ``preds`` (called with ``regex=True``), so ``preds`` is expected
    to be a pandas Series or DataFrame of strings. The input is not modified;
    a new object is returned.

    The substitutions run in this order, and the order matters (for example
    ``"a... b"`` becomes ``"a.. b"`` and only then ``"a b"``):

    1. drop a single period at the very end of the string
    2. drop commas
    3. drop the guillemets ``»`` and ``«``
    4. drop double quotes
    5. turn a period followed by a space into a single space
    6. drop any run of two or more periods
    7. collapse runs of spaces into one space

    Args:
        preds: pandas object holding the strings to clean.

    Returns:
        A new pandas object of the same shape with the cleaned strings.
    """
    # Raw strings hand the backslashes to the regex engine untouched and avoid
    # Python's invalid-escape-sequence warning for sequences such as "\." or
    # "\,", which newer interpreters report as a SyntaxWarning.
    substitutions = (
        (r'\.$', ''),
        (r'\,', ''),
        ('»', ''),
        ('«', ''),
        (r'"', ''),
        (r'\. ', ' '),
        (r'\.\.+', ''),
        (r'\ +', ' '),
    )

    corrected_preds = preds
    for pattern, replacement in substitutions:
        corrected_preds = corrected_preds.replace(pattern, replacement, regex=True)
    return corrected_preds

def _eval(preds: List[str], gold: List[str]):
    """Score predicted glosses against gold glosses at word and morpheme level.

    Every prediction and gold gloss is first passed through
    ``strip_gloss_punctuation``. Each result is then tokenised twice: into
    words (split on whitespace) and into morphemes (split on any whitespace
    character or a hyphen). Predictions are coerced with ``str`` before
    splitting; gold glosses are split as they are.

    Args:
        preds: iterable of predicted gloss strings.
        gold: iterable of gold gloss strings, aligned with ``preds``.

    Returns:
        A single dict containing the entries returned by
        ``eval_word_glosses`` followed by those of ``eval_morpheme_glosses``;
        if both return the same key, the morpheme-level value is kept.
    """
    morpheme_boundary = re.compile(r"\s|-")

    stripped_preds = [strip_gloss_punctuation(item) for item in preds]
    stripped_gold = [strip_gloss_punctuation(item) for item in gold]

    # Build every token list up front, before any metric is computed.
    words_predicted = [str(item).split() for item in stripped_preds]
    words_expected = [item.split() for item in stripped_gold]
    morphemes_predicted = [morpheme_boundary.split(str(item)) for item in stripped_preds]
    morphemes_expected = [morpheme_boundary.split(item) for item in stripped_gold]

    word_scores = eval_word_glosses(
        pred_words=words_predicted, gold_words=words_expected
    )
    morpheme_scores = eval_morpheme_glosses(
        pred_morphemes=morphemes_predicted, gold_morphemes=morphemes_expected
    )

    # Word-level entries go in first so morpheme-level entries win on a clash.
    scores = {}
    scores.update(word_scores)
    scores.update(morpheme_scores)
    return scores

def postprocess(path, segmented: bool):
    """Clean a predictions CSV, save the cleaned copy, and evaluate it.

    Reads the CSV at ``path`` (missing values become empty strings), keeps
    only the rows whose ``is_segmented`` column equals ``"yes"`` when
    ``segmented`` is true or ``"no"`` otherwise, and runs ``clean_preds`` over
    the ``pred`` and ``gold`` columns. The cleaned rows are written next to
    the input, with the input's extension replaced by ``.postprocessed.csv``.

    Args:
        path: location of the predictions CSV (string or path-like). The file
            must provide the columns ``is_segmented``, ``pred``, ``gold`` and
            ``glottocode``.
        segmented: selects the ``"yes"`` rows when true, the ``"no"`` rows
            when false.

    Returns:
        A dict mapping ``'all'`` to the ``_eval`` result over every kept row,
        plus one entry per distinct ``glottocode`` value mapping to the
        ``_eval`` result for that language's rows.
    """
    import os  # local import: only needed to derive the output filename

    pred_df = pd.read_csv(path).fillna('')
    wanted_flag = "yes" if segmented else "no"
    # Take an explicit copy of the filtered rows so the column assignments
    # below write to an independent frame instead of a slice of the original
    # (which pandas flags with SettingWithCopyWarning).
    pred_df = pred_df[pred_df['is_segmented'] == wanted_flag].copy()
    pred_df['pred'] = clean_preds(pred_df['pred'])
    pred_df['gold'] = clean_preds(pred_df['gold'])

    # Strip whatever extension the input has rather than assuming it is
    # exactly four characters long; "x.csv" still yields "x.postprocessed.csv".
    root, _ = os.path.splitext(os.fspath(path))
    pred_df.to_csv(root + '.postprocessed.csv')

    all_eval = {}
    all_eval['all'] = _eval(pred_df['pred'], pred_df['gold'])

    for lang in pred_df["glottocode"].unique():
        lang_df = pred_df[pred_df["glottocode"] == lang]
        all_eval[lang] = _eval(lang_df['pred'], lang_df['gold'])
    return all_eval

# Evaluate the BASELINE test predictions; passing False keeps only the rows
# whose is_segmented column is "no". As a side effect this also writes the
# cleaned rows to a ".postprocessed.csv" file alongside the input.
postprocess('../preds/2023glossingSTsubmissions/BASELINE/test_ID-preds.csv', False)

{'all': {'word_level': {'average_accuracy': 0.6866442591223938,
   'accuracy': 0.7092985582374858},
  'bleu': 0.4755599070183826,
  'WER': 0.3126596440853715,
  'morpheme_level': {'average_accuracy': 0.5237608294961661,
   'accuracy': 0.46245691500146435},
  'classes': {'stem': {'prec': 0.5174554512358689,
    'rec': 0.49401821973438703,
    'f1': 0.505465299094108},
   'gram': {'prec': 0.5962991257108904,
    'rec': 0.41187851782363977,
    'f1': 0.4872212782189548}},
  'MER': 0.38961197551427107},
 'arap1274': {'word_level': {'average_accuracy': 0.6795463734912861,
   'accuracy': 0.6988122975507188},
  'bleu': 0.4268306322218423,
  'WER': 0.3197404149014028,
  'morpheme_level': {'average_accuracy': 0.510326921466861,
   'accuracy': 0.43584559984837956},
  'classes': {'stem': {'prec': 0.5048834355828221,
    'rec': 0.4816011235955056,
    'f1': 0.4929675332454774},
   'gram': {'prec': 0.6351302460202605,
    'rec': 0.3409399883472519,
    'f1': 0.4437002401112094}},
  'MER': 0.4110601

{'word_level': {'average_accuracy': 0.7141752618726303,
  'accuracy': 0.7022388059701492},
 'bleu': 0.6296881437301636,
 'WER': 0.28055459991712256,
 'morpheme_level': {'average_accuracy': 0.7284273998003787,
  'accuracy': 0.6980968858131488},
 'classes': {'stem': {'prec': 0.68212927756654,
   'rec': 0.6764705882352942,
   'f1': 0.6792881484286255},
  'gram': {'prec': 0.7353846153846154,
   'rec': 0.7271805273833671,
   'f1': 0.7312595614482408}},
 'MER': 0.2163910526701597}