In [50]:
import os
from collections import defaultdict
from pathlib import Path

import pandas as pd

from transliteration import evaluate
from transliteration.script import SCRIPTS

In [63]:
def load_df(path):
    """Read a CSV of string pairs and normalize its script columns.

    The file is parsed with ``keep_default_na=False`` so that cells such as
    ``NA`` or ``null`` stay literal strings instead of becoming NaN.

    Args:
        path: Anything ``pd.read_csv`` accepts (path or file-like object).

    Returns:
        The loaded DataFrame. Every column whose name is a key of ``SCRIPTS``
        has had that script's ``preprocess_string`` applied to each value;
        all other columns are returned as read.
    """
    frame = pd.read_csv(path, keep_default_na=False)
    # Only columns named after a known script get preprocessed.
    script_columns = [name for name in frame.columns if name in SCRIPTS]
    for name in script_columns:
        preprocess = SCRIPTS[name].preprocess_string
        frame[name] = frame[name].map(preprocess)
    return frame

def memory_transliterator(df, from_script, to_script):
    """Build a lookup-table "transliterator" that memorizes the pairs in ``df``.

    Args:
        df: DataFrame with one source/target pair per row.
        from_script: Name of the column holding the source strings.
        to_script: Name of the column holding the target strings.

    Returns:
        A function taking a source string and returning a list of every
        target paired with it in ``df`` (in row order, duplicates kept).
        A string that never occurs in ``df[from_script]`` yields ``[]``.
        Each call returns a fresh list, so callers may mutate the result
        freely.
    """
    memory = defaultdict(list)
    # Walk the two columns in lockstep; this avoids materializing a Series
    # per row the way iterrows() does.
    for source, target in zip(df[from_script], df[to_script]):
        memory[source].append(target)

    def result(string):
        # Use .get() rather than memory[string]: indexing a defaultdict
        # inserts an empty entry for every unseen key, so lookups would grow
        # the table. Copy the list so the stored candidates cannot be
        # modified through the returned value.
        return list(memory.get(string, ()))
    return result

def metric_ceiling(full_df, test_df, from_script, to_script):
    """Score a pure-memorization transliterator on ``test_df``.

    A lookup table is built from ``full_df`` via ``memory_transliterator``
    and queried with every source string in ``test_df[from_script]``. The
    resulting candidate lists are scored against ``test_df[to_script]``.

    Args:
        full_df: DataFrame used to build the lookup table.
        test_df: DataFrame whose pairs are evaluated.
        from_script: Source column name (present in both frames).
        to_script: Target column name (present in both frames).

    Returns:
        dict with ``'acc@1'`` (``evaluate.top_k_accuracy`` at ``k=1``) and
        ``'mrr@5'`` (``evaluate.mrr`` at ``k=5``).
    """
    lookup = memory_transliterator(full_df, from_script, to_script)

    candidates = []
    for source in test_df[from_script]:
        candidates.append(lookup(source))

    # The evaluate helpers are handed a 2-tuple whose second slot is None
    # here; NOTE(review): meaning of that slot is defined in
    # transliteration.evaluate - confirm there.
    predictions = (candidates, None)

    accuracy_at_1 = evaluate.top_k_accuracy(test_df[to_script], predictions, k=1)
    mrr_at_5 = evaluate.mrr(test_df[to_script], predictions, k=5)
    return {'acc@1': accuracy_at_1, 'mrr@5': mrr_at_5}

In [64]:
# Location of the transliteration data. The default is the original
# machine-specific checkout; set TRANSLITERATION_DATA_DIR to run this
# notebook against a copy of the data that lives elsewhere.
data_dir = Path(os.environ.get(
    'TRANSLITERATION_DATA_DIR',
    '/home/derick/code/ml-final-project/transliteration/data/',
))

# Full pair lists: used to build the memorization lookup tables.
eob_full = load_df(data_dir / 'processed/eob_katakana_pairs.csv')
muse_full = load_df(data_dir / 'processed/muse_katakana_pairs.csv')

# Test splits: the pairs the lookup tables are scored on.
eob_test = load_df(data_dir / 'split/eob_pairs_test.csv')
muse_test = load_df(data_dir / 'split/muse_pairs_test.csv')

In [65]:
# Report, per dataset, the scores of a lookup table built from the full pair
# list when queried with that dataset's test split (en -> ja).
print('eob: {}'.format(
    metric_ceiling(full_df=eob_full, test_df=eob_test,
                   from_script='en', to_script='ja')))
print('muse: {}'.format(
    metric_ceiling(full_df=muse_full, test_df=muse_test,
                   from_script='en', to_script='ja')))

eob: {'acc@1': 0.9353760445682451, 'mrr@5': 0.9669916434540389}


muse: {'acc@1': 0.9564616447823082, 'mrr@5': 0.9782308223911541}
