In [9]:
import argparse
from tqdm import tqdm
import json
import os
from torchmetrics.text import CharErrorRate, WordErrorRate
import jiwer

In [30]:
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemovePunctuation(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
])

In [31]:


input_dir = "/home/efittsc1/projects/latin-transcription/work/max_data_-1/results"

train_results = input_dir + "/train"
val_results = input_dir + "/val"

results = {"train": [], "val": []}

for directory in [train_results, val_results]:
    for filename in os.listdir(directory):
        with open(os.path.join(directory, filename), 'r') as f:
            results[os.path.basename(directory)].append(json.load(f))






In [32]:
word_error_stats = {"train": {}, "val": {}}

In [33]:
for directory in results.keys():
    for i in tqdm(range(len(results[directory]))):
        reference = transformation(results[directory][i]["truth"])
        hypothesis = transformation(results[directory][i]["prediction"])
        truth_words = reference.split(" ")
        comparison = jiwer.process_words(reference, hypothesis)
        for alignment in comparison.alignments[0]:
            for index in range(alignment.ref_start_idx, alignment.ref_end_idx):
                word_ = truth_words[index]
                type_ = alignment.type
                if word_ not in word_error_stats[directory]:
                    word_error_stats[directory][word_] = {}
                if type_ not in word_error_stats[directory][word_]:
                    word_error_stats[directory][word_][type_] = 0
                if "total" not in word_error_stats[directory][word_]:
                    word_error_stats[directory][word_]["total"] = 0
                word_error_stats[directory][word_][type_] += 1
                word_error_stats[directory][word_]["total"] += 1

                


  0%|          | 0/2013 [00:00<?, ?it/s]

100%|██████████| 2013/2013 [00:00<00:00, 9311.09it/s]
100%|██████████| 233/233 [00:00<00:00, 10644.05it/s]


In [34]:
val_words = set(word_error_stats["val"].keys())
train_words = set(word_error_stats["train"].keys())

print(f"train len: {len(train_words)}")
print(f"val len: {len(val_words)}")

val_not_in_train = val_words - train_words

print(f"val not in train: {len(val_not_in_train)}")
print(val_not_in_train)

train len: 4503
val len: 1331
val not in train: 237
{'proprie', 'pimmel', 'ho', 'norhtforyate', 'andegavie', 'normannie', 'firmacula', 'brimes', 'tholthorp', 'manuagii', 'declopton', 'hesinhull', 'vastatis', 'vaccas', 'mathie', 'furata', 'liberatas', 'assheburn', 'convincendum', 'precio', 'pretensam', 'valeat', 'nater', 'defectum', 'tare', 'tre', 'jocalia', 'comodum', 'uxoris', 'percipiendam', 'tulerunt', 'auxilio', 'continue', 'constitucionem', 'symonis', 'aliter', 'dicari', 'cecilia', 'alij', 'albus', 'sequebatur', 'existencium', 'cornubie', 'fraude', 'aueria', 'bonis', 'caloch', 'molere', 'lunar', 'attornatos', 'xilij', 'hybernie', 'pyrie', 'easdem', 'judicii', 'prisonum', 'aldemare', 'redem', 'sores', 'xxmo', 'magis', 'clamavit', 'removendum', 'assignatas', 'ligauit', 'chyrin', 'admisit', 'na', 'licit', 'wemme', 'mor', 'tokene', 'custos', 'ta', 'xori', 'execucione', 'emit', 'wolveseye', 'totus', 'cura', 'utilitatem', 'waut', 'duxit', 'contemptu', 'hundretorum', 'godefridi', 'medita

In [42]:
def calculate_accuracy(list_word_stats):
    total = 0
    correct = 0
    for word, stats in list_word_stats:
        total += stats["total"]
        correct += stats["equal"] if "equal" in stats else 0
    return correct / total

In [44]:
train_accuracy = calculate_accuracy(list(word_error_stats["train"].items()))
val_accuracy = calculate_accuracy(list(word_error_stats["val"].items()))

print(f"train accuracy: {train_accuracy}")
print(f"val accuracy: {val_accuracy}")

train accuracy: 0.9419863991268576
val accuracy: 0.8815888083937047


In [46]:
words_not_in_train_accuracy = calculate_accuracy([(word, word_error_stats["val"][word]) for word in val_not_in_train])
print(f"words not in train accuracy: {words_not_in_train_accuracy}")

words not in train accuracy: 0.4583333333333333


In [55]:
import pandas as pd

val_df = pd.DataFrame.from_dict(word_error_stats["val"], orient='index')
train_df = pd.DataFrame.from_dict(word_error_stats["train"], orient='index')

val_not_in_train_df = pd.DataFrame.from_dict({word: word_error_stats["val"][word] for word in val_not_in_train}, orient='index')


val_df["accuracy"] = val_df["equal"] / val_df["total"]
train_df["accuracy"] = train_df["equal"] / train_df["total"]
val_not_in_train_df["accuracy"] = val_not_in_train_df["equal"] / val_not_in_train_df["total"]

In [56]:
train_df.head()

Unnamed: 0,equal,total,substitute,delete,accuracy
et,2479.0,2593,50.0,64.0,0.956035
in,848.0,863,8.0,7.0,0.982619
quas,16.0,16,,,1.0
iidem,9.0,9,,,1.0
thomas,95.0,102,6.0,1.0,0.931373


In [63]:
# words most often misclassified in val set
val_not_in_train_df.sort_values("substitute", ascending=False).head(50)

Unnamed: 0,equal,total,substitute,delete,accuracy
comitis,1.0,2,1.0,,0.5
lacy,1.0,2,1.0,,0.5
admittat,1.0,2,1.0,,0.5
ho,,1,1.0,,
andegavie,,1,1.0,,
normannie,,1,1.0,,
firmacula,,1,1.0,,
manuagii,,1,1.0,,
declopton,,1,1.0,,
hesinhull,,1,1.0,,


In [51]:
val_not_in_train_df.head()

Unnamed: 0,equal,total,substitute,delete
proprie,1.0,1,,
pimmel,1.0,1,,
norhtforyate,1.0,1,,
brimes,1.0,1,,
tholthorp,1.0,1,,
