These three cells compute test scores from the files in the `results/` folder. Those files were generated with 45 beams on A40 GPUs (45 GB memory; 2–7 hours per process) and on A100 GPUs (80 GB memory; 1–2 hours per process).

In [1]:
# Compute scores for text restoration
import json
import re
import os
from difflib import SequenceMatcher

def normalize_text(text):
    """Return ``text`` without the characters that are ignored when scoring.

    Every "0", middle dot ("·") and space is dropped; all other characters
    are kept in their original order.
    """
    return "".join(ch for ch in text if ch not in "0· ")

def truncate_to_length(text, target_length):
    """Limit ``text`` to at most ``target_length`` scored characters.

    Scored characters are everything except the fillers "0", "·" and space.

    * If ``text`` holds ``target_length`` scored characters or fewer, the
      filler-free text is returned.
    * Otherwise only the first ``target_length`` scored characters are kept.
      Filler characters are always copied through unchanged, including the
      ones that occur after the cut-off.

    Args:
        text: String to shorten; may contain filler characters.
        target_length: Maximum number of scored characters to keep.

    Returns:
        The shortened string, as described by the two cases above.
    """
    valid_text = re.sub(r"[0· ]", "", text)
    if len(valid_text) <= target_length:
        return valid_text

    kept = []  # gather pieces and join once instead of growing a str with +=
    valid_count = 0
    for char in text:
        if char in "0· ":
            kept.append(char)
        elif valid_count < target_length:
            kept.append(char)
            valid_count += 1
    return "".join(kept)

def calculate_cer(real, predicted):
    """Return a character-level error score between 0 and 100.

    Both strings are passed through ``normalize_text`` and compared with
    ``difflib.SequenceMatcher``. The score is ``(1 - ratio) * 100``, so 0
    means the normalized strings are identical. This is a ratio-based
    dissimilarity, not an edit distance divided by the reference length.

    Args:
        real: Reference string.
        predicted: Predicted string.

    Returns:
        Float in the range [0, 100]; lower is better.
    """
    real_normalized = normalize_text(real)
    predicted_normalized = normalize_text(predicted)
    # autojunk=False: the default heuristic starts ignoring frequently
    # repeated characters once the second string reaches 200 characters,
    # which would distort the ratio for long texts. Shorter strings are
    # scored exactly as with the default.
    matcher = SequenceMatcher(None, real_normalized, predicted_normalized, autojunk=False)
    return (1 - matcher.ratio()) * 100

def process_file(file_path):
    """Score one text-restoration results file.

    Each non-blank line of the JSONL file must be an object with a
    ``"real_response"`` string and a ``"predictions"`` mapping that contains
    at least the key ``"rank_1"``. Every prediction is truncated to the
    number of scored characters in the real response before it is compared.

    Args:
        file_path: Path to a ``*_text.jsonl`` results file.

    Returns:
        Tuple ``(avg_cer, rank_1_match_percentage, real_in_top_20_percentage)``:
        the mean ``calculate_cer`` score of the rank-1 prediction, the
        percentage of entries whose rank-1 prediction equals the real response
        after normalization, and the percentage of entries for which any
        stored prediction does. All three are ``0.0`` when the file contains
        no entries.
    """
    total_cer = 0
    rank_1_matches = 0
    real_in_top_20 = 0
    total_lines = 0
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank or trailing empty lines
            entry = json.loads(line)
            real_response = entry["real_response"]
            predictions = entry["predictions"]
            real_response_normalized = normalize_text(real_response)
            target_length = len(real_response_normalized)

            rank_1_truncated = truncate_to_length(predictions["rank_1"], target_length)
            total_cer += calculate_cer(real_response, rank_1_truncated)
            if normalize_text(rank_1_truncated) == real_response_normalized:
                rank_1_matches += 1

            # "Top-20" considers every prediction stored for the entry,
            # however many there are.
            if any(
                normalize_text(truncate_to_length(pred, target_length)) == real_response_normalized
                for pred in predictions.values()
            ):
                real_in_top_20 += 1
            total_lines += 1

    if total_lines == 0:
        # Empty file: report zeros instead of dividing by zero.
        return 0.0, 0.0, 0.0

    avg_cer = total_cer / total_lines
    rank_1_match_percentage = (rank_1_matches / total_lines) * 100
    real_in_top_20_percentage = (real_in_top_20 / total_lines) * 100
    return avg_cer, rank_1_match_percentage, real_in_top_20_percentage

def main():
    """Print a tab-separated score table for the text-restoration results.

    Every file in ``results`` whose name is ``<model>_text.jsonl`` is scored
    with ``process_file`` and reported on one row, labelled with ``<model>``.
    """
    directory_path = "results"
    pattern = re.compile(r"(.+?)_text\.jsonl")
    print("Model\tCER\tTop-1 Accuracy\tTop-20 Accuracy")
    # Sorted so the row order does not depend on the directory listing order.
    for filename in sorted(os.listdir(directory_path)):
        # fullmatch skips names that only start like a results file
        # (for example a backup such as "x_text.jsonl.bak").
        match = pattern.fullmatch(filename)
        if match is None:
            continue
        model = match.group(1)  # Extract only the model part
        file_path = os.path.join(directory_path, filename)
        avg_cer, rank_1_match_percentage, real_in_top_20_percentage = process_file(file_path)
        print(f"{model}\t{avg_cer:.1f}%\t{rank_1_match_percentage:.1f}%\t{real_in_top_20_percentage:.1f}%")
# Run the text-restoration evaluation when this cell (or script) is executed directly.
if __name__ == "__main__":
    main()

Model	CER	Top-1 Accuracy	Top-20 Accuracy
Epigr_1_Llama-3.1-8B-Instruct	22.5%	60.9%	77.5%
Papy_1_Llama-3.1-8B-Instruct	16.3%	71.3%	85.0%


In [2]:
# Compute scores for geographical attribution
import json
import re
import os
from difflib import SequenceMatcher

def normalize_text(text):
    """Return ``text`` with ".", space, "/", "·" and "," removed.

    Note: this definition replaces the helper of the same name from the
    text-restoration cell, which strips a different character set.
    """
    return text.translate(str.maketrans("", "", ". /·,"))

def levenshtein_similarity(real, predicted):
    """Score how closely two strings agree on their leading characters.

    Each string is first cut to its first 10 characters and then passed
    through ``normalize_text``; the two results are compared with
    ``difflib.SequenceMatcher``. Despite the function name, the value is
    SequenceMatcher's ratio scaled to 0-100, not a Levenshtein distance.

    Returns:
        Float in the range [0, 100]; 100 means the compared prefixes are equal.
    """
    head_real, head_predicted = (normalize_text(value[:10]) for value in (real, predicted))
    return SequenceMatcher(None, head_real, head_predicted).ratio() * 100

def process_file(file_path, threshold=90):
    """Count near-matches in one geographical-attribution results file.

    Each non-blank line of the JSONL file must be an object with a
    ``"real_response"`` string and a ``"predictions"`` mapping holding
    ``"rank_1"``, ``"rank_2"`` and ``"rank_3"``. A prediction counts as a hit
    when ``levenshtein_similarity`` with the real response is at least
    ``threshold``.

    Args:
        file_path: Path to a ``*_place.jsonl`` results file.
        threshold: Minimum similarity (0-100) for a hit. Defaults to 90.

    Returns:
        Tuple ``(rank_1_within_90, rank_1_3_within_90, total_lines)``: the
        number of entries whose rank-1 prediction is a hit, the number whose
        rank-1, rank-2 or rank-3 prediction is a hit, and the number of
        entries read.
    """
    rank_1_within_90 = 0
    rank_1_3_within_90 = 0
    total_lines = 0

    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank or trailing empty lines
            entry = json.loads(line)
            real_response = entry["real_response"]
            predictions = entry["predictions"]
            # Read all three ranks up front so an incomplete entry still
            # fails loudly with a KeyError.
            top_3 = [predictions["rank_1"], predictions["rank_2"], predictions["rank_3"]]

            # Score rank 1 once and reuse the result for both counters.
            rank_1_hit = levenshtein_similarity(real_response, top_3[0]) >= threshold
            if rank_1_hit:
                rank_1_within_90 += 1
            if rank_1_hit or any(
                levenshtein_similarity(real_response, prediction) >= threshold
                for prediction in top_3[1:]
            ):
                rank_1_3_within_90 += 1
            total_lines += 1
    return rank_1_within_90, rank_1_3_within_90, total_lines

def main():
    """Print a tab-separated accuracy table for the geographical-attribution results.

    Every file in ``results/`` whose name is ``<model>_place.jsonl`` is scored
    with ``process_file`` and reported on one row, labelled with ``<model>``.
    Percentages are reported as 0 for a file without entries.
    """
    directory_path = "results/"
    pattern = re.compile(r"(.+?)_place\.jsonl")
    print("Model\tTop-1 accuracy\tTop-3 accuracy")
    # Sorted so the row order does not depend on the directory listing order.
    for filename in sorted(os.listdir(directory_path)):
        # fullmatch skips names that only start like a results file
        # (for example a backup such as "x_place.jsonl.bak").
        match = pattern.fullmatch(filename)
        if match is None:
            continue
        model = match.group(1)
        file_path = os.path.join(directory_path, filename)
        rank_1_within_90, rank_1_3_within_90, total_lines = process_file(file_path)
        rank_1_within_90_percentage = (rank_1_within_90 / total_lines) * 100 if total_lines > 0 else 0
        rank_1_3_within_90_percentage = (rank_1_3_within_90 / total_lines) * 100 if total_lines > 0 else 0
        print(f"{model}\t{rank_1_within_90_percentage:.1f}%\t{rank_1_3_within_90_percentage:.1f}%")

# Run the geographical-attribution evaluation when this cell (or script) is executed directly.
if __name__ == "__main__":
    main()

Model	Top-1 accuracy	Top-3 accuracy
Papy_1_Llama-3.1-8B-Instruct	66.5%	80.0%
Epigr_1_Llama-3.1-8B-Instruct	75.0%	83.7%


In [4]:
# Compute scores for dating

import json
import re
import os
import numpy as np

def calculate_date_range(date, open_span=25):
    """Turn a date label into an inclusive ``(start, end)`` year range.

    Recognized forms (years may be negative):

    * ``"<year>±<margin>"`` -> ``(year - margin, year + margin)``
    * ``"<year>+"``         -> ``(year, year + open_span)``
    * ``"<year>-"``         -> ``(year - open_span, year)``

    Args:
        date: Date label to parse.
        open_span: Width in years given to the open-ended "+" and "-" forms.
            Defaults to 25.

    Returns:
        ``(start, end)`` as integers, or ``None`` when the label is not in a
        recognized form. This includes a "+" or "-" suffix that is not
        preceded by a valid integer.
    """
    match = re.match(r"(-?\d+)±(\d+)", date)
    if match:
        base, margin = int(match.group(1)), int(match.group(2))
        return base - margin, base + margin

    if date.endswith(("+", "-")):
        try:
            base = int(date[:-1])
        except ValueError:
            # e.g. "abc+" or a lone "+": treat like any other unrecognized
            # label instead of aborting the whole evaluation.
            return None
        if date.endswith("+"):
            return base, base + open_span
        return base - open_span, base
    return None

def clean_prediction(prediction):
    """Strip generation artifacts from a prediction and parse it as a year.

    Newlines and the literal text "assistant" are removed, surrounding
    whitespace is trimmed, and the remainder is handed to
    ``convert_prediction_format``.

    Returns:
        Whatever ``convert_prediction_format`` returns, or ``None`` if it
        raises ``ValueError``.
    """
    stripped = prediction
    for noise in ("\n", "assistant"):
        stripped = stripped.replace(noise, "")
    stripped = stripped.strip()
    try:
        year = convert_prediction_format(stripped)
    except ValueError:
        year = None
    return year

def convert_prediction_format(prediction):
    """Reduce a predicted date string to a single integer year.

    The string is tested against the rules below, in order, at its start:

    * ``"<year>±<margin>"`` -> ``year``
    * ``"<year>+"``         -> ``year + 12``
    * ``"<year>-"``         -> ``year - 12``

    If no rule applies, the whole string is parsed as an integer.

    Returns:
        The year as an ``int``, or ``None`` when nothing can be parsed.
    """
    # (pattern, offset added to the captured year); the first match wins.
    prefix_rules = (
        (r"(-?\d+)±\d+", 0),
        (r"(-?\d+)\+", 12),
        (r"(-?\d+)\-", -12),
    )
    for rule, shift in prefix_rules:
        found = re.match(rule, prediction)
        if found:
            return int(found.group(1)) + shift

    try:
        year = int(prediction)
    except ValueError:
        year = None
    return year

def calculate_distance(real_range, predicted):
    """Measure how far a predicted year lies outside the real date range.

    Args:
        real_range: ``(start, end)`` pair of years.
        predicted: Predicted year, or ``None`` when no prediction is available.

    Returns:
        ``None`` if ``predicted`` is ``None``; 0 if the prediction falls
        within the range (bounds included); otherwise the gap to the nearer
        bound.
    """
    lower, upper = real_range
    if predicted is None:
        return None

    inside = lower <= predicted <= upper
    return 0 if inside else min(abs(bound - predicted) for bound in (lower, upper))

def process_file(file_path):
    """Collect rank-1 dating errors for one JSONL results file.

    For every line that parses as JSON, the real response is converted to a
    year range and the rank-1 prediction to a single year; the distance
    between them is recorded. Lines that are not valid JSON are reported on
    stdout and skipped. Entries whose real response or prediction cannot be
    interpreted are skipped silently.

    Returns:
        List of distances, one per entry that could be scored.
    """
    distances = []

    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            # Only the JSON parsing itself can raise JSONDecodeError.
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError:
                print(f"Error decoding JSON line: {raw_line.strip()}")
                continue

            true_label = record["real_response"]
            top_prediction = record["predictions"]["rank_1"]

            # Year span of the real date; entries without one cannot be scored.
            true_span = calculate_date_range(true_label)
            if true_span is None:
                continue

            # Rank-1 prediction as an integer year (None when unparseable).
            predicted_year = clean_prediction(top_prediction)

            gap = calculate_distance(true_span, predicted_year)
            if gap is None:
                continue
            distances.append(gap)

    return distances

def main():
    """Print a tab-separated table of dating errors per model.

    Every file in ``results/`` whose name is ``<model>_date.jsonl`` is
    processed with ``process_file``; the mean and median of the returned
    distances are reported on one row, labelled with ``<model>``. A file that
    yields no distances is reported as "No valid data".
    """
    directory_path = "results/"
    pattern = re.compile(r"(.+?)_date\.jsonl")

    print("Model\tAverage Distance from span\tMedian distance")

    # Sorted so the row order does not depend on the directory listing order.
    for filename in sorted(os.listdir(directory_path)):
        # fullmatch skips names that only start like a results file
        # (for example a backup such as "x_date.jsonl.bak").
        match = pattern.fullmatch(filename)
        if match is None:
            continue
        model = match.group(1)

        file_path = os.path.join(directory_path, filename)
        distances = process_file(file_path)

        if distances:
            mean_distance = np.mean(distances)
            median_distance = np.median(distances)

            print(f"{model}\t{mean_distance:.2f} years\t{median_distance:.2f} years")
        else:
            print(f"{model}\tNo valid data\tNo valid data")

# Run the dating evaluation when this cell (or script) is executed directly.
if __name__ == "__main__":
    main()

Model	Average Distance from span	Median distance
Papy_1_Llama-3.1-8B-Instruct	21.72 years	0.00 years
Epigr_1_Llama-3.1-8B-Instruct	26.22 years	1.00 years
