Check if files match in both predictions and truth folders:

In [1]:
import os, json

pred_folder = r"C:\Users\Bjørn\Desktop\ITC\ANLP\predictions\easy"
truth_folder = r"C:\Users\Bjørn\Desktop\ITC\ANLP\Data\easy\validation"


def _problem_keys(folder, role, suffix=".json"):
    """Return the sorted problem keys for the files in `folder`.

    A file is considered only when its name has the form
    ``<role>-problem-<...><suffix>``. The key is the file name with the
    leading ``<role>-`` and the trailing `suffix` removed, i.e. it still
    starts with ``problem-``.

    Files that do not end with `suffix` are skipped: slicing a fixed number
    of characters off such a name would produce a mangled key and a bogus
    "missing" report.
    """
    role_prefix = f"{role}-"
    required_start = role_prefix + "problem-"
    return sorted(
        name[len(role_prefix):-len(suffix)]
        for name in os.listdir(folder)
        if name.startswith(required_start) and name.endswith(suffix)
    )


pred_keys = _problem_keys(pred_folder, "solution")
truth_keys = _problem_keys(truth_folder, "truth")

# Set differences expose keys that are present on one side only.
missing_in_pred = set(truth_keys) - set(pred_keys)
missing_in_truth = set(pred_keys) - set(truth_keys)

print("Missing in predictions:", missing_in_pred)
print("Missing in truth:", missing_in_truth)

Missing in predictions: set()
Missing in truth: set()


Check that the json files match in content (classes, length and setup)

In [2]:
# Inspect one problem by hand: load its truth file and its prediction file.
problem_id = "problem-1"

truth_path = os.path.join(truth_folder, f"truth-{problem_id}.json")
pred_path = os.path.join(pred_folder, f"solution-{problem_id}.json")

with open(truth_path, "r") as truth_file:
    truth = json.load(truth_file)

with open(pred_path, "r") as pred_file:
    pred = json.load(pred_file)

# Raw contents first, then the size of each 'changes' list and the type of
# its first element, so structural mismatches are visible at a glance.
print("truth:", truth)
print("pred:", pred)
print("len text:", len(truth['changes']))
print("len pred:", len(pred['changes']))
print("types:", type(truth['changes'][0]), type(pred['changes'][0]))


truth: {'authors': 2, 'changes': [0, 1, 1, 0, 0]}
pred: {'authors': 2, 'changes': [0, 1, 1, 0, 0]}
len text: 5
len pred: 5
types: <class 'int'> <class 'int'>


Check that all documents are equal in length

In [3]:
from itertools import chain

# Number of entries in the 'changes' list of every problem, collected in
# lockstep so that index i of both lists refers to the same problem.
truth_lengths, pred_lengths = [], []

for problem_id in truth_keys:
    truth_path = os.path.join(truth_folder, f"truth-{problem_id}.json")
    pred_path = os.path.join(pred_folder, f"solution-{problem_id}.json")

    with open(truth_path) as truth_file:
        truth_doc = json.load(truth_file)
    with open(pred_path) as pred_file:
        pred_doc = json.load(pred_file)

    truth_lengths.append(len(truth_doc['changes']))
    pred_lengths.append(len(pred_doc['changes']))

# Both lists always have one entry per problem, so comparing the lists
# directly is the same as comparing them pair by pair.
lengths_match = truth_lengths == pred_lengths
print("All lengths equal:", lengths_match)


All lengths equal: True


Test just 2 documents:

In [5]:
from evaluator import compute_score_multiple_predictions

# Hand-made ground truth for two tiny problems.
truth = {
    "problem-1": {"changes": [0, 1, 0]},
    "problem-2": {"changes": [1, 0, 0]},
}

# Hand-made predictions: problem-1 differs from the truth in its last two
# positions, problem-2 matches the truth exactly.
solution = {
    "problem-1": {"changes": [0, 0, 1]},
    "problem-2": {"changes": [1, 0, 0]},
}

# Score the 'changes' task over both problems with the label set {0, 1}.
f1 = compute_score_multiple_predictions(truth, solution, "changes", labels=[0, 1])
print("Sample F1:", f1)


Sample F1: 0.625


So the evaluation works. The problem with the dummy classifier might be that, over many documents, the randomness averages out when the F1 scores are computed.