In [10]:
import json
import pandas as pd
from collections import defaultdict, namedtuple, Counter

# Namedtuple to store evaluation metrics
Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore')

# Tokenize the label and prediction into word/tag pairs
def tokenize_label_pred(text):
    tokens = []
    for token in text.split():
        if "//" in token:
            token = token.replace("//", "/")
        if len(token.split("/")) == 2:
            r = (token.split('/')[-2], token.split('/')[-1])
            tokens.append(r)
        else:
            #Split into two results
            #print(token)
            if len(token.split("/"))==3:
                r1 = (token.split("/")[0], token.split("/")[1][0])
                r2 = (token.split("/")[1][1:], token.split("/")[2])
                tokens.append(r1)
                tokens.append(r2)
            else:
                print("uncover", token)
    return tokens

# Function to calculate precision, recall, and F1 score
def calculate_metrics(correct, guessed, total):
    tp, fp, fn = correct, guessed - correct, total - correct
    # Precision, Recall, F1 Score calculations
    p = 0 if tp + fp == 0 else tp / (tp + fp)
    r = 0 if tp + fn == 0 else tp / (tp + fn)
    f = 0 if p + r == 0 else 2 * p * r / (p + r)
    return Metrics(tp, fp, fn, p, r, f)

# File paths for test labels and predictions
test_labels_path_new = '../new-datasets/test_with_labels.txt'
results_path_new = 'results/results-one-shot-gpt.json'

# Read ground truth labels
with open(test_labels_path_new, 'r', encoding='utf-8') as f:
    ground_truth = f.readlines()

# Read predictions from the model
with open(results_path_new, 'r', encoding='utf-8') as f:
    predictions = json.load(f)

# Lists to store precision, recall, F1 for each instance
precs, recs, fscores = [], [], []

# Iterate over each label and corresponding prediction
for num, label in enumerate(ground_truth):
    # Initialize counters for correct matches and errors
    correct = 0
    id_ = str(num + 1)  # Key in the predictions JSON
    print(f"######### Instance {id_} ##############")
    
    # Tokenize label and prediction
    label_tokens = tokenize_label_pred(label.strip())
    prediction_tokens = tokenize_label_pred(predictions[id_].strip())

    # Count frequencies of tokens in label and prediction
    label_token_counts = Counter(label_tokens)
    prediction_token_counts = Counter(prediction_tokens)
    
    # Print for debugging
    print("Label Tokens:\n", label_tokens)
    print("prediction", predictions[id_])
    print("Prediction Tokens:\n", prediction_tokens)

    total = len(label_tokens)  # Total actual tokens
    guessed = len(prediction_tokens)  # Total predicted tokens

    # Track seen tokens from the prediction to avoid duplicates
    seen_in_prediction = []
    
    for pred_token in prediction_tokens:
        # Only count correct if the frequency in prediction <= frequency in label
        if pred_token in label_tokens and prediction_token_counts[pred_token] <= label_token_counts[pred_token]:
            if seen_in_prediction.count(pred_token) < label_token_counts[pred_token]:
                correct += 1  # Count true positives
                seen_in_prediction.append(pred_token)  # Mark as seen

    print("correct:\n", seen_in_prediction)
    # Final counts: TP = correct, FP = guessed - correct, FN = total - correct
    print(f"Correct: {correct}, Guessed: {guessed}, Total: {total}")
    metrics = calculate_metrics(correct, guessed, total)
    print(metrics)
    
    # Append the precision, recall, and F1 score for this instance
    precs.append(metrics.prec)
    recs.append(metrics.rec)
    fscores.append(metrics.fscore)

# Compute averages over all instances
avg_precs = sum(precs) / len(precs)
avg_recs = sum(recs) / len(recs)
avg_fscores = sum(fscores) / len(fscores)

# Print final averages
print("Total records in the test set:", len(ground_truth))
print("Average Precision:", avg_precs)
print("Average Recall:", avg_recs)
print("Average F1 Score:", avg_fscores)


######### Instance 1 ##############
uncover ```plaintext
uncover ```
Label Tokens:
 [('Tidak', 'O'), ('ada', 'O'), ('sesuatu', 'O'), ('yang', 'O'), ('lebih', 'O'), ('kecil', 'O'), ('dan', 'O'), ('yang', 'O'), ('lebih', 'O'), ('besar', 'O'), ('daripada', 'O'), ('itu', 'O'), (',', 'O'), ('kecuali', 'O'), ('semua', 'O'), ('tercatat', 'O'), ('dalam', 'O'), ('kitab', 'O'), ('yang', 'O'), ('nyata', 'O'), ('(', 'O'), ('Lauh', 'O'), ('Mahfuz', 'O'), (')', 'O'), ('.', 'O')]
prediction ```plaintext
Tidak/O ada/O sesuatu/O yang/O lebih/O kecil/O dan/O yang/O lebih/O besar/O daripada/O itu/O ,/O kecuali/O semua/O tercatat/O dalam/O kitab/O yang/O nyata/O (/O Lauh/Buku Mahfuz/HolyBook )/O ./O
```
Prediction Tokens:
 [('Tidak', 'O'), ('ada', 'O'), ('sesuatu', 'O'), ('yang', 'O'), ('lebih', 'O'), ('kecil', 'O'), ('dan', 'O'), ('yang', 'O'), ('lebih', 'O'), ('besar', 'O'), ('daripada', 'O'), ('itu', 'O'), (',', 'O'), ('kecuali', 'O'), ('semua', 'O'), ('tercatat', 'O'), ('dalam', 'O'), ('kitab', 'O'), 

AttributeError: 'NoneType' object has no attribute 'split'