In [2]:
import json
import pandas as pd
from collections import defaultdict, namedtuple

Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore')

# Tokenize the label and prediction into word/tag pairs
def tokenize_label_pred(text):
    """Split a tagged sentence into ``(word, tag)`` pairs.

    The input is echoed to stdout, then split on whitespace. Each token is
    expected to look like ``word/tag``; tokens without a ``/`` are skipped.

    The tag is whatever follows the *last* ``/`` in the token, and the word
    is everything before it, so words that themselves contain a slash
    (``1/2/O`` -> ``('1/2', 'O')``) are kept intact.

    Args:
        text: Whitespace-separated ``word/tag`` tokens.

    Returns:
        A list of ``(word, tag)`` tuples in input order.
    """
    print(text)
    pairs = []
    for token in text.strip().split():
        # rpartition splits once, on the last '/', so the word keeps any
        # earlier slashes; an empty separator means the token had no tag.
        word, sep, tag = token.rpartition('/')
        if sep:
            pairs.append((word, tag))
    return pairs

def calculate_metrics(correct, guessed, total):
    """Derive precision, recall and F-score from raw counts.

    Args:
        correct: Number of predicted items that matched the ground truth.
        guessed: Number of items predicted.
        total: Number of items in the ground truth.

    Returns:
        A ``Metrics`` tuple ``(tp, fp, fn, prec, rec, fscore)``. Any ratio
        whose denominator is zero is reported as ``0``.
    """
    tp = correct
    fp = guessed - correct
    fn = total - correct

    predicted_count = tp + fp
    actual_count = tp + fn

    if predicted_count == 0:
        prec = 0
    else:
        prec = 1. * tp / predicted_count

    if actual_count == 0:
        rec = 0
    else:
        rec = 1. * tp / actual_count

    # Harmonic mean of precision and recall; undefined when both are zero.
    if prec + rec == 0:
        fscore = 0
    else:
        fscore = 2 * prec * rec / (prec + rec)

    return Metrics(tp, fp, fn, prec, rec, fscore)

# Re-loading the new test labels and results.
# The label file holds one tagged sentence per line; the results file is a
# JSON object whose keys are the 1-based line numbers as strings and whose
# values are the predicted tagged sentences.
test_labels_path_new = 'test_with_labels.txt'
results_path_new = 'results-zeroshot-attempt-1.json'

# Reading the new test labels
with open(test_labels_path_new, 'r', encoding='utf-8') as f:
    ground_truth = f.readlines()

# Reading the new results
with open(results_path_new, 'r', encoding='utf-8') as f:
    predictions = json.load(f)

# Per-sentence scores; averaged (macro) after the loop.
precs, recs, fscores = [], [], []
for id_, label in enumerate(ground_truth, start=1):
    print(f"######### {id_} ##############")

    # Tokenize the label and prediction
    label_tokens = tokenize_label_pred(label.strip())
    prediction_tokens = tokenize_label_pred(predictions[str(id_)].strip())

    print(label_tokens)
    total = len(label_tokens)
    guessed = len(prediction_tokens)
    print("Total predictions", len(prediction_tokens))
    print("Total ground truth", len(label_tokens))

    # Count how often each (word, tag) pair occurs in the ground truth so a
    # predicted pair can only be credited as many times as it really appears.
    # Without this cap, repeated predictions of the same pair could push
    # `correct` above `total`, giving a negative fn and a recall above 1.
    unmatched = defaultdict(int)
    for pair in label_tokens:
        unmatched[pair] += 1

    correct = 0
    for p_token in prediction_tokens:
        if unmatched.get(p_token, 0) > 0:
            print("Correct:", p_token)
            unmatched[p_token] -= 1
            correct += 1

    print(f"Correct: {correct}", f"Guessed: {guessed}", f"Total: {total}")
    _, _, _, prec, rec, fscore = calculate_metrics(correct, guessed, total)
    precs.append(prec)
    recs.append(rec)
    fscores.append(fscore)

# Guard the averages so an empty label file reports zeros instead of
# raising ZeroDivisionError.
n_scored = len(precs)
avg_precs = sum(precs) / n_scored if n_scored else 0.0
avg_recs = sum(recs) / n_scored if n_scored else 0.0
avg_fscores = sum(fscores) / n_scored if n_scored else 0.0
print("Total records test set", len(ground_truth))
print("Average precision", avg_precs)
print("Average Recall", avg_recs)
print("Average F1 score", avg_fscores)

######### 1 ##############
Tidak/O ada/O sesuatu/O yang/O lebih/O kecil/O dan/O yang/O lebih/O besar/O daripada/O itu/O ,/O kecuali/O semua/O tercatat/O dalam/O kitab/O yang/O nyata/O (/O Lauh/O Mahfuz/O )/O ./O
Kitab yang nyata (Lauh Mahfuz) - HolyBook
[('Tidak', 'O'), ('ada', 'O'), ('sesuatu', 'O'), ('yang', 'O'), ('lebih', 'O'), ('kecil', 'O'), ('dan', 'O'), ('yang', 'O'), ('lebih', 'O'), ('besar', 'O'), ('daripada', 'O'), ('itu', 'O'), (',', 'O'), ('kecuali', 'O'), ('semua', 'O'), ('tercatat', 'O'), ('dalam', 'O'), ('kitab', 'O'), ('yang', 'O'), ('nyata', 'O'), ('(', 'O'), ('Lauh', 'O'), ('Mahfuz', 'O'), (')', 'O'), ('.', 'O')]
Total predictions 0
Total ground truth 25
Correct: 0 Guessed: 0 Total: 25
######### 2 ##############
Ketahuilah/O bahwa/O sesungguhnya/O (/O bagi/O )/O para/O wali/O Allah/Allah itu/O tidak/O ada/O rasa/O takut/O yang/O menimpa/O mereka/O dan/O mereka/O pun/O tidak/O bersedih/O ./O
wali Allah/O
[('Ketahuilah', 'O'), ('bahwa', 'O'), ('sesungguhnya', 'O'), ('(