**Table of contents**<a id='toc0_'></a>    
- [TOP1](#toc1_)    
  - [Analyze `best.json`](#toc1_1_)    
    - [Locate the `best.json` from target dir.](#toc1_1_1_)    
    - [Calculate FPR for each category](#toc1_1_2_)    
  - [Some other Metrics](#toc1_2_)    
    - [Check TP, FP, TN, FN](#toc1_2_1_)    
    - [Calculate accuracy for each category](#toc1_2_2_)    
    - [macro, micro, weighted](#toc1_2_3_)    
- [TOP_K](#toc2_)    
  - [Calculate Voting Ensemble](#toc2_1_)    
  - [Get top1 from K models](#toc2_2_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc1_'></a>[TOP1](#toc0_)

## <a id='toc1_1_'></a>[Analyze `best.json`](#toc0_)

### <a id='toc1_1_1_'></a>[Locate the `best.json` from target dir.](#toc0_)

In [18]:
import json
import glob
import os
from sklearn.metrics import f1_score, precision_score, recall_score

# Dataset selection: keep exactly one `ann` / `num_categories` pair active.
# ann="../data/fusar/meta/test.txt" # fusar
# ann="../data/opensar/meta/test.txt"; num_categories = 8 # opensar
ann = "../data/fusrs_v2/meta/test.txt"
num_categories = 5  # fusrs_v2

# Run directory that holds the exported predictions; alternatives are kept for quick switching.
# pred_dir="../outputs/res50_fusrs_v2_dreaug/res50_1x128_lr1e-1+200e+im21k_fusrs_v2"
# pred_dir="../outputs/res50_fusrs_v2_pretrain/res50_1x128_lr1e-1+200e_fusrs_v2"
pred_dir = "../outputs/res50_fusrs_v2_dreaug/res50_1x128_lr1e-1+200e+im21k_fusrs_v2+ctrlcam/best"
# pred_dir="../outputs/res50_fusrs_v2_pretrain/res50_1x128_lr1e-1+200e+im1k_fusrs_v2"

# Find the only .json file starting with "best_f1_score" in pred_dir.
json_files = glob.glob(os.path.join(pred_dir, "best_f1_score*.json"))

if len(json_files) != 1:
    # Zero or several candidates would make the choice of file ambiguous.
    raise ValueError(
        "Expected exactly one best_f1_score JSON file in pred_dir, but found {}.".format(
            len(json_files)
        )
    )
pred = json_files[0]

# Predicted labels are stored under the "pred_label" key of the JSON file.
with open(pred, "r") as f:
    pred_data = json.load(f)
pred_labels = pred_data["pred_label"]

# Ground truth: the label is the last whitespace-separated field of each annotation line.
gt_labels = []
with open(ann, "r") as f:
    for line in f:
        parts = line.split()
        if not parts:
            # A blank (e.g. trailing) line carries no label; indexing it would raise IndexError.
            continue
        gt_labels.append(int(parts[-1]))

# Fail early: the zip()-based cells below would otherwise silently truncate to the shorter list.
if len(pred_labels) != len(gt_labels):
    raise ValueError(
        "Prediction/annotation mismatch: {} predicted labels in {!r} vs {} ground-truth labels in {!r}.".format(
            len(pred_labels), pred, len(gt_labels), ann
        )
    )

### <a id='toc1_1_2_'></a>[Calculate FPR for each category](#toc0_)

In [3]:
# Calculate per-category precision, recall, and F1-score (referred to as "FPR" in this notebook)
from sklearn.metrics import precision_recall_fscore_support
import pandas as pd
import numpy as np

# Per-category precision, recall and F1-score (average=None yields one value per label).
precision, recall, f1, _ = precision_recall_fscore_support(
    gt_labels, pred_labels, average=None
)

# Print the results for each category
for i in range(num_categories):
    print(
        "Category {}: Precision: {:.4f}, Recall: {:.4f}, F1-score: {:.4f}".format(
            i, precision[i], recall[i], f1[i]
        )
    )


# Macro average: unweighted mean of the per-category scores.
macro_f1 = f1_score(gt_labels, pred_labels, average="macro")
macro_pre = precision_score(gt_labels, pred_labels, average="macro")
macro_rec = recall_score(gt_labels, pred_labels, average="macro")
print(
    "Macro (unweighted average of) Precision: {:.4f}, Recall: {:.4f}, F1-score: {:.4f}".format(
        macro_pre, macro_rec, macro_f1
    )
)

# Weighted average: per-category scores weighted by the number of true samples per category.
weighted_f1 = f1_score(gt_labels, pred_labels, average="weighted")
weighted_pre = precision_score(gt_labels, pred_labels, average="weighted")
weighted_rec = recall_score(gt_labels, pred_labels, average="weighted")
print(
    "Weighted (weighted average of) Precision: {:.4f}, Recall: {:.4f}, F1-score: {:.4f}".format(
        weighted_pre, weighted_rec, weighted_f1
    )
)

# Micro average: computed globally over all samples instead of per category.
micro_f1 = f1_score(gt_labels, pred_labels, average="micro")
micro_pre = precision_score(gt_labels, pred_labels, average="micro")
micro_rec = recall_score(gt_labels, pred_labels, average="micro")
print(
    "Micro (micro average of) Precision: {:.4f}, Recall: {:.4f}, F1-score: {:.4f}".format(
        micro_pre, micro_rec, micro_f1
    )
)

print("************** REFORMULATE **************")
# Same numbers as a table: one row per category, followed by the macro row and then
# the weighted row. The averages computed above are reused rather than recomputed.
precision = np.append(precision, [macro_pre, weighted_pre])
recall = np.append(recall, [macro_rec, weighted_rec])
f1 = np.append(f1, [macro_f1, weighted_f1])

data = {"Precision": precision, "Recall": recall, "F1": f1}
df = pd.DataFrame(data)


# Display the DataFrame in the notebook
display(df)

Category 0: Precision: 0.8984, Recall: 0.9624, F1-score: 0.9293
Category 1: Precision: 0.8113, Recall: 0.7963, F1-score: 0.8037
Category 2: Precision: 0.6870, Recall: 0.5590, F1-score: 0.6164
Category 3: Precision: 0.7021, Recall: 0.5593, F1-score: 0.6226
Category 4: Precision: 0.9302, Recall: 0.7692, F1-score: 0.8421
Macro (unweighted average of) Precision: 0.8058, Recall: 0.7293, F1-score: 0.7629
Weighted (weighted average of) Precision: 0.8486, Recall: 0.8556, F1-score: 0.8499
Micro (micro average of) Precision: 0.8556, Recall: 0.8556, F1-score: 0.8556
************** REFORMULATE **************


Unnamed: 0,Precision,Recall,F1
0,0.898428,0.962435,0.929331
1,0.811321,0.796296,0.803738
2,0.687023,0.559006,0.616438
3,0.702128,0.559322,0.622642
4,0.930233,0.769231,0.842105
5,0.805826,0.729258,0.762851
6,0.848603,0.855556,0.849859


## <a id='toc1_2_'></a>[Some other Metrics](#toc0_)

### <a id='toc1_2_1_'></a>[Check TP, FP, TN, FN](#toc0_)

In [19]:
def see_tp(gt_labels, pred_labels, category):
    """Print the one-vs-rest confusion counts for a single category.

    Args:
        gt_labels: sequence of ground-truth labels.
        pred_labels: sequence of predicted labels, aligned with ``gt_labels``.
        category: the label treated as the positive class; every other label
            counts as negative.

    Returns:
        None. Four lines are printed, in the order true positives,
        true negatives, false positives, false negatives.
    """
    # Boolean masks marking where the truth / the prediction is the positive class.
    is_gt = np.asarray(gt_labels) == category
    is_pred = np.asarray(pred_labels) == category

    counts = (
        ("True positives", np.sum(is_gt & is_pred)),
        ("True negatives", np.sum(~is_gt & ~is_pred)),
        ("False positives", np.sum(~is_gt & is_pred)),
        ("False negatives", np.sum(is_gt & ~is_pred)),
    )
    for label, count in counts:
        print(f"{label}: {count}")


# Report the confusion counts of every category. The bound follows `num_categories`
# (set in the configuration cell) so the loop stays correct when another dataset is selected.
for i in range(num_categories):
    print("Category {}".format(i))
    see_tp(gt_labels, pred_labels, i)

Category 0
True positives: 743
True negatives: 404
False positives: 84
False negatives: 29
Category 1
True positives: 172
True negatives: 1004
False positives: 40
False negatives: 44
Category 2
True positives: 90
True negatives: 1058
False positives: 41
False negatives: 71
Category 3
True positives: 33
True negatives: 1187
False positives: 14
False negatives: 26
Category 4
True positives: 40
True negatives: 1205
False positives: 3
False negatives: 12


### <a id='toc1_2_2_'></a>[Calculate accuracy for each category](#toc0_)

In [3]:
# Per-category accuracy: the share of samples whose ground truth is category i
# that were predicted as i.
correct_counts = [0] * num_categories
total_counts = [0] * num_categories

for pred_label, gt_label in zip(pred_labels, gt_labels):
    total_counts[gt_label] += 1
    if pred_label == gt_label:
        correct_counts[gt_label] += 1

# Calculate and print accuracy for each category
accuracies = []
for i in range(num_categories):
    if total_counts[i] == 0:
        # correct_counts[i] is always 0 in this branch, so report how many samples
        # were actually predicted as category i.
        predicted_as_i = sum(1 for pred_label in pred_labels if pred_label == i)
        print(
            f"Category {i}: got no GT samples, whereas {predicted_as_i} samples are predicted."
        )
        continue
    accuracy = correct_counts[i] / total_counts[i] * 100
    accuracies.append(accuracy)
    print(f"Accuracy for category {i}: {accuracy:.2f}%")

# Average only over categories that have ground-truth samples; dividing by
# num_categories would understate the mean whenever a category was skipped above.
average_accuracy = sum(accuracies) / len(accuracies) if accuracies else float("nan")
num_samples = sum(total_counts)
total_accuracy = sum(correct_counts) / num_samples * 100 if num_samples else float("nan")
print(f"Average accuracy: {average_accuracy:.2f}%")
print(f"Overall accuracy: {total_accuracy:.2f}%")

Accuracy for category 0: 95.60%
Accuracy for category 1: 84.72%
Accuracy for category 2: 59.63%
Accuracy for category 3: 52.54%
Accuracy for category 4: 75.00%
Average accuracy: 73.50%


### <a id='toc1_2_3_'></a>[macro, micro, weighted](#toc0_)

In [19]:
# Calculate unweighted average (macro) FPR
from sklearn.metrics import f1_score, precision_score, recall_score

# Macro average: every category contributes equally, regardless of its sample count.
# `gt_labels` / `pred_labels` are the ground-truth and predicted labels from the cells above.
macro_pre, macro_rec, macro_f1 = (
    metric(gt_labels, pred_labels, average="macro")
    for metric in (precision_score, recall_score, f1_score)
)

# Report the three macro-averaged scores on one line.
print(
    f"macro (unweighted average of) Precision: {macro_pre:.4f}, "
    f"Recall: {macro_rec:.4f}, F1-score: {macro_f1:.4f}"
)

macro (unweighted average of) Precision: 0.5917, Recall: 0.6827, F1-score: 0.6102


In [20]:
# Calculate weighted average (weighted) FPR
from sklearn.metrics import f1_score, precision_score, recall_score

# Weighted average: per-category scores weighted by each category's number of true samples.
# `gt_labels` / `pred_labels` are the ground-truth and predicted labels from the cells above.
weighted_pre, weighted_rec, weighted_f1 = (
    metric(gt_labels, pred_labels, average="weighted")
    for metric in (precision_score, recall_score, f1_score)
)

# Report the three weighted-average scores on one line.
print(
    f"weighted (weighted average of) Precision: {weighted_pre:.4f}, "
    f"Recall: {weighted_rec:.4f}, F1-score: {weighted_f1:.4f}"
)

weighted (weighted average of) Precision: 0.7560, Recall: 0.7118, F1-score: 0.7253


In [21]:
# Calculate micro average (micro) FPR
from sklearn.metrics import f1_score, precision_score, recall_score

# Micro average: scores computed globally over all samples rather than per category.
# `gt_labels` / `pred_labels` are the ground-truth and predicted labels from the cells above.
micro_pre, micro_rec, micro_f1 = (
    metric(gt_labels, pred_labels, average="micro")
    for metric in (precision_score, recall_score, f1_score)
)

# Report the three micro-averaged scores on one line.
print(
    f"micro (micro average of) Precision: {micro_pre:.4f}, "
    f"Recall: {micro_rec:.4f}, F1-score: {micro_f1:.4f}"
)

micro (micro average of) Precision: 0.7118, Recall: 0.7118, F1-score: 0.7118


# <a id='toc2_'></a>[TOP_K](#toc0_)

## <a id='toc2_1_'></a>[Calculate Voting Ensemble](#toc0_)

In [6]:
import json

# import glob
import os
from sklearn.metrics import (
    f1_score,
    precision_score,
    recall_score,
    precision_recall_fscore_support,
)
import pandas as pd
import argparse
import numpy as np
import datetime
import re
from collections import Counter


def _build_metrics_table(pred_labels, gt_labels) -> pd.DataFrame:
    """Build the precision/recall/F1 table: one row per category, then macro, then weighted."""
    precision, recall, f1, _ = precision_recall_fscore_support(
        gt_labels, pred_labels, average=None
    )

    # Append the macro row first and the weighted row second.
    for average in ("macro", "weighted"):
        precision = np.append(
            precision, precision_score(gt_labels, pred_labels, average=average)
        )
        recall = np.append(recall, recall_score(gt_labels, pred_labels, average=average))
        f1 = np.append(f1, f1_score(gt_labels, pred_labels, average=average))

    return pd.DataFrame({"Precision": precision, "Recall": recall, "F1": f1})


def calc_results(
    pred_labels: np.ndarray,
    gt_labels: np.ndarray,
    identifier: str,
    log_dir: str = "./log.txt",
    save: bool = True,
):
    """Compute per-category and averaged precision/recall/F1 and report them.

    The report is a timestamped header, a line naming ``identifier`` and a table
    with one row per category followed by a macro-average row and a
    weighted-average row.

    Args:
        pred_labels (np.ndarray): predicted labels (callers in this notebook pass plain lists).
        gt_labels (np.ndarray): ground truth labels, aligned with ``pred_labels``.
        identifier (str): an identifier for the current experiment, echoed in the report.
        log_dir (str, optional): path of the log *file* the report is appended to
            when ``save`` is True. Defaults to "./log.txt".
        save (bool, optional): if True, append the report to ``log_dir``;
            otherwise print it to stdout. Defaults to True.

    Returns:
        None
    """
    # Compute everything before touching the log file so a metric failure
    # cannot leave a half-written entry behind.
    df = _build_metrics_table(pred_labels, gt_labels)
    current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    if save:
        with open(log_dir, "a") as log_f:
            # The leading newline separates this entry from the previous one in the file.
            print(f"\n************** {current_time} **************", file=log_f)
            print(f"Ensembled prediction for {identifier}", file=log_f)
            print(df, file=log_f)
    else:
        print(f"************** {current_time} **************")
        print(f"Ensembled prediction for {identifier}")
        print(df)
        print("\n")


def main_vot(
    ckpts_dir: str,
    dataset: str = "fusrs_v2",
    log: str = "./log.txt",
    voting: int = 5,
    save: bool = True,
):
    """Majority-vote ensemble over the best prediction files found in ``ckpts_dir``.

    Every file in ``ckpts_dir`` whose name matches ``top<digits>_f1_score*.json`` is
    scored by macro F1 against the test annotations. The ``voting`` best files
    (fewer if fewer exist) then vote per sample, and the ensemble metrics are
    reported through ``calc_results``.

    Args:
        ckpts_dir (str): directory containing the ``top*_f1_score*.json`` prediction files.
        dataset (str, optional): dataset name; only "fusrs_v2" is supported.
        log (str, optional): log file path forwarded to ``calc_results``.
        voting (int, optional): number of top-ranked prediction files that vote.
        save (bool, optional): forwarded to ``calc_results`` (append to ``log`` vs. print).

    Returns:
        tuple: ``(gt_labels, ensemble_predictions)``, the ground-truth labels and the
        majority-vote label of each sample. Earlier revisions returned the
        predictions of the last-loaded individual model instead of the ensemble.

    Raises:
        Exception: if ``dataset`` is not supported.
        ValueError: if ``voting`` is smaller than 1 or no prediction file matches.
    """
    if dataset == "fusrs_v2":
        ann = "../data/fusrs_v2/meta/test.txt"
    else:
        raise Exception("Dataset not supported")

    if voting < 1:
        raise ValueError(f"voting must be at least 1, got {voting}.")

    # Ground truth: the label is the last whitespace-separated field of each line.
    gt_labels = []
    with open(ann, "r") as f:
        for line in f:
            parts = line.split()
            if parts:  # skip blank lines, which carry no label
                gt_labels.append(int(parts[-1]))

    # Read each candidate file once, keeping its labels for the voting step.
    # Sorting the listing makes the ranking of equally scored files reproducible.
    predictions_by_ckpt = {}
    f1_score_dict = {}
    for name in sorted(os.listdir(ckpts_dir)):
        if name.endswith(".json") and re.match(r"top\d+_f1_score.*\.json", name):
            ckpt = os.path.join(ckpts_dir, name)
            with open(ckpt, "r") as f:
                predictions_by_ckpt[ckpt] = json.load(f)["pred_label"]
            f1_score_dict[ckpt] = f1_score(
                gt_labels, predictions_by_ckpt[ckpt], average="macro"
            )

    if not f1_score_dict:
        raise ValueError(
            f"No top*_f1_score*.json prediction files found in {ckpts_dir!r}."
        )

    # Rank by macro F1, best first, and keep the `voting` best files.
    top_n_models = sorted(f1_score_dict, key=f1_score_dict.get, reverse=True)[:voting]
    top_n_predictions = [predictions_by_ckpt[model] for model in top_n_models]

    # Per-sample majority vote. On a tie, Counter.most_common keeps the label seen
    # first, i.e. the vote of the higher-ranked model.
    ensemble_predictions = [
        Counter(votes).most_common(1)[0][0] for votes in zip(*top_n_predictions)
    ]

    # Report the metrics of the ensemble predictions.
    calc_results(ensemble_predictions, gt_labels, ckpts_dir, log, save)
    return gt_labels, ensemble_predictions

In [20]:
# Voting ensemble for the run directory below; save=False prints the table instead of logging it.
VOT = 5
LOG_PATH1 = "../outputs/res50_fusrs_v2_dreaug_portion/res50_1x128_lr1e-1+200e+im21k_fusrs_v2+758"
gt_labels, pred_labels = main_vot(
    LOG_PATH1,
    dataset="fusrs_v2",
    log="log.txt",
    voting=VOT,
    save=False,
)
# Optional per-category confusion counts for the labels returned above:
# for i in range(5):
#     print("Category {}".format(i))
#     see_tp(gt_labels, pred_labels, i)


************** 2023-05-04 02:02:49 **************
Ensembled prediction for ../outputs/res50_fusrs_v2_dreaug_portion/res50_1x128_lr1e-1+200e+im21k_fusrs_v2+758
   Precision    Recall        F1
0   0.914920  0.961140  0.937461
1   0.821101  0.828704  0.824885
2   0.738806  0.614907  0.671186
3   0.627451  0.542373  0.581818
4   0.869565  0.769231  0.816327
5   0.794369  0.743271  0.766335
6   0.861001  0.866667  0.862486


In [6]:
# Voting ensemble for three run directories of the same "+ctrlcam" experiment.
# save=False prints each table instead of appending it to the log file.
VOT = 5
CTRLCAM_ROOT = "../outputs/res50_fusrs_v2_dreaug/res50_1x128_lr1e-1+200e+im21k_fusrs_v2+ctrlcam"
for run_name in ("best2_0501", "1659573870", "42_1"):
    LOG_PATH2 = f"{CTRLCAM_ROOT}/{run_name}"
    # The returned labels are not needed here, so they are discarded.
    _, _ = main_vot(
        ckpts_dir=LOG_PATH2, dataset="fusrs_v2", log="log.txt", voting=VOT, save=False
    )


************** 2023-05-05 19:01:24 **************
Ensembled prediction for ../outputs/res50_fusrs_v2_dreaug/res50_1x128_lr1e-1+200e+im21k_fusrs_v2+ctrlcam/best2_0501
   Precision    Recall        F1
0   0.911548  0.961140  0.935687
1   0.804444  0.837963  0.820862
2   0.740157  0.583851  0.652778
3   0.693878  0.576271  0.629630
4   0.933333  0.807692  0.865979
5   0.816672  0.753383  0.780987
6   0.861994  0.867460  0.862645

************** 2023-05-05 19:01:24 **************
Ensembled prediction for ../outputs/res50_fusrs_v2_dreaug/res50_1x128_lr1e-1+200e+im21k_fusrs_v2+ctrlcam/1659573870
   Precision    Recall        F1
0   0.913793  0.961140  0.936869
1   0.816514  0.824074  0.820276
2   0.753731  0.627329  0.684746
3   0.679245  0.610169  0.642857
4   0.883721  0.730769  0.800000
5   0.809401  0.750696  0.776950
6   0.864440  0.869048  0.865250

************** 2023-05-05 19:01:24 **************
Ensembled prediction for ../outputs/res50_fusrs_v2_dreaug/res50_1x128_lr1e-1+200e+im21k

## <a id='toc2_2_'></a>[Get top1 from K models](#toc0_)

In [7]:
# Report the individual metrics of the VOT best prediction files in LOG_PATH.
# Relies on `os`, `re`, `json`, `f1_score` and `calc_results` from the voting-ensemble cell above.
VOT = 5
LOG_PATH = "../outputs/res50_fusrs_v2_aug20p_run2/res50_1x128_lr1e-1+200e+im21k_fusrs_v2+ctrlcam_ep20"
DATASET = "fusrs_v2"

if DATASET == "fusrs_v2":
    ann = "../data/fusrs_v2/meta/test.txt"
else:
    raise Exception("DATASET not supported")

# Ground truth: the label is the last whitespace-separated field of each line.
gt_labels = []
with open(ann, "r") as f:
    for line in f:
        parts = line.split()
        if parts:  # skip blank lines, which carry no label
            gt_labels.append(int(parts[-1]))

# Score every "top<digits>_f1_score*.json" file by macro F1. Each file is read once
# and its labels are kept; the sorted listing makes equally scored files rank reproducibly.
predictions_by_ckpt = {}
f1_score_dict = {}
for ckpt in sorted(os.listdir(LOG_PATH)):
    if ckpt.endswith(".json") and re.match(r"top\d+_f1_score.*\.json", ckpt):
        ckpt = os.path.join(LOG_PATH, ckpt)
        with open(ckpt, "r") as f:
            predictions_by_ckpt[ckpt] = json.load(f)["pred_label"]
        f1_score_dict[ckpt] = f1_score(
            gt_labels, predictions_by_ckpt[ckpt], average="macro"
        )

n = VOT  # number of best-scoring files to report
top_n_models = sorted(f1_score_dict, key=f1_score_dict.get, reverse=True)[:n]

# Predictions of the top n models, in ranking order.
top_n_predictions = [predictions_by_ckpt[model] for model in top_n_models]

# Print the metrics of each selected model on its own. calc_results titles every
# table "Ensembled prediction for ...", although no voting happens in this cell.
for model, pred_labels in zip(top_n_models, top_n_predictions):
    print("Model {}".format(model))
    calc_results(pred_labels, gt_labels, LOG_PATH, "_", False)

Model ../outputs/res50_fusrs_v2_aug20p_run2/res50_1x128_lr1e-1+200e+im21k_fusrs_v2+ctrlcam_ep20/top10_f1_score_epoch_162.json
************** 2023-05-09 08:12:31 **************
Ensembled prediction for ../outputs/res50_fusrs_v2_aug20p_run2/res50_1x128_lr1e-1+200e+im21k_fusrs_v2+ctrlcam_ep20
   Precision    Recall        F1
0   0.915842  0.958549  0.936709
1   0.788793  0.847222  0.816964
2   0.781513  0.577640  0.664286
3   0.745098  0.644068  0.690909
4   0.800000  0.769231  0.784314
5   0.806249  0.759342  0.778636
6   0.864122  0.868254  0.863573


Model ../outputs/res50_fusrs_v2_aug20p_run2/res50_1x128_lr1e-1+200e+im21k_fusrs_v2+ctrlcam_ep20/top10_f1_score_epoch_168.json
************** 2023-05-09 08:12:31 **************
Ensembled prediction for ../outputs/res50_fusrs_v2_aug20p_run2/res50_1x128_lr1e-1+200e+im21k_fusrs_v2+ctrlcam_ep20
   Precision    Recall        F1
0   0.914920  0.961140  0.937461
1   0.790598  0.856481  0.822222
2   0.783333  0.583851  0.669039
3   0.729167  0.5932

### Analyze a single checkpoint

In [1]:
import json

import os
from pathlib import Path

# Write every correctly predicted annotation line of one prediction file to OUT_PATH.
VOT = 5  # not used in this cell
PRED = "../outputs/res50_fusrs_v2_pretrain/res50_1x128_lr1e-1+200e+im21k_fusrs_v2/best_f1_score_epoch_158_train+val.json"
OUT_PATH = "../outputs/cam_vis/res50_fusrs_v2_pretrain/res50_1x128_lr1e-1+200e+im21k_fusrs_v2/correct_predictions.txt"
DATASET = "fusrs_v2"

if DATASET == "fusrs_v2":
    # ann = "../data/fusrs_v2/meta/test.txt"
    ann = "../data/fusrs_v2/meta/train+val.txt"  # WARN: temporary switch to the train+val split
else:
    raise Exception("DATASET not supported")

# Keep only non-blank annotation lines so that lines and labels stay aligned one-to-one.
with open(ann, "r") as f:
    ann_lines = [line for line in f if line.strip()]

# Ground truth: the label is the last whitespace-separated field of each line.
gt_labels = [int(line.split()[-1]) for line in ann_lines]

f1_score_dict = {}  # not used in this cell
with open(PRED, "r") as f:
    pred_data = json.load(f)
pred_labels = pred_data["pred_label"]

# zip() below stops at the shorter input, so a prediction file produced for a
# different split would otherwise yield a silently truncated or misaligned export.
if len(pred_labels) != len(gt_labels):
    raise ValueError(
        "Prediction/annotation mismatch: {} predicted labels in {!r} vs {} annotation lines in {!r}.".format(
            len(pred_labels), PRED, len(gt_labels), ann
        )
    )

parent_dir = Path(OUT_PATH).parent
os.makedirs(parent_dir, exist_ok=True)

# Open destination file for writing
with open(OUT_PATH, "w") as output_file:
    # Walk ground truth, prediction and the original annotation line in lockstep.
    for gt_label, pred_label, ann_line in zip(gt_labels, pred_labels, ann_lines):
        if gt_label == pred_label:
            # Copy the annotation line unchanged for every correct prediction.
            output_file.write(ann_line)