## Description
Per-class F1-scores of the best model, evaluated on the test set of each fold.

In [1]:
import os
import re
from functools import partial

import numpy as np
import pandas as pd
import tensorflow as tf

import seaborn as sns
import matplotlib.pyplot as plt

from modules.dataset import Dataset, LABELS

2022-12-27 06:59:39.178787: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
!pip3 install tensorflow_addons
import tensorflow_addons as tfa

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [3]:
# Use seaborn's default theme for every plot drawn later in the notebook.
sns.set_theme()

In [8]:
class Evaluate:
    """Per-class F1 evaluation of a saved multi-label Keras model.

    Typical use: build the evaluator from a saved model file, call
    ``get_best_threshold`` once to choose one decision threshold per label,
    then call ``get_f1_scores`` on a dataset to score the model with those
    thresholds.  Used as a context manager, the cached labels and predictions
    (``y_true`` / ``y_preds``) are cleared on exit.
    """

    def __init__(self, model_path):
        """Load the model stored at ``model_path`` and reset all cached state."""
        self.y_true = None
        self.y_preds = None
        self.model_path = model_path
        self.model = self.get_model(model_path)
        self.best_thresholds = None
        self.thresholds_200 = None

    def get_model(self, path):
        """Return the Keras model loaded from ``path``."""
        return tf.keras.models.load_model(path)

    def get_y_true(self, data):
        """Collect every label row yielded by ``data`` into one ``tf.Variable``.

        ``data`` is iterated as ``(X, y)`` pairs and each element of ``y`` is
        appended individually.  The result is cached on ``self.y_true`` and
        also returned.
        """
        labels = [label for _, batch_labels in data for label in batch_labels]
        self.y_true = tf.Variable(labels)
        return self.y_true

    def get_confusion_metrics(self, y_true, y_preds):
        """Return ``(thresholds, TP, TN, FP, FN)`` from a multi-label AUC metric.

        The metric is created with its default threshold grid.  Callers index
        the four counters as ``[threshold_index][label_index]``.
        """
        metric = tf.keras.metrics.AUC(multi_label=True)
        metric.update_state(y_true, y_preds)

        # Read the counters by attribute name.  Their position inside
        # `metric.variables` is an implementation detail of Keras, and a
        # different order would silently feed e.g. TN into the F1 formula
        # in place of FP.
        return (
            metric.thresholds,
            metric.true_positives,
            metric.true_negatives,
            metric.false_positives,
            metric.false_negatives,
        )

    def model_predict(self, test_dataset):
        """Return the model's predictions for ``test_dataset``."""
        return self.model.predict(test_dataset)

    @staticmethod
    def _f1_from_counts(tp, fp, fn):
        """Return F1 = 2*TP / (2*TP + FP + FN), giving 0 where the denominator is 0."""
        # divide_no_nan avoids NaN for a label with no positives and no
        # positive predictions; NaN would otherwise win np.argmax downstream.
        return tf.math.divide_no_nan(2 * tp, 2 * tp + fp + fn)

    @staticmethod
    def _fold_number_from_path(model_path):
        """Return the integer that ends the model file's base name (before the extension).

        Raises:
            ValueError: if the base name does not end in digits.
        """
        stem = os.path.splitext(os.path.basename(model_path))[0]
        match = re.search(r"(\d+)$", stem)
        if match is None:
            raise ValueError(
                f"Cannot find a trailing fold number in model file name: {model_path!r}"
            )
        return int(match.group(1))

    def get_f1_scores_200_thresholds(self, test_dataset):
        """Compute the F1 score of every label at every AUC threshold.

        Caches labels, predictions and the threshold grid on the instance
        (``y_true``, ``y_preds``, ``thresholds_200``) and prints ``LABELS``.

        Returns:
            dict: label name -> list of scalar tensors, one F1 value per
            threshold, in the same order as ``self.thresholds_200``.
        """
        # NOTE(review): labels and predictions come from two separate passes
        # over `test_dataset`; this assumes the dataset yields samples in the
        # same order on every pass -- TODO confirm (no shuffling).
        self.y_true = self.get_y_true(test_dataset)
        self.y_preds = self.model_predict(test_dataset)

        thresholds, TP, _TN, FP, FN = self.get_confusion_metrics(self.y_true, self.y_preds)
        self.thresholds_200 = thresholds

        # One vectorised computation over the whole [threshold, label] grid
        # instead of a Python loop of scalar tensor operations.
        f1_grid = self._f1_from_counts(TP, FP, FN)
        f1_class_dict = {
            label: tf.unstack(f1_grid[:, label_index])
            for label_index, label in enumerate(LABELS)
        }
        print(LABELS)
        return f1_class_dict

    def get_f1_scores(self, test_dataset):
        """Compute one F1 score per label using the stored best thresholds.

        Predictions are binarised with ``self.best_thresholds`` (``>=``) and
        scored through a multi-label confusion matrix.

        Returns:
            dict: label name -> single-element list holding the F1 value.

        Raises:
            RuntimeError: if ``get_best_threshold`` has not been called yet.
        """
        if self.best_thresholds is None:
            raise RuntimeError(
                "best_thresholds is not set; call get_best_threshold() before get_f1_scores()."
            )

        self.y_true = self.get_y_true(test_dataset)
        self.y_preds = self.model_predict(test_dataset)
        metric = tfa.metrics.MultiLabelConfusionMatrix(num_classes=len(LABELS))
        metric.update_state(self.y_true,
                            np.greater_equal(self.y_preds, self.best_thresholds).astype('int8'))
        result = metric.result()

        f1_class_dict = dict()
        for idx, confusion in enumerate(result):
            # Each per-label matrix is read as [[TN, FP], [FN, TP]].
            tp, fp, fn = confusion[1, 1], confusion[0, 1], confusion[1, 0]
            f1_score = self._f1_from_counts(tp, fp, fn)
            f1_class_dict[LABELS[idx]] = [f1_score.numpy()]
        return f1_class_dict

    def get_best_threshold(self, test_datasets=None,
                           output_dir="/home/jovyan/ChestXray-14/results/paper/table3_1"):
        """Pick, for every label, the threshold with the highest F1 score.

        The fold number is read from the end of the model file name and the
        matching entry of ``test_datasets`` (1-based) is scored.  Two CSV files
        are written to ``output_dir``: ``f1_per_thresholds.csv`` (F1 for every
        threshold and label) and ``best_thresholds.csv`` (best threshold and
        its F1 per label).  The chosen thresholds are stored on
        ``self.best_thresholds`` in label order.

        Args:
            test_datasets: sequence of per-fold test datasets.  When omitted,
                the notebook-level ``datasets`` variable is used, which is
                what this method always did before the parameter existed.
            output_dir: directory for the CSV files; created if missing.

        Raises:
            ValueError: if no fold number can be parsed from the model path or
                it does not index into ``test_datasets``.
        """
        fold_num = self._fold_number_from_path(self.model_path)
        if test_datasets is None:
            # Backward-compatible fallback to the notebook-level name.
            test_datasets = datasets
        if not 1 <= fold_num <= len(test_datasets):
            raise ValueError(
                f"Fold {fold_num} parsed from {self.model_path!r} is outside "
                f"1..{len(test_datasets)}"
            )
        test_dataset = test_datasets[fold_num - 1]
        f1_scores_dict = self.get_f1_scores_200_thresholds(test_dataset)

        best_thresholds_dict = {"thresholds": [], "f1_most": [], "label": []}
        for label, scores in f1_scores_dict.items():
            score_array = np.asarray(scores)
            best_index = int(np.argmax(score_array))
            best_thresholds_dict["f1_most"].append(score_array[best_index])
            best_thresholds_dict["label"].append(label)
            best_thresholds_dict["thresholds"].append(self.thresholds_200[best_index])

        df = pd.DataFrame(best_thresholds_dict).set_index("label")
        print(df)

        df_200_thresholds = pd.DataFrame(f1_scores_dict)

        os.makedirs(output_dir, exist_ok=True)
        df_200_thresholds.to_csv(os.path.join(output_dir, "f1_per_thresholds.csv"), index=True)
        df.to_csv(os.path.join(output_dir, "best_thresholds.csv"), index=True)
        self.best_thresholds = df["thresholds"].to_numpy()

    def __enter__(self):
        """Return the evaluator itself; entering acquires nothing."""
        return self

    def __exit__(self, *arg):
        """Drop the cached labels and predictions; exceptions are not suppressed."""
        self.y_true = None
        self.y_preds = None
    

In [9]:
def get_test_dataset_5_fold():
    """Return the test split of each of the five folds, in fold order.

    A single ``Dataset`` is created and ``get_kfold`` is called once per fold
    (1 through 5) with ``sample=False``; only the second element of each
    returned pair is kept.

    Returns:
        tuple: five test datasets, index ``k`` holding fold ``k + 1``.
    """
    source = Dataset()
    test_splits = []
    for fold_number in range(1, 6):
        _train_split, test_split = source.get_kfold(fold_number=fold_number, sample=False)
        test_splits.append(test_split)
    return tuple(test_splits)

In [10]:
# Saved model to evaluate; the digit that ends the file name ("..._fold_3")
# is what Evaluate.get_best_threshold() parses as the fold number.
model_path = "/home/jovyan/ChestXray-14/results/models/EfficientNetB0_None_fold_3.h5"
best_model = Evaluate(model_path)

# get_best_threshold() below looks up this notebook-level `datasets` name,
# so it must exist before that call.
datasets = get_test_dataset_5_fold()

# Choose one decision threshold per label on the model's own fold, then reuse
# those thresholds to score the same model on the test split of every fold.
best_model.get_best_threshold()
for fold, test_dataset in enumerate(datasets):
    print(f"===== Fold {fold + 1} =====")
    # Leaving the `with` block clears the labels/predictions cached on best_model.
    with best_model:
        f1_each_class = best_model.get_f1_scores(test_dataset)
        # One row, one column per label.
        df = pd.DataFrame(f1_each_class)
        df.to_csv("/home/jovyan/ChestXray-14/results/paper/table3_1/best_model_fold_{}.csv".format(fold+1), index=False)
        print(df)

['No Finding', 'Atelectasis', 'Consolidation', 'Infiltration', 'Pneumothorax', 'Edema', 'Emphysema', 'Fibrosis', 'Effusion', 'Pneumonia', 'Pleural_Thickening', 'Cardiomegaly', 'Nodule', 'Mass', 'Hernia']
                    thresholds   f1_most
label                                   
No Finding            0.286432  0.740660
Atelectasis           0.150754  0.340299
Consolidation         0.110553  0.207992
Infiltration          0.201005  0.386318
Pneumothorax          0.165829  0.263662
Edema                 0.145729  0.222080
Emphysema             0.105528  0.147580
Fibrosis              0.025126  0.068509
Effusion              0.246231  0.482054
Pneumonia             0.050251  0.076980
Pleural_Thickening    0.105528  0.169416
Cardiomegaly          0.216080  0.310933
Nodule                0.075377  0.145118
Mass                  0.160804  0.218792
Hernia                0.005025  0.018711
===== Fold 1 =====
   No Finding  Atelectasis  Consolidation  Infiltration  Pneumothorax  \
0    0.